From 4480c27ca3eaaaae134633a594fba5601da13b4a Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 8 Jun 2022 16:22:55 +0200 Subject: [PATCH 0001/2223] gfs2: Add glockfd debugfs file When a process has a gfs2 file open, the file is keeping a reference on the underlying gfs2 inode, and the inode is keeping the inode's iopen glock held in shared mode. In other words, the process depends on the iopen glock of each open gfs2 file. Expose those dependencies in a new "glockfd" debugfs file. The new debugfs file contains one line for each gfs2 file descriptor, specifying the tgid, file descriptor number, and glock name, e.g., 1601 6 5/816d This list is compiled by iterating all tasks on the system using find_ge_pid(), and all file descriptors of each task using task_lookup_next_fd_rcu(). To make that work from gfs2, export those two functions. Signed-off-by: Andreas Gruenbacher --- fs/file.c | 1 + fs/gfs2/glock.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/pid.c | 1 + 3 files changed, 151 insertions(+) diff --git a/fs/file.c b/fs/file.c index 3bcc1ecc314a7..5f9c802a5d8d3 100644 --- a/fs/file.c +++ b/fs/file.c @@ -980,6 +980,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret *ret_fd = fd; return file; } +EXPORT_SYMBOL(task_lookup_next_fd_rcu); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. 
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c992d53013d31..85352126e662c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -33,6 +33,9 @@ #include #include #include +#include +#include +#include #include "gfs2.h" #include "incore.h" @@ -2745,6 +2748,149 @@ static const struct file_operations gfs2_glstats_fops = { .release = gfs2_glocks_release, }; +struct gfs2_glockfd_iter { + struct super_block *sb; + unsigned int tgid; + struct task_struct *task; + unsigned int fd; + struct file *file; +}; + +static struct task_struct *gfs2_glockfd_next_task(struct gfs2_glockfd_iter *i) +{ + struct pid_namespace *ns = task_active_pid_ns(current); + struct pid *pid; + + if (i->task) + put_task_struct(i->task); + + rcu_read_lock(); +retry: + i->task = NULL; + pid = find_ge_pid(i->tgid, ns); + if (pid) { + i->tgid = pid_nr_ns(pid, ns); + i->task = pid_task(pid, PIDTYPE_TGID); + if (!i->task) { + i->tgid++; + goto retry; + } + get_task_struct(i->task); + } + rcu_read_unlock(); + return i->task; +} + +static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i) +{ + if (i->file) { + fput(i->file); + i->file = NULL; + } + + rcu_read_lock(); + for(;; i->fd++) { + struct inode *inode; + + i->file = task_lookup_next_fd_rcu(i->task, &i->fd); + if (!i->file) { + i->fd = 0; + break; + } + inode = file_inode(i->file); + if (inode->i_sb != i->sb) + continue; + if (get_file_rcu(i->file)) + break; + } + rcu_read_unlock(); + return i->file; +} + +static void *gfs2_glockfd_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct gfs2_glockfd_iter *i = seq->private; + + if (*pos) + return NULL; + while (gfs2_glockfd_next_task(i)) { + if (gfs2_glockfd_next_file(i)) + return i; + i->tgid++; + } + return NULL; +} + +static void *gfs2_glockfd_seq_next(struct seq_file *seq, void *iter_ptr, + loff_t *pos) +{ + struct gfs2_glockfd_iter *i = seq->private; + + (*pos)++; + i->fd++; + do { + if (gfs2_glockfd_next_file(i)) + return i; + i->tgid++; + } while (gfs2_glockfd_next_task(i)); 
+ return NULL; +} + +static void gfs2_glockfd_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glockfd_iter *i = seq->private; + + if (i->file) + fput(i->file); + if (i->task) + put_task_struct(i->task); +} + +static int gfs2_glockfd_seq_show(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glockfd_iter *i = seq->private; + struct inode *inode = file_inode(i->file); + struct gfs2_glock *gl; + + inode_lock_shared(inode); + gl = GFS2_I(inode)->i_iopen_gh.gh_gl; + if (gl) { + seq_printf(seq, "%d %u %u/%llx\n", + i->tgid, i->fd, gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number); + } + inode_unlock_shared(inode); + return 0; +} + +static const struct seq_operations gfs2_glockfd_seq_ops = { + .start = gfs2_glockfd_seq_start, + .next = gfs2_glockfd_seq_next, + .stop = gfs2_glockfd_seq_stop, + .show = gfs2_glockfd_seq_show, +}; + +static int gfs2_glockfd_open(struct inode *inode, struct file *file) +{ + struct gfs2_glockfd_iter *i; + struct gfs2_sbd *sdp = inode->i_private; + + i = __seq_open_private(file, &gfs2_glockfd_seq_ops, + sizeof(struct gfs2_glockfd_iter)); + if (!i) + return -ENOMEM; + i->sb = sdp->sd_vfs; + return 0; +} + +static const struct file_operations gfs2_glockfd_fops = { + .owner = THIS_MODULE, + .open = gfs2_glockfd_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + DEFINE_SEQ_ATTRIBUTE(gfs2_sbstats); void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) @@ -2754,6 +2900,9 @@ void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glocks_fops); + debugfs_create_file("glockfd", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, + &gfs2_glockfd_fops); + debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glstats_fops); diff --git a/kernel/pid.c b/kernel/pid.c index 2fc0a16ec77b1..3fbc5e46b7217 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -519,6 +519,7 @@ struct pid *find_ge_pid(int nr, 
struct pid_namespace *ns) { return idr_get_next(&ns->idr, &nr); } +EXPORT_SYMBOL_GPL(find_ge_pid); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) { -- GitLab From 56535dc695f8e215dffb9557d6bcbdf46ff785d2 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 23 Jun 2022 23:29:36 +0200 Subject: [PATCH 0002/2223] gfs2: Add flocks to glockfd debugfs file Include flock glocks in the "glockfd" debugfs file. Those are similar to the iopen glocks; while an open file is holding an flock, it is holding the file's flock glock. We cannot take f_fl_mutex in gfs2_glockfd_seq_show_flock() or else dumping the "glockfd" file would block on flock operations. Instead, use the file->f_lock spin lock to protect the f_fl_gh.gh_gl glock pointer. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 22 ++++++++++++++++++++-- fs/gfs2/glock.c | 23 +++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 2cceb193dcd85..25f4080bc973a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1444,6 +1444,22 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); } +static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh) +{ + struct gfs2_glock *gl = fl_gh->gh_gl; + + /* + * Make sure gfs2_glock_put() won't sleep under the file->f_lock + * spinlock. 
+ */ + + gfs2_glock_hold(gl); + spin_lock(&file->f_lock); + gfs2_holder_uninit(fl_gh); + spin_unlock(&file->f_lock); + gfs2_glock_put(gl); +} + static int do_flock(struct file *file, int cmd, struct file_lock *fl) { struct gfs2_file *fp = file->private_data; @@ -1475,7 +1491,9 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) &gfs2_flock_glops, CREATE, &gl); if (error) goto out; + spin_lock(&file->f_lock); gfs2_holder_init(gl, state, flags, fl_gh); + spin_unlock(&file->f_lock); gfs2_glock_put(gl); } for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) { @@ -1486,7 +1504,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) msleep(sleeptime); } if (error) { - gfs2_holder_uninit(fl_gh); + __flock_holder_uninit(file, fl_gh); if (error == GLR_TRYFAILED) error = -EAGAIN; } else { @@ -1508,7 +1526,7 @@ static void do_unflock(struct file *file, struct file_lock *fl) locks_lock_file_wait(file, fl); if (gfs2_holder_initialized(fl_gh)) { gfs2_glock_dq(fl_gh); - gfs2_holder_uninit(fl_gh); + __flock_holder_uninit(file, fl_gh); } mutex_unlock(&fp->f_fl_mutex); } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 85352126e662c..533ec772166d0 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -2846,6 +2846,28 @@ static void gfs2_glockfd_seq_stop(struct seq_file *seq, void *iter_ptr) put_task_struct(i->task); } +static void gfs2_glockfd_seq_show_flock(struct seq_file *seq, + struct gfs2_glockfd_iter *i) +{ + struct gfs2_file *fp = i->file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + struct lm_lockname gl_name = { .ln_type = LM_TYPE_RESERVED }; + + if (!READ_ONCE(fl_gh->gh_gl)) + return; + + spin_lock(&i->file->f_lock); + if (gfs2_holder_initialized(fl_gh)) + gl_name = fl_gh->gh_gl->gl_name; + spin_unlock(&i->file->f_lock); + + if (gl_name.ln_type != LM_TYPE_RESERVED) { + seq_printf(seq, "%d %u %u/%llx\n", + i->tgid, i->fd, gl_name.ln_type, + (unsigned long long)gl_name.ln_number); + } +} + static int 
gfs2_glockfd_seq_show(struct seq_file *seq, void *iter_ptr) { struct gfs2_glockfd_iter *i = seq->private; @@ -2859,6 +2881,7 @@ static int gfs2_glockfd_seq_show(struct seq_file *seq, void *iter_ptr) i->tgid, i->fd, gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number); } + gfs2_glockfd_seq_show_flock(seq, i); inode_unlock_shared(inode); return 0; } -- GitLab From cbe6d2576e2cf7571e781439728ad31bdfd9dfcb Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 5 Apr 2022 22:07:30 +0200 Subject: [PATCH 0003/2223] gfs2: Add GL_NOPID flag for process-independent glock holders Add a GL_NOPID flag to indicate that once a glock holder has been acquired, it won't be associated with the current process anymore. This is useful for iopen and flock glocks which are associated with open files, as well as journal glock holders and similar which are associated with the filesystem. Once GL_NOPID is used for all applicable glocks (see the next patches), processes will no longer be falsely reported as holding glocks which they are not actually holding in the glocks dump file. Unlike before, when a process is reported as having "(ended)", this will indicate an actual bug. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 41 +++++++++++++++++++++++++++++++---------- fs/gfs2/glock.h | 1 + 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 533ec772166d0..f80fba5d1d4d9 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1467,6 +1467,15 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) 
va_end(args); } +static inline bool pid_is_meaningful(const struct gfs2_holder *gh) +{ + if (!(gh->gh_flags & GL_NOPID)) + return true; + if (gh->gh_state == LM_ST_UNLOCKED) + return true; + return false; +} + /** * add_to_queue - Add a holder to the wait queue (but look for recursion) * @gh: the holder structure to add @@ -1503,10 +1512,17 @@ __acquires(&gl->gl_lockref.lock) } list_for_each_entry(gh2, &gl->gl_holders, gh_list) { - if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid && - (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) && - !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags))) - goto trap_recursive; + if (likely(gh2->gh_owner_pid != gh->gh_owner_pid)) + continue; + if (gh->gh_gl->gl_ops->go_type == LM_TYPE_FLOCK) + continue; + if (test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)) + continue; + if (!pid_is_meaningful(gh2)) + continue; + goto trap_recursive; + } + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { if (try_futile && !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { fail: @@ -2321,19 +2337,24 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh, const char *fs_id_buf) { - struct task_struct *gh_owner = NULL; + const char *comm = "(none)"; + pid_t owner_pid = 0; char flags_buf[32]; rcu_read_lock(); - if (gh->gh_owner_pid) + if (pid_is_meaningful(gh)) { + struct task_struct *gh_owner; + + comm = "(ended)"; + owner_pid = pid_nr(gh->gh_owner_pid); gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); + if (gh_owner) + comm = gh_owner->comm; + } gfs2_print_dbg(seq, "%s H: s:%s f:%s e:%d p:%ld [%s] %pS\n", fs_id_buf, state2str(gh->gh_state), hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), - gh->gh_error, - gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, - gh_owner ? 
gh_owner->comm : "(ended)", - (void *)gh->gh_ip); + gh->gh_error, (long)owner_pid, comm, (void *)gh->gh_ip); rcu_read_unlock(); } diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index c0ae9100a0bcf..e764ebeba54c4 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -91,6 +91,7 @@ enum { #define GL_ASYNC 0x0040 #define GL_EXACT 0x0080 #define GL_SKIP 0x0100 +#define GL_NOPID 0x0200 #define GL_NOCACHE 0x0400 /* -- GitLab From b582d5f05ddbd61bb72896b31ff83d7f0b0862f5 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 6 Apr 2022 12:51:27 +0200 Subject: [PATCH 0004/2223] gfs2: Mark flock glock holders as GL_NOPID Add the GL_NOPID flag for flock glock holders. Clean up the flag setting code in do_flock. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 25f4080bc973a..1383f9598011d 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1472,7 +1472,9 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) int sleeptime; state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; - flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY_1CB) | GL_EXACT; + flags = GL_EXACT | GL_NOPID; + if (!IS_SETLKW(cmd)) + flags |= LM_FLAG_TRY_1CB; mutex_lock(&fp->f_fl_mutex); @@ -1500,7 +1502,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) error = gfs2_glock_nq(fl_gh); if (error != GLR_TRYFAILED) break; - fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT; + fl_gh->gh_flags &= ~LM_FLAG_TRY_1CB; + fl_gh->gh_flags |= LM_FLAG_TRY; msleep(sleeptime); } if (error) { -- GitLab From ebdc416c9c0bed245d6cda92ae2a98483e513051 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 5 Apr 2022 22:39:16 +0200 Subject: [PATCH 0005/2223] gfs2: Mark the remaining process-independent glock holders as GL_NOPID Add the GL_NOPID flag for the remaining glock holders which are not associated with the current process. 
Signed-off-by: Andreas Gruenbacher --- fs/gfs2/inode.c | 6 ++++-- fs/gfs2/ops_fstype.c | 14 ++++++++------ fs/gfs2/super.c | 3 ++- fs/gfs2/util.c | 6 ++++-- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index c8ec876f33ea3..e211ed8636b5b 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -143,7 +143,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (blktype != GFS2_BLKST_UNLINKED) gfs2_cancel_delete_work(io_gl); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, + GL_EXACT | GL_NOPID, &ip->i_iopen_gh); gfs2_glock_put(io_gl); if (unlikely(error)) @@ -720,7 +721,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr); BUG_ON(error); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT | GL_NOPID, + &ip->i_iopen_gh); if (error) goto fail_gunlock2; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c9b423c874a32..904a2d47c4b37 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -403,7 +403,8 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, error = gfs2_glock_nq_num(sdp, GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, - LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, + LM_ST_EXCLUSIVE, + LM_FLAG_NOEXP | GL_NOCACHE | GL_NOPID, mount_gh); if (error) { fs_err(sdp, "can't acquire mount glock: %d\n", error); @@ -413,7 +414,7 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, error = gfs2_glock_nq_num(sdp, GFS2_LIVE_LOCK, &gfs2_nondisk_glops, LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT, + LM_FLAG_NOEXP | GL_EXACT | GL_NOPID, &sdp->sd_live_gh); if (error) { fs_err(sdp, "can't acquire live glock: %d\n", error); @@ -689,7 +690,7 @@ static int init_statfs(struct gfs2_sbd *sdp) iput(pn); pn = 
NULL; ip = GFS2_I(sdp->sd_sc_inode); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_NOPID, &sdp->sd_sc_gh); if (error) { fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); @@ -778,7 +779,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid, &gfs2_journal_glops, LM_ST_EXCLUSIVE, - LM_FLAG_NOEXP | GL_NOCACHE, + LM_FLAG_NOEXP | GL_NOCACHE | GL_NOPID, &sdp->sd_journal_gh); if (error) { fs_err(sdp, "can't acquire journal glock: %d\n", error); @@ -788,7 +789,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) ip = GFS2_I(sdp->sd_jdesc->jd_inode); sdp->sd_jinode_gl = ip->i_gl; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT | GL_NOCACHE, + LM_FLAG_NOEXP | GL_EXACT | + GL_NOCACHE | GL_NOPID, &sdp->sd_jinode_gh); if (error) { fs_err(sdp, "can't acquire journal inode glock: %d\n", @@ -959,7 +961,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) pn = NULL; ip = GFS2_I(sdp->sd_qc_inode); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_NOPID, &sdp->sd_qc_gh); if (error) { fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index bdb773e5c88f0..90db4a2892695 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -346,7 +346,8 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) } error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE, - LM_FLAG_NOEXP, &sdp->sd_freeze_gh); + LM_FLAG_NOEXP | GL_NOPID, + &sdp->sd_freeze_gh); if (error) goto out; diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 8241029a2a5d2..95d733dd3c254 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -226,7 +226,8 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) */ fs_warn(sdp, "Requesting recovery of jid %d.\n", sdp->sd_lockstruct.ls_jid); - 
gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | LM_FLAG_NOEXP, + gfs2_holder_reinit(LM_ST_EXCLUSIVE, + LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | GL_NOPID, &sdp->sd_live_gh); msleep(GL_GLOCK_MAX_HOLD); /* @@ -251,7 +252,8 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) fs_warn(sdp, "Unable to recover our journal jid %d.\n", sdp->sd_lockstruct.ls_jid); gfs2_glock_dq_wait(&sdp->sd_live_gh); - gfs2_holder_reinit(LM_ST_SHARED, LM_FLAG_NOEXP | GL_EXACT, + gfs2_holder_reinit(LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT | GL_NOPID, &sdp->sd_live_gh); gfs2_glock_nq(&sdp->sd_live_gh); } -- GitLab From 36a40c37389c7a1bef3f1024c55c056304acf439 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 7 Jun 2022 18:25:25 +0300 Subject: [PATCH 0006/2223] nvdimm/namespace: return uuid_null only once in nd_dev_to_uuid() Refactor nd_dev_to_uuid() in order to make code shorter and cleaner by joining conditions and hence returning uuid_null only once. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220607152525.33468-1-andriy.shevchenko@linux.intel.com Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index bf4f5c09d9b1b..3dae17c90e8cf 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -170,15 +170,12 @@ EXPORT_SYMBOL(nvdimm_namespace_disk_name); const uuid_t *nd_dev_to_uuid(struct device *dev) { - if (!dev) - return &uuid_null; - - if (is_namespace_pmem(dev)) { + if (dev && is_namespace_pmem(dev)) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); return nspm->uuid; - } else - return &uuid_null; + } + return &uuid_null; } EXPORT_SYMBOL(nd_dev_to_uuid); -- GitLab From 53fc59511fc4c567342b2ef3f7b99a086430e0b4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 7 Jun 2022 18:37:50 +0300 Subject: [PATCH 0007/2223] nvdimm/namespace: drop unneeded temporary variable 
in size_store() Refactor size_store() in order to remove temporary variable on stack by joining conditionals. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220607153750.33639-1-andriy.shevchenko@linux.intel.com Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 3dae17c90e8cf..0f863fda56e6b 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -836,7 +836,6 @@ static ssize_t size_store(struct device *dev, { struct nd_region *nd_region = to_nd_region(dev->parent); unsigned long long val; - uuid_t **uuid = NULL; int rc; rc = kstrtoull(buf, 0, &val); @@ -850,16 +849,12 @@ static ssize_t size_store(struct device *dev, if (rc >= 0) rc = nd_namespace_label_update(nd_region, dev); - if (is_namespace_pmem(dev)) { + /* setting size zero == 'delete namespace' */ + if (rc == 0 && val == 0 && is_namespace_pmem(dev)) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); - uuid = &nspm->uuid; - } - - if (rc == 0 && val == 0 && uuid) { - /* setting size zero == 'delete namespace' */ - kfree(*uuid); - *uuid = NULL; + kfree(nspm->uuid); + nspm->uuid = NULL; } dev_dbg(dev, "%llx %s (%d)\n", val, rc < 0 ? "fail" : "success", rc); -- GitLab From 84261749e58a13e3287f948b61e6e453cca8ae9b Mon Sep 17 00:00:00 2001 From: Tomer Maimon Date: Sun, 17 Jul 2022 15:11:23 +0300 Subject: [PATCH 0008/2223] dt-bindings: ipmi: Add npcm845 compatible Add a compatible string for Nuvoton BMC NPCM845 KCS and modify NPCM KCS description to support all NPCM BMC SoC. 
Signed-off-by: Tomer Maimon Message-Id: <20220717121124.154734-2-tmaimon77@gmail.com> Acked-by: Krzysztof Kozlowski Signed-off-by: Corey Minyard --- Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt b/Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt index 352f5e9c759bc..cbc10a68ddef4 100644 --- a/Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt +++ b/Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt @@ -1,12 +1,13 @@ -* Nuvoton NPCM7xx KCS (Keyboard Controller Style) IPMI interface +* Nuvoton NPCM KCS (Keyboard Controller Style) IPMI interface -The Nuvoton SOCs (NPCM7xx) are commonly used as BMCs +The Nuvoton SOCs (NPCM) are commonly used as BMCs (Baseboard Management Controllers) and the KCS interface can be used to perform in-band IPMI communication with their host. Required properties: - compatible : should be one of "nuvoton,npcm750-kcs-bmc" + "nuvoton,npcm845-kcs-bmc" - interrupts : interrupt generated by the controller - kcs_chan : The KCS channel number in the controller -- GitLab From dfef1acc36d56d947a69ff57bf03fa0a0f276b7c Mon Sep 17 00:00:00 2001 From: Tomer Maimon Date: Sun, 17 Jul 2022 15:11:24 +0300 Subject: [PATCH 0009/2223] char: ipmi: modify NPCM KCS configuration Modify NPCM IPMI KCS configuration to support all NPCM BMC SoC. Signed-off-by: Tomer Maimon Message-Id: <20220717121124.154734-3-tmaimon77@gmail.com> Signed-off-by: Corey Minyard --- drivers/char/ipmi/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/ipmi/Kconfig b/drivers/char/ipmi/Kconfig index b061e6b513ed5..39565cf74b2c9 100644 --- a/drivers/char/ipmi/Kconfig +++ b/drivers/char/ipmi/Kconfig @@ -119,13 +119,13 @@ config ASPEED_KCS_IPMI_BMC provides the access of KCS IO space for BMC side. 
config NPCM7XX_KCS_IPMI_BMC - depends on ARCH_NPCM7XX || COMPILE_TEST + depends on ARCH_NPCM || COMPILE_TEST select IPMI_KCS_BMC select REGMAP_MMIO - tristate "NPCM7xx KCS IPMI BMC driver" + tristate "NPCM KCS IPMI BMC driver" help Provides a driver for the KCS (Keyboard Controller Style) IPMI - interface found on Nuvoton NPCM7xx SOCs. + interface found on Nuvoton NPCM SOCs. The driver implements the BMC side of the KCS contorller, it provides the access of KCS IO space for BMC side. -- GitLab From 79c87b8f8ba7e5706aa5cb2601635b468820e911 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 15 Jul 2022 13:41:56 +0800 Subject: [PATCH 0010/2223] ipmi: Fix comment typo The double `the' is duplicated in line 4360, remove one. Signed-off-by: Jason Wang Message-Id: <20220715054156.6342-1-wangborong@cdjrlc.com> Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 703433493c852..c8a3b208f923e 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -4357,7 +4357,7 @@ static int handle_oem_get_msg_cmd(struct ipmi_smi *intf, /* * The message starts at byte 4 which follows the - * the Channel Byte in the "GET MESSAGE" command + * Channel Byte in the "GET MESSAGE" command */ recv_msg->msg.data_len = msg->rsp_size - 4; memcpy(recv_msg->msg_data, &msg->rsp[4], -- GitLab From 938db76cf8c8d2bd7c56aca74bef68d443e76954 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 2 Aug 2022 10:16:14 -0700 Subject: [PATCH 0011/2223] Input: elan_i2c - convert to use dev_groups There is no need for a driver to individually add/create device groups, the driver core will do it automatically for you. Convert the elan_i2c driver to use the dev_groups pointer instead of manually calling the driver core to create the group and have it be cleaned up later on by the devm core. 
Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20220802162854.3015369-1-gregkh@linuxfoundation.org Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/elan_i2c_core.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/input/mouse/elan_i2c_core.c b/drivers/input/mouse/elan_i2c_core.c index e1758d5ffe421..d4eb59b55bf1f 100644 --- a/drivers/input/mouse/elan_i2c_core.c +++ b/drivers/input/mouse/elan_i2c_core.c @@ -1311,12 +1311,6 @@ static int elan_probe(struct i2c_client *client, return error; } - error = devm_device_add_groups(dev, elan_sysfs_groups); - if (error) { - dev_err(dev, "failed to create sysfs attributes: %d\n", error); - return error; - } - error = input_register_device(data->input); if (error) { dev_err(dev, "failed to register input device: %d\n", error); @@ -1442,6 +1436,7 @@ static struct i2c_driver elan_driver = { .acpi_match_table = ACPI_PTR(elan_acpi_id), .of_match_table = of_match_ptr(elan_of_match), .probe_type = PROBE_PREFER_ASYNCHRONOUS, + .dev_groups = elan_sysfs_groups, }, .probe = elan_probe, .id_table = elan_id, -- GitLab From 4aebcc9059d890bf2f438cfa169dad856123fc9c Mon Sep 17 00:00:00 2001 From: Tomer Maimon Date: Mon, 8 Aug 2022 10:54:52 +0300 Subject: [PATCH 0012/2223] dt-binding: ipmi: add fallback to npcm845 compatible Add to npcm845 KCS compatible string a fallback to npcm750 KCS compatible string because NPCM845 and NPCM750 BMCs are using identical KCS modules. 
Fixes: 84261749e58a ("dt-bindings: ipmi: Add npcm845 compatible") Signed-off-by: Tomer Maimon Message-Id: <20220808075452.115907-1-tmaimon77@gmail.com> Acked-by: Krzysztof Kozlowski Signed-off-by: Corey Minyard --- Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt b/Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt index cbc10a68ddef4..4fda76e63396a 100644 --- a/Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt +++ b/Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt @@ -7,7 +7,7 @@ used to perform in-band IPMI communication with their host. Required properties: - compatible : should be one of "nuvoton,npcm750-kcs-bmc" - "nuvoton,npcm845-kcs-bmc" + "nuvoton,npcm845-kcs-bmc", "nuvoton,npcm750-kcs-bmc" - interrupts : interrupt generated by the controller - kcs_chan : The KCS channel number in the controller -- GitLab From 9900d9249f736bcd474f56e935db2c70977756ba Mon Sep 17 00:00:00 2001 From: Mattijs Korpershoek Date: Tue, 26 Jul 2022 14:56:06 +0200 Subject: [PATCH 0013/2223] MAINTAINERS: input: add mattijs for mt6779-keypad As stated in [1]: Fengping has no longer interest and time to maintain this driver so he agreed to transfer maintainership over to me. Add a dedicated maintainer entry as well for the driver to make sure that I can help with patch reviews. 
[1] https://lore.kernel.org/r/20220421140255.2781505-1-mkorpershoek@baylibre.com Signed-off-by: Mattijs Korpershoek Link: https://lore.kernel.org/r/20220720-mt8183-keypad-v2-1-6d42c357cb76@baylibre.com Signed-off-by: Dmitry Torokhov --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 66bffb24a348a..5cff72980872c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12553,6 +12553,12 @@ S: Supported F: Documentation/devicetree/bindings/media/mediatek-jpeg-*.yaml F: drivers/media/platform/mediatek/jpeg/ +MEDIATEK KEYPAD DRIVER +M: Mattijs Korpershoek +S: Supported +F: Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml +F: drivers/input/keyboard/mt6779-keypad.c + MEDIATEK MDP DRIVER M: Minghsiu Tsai M: Houlong Wei -- GitLab From fe2281d630e0ae375d8d53f3ccff21f444ab64c8 Mon Sep 17 00:00:00 2001 From: Mattijs Korpershoek Date: Tue, 26 Jul 2022 14:56:07 +0200 Subject: [PATCH 0014/2223] dt-bindings: mediatek,mt6779-keypad: use unevaluatedProperties writing-bindings.rst states: > - If schema includes other schema (e.g. /schemas/i2c/i2c-controller.yaml) use > "unevaluatedProperties:false". In other cases, usually use > "additionalProperties:false". 
All 3 properties from matrix-keymap.yaml are valid for the MediaTek keypad: * keypad,num-rows and keypad,num-cols configure the KP_SEL register * linux,keymap represents the (at most) 8x8 hardware matrix Signed-off-by: Mattijs Korpershoek Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220720-mt8183-keypad-v2-2-6d42c357cb76@baylibre.com Signed-off-by: Dmitry Torokhov --- .../devicetree/bindings/input/mediatek,mt6779-keypad.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml b/Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml index 03ebd2665d078..ca8ae40a73f7d 100644 --- a/Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml +++ b/Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml @@ -56,7 +56,7 @@ required: - clocks - clock-names -additionalProperties: false +unevaluatedProperties: false examples: - | -- GitLab From 24f9cde381a7781f9f58191217989f7de98c5cd8 Mon Sep 17 00:00:00 2001 From: Mattijs Korpershoek Date: Tue, 26 Jul 2022 14:56:08 +0200 Subject: [PATCH 0015/2223] dt-bindings: mediatek,mt6779-keypad: add mediatek,keys-per-group The MediaTek keypad has 2 modes of detecting key events: * single key: each (row, column) can detect one key * double key: each (row, column) is a group of 2 keys With double key, two keys are physically wired to one (row, column) pin. These keys are in the same "group". Multiple keys in the same group reduces the number of pins which minimizes cost. Add a keys-per-group property to describe this. 
Signed-off-by: Mattijs Korpershoek Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220720-mt8183-keypad-v2-3-6d42c357cb76@baylibre.com Signed-off-by: Dmitry Torokhov --- .../devicetree/bindings/input/mediatek,mt6779-keypad.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml b/Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml index ca8ae40a73f7d..387d0448ff771 100644 --- a/Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml +++ b/Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml @@ -49,6 +49,12 @@ properties: maximum: 256 default: 16 + mediatek,keys-per-group: + description: each (row, column) group has multiple keys + $ref: /schemas/types.yaml#/definitions/uint32 + default: 1 + maximum: 2 + required: - compatible - reg -- GitLab From e76be36ad9e8560f6d1b02ad12dc912eaa19ddd1 Mon Sep 17 00:00:00 2001 From: Mattijs Korpershoek Date: Tue, 26 Jul 2022 14:56:09 +0200 Subject: [PATCH 0016/2223] Input: mt6779-keypad - prepare double keys support with calc_row_col The MediaTek keypad can operate in two modes: single key or double key. The driver only supports single key mode. In double key mode, the row/column calculation based on the key is different. Add a calc_row_col function pointer which will be different based on single/double key mode. No functional change. 
Suggested-by: AngeloGioacchino Del Regno Signed-off-by: Mattijs Korpershoek Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20220720-mt8183-keypad-v2-4-6d42c357cb76@baylibre.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/mt6779-keypad.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/input/keyboard/mt6779-keypad.c b/drivers/input/keyboard/mt6779-keypad.c index bf447bf598fbc..9decdfa685555 100644 --- a/drivers/input/keyboard/mt6779-keypad.c +++ b/drivers/input/keyboard/mt6779-keypad.c @@ -31,6 +31,8 @@ struct mt6779_keypad { struct clk *clk; u32 n_rows; u32 n_cols; + void (*calc_row_col)(unsigned int key, + unsigned int *row, unsigned int *col); DECLARE_BITMAP(keymap_state, MTK_KPD_NUM_BITS); }; @@ -67,8 +69,7 @@ static irqreturn_t mt6779_keypad_irq_handler(int irq, void *dev_id) continue; key = bit_nr / 32 * 16 + bit_nr % 32; - row = key / 9; - col = key % 9; + keypad->calc_row_col(key, &row, &col); scancode = MATRIX_SCAN_CODE(row, col, row_shift); /* 1: not pressed, 0: pressed */ @@ -94,6 +95,14 @@ static void mt6779_keypad_clk_disable(void *data) clk_disable_unprepare(data); } +static void mt6779_keypad_calc_row_col_single(unsigned int key, + unsigned int *row, + unsigned int *col) +{ + *row = key / 9; + *col = key % 9; +} + static int mt6779_keypad_pdrv_probe(struct platform_device *pdev) { struct mt6779_keypad *keypad; @@ -148,6 +157,8 @@ static int mt6779_keypad_pdrv_probe(struct platform_device *pdev) return -EINVAL; } + keypad->calc_row_col = mt6779_keypad_calc_row_col_single; + wakeup = device_property_read_bool(&pdev->dev, "wakeup-source"); dev_dbg(&pdev->dev, "n_row=%d n_col=%d debounce=%d\n", -- GitLab From 51c88597517d9625c127f0d8f8f3bf04ef5f8d76 Mon Sep 17 00:00:00 2001 From: Mattijs Korpershoek Date: Tue, 26 Jul 2022 14:56:10 +0200 Subject: [PATCH 0017/2223] Input: mt6779-keypad - support double keys matrix MediaTek keypad has 2 modes of detecting key events: - single 
key: each (row, column) can detect one key - double key: each (row, column) is a group of 2 keys Double key support exists to minimize cost, since it reduces the number of pins required for physical keys. Double key is configured by setting BIT(0) of the KP_SEL register. Enable double key matrix support based on the mediatek,keys-per-group device tree property. Signed-off-by: Mattijs Korpershoek Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20220720-mt8183-keypad-v2-5-6d42c357cb76@baylibre.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/mt6779-keypad.c | 32 +++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/input/keyboard/mt6779-keypad.c b/drivers/input/keyboard/mt6779-keypad.c index 9decdfa685555..a05e70af1fd03 100644 --- a/drivers/input/keyboard/mt6779-keypad.c +++ b/drivers/input/keyboard/mt6779-keypad.c @@ -18,6 +18,7 @@ #define MTK_KPD_DEBOUNCE_MASK GENMASK(13, 0) #define MTK_KPD_DEBOUNCE_MAX_MS 256 #define MTK_KPD_SEL 0x0020 +#define MTK_KPD_SEL_DOUBLE_KP_MODE BIT(0) #define MTK_KPD_SEL_COL GENMASK(15, 10) #define MTK_KPD_SEL_ROW GENMASK(9, 4) #define MTK_KPD_SEL_COLMASK(c) GENMASK((c) + 9, 10) @@ -103,12 +104,21 @@ static void mt6779_keypad_calc_row_col_single(unsigned int key, *col = key % 9; } +static void mt6779_keypad_calc_row_col_double(unsigned int key, + unsigned int *row, + unsigned int *col) +{ + *row = key / 13; + *col = (key % 13) / 2; +} + static int mt6779_keypad_pdrv_probe(struct platform_device *pdev) { struct mt6779_keypad *keypad; void __iomem *base; int irq; u32 debounce; + u32 keys_per_group; bool wakeup; int error; @@ -157,7 +167,22 @@ static int mt6779_keypad_pdrv_probe(struct platform_device *pdev) return -EINVAL; } - keypad->calc_row_col = mt6779_keypad_calc_row_col_single; + if (device_property_read_u32(&pdev->dev, "mediatek,keys-per-group", + &keys_per_group)) + keys_per_group = 1; + + switch (keys_per_group) { + case 1: + keypad->calc_row_col = 
mt6779_keypad_calc_row_col_single; + break; + case 2: + keypad->calc_row_col = mt6779_keypad_calc_row_col_double; + break; + default: + dev_err(&pdev->dev, + "Invalid keys-per-group: %d\n", keys_per_group); + return -EINVAL; + } wakeup = device_property_read_bool(&pdev->dev, "wakeup-source"); @@ -177,6 +202,11 @@ static int mt6779_keypad_pdrv_probe(struct platform_device *pdev) regmap_write(keypad->regmap, MTK_KPD_DEBOUNCE, (debounce * (1 << 5)) & MTK_KPD_DEBOUNCE_MASK); + if (keys_per_group == 2) + regmap_update_bits(keypad->regmap, MTK_KPD_SEL, + MTK_KPD_SEL_DOUBLE_KP_MODE, + MTK_KPD_SEL_DOUBLE_KP_MODE); + regmap_update_bits(keypad->regmap, MTK_KPD_SEL, MTK_KPD_SEL_ROW, MTK_KPD_SEL_ROWMASK(keypad->n_rows)); regmap_update_bits(keypad->regmap, MTK_KPD_SEL, MTK_KPD_SEL_COL, -- GitLab From fb12ad5e3179c9667dfcbafe2c5da91f9e700e53 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 11 Aug 2022 16:11:36 -0700 Subject: [PATCH 0018/2223] Input: bma150 - fix a typo in some comments Remove some extra '0' s/BMA0150_RANGE_xxx/BMA150_RANGE_xxx/ s/BMA0150_BW_xxx/BMA150_BW_xxx Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/a331a6244a1dfbf34dc85f1be6995fa91500c801.1659802757.git.christophe.jaillet@wanadoo.fr Signed-off-by: Dmitry Torokhov --- include/linux/bma150.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/bma150.h b/include/linux/bma150.h index 31c9e323a3913..4d4a62d493419 100644 --- a/include/linux/bma150.h +++ b/include/linux/bma150.h @@ -33,8 +33,8 @@ struct bma150_cfg { unsigned char lg_hyst; /* Low-G hysterisis */ unsigned char lg_dur; /* Low-G duration */ unsigned char lg_thres; /* Low-G threshold */ - unsigned char range; /* one of BMA0150_RANGE_xxx */ - unsigned char bandwidth; /* one of BMA0150_BW_xxx */ + unsigned char range; /* one of BMA150_RANGE_xxx */ + unsigned char bandwidth; /* one of BMA150_BW_xxx */ }; struct bma150_platform_data { -- GitLab From 6a33af349b1b977e8e2298ed131a46316bcb3e62 Mon 
Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 11 Aug 2022 16:11:59 -0700 Subject: [PATCH 0019/2223] Input: tc3589x-keypad - use correct struct names in comment The incorrect structure name is being used in the comment for struct tc3589x_keypad_platform_data. Correct it. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20220805174717.2374416-1-colin.i.king@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/tc3589x-keypad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/keyboard/tc3589x-keypad.c b/drivers/input/keyboard/tc3589x-keypad.c index 89b9575dc75dc..78e55318ccd63 100644 --- a/drivers/input/keyboard/tc3589x-keypad.c +++ b/drivers/input/keyboard/tc3589x-keypad.c @@ -70,7 +70,7 @@ #define TC3589x_KBD_INT_CLR 0x1 /** - * struct tc35893_keypad_platform_data - platform specific keypad data + * struct tc3589x_keypad_platform_data - platform specific keypad data * @keymap_data: matrix scan code table for keycodes * @krow: mask for available rows, value is 0xFF * @kcol: mask for available columns, value is 0xFF -- GitLab From 93e719f661379c014f44bd83b361b1bc49ea7082 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 11 Aug 2022 16:12:37 -0700 Subject: [PATCH 0020/2223] Input: applespi - use correct struct names in comment The incorrect structure name is being used in the comment for struct touchpad_info_protocol. Correct it. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20220805174754.2374473-1-colin.i.king@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/applespi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/keyboard/applespi.c b/drivers/input/keyboard/applespi.c index d1f5354d5ea28..fcf4b55cdf90f 100644 --- a/drivers/input/keyboard/applespi.c +++ b/drivers/input/keyboard/applespi.c @@ -202,7 +202,7 @@ struct command_protocol_tp_info { }; /** - * struct touchpad_info - touchpad info response. 
+ * struct touchpad_info_protocol - touchpad info response. * message.type = 0x1020, message.length = 0x006e * * @unknown1: unknown -- GitLab From 7eac0081a8e958106ed3aea402c8105f30fad6d9 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Wed, 10 Aug 2022 09:59:15 +0100 Subject: [PATCH 0021/2223] riscv: dts: microchip: add qspi compatible fallback The "hard" QSPI peripheral on PolarFire SoC is derived from version 2 of the FPGA IP core. The original binding had no fallback etc, so this device tree is valid as is. There was also no functional driver for the QSPI IP, so no device with a devicetree from a previous mainline release will regress. Link: https://lore.kernel.org/linux-spi/7c9f0d96-2882-964a-cd1f-916ddb3f0410@linaro.org/ Signed-off-by: Conor Dooley Acked-by: Krzysztof Kozlowski --- arch/riscv/boot/dts/microchip/mpfs.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/boot/dts/microchip/mpfs.dtsi b/arch/riscv/boot/dts/microchip/mpfs.dtsi index 499c2e63ad35e..45e3cc6598825 100644 --- a/arch/riscv/boot/dts/microchip/mpfs.dtsi +++ b/arch/riscv/boot/dts/microchip/mpfs.dtsi @@ -330,7 +330,7 @@ }; qspi: spi@21000000 { - compatible = "microchip,mpfs-qspi"; + compatible = "microchip,mpfs-qspi", "microchip,coreqspi-rtl-v2"; #address-cells = <1>; #size-cells = <0>; reg = <0x0 0x21000000 0x0 0x1000>; -- GitLab From 96355be8f0a2a7a91aae2e66c0795a13444db5ba Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Tue, 26 Jul 2022 18:53:15 +0100 Subject: [PATCH 0022/2223] dt-bindings: pinctrl: renesas: Document RZ/Five SoC RZ/Five SoC is pin compatible with RZ/G2UL (Type 1) SoC. This patch updates the comment to include RZ/Five SoC so that we make it clear "renesas,r9a07g043-pinctrl" compatible string will be used for RZ/Five SoC. 
Signed-off-by: Lad Prabhakar Link: https://lore.kernel.org/r/20220726175315.1147-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- .../devicetree/bindings/pinctrl/renesas,rzg2l-pinctrl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pinctrl/renesas,rzg2l-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/renesas,rzg2l-pinctrl.yaml index 997b746391120..f081acb7ba049 100644 --- a/Documentation/devicetree/bindings/pinctrl/renesas,rzg2l-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/renesas,rzg2l-pinctrl.yaml @@ -23,7 +23,7 @@ properties: oneOf: - items: - enum: - - renesas,r9a07g043-pinctrl # RZ/G2UL{Type-1,Type-2} + - renesas,r9a07g043-pinctrl # RZ/G2UL{Type-1,Type-2} and RZ/Five - renesas,r9a07g044-pinctrl # RZ/G2{L,LC} - items: -- GitLab From 152a81a0b1204e9c7f4af0004b5ed7a8d67dd037 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Tue, 26 Jul 2022 18:33:48 -0700 Subject: [PATCH 0023/2223] pinctrl: samsung: Finish initializing the gpios before registering them As soon as a gpio is registered, it should be usable by a consumer. So, do all the initialization before registering the gpios. Without this change, a consumer can request a GPIO IRQ and have the gpio to IRQ mapping fail. 
Signed-off-by: Saravana Kannan Reviewed-by: Sam Protsenko Reviewed-by: Chanho Park Tested-by: Chanho Park Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220727013349.3056826-1-saravanak@google.com --- drivers/pinctrl/samsung/pinctrl-samsung.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pinctrl/samsung/pinctrl-samsung.c b/drivers/pinctrl/samsung/pinctrl-samsung.c index 4837bceb767b4..bd13b5ef246d8 100644 --- a/drivers/pinctrl/samsung/pinctrl-samsung.c +++ b/drivers/pinctrl/samsung/pinctrl-samsung.c @@ -1166,15 +1166,15 @@ static int samsung_pinctrl_probe(struct platform_device *pdev) if (ret) goto err_put_banks; - ret = samsung_gpiolib_register(pdev, drvdata); - if (ret) - goto err_unregister; - if (ctrl->eint_gpio_init) ctrl->eint_gpio_init(drvdata); if (ctrl->eint_wkup_init) ctrl->eint_wkup_init(drvdata); + ret = samsung_gpiolib_register(pdev, drvdata); + if (ret) + goto err_unregister; + platform_set_drvdata(pdev, drvdata); return 0; -- GitLab From bc604fbb49f1a00df34e6755a32e8bf5419eb4cd Mon Sep 17 00:00:00 2001 From: Eddie James Date: Fri, 12 Aug 2022 15:32:27 -0700 Subject: [PATCH 0024/2223] dt-bindings: input: Add documentation for IBM Operation Panel Document the bindings for the IBM Operation Panel, which provides a simple interface to control a server. It has a display and three buttons. Also update MAINTAINERS for the new file. 
Signed-off-by: Eddie James Reviewed-by: Rob Herring Acked-by: Joel Stanley Link: https://lore.kernel.org/r/20220809204147.238132-2-eajames@linux.ibm.com Signed-off-by: Dmitry Torokhov --- .../bindings/input/ibm,op-panel.yaml | 50 +++++++++++++++++++ MAINTAINERS | 6 +++ 2 files changed, 56 insertions(+) create mode 100644 Documentation/devicetree/bindings/input/ibm,op-panel.yaml diff --git a/Documentation/devicetree/bindings/input/ibm,op-panel.yaml b/Documentation/devicetree/bindings/input/ibm,op-panel.yaml new file mode 100644 index 0000000000000..29a1879e356d8 --- /dev/null +++ b/Documentation/devicetree/bindings/input/ibm,op-panel.yaml @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/ibm,op-panel.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: IBM Operation Panel + +maintainers: + - Eddie James + +allOf: + - $ref: input.yaml# + +description: | + The IBM Operation Panel provides a simple interface to control the connected + server. It has a display and three buttons: two directional arrows and one + 'Enter' button. 
+ +properties: + compatible: + const: ibm,op-panel + + reg: + maxItems: 1 + + linux,keycodes: + minItems: 1 + maxItems: 3 + +required: + - compatible + - reg + +additionalProperties: false + +examples: + - | + #include <dt-bindings/i2c/i2c.h> + #include <dt-bindings/input/input.h> + i2c { + #address-cells = <1>; + #size-cells = <0>; + + ibm-op-panel@62 { + compatible = "ibm,op-panel"; + reg = <(0x62 | I2C_OWN_SLAVE_ADDRESS)>; + linux,keycodes = <KEY_UP>, <KEY_DOWN>, <KEY_ENTER>; + }; + }; diff --git a/MAINTAINERS b/MAINTAINERS index 2910c5e50ac02..084a8728953ae 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9643,6 +9643,12 @@ S: Orphan F: Documentation/ia64/ F: arch/ia64/ +IBM Operation Panel Input Driver +M: Eddie James +L: linux-input@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/input/ibm,op-panel.yaml + IBM Power 842 compression accelerator M: Haren Myneni S: Supported -- GitLab From 2e6f34faa7e0158b8eb432b44082bac23c63f8bf Mon Sep 17 00:00:00 2001 From: Eddie James Date: Fri, 12 Aug 2022 15:32:39 -0700 Subject: [PATCH 0025/2223] Input: Add IBM Operation Panel driver Add a driver to get the button events from the panel and provide them to userspace with the input subsystem. The panel is connected with I2C and controls the bus, so the driver registers as an I2C slave device.
Signed-off-by: Eddie James Reviewed-by: Joel Stanley Reviewed-by: Wolfram Sang # I2C slave parts Link: https://lore.kernel.org/r/20220809204147.238132-3-eajames@linux.ibm.com Signed-off-by: Dmitry Torokhov --- MAINTAINERS | 1 + drivers/input/misc/Kconfig | 18 +++ drivers/input/misc/Makefile | 1 + drivers/input/misc/ibm-panel.c | 199 +++++++++++++++++++++++++++++++++ 4 files changed, 219 insertions(+) create mode 100644 drivers/input/misc/ibm-panel.c diff --git a/MAINTAINERS b/MAINTAINERS index 084a8728953ae..711bcd4f6269b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9648,6 +9648,7 @@ M: Eddie James L: linux-input@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/input/ibm,op-panel.yaml +F: drivers/input/misc/ibm-panel.c IBM Power 842 compression accelerator M: Haren Myneni diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig index a18ab7358d8f3..968240288c61c 100644 --- a/drivers/input/misc/Kconfig +++ b/drivers/input/misc/Kconfig @@ -730,6 +730,24 @@ config INPUT_ADXL34X_SPI To compile this driver as a module, choose M here: the module will be called adxl34x-spi. +config INPUT_IBM_PANEL + tristate "IBM Operation Panel driver" + depends on I2C && I2C_SLAVE + help + Say Y here if you have an IBM Operation Panel connected to your system + over I2C. The panel is typically connected only to a system's service + processor (BMC). + + If unsure, say N. + + The Operation Panel is a controller with some buttons and an LCD + display that allows someone with physical access to the system to + perform various administrative tasks. This driver only supports the part + of the controller that sends commands to the system. + + To compile this driver as a module, choose M here: the module will be + called ibm-panel. 
+ config INPUT_IMS_PCU tristate "IMS Passenger Control Unit driver" depends on USB diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile index 28dfc444f0a96..9eea13e98d480 100644 --- a/drivers/input/misc/Makefile +++ b/drivers/input/misc/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_INPUT_GPIO_DECODER) += gpio_decoder.o obj-$(CONFIG_INPUT_GPIO_VIBRA) += gpio-vibra.o obj-$(CONFIG_INPUT_HISI_POWERKEY) += hisi_powerkey.o obj-$(CONFIG_HP_SDC_RTC) += hp_sdc_rtc.o +obj-$(CONFIG_INPUT_IBM_PANEL) += ibm-panel.o obj-$(CONFIG_INPUT_IMS_PCU) += ims-pcu.o obj-$(CONFIG_INPUT_IQS269A) += iqs269a.o obj-$(CONFIG_INPUT_IQS626A) += iqs626a.o diff --git a/drivers/input/misc/ibm-panel.c b/drivers/input/misc/ibm-panel.c new file mode 100644 index 0000000000000..094bcdb568f13 --- /dev/null +++ b/drivers/input/misc/ibm-panel.c @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) IBM Corporation 2020 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEVICE_NAME "ibm-panel" +#define PANEL_KEYCODES_COUNT 3 + +struct ibm_panel { + u8 idx; + u8 command[11]; + u32 keycodes[PANEL_KEYCODES_COUNT]; + spinlock_t lock; /* protects writes to idx and command */ + struct input_dev *input; +}; + +static u8 ibm_panel_calculate_checksum(struct ibm_panel *panel) +{ + u8 chksum; + u16 sum = 0; + unsigned int i; + + for (i = 0; i < sizeof(panel->command) - 1; ++i) { + sum += panel->command[i]; + if (sum & 0xff00) { + sum &= 0xff; + sum++; + } + } + + chksum = sum & 0xff; + chksum = ~chksum; + chksum++; + + return chksum; +} + +static void ibm_panel_process_command(struct ibm_panel *panel) +{ + u8 button; + u8 chksum; + + if (panel->command[0] != 0xff && panel->command[1] != 0xf0) { + dev_dbg(&panel->input->dev, "command invalid: %02x %02x\n", + panel->command[0], panel->command[1]); + return; + } + + chksum = ibm_panel_calculate_checksum(panel); + if (chksum != panel->command[sizeof(panel->command) - 1]) { + 
dev_dbg(&panel->input->dev, + "command failed checksum: %u != %u\n", chksum, + panel->command[sizeof(panel->command) - 1]); + return; + } + + button = panel->command[2] & 0xf; + if (button < PANEL_KEYCODES_COUNT) { + input_report_key(panel->input, panel->keycodes[button], + !(panel->command[2] & 0x80)); + input_sync(panel->input); + } else { + dev_dbg(&panel->input->dev, "unknown button %u\n", + button); + } +} + +static int ibm_panel_i2c_slave_cb(struct i2c_client *client, + enum i2c_slave_event event, u8 *val) +{ + unsigned long flags; + struct ibm_panel *panel = i2c_get_clientdata(client); + + dev_dbg(&panel->input->dev, "event: %u data: %02x\n", event, *val); + + spin_lock_irqsave(&panel->lock, flags); + + switch (event) { + case I2C_SLAVE_STOP: + if (panel->idx == sizeof(panel->command)) + ibm_panel_process_command(panel); + else + dev_dbg(&panel->input->dev, + "command incorrect size %u\n", panel->idx); + fallthrough; + case I2C_SLAVE_WRITE_REQUESTED: + panel->idx = 0; + break; + case I2C_SLAVE_WRITE_RECEIVED: + if (panel->idx < sizeof(panel->command)) + panel->command[panel->idx++] = *val; + else + /* + * The command is too long and therefore invalid, so set the index + * to its largest possible value. When a STOP is finally received, + * the command will be rejected upon processing.
+ */ + panel->idx = U8_MAX; + break; + case I2C_SLAVE_READ_REQUESTED: + case I2C_SLAVE_READ_PROCESSED: + *val = 0xff; + break; + default: + break; + } + + spin_unlock_irqrestore(&panel->lock, flags); + + return 0; +} + +static int ibm_panel_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct ibm_panel *panel; + int i; + int error; + + panel = devm_kzalloc(&client->dev, sizeof(*panel), GFP_KERNEL); + if (!panel) + return -ENOMEM; + + spin_lock_init(&panel->lock); + + panel->input = devm_input_allocate_device(&client->dev); + if (!panel->input) + return -ENOMEM; + + panel->input->name = client->name; + panel->input->id.bustype = BUS_I2C; + + error = device_property_read_u32_array(&client->dev, + "linux,keycodes", + panel->keycodes, + PANEL_KEYCODES_COUNT); + if (error) { + /* + * Use gamepad buttons as defaults for compatibility with + * existing applications. + */ + panel->keycodes[0] = BTN_NORTH; + panel->keycodes[1] = BTN_SOUTH; + panel->keycodes[2] = BTN_SELECT; + } + + for (i = 0; i < PANEL_KEYCODES_COUNT; ++i) + input_set_capability(panel->input, EV_KEY, panel->keycodes[i]); + + error = input_register_device(panel->input); + if (error) { + dev_err(&client->dev, + "Failed to register input device: %d\n", error); + return error; + } + + i2c_set_clientdata(client, panel); + error = i2c_slave_register(client, ibm_panel_i2c_slave_cb); + if (error) { + dev_err(&client->dev, + "Failed to register as i2c slave: %d\n", error); + return error; + } + + return 0; +} + +static void ibm_panel_remove(struct i2c_client *client) +{ + i2c_slave_unregister(client); +} + +static const struct of_device_id ibm_panel_match[] = { + { .compatible = "ibm,op-panel" }, + { } +}; + +static struct i2c_driver ibm_panel_driver = { + .driver = { + .name = DEVICE_NAME, + .of_match_table = ibm_panel_match, + }, + .probe = ibm_panel_probe, + .remove = ibm_panel_remove, +}; +module_i2c_driver(ibm_panel_driver); + +MODULE_AUTHOR("Eddie James "); +MODULE_DESCRIPTION("IBM 
Operation Panel Driver"); +MODULE_LICENSE("GPL"); -- GitLab From c42a5ff530a7e2122ed8fb576ddf16a18730ef05 Mon Sep 17 00:00:00 2001 From: Chris Morgan Date: Tue, 16 Aug 2022 14:16:36 -0700 Subject: [PATCH 0026/2223] dt-bindings: adc-joystick: add poll-interval Add poll-interval support for the adc-joystick documentation. This is an optional value and if not provided the adc-joystick works as it does today (with buffers). If this value is provided, the adc-joystick driver is polled at the specified interval. The existing attribute of "poll-interval" was used instead of complying with property-units.yaml after discussion of the issue on the mailing list. Signed-off-by: Maya Matuszczyk Signed-off-by: Chris Morgan Reviewed-by: Rob Herring Acked-by: Artur Rojek Link: https://lore.kernel.org/r/20220816210440.14260-2-macroalpha82@gmail.com Signed-off-by: Dmitry Torokhov --- Documentation/devicetree/bindings/input/adc-joystick.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/input/adc-joystick.yaml b/Documentation/devicetree/bindings/input/adc-joystick.yaml index 64d961458ac7c..da0f8dfca8bfd 100644 --- a/Documentation/devicetree/bindings/input/adc-joystick.yaml +++ b/Documentation/devicetree/bindings/input/adc-joystick.yaml @@ -14,6 +14,9 @@ description: > Bindings for joystick devices connected to ADC controllers supporting the Industrial I/O subsystem. +allOf: + - $ref: input.yaml# + properties: compatible: const: adc-joystick @@ -28,6 +31,8 @@ properties: https://github.com/devicetree-org/dt-schema/blob/master/schemas/iio/iio-consumer.yaml for details. + poll-interval: true + '#address-cells': const: 1 -- GitLab From 24c06e000e8fa237ff2d960def0768a47d0db7b1 Mon Sep 17 00:00:00 2001 From: Chris Morgan Date: Tue, 16 Aug 2022 14:16:54 -0700 Subject: [PATCH 0027/2223] Input: adc-joystick - add polled input device support Add polled input device support to the adc-joystick driver. 
This is useful for devices which do not have hardware capable triggers on their SARADC. Code modified from adc-joystick.c changes made by Maya Matuszczyk. Signed-off-by: Maya Matuszczyk Signed-off-by: Chris Morgan Link: https://lore.kernel.org/r/20220816210440.14260-3-macroalpha82@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/adc-joystick.c | 65 ++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 12 deletions(-) diff --git a/drivers/input/joystick/adc-joystick.c b/drivers/input/joystick/adc-joystick.c index e0cfdc84763f4..c0deff5d42824 100644 --- a/drivers/input/joystick/adc-joystick.c +++ b/drivers/input/joystick/adc-joystick.c @@ -26,8 +26,23 @@ struct adc_joystick { struct adc_joystick_axis *axes; struct iio_channel *chans; int num_chans; + bool polled; }; +static void adc_joystick_poll(struct input_dev *input) +{ + struct adc_joystick *joy = input_get_drvdata(input); + int i, val, ret; + + for (i = 0; i < joy->num_chans; i++) { + ret = iio_read_channel_raw(&joy->chans[i], &val); + if (ret < 0) + return; + input_report_abs(input, joy->axes[i].code, val); + } + input_sync(input); +} + static int adc_joystick_handle(const void *data, void *private) { struct adc_joystick *joy = private; @@ -179,6 +194,7 @@ static int adc_joystick_probe(struct platform_device *pdev) int error; int bits; int i; + unsigned int poll_interval; joy = devm_kzalloc(dev, sizeof(*joy), GFP_KERNEL); if (!joy) @@ -192,8 +208,25 @@ static int adc_joystick_probe(struct platform_device *pdev) return error; } - /* Count how many channels we got. NULL terminated. */ + error = device_property_read_u32(dev, "poll-interval", &poll_interval); + if (error) { + /* -EINVAL means the property is absent. */ + if (error != -EINVAL) + return error; + } else if (poll_interval == 0) { + dev_err(dev, "Unable to get poll-interval\n"); + return -EINVAL; + } else { + joy->polled = true; + } + + /* + * Count how many channels we got. NULL terminated. 
+ * Do not check the storage size if using polling. + */ for (i = 0; joy->chans[i].indio_dev; i++) { + if (joy->polled) + continue; bits = joy->chans[i].channel->scan_type.storagebits; if (!bits || bits > 16) { dev_err(dev, "Unsupported channel storage size\n"); @@ -215,23 +248,31 @@ static int adc_joystick_probe(struct platform_device *pdev) joy->input = input; input->name = pdev->name; input->id.bustype = BUS_HOST; - input->open = adc_joystick_open; - input->close = adc_joystick_close; error = adc_joystick_set_axes(dev, joy); if (error) return error; - joy->buffer = iio_channel_get_all_cb(dev, adc_joystick_handle, joy); - if (IS_ERR(joy->buffer)) { - dev_err(dev, "Unable to allocate callback buffer\n"); - return PTR_ERR(joy->buffer); - } + if (joy->polled) { + input_setup_polling(input, adc_joystick_poll); + input_set_poll_interval(input, poll_interval); + } else { + input->open = adc_joystick_open; + input->close = adc_joystick_close; + + joy->buffer = iio_channel_get_all_cb(dev, adc_joystick_handle, + joy); + if (IS_ERR(joy->buffer)) { + dev_err(dev, "Unable to allocate callback buffer\n"); + return PTR_ERR(joy->buffer); + } - error = devm_add_action_or_reset(dev, adc_joystick_cleanup, joy->buffer); - if (error) { - dev_err(dev, "Unable to add action\n"); - return error; + error = devm_add_action_or_reset(dev, adc_joystick_cleanup, + joy->buffer); + if (error) { + dev_err(dev, "Unable to add action\n"); + return error; + } } input_set_drvdata(input, joy); -- GitLab From a9f08ad7adb3d2f90e11efbb40a1246ef95b0c04 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 15:05:06 -0700 Subject: [PATCH 0028/2223] Input: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. 
Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20220818210022.6865-1-wsa+renesas@sang-engineering.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/lkkbd.c | 8 ++++---- drivers/input/misc/keyspan_remote.c | 2 +- drivers/input/mouse/hgpk.c | 2 +- drivers/input/mouse/synaptics.c | 4 ++-- drivers/input/mouse/synaptics_usb.c | 2 +- drivers/input/mouse/vsxxxaa.c | 4 ++-- drivers/input/rmi4/rmi_f03.c | 2 +- drivers/input/rmi4/rmi_f54.c | 8 ++++---- drivers/input/serio/altera_ps2.c | 4 ++-- drivers/input/serio/ambakmi.c | 4 ++-- drivers/input/serio/ams_delta_serio.c | 4 ++-- drivers/input/serio/apbps2.c | 2 +- drivers/input/serio/ct82c710.c | 2 +- drivers/input/serio/gscps2.c | 2 +- drivers/input/serio/hyperv-keyboard.c | 4 ++-- drivers/input/serio/i8042-x86ia64io.h | 6 +++--- drivers/input/serio/i8042.c | 14 +++++++------- drivers/input/serio/olpc_apsp.c | 8 ++++---- drivers/input/serio/parkbd.c | 2 +- drivers/input/serio/pcips2.c | 4 ++-- drivers/input/serio/ps2-gpio.c | 4 ++-- drivers/input/serio/ps2mult.c | 2 +- drivers/input/serio/q40kbd.c | 4 ++-- drivers/input/serio/rpckbd.c | 4 ++-- drivers/input/serio/sa1111ps2.c | 4 ++-- drivers/input/serio/serport.c | 2 +- drivers/input/serio/sun4i-ps2.c | 4 ++-- drivers/input/tablet/acecad.c | 2 +- drivers/input/tablet/hanwang.c | 2 +- drivers/input/tablet/pegasus_notetaker.c | 2 +- drivers/input/touchscreen/atmel_mxt_ts.c | 8 ++++---- drivers/input/touchscreen/edt-ft5x06.c | 12 ++++++------ drivers/input/touchscreen/sur40.c | 6 +++--- drivers/input/touchscreen/usbtouchscreen.c | 2 +- drivers/input/touchscreen/wacom_w8001.c | 6 +++--- 35 files changed, 76 insertions(+), 76 deletions(-) diff --git a/drivers/input/keyboard/lkkbd.c b/drivers/input/keyboard/lkkbd.c index e4a1839ca934a..ea9a1d8834c1c 100644 --- a/drivers/input/keyboard/lkkbd.c +++ b/drivers/input/keyboard/lkkbd.c @@ -359,18 +359,18 @@ static void lkkbd_detection_done(struct lkkbd *lk) */ switch (lk->id[4]) { case 1: - strlcpy(lk->name, "DEC 
LK201 keyboard", sizeof(lk->name)); + strscpy(lk->name, "DEC LK201 keyboard", sizeof(lk->name)); if (lk201_compose_is_alt) lk->keycode[0xb1] = KEY_LEFTALT; break; case 2: - strlcpy(lk->name, "DEC LK401 keyboard", sizeof(lk->name)); + strscpy(lk->name, "DEC LK401 keyboard", sizeof(lk->name)); break; default: - strlcpy(lk->name, "Unknown DEC keyboard", sizeof(lk->name)); + strscpy(lk->name, "Unknown DEC keyboard", sizeof(lk->name)); printk(KERN_ERR "lkkbd: keyboard on %s is unknown, please report to " "Jan-Benedict Glaw \n", lk->phys); @@ -626,7 +626,7 @@ static int lkkbd_connect(struct serio *serio, struct serio_driver *drv) lk->ctrlclick_volume = ctrlclick_volume; memcpy(lk->keycode, lkkbd_keycode, sizeof(lk->keycode)); - strlcpy(lk->name, "DEC LK keyboard", sizeof(lk->name)); + strscpy(lk->name, "DEC LK keyboard", sizeof(lk->name)); snprintf(lk->phys, sizeof(lk->phys), "%s/input0", serio->phys); input_dev->name = lk->name; diff --git a/drivers/input/misc/keyspan_remote.c b/drivers/input/misc/keyspan_remote.c index 4650f4a949890..bee4b13764914 100644 --- a/drivers/input/misc/keyspan_remote.c +++ b/drivers/input/misc/keyspan_remote.c @@ -485,7 +485,7 @@ static int keyspan_probe(struct usb_interface *interface, const struct usb_devic } if (udev->manufacturer) - strlcpy(remote->name, udev->manufacturer, sizeof(remote->name)); + strscpy(remote->name, udev->manufacturer, sizeof(remote->name)); if (udev->product) { if (udev->manufacturer) diff --git a/drivers/input/mouse/hgpk.c b/drivers/input/mouse/hgpk.c index 4dc441309aacf..523b26a117d6c 100644 --- a/drivers/input/mouse/hgpk.c +++ b/drivers/input/mouse/hgpk.c @@ -1057,7 +1057,7 @@ void hgpk_module_init(void) strlen(hgpk_mode_name)); if (hgpk_default_mode == HGPK_MODE_INVALID) { hgpk_default_mode = HGPK_MODE_MOUSE; - strlcpy(hgpk_mode_name, hgpk_mode_names[HGPK_MODE_MOUSE], + strscpy(hgpk_mode_name, hgpk_mode_names[HGPK_MODE_MOUSE], sizeof(hgpk_mode_name)); } } diff --git a/drivers/input/mouse/synaptics.c 
b/drivers/input/mouse/synaptics.c index 434d48ae4b12e..e3f657713b557 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -715,8 +715,8 @@ static void synaptics_pt_create(struct psmouse *psmouse) } serio->id.type = SERIO_PS_PSTHRU; - strlcpy(serio->name, "Synaptics pass-through", sizeof(serio->name)); - strlcpy(serio->phys, "synaptics-pt/serio0", sizeof(serio->phys)); + strscpy(serio->name, "Synaptics pass-through", sizeof(serio->name)); + strscpy(serio->phys, "synaptics-pt/serio0", sizeof(serio->phys)); serio->write = synaptics_pt_write; serio->start = synaptics_pt_start; serio->stop = synaptics_pt_stop; diff --git a/drivers/input/mouse/synaptics_usb.c b/drivers/input/mouse/synaptics_usb.c index b5ff27e32a0c8..75e45f3ae675c 100644 --- a/drivers/input/mouse/synaptics_usb.c +++ b/drivers/input/mouse/synaptics_usb.c @@ -354,7 +354,7 @@ static int synusb_probe(struct usb_interface *intf, synusb->urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; if (udev->manufacturer) - strlcpy(synusb->name, udev->manufacturer, + strscpy(synusb->name, udev->manufacturer, sizeof(synusb->name)); if (udev->product) { diff --git a/drivers/input/mouse/vsxxxaa.c b/drivers/input/mouse/vsxxxaa.c index bd415f4b574e5..3bd6e723a4220 100644 --- a/drivers/input/mouse/vsxxxaa.c +++ b/drivers/input/mouse/vsxxxaa.c @@ -138,12 +138,12 @@ static void vsxxxaa_detection_done(struct vsxxxaa *mouse) { switch (mouse->type) { case 0x02: - strlcpy(mouse->name, "DEC VSXXX-AA/-GA mouse", + strscpy(mouse->name, "DEC VSXXX-AA/-GA mouse", sizeof(mouse->name)); break; case 0x04: - strlcpy(mouse->name, "DEC VSXXX-AB digitizer", + strscpy(mouse->name, "DEC VSXXX-AB digitizer", sizeof(mouse->name)); break; diff --git a/drivers/input/rmi4/rmi_f03.c b/drivers/input/rmi4/rmi_f03.c index c194b1664b108..1e11ea30d7bdb 100644 --- a/drivers/input/rmi4/rmi_f03.c +++ b/drivers/input/rmi4/rmi_f03.c @@ -181,7 +181,7 @@ static int rmi_f03_register_pt(struct f03_data *f03) serio->close = 
rmi_f03_pt_close; serio->port_data = f03; - strlcpy(serio->name, "RMI4 PS/2 pass-through", sizeof(serio->name)); + strscpy(serio->name, "RMI4 PS/2 pass-through", sizeof(serio->name)); snprintf(serio->phys, sizeof(serio->phys), "%s/serio0", dev_name(&f03->fn->dev)); serio->dev.parent = &f03->fn->dev; diff --git a/drivers/input/rmi4/rmi_f54.c b/drivers/input/rmi4/rmi_f54.c index c5ce907535ef9..5c3da910b5b2c 100644 --- a/drivers/input/rmi4/rmi_f54.c +++ b/drivers/input/rmi4/rmi_f54.c @@ -390,8 +390,8 @@ static int rmi_f54_vidioc_querycap(struct file *file, void *priv, { struct f54_data *f54 = video_drvdata(file); - strlcpy(cap->driver, F54_NAME, sizeof(cap->driver)); - strlcpy(cap->card, SYNAPTICS_INPUT_DEVICE_NAME, sizeof(cap->card)); + strscpy(cap->driver, F54_NAME, sizeof(cap->driver)); + strscpy(cap->card, SYNAPTICS_INPUT_DEVICE_NAME, sizeof(cap->card)); snprintf(cap->bus_info, sizeof(cap->bus_info), "rmi4:%s", dev_name(&f54->fn->dev)); @@ -410,7 +410,7 @@ static int rmi_f54_vidioc_enum_input(struct file *file, void *priv, i->type = V4L2_INPUT_TYPE_TOUCH; - strlcpy(i->name, rmi_f54_report_type_names[reptype], sizeof(i->name)); + strscpy(i->name, rmi_f54_report_type_names[reptype], sizeof(i->name)); return 0; } @@ -696,7 +696,7 @@ static int rmi_f54_probe(struct rmi_function *fn) rmi_f54_set_input(f54, 0); /* register video device */ - strlcpy(f54->v4l2.name, F54_NAME, sizeof(f54->v4l2.name)); + strscpy(f54->v4l2.name, F54_NAME, sizeof(f54->v4l2.name)); ret = v4l2_device_register(&fn->dev, &f54->v4l2); if (ret) { dev_err(&fn->dev, "Unable to register video dev.\n"); diff --git a/drivers/input/serio/altera_ps2.c b/drivers/input/serio/altera_ps2.c index 379e9240c2b33..3a92304f64fb3 100644 --- a/drivers/input/serio/altera_ps2.c +++ b/drivers/input/serio/altera_ps2.c @@ -110,8 +110,8 @@ static int altera_ps2_probe(struct platform_device *pdev) serio->write = altera_ps2_write; serio->open = altera_ps2_open; serio->close = altera_ps2_close; - strlcpy(serio->name, 
dev_name(&pdev->dev), sizeof(serio->name)); - strlcpy(serio->phys, dev_name(&pdev->dev), sizeof(serio->phys)); + strscpy(serio->name, dev_name(&pdev->dev), sizeof(serio->name)); + strscpy(serio->phys, dev_name(&pdev->dev), sizeof(serio->phys)); serio->port_data = ps2if; serio->dev.parent = &pdev->dev; ps2if->io = serio; diff --git a/drivers/input/serio/ambakmi.c b/drivers/input/serio/ambakmi.c index 4408245b61d2c..c391700fc4ae2 100644 --- a/drivers/input/serio/ambakmi.c +++ b/drivers/input/serio/ambakmi.c @@ -126,8 +126,8 @@ static int amba_kmi_probe(struct amba_device *dev, io->write = amba_kmi_write; io->open = amba_kmi_open; io->close = amba_kmi_close; - strlcpy(io->name, dev_name(&dev->dev), sizeof(io->name)); - strlcpy(io->phys, dev_name(&dev->dev), sizeof(io->phys)); + strscpy(io->name, dev_name(&dev->dev), sizeof(io->name)); + strscpy(io->phys, dev_name(&dev->dev), sizeof(io->phys)); io->port_data = kmi; io->dev.parent = &dev->dev; diff --git a/drivers/input/serio/ams_delta_serio.c b/drivers/input/serio/ams_delta_serio.c index 1c0be299f1790..ec93cb4573c3e 100644 --- a/drivers/input/serio/ams_delta_serio.c +++ b/drivers/input/serio/ams_delta_serio.c @@ -159,8 +159,8 @@ static int ams_delta_serio_init(struct platform_device *pdev) serio->id.type = SERIO_8042; serio->open = ams_delta_serio_open; serio->close = ams_delta_serio_close; - strlcpy(serio->name, "AMS DELTA keyboard adapter", sizeof(serio->name)); - strlcpy(serio->phys, dev_name(&pdev->dev), sizeof(serio->phys)); + strscpy(serio->name, "AMS DELTA keyboard adapter", sizeof(serio->name)); + strscpy(serio->phys, dev_name(&pdev->dev), sizeof(serio->phys)); serio->dev.parent = &pdev->dev; serio->port_data = priv; diff --git a/drivers/input/serio/apbps2.c b/drivers/input/serio/apbps2.c index 974d7bfae0a03..9c9ce097f8bf1 100644 --- a/drivers/input/serio/apbps2.c +++ b/drivers/input/serio/apbps2.c @@ -176,7 +176,7 @@ static int apbps2_of_probe(struct platform_device *ofdev) priv->io->close = apbps2_close; 
priv->io->write = apbps2_write; priv->io->port_data = priv; - strlcpy(priv->io->name, "APBPS2 PS/2", sizeof(priv->io->name)); + strscpy(priv->io->name, "APBPS2 PS/2", sizeof(priv->io->name)); snprintf(priv->io->phys, sizeof(priv->io->phys), "apbps2_%d", apbps2_idx++); diff --git a/drivers/input/serio/ct82c710.c b/drivers/input/serio/ct82c710.c index d45009d654bf7..752ce60e22116 100644 --- a/drivers/input/serio/ct82c710.c +++ b/drivers/input/serio/ct82c710.c @@ -170,7 +170,7 @@ static int ct82c710_probe(struct platform_device *dev) ct82c710_port->open = ct82c710_open; ct82c710_port->close = ct82c710_close; ct82c710_port->write = ct82c710_write; - strlcpy(ct82c710_port->name, "C&T 82c710 mouse port", + strscpy(ct82c710_port->name, "C&T 82c710 mouse port", sizeof(ct82c710_port->name)); snprintf(ct82c710_port->phys, sizeof(ct82c710_port->phys), "isa%16llx/serio0", (unsigned long long)CT82C710_DATA); diff --git a/drivers/input/serio/gscps2.c b/drivers/input/serio/gscps2.c index da2c67cb86422..633c7de49d671 100644 --- a/drivers/input/serio/gscps2.c +++ b/drivers/input/serio/gscps2.c @@ -361,7 +361,7 @@ static int __init gscps2_probe(struct parisc_device *dev) snprintf(serio->name, sizeof(serio->name), "gsc-ps2-%s", (ps2port->id == GSC_ID_KEYBOARD) ? 
"keyboard" : "mouse"); - strlcpy(serio->phys, dev_name(&dev->dev), sizeof(serio->phys)); + strscpy(serio->phys, dev_name(&dev->dev), sizeof(serio->phys)); serio->id.type = SERIO_8042; serio->write = gscps2_write; serio->open = gscps2_open; diff --git a/drivers/input/serio/hyperv-keyboard.c b/drivers/input/serio/hyperv-keyboard.c index 1a7b72a9016d7..d62aefb2e2451 100644 --- a/drivers/input/serio/hyperv-keyboard.c +++ b/drivers/input/serio/hyperv-keyboard.c @@ -334,9 +334,9 @@ static int hv_kbd_probe(struct hv_device *hv_dev, hv_serio->dev.parent = &hv_dev->device; hv_serio->id.type = SERIO_8042_XL; hv_serio->port_data = kbd_dev; - strlcpy(hv_serio->name, dev_name(&hv_dev->device), + strscpy(hv_serio->name, dev_name(&hv_dev->device), sizeof(hv_serio->name)); - strlcpy(hv_serio->phys, dev_name(&hv_dev->device), + strscpy(hv_serio->phys, dev_name(&hv_dev->device), sizeof(hv_serio->phys)); hv_serio->start = hv_kbd_start; diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-x86ia64io.h index 4fbec7bbeccaa..732b7a6b315d6 100644 --- a/drivers/input/serio/i8042-x86ia64io.h +++ b/drivers/input/serio/i8042-x86ia64io.h @@ -1300,7 +1300,7 @@ static char i8042_pnp_aux_name[32]; static void i8042_pnp_id_to_string(struct pnp_id *id, char *dst, int dst_size) { - strlcpy(dst, "PNP:", dst_size); + strscpy(dst, "PNP:", dst_size); while (id) { strlcat(dst, " ", dst_size); @@ -1320,7 +1320,7 @@ static int i8042_pnp_kbd_probe(struct pnp_dev *dev, const struct pnp_device_id * if (pnp_irq_valid(dev,0)) i8042_pnp_kbd_irq = pnp_irq(dev, 0); - strlcpy(i8042_pnp_kbd_name, did->id, sizeof(i8042_pnp_kbd_name)); + strscpy(i8042_pnp_kbd_name, did->id, sizeof(i8042_pnp_kbd_name)); if (strlen(pnp_dev_name(dev))) { strlcat(i8042_pnp_kbd_name, ":", sizeof(i8042_pnp_kbd_name)); strlcat(i8042_pnp_kbd_name, pnp_dev_name(dev), sizeof(i8042_pnp_kbd_name)); @@ -1347,7 +1347,7 @@ static int i8042_pnp_aux_probe(struct pnp_dev *dev, const struct pnp_device_id * if (pnp_irq_valid(dev, 
0)) i8042_pnp_aux_irq = pnp_irq(dev, 0); - strlcpy(i8042_pnp_aux_name, did->id, sizeof(i8042_pnp_aux_name)); + strscpy(i8042_pnp_aux_name, did->id, sizeof(i8042_pnp_aux_name)); if (strlen(pnp_dev_name(dev))) { strlcat(i8042_pnp_aux_name, ":", sizeof(i8042_pnp_aux_name)); strlcat(i8042_pnp_aux_name, pnp_dev_name(dev), sizeof(i8042_pnp_aux_name)); diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c index 3fc0a89cc785c..f9486495baefa 100644 --- a/drivers/input/serio/i8042.c +++ b/drivers/input/serio/i8042.c @@ -1341,9 +1341,9 @@ static int i8042_create_kbd_port(void) serio->ps2_cmd_mutex = &i8042_mutex; serio->port_data = port; serio->dev.parent = &i8042_platform_device->dev; - strlcpy(serio->name, "i8042 KBD port", sizeof(serio->name)); - strlcpy(serio->phys, I8042_KBD_PHYS_DESC, sizeof(serio->phys)); - strlcpy(serio->firmware_id, i8042_kbd_firmware_id, + strscpy(serio->name, "i8042 KBD port", sizeof(serio->name)); + strscpy(serio->phys, I8042_KBD_PHYS_DESC, sizeof(serio->phys)); + strscpy(serio->firmware_id, i8042_kbd_firmware_id, sizeof(serio->firmware_id)); set_primary_fwnode(&serio->dev, i8042_kbd_fwnode); @@ -1371,15 +1371,15 @@ static int i8042_create_aux_port(int idx) serio->port_data = port; serio->dev.parent = &i8042_platform_device->dev; if (idx < 0) { - strlcpy(serio->name, "i8042 AUX port", sizeof(serio->name)); - strlcpy(serio->phys, I8042_AUX_PHYS_DESC, sizeof(serio->phys)); - strlcpy(serio->firmware_id, i8042_aux_firmware_id, + strscpy(serio->name, "i8042 AUX port", sizeof(serio->name)); + strscpy(serio->phys, I8042_AUX_PHYS_DESC, sizeof(serio->phys)); + strscpy(serio->firmware_id, i8042_aux_firmware_id, sizeof(serio->firmware_id)); serio->close = i8042_port_close; } else { snprintf(serio->name, sizeof(serio->name), "i8042 AUX%d port", idx); snprintf(serio->phys, sizeof(serio->phys), I8042_MUX_PHYS_DESC, idx + 1); - strlcpy(serio->firmware_id, i8042_aux_firmware_id, + strscpy(serio->firmware_id, i8042_aux_firmware_id, 
sizeof(serio->firmware_id)); } diff --git a/drivers/input/serio/olpc_apsp.c b/drivers/input/serio/olpc_apsp.c index 59de8d9b6710e..04d2db982fb80 100644 --- a/drivers/input/serio/olpc_apsp.c +++ b/drivers/input/serio/olpc_apsp.c @@ -199,8 +199,8 @@ static int olpc_apsp_probe(struct platform_device *pdev) kb_serio->close = olpc_apsp_close; kb_serio->port_data = priv; kb_serio->dev.parent = &pdev->dev; - strlcpy(kb_serio->name, "sp keyboard", sizeof(kb_serio->name)); - strlcpy(kb_serio->phys, "sp/serio0", sizeof(kb_serio->phys)); + strscpy(kb_serio->name, "sp keyboard", sizeof(kb_serio->name)); + strscpy(kb_serio->phys, "sp/serio0", sizeof(kb_serio->phys)); priv->kbio = kb_serio; serio_register_port(kb_serio); @@ -216,8 +216,8 @@ static int olpc_apsp_probe(struct platform_device *pdev) pad_serio->close = olpc_apsp_close; pad_serio->port_data = priv; pad_serio->dev.parent = &pdev->dev; - strlcpy(pad_serio->name, "sp touchpad", sizeof(pad_serio->name)); - strlcpy(pad_serio->phys, "sp/serio1", sizeof(pad_serio->phys)); + strscpy(pad_serio->name, "sp touchpad", sizeof(pad_serio->name)); + strscpy(pad_serio->phys, "sp/serio1", sizeof(pad_serio->phys)); priv->padio = pad_serio; serio_register_port(pad_serio); diff --git a/drivers/input/serio/parkbd.c b/drivers/input/serio/parkbd.c index 51b68501896c5..0d54895428f5d 100644 --- a/drivers/input/serio/parkbd.c +++ b/drivers/input/serio/parkbd.c @@ -169,7 +169,7 @@ static struct serio *parkbd_allocate_serio(void) if (serio) { serio->id.type = parkbd_mode; serio->write = parkbd_write; - strlcpy(serio->name, "PARKBD AT/XT keyboard adapter", sizeof(serio->name)); + strscpy(serio->name, "PARKBD AT/XT keyboard adapter", sizeof(serio->name)); snprintf(serio->phys, sizeof(serio->phys), "%s/serio0", parkbd_dev->port->name); } diff --git a/drivers/input/serio/pcips2.c b/drivers/input/serio/pcips2.c index bedf75de0a2ce..05878750f2c2d 100644 --- a/drivers/input/serio/pcips2.c +++ b/drivers/input/serio/pcips2.c @@ -149,8 +149,8 @@ static 
int pcips2_probe(struct pci_dev *dev, const struct pci_device_id *id) serio->write = pcips2_write; serio->open = pcips2_open; serio->close = pcips2_close; - strlcpy(serio->name, pci_name(dev), sizeof(serio->name)); - strlcpy(serio->phys, dev_name(&dev->dev), sizeof(serio->phys)); + strscpy(serio->name, pci_name(dev), sizeof(serio->name)); + strscpy(serio->phys, dev_name(&dev->dev), sizeof(serio->phys)); serio->port_data = ps2if; serio->dev.parent = &dev->dev; ps2if->io = serio; diff --git a/drivers/input/serio/ps2-gpio.c b/drivers/input/serio/ps2-gpio.c index 9b02dd5dd2b99..bc1dc484389b4 100644 --- a/drivers/input/serio/ps2-gpio.c +++ b/drivers/input/serio/ps2-gpio.c @@ -449,8 +449,8 @@ static int ps2_gpio_probe(struct platform_device *pdev) serio->write = drvdata->write_enable ? ps2_gpio_write : NULL; serio->port_data = drvdata; serio->dev.parent = dev; - strlcpy(serio->name, dev_name(dev), sizeof(serio->name)); - strlcpy(serio->phys, dev_name(dev), sizeof(serio->phys)); + strscpy(serio->name, dev_name(dev), sizeof(serio->name)); + strscpy(serio->phys, dev_name(dev), sizeof(serio->phys)); drvdata->serio = serio; drvdata->dev = dev; diff --git a/drivers/input/serio/ps2mult.c b/drivers/input/serio/ps2mult.c index 0071dd5ebcc27..902e81826fbfe 100644 --- a/drivers/input/serio/ps2mult.c +++ b/drivers/input/serio/ps2mult.c @@ -131,7 +131,7 @@ static int ps2mult_create_port(struct ps2mult *psm, int i) if (!serio) return -ENOMEM; - strlcpy(serio->name, "TQC PS/2 Multiplexer", sizeof(serio->name)); + strscpy(serio->name, "TQC PS/2 Multiplexer", sizeof(serio->name)); snprintf(serio->phys, sizeof(serio->phys), "%s/port%d", mx_serio->phys, i); serio->id.type = SERIO_8042; diff --git a/drivers/input/serio/q40kbd.c b/drivers/input/serio/q40kbd.c index bd248398556a8..a1c61f5de0477 100644 --- a/drivers/input/serio/q40kbd.c +++ b/drivers/input/serio/q40kbd.c @@ -126,8 +126,8 @@ static int q40kbd_probe(struct platform_device *pdev) port->close = q40kbd_close; port->port_data = 
q40kbd; port->dev.parent = &pdev->dev; - strlcpy(port->name, "Q40 Kbd Port", sizeof(port->name)); - strlcpy(port->phys, "Q40", sizeof(port->phys)); + strscpy(port->name, "Q40 Kbd Port", sizeof(port->name)); + strscpy(port->phys, "Q40", sizeof(port->phys)); q40kbd_stop(); diff --git a/drivers/input/serio/rpckbd.c b/drivers/input/serio/rpckbd.c index 37fe6a5711ea0..7008bc101415b 100644 --- a/drivers/input/serio/rpckbd.c +++ b/drivers/input/serio/rpckbd.c @@ -128,8 +128,8 @@ static int rpckbd_probe(struct platform_device *dev) serio->close = rpckbd_close; serio->dev.parent = &dev->dev; serio->port_data = rpckbd; - strlcpy(serio->name, "RiscPC PS/2 kbd port", sizeof(serio->name)); - strlcpy(serio->phys, "rpckbd/serio0", sizeof(serio->phys)); + strscpy(serio->name, "RiscPC PS/2 kbd port", sizeof(serio->name)); + strscpy(serio->phys, "rpckbd/serio0", sizeof(serio->phys)); platform_set_drvdata(dev, serio); serio_register_port(serio); diff --git a/drivers/input/serio/sa1111ps2.c b/drivers/input/serio/sa1111ps2.c index 68fac4801e2e2..2724c3aa512ce 100644 --- a/drivers/input/serio/sa1111ps2.c +++ b/drivers/input/serio/sa1111ps2.c @@ -267,8 +267,8 @@ static int ps2_probe(struct sa1111_dev *dev) serio->write = ps2_write; serio->open = ps2_open; serio->close = ps2_close; - strlcpy(serio->name, dev_name(&dev->dev), sizeof(serio->name)); - strlcpy(serio->phys, dev_name(&dev->dev), sizeof(serio->phys)); + strscpy(serio->name, dev_name(&dev->dev), sizeof(serio->name)); + strscpy(serio->phys, dev_name(&dev->dev), sizeof(serio->phys)); serio->port_data = ps2if; serio->dev.parent = &dev->dev; ps2if->io = serio; diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c index 669a728095b8b..7f7ef0e3a7494 100644 --- a/drivers/input/serio/serport.c +++ b/drivers/input/serio/serport.c @@ -171,7 +171,7 @@ static ssize_t serport_ldisc_read(struct tty_struct * tty, struct file * file, if (!serio) return -ENOMEM; - strlcpy(serio->name, "Serial port", sizeof(serio->name)); + 
strscpy(serio->name, "Serial port", sizeof(serio->name)); snprintf(serio->phys, sizeof(serio->phys), "%s/serio0", tty_name(tty)); serio->id = serport->id; serio->id.type = SERIO_RS232; diff --git a/drivers/input/serio/sun4i-ps2.c b/drivers/input/serio/sun4i-ps2.c index f15ed3dcdb9b2..eb262640192e9 100644 --- a/drivers/input/serio/sun4i-ps2.c +++ b/drivers/input/serio/sun4i-ps2.c @@ -256,8 +256,8 @@ static int sun4i_ps2_probe(struct platform_device *pdev) serio->close = sun4i_ps2_close; serio->port_data = drvdata; serio->dev.parent = dev; - strlcpy(serio->name, dev_name(dev), sizeof(serio->name)); - strlcpy(serio->phys, dev_name(dev), sizeof(serio->phys)); + strscpy(serio->name, dev_name(dev), sizeof(serio->name)); + strscpy(serio->phys, dev_name(dev), sizeof(serio->phys)); /* shutoff interrupt */ writel(0, drvdata->reg_base + PS2_REG_GCTL); diff --git a/drivers/input/tablet/acecad.c b/drivers/input/tablet/acecad.c index 56c7e471ac32e..80e06727464da 100644 --- a/drivers/input/tablet/acecad.c +++ b/drivers/input/tablet/acecad.c @@ -155,7 +155,7 @@ static int usb_acecad_probe(struct usb_interface *intf, const struct usb_device_ acecad->input = input_dev; if (dev->manufacturer) - strlcpy(acecad->name, dev->manufacturer, sizeof(acecad->name)); + strscpy(acecad->name, dev->manufacturer, sizeof(acecad->name)); if (dev->product) { if (dev->manufacturer) diff --git a/drivers/input/tablet/hanwang.c b/drivers/input/tablet/hanwang.c index 6d58443bb3e98..e492a0331b246 100644 --- a/drivers/input/tablet/hanwang.c +++ b/drivers/input/tablet/hanwang.c @@ -356,7 +356,7 @@ static int hanwang_probe(struct usb_interface *intf, const struct usb_device_id usb_make_path(dev, hanwang->phys, sizeof(hanwang->phys)); strlcat(hanwang->phys, "/input0", sizeof(hanwang->phys)); - strlcpy(hanwang->name, hanwang->features->name, sizeof(hanwang->name)); + strscpy(hanwang->name, hanwang->features->name, sizeof(hanwang->name)); input_dev->name = hanwang->name; input_dev->phys = hanwang->phys; 
usb_to_input_id(dev, &input_dev->id); diff --git a/drivers/input/tablet/pegasus_notetaker.c b/drivers/input/tablet/pegasus_notetaker.c index c608ac505d1ba..d836d3dcc6a24 100644 --- a/drivers/input/tablet/pegasus_notetaker.c +++ b/drivers/input/tablet/pegasus_notetaker.c @@ -319,7 +319,7 @@ static int pegasus_probe(struct usb_interface *intf, pegasus->irq->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; if (dev->manufacturer) - strlcpy(pegasus->name, dev->manufacturer, + strscpy(pegasus->name, dev->manufacturer, sizeof(pegasus->name)); if (dev->product) { diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c b/drivers/input/touchscreen/atmel_mxt_ts.c index 4eedea08b0b5f..ccecd1441f0be 100644 --- a/drivers/input/touchscreen/atmel_mxt_ts.c +++ b/drivers/input/touchscreen/atmel_mxt_ts.c @@ -2497,8 +2497,8 @@ static int mxt_vidioc_querycap(struct file *file, void *priv, { struct mxt_data *data = video_drvdata(file); - strlcpy(cap->driver, "atmel_mxt_ts", sizeof(cap->driver)); - strlcpy(cap->card, "atmel_mxt_ts touch", sizeof(cap->card)); + strscpy(cap->driver, "atmel_mxt_ts", sizeof(cap->driver)); + strscpy(cap->card, "atmel_mxt_ts touch", sizeof(cap->card)); snprintf(cap->bus_info, sizeof(cap->bus_info), "I2C:%s", dev_name(&data->client->dev)); return 0; @@ -2514,11 +2514,11 @@ static int mxt_vidioc_enum_input(struct file *file, void *priv, switch (i->index) { case MXT_V4L_INPUT_REFS: - strlcpy(i->name, "Mutual Capacitance References", + strscpy(i->name, "Mutual Capacitance References", sizeof(i->name)); break; case MXT_V4L_INPUT_DELTAS: - strlcpy(i->name, "Mutual Capacitance Deltas", sizeof(i->name)); + strscpy(i->name, "Mutual Capacitance Deltas", sizeof(i->name)); break; } diff --git a/drivers/input/touchscreen/edt-ft5x06.c b/drivers/input/touchscreen/edt-ft5x06.c index 5fb441387fe5a..9ac1378610bc1 100644 --- a/drivers/input/touchscreen/edt-ft5x06.c +++ b/drivers/input/touchscreen/edt-ft5x06.c @@ -912,8 +912,8 @@ static int edt_ft5x06_ts_identify(struct i2c_client 
*client, p = strchr(rdbuf, '*'); if (p) *p++ = '\0'; - strlcpy(model_name, rdbuf + 1, EDT_NAME_LEN); - strlcpy(fw_version, p ? p : "", EDT_NAME_LEN); + strscpy(model_name, rdbuf + 1, EDT_NAME_LEN); + strscpy(fw_version, p ? p : "", EDT_NAME_LEN); } else if (!strncasecmp(rdbuf, "EP0", 3)) { tsdata->version = EDT_M12; @@ -926,8 +926,8 @@ static int edt_ft5x06_ts_identify(struct i2c_client *client, p = strchr(rdbuf, '*'); if (p) *p++ = '\0'; - strlcpy(model_name, rdbuf, EDT_NAME_LEN); - strlcpy(fw_version, p ? p : "", EDT_NAME_LEN); + strscpy(model_name, rdbuf, EDT_NAME_LEN); + strscpy(fw_version, p ? p : "", EDT_NAME_LEN); } else { /* If it is not an EDT M06/M12 touchscreen, then the model * detection is a bit hairy. The different ft5x06 @@ -945,7 +945,7 @@ static int edt_ft5x06_ts_identify(struct i2c_client *client, if (error) return error; - strlcpy(fw_version, rdbuf, 2); + strscpy(fw_version, rdbuf, 2); error = edt_ft5x06_ts_readwrite(client, 1, "\xA8", 1, rdbuf); @@ -981,7 +981,7 @@ static int edt_ft5x06_ts_identify(struct i2c_client *client, 1, rdbuf); if (error) return error; - strlcpy(fw_version, rdbuf, 1); + strscpy(fw_version, rdbuf, 1); snprintf(model_name, EDT_NAME_LEN, "EVERVISION-FT5726NEi"); break; diff --git a/drivers/input/touchscreen/sur40.c b/drivers/input/touchscreen/sur40.c index 12f2562b0141b..8ddb3f7d307aa 100644 --- a/drivers/input/touchscreen/sur40.c +++ b/drivers/input/touchscreen/sur40.c @@ -939,8 +939,8 @@ static int sur40_vidioc_querycap(struct file *file, void *priv, { struct sur40_state *sur40 = video_drvdata(file); - strlcpy(cap->driver, DRIVER_SHORT, sizeof(cap->driver)); - strlcpy(cap->card, DRIVER_LONG, sizeof(cap->card)); + strscpy(cap->driver, DRIVER_SHORT, sizeof(cap->driver)); + strscpy(cap->card, DRIVER_LONG, sizeof(cap->card)); usb_make_path(sur40->usbdev, cap->bus_info, sizeof(cap->bus_info)); return 0; } @@ -952,7 +952,7 @@ static int sur40_vidioc_enum_input(struct file *file, void *priv, return -EINVAL; i->type = 
V4L2_INPUT_TYPE_TOUCH; i->std = V4L2_STD_UNKNOWN; - strlcpy(i->name, "In-Cell Sensor", sizeof(i->name)); + strscpy(i->name, "In-Cell Sensor", sizeof(i->name)); i->capabilities = 0; return 0; } diff --git a/drivers/input/touchscreen/usbtouchscreen.c b/drivers/input/touchscreen/usbtouchscreen.c index 3dda6eaabdab8..d6d04b9f04fc1 100644 --- a/drivers/input/touchscreen/usbtouchscreen.c +++ b/drivers/input/touchscreen/usbtouchscreen.c @@ -1708,7 +1708,7 @@ static int usbtouch_probe(struct usb_interface *intf, usbtouch->input = input_dev; if (udev->manufacturer) - strlcpy(usbtouch->name, udev->manufacturer, sizeof(usbtouch->name)); + strscpy(usbtouch->name, udev->manufacturer, sizeof(usbtouch->name)); if (udev->product) { if (udev->manufacturer) diff --git a/drivers/input/touchscreen/wacom_w8001.c b/drivers/input/touchscreen/wacom_w8001.c index 691285ace2289..928c5ee3ac36c 100644 --- a/drivers/input/touchscreen/wacom_w8001.c +++ b/drivers/input/touchscreen/wacom_w8001.c @@ -625,7 +625,7 @@ static int w8001_connect(struct serio *serio, struct serio_driver *drv) /* For backwards-compatibility we compose the basename based on * capabilities and then just append the tool type */ - strlcpy(basename, "Wacom Serial", sizeof(basename)); + strscpy(basename, "Wacom Serial", sizeof(basename)); err_pen = w8001_setup_pen(w8001, basename, sizeof(basename)); err_touch = w8001_setup_touch(w8001, basename, sizeof(basename)); @@ -635,7 +635,7 @@ static int w8001_connect(struct serio *serio, struct serio_driver *drv) } if (!err_pen) { - strlcpy(w8001->pen_name, basename, sizeof(w8001->pen_name)); + strscpy(w8001->pen_name, basename, sizeof(w8001->pen_name)); strlcat(w8001->pen_name, " Pen", sizeof(w8001->pen_name)); input_dev_pen->name = w8001->pen_name; @@ -651,7 +651,7 @@ static int w8001_connect(struct serio *serio, struct serio_driver *drv) } if (!err_touch) { - strlcpy(w8001->touch_name, basename, sizeof(w8001->touch_name)); + strscpy(w8001->touch_name, basename, 
sizeof(w8001->touch_name)); strlcat(w8001->touch_name, " Finger", sizeof(w8001->touch_name)); input_dev_touch->name = w8001->touch_name; -- GitLab From df805304a820ed10fc3d038dd64b85821c9ee606 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 16 Aug 2022 16:30:15 +0300 Subject: [PATCH 0029/2223] dt-bindings: pinctrl: samsung: stop using bindings header with constants The bindings header with pin controller register values is being deprecated and DTS already switched to a DTS-local header. Do not reference the bindings header in schema and replace the defines with raw values. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20220816133016.77553-2-krzysztof.kozlowski@linaro.org --- .../pinctrl/samsung,pinctrl-pins-cfg.yaml | 1 - .../bindings/pinctrl/samsung,pinctrl.yaml | 63 ++++++++----------- 2 files changed, 27 insertions(+), 37 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl-pins-cfg.yaml b/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl-pins-cfg.yaml index 9869d4dceddbb..f796f27bf0e64 100644 --- a/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl-pins-cfg.yaml +++ b/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl-pins-cfg.yaml @@ -20,7 +20,6 @@ description: | The values used for config properties should be derived from the hardware manual and these values are programmed as-is into the pin pull up/down and driver strength register of the pin-controller. - See also include/dt-bindings/pinctrl/samsung.h with useful constants. See also Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml for additional information and example. 
diff --git a/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml index 3a65c66ca71d2..dafa51c69c063 100644 --- a/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml @@ -15,9 +15,6 @@ description: | This is a part of device tree bindings for Samsung S3C/S5P/Exynos SoC pin controller. - Pin group settings (like drive strength, pull up/down) are available as - macros in include/dt-bindings/pinctrl/samsung.h. - All the pin controller nodes should be represented in the aliases node using the following format 'pinctrl{n}' where n is a unique number for the alias. @@ -138,8 +135,6 @@ additionalProperties: false examples: - | - #include - pinctrl@7f008000 { compatible = "samsung,s3c64xx-pinctrl"; reg = <0x7f008000 0x1000>; @@ -166,8 +161,8 @@ examples: uart0-data-pins { samsung,pins = "gpa-0", "gpa-1"; - samsung,pin-function = ; - samsung,pin-pud = ; + samsung,pin-function = <2>; + samsung,pin-pud = <0>; }; // ... @@ -175,7 +170,6 @@ examples: - | #include - #include pinctrl@11400000 { compatible = "samsung,exynos4210-pinctrl"; @@ -197,9 +191,9 @@ examples: uart0-data-pins { samsung,pins = "gpa0-0", "gpa0-1"; - samsung,pin-function = ; - samsung,pin-pud = ; - samsung,pin-drv = ; + samsung,pin-function = <2>; + samsung,pin-pud = <0>; + samsung,pin-drv = <0>; }; // ... @@ -207,14 +201,14 @@ examples: sleep0: sleep-state { gpa0-0-pin { samsung,pins = "gpa0-0"; - samsung,pin-con-pdn = ; - samsung,pin-pud-pdn = ; + samsung,pin-con-pdn = <2>; + samsung,pin-pud-pdn = <0>; }; gpa0-1-pin { samsung,pins = "gpa0-1"; - samsung,pin-con-pdn = ; - samsung,pin-pud-pdn = ; + samsung,pin-con-pdn = <0>; + samsung,pin-pud-pdn = <0>; }; // ... 
@@ -223,7 +217,6 @@ examples: - | #include - #include pinctrl@11000000 { compatible = "samsung,exynos4210-pinctrl"; @@ -272,26 +265,26 @@ examples: sd0-clk-pins { samsung,pins = "gpk0-0"; - samsung,pin-function = ; - samsung,pin-pud = ; - samsung,pin-drv = ; + samsung,pin-function = <2>; + samsung,pin-pud = <0>; + samsung,pin-drv = <3>; }; sd4-bus-width8-pins { part-1-pins { samsung,pins = "gpk0-3", "gpk0-4", "gpk0-5", "gpk0-6"; - samsung,pin-function = ; - samsung,pin-pud = ; - samsung,pin-drv = ; + samsung,pin-function = <3>; + samsung,pin-pud = <3>; + samsung,pin-drv = <3>; }; part-2-pins { samsung,pins = "gpk1-3", "gpk1-4", "gpk1-5", "gpk1-6"; - samsung,pin-function = ; - samsung,pin-pud = ; - samsung,pin-drv = ; + samsung,pin-function = <4>; + samsung,pin-pud = <3>; + samsung,pin-drv = <3>; }; }; @@ -299,16 +292,15 @@ examples: otg-gp-pins { samsung,pins = "gpx3-3"; - samsung,pin-function = ; - samsung,pin-pud = ; - samsung,pin-drv = ; + samsung,pin-function = <1>; + samsung,pin-pud = <0>; + samsung,pin-drv = <0>; samsung,pin-val = <0>; }; }; - | #include - #include pinctrl@10580000 { compatible = "samsung,exynos5433-pinctrl"; @@ -352,9 +344,9 @@ examples: initial_alive: initial-state { gpa0-0-pin { samsung,pins = "gpa0-0"; - samsung,pin-function = ; - samsung,pin-pud = ; - samsung,pin-drv = ; + samsung,pin-function = <0>; + samsung,pin-pud = <1>; + samsung,pin-drv = <0>; }; // ... @@ -363,7 +355,6 @@ examples: - | #include - #include pinctrl@114b0000 { compatible = "samsung,exynos5433-pinctrl"; @@ -384,9 +375,9 @@ examples: i2s0-bus-pins { samsung,pins = "gpz0-0", "gpz0-1", "gpz0-2", "gpz0-3", "gpz0-4", "gpz0-5", "gpz0-6"; - samsung,pin-function = ; - samsung,pin-pud = ; - samsung,pin-drv = ; + samsung,pin-function = <2>; + samsung,pin-pud = <0>; + samsung,pin-drv = <0>; }; // ... 
-- GitLab From 9d9292576810d0b36897718c24dfbc1a2835314b Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 16 Aug 2022 16:30:16 +0300 Subject: [PATCH 0030/2223] dt-bindings: pinctrl: samsung: deprecate header with register constants For convenience (less code duplication, some meaning added to raw number), the pin controller pin configuration register values were defined in the bindings header. These are not some IDs or other abstraction layer but raw numbers used in the registers. These constants do not fit the purpose of bindings. They do not provide any abstraction, any hardware and driver independent ID. With minor exceptions, the Linux drivers actually do not use the bindings header at all. All of the constants were moved already to headers local to DTS (residing in DTS directory) and to Samsung pinctrl driver (where applicable), so remove any references to the bindings header and add a warning that it is deprecated. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Chanho Park Acked-by: Rob Herring Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20220816133016.77553-3-krzysztof.kozlowski@linaro.org --- include/dt-bindings/pinctrl/samsung.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/dt-bindings/pinctrl/samsung.h b/include/dt-bindings/pinctrl/samsung.h index 950970634dfe4..d1da5ff68d0c3 100644 --- a/include/dt-bindings/pinctrl/samsung.h +++ b/include/dt-bindings/pinctrl/samsung.h @@ -10,6 +10,13 @@ #ifndef __DT_BINDINGS_PINCTRL_SAMSUNG_H__ #define __DT_BINDINGS_PINCTRL_SAMSUNG_H__ +/* + * These bindings are deprecated, because they do not match the actual + * concept of bindings but rather contain pure register values. + * Instead include the header in the DTS source directory. + */ +#warning "These bindings are deprecated. Instead use the header in the DTS source directory."
+ #define EXYNOS_PIN_PULL_NONE 0 #define EXYNOS_PIN_PULL_DOWN 1 #define EXYNOS_PIN_PULL_UP 3 -- GitLab From 2dce502761a2dec7dc84c03872fba5c7af110290 Mon Sep 17 00:00:00 2001 From: Jagan Teki Date: Thu, 18 Aug 2022 18:11:19 +0530 Subject: [PATCH 0031/2223] dt-bindings: pinctrl: rockchip: Document RV1126 pinctrl Document dt-bindings for RV1126 SoC pinctrl support. Cc: linux-gpio@vger.kernel.org Cc: Linus Walleij Acked-by: Krzysztof Kozlowski Signed-off-by: Jagan Teki Link: https://lore.kernel.org/r/20220818124132.125304-7-jagan@edgeble.ai Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.yaml index 677a285ca4169..b486f41df65f1 100644 --- a/Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.yaml @@ -47,6 +47,7 @@ properties: - rockchip,rk3568-pinctrl - rockchip,rk3588-pinctrl - rockchip,rv1108-pinctrl + - rockchip,rv1126-pinctrl rockchip,grf: $ref: "/schemas/types.yaml#/definitions/phandle" -- GitLab From fd4ea48688c662593eb64ddf44d4a17173661672 Mon Sep 17 00:00:00 2001 From: Jagan Teki Date: Thu, 18 Aug 2022 18:11:20 +0530 Subject: [PATCH 0032/2223] pinctrl: rockchip: Add RV1126 pinctrl support RV1126 has five GPIOs groups - GPIO0 in PD_MMU and GPIO1-4 in PD_BUS. In GPIO0, up to Lower C group GPIO0_C[3:0] is part of PMU but rest of the groups from there are part of GRF. Added pinctrl support for RV1126 and the pull, drv and schmitt calculations are inferred from [1] authored by Jianqun Xu. 
[1] https://github.com/rockchip-linux/kernel/blob/develop-4.19/drivers/pinctrl/pinctrl-rockchip.c Cc: linux-gpio@vger.kernel.org Cc: Linus Walleij Signed-off-by: Jianqun Xu Signed-off-by: Sugar Zhang Signed-off-by: Jagan Teki Link: https://lore.kernel.org/r/20220818124132.125304-8-jagan@edgeble.ai Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-rockchip.c | 333 ++++++++++++++++++++++++++++- drivers/pinctrl/pinctrl-rockchip.h | 1 + 2 files changed, 327 insertions(+), 7 deletions(-) diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index 32e41395fc768..a91061f9c2aca 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -57,6 +57,7 @@ #define IOMUX_UNROUTED BIT(3) #define IOMUX_WIDTH_3BIT BIT(4) #define IOMUX_WIDTH_2BIT BIT(5) +#define IOMUX_L_SOURCE_PMU BIT(6) #define PIN_BANK(id, pins, label) \ { \ @@ -147,6 +148,21 @@ .pull_type[3] = pull3, \ } +#define PIN_BANK_IOMUX_FLAGS_OFFSET(id, pins, label, iom0, iom1, iom2, \ + iom3, offset0, offset1, offset2, \ + offset3) \ + { \ + .bank_num = id, \ + .nr_pins = pins, \ + .name = label, \ + .iomux = { \ + { .type = iom0, .offset = offset0 }, \ + { .type = iom1, .offset = offset1 }, \ + { .type = iom2, .offset = offset2 }, \ + { .type = iom3, .offset = offset3 }, \ + }, \ + } + #define PIN_BANK_IOMUX_DRV_FLAGS_OFFSET(id, pins, label, iom0, iom1, \ iom2, iom3, drv0, drv1, drv2, \ drv3, offset0, offset1, \ @@ -443,6 +459,37 @@ static struct rockchip_mux_recalced_data rv1108_mux_recalced_data[] = { }, }; +static struct rockchip_mux_recalced_data rv1126_mux_recalced_data[] = { + { + .num = 0, + .pin = 20, + .reg = 0x10000, + .bit = 0, + .mask = 0xf + }, + { + .num = 0, + .pin = 21, + .reg = 0x10000, + .bit = 4, + .mask = 0xf + }, + { + .num = 0, + .pin = 22, + .reg = 0x10000, + .bit = 8, + .mask = 0xf + }, + { + .num = 0, + .pin = 23, + .reg = 0x10000, + .bit = 12, + .mask = 0xf + }, +}; + static struct rockchip_mux_recalced_data 
rk3128_mux_recalced_data[] = { { .num = 2, @@ -642,6 +689,103 @@ static struct rockchip_mux_route_data px30_mux_route_data[] = { RK_MUXROUTE_SAME(1, RK_PB7, 2, 0x184, BIT(16 + 9) | BIT(9)), /* uart3-rxm1 */ }; +static struct rockchip_mux_route_data rv1126_mux_route_data[] = { + RK_MUXROUTE_GRF(3, RK_PD2, 1, 0x10260, WRITE_MASK_VAL(0, 0, 0)), /* I2S0_MCLK_M0 */ + RK_MUXROUTE_GRF(3, RK_PB0, 3, 0x10260, WRITE_MASK_VAL(0, 0, 1)), /* I2S0_MCLK_M1 */ + + RK_MUXROUTE_GRF(0, RK_PD4, 4, 0x10260, WRITE_MASK_VAL(3, 2, 0)), /* I2S1_MCLK_M0 */ + RK_MUXROUTE_GRF(1, RK_PD5, 2, 0x10260, WRITE_MASK_VAL(3, 2, 1)), /* I2S1_MCLK_M1 */ + RK_MUXROUTE_GRF(2, RK_PC7, 6, 0x10260, WRITE_MASK_VAL(3, 2, 2)), /* I2S1_MCLK_M2 */ + + RK_MUXROUTE_GRF(1, RK_PD0, 1, 0x10260, WRITE_MASK_VAL(4, 4, 0)), /* I2S2_MCLK_M0 */ + RK_MUXROUTE_GRF(2, RK_PB3, 2, 0x10260, WRITE_MASK_VAL(4, 4, 1)), /* I2S2_MCLK_M1 */ + + RK_MUXROUTE_GRF(3, RK_PD4, 2, 0x10260, WRITE_MASK_VAL(12, 12, 0)), /* PDM_CLK0_M0 */ + RK_MUXROUTE_GRF(3, RK_PC0, 3, 0x10260, WRITE_MASK_VAL(12, 12, 1)), /* PDM_CLK0_M1 */ + + RK_MUXROUTE_GRF(3, RK_PC6, 1, 0x10264, WRITE_MASK_VAL(0, 0, 0)), /* CIF_CLKOUT_M0 */ + RK_MUXROUTE_GRF(2, RK_PD1, 3, 0x10264, WRITE_MASK_VAL(0, 0, 1)), /* CIF_CLKOUT_M1 */ + + RK_MUXROUTE_GRF(3, RK_PA4, 5, 0x10264, WRITE_MASK_VAL(5, 4, 0)), /* I2C3_SCL_M0 */ + RK_MUXROUTE_GRF(2, RK_PD4, 7, 0x10264, WRITE_MASK_VAL(5, 4, 1)), /* I2C3_SCL_M1 */ + RK_MUXROUTE_GRF(1, RK_PD6, 3, 0x10264, WRITE_MASK_VAL(5, 4, 2)), /* I2C3_SCL_M2 */ + + RK_MUXROUTE_GRF(3, RK_PA0, 7, 0x10264, WRITE_MASK_VAL(6, 6, 0)), /* I2C4_SCL_M0 */ + RK_MUXROUTE_GRF(4, RK_PA0, 4, 0x10264, WRITE_MASK_VAL(6, 6, 1)), /* I2C4_SCL_M1 */ + + RK_MUXROUTE_GRF(2, RK_PA5, 7, 0x10264, WRITE_MASK_VAL(9, 8, 0)), /* I2C5_SCL_M0 */ + RK_MUXROUTE_GRF(3, RK_PB0, 5, 0x10264, WRITE_MASK_VAL(9, 8, 1)), /* I2C5_SCL_M1 */ + RK_MUXROUTE_GRF(1, RK_PD0, 4, 0x10264, WRITE_MASK_VAL(9, 8, 2)), /* I2C5_SCL_M2 */ + + RK_MUXROUTE_GRF(3, RK_PC0, 5, 0x10264, WRITE_MASK_VAL(11, 10, 0)), /* 
SPI1_CLK_M0 */ + RK_MUXROUTE_GRF(1, RK_PC6, 3, 0x10264, WRITE_MASK_VAL(11, 10, 1)), /* SPI1_CLK_M1 */ + RK_MUXROUTE_GRF(2, RK_PD5, 6, 0x10264, WRITE_MASK_VAL(11, 10, 2)), /* SPI1_CLK_M2 */ + + RK_MUXROUTE_GRF(3, RK_PC0, 2, 0x10264, WRITE_MASK_VAL(12, 12, 0)), /* RGMII_CLK_M0 */ + RK_MUXROUTE_GRF(2, RK_PB7, 2, 0x10264, WRITE_MASK_VAL(12, 12, 1)), /* RGMII_CLK_M1 */ + + RK_MUXROUTE_GRF(3, RK_PA1, 3, 0x10264, WRITE_MASK_VAL(13, 13, 0)), /* CAN_TXD_M0 */ + RK_MUXROUTE_GRF(3, RK_PA7, 5, 0x10264, WRITE_MASK_VAL(13, 13, 1)), /* CAN_TXD_M1 */ + + RK_MUXROUTE_GRF(3, RK_PA4, 6, 0x10268, WRITE_MASK_VAL(0, 0, 0)), /* PWM8_M0 */ + RK_MUXROUTE_GRF(2, RK_PD7, 5, 0x10268, WRITE_MASK_VAL(0, 0, 1)), /* PWM8_M1 */ + + RK_MUXROUTE_GRF(3, RK_PA5, 6, 0x10268, WRITE_MASK_VAL(2, 2, 0)), /* PWM9_M0 */ + RK_MUXROUTE_GRF(2, RK_PD6, 5, 0x10268, WRITE_MASK_VAL(2, 2, 1)), /* PWM9_M1 */ + + RK_MUXROUTE_GRF(3, RK_PA6, 6, 0x10268, WRITE_MASK_VAL(4, 4, 0)), /* PWM10_M0 */ + RK_MUXROUTE_GRF(2, RK_PD5, 5, 0x10268, WRITE_MASK_VAL(4, 4, 1)), /* PWM10_M1 */ + + RK_MUXROUTE_GRF(3, RK_PA7, 6, 0x10268, WRITE_MASK_VAL(6, 6, 0)), /* PWM11_IR_M0 */ + RK_MUXROUTE_GRF(3, RK_PA1, 5, 0x10268, WRITE_MASK_VAL(6, 6, 1)), /* PWM11_IR_M1 */ + + RK_MUXROUTE_GRF(1, RK_PA5, 3, 0x10268, WRITE_MASK_VAL(8, 8, 0)), /* UART2_TX_M0 */ + RK_MUXROUTE_GRF(3, RK_PA2, 1, 0x10268, WRITE_MASK_VAL(8, 8, 1)), /* UART2_TX_M1 */ + + RK_MUXROUTE_GRF(3, RK_PC6, 3, 0x10268, WRITE_MASK_VAL(11, 10, 0)), /* UART3_TX_M0 */ + RK_MUXROUTE_GRF(1, RK_PA7, 2, 0x10268, WRITE_MASK_VAL(11, 10, 1)), /* UART3_TX_M1 */ + RK_MUXROUTE_GRF(3, RK_PA0, 4, 0x10268, WRITE_MASK_VAL(11, 10, 2)), /* UART3_TX_M2 */ + + RK_MUXROUTE_GRF(3, RK_PA4, 4, 0x10268, WRITE_MASK_VAL(13, 12, 0)), /* UART4_TX_M0 */ + RK_MUXROUTE_GRF(2, RK_PA6, 4, 0x10268, WRITE_MASK_VAL(13, 12, 1)), /* UART4_TX_M1 */ + RK_MUXROUTE_GRF(1, RK_PD5, 3, 0x10268, WRITE_MASK_VAL(13, 12, 2)), /* UART4_TX_M2 */ + + RK_MUXROUTE_GRF(3, RK_PA6, 4, 0x10268, WRITE_MASK_VAL(15, 14, 0)), /* UART5_TX_M0 */ + 
RK_MUXROUTE_GRF(2, RK_PB0, 4, 0x10268, WRITE_MASK_VAL(15, 14, 1)), /* UART5_TX_M1 */ + RK_MUXROUTE_GRF(2, RK_PA0, 3, 0x10268, WRITE_MASK_VAL(15, 14, 2)), /* UART5_TX_M2 */ + + RK_MUXROUTE_PMU(0, RK_PB6, 3, 0x0114, WRITE_MASK_VAL(0, 0, 0)), /* PWM0_M0 */ + RK_MUXROUTE_PMU(2, RK_PB3, 5, 0x0114, WRITE_MASK_VAL(0, 0, 1)), /* PWM0_M1 */ + + RK_MUXROUTE_PMU(0, RK_PB7, 3, 0x0114, WRITE_MASK_VAL(2, 2, 0)), /* PWM1_M0 */ + RK_MUXROUTE_PMU(2, RK_PB2, 5, 0x0114, WRITE_MASK_VAL(2, 2, 1)), /* PWM1_M1 */ + + RK_MUXROUTE_PMU(0, RK_PC0, 3, 0x0114, WRITE_MASK_VAL(4, 4, 0)), /* PWM2_M0 */ + RK_MUXROUTE_PMU(2, RK_PB1, 5, 0x0114, WRITE_MASK_VAL(4, 4, 1)), /* PWM2_M1 */ + + RK_MUXROUTE_PMU(0, RK_PC1, 3, 0x0114, WRITE_MASK_VAL(6, 6, 0)), /* PWM3_IR_M0 */ + RK_MUXROUTE_PMU(2, RK_PB0, 5, 0x0114, WRITE_MASK_VAL(6, 6, 1)), /* PWM3_IR_M1 */ + + RK_MUXROUTE_PMU(0, RK_PC2, 3, 0x0114, WRITE_MASK_VAL(8, 8, 0)), /* PWM4_M0 */ + RK_MUXROUTE_PMU(2, RK_PA7, 5, 0x0114, WRITE_MASK_VAL(8, 8, 1)), /* PWM4_M1 */ + + RK_MUXROUTE_PMU(0, RK_PC3, 3, 0x0114, WRITE_MASK_VAL(10, 10, 0)), /* PWM5_M0 */ + RK_MUXROUTE_PMU(2, RK_PA6, 5, 0x0114, WRITE_MASK_VAL(10, 10, 1)), /* PWM5_M1 */ + + RK_MUXROUTE_PMU(0, RK_PB2, 3, 0x0114, WRITE_MASK_VAL(12, 12, 0)), /* PWM6_M0 */ + RK_MUXROUTE_PMU(2, RK_PD4, 5, 0x0114, WRITE_MASK_VAL(12, 12, 1)), /* PWM6_M1 */ + + RK_MUXROUTE_PMU(0, RK_PB1, 3, 0x0114, WRITE_MASK_VAL(14, 14, 0)), /* PWM7_IR_M0 */ + RK_MUXROUTE_PMU(3, RK_PA0, 5, 0x0114, WRITE_MASK_VAL(14, 14, 1)), /* PWM7_IR_M1 */ + + RK_MUXROUTE_PMU(0, RK_PB0, 1, 0x0118, WRITE_MASK_VAL(1, 0, 0)), /* SPI0_CLK_M0 */ + RK_MUXROUTE_PMU(2, RK_PA1, 1, 0x0118, WRITE_MASK_VAL(1, 0, 1)), /* SPI0_CLK_M1 */ + RK_MUXROUTE_PMU(2, RK_PB2, 6, 0x0118, WRITE_MASK_VAL(1, 0, 2)), /* SPI0_CLK_M2 */ + + RK_MUXROUTE_PMU(0, RK_PB6, 2, 0x0118, WRITE_MASK_VAL(2, 2, 0)), /* UART1_TX_M0 */ + RK_MUXROUTE_PMU(1, RK_PD0, 5, 0x0118, WRITE_MASK_VAL(2, 2, 1)), /* UART1_TX_M1 */ +}; + static struct rockchip_mux_route_data rk3128_mux_route_data[] = { 
RK_MUXROUTE_SAME(1, RK_PB2, 1, 0x144, BIT(16 + 3) | BIT(16 + 4)), /* spi-0 */ RK_MUXROUTE_SAME(1, RK_PD3, 3, 0x144, BIT(16 + 3) | BIT(16 + 4) | BIT(3)), /* spi-1 */ @@ -877,8 +1021,12 @@ static int rockchip_get_mux(struct rockchip_pin_bank *bank, int pin) if (bank->iomux[iomux_num].type & IOMUX_GPIO_ONLY) return RK_FUNC_GPIO; - regmap = (bank->iomux[iomux_num].type & IOMUX_SOURCE_PMU) - ? info->regmap_pmu : info->regmap_base; + if (bank->iomux[iomux_num].type & IOMUX_SOURCE_PMU) + regmap = info->regmap_pmu; + else if (bank->iomux[iomux_num].type & IOMUX_L_SOURCE_PMU) + regmap = (pin % 8 < 4) ? info->regmap_pmu : info->regmap_base; + else + regmap = info->regmap_base; /* get basic quadrupel of mux registers and the correct reg inside */ mux_type = bank->iomux[iomux_num].type; @@ -987,8 +1135,12 @@ static int rockchip_set_mux(struct rockchip_pin_bank *bank, int pin, int mux) dev_dbg(dev, "setting mux of GPIO%d-%d to %d\n", bank->bank_num, pin, mux); - regmap = (bank->iomux[iomux_num].type & IOMUX_SOURCE_PMU) - ? info->regmap_pmu : info->regmap_base; + if (bank->iomux[iomux_num].type & IOMUX_SOURCE_PMU) + regmap = info->regmap_pmu; + else if (bank->iomux[iomux_num].type & IOMUX_L_SOURCE_PMU) + regmap = (pin % 8 < 4) ? 
info->regmap_pmu : info->regmap_base; + else + regmap = info->regmap_base; /* get basic quadrupel of mux registers and the correct reg inside */ mux_type = bank->iomux[iomux_num].type; @@ -1268,6 +1420,119 @@ static int rv1108_calc_schmitt_reg_and_bit(struct rockchip_pin_bank *bank, return 0; } +#define RV1126_PULL_PMU_OFFSET 0x40 +#define RV1126_PULL_GRF_GPIO1A0_OFFSET 0x10108 +#define RV1126_PULL_PINS_PER_REG 8 +#define RV1126_PULL_BITS_PER_PIN 2 +#define RV1126_PULL_BANK_STRIDE 16 +#define RV1126_GPIO_C4_D7(p) (p >= 20 && p <= 31) /* GPIO0_C4 ~ GPIO0_D7 */ + +static int rv1126_calc_pull_reg_and_bit(struct rockchip_pin_bank *bank, + int pin_num, struct regmap **regmap, + int *reg, u8 *bit) +{ + struct rockchip_pinctrl *info = bank->drvdata; + + /* The first 24 pins of the first bank are located in PMU */ + if (bank->bank_num == 0) { + if (RV1126_GPIO_C4_D7(pin_num)) { + *regmap = info->regmap_base; + *reg = RV1126_PULL_GRF_GPIO1A0_OFFSET; + *reg -= (((31 - pin_num) / RV1126_PULL_PINS_PER_REG + 1) * 4); + *bit = pin_num % RV1126_PULL_PINS_PER_REG; + *bit *= RV1126_PULL_BITS_PER_PIN; + return 0; + } + *regmap = info->regmap_pmu; + *reg = RV1126_PULL_PMU_OFFSET; + } else { + *reg = RV1126_PULL_GRF_GPIO1A0_OFFSET; + *regmap = info->regmap_base; + *reg += (bank->bank_num - 1) * RV1126_PULL_BANK_STRIDE; + } + + *reg += ((pin_num / RV1126_PULL_PINS_PER_REG) * 4); + *bit = (pin_num % RV1126_PULL_PINS_PER_REG); + *bit *= RV1126_PULL_BITS_PER_PIN; + + return 0; +} + +#define RV1126_DRV_PMU_OFFSET 0x20 +#define RV1126_DRV_GRF_GPIO1A0_OFFSET 0x10090 +#define RV1126_DRV_BITS_PER_PIN 4 +#define RV1126_DRV_PINS_PER_REG 4 +#define RV1126_DRV_BANK_STRIDE 32 + +static int rv1126_calc_drv_reg_and_bit(struct rockchip_pin_bank *bank, + int pin_num, struct regmap **regmap, + int *reg, u8 *bit) +{ + struct rockchip_pinctrl *info = bank->drvdata; + + /* The first 24 pins of the first bank are located in PMU */ + if (bank->bank_num == 0) { + if (RV1126_GPIO_C4_D7(pin_num)) { + *regmap = 
info->regmap_base; + *reg = RV1126_DRV_GRF_GPIO1A0_OFFSET; + *reg -= (((31 - pin_num) / RV1126_DRV_PINS_PER_REG + 1) * 4); + *reg -= 0x4; + *bit = pin_num % RV1126_DRV_PINS_PER_REG; + *bit *= RV1126_DRV_BITS_PER_PIN; + return 0; + } + *regmap = info->regmap_pmu; + *reg = RV1126_DRV_PMU_OFFSET; + } else { + *regmap = info->regmap_base; + *reg = RV1126_DRV_GRF_GPIO1A0_OFFSET; + *reg += (bank->bank_num - 1) * RV1126_DRV_BANK_STRIDE; + } + + *reg += ((pin_num / RV1126_DRV_PINS_PER_REG) * 4); + *bit = pin_num % RV1126_DRV_PINS_PER_REG; + *bit *= RV1126_DRV_BITS_PER_PIN; + + return 0; +} + +#define RV1126_SCHMITT_PMU_OFFSET 0x60 +#define RV1126_SCHMITT_GRF_GPIO1A0_OFFSET 0x10188 +#define RV1126_SCHMITT_BANK_STRIDE 16 +#define RV1126_SCHMITT_PINS_PER_GRF_REG 8 +#define RV1126_SCHMITT_PINS_PER_PMU_REG 8 + +static int rv1126_calc_schmitt_reg_and_bit(struct rockchip_pin_bank *bank, + int pin_num, + struct regmap **regmap, + int *reg, u8 *bit) +{ + struct rockchip_pinctrl *info = bank->drvdata; + int pins_per_reg; + + if (bank->bank_num == 0) { + if (RV1126_GPIO_C4_D7(pin_num)) { + *regmap = info->regmap_base; + *reg = RV1126_SCHMITT_GRF_GPIO1A0_OFFSET; + *reg -= (((31 - pin_num) / RV1126_SCHMITT_PINS_PER_GRF_REG + 1) * 4); + *bit = pin_num % RV1126_SCHMITT_PINS_PER_GRF_REG; + return 0; + } + *regmap = info->regmap_pmu; + *reg = RV1126_SCHMITT_PMU_OFFSET; + pins_per_reg = RV1126_SCHMITT_PINS_PER_PMU_REG; + } else { + *regmap = info->regmap_base; + *reg = RV1126_SCHMITT_GRF_GPIO1A0_OFFSET; + pins_per_reg = RV1126_SCHMITT_PINS_PER_GRF_REG; + *reg += (bank->bank_num - 1) * RV1126_SCHMITT_BANK_STRIDE; + } + *reg += ((pin_num / pins_per_reg) * 4); + *bit = pin_num % pins_per_reg; + + return 0; +} + #define RK3308_SCHMITT_PINS_PER_REG 8 #define RK3308_SCHMITT_BANK_STRIDE 16 #define RK3308_SCHMITT_GRF_OFFSET 0x1a0 @@ -1998,6 +2263,12 @@ static int rockchip_set_drive_perpin(struct rockchip_pin_bank *bank, goto config; } + if (ctrl->type == RV1126) { + rmask_bits = 
RV1126_DRV_BITS_PER_PIN; + ret = strength; + goto config; + } + ret = -EINVAL; for (i = 0; i < ARRAY_SIZE(rockchip_perpin_drv_list[drv_type]); i++) { if (rockchip_perpin_drv_list[drv_type][i] == strength) { @@ -2168,6 +2439,7 @@ static int rockchip_set_pull(struct rockchip_pin_bank *bank, break; case PX30: case RV1108: + case RV1126: case RK3188: case RK3288: case RK3308: @@ -2416,6 +2688,7 @@ static bool rockchip_pinconf_pull_valid(struct rockchip_pin_ctrl *ctrl, return pull ? false : true; case PX30: case RV1108: + case RV1126: case RK3188: case RK3288: case RK3308: @@ -2889,12 +3162,14 @@ static struct rockchip_pin_ctrl *rockchip_pinctrl_get_soc_data( /* preset iomux offset value, set new start value */ if (iom->offset >= 0) { - if (iom->type & IOMUX_SOURCE_PMU) + if ((iom->type & IOMUX_SOURCE_PMU) || + (iom->type & IOMUX_L_SOURCE_PMU)) pmu_offs = iom->offset; else grf_offs = iom->offset; } else { /* set current iomux offset */ - iom->offset = (iom->type & IOMUX_SOURCE_PMU) ? + iom->offset = ((iom->type & IOMUX_SOURCE_PMU) || + (iom->type & IOMUX_L_SOURCE_PMU)) ? pmu_offs : grf_offs; } @@ -2919,7 +3194,7 @@ static struct rockchip_pin_ctrl *rockchip_pinctrl_get_soc_data( inc = (iom->type & (IOMUX_WIDTH_4BIT | IOMUX_WIDTH_3BIT | IOMUX_WIDTH_2BIT)) ? 
8 : 4; - if (iom->type & IOMUX_SOURCE_PMU) + if ((iom->type & IOMUX_SOURCE_PMU) || (iom->type & IOMUX_L_SOURCE_PMU)) pmu_offs += inc; else grf_offs += inc; @@ -3178,6 +3453,48 @@ static struct rockchip_pin_ctrl rv1108_pin_ctrl = { .schmitt_calc_reg = rv1108_calc_schmitt_reg_and_bit, }; +static struct rockchip_pin_bank rv1126_pin_banks[] = { + PIN_BANK_IOMUX_FLAGS(0, 32, "gpio0", + IOMUX_WIDTH_4BIT | IOMUX_SOURCE_PMU, + IOMUX_WIDTH_4BIT | IOMUX_SOURCE_PMU, + IOMUX_WIDTH_4BIT | IOMUX_L_SOURCE_PMU, + IOMUX_WIDTH_4BIT), + PIN_BANK_IOMUX_FLAGS_OFFSET(1, 32, "gpio1", + IOMUX_WIDTH_4BIT, + IOMUX_WIDTH_4BIT, + IOMUX_WIDTH_4BIT, + IOMUX_WIDTH_4BIT, + 0x10010, 0x10018, 0x10020, 0x10028), + PIN_BANK_IOMUX_FLAGS(2, 32, "gpio2", + IOMUX_WIDTH_4BIT, + IOMUX_WIDTH_4BIT, + IOMUX_WIDTH_4BIT, + IOMUX_WIDTH_4BIT), + PIN_BANK_IOMUX_FLAGS(3, 32, "gpio3", + IOMUX_WIDTH_4BIT, + IOMUX_WIDTH_4BIT, + IOMUX_WIDTH_4BIT, + IOMUX_WIDTH_4BIT), + PIN_BANK_IOMUX_FLAGS(4, 2, "gpio4", + IOMUX_WIDTH_4BIT, 0, 0, 0), +}; + +static struct rockchip_pin_ctrl rv1126_pin_ctrl = { + .pin_banks = rv1126_pin_banks, + .nr_banks = ARRAY_SIZE(rv1126_pin_banks), + .label = "RV1126-GPIO", + .type = RV1126, + .grf_mux_offset = 0x10004, /* mux offset from GPIO0_D0 */ + .pmu_mux_offset = 0x0, + .iomux_routes = rv1126_mux_route_data, + .niomux_routes = ARRAY_SIZE(rv1126_mux_route_data), + .iomux_recalced = rv1126_mux_recalced_data, + .niomux_recalced = ARRAY_SIZE(rv1126_mux_recalced_data), + .pull_calc_reg = rv1126_calc_pull_reg_and_bit, + .drv_calc_reg = rv1126_calc_drv_reg_and_bit, + .schmitt_calc_reg = rv1126_calc_schmitt_reg_and_bit, +}; + static struct rockchip_pin_bank rk2928_pin_banks[] = { PIN_BANK(0, 32, "gpio0"), PIN_BANK(1, 32, "gpio1"), @@ -3568,6 +3885,8 @@ static const struct of_device_id rockchip_pinctrl_dt_match[] = { .data = &px30_pin_ctrl }, { .compatible = "rockchip,rv1108-pinctrl", .data = &rv1108_pin_ctrl }, + { .compatible = "rockchip,rv1126-pinctrl", + .data = &rv1126_pin_ctrl }, { .compatible = 
"rockchip,rk2928-pinctrl", .data = &rk2928_pin_ctrl }, { .compatible = "rockchip,rk3036-pinctrl", diff --git a/drivers/pinctrl/pinctrl-rockchip.h b/drivers/pinctrl/pinctrl-rockchip.h index ec46f8815ac90..4759f336941ef 100644 --- a/drivers/pinctrl/pinctrl-rockchip.h +++ b/drivers/pinctrl/pinctrl-rockchip.h @@ -186,6 +186,7 @@ enum rockchip_pinctrl_type { PX30, RV1108, + RV1126, RK2928, RK3066B, RK3128, -- GitLab From 0ca6e30e4dd1d97416a7febcc8bf06f72e19f063 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Fri, 5 Aug 2022 14:21:59 +0200 Subject: [PATCH 0033/2223] pinctrl: armada-37xx: Add missing GPIO-only pins MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gpio1_5 and gpio2_2 are GPIO-only pins. Add them into MPP groups table so they are properly exported as valid pin numbers. Fixes: 87466ccd9401 ("pinctrl: armada-37xx: Add pin controller support for Armada 37xx") Signed-off-by: Pali Rohár Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220805122202.23174-1-pali@kernel.org Signed-off-by: Linus Walleij --- drivers/pinctrl/mvebu/pinctrl-armada-37xx.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c index bcde042d29dc3..2a9425847a922 100644 --- a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c +++ b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c @@ -122,6 +122,16 @@ struct armada_37xx_pinctrl { .funcs = {_func1, _func2} \ } +#define PIN_GRP_GPIO_0(_name, _start, _nr) \ + { \ + .name = _name, \ + .start_pin = _start, \ + .npins = _nr, \ + .reg_mask = 0, \ + .val = {0}, \ + .funcs = {"gpio"} \ + } + #define PIN_GRP_GPIO(_name, _start, _nr, _mask, _func1) \ { \ .name = _name, \ @@ -179,6 +189,7 @@ static struct armada_37xx_pin_group armada_37xx_nb_groups[] = { "pwm", "led"), PIN_GRP_GPIO("pmic1", 7, 1, BIT(7), "pmic"), PIN_GRP_GPIO("pmic0", 6, 1, BIT(8), "pmic"), + PIN_GRP_GPIO_0("gpio1_5", 5, 1), 
PIN_GRP_GPIO("i2c2", 2, 2, BIT(9), "i2c"), PIN_GRP_GPIO("i2c1", 0, 2, BIT(10), "i2c"), PIN_GRP_GPIO("spi_cs1", 17, 1, BIT(12), "spi"), @@ -195,6 +206,7 @@ static struct armada_37xx_pin_group armada_37xx_nb_groups[] = { static struct armada_37xx_pin_group armada_37xx_sb_groups[] = { PIN_GRP_GPIO("usb32_drvvbus0", 0, 1, BIT(0), "drvbus"), PIN_GRP_GPIO("usb2_drvvbus1", 1, 1, BIT(1), "drvbus"), + PIN_GRP_GPIO_0("gpio2_2", 2, 1), PIN_GRP_GPIO("sdio_sb", 24, 6, BIT(2), "sdio"), PIN_GRP_GPIO("rgmii", 6, 12, BIT(3), "mii"), PIN_GRP_GPIO("smi", 18, 2, BIT(4), "smi"), -- GitLab From 2fa9933d685ee9bcab056c81ef5f7fa242ba90e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Fri, 5 Aug 2022 14:22:00 +0200 Subject: [PATCH 0034/2223] pinctrl: armada-37xx: Fix definitions for MPP pins 20-22 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All 3 MPP pins (20, 21 and 22) can be configured individually and also can be configured to GPIO functions. Fix definitions for these MPP pins in existing pin groups. After this change GPIO function can be enabled just for one of these 3 pins. 
Fixes: 87466ccd9401 ("pinctrl: armada-37xx: Add pin controller support for Armada 37xx") Signed-off-by: Pali Rohár Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220805122202.23174-2-pali@kernel.org Signed-off-by: Linus Walleij --- drivers/pinctrl/mvebu/pinctrl-armada-37xx.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c index 2a9425847a922..3a39c670615f3 100644 --- a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c +++ b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c @@ -213,9 +213,11 @@ static struct armada_37xx_pin_group armada_37xx_sb_groups[] = { PIN_GRP_GPIO("pcie1", 3, 1, BIT(5), "pcie"), /* this actually controls "pcie1_reset" */ PIN_GRP_GPIO("pcie1_clkreq", 4, 1, BIT(9), "pcie"), PIN_GRP_GPIO("pcie1_wakeup", 5, 1, BIT(10), "pcie"), - PIN_GRP_GPIO("ptp", 20, 3, BIT(11) | BIT(12) | BIT(13), "ptp"), - PIN_GRP("ptp_clk", 21, 1, BIT(6), "ptp", "mii"), - PIN_GRP("ptp_trig", 22, 1, BIT(7), "ptp", "mii"), + PIN_GRP_GPIO("ptp", 20, 1, BIT(11), "ptp"), + PIN_GRP_GPIO_3("ptp_clk", 21, 1, BIT(6) | BIT(12), 0, BIT(6), BIT(12), + "ptp", "mii"), + PIN_GRP_GPIO_3("ptp_trig", 22, 1, BIT(7) | BIT(13), 0, BIT(7), BIT(13), + "ptp", "mii"), PIN_GRP_GPIO_3("mii_col", 23, 1, BIT(8) | BIT(14), 0, BIT(8), BIT(14), "mii", "mii_err"), }; -- GitLab From 6b262b32faf0abf74062e2e2b72cbbea4572b9f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Fri, 5 Aug 2022 14:22:01 +0200 Subject: [PATCH 0035/2223] pinctrl: armada-37xx: Checks for errors in gpio_request_enable callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now when all MPP pins are properly defined and every MPP pin has GPIO function, always checks for errors in armada_37xx_gpio_request_enable() function when calling armada_37xx_pmx_set_by_name(). 
Function armada_37xx_pmx_set_by_name() should not return "not supported" error anymore for any GPIO pin when requesting GPIO mode. Fixes: 87466ccd9401 ("pinctrl: armada-37xx: Add pin controller support for Armada 37xx") Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20220805122202.23174-3-pali@kernel.org Signed-off-by: Linus Walleij --- drivers/pinctrl/mvebu/pinctrl-armada-37xx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c index 3a39c670615f3..7f5665e598bf1 100644 --- a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c +++ b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c @@ -500,11 +500,15 @@ static int armada_37xx_gpio_request_enable(struct pinctrl_dev *pctldev, struct armada_37xx_pinctrl *info = pinctrl_dev_get_drvdata(pctldev); struct armada_37xx_pin_group *group; int grp = 0; + int ret; dev_dbg(info->dev, "requesting gpio %d\n", offset); - while ((group = armada_37xx_find_next_grp_by_pin(info, offset, &grp))) - armada_37xx_pmx_set_by_name(pctldev, "gpio", group); + while ((group = armada_37xx_find_next_grp_by_pin(info, offset, &grp))) { + ret = armada_37xx_pmx_set_by_name(pctldev, "gpio", group); + if (ret) + return ret; + } return 0; } -- GitLab From 599e465d11a5621063bc5db2d222081716dc3403 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Fri, 5 Aug 2022 14:22:02 +0200 Subject: [PATCH 0036/2223] pinctrl: armada-37xx: Remove unused macro PIN_GRP() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Macro PIN_GRP() is not used, remove it. 
Signed-off-by: Pali Rohár Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220805122202.23174-4-pali@kernel.org Signed-off-by: Linus Walleij --- drivers/pinctrl/mvebu/pinctrl-armada-37xx.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c index 7f5665e598bf1..261b46841b9f6 100644 --- a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c +++ b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c @@ -112,16 +112,6 @@ struct armada_37xx_pinctrl { struct armada_37xx_pm_state pm; }; -#define PIN_GRP(_name, _start, _nr, _mask, _func1, _func2) \ - { \ - .name = _name, \ - .start_pin = _start, \ - .npins = _nr, \ - .reg_mask = _mask, \ - .val = {0, _mask}, \ - .funcs = {_func1, _func2} \ - } - #define PIN_GRP_GPIO_0(_name, _start, _nr) \ { \ .name = _name, \ -- GitLab From 27586b851bae62296b77687a58a8c92ab84d5274 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 10 Aug 2022 10:16:34 -0600 Subject: [PATCH 0037/2223] dt-bindings: pinctrl: aspeed: Add missing properties to examples The aspeed pinctrl parent node (SCU) in the examples is missing various properties. Add the properties in preparation for the SCU schema. 
Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220810161635.73936-2-robh@kernel.org Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml | 6 ++++++ .../devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml | 4 ++++ .../devicetree/bindings/pinctrl/aspeed,ast2600-pinctrl.yaml | 6 ++++++ 3 files changed, 16 insertions(+) diff --git a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml index d3a8911728d03..f4f1ee6b116e8 100644 --- a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml @@ -63,6 +63,12 @@ examples: syscon: scu@1e6e2000 { compatible = "aspeed,ast2400-scu", "syscon", "simple-mfd"; reg = <0x1e6e2000 0x1a8>; + #clock-cells = <1>; + #reset-cells = <1>; + + #address-cells = <1>; + #size-cells = <1>; + ranges = <0x0 0x1e6e2000 0x1000>; pinctrl: pinctrl { compatible = "aspeed,ast2400-pinctrl"; diff --git a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml index 5d2c1b1fb7fd0..8168f00884710 100644 --- a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml @@ -82,6 +82,10 @@ examples: #clock-cells = <1>; #reset-cells = <1>; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0x0 0x1e6e2000 0x1000>; + pinctrl: pinctrl { compatible = "aspeed,ast2500-pinctrl"; aspeed,external-nodes = <&gfx>, <&lhc>; diff --git a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2600-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2600-pinctrl.yaml index e92686d2f0620..62424c42c9819 100644 --- a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2600-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2600-pinctrl.yaml @@ 
-96,6 +96,12 @@ examples: syscon: scu@1e6e2000 { compatible = "aspeed,ast2600-scu", "syscon", "simple-mfd"; reg = <0x1e6e2000 0xf6c>; + #clock-cells = <1>; + #reset-cells = <1>; + + #address-cells = <1>; + #size-cells = <1>; + ranges = <0x0 0x1e6e2000 0x1000>; pinctrl: pinctrl { compatible = "aspeed,ast2600-pinctrl"; -- GitLab From a9da7251ac8bcc2f2358513868f1903ac2809b3d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Aug 2022 17:14:58 -0700 Subject: [PATCH 0038/2223] Input: gameport - move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20220818210156.8143-1-wsa+renesas@sang-engineering.com Signed-off-by: Dmitry Torokhov --- include/linux/gameport.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/gameport.h b/include/linux/gameport.h index 69081d899492e..8c2f00018e896 100644 --- a/include/linux/gameport.h +++ b/include/linux/gameport.h @@ -110,7 +110,7 @@ static inline void gameport_free_port(struct gameport *gameport) static inline void gameport_set_name(struct gameport *gameport, const char *name) { - strlcpy(gameport->name, name, sizeof(gameport->name)); + strscpy(gameport->name, name, sizeof(gameport->name)); } /* -- GitLab From 6611656736f8f2b94767f5999e78400370d84480 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 14 Jul 2022 09:13:41 +0200 Subject: [PATCH 0039/2223] dt-bindings: PCI: qcom: Enumerate platforms with single msi interrupt Explicitly enumerate the older platforms that have a single msi host interrupt. This allows for adding further platforms with, for example, four msi interrupts without resorting to nested conditionals. Drop the redundant comment about older chipsets instead of moving it. 
Link: https://lore.kernel.org/r/20220714071348.6792-2-johan+linaro@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Lorenzo Pieralisi Reviewed-by: Manivannan Sadhasivam Acked-by: Krzysztof Kozlowski Acked-by: Stanimir Varbanov --- .../devicetree/bindings/pci/qcom,pcie.yaml | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie.yaml index 7d29e2a45183e..ea388113f04a5 100644 --- a/Documentation/devicetree/bindings/pci/qcom,pcie.yaml +++ b/Documentation/devicetree/bindings/pci/qcom,pcie.yaml @@ -625,7 +625,6 @@ allOf: - reset-names # Newer chipsets support either 1 or 8 MSI vectors - # On older chipsets it's always 1 MSI vector - if: properties: compatible: @@ -660,7 +659,21 @@ allOf: - const: msi5 - const: msi6 - const: msi7 - else: + + - if: + properties: + compatible: + contains: + enum: + - qcom,pcie-apq8064 + - qcom,pcie-apq8084 + - qcom,pcie-ipq4019 + - qcom,pcie-ipq6018 + - qcom,pcie-ipq8064 + - qcom,pcie-ipq8064-v2 + - qcom,pcie-ipq8074 + - qcom,pcie-qcs404 + then: properties: interrupts: maxItems: 1 -- GitLab From 76d777ae045e345ccfbf2d7c873674de09a8a041 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 14 Jul 2022 09:13:42 +0200 Subject: [PATCH 0040/2223] dt-bindings: PCI: qcom: Add SC8280XP to binding Add the SC8280XP platform to the binding. SC8280XP uses four host interrupts for MSI routing so remove the obsolete comment referring to newer chipsets supporting one or eight interrupts (e.g. for backwards compatibility).
Link: https://lore.kernel.org/r/20220714071348.6792-3-johan+linaro@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Lorenzo Pieralisi Acked-by: Krzysztof Kozlowski Acked-by: Stanimir Varbanov --- .../devicetree/bindings/pci/qcom,pcie.yaml | 50 ++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie.yaml index ea388113f04a5..577d166a7476e 100644 --- a/Documentation/devicetree/bindings/pci/qcom,pcie.yaml +++ b/Documentation/devicetree/bindings/pci/qcom,pcie.yaml @@ -27,6 +27,7 @@ properties: - qcom,pcie-qcs404 - qcom,pcie-sc7280 - qcom,pcie-sc8180x + - qcom,pcie-sc8280xp - qcom,pcie-sdm845 - qcom,pcie-sm8150 - qcom,pcie-sm8250 @@ -181,6 +182,7 @@ allOf: enum: - qcom,pcie-sc7280 - qcom,pcie-sc8180x + - qcom,pcie-sc8280xp - qcom,pcie-sm8250 - qcom,pcie-sm8450-pcie0 - qcom,pcie-sm8450-pcie1 @@ -596,6 +598,35 @@ allOf: items: - const: pci # PCIe core reset + - if: + properties: + compatible: + contains: + enum: + - qcom,pcie-sc8280xp + then: + properties: + clocks: + minItems: 8 + maxItems: 9 + clock-names: + minItems: 8 + items: + - const: aux # Auxiliary clock + - const: cfg # Configuration clock + - const: bus_master # Master AXI clock + - const: bus_slave # Slave AXI clock + - const: slave_q2a # Slave Q2A clock + - const: ddrss_sf_tbu # PCIe SF TBU clock + - const: noc_aggr_4 # NoC aggregate 4 clock + - const: noc_aggr_south_sf # NoC aggregate South SF clock + - const: cnoc_qx # Configuration NoC QX clock + resets: + maxItems: 1 + reset-names: + items: + - const: pci # PCIe core reset + - if: not: properties: @@ -624,7 +655,6 @@ allOf: - resets - reset-names - # Newer chipsets support either 1 or 8 MSI vectors - if: properties: compatible: @@ -660,6 +690,24 @@ allOf: - const: msi6 - const: msi7 + - if: + properties: + compatible: + contains: + enum: + - qcom,pcie-sc8280xp + then: + properties: + interrupts: + minItems: 4 + maxItems: 4 + 
interrupt-names: + items: + - const: msi0 + - const: msi1 + - const: msi2 + - const: msi3 + - if: properties: compatible: -- GitLab From 76c4207f4085f00d03c96c72c528ee0810692f57 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 14 Jul 2022 09:13:43 +0200 Subject: [PATCH 0041/2223] dt-bindings: PCI: qcom: Add SA8540P to binding SA8540P is a new platform related to SC8280XP but which uses a single host interrupt for MSI routing. Link: https://lore.kernel.org/r/20220714071348.6792-4-johan+linaro@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Lorenzo Pieralisi Reviewed-by: Brian Masney Acked-by: Krzysztof Kozlowski Acked-by: Stanimir Varbanov --- Documentation/devicetree/bindings/pci/qcom,pcie.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie.yaml index 577d166a7476e..22a2aac4c23f6 100644 --- a/Documentation/devicetree/bindings/pci/qcom,pcie.yaml +++ b/Documentation/devicetree/bindings/pci/qcom,pcie.yaml @@ -25,6 +25,7 @@ properties: - qcom,pcie-ipq4019 - qcom,pcie-ipq8074 - qcom,pcie-qcs404 + - qcom,pcie-sa8540p - qcom,pcie-sc7280 - qcom,pcie-sc8180x - qcom,pcie-sc8280xp @@ -603,6 +604,7 @@ allOf: compatible: contains: enum: + - qcom,pcie-sa8540p - qcom,pcie-sc8280xp then: properties: @@ -721,6 +723,7 @@ allOf: - qcom,pcie-ipq8064-v2 - qcom,pcie-ipq8074 - qcom,pcie-qcs404 + - qcom,pcie-sa8540p then: properties: interrupts: -- GitLab From 70574511f3fc2eea360043aaf7fcbbe4b1ea22b9 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 14 Jul 2022 09:13:44 +0200 Subject: [PATCH 0042/2223] PCI: qcom: Add support for SC8280XP The SC8280XP platform has seven PCIe controllers: two used with USB4, two 4-lane, two 2-lane and one 1-lane. Add a new "qcom,pcie-sc8280xp" compatible string and reuse the 1.9.0 ops. Note that the SC8280XP controllers need two or three interconnect clocks to be enabled. 
Model these as optional clocks to avoid encoding devicetree data in the PCIe driver. Note that the same could be done for the SM8450 interconnect clocks and possibly also for the TBU clocks. Link: https://lore.kernel.org/r/20220714071348.6792-5-johan+linaro@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Lorenzo Pieralisi Reviewed-by: Manivannan Sadhasivam Acked-by: Stanimir Varbanov --- drivers/pci/controller/dwc/pcie-qcom.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index 66886dc6e777f..11841f2fae9bd 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -180,7 +180,7 @@ struct qcom_pcie_resources_2_3_3 { /* 6 clocks typically, 7 for sm8250 */ struct qcom_pcie_resources_2_7_0 { - struct clk_bulk_data clks[9]; + struct clk_bulk_data clks[12]; int num_clks; struct regulator_bulk_data supplies[2]; struct reset_control *pci_reset; @@ -1175,6 +1175,7 @@ static int qcom_pcie_get_resources_2_7_0(struct qcom_pcie *pcie) struct qcom_pcie_resources_2_7_0 *res = &pcie->res.v2_7_0; struct dw_pcie *pci = pcie->pci; struct device *dev = pci->dev; + unsigned int num_clks, num_opt_clks; unsigned int idx; int ret; @@ -1204,9 +1205,20 @@ static int qcom_pcie_get_resources_2_7_0(struct qcom_pcie *pcie) if (pcie->cfg->has_aggre1_clk) res->clks[idx++].id = "aggre1"; + num_clks = idx; + + ret = devm_clk_bulk_get(dev, num_clks, res->clks); + if (ret < 0) + return ret; + + res->clks[idx++].id = "noc_aggr_4"; + res->clks[idx++].id = "noc_aggr_south_sf"; + res->clks[idx++].id = "cnoc_qx"; + + num_opt_clks = idx - num_clks; res->num_clks = idx; - ret = devm_clk_bulk_get(dev, res->num_clks, res->clks); + ret = devm_clk_bulk_get_optional(dev, num_opt_clks, res->clks + num_clks); if (ret < 0) return ret; @@ -1621,6 +1633,11 @@ static const struct qcom_pcie_cfg ipq4019_cfg = { .ops = &ops_2_4_0, }; +static const struct 
qcom_pcie_cfg sc8280xp_cfg = { + .ops = &ops_1_9_0, + .has_ddrss_sf_tbu_clk = true, +}; + static const struct qcom_pcie_cfg sdm845_cfg = { .ops = &ops_2_7_0, .has_tbu_clk = true, @@ -1773,6 +1790,7 @@ static const struct of_device_id qcom_pcie_match[] = { { .compatible = "qcom,pcie-sm8150", .data = &sm8150_cfg }, { .compatible = "qcom,pcie-sm8250", .data = &sm8250_cfg }, { .compatible = "qcom,pcie-sc8180x", .data = &sc8180x_cfg }, + { .compatible = "qcom,pcie-sc8280xp", .data = &sc8280xp_cfg }, { .compatible = "qcom,pcie-sm8450-pcie0", .data = &sm8450_pcie0_cfg }, { .compatible = "qcom,pcie-sm8450-pcie1", .data = &sm8450_pcie1_cfg }, { .compatible = "qcom,pcie-sc7280", .data = &sc7280_cfg }, -- GitLab From c64f56d0857a28ad9f4e5b6e68877a6b05660073 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 14 Jul 2022 09:13:45 +0200 Subject: [PATCH 0043/2223] PCI: qcom: Add support for SA8540P The SA8540P platform has five PCIe controllers: two 4-lane, two 2-lane and one 1-lane. Add a new "qcom,pcie-sa8540p" compatible string and reuse the 1.9.0 ops. Note that like for SC8280XP, the SA8540P controllers need two or three interconnect clocks to be enabled. 
Link: https://lore.kernel.org/r/20220714071348.6792-6-johan+linaro@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Reviewed-by: Manivannan Sadhasivam Reviewed-by: Brian Masney Acked-by: Stanimir Varbanov --- drivers/pci/controller/dwc/pcie-qcom.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index 11841f2fae9bd..260961f5808eb 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -1633,6 +1633,11 @@ static const struct qcom_pcie_cfg ipq4019_cfg = { .ops = &ops_2_4_0, }; +static const struct qcom_pcie_cfg sa8540p_cfg = { + .ops = &ops_1_9_0, + .has_ddrss_sf_tbu_clk = true, +}; + static const struct qcom_pcie_cfg sc8280xp_cfg = { .ops = &ops_1_9_0, .has_ddrss_sf_tbu_clk = true, @@ -1786,6 +1791,7 @@ static const struct of_device_id qcom_pcie_match[] = { { .compatible = "qcom,pcie-ipq8074", .data = &ipq8074_cfg }, { .compatible = "qcom,pcie-ipq4019", .data = &ipq4019_cfg }, { .compatible = "qcom,pcie-qcs404", .data = &ipq4019_cfg }, + { .compatible = "qcom,pcie-sa8540p", .data = &sa8540p_cfg }, { .compatible = "qcom,pcie-sdm845", .data = &sdm845_cfg }, { .compatible = "qcom,pcie-sm8150", .data = &sm8150_cfg }, { .compatible = "qcom,pcie-sm8250", .data = &sm8250_cfg }, -- GitLab From 014aa3518a5826b88a601f5de867551db5c73855 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 14 Jul 2022 09:13:46 +0200 Subject: [PATCH 0044/2223] PCI: qcom: Make all optional clocks optional The kernel is not a devicetree validator and does not need to re-encode information which is already available in the devicetree. This is specifically true for the optional PCIe clocks, some of which are really interconnect clocks. 
Treat also the 2.7.0 optional clocks as truly optional instead of maintaining a list of clocks per compatible (including two compatible strings for the two identical controllers on sm8450) just to validate the devicetree. Link: https://lore.kernel.org/r/20220714071348.6792-7-johan+linaro@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Reviewed-by: Manivannan Sadhasivam Reviewed-by: Dmitry Baryshkov Reviewed-by: Brian Masney Acked-by: Stanimir Varbanov --- drivers/pci/controller/dwc/pcie-qcom.c | 28 ++++---------------------- 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index 260961f5808eb..e7e3aa15d2924 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -215,10 +215,6 @@ struct qcom_pcie_ops { struct qcom_pcie_cfg { const struct qcom_pcie_ops *ops; - unsigned int has_tbu_clk:1; - unsigned int has_ddrss_sf_tbu_clk:1; - unsigned int has_aggre0_clk:1; - unsigned int has_aggre1_clk:1; }; struct qcom_pcie { @@ -1196,14 +1192,6 @@ static int qcom_pcie_get_resources_2_7_0(struct qcom_pcie *pcie) res->clks[idx++].id = "bus_master"; res->clks[idx++].id = "bus_slave"; res->clks[idx++].id = "slave_q2a"; - if (pcie->cfg->has_tbu_clk) - res->clks[idx++].id = "tbu"; - if (pcie->cfg->has_ddrss_sf_tbu_clk) - res->clks[idx++].id = "ddrss_sf_tbu"; - if (pcie->cfg->has_aggre0_clk) - res->clks[idx++].id = "aggre0"; - if (pcie->cfg->has_aggre1_clk) - res->clks[idx++].id = "aggre1"; num_clks = idx; @@ -1211,6 +1199,10 @@ static int qcom_pcie_get_resources_2_7_0(struct qcom_pcie *pcie) if (ret < 0) return ret; + res->clks[idx++].id = "tbu"; + res->clks[idx++].id = "ddrss_sf_tbu"; + res->clks[idx++].id = "aggre0"; + res->clks[idx++].id = "aggre1"; res->clks[idx++].id = "noc_aggr_4"; res->clks[idx++].id = "noc_aggr_south_sf"; res->clks[idx++].id = "cnoc_qx"; @@ -1635,17 +1627,14 @@ static const 
struct qcom_pcie_cfg ipq4019_cfg = { static const struct qcom_pcie_cfg sa8540p_cfg = { .ops = &ops_1_9_0, - .has_ddrss_sf_tbu_clk = true, }; static const struct qcom_pcie_cfg sc8280xp_cfg = { .ops = &ops_1_9_0, - .has_ddrss_sf_tbu_clk = true, }; static const struct qcom_pcie_cfg sdm845_cfg = { .ops = &ops_2_7_0, - .has_tbu_clk = true, }; static const struct qcom_pcie_cfg sm8150_cfg = { @@ -1657,31 +1646,22 @@ static const struct qcom_pcie_cfg sm8150_cfg = { static const struct qcom_pcie_cfg sm8250_cfg = { .ops = &ops_1_9_0, - .has_tbu_clk = true, - .has_ddrss_sf_tbu_clk = true, }; static const struct qcom_pcie_cfg sm8450_pcie0_cfg = { .ops = &ops_1_9_0, - .has_ddrss_sf_tbu_clk = true, - .has_aggre0_clk = true, - .has_aggre1_clk = true, }; static const struct qcom_pcie_cfg sm8450_pcie1_cfg = { .ops = &ops_1_9_0, - .has_ddrss_sf_tbu_clk = true, - .has_aggre1_clk = true, }; static const struct qcom_pcie_cfg sc7280_cfg = { .ops = &ops_1_9_0, - .has_tbu_clk = true, }; static const struct qcom_pcie_cfg sc8180x_cfg = { .ops = &ops_1_9_0, - .has_tbu_clk = true, }; static const struct qcom_pcie_cfg ipq6018_cfg = { -- GitLab From 223117350636e20a86fa540e9b53804194939057 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 14 Jul 2022 09:13:47 +0200 Subject: [PATCH 0045/2223] PCI: qcom: Clean up IP configurations The various IP versions have different configurations that are encoded in separate sets of operation callbacks. Currently, there is no need for also maintaining corresponding sets of data parameters, but it is conceivable that these may again be found useful (e.g. to implement minor variations of the operation callbacks). Rename the default configuration structures after the IP version they apply to so that they can more easily be reused by different SoCs. Note that SoC specific configurations can be added later if need arises (e.g. cfg_sc8280xp). 
Link: https://lore.kernel.org/r/20220714071348.6792-8-johan+linaro@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Reviewed-by: Manivannan Sadhasivam Reviewed-by: Dmitry Baryshkov Reviewed-by: Brian Masney Acked-by: Stanimir Varbanov --- drivers/pci/controller/dwc/pcie-qcom.c | 89 +++++++++----------------- 1 file changed, 29 insertions(+), 60 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index e7e3aa15d2924..ade3704ba6ea5 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -1605,66 +1605,35 @@ static const struct qcom_pcie_ops ops_2_9_0 = { .ltssm_enable = qcom_pcie_2_3_2_ltssm_enable, }; -static const struct qcom_pcie_cfg apq8084_cfg = { +static const struct qcom_pcie_cfg cfg_1_0_0 = { .ops = &ops_1_0_0, }; -static const struct qcom_pcie_cfg ipq8064_cfg = { +static const struct qcom_pcie_cfg cfg_1_9_0 = { + .ops = &ops_1_9_0, +}; + +static const struct qcom_pcie_cfg cfg_2_1_0 = { .ops = &ops_2_1_0, }; -static const struct qcom_pcie_cfg msm8996_cfg = { +static const struct qcom_pcie_cfg cfg_2_3_2 = { .ops = &ops_2_3_2, }; -static const struct qcom_pcie_cfg ipq8074_cfg = { +static const struct qcom_pcie_cfg cfg_2_3_3 = { .ops = &ops_2_3_3, }; -static const struct qcom_pcie_cfg ipq4019_cfg = { +static const struct qcom_pcie_cfg cfg_2_4_0 = { .ops = &ops_2_4_0, }; -static const struct qcom_pcie_cfg sa8540p_cfg = { - .ops = &ops_1_9_0, -}; - -static const struct qcom_pcie_cfg sc8280xp_cfg = { - .ops = &ops_1_9_0, -}; - -static const struct qcom_pcie_cfg sdm845_cfg = { +static const struct qcom_pcie_cfg cfg_2_7_0 = { .ops = &ops_2_7_0, }; -static const struct qcom_pcie_cfg sm8150_cfg = { - /* sm8150 has qcom IP rev 1.5.0. However 1.5.0 ops are same as - * 1.9.0, so reuse the same. 
- */ - .ops = &ops_1_9_0, -}; - -static const struct qcom_pcie_cfg sm8250_cfg = { - .ops = &ops_1_9_0, -}; - -static const struct qcom_pcie_cfg sm8450_pcie0_cfg = { - .ops = &ops_1_9_0, -}; - -static const struct qcom_pcie_cfg sm8450_pcie1_cfg = { - .ops = &ops_1_9_0, -}; - -static const struct qcom_pcie_cfg sc7280_cfg = { - .ops = &ops_1_9_0, -}; - -static const struct qcom_pcie_cfg sc8180x_cfg = { - .ops = &ops_1_9_0, -}; - -static const struct qcom_pcie_cfg ipq6018_cfg = { +static const struct qcom_pcie_cfg cfg_2_9_0 = { .ops = &ops_2_9_0, }; @@ -1763,24 +1732,24 @@ err_pm_runtime_put: } static const struct of_device_id qcom_pcie_match[] = { - { .compatible = "qcom,pcie-apq8084", .data = &apq8084_cfg }, - { .compatible = "qcom,pcie-ipq8064", .data = &ipq8064_cfg }, - { .compatible = "qcom,pcie-ipq8064-v2", .data = &ipq8064_cfg }, - { .compatible = "qcom,pcie-apq8064", .data = &ipq8064_cfg }, - { .compatible = "qcom,pcie-msm8996", .data = &msm8996_cfg }, - { .compatible = "qcom,pcie-ipq8074", .data = &ipq8074_cfg }, - { .compatible = "qcom,pcie-ipq4019", .data = &ipq4019_cfg }, - { .compatible = "qcom,pcie-qcs404", .data = &ipq4019_cfg }, - { .compatible = "qcom,pcie-sa8540p", .data = &sa8540p_cfg }, - { .compatible = "qcom,pcie-sdm845", .data = &sdm845_cfg }, - { .compatible = "qcom,pcie-sm8150", .data = &sm8150_cfg }, - { .compatible = "qcom,pcie-sm8250", .data = &sm8250_cfg }, - { .compatible = "qcom,pcie-sc8180x", .data = &sc8180x_cfg }, - { .compatible = "qcom,pcie-sc8280xp", .data = &sc8280xp_cfg }, - { .compatible = "qcom,pcie-sm8450-pcie0", .data = &sm8450_pcie0_cfg }, - { .compatible = "qcom,pcie-sm8450-pcie1", .data = &sm8450_pcie1_cfg }, - { .compatible = "qcom,pcie-sc7280", .data = &sc7280_cfg }, - { .compatible = "qcom,pcie-ipq6018", .data = &ipq6018_cfg }, + { .compatible = "qcom,pcie-apq8084", .data = &cfg_1_0_0 }, + { .compatible = "qcom,pcie-ipq8064", .data = &cfg_2_1_0 }, + { .compatible = "qcom,pcie-ipq8064-v2", .data = &cfg_2_1_0 }, + { 
.compatible = "qcom,pcie-apq8064", .data = &cfg_2_1_0 }, + { .compatible = "qcom,pcie-msm8996", .data = &cfg_2_3_2 }, + { .compatible = "qcom,pcie-ipq8074", .data = &cfg_2_3_3 }, + { .compatible = "qcom,pcie-ipq4019", .data = &cfg_2_4_0 }, + { .compatible = "qcom,pcie-qcs404", .data = &cfg_2_4_0 }, + { .compatible = "qcom,pcie-sa8540p", .data = &cfg_1_9_0 }, + { .compatible = "qcom,pcie-sdm845", .data = &cfg_2_7_0 }, + { .compatible = "qcom,pcie-sm8150", .data = &cfg_1_9_0 }, + { .compatible = "qcom,pcie-sm8250", .data = &cfg_1_9_0 }, + { .compatible = "qcom,pcie-sc8180x", .data = &cfg_1_9_0 }, + { .compatible = "qcom,pcie-sc8280xp", .data = &cfg_1_9_0 }, + { .compatible = "qcom,pcie-sm8450-pcie0", .data = &cfg_1_9_0 }, + { .compatible = "qcom,pcie-sm8450-pcie1", .data = &cfg_1_9_0 }, + { .compatible = "qcom,pcie-sc7280", .data = &cfg_1_9_0 }, + { .compatible = "qcom,pcie-ipq6018", .data = &cfg_2_9_0 }, { } }; -- GitLab From d6cbfcd24443e51fb596fdbf25679d61052a3f84 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 14 Jul 2022 09:13:48 +0200 Subject: [PATCH 0046/2223] PCI: qcom: Sort device-id table Sort the device-id table entries alphabetically by compatible string to make it easier to find entries and add new ones. 
Link: https://lore.kernel.org/r/20220714071348.6792-9-johan+linaro@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Lorenzo Pieralisi Reviewed-by: Brian Masney Acked-by: Stanimir Varbanov --- drivers/pci/controller/dwc/pcie-qcom.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index ade3704ba6ea5..39ca06ffe6149 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -1732,24 +1732,24 @@ err_pm_runtime_put: } static const struct of_device_id qcom_pcie_match[] = { + { .compatible = "qcom,pcie-apq8064", .data = &cfg_2_1_0 }, { .compatible = "qcom,pcie-apq8084", .data = &cfg_1_0_0 }, + { .compatible = "qcom,pcie-ipq4019", .data = &cfg_2_4_0 }, + { .compatible = "qcom,pcie-ipq6018", .data = &cfg_2_9_0 }, { .compatible = "qcom,pcie-ipq8064", .data = &cfg_2_1_0 }, { .compatible = "qcom,pcie-ipq8064-v2", .data = &cfg_2_1_0 }, - { .compatible = "qcom,pcie-apq8064", .data = &cfg_2_1_0 }, - { .compatible = "qcom,pcie-msm8996", .data = &cfg_2_3_2 }, { .compatible = "qcom,pcie-ipq8074", .data = &cfg_2_3_3 }, - { .compatible = "qcom,pcie-ipq4019", .data = &cfg_2_4_0 }, + { .compatible = "qcom,pcie-msm8996", .data = &cfg_2_3_2 }, { .compatible = "qcom,pcie-qcs404", .data = &cfg_2_4_0 }, { .compatible = "qcom,pcie-sa8540p", .data = &cfg_1_9_0 }, + { .compatible = "qcom,pcie-sc7280", .data = &cfg_1_9_0 }, + { .compatible = "qcom,pcie-sc8180x", .data = &cfg_1_9_0 }, + { .compatible = "qcom,pcie-sc8280xp", .data = &cfg_1_9_0 }, { .compatible = "qcom,pcie-sdm845", .data = &cfg_2_7_0 }, { .compatible = "qcom,pcie-sm8150", .data = &cfg_1_9_0 }, { .compatible = "qcom,pcie-sm8250", .data = &cfg_1_9_0 }, - { .compatible = "qcom,pcie-sc8180x", .data = &cfg_1_9_0 }, - { .compatible = "qcom,pcie-sc8280xp", .data = &cfg_1_9_0 }, { .compatible = "qcom,pcie-sm8450-pcie0", .data = &cfg_1_9_0 }, { .compatible = "qcom,pcie-sm8450-pcie1", 
.data = &cfg_1_9_0 }, - { .compatible = "qcom,pcie-sc7280", .data = &cfg_1_9_0 }, - { .compatible = "qcom,pcie-ipq6018", .data = &cfg_2_9_0 }, { } }; -- GitLab From 2e379ac66d4b734ba0e6dbdbc20f774d91be090b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Fri, 12 Aug 2022 16:11:15 +0200 Subject: [PATCH 0047/2223] PCI: mvebu: Fix endianness when accessing PCI emul bridge members MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI emul bridge members iolimitupper, iobaseupper, memlimit and membase are of type __le16, so correctly access these members using le16_to_cpu() macros. Link: https://lore.kernel.org/r/20220812141115.24082-1-pali@kernel.org Fixes: e7a01876729c ("PCI: mvebu: Propagate errors when updating PCI_IO_BASE and PCI_MEM_BASE registers") Reported-by: kernel test robot Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/pci-mvebu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/pci-mvebu.c b/drivers/pci/controller/pci-mvebu.c index af915c951f066..3639327c7cd17 100644 --- a/drivers/pci/controller/pci-mvebu.c +++ b/drivers/pci/controller/pci-mvebu.c @@ -523,7 +523,7 @@ static int mvebu_pcie_handle_iobase_change(struct mvebu_pcie_port *port) /* Are the new iobase/iolimit values invalid? */ if (conf->iolimit < conf->iobase || - conf->iolimitupper < conf->iobaseupper) + le16_to_cpu(conf->iolimitupper) < le16_to_cpu(conf->iobaseupper)) return mvebu_pcie_set_window(port, port->io_target, port->io_attr, &desired, &port->iowin); @@ -535,10 +535,10 @@ static int mvebu_pcie_handle_iobase_change(struct mvebu_pcie_port *port) * is the CPU address. 
*/ desired.remap = ((conf->iobase & 0xF0) << 8) | - (conf->iobaseupper << 16); + (le16_to_cpu(conf->iobaseupper) << 16); desired.base = port->pcie->io.start + desired.remap; desired.size = ((0xFFF | ((conf->iolimit & 0xF0) << 8) | - (conf->iolimitupper << 16)) - + (le16_to_cpu(conf->iolimitupper) << 16)) - desired.remap) + 1; @@ -552,7 +552,7 @@ static int mvebu_pcie_handle_membase_change(struct mvebu_pcie_port *port) struct pci_bridge_emul_conf *conf = &port->bridge.conf; /* Are the new membase/memlimit values invalid? */ - if (conf->memlimit < conf->membase) + if (le16_to_cpu(conf->memlimit) < le16_to_cpu(conf->membase)) return mvebu_pcie_set_window(port, port->mem_target, port->mem_attr, &desired, &port->memwin); @@ -562,8 +562,8 @@ static int mvebu_pcie_handle_membase_change(struct mvebu_pcie_port *port) * window to setup, according to the PCI-to-PCI bridge * specifications. */ - desired.base = ((conf->membase & 0xFFF0) << 16); - desired.size = (((conf->memlimit & 0xFFF0) << 16) | 0xFFFFF) - + desired.base = ((le16_to_cpu(conf->membase) & 0xFFF0) << 16); + desired.size = (((le16_to_cpu(conf->memlimit) & 0xFFF0) << 16) | 0xFFFFF) - desired.base + 1; return mvebu_pcie_set_window(port, port->mem_target, port->mem_attr, &desired, -- GitLab From 034fdac01fe5184e63d8af901ddb9c9a329f6902 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Thu, 5 May 2022 10:39:07 +0200 Subject: [PATCH 0048/2223] PCI: mediatek-gen3: Change driver name to mtk-pcie-gen3 driver_register() will refuse to register another driver with the same name. This change allows pcie-mediatek-gen3 to coexist with pcie-mediatek built into the kernel. 
Link: https://lore.kernel.org/r/20220505083907.86598-1-nbd@nbd.name Fixes: d3bf75b579b9 ("PCI: mediatek-gen3: Add MediaTek Gen3 driver for MT8192") Signed-off-by: Felix Fietkau Signed-off-by: Lorenzo Pieralisi Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Jianjun Wang --- drivers/pci/controller/pcie-mediatek-gen3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pcie-mediatek-gen3.c b/drivers/pci/controller/pcie-mediatek-gen3.c index 11cdb9b6f1094..b8612ce5f4d0c 100644 --- a/drivers/pci/controller/pcie-mediatek-gen3.c +++ b/drivers/pci/controller/pcie-mediatek-gen3.c @@ -1071,7 +1071,7 @@ static struct platform_driver mtk_pcie_driver = { .probe = mtk_pcie_probe, .remove = mtk_pcie_remove, .driver = { - .name = "mtk-pcie", + .name = "mtk-pcie-gen3", .of_match_table = mtk_pcie_of_match, .pm = &mtk_pcie_pm_ops, }, -- GitLab From 7f08e806a03e0453a0de27137b668d4de52fcd49 Mon Sep 17 00:00:00 2001 From: Jianjun Wang Date: Tue, 2 Aug 2022 20:06:24 +0800 Subject: [PATCH 0049/2223] dt-bindings: PCI: mediatek-gen3: Add support for MT8188 and MT8195 MT8188 and MT8195 are ARM platform SoCs with the same PCIe IP as MT8192. Also add new clock name "peri_mem" since the MT8188 and MT8195 use clock "peri_mem" instead of "top_133m". 
Link: https://lore.kernel.org/r/20220802120624.19258-1-jianjun.wang@mediatek.com Signed-off-by: Jianjun Wang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Krzysztof Kozlowski --- .../devicetree/bindings/pci/mediatek-pcie-gen3.yaml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml b/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml index 0499b94627aea..c00be39af64e5 100644 --- a/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml +++ b/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml @@ -48,7 +48,13 @@ allOf: properties: compatible: - const: mediatek,mt8192-pcie + oneOf: + - items: + - enum: + - mediatek,mt8188-pcie + - mediatek,mt8195-pcie + - const: mediatek,mt8192-pcie + - const: mediatek,mt8192-pcie reg: maxItems: 1 @@ -84,7 +90,9 @@ properties: - const: tl_96m - const: tl_32k - const: peri_26m - - const: top_133m + - enum: + - top_133m # for MT8192 + - peri_mem # for MT8188/MT8195 assigned-clocks: maxItems: 1 @@ -126,6 +134,7 @@ required: - interrupts - ranges - clocks + - clock-names - '#interrupt-cells' - interrupt-controller -- GitLab From 07ae9278b423500f93e10869b1a50276d82050ec Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 16 Aug 2022 16:18:25 +0200 Subject: [PATCH 0050/2223] rtc: mpfs: Remove printing of stray CR During boot, the driver prints out a stray carriage return character. Remove it, together with the preceding space character. While at it, change prescaler to "unsigned long", as returned by clk_get_rate(), to avoid truncating very large clock rates, and update the format specifiers. 
Fixes: 0b31d703598dc199 ("rtc: Add driver for Microchip PolarFire SoC") Signed-off-by: Geert Uytterhoeven Reviewed-by: Conor Dooley Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/bce2ca405ef96b1363fd1370887409d9e8468422.1660659437.git.geert+renesas@glider.be --- drivers/rtc/rtc-mpfs.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/rtc/rtc-mpfs.c b/drivers/rtc/rtc-mpfs.c index f14d1925e0c94..944ad10365164 100644 --- a/drivers/rtc/rtc-mpfs.c +++ b/drivers/rtc/rtc-mpfs.c @@ -233,7 +233,7 @@ static int mpfs_rtc_probe(struct platform_device *pdev) { struct mpfs_rtc_dev *rtcdev; struct clk *clk; - u32 prescaler; + unsigned long prescaler; int wakeup_irq, ret; rtcdev = devm_kzalloc(&pdev->dev, sizeof(struct mpfs_rtc_dev), GFP_KERNEL); @@ -275,14 +275,13 @@ static int mpfs_rtc_probe(struct platform_device *pdev) /* prescaler hardware adds 1 to reg value */ prescaler = clk_get_rate(devm_clk_get(&pdev->dev, "rtcref")) - 1; - if (prescaler > MAX_PRESCALER_COUNT) { - dev_dbg(&pdev->dev, "invalid prescaler %d\n", prescaler); + dev_dbg(&pdev->dev, "invalid prescaler %lu\n", prescaler); return -EINVAL; } writel(prescaler, rtcdev->base + PRESCALER_REG); - dev_info(&pdev->dev, "prescaler set to: 0x%X \r\n", prescaler); + dev_info(&pdev->dev, "prescaler set to: %lu\n", prescaler); device_init_wakeup(&pdev->dev, true); ret = dev_pm_set_wake_irq(&pdev->dev, wakeup_irq); -- GitLab From f2c5671a64d2a79341e8ee45d5933f6a76960189 Mon Sep 17 00:00:00 2001 From: Bryan Brattlof Date: Tue, 16 Aug 2022 12:33:11 -0500 Subject: [PATCH 0051/2223] rtc: k3: wait until the unlock field is not zero After writing the magic words to the KICK0 and KICK1 registers, we must wait for a 1 in the unlock field of the general control register to signify when the rtc device is in an unlocked state. 
Signed-off-by: Bryan Brattlof Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220816173312.23243-1-bb@ti.com --- drivers/rtc/rtc-ti-k3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-ti-k3.c b/drivers/rtc/rtc-ti-k3.c index 7a0f181d3fefe..fd26be7868d25 100644 --- a/drivers/rtc/rtc-ti-k3.c +++ b/drivers/rtc/rtc-ti-k3.c @@ -190,7 +190,7 @@ static int k3rtc_unlock_rtc(struct ti_k3_rtc *priv) /* Skip fence since we are going to check the unlock bit as fence */ ret = regmap_field_read_poll_timeout(priv->r_fields[K3RTC_UNLOCK], ret, - !ret, 2, priv->sync_timeout_us); + ret, 2, priv->sync_timeout_us); return ret; } -- GitLab From 1e2585b49d849196f359bbf86677943fe2d80afe Mon Sep 17 00:00:00 2001 From: Bryan Brattlof Date: Tue, 16 Aug 2022 12:33:12 -0500 Subject: [PATCH 0052/2223] rtc: k3: detect SoC to determine erratum fix To allow new SoCs to use this device without a new compatible string, use a soc_device_attribute list to define all SoCs affected by the TI i2327 erratum and require help from their bootloaders to unlock this device. 
Signed-off-by: Bryan Brattlof Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220816173312.23243-2-bb@ti.com --- drivers/rtc/rtc-ti-k3.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/drivers/rtc/rtc-ti-k3.c b/drivers/rtc/rtc-ti-k3.c index fd26be7868d25..68e50c6a72f1d 100644 --- a/drivers/rtc/rtc-ti-k3.c +++ b/drivers/rtc/rtc-ti-k3.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -45,14 +46,6 @@ #define K3RTC_MIN_OFFSET (-277761) #define K3RTC_MAX_OFFSET (277778) -/** - * struct ti_k3_rtc_soc_data - Private of compatible data for ti-k3-rtc - * @unlock_irq_erratum: Has erratum for unlock infinite IRQs (erratum i2327) - */ -struct ti_k3_rtc_soc_data { - const bool unlock_irq_erratum; -}; - static const struct regmap_config ti_k3_rtc_regmap_config = { .name = "peripheral-registers", .reg_bits = 32, @@ -118,7 +111,6 @@ static const struct reg_field ti_rtc_reg_fields[] = { * @rtc_dev: rtc device * @regmap: rtc mmio regmap * @r_fields: rtc register fields - * @soc: SoC compatible match data */ struct ti_k3_rtc { unsigned int irq; @@ -127,7 +119,6 @@ struct ti_k3_rtc { struct rtc_device *rtc_dev; struct regmap *regmap; struct regmap_field *r_fields[K3_RTC_MAX_FIELDS]; - const struct ti_k3_rtc_soc_data *soc; }; static int k3rtc_field_read(struct ti_k3_rtc *priv, enum ti_k3_rtc_fields f) @@ -195,6 +186,17 @@ static int k3rtc_unlock_rtc(struct ti_k3_rtc *priv) return ret; } +/* + * This is the list of SoCs affected by TI's i2327 errata causing the RTC + * state-machine to break if not unlocked fast enough during boot. These + * SoCs must have the bootloader unlock this device very early in the + * boot-flow before we (Linux) can use this device. 
+ */ +static const struct soc_device_attribute has_erratum_i2327[] = { + { .family = "AM62X", .revision = "SR1.0" }, + { /* sentinel */ } +}; + static int k3rtc_configure(struct device *dev) { int ret; @@ -208,7 +210,7 @@ static int k3rtc_configure(struct device *dev) * * In such occurrence, it is assumed that the RTC module is unusable */ - if (priv->soc->unlock_irq_erratum) { + if (soc_device_match(has_erratum_i2327)) { ret = k3rtc_check_unlocked(priv); /* If there is an error OR if we are locked, return error */ if (ret) { @@ -602,8 +604,6 @@ static int ti_k3_rtc_probe(struct platform_device *pdev) if (IS_ERR(priv->rtc_dev)) return PTR_ERR(priv->rtc_dev); - priv->soc = of_device_get_match_data(dev); - priv->rtc_dev->ops = &ti_k3_rtc_ops; priv->rtc_dev->range_max = (1ULL << 48) - 1; /* 48Bit seconds */ ti_k3_rtc_nvmem_config.priv = priv; @@ -635,12 +635,8 @@ static int ti_k3_rtc_probe(struct platform_device *pdev) return devm_rtc_nvmem_register(priv->rtc_dev, &ti_k3_rtc_nvmem_config); } -static const struct ti_k3_rtc_soc_data ti_k3_am62_data = { - .unlock_irq_erratum = true, -}; - static const struct of_device_id ti_k3_rtc_of_match_table[] = { - {.compatible = "ti,am62-rtc", .data = &ti_k3_am62_data}, + {.compatible = "ti,am62-rtc" }, {} }; MODULE_DEVICE_TABLE(of, ti_k3_rtc_of_match_table); -- GitLab From 509451ac03eb3afa4c4a32d4c11b1938f08de8e4 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Tue, 23 Aug 2022 15:07:02 +0200 Subject: [PATCH 0053/2223] rtc: gamecube: Always reset HW_SRNPROT after read This register would fail to be reset if reading the RTC bias failed for whichever reason. This commit reorganises the code around to unconditionally write it back to its previous value, unmap it, and return the result of regmap_read(), which makes it both simpler and more correct in the error case. 
Signed-off-by: Emmanuel Gil Peyrot Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220823130702.1046-1-linkmauve@linkmauve.fr --- drivers/rtc/rtc-gamecube.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/rtc/rtc-gamecube.c b/drivers/rtc/rtc-gamecube.c index c2717bb52b2be..c828bc8e05b9c 100644 --- a/drivers/rtc/rtc-gamecube.c +++ b/drivers/rtc/rtc-gamecube.c @@ -265,18 +265,17 @@ static int gamecube_rtc_read_offset_from_sram(struct priv *d) * SRAM address as on previous consoles. */ ret = regmap_read(d->regmap, RTC_SRAM_BIAS, &d->rtc_bias); - if (ret) { - pr_err("failed to get the RTC bias\n"); - iounmap(hw_srnprot); - return -1; - } /* Reset SRAM access to how it was before, our job here is done. */ if (old != 0x7bf) iowrite32be(old, hw_srnprot); + iounmap(hw_srnprot); - return 0; + if (ret) + pr_err("failed to get the RTC bias\n"); + + return ret; } static const struct regmap_range rtc_rd_ranges[] = { -- GitLab From 25bcfaad5ec4e82aede4270d4925967f8520d4cf Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 15 Aug 2022 18:59:23 +0200 Subject: [PATCH 0054/2223] rtc: mxc: Use devm_clk_get_enabled() helper The devm_clk_get_enabled() helper: - calls devm_clk_get() - calls clk_prepare_enable() and registers what is needed in order to call clk_disable_unprepare() when needed, as a managed resource. This simplifies the code, the error handling paths and avoid the need of a dedicated function used with devm_add_action_or_reset(). 
Based on my test with allyesconfig, this reduces the .o size from: text data bss dec hex filename 6705 1968 0 8673 21e1 drivers/rtc/rtc-mxc.o down to: 6212 1968 0 8180 1ff4 drivers/rtc/rtc-mxc.o Signed-off-by: Christophe JAILLET Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/1b5ad1877304b01ddbba73ca615274a52f781aa2.1660582728.git.christophe.jaillet@wanadoo.fr --- drivers/rtc/rtc-mxc.c | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index 53d4e253e81f0..762cf03345f14 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -291,14 +291,6 @@ static const struct rtc_class_ops mxc_rtc_ops = { .alarm_irq_enable = mxc_rtc_alarm_irq_enable, }; -static void mxc_rtc_action(void *p) -{ - struct rtc_plat_data *pdata = p; - - clk_disable_unprepare(pdata->clk_ref); - clk_disable_unprepare(pdata->clk_ipg); -} - static int mxc_rtc_probe(struct platform_device *pdev) { struct rtc_device *rtc; @@ -341,33 +333,18 @@ static int mxc_rtc_probe(struct platform_device *pdev) rtc->range_max = (1 << 16) * 86400ULL - 1; } - pdata->clk_ipg = devm_clk_get(&pdev->dev, "ipg"); + pdata->clk_ipg = devm_clk_get_enabled(&pdev->dev, "ipg"); if (IS_ERR(pdata->clk_ipg)) { dev_err(&pdev->dev, "unable to get ipg clock!\n"); return PTR_ERR(pdata->clk_ipg); } - ret = clk_prepare_enable(pdata->clk_ipg); - if (ret) - return ret; - - pdata->clk_ref = devm_clk_get(&pdev->dev, "ref"); + pdata->clk_ref = devm_clk_get_enabled(&pdev->dev, "ref"); if (IS_ERR(pdata->clk_ref)) { - clk_disable_unprepare(pdata->clk_ipg); dev_err(&pdev->dev, "unable to get ref clock!\n"); return PTR_ERR(pdata->clk_ref); } - ret = clk_prepare_enable(pdata->clk_ref); - if (ret) { - clk_disable_unprepare(pdata->clk_ipg); - return ret; - } - - ret = devm_add_action_or_reset(&pdev->dev, mxc_rtc_action, pdata); - if (ret) - return ret; - rate = clk_get_rate(pdata->clk_ref); if (rate == 32768) -- GitLab From 
b408fad61d34c765c3e01895286332af2d50402a Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Sat, 20 Aug 2022 00:14:10 +0100 Subject: [PATCH 0055/2223] dt-bindings: PCI: fu740-pci: fix missing clock-names The commit b92225b034c0 ("dt-bindings: PCI: designware: Fix 'unevaluatedProperties' warnings") removed the clock-names property as a requirement and from the example as it triggered unevaluatedProperty warnings. dtbs_check was not able to pick up on this at the time, but now can: arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dtb: pcie@e00000000: Unevaluated properties are not allowed ('clock-names' was unexpected) From schema: linux/Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml The property was already in use by the FU740 DTS and the clock must be enabled. The Linux and FreeBSD drivers require the property to enable the clocks correctly Re-add the property and its "clocks" dependency, while making it required. Link: https://lore.kernel.org/r/20220819231415.3860210-2-mail@conchuod.ie Fixes: b92225b034c0 ("dt-bindings: PCI: designware: Fix 'unevaluatedProperties' warnings") Fixes: 43cea116be0b ("dt-bindings: PCI: Add SiFive FU740 PCIe host controller") Signed-off-by: Conor Dooley Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../devicetree/bindings/pci/sifive,fu740-pcie.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml b/Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml index 195e6afeb1694..844fc71423020 100644 --- a/Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml +++ b/Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml @@ -51,6 +51,12 @@ properties: description: A phandle to the PCIe power up reset line. maxItems: 1 + clocks: + maxItems: 1 + + clock-names: + const: pcie_aux + pwren-gpios: description: Should specify the GPIO for controlling the PCI bus device power on. 
maxItems: 1 @@ -66,6 +72,7 @@ required: - interrupt-map-mask - interrupt-map - clocks + - clock-names - resets - pwren-gpios - reset-gpios @@ -104,6 +111,7 @@ examples: <0x0 0x0 0x0 0x2 &plic0 58>, <0x0 0x0 0x0 0x3 &plic0 59>, <0x0 0x0 0x0 0x4 &plic0 60>; + clock-names = "pcie_aux"; clocks = <&prci FU740_PRCI_CLK_PCIE_AUX>; resets = <&prci 4>; pwren-gpios = <&gpio 5 0>; -- GitLab From 05a5741019a524ab9e1d355528c8ebcbd6debfe7 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Sat, 20 Aug 2022 00:14:11 +0100 Subject: [PATCH 0056/2223] dt-bindings: PCI: microchip,pcie-host: fix missing clocks properties Recent versions of dt-schema warn about unevaluatedProperties: arch/riscv/boot/dts/microchip/mpfs-icicle-kit.dtb: pcie@2000000000: Unevaluated properties are not allowed ('clock-names', 'clocks', 'legacy-interrupt-controller', 'microchip,axi-m-atr0' were unexpected) From schema: Documentation/devicetree/bindings/pci/microchip,pcie-host.yaml The clocks are required to enable interfaces between the FPGA fabric and the core complex, so add them to the binding. Link: https://lore.kernel.org/r/20220819231415.3860210-3-mail@conchuod.ie Fixes: 6ee6c89aac35 ("dt-bindings: PCI: microchip: Add Microchip PolarFire host binding") Signed-off-by: Conor Dooley Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../bindings/pci/microchip,pcie-host.yaml | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/microchip,pcie-host.yaml b/Documentation/devicetree/bindings/pci/microchip,pcie-host.yaml index edb4f81253c8e..6fbe62f4da937 100644 --- a/Documentation/devicetree/bindings/pci/microchip,pcie-host.yaml +++ b/Documentation/devicetree/bindings/pci/microchip,pcie-host.yaml @@ -25,6 +25,33 @@ properties: - const: cfg - const: apb + clocks: + description: + Fabric Interface Controllers, FICs, are the interface between the FPGA + fabric and the core complex on PolarFire SoC. 
The FICs require two clocks, + one from each side of the interface. The "FIC clocks" described by this + property are on the core complex side & communication through a FIC is not + possible unless its corresponding clock is enabled. A clock must be + enabled for each of the interfaces the root port is connected through. + This could in theory be all 4 interfaces, one interface or any combination + in between. + minItems: 1 + items: + - description: FIC0's clock + - description: FIC1's clock + - description: FIC2's clock + - description: FIC3's clock + + clock-names: + description: + As any FIC connection combination is possible, the names should match the + order in the clocks property and take the form "ficN" where N is a number + 0-3 + minItems: 1 + maxItems: 4 + items: + pattern: '^fic[0-3]$' + interrupts: minItems: 1 items: -- GitLab From 1a7966b33b5bbefd950cffef1ea8ee3f5f1bf076 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Sat, 20 Aug 2022 00:14:12 +0100 Subject: [PATCH 0057/2223] dt-bindings: PCI: microchip,pcie-host: fix missing dma-ranges The dma-ranges property was missed when adding the binding initially. The root port can use up to 6 address translation tables, depending on configuration.
Link: https://www.microsemi.com/document-portal/doc_download/1245812-polarfire-fpga-and-polarfire-soc-fpga-pci-express-user-guide # Section 1.3.3 Link: https://lore.kernel.org/r/20220819231415.3860210-4-mail@conchuod.ie Fixes: 6ee6c89aac35 ("dt-bindings: PCI: microchip: Add Microchip PolarFire host binding") Signed-off-by: Conor Dooley Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../devicetree/bindings/pci/microchip,pcie-host.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/microchip,pcie-host.yaml b/Documentation/devicetree/bindings/pci/microchip,pcie-host.yaml index 6fbe62f4da937..23d95c65acff8 100644 --- a/Documentation/devicetree/bindings/pci/microchip,pcie-host.yaml +++ b/Documentation/devicetree/bindings/pci/microchip,pcie-host.yaml @@ -67,6 +67,10 @@ properties: ranges: maxItems: 1 + dma-ranges: + minItems: 1 + maxItems: 6 + msi-controller: description: Identifies the node as an MSI controller. -- GitLab From 28a71499744133614da6ca1f9adc4d4044d6f417 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Fri, 19 Aug 2022 07:38:17 +0530 Subject: [PATCH 0058/2223] MAINTAINERS: Add Manivannan Sadhasivam as PCI Endpoint reviewer I've been reviewing the patches related to PCI Endpoint Subsystem for some time. So I'd like to add myself as the reviewer to get immediate attention to the patches. 
Link: https://lore.kernel.org/r/20220819020817.197844-1-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam Signed-off-by: Bjorn Helgaas Acked-by: Kishon Vijay Abraham I --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8a5012ba6ff98..f60dfac7661c4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15682,6 +15682,7 @@ PCI ENDPOINT SUBSYSTEM M: Kishon Vijay Abraham I M: Lorenzo Pieralisi R: Krzysztof Wilczyński +R: Manivannan Sadhasivam L: linux-pci@vger.kernel.org S: Supported Q: https://patchwork.kernel.org/project/linux-pci/list/ -- GitLab From 8d39e55e52c10f78967d6d029631601fcc8a0121 Mon Sep 17 00:00:00 2001 From: Patrick Rudolph Date: Tue, 16 Aug 2022 07:49:14 +0200 Subject: [PATCH 0059/2223] dt-binding: pinctrl: Add cypress,cy8c95x0 Added device tree binding documentation for Cypress CY8C95x0 I2C pin-controller. Signed-off-by: Patrick Rudolph Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220816054917.7893-2-patrick.rudolph@9elements.com Signed-off-by: Linus Walleij --- .../bindings/pinctrl/cypress,cy8c95x0.yaml | 134 ++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 Documentation/devicetree/bindings/pinctrl/cypress,cy8c95x0.yaml diff --git a/Documentation/devicetree/bindings/pinctrl/cypress,cy8c95x0.yaml b/Documentation/devicetree/bindings/pinctrl/cypress,cy8c95x0.yaml new file mode 100644 index 0000000000000..915cbbcc35550 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/cypress,cy8c95x0.yaml @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/cypress,cy8c95x0.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Cypress CY8C95X0 I2C GPIO expander + +maintainers: + - Patrick Rudolph + +description: | + This supports the 20/40/60 pin Cypress CY8C95x0 GPIO I2C expanders. + Pin function configuration is performed on a per-pin basis.
+ +properties: + compatible: + enum: + - cypress,cy8c9520 + - cypress,cy8c9540 + - cypress,cy8c9560 + + reg: + maxItems: 1 + + gpio-controller: true + + '#gpio-cells': + description: + The first cell is the GPIO number and the second cell specifies GPIO + flags, as defined in . + const: 2 + + interrupts: + maxItems: 1 + + interrupt-controller: true + + '#interrupt-cells': + const: 2 + + gpio-line-names: true + + gpio-ranges: + maxItems: 1 + + gpio-reserved-ranges: + maxItems: 1 + + vdd-supply: + description: + Optional power supply. + +patternProperties: + '-pins$': + type: object + description: + Pinctrl node's client devices use subnodes for desired pin configuration. + Client device subnodes use below standard properties. + $ref: pincfg-node.yaml# + + properties: + pins: + description: + List of gpio pins affected by the properties specified in this + subnode. + items: + pattern: '^gp([0-7][0-7])$' + minItems: 1 + maxItems: 60 + + function: + description: + Specify the alternative function to be configured for the specified + pins. 
+ enum: [ gpio, pwm ] + + bias-pull-down: true + + bias-pull-up: true + + bias-disable: true + + output-high: true + + output-low: true + + drive-push-pull: true + + drive-open-drain: true + + drive-open-source: true + + required: + - pins + - function + + additionalProperties: false + +required: + - compatible + - reg + - interrupts + - interrupt-controller + - '#interrupt-cells' + - gpio-controller + - '#gpio-cells' + +additionalProperties: false + +allOf: + - $ref: "pinctrl.yaml#" + +examples: + - | + #include + #include + + i2c { + #address-cells = <1>; + #size-cells = <0>; + + pinctrl@20 { + compatible = "cypress,cy8c9520"; + reg = <0x20>; + gpio-controller; + #gpio-cells = <2>; + #interrupt-cells = <2>; + interrupts = ; + interrupt-controller; + vdd-supply = <&p3v3>; + gpio-reserved-ranges = <5 1>; + }; + }; -- GitLab From e6cbbe42944de93ba4e0785b4f90d284b1d7cdf6 Mon Sep 17 00:00:00 2001 From: Patrick Rudolph Date: Tue, 16 Aug 2022 07:49:15 +0200 Subject: [PATCH 0060/2223] pinctrl: Add Cypress cy8c95x0 support Add support for cypress I2C GPIO expanders cy8c9520, cy8c9540 and cy8c9560. The GPIO expanders feature a PWM mode, thus add it as pinctrl driver. The chip features multiple drive modes for each pin when configured as output and multiple bias settings when configured as input. Tested all three components and verified that all functionality is fully working. 
Datasheet: https://www.cypress.com/file/37971/download Signed-off-by: Patrick Rudolph Signed-off-by: Naresh Solanki Link: https://lore.kernel.org/r/20220816054917.7893-3-patrick.rudolph@9elements.com Signed-off-by: Linus Walleij --- MAINTAINERS | 6 + drivers/pinctrl/Kconfig | 14 + drivers/pinctrl/Makefile | 1 + drivers/pinctrl/pinctrl-cy8c95x0.c | 1381 ++++++++++++++++++++++++++++ 4 files changed, 1402 insertions(+) create mode 100644 drivers/pinctrl/pinctrl-cy8c95x0.c diff --git a/MAINTAINERS b/MAINTAINERS index 8a5012ba6ff98..131299c18f029 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5629,6 +5629,12 @@ Q: http://patchwork.linuxtv.org/project/linux-media/list/ T: git git://linuxtv.org/anttip/media_tree.git F: drivers/media/common/cypress_firmware* +CYPRESS CY8C95X0 PINCTRL DRIVER +M: Patrick Rudolph +L: linux-gpio@vger.kernel.org +S: Maintained +F: drivers/pinctrl/pinctrl-cy8c95x0.c + CYPRESS CY8CTMA140 TOUCHSCREEN DRIVER M: Linus Walleij L: linux-input@vger.kernel.org diff --git a/drivers/pinctrl/Kconfig b/drivers/pinctrl/Kconfig index 1cf74b0c42e56..fc0e529e633ff 100644 --- a/drivers/pinctrl/Kconfig +++ b/drivers/pinctrl/Kconfig @@ -135,6 +135,20 @@ config PINCTRL_BM1880 help Pinctrl driver for Bitmain BM1880 SoC. +config PINCTRL_CY8C95X0 + tristate "Cypress CY8C95X0 I2C pinctrl and GPIO driver" + depends on I2C && OF + select GPIOLIB + select GPIOLIB_IRQCHIP + select PINMUX + select PINCONF + select GENERIC_PINCONF + select REGMAP_I2C + help + Support for 20/40/60 pin Cypress Cy8C95x0 pinctrl/gpio I2C expander. + This driver can also be built as a module. If so, the module will be + called pinctrl-cy8c95x0. 
+ config PINCTRL_DA850_PUPD tristate "TI DA850/OMAP-L138/AM18XX pull-up and pull-down groups" depends on OF && (ARCH_DAVINCI_DA850 || COMPILE_TEST) diff --git a/drivers/pinctrl/Makefile b/drivers/pinctrl/Makefile index e76f5cdc64b0e..7188dab7eec88 100644 --- a/drivers/pinctrl/Makefile +++ b/drivers/pinctrl/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_PINCTRL_AT91) += pinctrl-at91.o obj-$(CONFIG_PINCTRL_AT91PIO4) += pinctrl-at91-pio4.o obj-$(CONFIG_PINCTRL_AXP209) += pinctrl-axp209.o obj-$(CONFIG_PINCTRL_BM1880) += pinctrl-bm1880.o +obj-$(CONFIG_PINCTRL_CY8C95X0) += pinctrl-cy8c95x0.o obj-$(CONFIG_PINCTRL_DA850_PUPD) += pinctrl-da850-pupd.o obj-$(CONFIG_PINCTRL_DA9062) += pinctrl-da9062.o obj-$(CONFIG_PINCTRL_DIGICOLOR) += pinctrl-digicolor.o diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c new file mode 100644 index 0000000000000..a29df0920f4f4 --- /dev/null +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -0,0 +1,1381 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * CY8C95X0 20/40/60 pin I2C GPIO port expander with interrupt support + * + * Copyright (C) 2022 9elements GmbH + * Author: Patrick Rudolph + * Author: Naresh Solanki + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Fast access registers */ +#define CY8C95X0_INPUT 0x00 +#define CY8C95X0_OUTPUT 0x08 +#define CY8C95X0_INTSTATUS 0x10 + +#define CY8C95X0_INPUT_(x) (CY8C95X0_INPUT + (x)) +#define CY8C95X0_OUTPUT_(x) (CY8C95X0_OUTPUT + (x)) +#define CY8C95X0_INTSTATUS_(x) (CY8C95X0_INTSTATUS + (x)) + +/* Port Select configures the port */ +#define CY8C95X0_PORTSEL 0x18 +/* port settings, write PORTSEL first */ +#define CY8C95X0_INTMASK 0x19 +#define CY8C95X0_PWMSEL 0x1A +#define CY8C95X0_INVERT 0x1B +#define CY8C95X0_DIRECTION 0x1C +/* Drive mode register change state on writing '1' */ +#define CY8C95X0_DRV_PU 0x1D +#define CY8C95X0_DRV_PD 0x1E +#define 
CY8C95X0_DRV_ODH 0x1F +#define CY8C95X0_DRV_ODL 0x20 +#define CY8C95X0_DRV_PP_FAST 0x21 +#define CY8C95X0_DRV_PP_SLOW 0x22 +#define CY8C95X0_DRV_HIZ 0x23 +#define CY8C95X0_DEVID 0x2E +#define CY8C95X0_WATCHDOG 0x2F +#define CY8C95X0_COMMAND 0x30 + +#define CY8C95X0_PIN_TO_OFFSET(x) (((x) >= 20) ? ((x) + 4) : (x)) + +static const struct i2c_device_id cy8c95x0_id[] = { + { "cy8c9520", 20, }, + { "cy8c9540", 40, }, + { "cy8c9560", 60, }, + { } +}; +MODULE_DEVICE_TABLE(i2c, cy8c95x0_id); + +#define OF_CY8C95X(__nrgpio) ((void *)(__nrgpio)) + +static const struct of_device_id cy8c95x0_dt_ids[] = { + { .compatible = "cypress,cy8c9520", .data = OF_CY8C95X(20), }, + { .compatible = "cypress,cy8c9540", .data = OF_CY8C95X(40), }, + { .compatible = "cypress,cy8c9560", .data = OF_CY8C95X(60), }, + { } +}; + +MODULE_DEVICE_TABLE(of, cy8c95x0_dt_ids); + +#define MAX_BANK 8 +#define BANK_SZ 8 +#define MAX_LINE (MAX_BANK * BANK_SZ) + +#define CY8C95X0_GPIO_MASK GENMASK(7, 0) + +/** + * struct cy8c95x0_pinctrl - driver data + * @regmap: Device's regmap + * @irq_lock: IRQ bus lock + * @i2c_lock: Mutex for the device internal mux register + * @irq_mask: I/O bits affected by interrupts + * @irq_trig_raise: I/O bits affected by raising voltage level + * @irq_trig_fall: I/O bits affected by falling voltage level + * @irq_trig_low: I/O bits affected by a low voltage level + * @irq_trig_high: I/O bits affected by a high voltage level + * @push_pull: I/O bits configured as push pull driver + * @shiftmask: Mask used to compensate for Gport2 width + * @irq_chip: IRQ chip configuration + * @nport: Number of Gports in this chip + * @gpio_chip: gpiolib chip + * @driver_data: private driver data + * @regulator: Pointer to the regulator for the IC + * @dev: struct device + * @pctldev: pin controller device + * @pinctrl_desc: pin controller description + * @name: Chip controller name + * @tpin: Total number of pins + */ +struct cy8c95x0_pinctrl { + struct regmap *regmap; + struct mutex irq_lock; + 
struct mutex i2c_lock; + DECLARE_BITMAP(irq_mask, MAX_LINE); + DECLARE_BITMAP(irq_trig_raise, MAX_LINE); + DECLARE_BITMAP(irq_trig_fall, MAX_LINE); + DECLARE_BITMAP(irq_trig_low, MAX_LINE); + DECLARE_BITMAP(irq_trig_high, MAX_LINE); + DECLARE_BITMAP(push_pull, MAX_LINE); + DECLARE_BITMAP(shiftmask, MAX_LINE); + struct irq_chip irq_chip; + int nport; + struct gpio_chip gpio_chip; + unsigned long driver_data; + struct regulator *regulator; + struct device *dev; + struct pinctrl_dev *pctldev; + struct pinctrl_desc pinctrl_desc; + char name[32]; + unsigned int tpin; +}; + +static const struct pinctrl_pin_desc cy8c9560_pins[] = { + PINCTRL_PIN(0, "gp00"), + PINCTRL_PIN(1, "gp01"), + PINCTRL_PIN(2, "gp02"), + PINCTRL_PIN(3, "gp03"), + PINCTRL_PIN(4, "gp04"), + PINCTRL_PIN(5, "gp05"), + PINCTRL_PIN(6, "gp06"), + PINCTRL_PIN(7, "gp07"), + + PINCTRL_PIN(8, "gp10"), + PINCTRL_PIN(9, "gp11"), + PINCTRL_PIN(10, "gp12"), + PINCTRL_PIN(11, "gp13"), + PINCTRL_PIN(12, "gp14"), + PINCTRL_PIN(13, "gp15"), + PINCTRL_PIN(14, "gp16"), + PINCTRL_PIN(15, "gp17"), + + PINCTRL_PIN(16, "gp20"), + PINCTRL_PIN(17, "gp21"), + PINCTRL_PIN(18, "gp22"), + PINCTRL_PIN(19, "gp23"), + + PINCTRL_PIN(20, "gp30"), + PINCTRL_PIN(21, "gp31"), + PINCTRL_PIN(22, "gp32"), + PINCTRL_PIN(23, "gp33"), + PINCTRL_PIN(24, "gp34"), + PINCTRL_PIN(25, "gp35"), + PINCTRL_PIN(26, "gp36"), + PINCTRL_PIN(27, "gp37"), + + PINCTRL_PIN(28, "gp40"), + PINCTRL_PIN(29, "gp41"), + PINCTRL_PIN(30, "gp42"), + PINCTRL_PIN(31, "gp43"), + PINCTRL_PIN(32, "gp44"), + PINCTRL_PIN(33, "gp45"), + PINCTRL_PIN(34, "gp46"), + PINCTRL_PIN(35, "gp47"), + + PINCTRL_PIN(36, "gp50"), + PINCTRL_PIN(37, "gp51"), + PINCTRL_PIN(38, "gp52"), + PINCTRL_PIN(39, "gp53"), + PINCTRL_PIN(40, "gp54"), + PINCTRL_PIN(41, "gp55"), + PINCTRL_PIN(42, "gp56"), + PINCTRL_PIN(43, "gp57"), + + PINCTRL_PIN(44, "gp60"), + PINCTRL_PIN(45, "gp61"), + PINCTRL_PIN(46, "gp62"), + PINCTRL_PIN(47, "gp63"), + PINCTRL_PIN(48, "gp64"), + PINCTRL_PIN(49, "gp65"), + 
PINCTRL_PIN(50, "gp66"), + PINCTRL_PIN(51, "gp67"), + + PINCTRL_PIN(52, "gp70"), + PINCTRL_PIN(53, "gp71"), + PINCTRL_PIN(54, "gp72"), + PINCTRL_PIN(55, "gp73"), + PINCTRL_PIN(56, "gp74"), + PINCTRL_PIN(57, "gp75"), + PINCTRL_PIN(58, "gp76"), + PINCTRL_PIN(59, "gp77"), +}; + +static const char * const cy8c95x0_groups[] = { + "gp00", + "gp01", + "gp02", + "gp03", + "gp04", + "gp05", + "gp06", + "gp07", + + "gp10", + "gp11", + "gp12", + "gp13", + "gp14", + "gp15", + "gp16", + "gp17", + + "gp20", + "gp21", + "gp22", + "gp23", + + "gp30", + "gp31", + "gp32", + "gp33", + "gp34", + "gp35", + "gp36", + "gp37", + + "gp40", + "gp41", + "gp42", + "gp43", + "gp44", + "gp45", + "gp46", + "gp47", + + "gp50", + "gp51", + "gp52", + "gp53", + "gp54", + "gp55", + "gp56", + "gp57", + + "gp60", + "gp61", + "gp62", + "gp63", + "gp64", + "gp65", + "gp66", + "gp67", + + "gp70", + "gp71", + "gp72", + "gp73", + "gp74", + "gp75", + "gp76", + "gp77", +}; + +static inline u8 cypress_get_port(struct cy8c95x0_pinctrl *chip, unsigned int pin) +{ + /* Account for GPORT2 which only has 4 bits */ + return CY8C95X0_PIN_TO_OFFSET(pin) / BANK_SZ; +} + +static int cypress_get_pin_mask(struct cy8c95x0_pinctrl *chip, unsigned int pin) +{ + /* Account for GPORT2 which only has 4 bits */ + return BIT(CY8C95X0_PIN_TO_OFFSET(pin) % BANK_SZ); +} + +static bool cy8c95x0_readable_register(struct device *dev, unsigned int reg) +{ + switch (reg) { + case 0x24 ... 0x27: + return false; + } + + return true; +} + +static bool cy8c95x0_writeable_register(struct device *dev, unsigned int reg) +{ + switch (reg) { + case CY8C95X0_INPUT_(0) ... CY8C95X0_INPUT_(7): + return false; + case CY8C95X0_DEVID: + return false; + case 0x24 ... 0x27: + return false; + } + + return true; +} + +static bool cy8c95x0_volatile_register(struct device *dev, unsigned int reg) +{ + switch (reg) { + case CY8C95X0_INPUT_(0) ... CY8C95X0_INPUT_(7): + case CY8C95X0_INTSTATUS_(0) ... 
CY8C95X0_INTSTATUS_(7): + case CY8C95X0_INTMASK: + case CY8C95X0_INVERT: + case CY8C95X0_PWMSEL: + case CY8C95X0_DIRECTION: + case CY8C95X0_DRV_PU: + case CY8C95X0_DRV_PD: + case CY8C95X0_DRV_ODH: + case CY8C95X0_DRV_ODL: + case CY8C95X0_DRV_PP_FAST: + case CY8C95X0_DRV_PP_SLOW: + case CY8C95X0_DRV_HIZ: + return true; + } + + return false; +} + +static bool cy8c95x0_precious_register(struct device *dev, unsigned int reg) +{ + switch (reg) { + case CY8C95X0_INTSTATUS_(0) ... CY8C95X0_INTSTATUS_(7): + return true; + } + + return false; +} + +static const struct reg_default cy8c95x0_reg_defaults[] = { + { CY8C95X0_OUTPUT_(0), 0xff }, + { CY8C95X0_OUTPUT_(1), 0xff }, + { CY8C95X0_OUTPUT_(2), 0xff }, + { CY8C95X0_OUTPUT_(3), 0xff }, + { CY8C95X0_OUTPUT_(4), 0xff }, + { CY8C95X0_OUTPUT_(5), 0xff }, + { CY8C95X0_OUTPUT_(6), 0xff }, + { CY8C95X0_OUTPUT_(7), 0xff }, + { CY8C95X0_PORTSEL, 0 }, + { CY8C95X0_PWMSEL, 0 }, +}; + +static const struct regmap_config cy8c95x0_i2c_regmap = { + .reg_bits = 8, + .val_bits = 8, + + .reg_defaults = cy8c95x0_reg_defaults, + .num_reg_defaults = ARRAY_SIZE(cy8c95x0_reg_defaults), + + .readable_reg = cy8c95x0_readable_register, + .writeable_reg = cy8c95x0_writeable_register, + .volatile_reg = cy8c95x0_volatile_register, + .precious_reg = cy8c95x0_precious_register, + + .cache_type = REGCACHE_FLAT, + .max_register = CY8C95X0_COMMAND, +}; + +static int cy8c95x0_write_regs_mask(struct cy8c95x0_pinctrl *chip, int reg, + unsigned long *val, unsigned long *mask) +{ + DECLARE_BITMAP(tmask, MAX_LINE); + DECLARE_BITMAP(tval, MAX_LINE); + int write_val; + int ret = 0; + int i, off = 0; + u8 bits; + + /* Add the 4 bit gap of Gport2 */ + bitmap_andnot(tmask, mask, chip->shiftmask, MAX_LINE); + bitmap_shift_left(tmask, tmask, 4, MAX_LINE); + bitmap_replace(tmask, tmask, mask, chip->shiftmask, BANK_SZ * 3); + + bitmap_andnot(tval, val, chip->shiftmask, MAX_LINE); + bitmap_shift_left(tval, tval, 4, MAX_LINE); + bitmap_replace(tval, tval, val, 
chip->shiftmask, BANK_SZ * 3); + + mutex_lock(&chip->i2c_lock); + for (i = 0; i < chip->nport; i++) { + /* Skip over unused banks */ + bits = bitmap_get_value8(tmask, i * BANK_SZ); + if (!bits) + continue; + + switch (reg) { + /* muxed registers */ + case CY8C95X0_INTMASK: + case CY8C95X0_PWMSEL: + case CY8C95X0_INVERT: + case CY8C95X0_DIRECTION: + case CY8C95X0_DRV_PU: + case CY8C95X0_DRV_PD: + case CY8C95X0_DRV_ODH: + case CY8C95X0_DRV_ODL: + case CY8C95X0_DRV_PP_FAST: + case CY8C95X0_DRV_PP_SLOW: + case CY8C95X0_DRV_HIZ: + ret = regmap_write(chip->regmap, CY8C95X0_PORTSEL, i); + if (ret < 0) + goto out; + off = reg; + break; + /* direct access registers */ + case CY8C95X0_INPUT: + case CY8C95X0_OUTPUT: + case CY8C95X0_INTSTATUS: + off = reg + i; + break; + default: + ret = -EINVAL; + goto out; + } + + write_val = bitmap_get_value8(tval, i * BANK_SZ); + + ret = regmap_update_bits(chip->regmap, off, bits, write_val); + if (ret < 0) + goto out; + } +out: + mutex_unlock(&chip->i2c_lock); + + if (ret < 0) + dev_err(chip->dev, "failed writing register %d: err %d\n", off, ret); + + return ret; +} + +static int cy8c95x0_read_regs_mask(struct cy8c95x0_pinctrl *chip, int reg, + unsigned long *val, unsigned long *mask) +{ + DECLARE_BITMAP(tmask, MAX_LINE); + DECLARE_BITMAP(tval, MAX_LINE); + DECLARE_BITMAP(tmp, MAX_LINE); + int read_val; + int ret = 0; + int i, off = 0; + u8 bits; + + /* Add the 4 bit gap of Gport2 */ + bitmap_andnot(tmask, mask, chip->shiftmask, MAX_LINE); + bitmap_shift_left(tmask, tmask, 4, MAX_LINE); + bitmap_replace(tmask, tmask, mask, chip->shiftmask, BANK_SZ * 3); + + bitmap_andnot(tval, val, chip->shiftmask, MAX_LINE); + bitmap_shift_left(tval, tval, 4, MAX_LINE); + bitmap_replace(tval, tval, val, chip->shiftmask, BANK_SZ * 3); + + mutex_lock(&chip->i2c_lock); + for (i = 0; i < chip->nport; i++) { + /* Skip over unused banks */ + bits = bitmap_get_value8(tmask, i * BANK_SZ); + if (!bits) + continue; + + switch (reg) { + /* muxed registers */ + case 
CY8C95X0_INTMASK: + case CY8C95X0_PWMSEL: + case CY8C95X0_INVERT: + case CY8C95X0_DIRECTION: + case CY8C95X0_DRV_PU: + case CY8C95X0_DRV_PD: + case CY8C95X0_DRV_ODH: + case CY8C95X0_DRV_ODL: + case CY8C95X0_DRV_PP_FAST: + case CY8C95X0_DRV_PP_SLOW: + case CY8C95X0_DRV_HIZ: + ret = regmap_write(chip->regmap, CY8C95X0_PORTSEL, i); + if (ret < 0) + goto out; + off = reg; + break; + /* direct access registers */ + case CY8C95X0_INPUT: + case CY8C95X0_OUTPUT: + case CY8C95X0_INTSTATUS: + off = reg + i; + break; + default: + ret = -EINVAL; + goto out; + } + + ret = regmap_read(chip->regmap, off, &read_val); + if (ret < 0) + goto out; + + read_val &= bits; + read_val |= bitmap_get_value8(tval, i * BANK_SZ) & ~bits; + bitmap_set_value8(tval, read_val, i * BANK_SZ); + } + + /* Fill the 4 bit gap of Gport2 */ + bitmap_shift_right(tmp, tval, 4, MAX_LINE); + bitmap_replace(val, tmp, tval, chip->shiftmask, MAX_LINE); + +out: + mutex_unlock(&chip->i2c_lock); + + if (ret < 0) + dev_err(chip->dev, "failed reading register %d: err %d\n", off, ret); + + return ret; +} + +static int cy8c95x0_gpio_direction_input(struct gpio_chip *gc, unsigned int off) +{ + struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + u8 port = cypress_get_port(chip, off); + u8 bit = cypress_get_pin_mask(chip, off); + int ret; + + mutex_lock(&chip->i2c_lock); + ret = regmap_write(chip->regmap, CY8C95X0_PORTSEL, port); + if (ret) + goto out; + + ret = regmap_write_bits(chip->regmap, CY8C95X0_DIRECTION, bit, bit); + if (ret) + goto out; + + if (test_bit(off, chip->push_pull)) { + /* + * Disable driving the pin by forcing it to HighZ. Only setting the + * direction register isn't sufficient in Push-Pull mode. 
+ */ + ret = regmap_write_bits(chip->regmap, CY8C95X0_DRV_HIZ, bit, bit); + if (ret) + goto out; + clear_bit(off, chip->push_pull); + } + +out: + mutex_unlock(&chip->i2c_lock); + + return ret; +} + +static int cy8c95x0_gpio_direction_output(struct gpio_chip *gc, + unsigned int off, int val) +{ + struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + u8 port = cypress_get_port(chip, off); + u8 outreg = CY8C95X0_OUTPUT_(port); + u8 bit = cypress_get_pin_mask(chip, off); + int ret; + + /* set output level */ + ret = regmap_write_bits(chip->regmap, outreg, bit, val ? bit : 0); + if (ret) + return ret; + + mutex_lock(&chip->i2c_lock); + /* select port */ + ret = regmap_write(chip->regmap, CY8C95X0_PORTSEL, port); + if (ret) + goto out; + + /* then direction */ + ret = regmap_write_bits(chip->regmap, CY8C95X0_DIRECTION, bit, 0); + +out: + mutex_unlock(&chip->i2c_lock); + + return ret; +} + +static int cy8c95x0_gpio_get_value(struct gpio_chip *gc, unsigned int off) +{ + struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + u8 inreg = CY8C95X0_INPUT_(cypress_get_port(chip, off)); + u8 bit = cypress_get_pin_mask(chip, off); + u32 reg_val; + int ret; + + ret = regmap_read(chip->regmap, inreg, &reg_val); + if (ret < 0) { + /* + * NOTE: + * diagnostic already emitted; that's all we should + * do unless gpio_*_value_cansleep() calls become different + * from their nonsleeping siblings (and report faults). + */ + return 0; + } + + return !!(reg_val & bit); +} + +static void cy8c95x0_gpio_set_value(struct gpio_chip *gc, unsigned int off, + int val) +{ + struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + u8 outreg = CY8C95X0_OUTPUT_(cypress_get_port(chip, off)); + u8 bit = cypress_get_pin_mask(chip, off); + + regmap_write_bits(chip->regmap, outreg, bit, val ?
bit : 0); +} + +static int cy8c95x0_gpio_get_direction(struct gpio_chip *gc, unsigned int off) +{ + struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + u8 port = cypress_get_port(chip, off); + u8 bit = cypress_get_pin_mask(chip, off); + u32 reg_val; + int ret; + + mutex_lock(&chip->i2c_lock); + + ret = regmap_write(chip->regmap, CY8C95X0_PORTSEL, port); + if (ret < 0) + goto out; + + ret = regmap_read(chip->regmap, CY8C95X0_DIRECTION, &reg_val); + if (ret < 0) + goto out; + + mutex_unlock(&chip->i2c_lock); + + if (reg_val & bit) + return GPIO_LINE_DIRECTION_IN; + + return GPIO_LINE_DIRECTION_OUT; +out: + mutex_unlock(&chip->i2c_lock); + return ret; +} + +static int cy8c95x0_gpio_get_pincfg(struct cy8c95x0_pinctrl *chip, + unsigned int off, + unsigned long *config) +{ + enum pin_config_param param = pinconf_to_config_param(*config); + u8 port = cypress_get_port(chip, off); + u8 bit = cypress_get_pin_mask(chip, off); + unsigned int reg; + u32 reg_val; + u16 arg = 0; + int ret; + + mutex_lock(&chip->i2c_lock); + + /* select port */ + ret = regmap_write(chip->regmap, CY8C95X0_PORTSEL, port); + if (ret < 0) + goto out; + + switch (param) { + case PIN_CONFIG_BIAS_PULL_UP: + reg = CY8C95X0_DRV_PU; + break; + case PIN_CONFIG_BIAS_PULL_DOWN: + reg = CY8C95X0_DRV_PD; + break; + case PIN_CONFIG_BIAS_DISABLE: + reg = CY8C95X0_DRV_HIZ; + break; + case PIN_CONFIG_DRIVE_OPEN_DRAIN: + reg = CY8C95X0_DRV_ODL; + break; + case PIN_CONFIG_DRIVE_OPEN_SOURCE: + reg = CY8C95X0_DRV_ODH; + break; + case PIN_CONFIG_DRIVE_PUSH_PULL: + reg = CY8C95X0_DRV_PP_FAST; + break; + case PIN_CONFIG_INPUT_ENABLE: + reg = CY8C95X0_DIRECTION; + break; + case PIN_CONFIG_MODE_PWM: + reg = CY8C95X0_PWMSEL; + break; + case PIN_CONFIG_OUTPUT: + reg = CY8C95X0_OUTPUT_(port); + break; + case PIN_CONFIG_OUTPUT_ENABLE: + reg = CY8C95X0_DIRECTION; + break; + + case PIN_CONFIG_BIAS_HIGH_IMPEDANCE: + case PIN_CONFIG_BIAS_BUS_HOLD: + case PIN_CONFIG_BIAS_PULL_PIN_DEFAULT: + case PIN_CONFIG_DRIVE_STRENGTH: + case 
PIN_CONFIG_DRIVE_STRENGTH_UA: + case PIN_CONFIG_INPUT_DEBOUNCE: + case PIN_CONFIG_INPUT_SCHMITT: + case PIN_CONFIG_INPUT_SCHMITT_ENABLE: + case PIN_CONFIG_MODE_LOW_POWER: + case PIN_CONFIG_PERSIST_STATE: + case PIN_CONFIG_POWER_SOURCE: + case PIN_CONFIG_SKEW_DELAY: + case PIN_CONFIG_SLEEP_HARDWARE_STATE: + case PIN_CONFIG_SLEW_RATE: + default: + ret = -ENOTSUPP; + goto out; + } + /* Writing 1 to one of the drive mode registers will automatically + * clear conflicting set bits in the other drive mode registers. + */ + ret = regmap_read(chip->regmap, reg, &reg_val); + if (reg_val & bit) + arg = 1; + + *config = pinconf_to_config_packed(param, (u16)arg); +out: + mutex_unlock(&chip->i2c_lock); + + return ret; +} + +static int cy8c95x0_gpio_set_pincfg(struct cy8c95x0_pinctrl *chip, + unsigned int off, + unsigned long config) +{ + u8 port = cypress_get_port(chip, off); + u8 bit = cypress_get_pin_mask(chip, off); + unsigned long param = pinconf_to_config_param(config); + unsigned int reg; + int ret; + + mutex_lock(&chip->i2c_lock); + + /* select port */ + ret = regmap_write(chip->regmap, CY8C95X0_PORTSEL, port); + if (ret < 0) + goto out; + + switch (param) { + case PIN_CONFIG_BIAS_PULL_UP: + clear_bit(off, chip->push_pull); + reg = CY8C95X0_DRV_PU; + break; + case PIN_CONFIG_BIAS_PULL_DOWN: + clear_bit(off, chip->push_pull); + reg = CY8C95X0_DRV_PD; + break; + case PIN_CONFIG_BIAS_DISABLE: + clear_bit(off, chip->push_pull); + reg = CY8C95X0_DRV_HIZ; + break; + case PIN_CONFIG_DRIVE_OPEN_DRAIN: + clear_bit(off, chip->push_pull); + reg = CY8C95X0_DRV_ODL; + break; + case PIN_CONFIG_DRIVE_OPEN_SOURCE: + clear_bit(off, chip->push_pull); + reg = CY8C95X0_DRV_ODH; + break; + case PIN_CONFIG_DRIVE_PUSH_PULL: + set_bit(off, chip->push_pull); + reg = CY8C95X0_DRV_PP_FAST; + break; + case PIN_CONFIG_MODE_PWM: + reg = CY8C95X0_PWMSEL; + break; + default: + ret = -ENOTSUPP; + goto out; + } + /* Writing 1 to one of the drive mode registers will automatically + * clear conflicting set 
bits in the other drive mode registers. + */ + ret = regmap_write_bits(chip->regmap, reg, bit, bit); + +out: + mutex_unlock(&chip->i2c_lock); + return ret; +} + +static int cy8c95x0_gpio_set_config(struct gpio_chip *gc, unsigned int offset, + unsigned long config) +{ + struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + unsigned long arg = pinconf_to_config_argument(config); + + switch (pinconf_to_config_param(config)) { + case PIN_CONFIG_INPUT_ENABLE: + return cy8c95x0_gpio_direction_input(gc, offset); + case PIN_CONFIG_OUTPUT: + return cy8c95x0_gpio_direction_output(gc, offset, arg); + case PIN_CONFIG_MODE_PWM: + case PIN_CONFIG_BIAS_PULL_UP: + case PIN_CONFIG_BIAS_PULL_DOWN: + case PIN_CONFIG_BIAS_DISABLE: + case PIN_CONFIG_DRIVE_OPEN_DRAIN: + case PIN_CONFIG_DRIVE_OPEN_SOURCE: + case PIN_CONFIG_DRIVE_PUSH_PULL: + return cy8c95x0_gpio_set_pincfg(chip, offset, config); + default: + return -ENOTSUPP; + } +} + +static int cy8c95x0_gpio_get_multiple(struct gpio_chip *gc, + unsigned long *mask, unsigned long *bits) +{ + struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + + return cy8c95x0_read_regs_mask(chip, CY8C95X0_INPUT, bits, mask); +} + +static void cy8c95x0_gpio_set_multiple(struct gpio_chip *gc, + unsigned long *mask, unsigned long *bits) +{ + struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + + cy8c95x0_write_regs_mask(chip, CY8C95X0_OUTPUT, bits, mask); +} + +static int cy8c95x0_setup_gpiochip(struct cy8c95x0_pinctrl *chip, int ngpio) +{ + struct gpio_chip *gc = &chip->gpio_chip; + + gc->direction_input = cy8c95x0_gpio_direction_input; + gc->direction_output = cy8c95x0_gpio_direction_output; + gc->get = cy8c95x0_gpio_get_value; + gc->set = cy8c95x0_gpio_set_value; + gc->get_direction = cy8c95x0_gpio_get_direction; + gc->get_multiple = cy8c95x0_gpio_get_multiple; + gc->set_multiple = cy8c95x0_gpio_set_multiple; + gc->set_config = cy8c95x0_gpio_set_config; + gc->can_sleep = true; + + gc->base = -1; + gc->ngpio = ngpio; + + gc->parent = 
chip->dev; + gc->owner = THIS_MODULE; + gc->names = NULL; + + gc->label = dev_name(chip->dev); + + return devm_gpiochip_add_data(chip->dev, gc, chip); +} + +static void cy8c95x0_irq_mask(struct irq_data *d) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + + set_bit(irqd_to_hwirq(d), chip->irq_mask); +} + +static void cy8c95x0_irq_unmask(struct irq_data *d) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + + clear_bit(irqd_to_hwirq(d), chip->irq_mask); +} + +static void cy8c95x0_irq_bus_lock(struct irq_data *d) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + + mutex_lock(&chip->irq_lock); +} + +static void cy8c95x0_irq_bus_sync_unlock(struct irq_data *d) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + DECLARE_BITMAP(ones, MAX_LINE); + DECLARE_BITMAP(irq_mask, MAX_LINE); + DECLARE_BITMAP(reg_direction, MAX_LINE); + + bitmap_fill(ones, MAX_LINE); + + cy8c95x0_write_regs_mask(chip, CY8C95X0_INTMASK, chip->irq_mask, ones); + + /* Switch direction to input if needed */ + cy8c95x0_read_regs_mask(chip, CY8C95X0_DIRECTION, reg_direction, chip->irq_mask); + bitmap_or(irq_mask, chip->irq_mask, reg_direction, MAX_LINE); + bitmap_complement(irq_mask, irq_mask, MAX_LINE); + + /* Look for any newly setup interrupt */ + cy8c95x0_write_regs_mask(chip, CY8C95X0_DIRECTION, ones, irq_mask); + + mutex_unlock(&chip->irq_lock); +} + +static int cy8c95x0_irq_set_type(struct irq_data *d, unsigned int type) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); + unsigned int trig_type; + + switch (type) { + case IRQ_TYPE_EDGE_RISING: + case IRQ_TYPE_EDGE_FALLING: + case IRQ_TYPE_EDGE_BOTH: + trig_type = type; + 
break; + case IRQ_TYPE_LEVEL_HIGH: + trig_type = IRQ_TYPE_EDGE_RISING; + break; + case IRQ_TYPE_LEVEL_LOW: + trig_type = IRQ_TYPE_EDGE_FALLING; + break; + default: + dev_err(chip->dev, "irq %d: unsupported type %d\n", d->irq, type); + return -EINVAL; + } + + assign_bit(hwirq, chip->irq_trig_fall, trig_type & IRQ_TYPE_EDGE_FALLING); + assign_bit(hwirq, chip->irq_trig_raise, trig_type & IRQ_TYPE_EDGE_RISING); + assign_bit(hwirq, chip->irq_trig_low, type == IRQ_TYPE_LEVEL_LOW); + assign_bit(hwirq, chip->irq_trig_high, type == IRQ_TYPE_LEVEL_HIGH); + + return 0; +} + +static void cy8c95x0_irq_shutdown(struct irq_data *d) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); + + clear_bit(hwirq, chip->irq_trig_raise); + clear_bit(hwirq, chip->irq_trig_fall); + clear_bit(hwirq, chip->irq_trig_low); + clear_bit(hwirq, chip->irq_trig_high); +} + +static bool cy8c95x0_irq_pending(struct cy8c95x0_pinctrl *chip, unsigned long *pending) +{ + DECLARE_BITMAP(ones, MAX_LINE); + DECLARE_BITMAP(cur_stat, MAX_LINE); + DECLARE_BITMAP(new_stat, MAX_LINE); + DECLARE_BITMAP(trigger, MAX_LINE); + + bitmap_fill(ones, MAX_LINE); + + /* Read the current interrupt status from the device */ + if (cy8c95x0_read_regs_mask(chip, CY8C95X0_INTSTATUS, trigger, ones)) + return false; + + /* Check latched inputs */ + if (cy8c95x0_read_regs_mask(chip, CY8C95X0_INPUT, cur_stat, trigger)) + return false; + + /* Apply filter for rising/falling edge selection */ + bitmap_replace(new_stat, chip->irq_trig_fall, chip->irq_trig_raise, + cur_stat, MAX_LINE); + + bitmap_and(pending, new_stat, trigger, MAX_LINE); + + return !bitmap_empty(pending, MAX_LINE); +} + +static irqreturn_t cy8c95x0_irq_handler(int irq, void *devid) +{ + struct cy8c95x0_pinctrl *chip = devid; + struct gpio_chip *gc = &chip->gpio_chip; + DECLARE_BITMAP(pending, MAX_LINE); + int nested_irq, level; + bool ret; + + ret = 
cy8c95x0_irq_pending(chip, pending); + if (!ret) + return IRQ_RETVAL(0); + + ret = 0; + for_each_set_bit(level, pending, MAX_LINE) { + /* Already accounted for 4bit gap in GPort2 */ + nested_irq = irq_find_mapping(gc->irq.domain, level); + + if (unlikely(nested_irq <= 0)) { + dev_warn_ratelimited(gc->parent, "unmapped interrupt %d\n", level); + continue; + } + + if (test_bit(level, chip->irq_trig_low)) + while (!cy8c95x0_gpio_get_value(gc, level)) + handle_nested_irq(nested_irq); + else if (test_bit(level, chip->irq_trig_high)) + while (cy8c95x0_gpio_get_value(gc, level)) + handle_nested_irq(nested_irq); + else + handle_nested_irq(nested_irq); + + ret = 1; + } + + return IRQ_RETVAL(ret); +} + +static int cy8c95x0_pinctrl_get_groups_count(struct pinctrl_dev *pctldev) +{ + struct cy8c95x0_pinctrl *chip = pinctrl_dev_get_drvdata(pctldev); + + return chip->tpin; +} + +static const char *cy8c95x0_pinctrl_get_group_name(struct pinctrl_dev *pctldev, + unsigned int group) +{ + return cy8c95x0_groups[group]; +} + +static int cy8c95x0_pinctrl_get_group_pins(struct pinctrl_dev *pctldev, + unsigned int group, + const unsigned int **pins, + unsigned int *num_pins) +{ + struct cy8c95x0_pinctrl *chip = pinctrl_dev_get_drvdata(pctldev); + + if (group >= chip->tpin) { + *pins = NULL; + *num_pins = 0; + return 0; + } + + *pins = &cy8c9560_pins[group].number; + *num_pins = 1; + return 0; +} + +static const struct pinctrl_ops cy8c95x0_pinctrl_ops = { + .get_groups_count = cy8c95x0_pinctrl_get_groups_count, + .get_group_name = cy8c95x0_pinctrl_get_group_name, + .get_group_pins = cy8c95x0_pinctrl_get_group_pins, + .dt_node_to_map = pinconf_generic_dt_node_to_map_pin, + .dt_free_map = pinconf_generic_dt_free_map, +}; + +static int cy8c95x0_get_functions_count(struct pinctrl_dev *pctldev) +{ + return 2; +} + +static const char *cy8c95x0_get_fname(struct pinctrl_dev *pctldev, unsigned int selector) +{ + if (selector == 0) + return "gpio"; + else + return "pwm"; +} + +static int 
cy8c95x0_get_groups(struct pinctrl_dev *pctldev, unsigned int selector, + const char * const **groups, + unsigned int * const num_groups) +{ + struct cy8c95x0_pinctrl *chip = pinctrl_dev_get_drvdata(pctldev); + + *groups = cy8c95x0_groups; + *num_groups = chip->tpin; + return 0; +} + +static int cy8c95x0_pinmux_cfg(struct cy8c95x0_pinctrl *chip, + unsigned int val, + unsigned long off) +{ + u8 port = cypress_get_port(chip, off); + u8 bit = cypress_get_pin_mask(chip, off); + int ret; + + /* select port */ + ret = regmap_write(chip->regmap, CY8C95X0_PORTSEL, port); + if (ret < 0) + return ret; + + ret = regmap_write_bits(chip->regmap, CY8C95X0_PWMSEL, bit, val ? bit : 0); + if (ret < 0) + return ret; + + /* Set direction to output & set output to 1 so that PWM can work */ + ret = regmap_write_bits(chip->regmap, CY8C95X0_DIRECTION, bit, bit); + if (ret < 0) + return ret; + + return regmap_write_bits(chip->regmap, CY8C95X0_OUTPUT_(port), bit, bit); +} + +static int cy8c95x0_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, + unsigned int group) +{ + struct cy8c95x0_pinctrl *chip = pinctrl_dev_get_drvdata(pctldev); + + if (group >= chip->tpin) + return -EINVAL; + + return cy8c95x0_pinmux_cfg(chip, selector, group); +} + +static const struct pinmux_ops cy8c95x0_pmxops = { + .get_functions_count = cy8c95x0_get_functions_count, + .get_function_name = cy8c95x0_get_fname, + .get_function_groups = cy8c95x0_get_groups, + .set_mux = cy8c95x0_set_mux, + .strict = true, +}; + +static int cy8c95x0_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, + unsigned long *config) +{ + struct cy8c95x0_pinctrl *chip = pinctrl_dev_get_drvdata(pctldev); + + return cy8c95x0_gpio_get_pincfg(chip, pin, config); +} + +static int cy8c95x0_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, + unsigned long *configs, unsigned int num_configs) +{ + struct cy8c95x0_pinctrl *chip = pinctrl_dev_get_drvdata(pctldev); + int ret = 0; + int i; + + if (WARN_ON(pin >= chip->tpin)) + 
return -EINVAL; + + for (i = 0; i < num_configs; i++) { + ret = cy8c95x0_gpio_set_pincfg(chip, pin, configs[i]); + if (ret) + return ret; + } + + return ret; +} + +static const struct pinconf_ops cy8c95x0_pinconf_ops = { + .pin_config_get = cy8c95x0_pinconf_get, + .pin_config_set = cy8c95x0_pinconf_set, + .is_generic = true, +}; + +static int cy8c95x0_irq_setup(struct cy8c95x0_pinctrl *chip, int irq) +{ + struct irq_chip *irq_chip = &chip->irq_chip; + struct gpio_irq_chip *girq = &chip->gpio_chip.irq; + DECLARE_BITMAP(pending_irqs, MAX_LINE); + int ret; + + mutex_init(&chip->irq_lock); + + bitmap_zero(pending_irqs, MAX_LINE); + + /* Read IRQ status register to clear all pending interrupts */ + ret = cy8c95x0_irq_pending(chip, pending_irqs); + if (ret) { + dev_err(chip->dev, "failed to clear irq status register\n"); + return ret; + } + + /* Mask all interrupts */ + bitmap_fill(chip->irq_mask, MAX_LINE); + + irq_chip->name = devm_kasprintf(chip->dev, GFP_KERNEL, "%s-irq", chip->name); + irq_chip->irq_mask = cy8c95x0_irq_mask; + irq_chip->irq_unmask = cy8c95x0_irq_unmask; + irq_chip->irq_bus_lock = cy8c95x0_irq_bus_lock; + irq_chip->irq_bus_sync_unlock = cy8c95x0_irq_bus_sync_unlock; + irq_chip->irq_set_type = cy8c95x0_irq_set_type; + irq_chip->irq_shutdown = cy8c95x0_irq_shutdown; + + girq->chip = irq_chip; + /* This will let us handle the parent IRQ in the driver */ + girq->parent_handler = NULL; + girq->num_parents = 0; + girq->parents = NULL; + girq->default_type = IRQ_TYPE_NONE; + girq->handler = handle_simple_irq; + girq->threaded = true; + girq->first = 0; + + ret = devm_request_threaded_irq(chip->dev, irq, + NULL, cy8c95x0_irq_handler, + IRQF_ONESHOT | IRQF_SHARED | IRQF_TRIGGER_HIGH, + dev_name(chip->dev), chip); + if (ret) { + dev_err(chip->dev, "failed to request irq %d\n", irq); + return ret; + } + dev_info(chip->dev, "Registered threaded IRQ\n"); + + return 0; +} + +static int cy8c95x0_setup_pinctrl(struct cy8c95x0_pinctrl *chip) +{ + struct pinctrl_desc 
*pd = &chip->pinctrl_desc; + + pd->pctlops = &cy8c95x0_pinctrl_ops; + pd->confops = &cy8c95x0_pinconf_ops; + pd->pmxops = &cy8c95x0_pmxops; + pd->npins = chip->gpio_chip.ngpio; + pd->name = devm_kasprintf(chip->dev, GFP_KERNEL, "pinctrl-%s", + chip->name); + pd->pins = cy8c9560_pins; + pd->npins = chip->tpin; + pd->owner = THIS_MODULE; + chip->pctldev = devm_pinctrl_register(chip->dev, pd, chip); + + if (IS_ERR(chip->pctldev)) + return dev_err_probe(chip->dev, PTR_ERR(chip->pctldev), + "can't register controller\n"); + return 0; +} + +static int device_cy8c95x0_init(struct cy8c95x0_pinctrl *chip) +{ + DECLARE_BITMAP(ones, MAX_LINE); + DECLARE_BITMAP(zeros, MAX_LINE); + int ret; + + /* Set all pins to input. This is the POR default. */ + bitmap_fill(ones, MAX_LINE); + ret = cy8c95x0_write_regs_mask(chip, CY8C95X0_DIRECTION, ones, ones); + if (ret) { + dev_err(chip->dev, "Failed to set pins to input\n"); + return ret; + } + + bitmap_zero(zeros, MAX_LINE); + ret = cy8c95x0_write_regs_mask(chip, CY8C95X0_INVERT, zeros, ones); + if (ret) { + dev_err(chip->dev, "Failed to set polarity inversion\n"); + return ret; + } + + return 0; +} + +static int cy8c95x0_detect(struct i2c_client *client, + struct i2c_board_info *info) +{ + struct i2c_adapter *adapter = client->adapter; + int ret; + const char *name; + + if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA)) + return -ENODEV; + + ret = i2c_smbus_read_byte_data(client, CY8C95X0_DEVID); + if (ret < 0) + return ret; + switch (ret & 0xf0) { + case 0x20: + name = cy8c95x0_id[0].name; + break; + case 0x40: + name = cy8c95x0_id[1].name; + break; + case 0x60: + name = cy8c95x0_id[2].name; + break; + default: + return -ENODEV; + } + + dev_info(&client->dev, "Found a %s chip at 0x%02x.\n", name, client->addr); + strscpy(info->type, name, I2C_NAME_SIZE); + + return -ENODEV; +} + +static int cy8c95x0_probe(struct i2c_client *client) +{ + struct cy8c95x0_pinctrl *chip; + struct regulator *reg; + int ret; + + chip = 
devm_kzalloc(&client->dev, sizeof(*chip), GFP_KERNEL); + if (!chip) + return -ENOMEM; + + chip->dev = &client->dev; + + /* Set the device type */ + if (client->dev.of_node) + chip->driver_data = (unsigned long)of_device_get_match_data(&client->dev); + else + chip->driver_data = i2c_match_id(cy8c95x0_id, client)->driver_data; + + if (!chip->driver_data) + return -ENODEV; + + i2c_set_clientdata(client, chip); + + chip->tpin = chip->driver_data & CY8C95X0_GPIO_MASK; + chip->nport = DIV_ROUND_UP(CY8C95X0_PIN_TO_OFFSET(chip->tpin), BANK_SZ); + + switch (chip->tpin) { + case 20: + strscpy(chip->name, cy8c95x0_id[0].name, I2C_NAME_SIZE); + break; + case 40: + strscpy(chip->name, cy8c95x0_id[1].name, I2C_NAME_SIZE); + break; + case 60: + strscpy(chip->name, cy8c95x0_id[2].name, I2C_NAME_SIZE); + break; + } + + reg = devm_regulator_get(&client->dev, "vdd"); + if (IS_ERR(reg)) { + if (PTR_ERR(reg) == -EPROBE_DEFER) + return -EPROBE_DEFER; + } else { + ret = regulator_enable(reg); + if (ret) { + dev_err(&client->dev, "failed to enable regulator vdd: %d\n", ret); + return ret; + } + chip->regulator = reg; + } + + chip->regmap = devm_regmap_init_i2c(client, &cy8c95x0_i2c_regmap); + if (IS_ERR(chip->regmap)) { + ret = PTR_ERR(chip->regmap); + goto err_exit; + } + + bitmap_zero(chip->push_pull, MAX_LINE); + bitmap_zero(chip->shiftmask, MAX_LINE); + bitmap_set(chip->shiftmask, 0, 20); + mutex_init(&chip->i2c_lock); + + ret = device_cy8c95x0_init(chip); + if (ret) + goto err_exit; + + if (client->irq) { + ret = cy8c95x0_irq_setup(chip, client->irq); + if (ret) + goto err_exit; + } + + ret = cy8c95x0_setup_gpiochip(chip, chip->tpin); + if (ret) + goto err_exit; + + ret = cy8c95x0_setup_pinctrl(chip); + if (ret) + goto err_exit; + + return 0; + +err_exit: + if (!IS_ERR_OR_NULL(chip->regulator)) + regulator_disable(chip->regulator); + return ret; +} + +static int cy8c95x0_remove(struct i2c_client *client) +{ + struct cy8c95x0_pinctrl *chip = i2c_get_clientdata(client); + + if 
(!IS_ERR_OR_NULL(chip->regulator)) + regulator_disable(chip->regulator); + + return 0; +} + +static struct i2c_driver cy8c95x0_driver = { + .driver = { + .name = "cy8c95x0-pinctrl", + .of_match_table = cy8c95x0_dt_ids, + }, + .probe_new = cy8c95x0_probe, + .remove = cy8c95x0_remove, + .id_table = cy8c95x0_id, + .detect = cy8c95x0_detect, +}; + +module_i2c_driver(cy8c95x0_driver); + +MODULE_AUTHOR("Patrick Rudolph "); +MODULE_AUTHOR("Naresh Solanki "); +MODULE_DESCRIPTION("Pinctrl driver for CY8C95X0"); +MODULE_LICENSE("GPL"); -- GitLab From 658aea35ab88deca19705413199933c2cef9bac8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Wed, 24 Aug 2022 13:21:24 +0200 Subject: [PATCH 0061/2223] PCI: pci-bridge-emul: Set position of PCI capabilities to real HW value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mvebu and aardvark HW have PCIe capabilities at different offsets in PCI config space. Extend pci-bridge-emul.c code to allow a driver to set a custom offset where the PCIe capabilities start. With this change PCIe capabilities of both drivers are reported at the same location as where they are reported by U-Boot - at their real HW offset. 
Link: https://lore.kernel.org/r/20220824112124.21675-1-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/pci-aardvark.c | 1 + drivers/pci/controller/pci-mvebu.c | 1 + drivers/pci/pci-bridge-emul.c | 48 +++++++++++++++++---------- drivers/pci/pci-bridge-emul.h | 2 ++ 4 files changed, 35 insertions(+), 17 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 966c8b48bd969..4834198cc86b7 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -1078,6 +1078,7 @@ static int advk_sw_pci_bridge_init(struct advk_pcie *pcie) bridge->pcie_conf.rootcap = cpu_to_le16(PCI_EXP_RTCAP_CRSVIS); bridge->has_pcie = true; + bridge->pcie_start = PCIE_CORE_PCIEXP_CAP; bridge->data = pcie; bridge->ops = &advk_pci_bridge_emul_ops; diff --git a/drivers/pci/controller/pci-mvebu.c b/drivers/pci/controller/pci-mvebu.c index af915c951f066..0fdbb5585fec7 100644 --- a/drivers/pci/controller/pci-mvebu.c +++ b/drivers/pci/controller/pci-mvebu.c @@ -946,6 +946,7 @@ static int mvebu_pci_bridge_emul_init(struct mvebu_pcie_port *port) bridge->subsystem_vendor_id = ssdev_id & 0xffff; bridge->subsystem_id = ssdev_id >> 16; bridge->has_pcie = true; + bridge->pcie_start = PCIE_CAP_PCIEXP; bridge->data = port; bridge->ops = &mvebu_pci_bridge_emul_ops; diff --git a/drivers/pci/pci-bridge-emul.c b/drivers/pci/pci-bridge-emul.c index 9c2ca28e3ecf0..9334b2dd47641 100644 --- a/drivers/pci/pci-bridge-emul.c +++ b/drivers/pci/pci-bridge-emul.c @@ -22,11 +22,7 @@ #define PCI_BRIDGE_CONF_END PCI_STD_HEADER_SIZEOF #define PCI_CAP_SSID_SIZEOF (PCI_SSVID_DEVICE_ID + 2) -#define PCI_CAP_SSID_START PCI_BRIDGE_CONF_END -#define PCI_CAP_SSID_END (PCI_CAP_SSID_START + PCI_CAP_SSID_SIZEOF) #define PCI_CAP_PCIE_SIZEOF (PCI_EXP_SLTSTA2 + 2) -#define PCI_CAP_PCIE_START PCI_CAP_SSID_END -#define PCI_CAP_PCIE_END (PCI_CAP_PCIE_START + PCI_CAP_PCIE_SIZEOF) /** * struct 
pci_bridge_reg_behavior - register bits behaviors @@ -324,7 +320,7 @@ pci_bridge_emul_read_ssid(struct pci_bridge_emul *bridge, int reg, u32 *value) switch (reg) { case PCI_CAP_LIST_ID: *value = PCI_CAP_ID_SSVID | - (bridge->has_pcie ? (PCI_CAP_PCIE_START << 8) : 0); + ((bridge->pcie_start > bridge->ssid_start) ? (bridge->pcie_start << 8) : 0); return PCI_BRIDGE_EMUL_HANDLED; case PCI_SSVID_VENDOR_ID: @@ -365,18 +361,33 @@ int pci_bridge_emul_init(struct pci_bridge_emul *bridge, if (!bridge->pci_regs_behavior) return -ENOMEM; - if (bridge->subsystem_vendor_id) - bridge->conf.capabilities_pointer = PCI_CAP_SSID_START; - else if (bridge->has_pcie) - bridge->conf.capabilities_pointer = PCI_CAP_PCIE_START; - else - bridge->conf.capabilities_pointer = 0; + /* If ssid_start and pcie_start were not specified then choose the lowest possible value. */ + if (!bridge->ssid_start && !bridge->pcie_start) { + if (bridge->subsystem_vendor_id) + bridge->ssid_start = PCI_BRIDGE_CONF_END; + if (bridge->has_pcie) + bridge->pcie_start = bridge->ssid_start + PCI_CAP_SSID_SIZEOF; + } else if (!bridge->ssid_start && bridge->subsystem_vendor_id) { + if (bridge->pcie_start - PCI_BRIDGE_CONF_END >= PCI_CAP_SSID_SIZEOF) + bridge->ssid_start = PCI_BRIDGE_CONF_END; + else + bridge->ssid_start = bridge->pcie_start + PCI_CAP_PCIE_SIZEOF; + } else if (!bridge->pcie_start && bridge->has_pcie) { + if (bridge->ssid_start - PCI_BRIDGE_CONF_END >= PCI_CAP_PCIE_SIZEOF) + bridge->pcie_start = PCI_BRIDGE_CONF_END; + else + bridge->pcie_start = bridge->ssid_start + PCI_CAP_SSID_SIZEOF; + } + + bridge->conf.capabilities_pointer = min(bridge->ssid_start, bridge->pcie_start); if (bridge->conf.capabilities_pointer) bridge->conf.status |= cpu_to_le16(PCI_STATUS_CAP_LIST); if (bridge->has_pcie) { bridge->pcie_conf.cap_id = PCI_CAP_ID_EXP; + bridge->pcie_conf.next = (bridge->ssid_start > bridge->pcie_start) ? 
+ bridge->ssid_start : 0; bridge->pcie_conf.cap |= cpu_to_le16(PCI_EXP_TYPE_ROOT_PORT << 4); bridge->pcie_cap_regs_behavior = kmemdup(pcie_cap_regs_behavior, @@ -459,15 +470,17 @@ int pci_bridge_emul_conf_read(struct pci_bridge_emul *bridge, int where, read_op = bridge->ops->read_base; cfgspace = (__le32 *) &bridge->conf; behavior = bridge->pci_regs_behavior; - } else if (reg >= PCI_CAP_SSID_START && reg < PCI_CAP_SSID_END && bridge->subsystem_vendor_id) { + } else if (reg >= bridge->ssid_start && reg < bridge->ssid_start + PCI_CAP_SSID_SIZEOF && + bridge->subsystem_vendor_id) { /* Emulated PCI Bridge Subsystem Vendor ID capability */ - reg -= PCI_CAP_SSID_START; + reg -= bridge->ssid_start; read_op = pci_bridge_emul_read_ssid; cfgspace = NULL; behavior = NULL; - } else if (reg >= PCI_CAP_PCIE_START && reg < PCI_CAP_PCIE_END && bridge->has_pcie) { + } else if (reg >= bridge->pcie_start && reg < bridge->pcie_start + PCI_CAP_PCIE_SIZEOF && + bridge->has_pcie) { /* Our emulated PCIe capability */ - reg -= PCI_CAP_PCIE_START; + reg -= bridge->pcie_start; read_op = bridge->ops->read_pcie; cfgspace = (__le32 *) &bridge->pcie_conf; behavior = bridge->pcie_cap_regs_behavior; @@ -538,9 +551,10 @@ int pci_bridge_emul_conf_write(struct pci_bridge_emul *bridge, int where, write_op = bridge->ops->write_base; cfgspace = (__le32 *) &bridge->conf; behavior = bridge->pci_regs_behavior; - } else if (reg >= PCI_CAP_PCIE_START && reg < PCI_CAP_PCIE_END && bridge->has_pcie) { + } else if (reg >= bridge->pcie_start && reg < bridge->pcie_start + PCI_CAP_PCIE_SIZEOF && + bridge->has_pcie) { /* Our emulated PCIe capability */ - reg -= PCI_CAP_PCIE_START; + reg -= bridge->pcie_start; write_op = bridge->ops->write_pcie; cfgspace = (__le32 *) &bridge->pcie_conf; behavior = bridge->pcie_cap_regs_behavior; diff --git a/drivers/pci/pci-bridge-emul.h b/drivers/pci/pci-bridge-emul.h index 71392b67471da..2a0e59c7f0d90 100644 --- a/drivers/pci/pci-bridge-emul.h +++ b/drivers/pci/pci-bridge-emul.h @@ 
-131,6 +131,8 @@ struct pci_bridge_emul { struct pci_bridge_reg_behavior *pci_regs_behavior; struct pci_bridge_reg_behavior *pcie_cap_regs_behavior; void *data; + u8 pcie_start; + u8 ssid_start; bool has_pcie; u16 subsystem_vendor_id; u16 subsystem_id; -- GitLab From 0e3db16300fbae5e47ce6c298bf63a7862e5d576 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 20 Jun 2022 19:50:53 +0300 Subject: [PATCH 0062/2223] pinctrl: bcm: Convert drivers to use struct pingroup and PINCTRL_PINGROUP() The pin control header provides struct pingroup and PINCTRL_PINGROUP() macro. Utilize them instead of open coded variants in the driver. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220620165053.74170-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/bcm/pinctrl-bcm6318.c | 121 ++++++++++----------- drivers/pinctrl/bcm/pinctrl-bcm63268.c | 139 +++++++++++-------------- drivers/pinctrl/bcm/pinctrl-bcm6328.c | 85 +++++++-------- drivers/pinctrl/bcm/pinctrl-bcm6358.c | 20 ++-- drivers/pinctrl/bcm/pinctrl-bcm6362.c | 121 ++++++++++----------- drivers/pinctrl/bcm/pinctrl-bcm6368.c | 91 +++++++--------- drivers/pinctrl/bcm/pinctrl-bcm63xx.h | 2 + 7 files changed, 259 insertions(+), 320 deletions(-) diff --git a/drivers/pinctrl/bcm/pinctrl-bcm6318.c b/drivers/pinctrl/bcm/pinctrl-bcm6318.c index 9311220fb6cba..64073546310e6 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm6318.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm6318.c @@ -27,12 +27,6 @@ #define BCM6318_PAD_REG 0x54 #define BCM6328_PAD_MASK GENMASK(3, 0) -struct bcm6318_pingroup { - const char *name; - const unsigned * const pins; - const unsigned num_pins; -}; - struct bcm6318_function { const char *name; const char * const *groups; @@ -146,64 +140,57 @@ static unsigned gpio47_pins[] = { 47 }; static unsigned gpio48_pins[] = { 48 }; static unsigned gpio49_pins[] = { 49 }; -#define BCM6318_GROUP(n) \ - { \ - .name = #n, \ - .pins = n##_pins, \ - .num_pins = ARRAY_SIZE(n##_pins), \ - 
} - -static struct bcm6318_pingroup bcm6318_groups[] = { - BCM6318_GROUP(gpio0), - BCM6318_GROUP(gpio1), - BCM6318_GROUP(gpio2), - BCM6318_GROUP(gpio3), - BCM6318_GROUP(gpio4), - BCM6318_GROUP(gpio5), - BCM6318_GROUP(gpio6), - BCM6318_GROUP(gpio7), - BCM6318_GROUP(gpio8), - BCM6318_GROUP(gpio9), - BCM6318_GROUP(gpio10), - BCM6318_GROUP(gpio11), - BCM6318_GROUP(gpio12), - BCM6318_GROUP(gpio13), - BCM6318_GROUP(gpio14), - BCM6318_GROUP(gpio15), - BCM6318_GROUP(gpio16), - BCM6318_GROUP(gpio17), - BCM6318_GROUP(gpio18), - BCM6318_GROUP(gpio19), - BCM6318_GROUP(gpio20), - BCM6318_GROUP(gpio21), - BCM6318_GROUP(gpio22), - BCM6318_GROUP(gpio23), - BCM6318_GROUP(gpio24), - BCM6318_GROUP(gpio25), - BCM6318_GROUP(gpio26), - BCM6318_GROUP(gpio27), - BCM6318_GROUP(gpio28), - BCM6318_GROUP(gpio29), - BCM6318_GROUP(gpio30), - BCM6318_GROUP(gpio31), - BCM6318_GROUP(gpio32), - BCM6318_GROUP(gpio33), - BCM6318_GROUP(gpio34), - BCM6318_GROUP(gpio35), - BCM6318_GROUP(gpio36), - BCM6318_GROUP(gpio37), - BCM6318_GROUP(gpio38), - BCM6318_GROUP(gpio39), - BCM6318_GROUP(gpio40), - BCM6318_GROUP(gpio41), - BCM6318_GROUP(gpio42), - BCM6318_GROUP(gpio43), - BCM6318_GROUP(gpio44), - BCM6318_GROUP(gpio45), - BCM6318_GROUP(gpio46), - BCM6318_GROUP(gpio47), - BCM6318_GROUP(gpio48), - BCM6318_GROUP(gpio49), +static struct pingroup bcm6318_groups[] = { + BCM_PIN_GROUP(gpio0), + BCM_PIN_GROUP(gpio1), + BCM_PIN_GROUP(gpio2), + BCM_PIN_GROUP(gpio3), + BCM_PIN_GROUP(gpio4), + BCM_PIN_GROUP(gpio5), + BCM_PIN_GROUP(gpio6), + BCM_PIN_GROUP(gpio7), + BCM_PIN_GROUP(gpio8), + BCM_PIN_GROUP(gpio9), + BCM_PIN_GROUP(gpio10), + BCM_PIN_GROUP(gpio11), + BCM_PIN_GROUP(gpio12), + BCM_PIN_GROUP(gpio13), + BCM_PIN_GROUP(gpio14), + BCM_PIN_GROUP(gpio15), + BCM_PIN_GROUP(gpio16), + BCM_PIN_GROUP(gpio17), + BCM_PIN_GROUP(gpio18), + BCM_PIN_GROUP(gpio19), + BCM_PIN_GROUP(gpio20), + BCM_PIN_GROUP(gpio21), + BCM_PIN_GROUP(gpio22), + BCM_PIN_GROUP(gpio23), + BCM_PIN_GROUP(gpio24), + BCM_PIN_GROUP(gpio25), + 
BCM_PIN_GROUP(gpio26), + BCM_PIN_GROUP(gpio27), + BCM_PIN_GROUP(gpio28), + BCM_PIN_GROUP(gpio29), + BCM_PIN_GROUP(gpio30), + BCM_PIN_GROUP(gpio31), + BCM_PIN_GROUP(gpio32), + BCM_PIN_GROUP(gpio33), + BCM_PIN_GROUP(gpio34), + BCM_PIN_GROUP(gpio35), + BCM_PIN_GROUP(gpio36), + BCM_PIN_GROUP(gpio37), + BCM_PIN_GROUP(gpio38), + BCM_PIN_GROUP(gpio39), + BCM_PIN_GROUP(gpio40), + BCM_PIN_GROUP(gpio41), + BCM_PIN_GROUP(gpio42), + BCM_PIN_GROUP(gpio43), + BCM_PIN_GROUP(gpio44), + BCM_PIN_GROUP(gpio45), + BCM_PIN_GROUP(gpio46), + BCM_PIN_GROUP(gpio47), + BCM_PIN_GROUP(gpio48), + BCM_PIN_GROUP(gpio49), }; /* GPIO_MODE */ @@ -368,10 +355,10 @@ static const char *bcm6318_pinctrl_get_group_name(struct pinctrl_dev *pctldev, static int bcm6318_pinctrl_get_group_pins(struct pinctrl_dev *pctldev, unsigned group, const unsigned **pins, - unsigned *num_pins) + unsigned *npins) { *pins = bcm6318_groups[group].pins; - *num_pins = bcm6318_groups[group].num_pins; + *npins = bcm6318_groups[group].npins; return 0; } @@ -424,7 +411,7 @@ static int bcm6318_pinctrl_set_mux(struct pinctrl_dev *pctldev, unsigned selector, unsigned group) { struct bcm63xx_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); - const struct bcm6318_pingroup *pg = &bcm6318_groups[group]; + const struct pingroup *pg = &bcm6318_groups[group]; const struct bcm6318_function *f = &bcm6318_funcs[selector]; bcm6318_rmw_mux(pc, pg->pins[0], f->mode_val, f->mux_val); diff --git a/drivers/pinctrl/bcm/pinctrl-bcm63268.c b/drivers/pinctrl/bcm/pinctrl-bcm63268.c index 1c1060a395970..80c2fc55ffa29 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm63268.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm63268.c @@ -40,12 +40,6 @@ enum bcm63268_pinctrl_reg { BCM63268_BASEMODE, }; -struct bcm63268_pingroup { - const char *name; - const unsigned * const pins; - const unsigned num_pins; -}; - struct bcm63268_function { const char *name; const char * const *groups; @@ -185,74 +179,67 @@ static unsigned vdsl_phy1_grp_pins[] = { 12, 13 }; static unsigned 
vdsl_phy2_grp_pins[] = { 24, 25 }; static unsigned vdsl_phy3_grp_pins[] = { 26, 27 }; -#define BCM63268_GROUP(n) \ - { \ - .name = #n, \ - .pins = n##_pins, \ - .num_pins = ARRAY_SIZE(n##_pins), \ - } - -static struct bcm63268_pingroup bcm63268_groups[] = { - BCM63268_GROUP(gpio0), - BCM63268_GROUP(gpio1), - BCM63268_GROUP(gpio2), - BCM63268_GROUP(gpio3), - BCM63268_GROUP(gpio4), - BCM63268_GROUP(gpio5), - BCM63268_GROUP(gpio6), - BCM63268_GROUP(gpio7), - BCM63268_GROUP(gpio8), - BCM63268_GROUP(gpio9), - BCM63268_GROUP(gpio10), - BCM63268_GROUP(gpio11), - BCM63268_GROUP(gpio12), - BCM63268_GROUP(gpio13), - BCM63268_GROUP(gpio14), - BCM63268_GROUP(gpio15), - BCM63268_GROUP(gpio16), - BCM63268_GROUP(gpio17), - BCM63268_GROUP(gpio18), - BCM63268_GROUP(gpio19), - BCM63268_GROUP(gpio20), - BCM63268_GROUP(gpio21), - BCM63268_GROUP(gpio22), - BCM63268_GROUP(gpio23), - BCM63268_GROUP(gpio24), - BCM63268_GROUP(gpio25), - BCM63268_GROUP(gpio26), - BCM63268_GROUP(gpio27), - BCM63268_GROUP(gpio28), - BCM63268_GROUP(gpio29), - BCM63268_GROUP(gpio30), - BCM63268_GROUP(gpio31), - BCM63268_GROUP(gpio32), - BCM63268_GROUP(gpio33), - BCM63268_GROUP(gpio34), - BCM63268_GROUP(gpio35), - BCM63268_GROUP(gpio36), - BCM63268_GROUP(gpio37), - BCM63268_GROUP(gpio38), - BCM63268_GROUP(gpio39), - BCM63268_GROUP(gpio40), - BCM63268_GROUP(gpio41), - BCM63268_GROUP(gpio42), - BCM63268_GROUP(gpio43), - BCM63268_GROUP(gpio44), - BCM63268_GROUP(gpio45), - BCM63268_GROUP(gpio46), - BCM63268_GROUP(gpio47), - BCM63268_GROUP(gpio48), - BCM63268_GROUP(gpio49), - BCM63268_GROUP(gpio50), - BCM63268_GROUP(gpio51), +static struct pingroup bcm63268_groups[] = { + BCM_PIN_GROUP(gpio0), + BCM_PIN_GROUP(gpio1), + BCM_PIN_GROUP(gpio2), + BCM_PIN_GROUP(gpio3), + BCM_PIN_GROUP(gpio4), + BCM_PIN_GROUP(gpio5), + BCM_PIN_GROUP(gpio6), + BCM_PIN_GROUP(gpio7), + BCM_PIN_GROUP(gpio8), + BCM_PIN_GROUP(gpio9), + BCM_PIN_GROUP(gpio10), + BCM_PIN_GROUP(gpio11), + BCM_PIN_GROUP(gpio12), + BCM_PIN_GROUP(gpio13), + 
BCM_PIN_GROUP(gpio14), + BCM_PIN_GROUP(gpio15), + BCM_PIN_GROUP(gpio16), + BCM_PIN_GROUP(gpio17), + BCM_PIN_GROUP(gpio18), + BCM_PIN_GROUP(gpio19), + BCM_PIN_GROUP(gpio20), + BCM_PIN_GROUP(gpio21), + BCM_PIN_GROUP(gpio22), + BCM_PIN_GROUP(gpio23), + BCM_PIN_GROUP(gpio24), + BCM_PIN_GROUP(gpio25), + BCM_PIN_GROUP(gpio26), + BCM_PIN_GROUP(gpio27), + BCM_PIN_GROUP(gpio28), + BCM_PIN_GROUP(gpio29), + BCM_PIN_GROUP(gpio30), + BCM_PIN_GROUP(gpio31), + BCM_PIN_GROUP(gpio32), + BCM_PIN_GROUP(gpio33), + BCM_PIN_GROUP(gpio34), + BCM_PIN_GROUP(gpio35), + BCM_PIN_GROUP(gpio36), + BCM_PIN_GROUP(gpio37), + BCM_PIN_GROUP(gpio38), + BCM_PIN_GROUP(gpio39), + BCM_PIN_GROUP(gpio40), + BCM_PIN_GROUP(gpio41), + BCM_PIN_GROUP(gpio42), + BCM_PIN_GROUP(gpio43), + BCM_PIN_GROUP(gpio44), + BCM_PIN_GROUP(gpio45), + BCM_PIN_GROUP(gpio46), + BCM_PIN_GROUP(gpio47), + BCM_PIN_GROUP(gpio48), + BCM_PIN_GROUP(gpio49), + BCM_PIN_GROUP(gpio50), + BCM_PIN_GROUP(gpio51), /* multi pin groups */ - BCM63268_GROUP(nand_grp), - BCM63268_GROUP(dectpd_grp), - BCM63268_GROUP(vdsl_phy0_grp), - BCM63268_GROUP(vdsl_phy1_grp), - BCM63268_GROUP(vdsl_phy2_grp), - BCM63268_GROUP(vdsl_phy3_grp), + BCM_PIN_GROUP(nand_grp), + BCM_PIN_GROUP(dectpd_grp), + BCM_PIN_GROUP(vdsl_phy0_grp), + BCM_PIN_GROUP(vdsl_phy1_grp), + BCM_PIN_GROUP(vdsl_phy2_grp), + BCM_PIN_GROUP(vdsl_phy3_grp), }; static const char * const led_groups[] = { @@ -487,10 +474,10 @@ static const char *bcm63268_pinctrl_get_group_name(struct pinctrl_dev *pctldev, static int bcm63268_pinctrl_get_group_pins(struct pinctrl_dev *pctldev, unsigned group, const unsigned **pins, - unsigned *num_pins) + unsigned *npins) { *pins = bcm63268_groups[group].pins; - *num_pins = bcm63268_groups[group].num_pins; + *npins = bcm63268_groups[group].npins; return 0; } @@ -545,13 +532,13 @@ static int bcm63268_pinctrl_set_mux(struct pinctrl_dev *pctldev, unsigned selector, unsigned group) { struct bcm63xx_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); - const struct 
bcm63268_pingroup *pg = &bcm63268_groups[group]; + const struct pingroup *pg = &bcm63268_groups[group]; const struct bcm63268_function *f = &bcm63268_funcs[selector]; unsigned i; unsigned int reg; unsigned int val, mask; - for (i = 0; i < pg->num_pins; i++) + for (i = 0; i < pg->npins; i++) bcm63268_set_gpio(pc, pg->pins[i]); switch (f->reg) { diff --git a/drivers/pinctrl/bcm/pinctrl-bcm6328.c b/drivers/pinctrl/bcm/pinctrl-bcm6328.c index ffa8864abab6d..1eef5ab9a5e52 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm6328.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm6328.c @@ -125,49 +125,42 @@ static unsigned gpio31_pins[] = { 31 }; static unsigned hsspi_cs1_pins[] = { 36 }; static unsigned usb_port1_pins[] = { 38 }; -#define BCM6328_GROUP(n) \ - { \ - .name = #n, \ - .pins = n##_pins, \ - .num_pins = ARRAY_SIZE(n##_pins), \ - } - -static struct bcm6328_pingroup bcm6328_groups[] = { - BCM6328_GROUP(gpio0), - BCM6328_GROUP(gpio1), - BCM6328_GROUP(gpio2), - BCM6328_GROUP(gpio3), - BCM6328_GROUP(gpio4), - BCM6328_GROUP(gpio5), - BCM6328_GROUP(gpio6), - BCM6328_GROUP(gpio7), - BCM6328_GROUP(gpio8), - BCM6328_GROUP(gpio9), - BCM6328_GROUP(gpio10), - BCM6328_GROUP(gpio11), - BCM6328_GROUP(gpio12), - BCM6328_GROUP(gpio13), - BCM6328_GROUP(gpio14), - BCM6328_GROUP(gpio15), - BCM6328_GROUP(gpio16), - BCM6328_GROUP(gpio17), - BCM6328_GROUP(gpio18), - BCM6328_GROUP(gpio19), - BCM6328_GROUP(gpio20), - BCM6328_GROUP(gpio21), - BCM6328_GROUP(gpio22), - BCM6328_GROUP(gpio23), - BCM6328_GROUP(gpio24), - BCM6328_GROUP(gpio25), - BCM6328_GROUP(gpio26), - BCM6328_GROUP(gpio27), - BCM6328_GROUP(gpio28), - BCM6328_GROUP(gpio29), - BCM6328_GROUP(gpio30), - BCM6328_GROUP(gpio31), - - BCM6328_GROUP(hsspi_cs1), - BCM6328_GROUP(usb_port1), +static struct pingroup bcm6328_groups[] = { + BCM_PIN_GROUP(gpio0), + BCM_PIN_GROUP(gpio1), + BCM_PIN_GROUP(gpio2), + BCM_PIN_GROUP(gpio3), + BCM_PIN_GROUP(gpio4), + BCM_PIN_GROUP(gpio5), + BCM_PIN_GROUP(gpio6), + BCM_PIN_GROUP(gpio7), + BCM_PIN_GROUP(gpio8), + 
BCM_PIN_GROUP(gpio9), + BCM_PIN_GROUP(gpio10), + BCM_PIN_GROUP(gpio11), + BCM_PIN_GROUP(gpio12), + BCM_PIN_GROUP(gpio13), + BCM_PIN_GROUP(gpio14), + BCM_PIN_GROUP(gpio15), + BCM_PIN_GROUP(gpio16), + BCM_PIN_GROUP(gpio17), + BCM_PIN_GROUP(gpio18), + BCM_PIN_GROUP(gpio19), + BCM_PIN_GROUP(gpio20), + BCM_PIN_GROUP(gpio21), + BCM_PIN_GROUP(gpio22), + BCM_PIN_GROUP(gpio23), + BCM_PIN_GROUP(gpio24), + BCM_PIN_GROUP(gpio25), + BCM_PIN_GROUP(gpio26), + BCM_PIN_GROUP(gpio27), + BCM_PIN_GROUP(gpio28), + BCM_PIN_GROUP(gpio29), + BCM_PIN_GROUP(gpio30), + BCM_PIN_GROUP(gpio31), + + BCM_PIN_GROUP(hsspi_cs1), + BCM_PIN_GROUP(usb_port1), }; /* GPIO_MODE */ @@ -292,10 +285,10 @@ static const char *bcm6328_pinctrl_get_group_name(struct pinctrl_dev *pctldev, static int bcm6328_pinctrl_get_group_pins(struct pinctrl_dev *pctldev, unsigned group, const unsigned **pins, - unsigned *num_pins) + unsigned *npins) { *pins = bcm6328_groups[group].pins; - *num_pins = bcm6328_groups[group].num_pins; + *npins = bcm6328_groups[group].npins; return 0; } @@ -338,7 +331,7 @@ static int bcm6328_pinctrl_set_mux(struct pinctrl_dev *pctldev, unsigned selector, unsigned group) { struct bcm63xx_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); - const struct bcm6328_pingroup *pg = &bcm6328_groups[group]; + const struct pingroup *pg = &bcm6328_groups[group]; const struct bcm6328_function *f = &bcm6328_funcs[selector]; bcm6328_rmw_mux(pc, pg->pins[0], f->mode_val, f->mux_val); diff --git a/drivers/pinctrl/bcm/pinctrl-bcm6358.c b/drivers/pinctrl/bcm/pinctrl-bcm6358.c index 9f6cd7447887f..891de49d76e74 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm6358.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm6358.c @@ -35,9 +35,7 @@ #define BCM6358_MODE_MUX_SYS_IRQ BIT(15) struct bcm6358_pingroup { - const char *name; - const unsigned * const pins; - const unsigned num_pins; + struct pingroup grp; const uint16_t mode_val; @@ -131,9 +129,7 @@ static unsigned sys_irq_grp_pins[] = { 5 }; #define BCM6358_GPIO_MUX_GROUP(n, bit, dir) \ 
{ \ - .name = #n, \ - .pins = n##_pins, \ - .num_pins = ARRAY_SIZE(n##_pins), \ + .grp = BCM_PIN_GROUP(n), \ .mode_val = BCM6358_MODE_MUX_##bit, \ .direction = dir, \ } @@ -219,15 +215,15 @@ static int bcm6358_pinctrl_get_group_count(struct pinctrl_dev *pctldev) static const char *bcm6358_pinctrl_get_group_name(struct pinctrl_dev *pctldev, unsigned group) { - return bcm6358_groups[group].name; + return bcm6358_groups[group].grp.name; } static int bcm6358_pinctrl_get_group_pins(struct pinctrl_dev *pctldev, unsigned group, const unsigned **pins, - unsigned *num_pins) + unsigned *npins) { - *pins = bcm6358_groups[group].pins; - *num_pins = bcm6358_groups[group].num_pins; + *pins = bcm6358_groups[group].grp.pins; + *npins = bcm6358_groups[group].grp.npins; return 0; } @@ -264,12 +260,12 @@ static int bcm6358_pinctrl_set_mux(struct pinctrl_dev *pctldev, unsigned int mask = val; unsigned pin; - for (pin = 0; pin < pg->num_pins; pin++) + for (pin = 0; pin < pg->grp.npins; pin++) mask |= (unsigned long)bcm6358_pins[pin].drv_data; regmap_field_update_bits(priv->overlays, mask, val); - for (pin = 0; pin < pg->num_pins; pin++) { + for (pin = 0; pin < pg->grp.npins; pin++) { struct pinctrl_gpio_range *range; unsigned int hw_gpio = bcm6358_pins[pin].number; diff --git a/drivers/pinctrl/bcm/pinctrl-bcm6362.c b/drivers/pinctrl/bcm/pinctrl-bcm6362.c index 13c7230949b2b..d9ba1b6c2aebb 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm6362.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm6362.c @@ -35,12 +35,6 @@ enum bcm6362_pinctrl_reg { BCM6362_BASEMODE, }; -struct bcm6362_pingroup { - const char *name; - const unsigned * const pins; - const unsigned num_pins; -}; - struct bcm6362_function { const char *name; const char * const *groups; @@ -162,63 +156,56 @@ static unsigned nand_grp_pins[] = { 18, 19, 20, 21, 22, 23, 27, }; -#define BCM6362_GROUP(n) \ - { \ - .name = #n, \ - .pins = n##_pins, \ - .num_pins = ARRAY_SIZE(n##_pins), \ - } - -static struct bcm6362_pingroup bcm6362_groups[] = { - 
BCM6362_GROUP(gpio0), - BCM6362_GROUP(gpio1), - BCM6362_GROUP(gpio2), - BCM6362_GROUP(gpio3), - BCM6362_GROUP(gpio4), - BCM6362_GROUP(gpio5), - BCM6362_GROUP(gpio6), - BCM6362_GROUP(gpio7), - BCM6362_GROUP(gpio8), - BCM6362_GROUP(gpio9), - BCM6362_GROUP(gpio10), - BCM6362_GROUP(gpio11), - BCM6362_GROUP(gpio12), - BCM6362_GROUP(gpio13), - BCM6362_GROUP(gpio14), - BCM6362_GROUP(gpio15), - BCM6362_GROUP(gpio16), - BCM6362_GROUP(gpio17), - BCM6362_GROUP(gpio18), - BCM6362_GROUP(gpio19), - BCM6362_GROUP(gpio20), - BCM6362_GROUP(gpio21), - BCM6362_GROUP(gpio22), - BCM6362_GROUP(gpio23), - BCM6362_GROUP(gpio24), - BCM6362_GROUP(gpio25), - BCM6362_GROUP(gpio26), - BCM6362_GROUP(gpio27), - BCM6362_GROUP(gpio28), - BCM6362_GROUP(gpio29), - BCM6362_GROUP(gpio30), - BCM6362_GROUP(gpio31), - BCM6362_GROUP(gpio32), - BCM6362_GROUP(gpio33), - BCM6362_GROUP(gpio34), - BCM6362_GROUP(gpio35), - BCM6362_GROUP(gpio36), - BCM6362_GROUP(gpio37), - BCM6362_GROUP(gpio38), - BCM6362_GROUP(gpio39), - BCM6362_GROUP(gpio40), - BCM6362_GROUP(gpio41), - BCM6362_GROUP(gpio42), - BCM6362_GROUP(gpio43), - BCM6362_GROUP(gpio44), - BCM6362_GROUP(gpio45), - BCM6362_GROUP(gpio46), - BCM6362_GROUP(gpio47), - BCM6362_GROUP(nand_grp), +static struct pingroup bcm6362_groups[] = { + BCM_PIN_GROUP(gpio0), + BCM_PIN_GROUP(gpio1), + BCM_PIN_GROUP(gpio2), + BCM_PIN_GROUP(gpio3), + BCM_PIN_GROUP(gpio4), + BCM_PIN_GROUP(gpio5), + BCM_PIN_GROUP(gpio6), + BCM_PIN_GROUP(gpio7), + BCM_PIN_GROUP(gpio8), + BCM_PIN_GROUP(gpio9), + BCM_PIN_GROUP(gpio10), + BCM_PIN_GROUP(gpio11), + BCM_PIN_GROUP(gpio12), + BCM_PIN_GROUP(gpio13), + BCM_PIN_GROUP(gpio14), + BCM_PIN_GROUP(gpio15), + BCM_PIN_GROUP(gpio16), + BCM_PIN_GROUP(gpio17), + BCM_PIN_GROUP(gpio18), + BCM_PIN_GROUP(gpio19), + BCM_PIN_GROUP(gpio20), + BCM_PIN_GROUP(gpio21), + BCM_PIN_GROUP(gpio22), + BCM_PIN_GROUP(gpio23), + BCM_PIN_GROUP(gpio24), + BCM_PIN_GROUP(gpio25), + BCM_PIN_GROUP(gpio26), + BCM_PIN_GROUP(gpio27), + BCM_PIN_GROUP(gpio28), + BCM_PIN_GROUP(gpio29), 
+ BCM_PIN_GROUP(gpio30), + BCM_PIN_GROUP(gpio31), + BCM_PIN_GROUP(gpio32), + BCM_PIN_GROUP(gpio33), + BCM_PIN_GROUP(gpio34), + BCM_PIN_GROUP(gpio35), + BCM_PIN_GROUP(gpio36), + BCM_PIN_GROUP(gpio37), + BCM_PIN_GROUP(gpio38), + BCM_PIN_GROUP(gpio39), + BCM_PIN_GROUP(gpio40), + BCM_PIN_GROUP(gpio41), + BCM_PIN_GROUP(gpio42), + BCM_PIN_GROUP(gpio43), + BCM_PIN_GROUP(gpio44), + BCM_PIN_GROUP(gpio45), + BCM_PIN_GROUP(gpio46), + BCM_PIN_GROUP(gpio47), + BCM_PIN_GROUP(nand_grp), }; static const char * const led_groups[] = { @@ -463,10 +450,10 @@ static const char *bcm6362_pinctrl_get_group_name(struct pinctrl_dev *pctldev, static int bcm6362_pinctrl_get_group_pins(struct pinctrl_dev *pctldev, unsigned group, const unsigned **pins, - unsigned *num_pins) + unsigned *npins) { *pins = bcm6362_groups[group].pins; - *num_pins = bcm6362_groups[group].num_pins; + *npins = bcm6362_groups[group].npins; return 0; } @@ -519,13 +506,13 @@ static int bcm6362_pinctrl_set_mux(struct pinctrl_dev *pctldev, unsigned selector, unsigned group) { struct bcm63xx_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); - const struct bcm6362_pingroup *pg = &bcm6362_groups[group]; + const struct pingroup *pg = &bcm6362_groups[group]; const struct bcm6362_function *f = &bcm6362_funcs[selector]; unsigned i; unsigned int reg; unsigned int val, mask; - for (i = 0; i < pg->num_pins; i++) + for (i = 0; i < pg->npins; i++) bcm6362_set_gpio(pc, pg->pins[i]); switch (f->reg) { diff --git a/drivers/pinctrl/bcm/pinctrl-bcm6368.c b/drivers/pinctrl/bcm/pinctrl-bcm6368.c index b33a74aec82ba..6208467ba6f94 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm6368.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm6368.c @@ -26,12 +26,6 @@ #define BCM6368_BASEMODE_GPIO 0x0 #define BCM6368_BASEMODE_UART1 0x1 -struct bcm6368_pingroup { - const char *name; - const unsigned * const pins; - const unsigned num_pins; -}; - struct bcm6368_function { const char *name; const char * const *groups; @@ -127,47 +121,40 @@ static unsigned gpio30_pins[] = { 
30 }; static unsigned gpio31_pins[] = { 31 }; static unsigned uart1_grp_pins[] = { 30, 31, 32, 33 }; -#define BCM6368_GROUP(n) \ - { \ - .name = #n, \ - .pins = n##_pins, \ - .num_pins = ARRAY_SIZE(n##_pins), \ - } - -static struct bcm6368_pingroup bcm6368_groups[] = { - BCM6368_GROUP(gpio0), - BCM6368_GROUP(gpio1), - BCM6368_GROUP(gpio2), - BCM6368_GROUP(gpio3), - BCM6368_GROUP(gpio4), - BCM6368_GROUP(gpio5), - BCM6368_GROUP(gpio6), - BCM6368_GROUP(gpio7), - BCM6368_GROUP(gpio8), - BCM6368_GROUP(gpio9), - BCM6368_GROUP(gpio10), - BCM6368_GROUP(gpio11), - BCM6368_GROUP(gpio12), - BCM6368_GROUP(gpio13), - BCM6368_GROUP(gpio14), - BCM6368_GROUP(gpio15), - BCM6368_GROUP(gpio16), - BCM6368_GROUP(gpio17), - BCM6368_GROUP(gpio18), - BCM6368_GROUP(gpio19), - BCM6368_GROUP(gpio20), - BCM6368_GROUP(gpio21), - BCM6368_GROUP(gpio22), - BCM6368_GROUP(gpio23), - BCM6368_GROUP(gpio24), - BCM6368_GROUP(gpio25), - BCM6368_GROUP(gpio26), - BCM6368_GROUP(gpio27), - BCM6368_GROUP(gpio28), - BCM6368_GROUP(gpio29), - BCM6368_GROUP(gpio30), - BCM6368_GROUP(gpio31), - BCM6368_GROUP(uart1_grp), +static struct pingroup bcm6368_groups[] = { + BCM_PIN_GROUP(gpio0), + BCM_PIN_GROUP(gpio1), + BCM_PIN_GROUP(gpio2), + BCM_PIN_GROUP(gpio3), + BCM_PIN_GROUP(gpio4), + BCM_PIN_GROUP(gpio5), + BCM_PIN_GROUP(gpio6), + BCM_PIN_GROUP(gpio7), + BCM_PIN_GROUP(gpio8), + BCM_PIN_GROUP(gpio9), + BCM_PIN_GROUP(gpio10), + BCM_PIN_GROUP(gpio11), + BCM_PIN_GROUP(gpio12), + BCM_PIN_GROUP(gpio13), + BCM_PIN_GROUP(gpio14), + BCM_PIN_GROUP(gpio15), + BCM_PIN_GROUP(gpio16), + BCM_PIN_GROUP(gpio17), + BCM_PIN_GROUP(gpio18), + BCM_PIN_GROUP(gpio19), + BCM_PIN_GROUP(gpio20), + BCM_PIN_GROUP(gpio21), + BCM_PIN_GROUP(gpio22), + BCM_PIN_GROUP(gpio23), + BCM_PIN_GROUP(gpio24), + BCM_PIN_GROUP(gpio25), + BCM_PIN_GROUP(gpio26), + BCM_PIN_GROUP(gpio27), + BCM_PIN_GROUP(gpio28), + BCM_PIN_GROUP(gpio29), + BCM_PIN_GROUP(gpio30), + BCM_PIN_GROUP(gpio31), + BCM_PIN_GROUP(uart1_grp), }; static const char * const 
analog_afe_0_groups[] = { @@ -358,10 +345,10 @@ static const char *bcm6368_pinctrl_get_group_name(struct pinctrl_dev *pctldev, static int bcm6368_pinctrl_get_group_pins(struct pinctrl_dev *pctldev, unsigned group, const unsigned **pins, - unsigned *num_pins) + unsigned *npins) { *pins = bcm6368_groups[group].pins; - *num_pins = bcm6368_groups[group].num_pins; + *npins = bcm6368_groups[group].npins; return 0; } @@ -393,14 +380,14 @@ static int bcm6368_pinctrl_set_mux(struct pinctrl_dev *pctldev, { struct bcm63xx_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); struct bcm6368_priv *priv = pc->driver_data; - const struct bcm6368_pingroup *pg = &bcm6368_groups[group]; + const struct pingroup *pg = &bcm6368_groups[group]; const struct bcm6368_function *fun = &bcm6368_funcs[selector]; int i, pin; if (fun->basemode) { unsigned int mask = 0; - for (i = 0; i < pg->num_pins; i++) { + for (i = 0; i < pg->npins; i++) { pin = pg->pins[i]; if (pin < BCM63XX_BANK_GPIOS) mask |= BIT(pin); @@ -419,7 +406,7 @@ static int bcm6368_pinctrl_set_mux(struct pinctrl_dev *pctldev, BIT(pin)); } - for (pin = 0; pin < pg->num_pins; pin++) { + for (pin = 0; pin < pg->npins; pin++) { struct pinctrl_gpio_range *range; int hw_gpio = bcm6368_pins[pin].number; diff --git a/drivers/pinctrl/bcm/pinctrl-bcm63xx.h b/drivers/pinctrl/bcm/pinctrl-bcm63xx.h index d58c8cd5b6b8e..95243027ecd9e 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm63xx.h +++ b/drivers/pinctrl/bcm/pinctrl-bcm63xx.h @@ -21,6 +21,8 @@ struct bcm63xx_pinctrl_soc { unsigned int ngpios; }; +#define BCM_PIN_GROUP(n) PINCTRL_PINGROUP(#n, n##_pins, ARRAY_SIZE(n##_pins)) + struct bcm63xx_pinctrl { struct device *dev; struct regmap *regs; -- GitLab From 39b707fa7aba7cbfd7d53be50b6098e620f7a6d4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 21 Jun 2022 14:29:04 +0300 Subject: [PATCH 0063/2223] pinctrl: nomadik: Convert drivers to use struct pingroup and PINCTRL_PINGROUP() The pin control header provides struct pingroup and 
PINCTRL_PINGROUP() macro. Utilize them instead of open coded variants in the driver. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220621112904.65674-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- .../pinctrl/nomadik/pinctrl-nomadik-db8500.c | 295 +++++++++--------- .../pinctrl/nomadik/pinctrl-nomadik-stn8815.c | 29 +- drivers/pinctrl/nomadik/pinctrl-nomadik.c | 26 +- drivers/pinctrl/nomadik/pinctrl-nomadik.h | 16 +- 4 files changed, 180 insertions(+), 186 deletions(-) diff --git a/drivers/pinctrl/nomadik/pinctrl-nomadik-db8500.c b/drivers/pinctrl/nomadik/pinctrl-nomadik-db8500.c index ac3d4d91266d7..758d21f0a8503 100644 --- a/drivers/pinctrl/nomadik/pinctrl-nomadik-db8500.c +++ b/drivers/pinctrl/nomadik/pinctrl-nomadik-db8500.c @@ -674,163 +674,160 @@ static const unsigned hwobs_oc4_1_pins[] = { DB8500_PIN_D17, DB8500_PIN_D16, DB8500_PIN_D21, DB8500_PIN_D20, DB8500_PIN_C20, DB8500_PIN_B21, DB8500_PIN_C21, DB8500_PIN_A22, DB8500_PIN_B24, DB8500_PIN_C22 }; -#define DB8500_PIN_GROUP(a, b) { .name = #a, .pins = a##_pins, \ - .npins = ARRAY_SIZE(a##_pins), .altsetting = b } - static const struct nmk_pingroup nmk_db8500_groups[] = { /* Altfunction A column */ - DB8500_PIN_GROUP(u0_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(u1rxtx_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(u1ctsrts_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(ipi2c_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(ipi2c_a_2, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(msp0txrx_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(msp0tfstck_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(msp0rfsrck_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(mc0_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(mc0_a_2, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(mc0_dat47_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(mc0dat31dir_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(msp1txrx_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(msp1_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(lcdb_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(lcdvsi0_a_1, NMK_GPIO_ALT_A), - 
DB8500_PIN_GROUP(lcdvsi1_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(lcd_d0_d7_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(lcd_d8_d11_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(lcd_d12_d15_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(lcd_d12_d23_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(kp_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(kpskaskb_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(mc2_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(mc2_a_2, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(ssp1_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(ssp0_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(i2c0_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(ipgpio0_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(ipgpio1_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(modem_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(kp_a_2, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(msp2sck_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(msp2_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(mc4_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(mc1_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(mc1_a_2, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(mc1dir_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(hsir_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(hsit_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(hsit_a_2, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(clkout1_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(clkout1_a_2, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(clkout2_a_1, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(clkout2_a_2, NMK_GPIO_ALT_A), - DB8500_PIN_GROUP(usb_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(u0_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(u1rxtx_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(u1ctsrts_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(ipi2c_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(ipi2c_a_2, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(msp0txrx_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(msp0tfstck_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(msp0rfsrck_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(mc0_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(mc0_a_2, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(mc0_dat47_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(mc0dat31dir_a_1, NMK_GPIO_ALT_A), + 
NMK_PIN_GROUP(msp1txrx_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(msp1_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(lcdb_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(lcdvsi0_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(lcdvsi1_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(lcd_d0_d7_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(lcd_d8_d11_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(lcd_d12_d15_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(lcd_d12_d23_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(kp_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(kpskaskb_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(mc2_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(mc2_a_2, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(ssp1_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(ssp0_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(i2c0_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(ipgpio0_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(ipgpio1_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(modem_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(kp_a_2, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(msp2sck_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(msp2_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(mc4_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(mc1_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(mc1_a_2, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(mc1dir_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(hsir_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(hsit_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(hsit_a_2, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(clkout1_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(clkout1_a_2, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(clkout2_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(clkout2_a_2, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(usb_a_1, NMK_GPIO_ALT_A), /* Altfunction B column */ - DB8500_PIN_GROUP(trig_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(i2c4_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(i2c1_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(i2c2_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(i2c2_b_2, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(msp0txrx_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(i2c1_b_2, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(u2rxtx_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(uartmodtx_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(msp0sck_b_1, 
NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(uartmodrx_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(stmmod_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(uartmodrx_b_2, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(spi3_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(msp1txrx_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(kp_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(kp_b_2, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(sm_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(smcs0_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(smcs1_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(ipgpio7_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(ipgpio2_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(ipgpio3_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(lcdaclk_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(lcda_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(lcd_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(lcd_d16_d23_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(ddrtrig_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(pwl_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(spi1_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(mc3_b_1, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(pwl_b_2, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(pwl_b_3, NMK_GPIO_ALT_B), - DB8500_PIN_GROUP(pwl_b_4, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(trig_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(i2c4_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(i2c1_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(i2c2_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(i2c2_b_2, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(msp0txrx_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(i2c1_b_2, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(u2rxtx_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(uartmodtx_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(msp0sck_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(uartmodrx_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(stmmod_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(uartmodrx_b_2, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(spi3_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(msp1txrx_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(kp_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(kp_b_2, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(sm_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(smcs0_b_1, NMK_GPIO_ALT_B), + 
NMK_PIN_GROUP(smcs1_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(ipgpio7_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(ipgpio2_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(ipgpio3_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(lcdaclk_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(lcda_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(lcd_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(lcd_d16_d23_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(ddrtrig_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(pwl_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(spi1_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(mc3_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(pwl_b_2, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(pwl_b_3, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(pwl_b_4, NMK_GPIO_ALT_B), /* Altfunction C column */ - DB8500_PIN_GROUP(ipjtag_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(ipgpio6_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(ipgpio0_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(ipgpio1_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(ipgpio3_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(ipgpio2_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(slim0_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(ms_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(iptrigout_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(u2rxtx_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(u2ctsrts_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(u0_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(ipgpio4_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(ipgpio5_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(ipgpio6_c_2, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(ipgpio7_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(smcleale_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(stmape_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(u2rxtx_c_2, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(ipgpio2_c_2, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(ipgpio3_c_2, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(ipgpio4_c_2, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(ipgpio5_c_2, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(mc5_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(mc2rstn_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(kp_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(smps0_c_1, NMK_GPIO_ALT_C), - 
DB8500_PIN_GROUP(smps1_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(u2rxtx_c_3, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(stmape_c_2, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(uartmodrx_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(uartmodtx_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(stmmod_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(usbsim_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(mc4rstn_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(clkout1_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(clkout2_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(i2c3_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(spi0_c_1, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(usbsim_c_2, NMK_GPIO_ALT_C), - DB8500_PIN_GROUP(i2c3_c_2, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(ipjtag_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(ipgpio6_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(ipgpio0_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(ipgpio1_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(ipgpio3_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(ipgpio2_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(slim0_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(ms_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(iptrigout_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(u2rxtx_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(u2ctsrts_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(u0_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(ipgpio4_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(ipgpio5_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(ipgpio6_c_2, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(ipgpio7_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(smcleale_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(stmape_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(u2rxtx_c_2, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(ipgpio2_c_2, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(ipgpio3_c_2, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(ipgpio4_c_2, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(ipgpio5_c_2, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(mc5_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(mc2rstn_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(kp_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(smps0_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(smps1_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(u2rxtx_c_3, NMK_GPIO_ALT_C), + 
NMK_PIN_GROUP(stmape_c_2, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(uartmodrx_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(uartmodtx_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(stmmod_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(usbsim_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(mc4rstn_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(clkout1_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(clkout2_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(i2c3_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(spi0_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(usbsim_c_2, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(i2c3_c_2, NMK_GPIO_ALT_C), /* Other alt C1 column */ - DB8500_PIN_GROUP(u2rx_oc1_1, NMK_GPIO_ALT_C1), - DB8500_PIN_GROUP(stmape_oc1_1, NMK_GPIO_ALT_C1), - DB8500_PIN_GROUP(remap0_oc1_1, NMK_GPIO_ALT_C1), - DB8500_PIN_GROUP(remap1_oc1_1, NMK_GPIO_ALT_C1), - DB8500_PIN_GROUP(ptma9_oc1_1, NMK_GPIO_ALT_C1), - DB8500_PIN_GROUP(kp_oc1_1, NMK_GPIO_ALT_C1), - DB8500_PIN_GROUP(rf_oc1_1, NMK_GPIO_ALT_C1), - DB8500_PIN_GROUP(hxclk_oc1_1, NMK_GPIO_ALT_C1), - DB8500_PIN_GROUP(uartmodrx_oc1_1, NMK_GPIO_ALT_C1), - DB8500_PIN_GROUP(uartmodtx_oc1_1, NMK_GPIO_ALT_C1), - DB8500_PIN_GROUP(stmmod_oc1_1, NMK_GPIO_ALT_C1), - DB8500_PIN_GROUP(hxgpio_oc1_1, NMK_GPIO_ALT_C1), - DB8500_PIN_GROUP(rf_oc1_2, NMK_GPIO_ALT_C1), - DB8500_PIN_GROUP(spi2_oc1_1, NMK_GPIO_ALT_C1), - DB8500_PIN_GROUP(spi2_oc1_2, NMK_GPIO_ALT_C1), + NMK_PIN_GROUP(u2rx_oc1_1, NMK_GPIO_ALT_C1), + NMK_PIN_GROUP(stmape_oc1_1, NMK_GPIO_ALT_C1), + NMK_PIN_GROUP(remap0_oc1_1, NMK_GPIO_ALT_C1), + NMK_PIN_GROUP(remap1_oc1_1, NMK_GPIO_ALT_C1), + NMK_PIN_GROUP(ptma9_oc1_1, NMK_GPIO_ALT_C1), + NMK_PIN_GROUP(kp_oc1_1, NMK_GPIO_ALT_C1), + NMK_PIN_GROUP(rf_oc1_1, NMK_GPIO_ALT_C1), + NMK_PIN_GROUP(hxclk_oc1_1, NMK_GPIO_ALT_C1), + NMK_PIN_GROUP(uartmodrx_oc1_1, NMK_GPIO_ALT_C1), + NMK_PIN_GROUP(uartmodtx_oc1_1, NMK_GPIO_ALT_C1), + NMK_PIN_GROUP(stmmod_oc1_1, NMK_GPIO_ALT_C1), + NMK_PIN_GROUP(hxgpio_oc1_1, NMK_GPIO_ALT_C1), + NMK_PIN_GROUP(rf_oc1_2, NMK_GPIO_ALT_C1), + NMK_PIN_GROUP(spi2_oc1_1, NMK_GPIO_ALT_C1), + 
NMK_PIN_GROUP(spi2_oc1_2, NMK_GPIO_ALT_C1), /* Other alt C2 column */ - DB8500_PIN_GROUP(sbag_oc2_1, NMK_GPIO_ALT_C2), - DB8500_PIN_GROUP(etmr4_oc2_1, NMK_GPIO_ALT_C2), - DB8500_PIN_GROUP(ptma9_oc2_1, NMK_GPIO_ALT_C2), + NMK_PIN_GROUP(sbag_oc2_1, NMK_GPIO_ALT_C2), + NMK_PIN_GROUP(etmr4_oc2_1, NMK_GPIO_ALT_C2), + NMK_PIN_GROUP(ptma9_oc2_1, NMK_GPIO_ALT_C2), /* Other alt C3 column */ - DB8500_PIN_GROUP(stmmod_oc3_1, NMK_GPIO_ALT_C3), - DB8500_PIN_GROUP(stmmod_oc3_2, NMK_GPIO_ALT_C3), - DB8500_PIN_GROUP(uartmodrx_oc3_1, NMK_GPIO_ALT_C3), - DB8500_PIN_GROUP(uartmodtx_oc3_1, NMK_GPIO_ALT_C3), - DB8500_PIN_GROUP(etmr4_oc3_1, NMK_GPIO_ALT_C3), + NMK_PIN_GROUP(stmmod_oc3_1, NMK_GPIO_ALT_C3), + NMK_PIN_GROUP(stmmod_oc3_2, NMK_GPIO_ALT_C3), + NMK_PIN_GROUP(uartmodrx_oc3_1, NMK_GPIO_ALT_C3), + NMK_PIN_GROUP(uartmodtx_oc3_1, NMK_GPIO_ALT_C3), + NMK_PIN_GROUP(etmr4_oc3_1, NMK_GPIO_ALT_C3), /* Other alt C4 column */ - DB8500_PIN_GROUP(sbag_oc4_1, NMK_GPIO_ALT_C4), - DB8500_PIN_GROUP(hwobs_oc4_1, NMK_GPIO_ALT_C4), + NMK_PIN_GROUP(sbag_oc4_1, NMK_GPIO_ALT_C4), + NMK_PIN_GROUP(hwobs_oc4_1, NMK_GPIO_ALT_C4), }; /* We use this macro to define the groups applicable to a function */ diff --git a/drivers/pinctrl/nomadik/pinctrl-nomadik-stn8815.c b/drivers/pinctrl/nomadik/pinctrl-nomadik-stn8815.c index 8d944bb3a036c..c0d7c86d09391 100644 --- a/drivers/pinctrl/nomadik/pinctrl-nomadik-stn8815.c +++ b/drivers/pinctrl/nomadik/pinctrl-nomadik-stn8815.c @@ -303,23 +303,20 @@ static const unsigned usbhs_c_1_pins[] = { STN8815_PIN_E21, STN8815_PIN_E20, STN8815_PIN_C16, STN8815_PIN_A15, STN8815_PIN_D17, STN8815_PIN_C17 }; -#define STN8815_PIN_GROUP(a, b) { .name = #a, .pins = a##_pins, \ - .npins = ARRAY_SIZE(a##_pins), .altsetting = b } - static const struct nmk_pingroup nmk_stn8815_groups[] = { - STN8815_PIN_GROUP(u0txrx_a_1, NMK_GPIO_ALT_A), - STN8815_PIN_GROUP(u0ctsrts_a_1, NMK_GPIO_ALT_A), - STN8815_PIN_GROUP(u0modem_a_1, NMK_GPIO_ALT_A), - STN8815_PIN_GROUP(mmcsd_a_1, NMK_GPIO_ALT_A), - 
STN8815_PIN_GROUP(mmcsd_b_1, NMK_GPIO_ALT_B), - STN8815_PIN_GROUP(u1_a_1, NMK_GPIO_ALT_A), - STN8815_PIN_GROUP(i2c1_a_1, NMK_GPIO_ALT_A), - STN8815_PIN_GROUP(i2c0_a_1, NMK_GPIO_ALT_A), - STN8815_PIN_GROUP(u1_b_1, NMK_GPIO_ALT_B), - STN8815_PIN_GROUP(i2cusb_b_1, NMK_GPIO_ALT_B), - STN8815_PIN_GROUP(clcd_16_23_b_1, NMK_GPIO_ALT_B), - STN8815_PIN_GROUP(usbfs_b_1, NMK_GPIO_ALT_B), - STN8815_PIN_GROUP(usbhs_c_1, NMK_GPIO_ALT_C), + NMK_PIN_GROUP(u0txrx_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(u0ctsrts_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(u0modem_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(mmcsd_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(mmcsd_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(u1_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(i2c1_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(i2c0_a_1, NMK_GPIO_ALT_A), + NMK_PIN_GROUP(u1_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(i2cusb_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(clcd_16_23_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(usbfs_b_1, NMK_GPIO_ALT_B), + NMK_PIN_GROUP(usbhs_c_1, NMK_GPIO_ALT_C), }; /* We use this macro to define the groups applicable to a function */ diff --git a/drivers/pinctrl/nomadik/pinctrl-nomadik.c b/drivers/pinctrl/nomadik/pinctrl-nomadik.c index f5014d09d81a2..58c7ac8c7d4d1 100644 --- a/drivers/pinctrl/nomadik/pinctrl-nomadik.c +++ b/drivers/pinctrl/nomadik/pinctrl-nomadik.c @@ -1179,17 +1179,17 @@ static const char *nmk_get_group_name(struct pinctrl_dev *pctldev, { struct nmk_pinctrl *npct = pinctrl_dev_get_drvdata(pctldev); - return npct->soc->groups[selector].name; + return npct->soc->groups[selector].grp.name; } static int nmk_get_group_pins(struct pinctrl_dev *pctldev, unsigned selector, const unsigned **pins, - unsigned *num_pins) + unsigned *npins) { struct nmk_pinctrl *npct = pinctrl_dev_get_drvdata(pctldev); - *pins = npct->soc->groups[selector].pins; - *num_pins = npct->soc->groups[selector].npins; + *pins = npct->soc->groups[selector].grp.pins; + *npins = npct->soc->groups[selector].grp.npins; return 0; } @@ -1531,7 +1531,7 @@ static 
int nmk_pmx_set(struct pinctrl_dev *pctldev, unsigned function, if (g->altsetting < 0) return -EINVAL; - dev_dbg(npct->dev, "enable group %s, %u pins\n", g->name, g->npins); + dev_dbg(npct->dev, "enable group %s, %u pins\n", g->grp.name, g->grp.npins); /* * If we're setting altfunc C by setting both AFSLA and AFSLB to 1, @@ -1566,26 +1566,26 @@ static int nmk_pmx_set(struct pinctrl_dev *pctldev, unsigned function, * Then mask the pins that need to be sleeping now when we're * switching to the ALT C function. */ - for (i = 0; i < g->npins; i++) - slpm[g->pins[i] / NMK_GPIO_PER_CHIP] &= ~BIT(g->pins[i]); + for (i = 0; i < g->grp.npins; i++) + slpm[g->grp.pins[i] / NMK_GPIO_PER_CHIP] &= ~BIT(g->grp.pins[i]); nmk_gpio_glitch_slpm_init(slpm); } - for (i = 0; i < g->npins; i++) { + for (i = 0; i < g->grp.npins; i++) { struct nmk_gpio_chip *nmk_chip; unsigned bit; - nmk_chip = find_nmk_gpio_from_pin(g->pins[i]); + nmk_chip = find_nmk_gpio_from_pin(g->grp.pins[i]); if (!nmk_chip) { dev_err(npct->dev, "invalid pin offset %d in group %s at index %d\n", - g->pins[i], g->name, i); + g->grp.pins[i], g->grp.name, i); goto out_glitch; } - dev_dbg(npct->dev, "setting pin %d to altsetting %d\n", g->pins[i], g->altsetting); + dev_dbg(npct->dev, "setting pin %d to altsetting %d\n", g->grp.pins[i], g->altsetting); clk_enable(nmk_chip->clk); - bit = g->pins[i] % NMK_GPIO_PER_CHIP; + bit = g->grp.pins[i] % NMK_GPIO_PER_CHIP; /* * If the pin is switching to altfunc, and there was an * interrupt installed on it which has been lazy disabled, @@ -1608,7 +1608,7 @@ static int nmk_pmx_set(struct pinctrl_dev *pctldev, unsigned function, * then some bits in PRCM GPIOCR registers must be cleared. 
*/ if ((g->altsetting & NMK_GPIO_ALT_C) == NMK_GPIO_ALT_C) - nmk_prcm_altcx_set_mode(npct, g->pins[i], + nmk_prcm_altcx_set_mode(npct, g->grp.pins[i], g->altsetting >> NMK_GPIO_ALT_CX_SHIFT); } diff --git a/drivers/pinctrl/nomadik/pinctrl-nomadik.h b/drivers/pinctrl/nomadik/pinctrl-nomadik.h index ae0bac06639fe..820f07f4db328 100644 --- a/drivers/pinctrl/nomadik/pinctrl-nomadik.h +++ b/drivers/pinctrl/nomadik/pinctrl-nomadik.h @@ -105,21 +105,21 @@ struct nmk_function { /** * struct nmk_pingroup - describes a Nomadik pin group - * @name: the name of this specific pin group - * @pins: an array of discrete physical pins used in this group, taken - * from the driver-local pin enumeration space - * @num_pins: the number of pins in this group array, i.e. the number of - * elements in .pins so we can iterate over that array + * @grp: Generic data of the pin group (name and pins) * @altsetting: the altsetting to apply to all pins in this group to * configure them to be used by a function */ struct nmk_pingroup { - const char *name; - const unsigned int *pins; - const unsigned npins; + struct pingroup grp; int altsetting; }; +#define NMK_PIN_GROUP(a, b) \ + { \ + .grp = PINCTRL_PINGROUP(#a, a##_pins, ARRAY_SIZE(a##_pins)), \ + .altsetting = b, \ + } + /** * struct nmk_pinctrl_soc_data - Nomadik pin controller per-SoC configuration * @pins: An array describing all pins the pin controller affects. -- GitLab From 4faa4e73011d65583b25a5597c5f0e118e128ed3 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Wed, 17 Aug 2022 12:38:32 +0100 Subject: [PATCH 0064/2223] dt-bindings: pinctrl: qcom: Add sm8450 lpass lpi pinctrl bindings Add device tree binding Documentation details for Qualcomm SM8450 LPASS(Low Power Audio Sub System) LPI(Low Power Island) pinctrl driver. 
Signed-off-by: Srinivas Kandagatla Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20220817113833.9625-2-srinivas.kandagatla@linaro.org Signed-off-by: Linus Walleij --- .../qcom,sm8450-lpass-lpi-pinctrl.yaml | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 Documentation/devicetree/bindings/pinctrl/qcom,sm8450-lpass-lpi-pinctrl.yaml diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-lpass-lpi-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-lpass-lpi-pinctrl.yaml new file mode 100644 index 0000000000000..3694795ec7938 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-lpass-lpi-pinctrl.yaml @@ -0,0 +1,135 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/qcom,sm8450-lpass-lpi-pinctrl.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm Technologies, Inc. Low Power Audio SubSystem (LPASS) + Low Power Island (LPI) TLMM block + +maintainers: + - Srinivas Kandagatla + +description: | + This binding describes the Top Level Mode Multiplexer block found in the + LPASS LPI IP on most Qualcomm SoCs + +properties: + compatible: + const: qcom,sm8450-lpass-lpi-pinctrl + + reg: + items: + - description: LPASS LPI TLMM Control and Status registers + - description: LPASS LPI pins SLEW registers + + clocks: + items: + - description: LPASS Core voting clock + - description: LPASS Audio voting clock + + clock-names: + items: + - const: core + - const: audio + + gpio-controller: true + + '#gpio-cells': + description: Specifying the pin number and flags, as defined in + include/dt-bindings/gpio/gpio.h + const: 2 + + gpio-ranges: + maxItems: 1 + +#PIN CONFIGURATION NODES +patternProperties: + '-pins$': + type: object + description: + Pinctrl node's client devices use subnodes for desired pin configuration. + Client device subnodes use below standard properties. 
+ $ref: /schemas/pinctrl/pincfg-node.yaml + + properties: + pins: + description: + List of gpio pins affected by the properties specified in this + subnode. + items: + pattern: "^gpio([0-9]|1[0-9]|2[0-2])$" + + function: + enum: [ swr_tx_clk, swr_tx_data, swr_rx_clk, swr_rx_data, + dmic1_clk, dmic1_data, dmic2_clk, dmic2_data, dmic4_clk, + dmic4_data, i2s2_clk, i2s2_ws, dmic3_clk, dmic3_data, + qua_mi2s_sclk, qua_mi2s_ws, qua_mi2s_data, i2s1_clk, i2s1_ws, + i2s1_data, wsa_swr_clk, wsa_swr_data, wsa2_swr_clk, + wsa2_swr_data, i2s2_data, i2s4_ws, i2s4_clk, i2s4_data, + slimbus_clk, i2s3_clk, i2s3_ws, i2s3_data, slimbus_data, + ext_mclk1_c, ext_mclk1_b, ext_mclk1_a, ext_mclk1_d, + ext_mclk1_e ] + description: + Specify the alternative function to be configured for the specified + pins. + + drive-strength: + enum: [2, 4, 6, 8, 10, 12, 14, 16] + default: 2 + description: + Selects the drive strength for the specified pins, in mA. + + slew-rate: + enum: [0, 1, 2, 3] + default: 0 + description: | + 0: No adjustments + 1: Higher Slew rate (faster edges) + 2: Lower Slew rate (slower edges) + 3: Reserved (No adjustments) + + bias-pull-down: true + + bias-pull-up: true + + bias-disable: true + + output-high: true + + output-low: true + + required: + - pins + - function + + additionalProperties: false + +allOf: + - $ref: pinctrl.yaml# + +required: + - compatible + - reg + - clocks + - clock-names + - gpio-controller + - '#gpio-cells' + - gpio-ranges + +additionalProperties: false + +examples: + - | + #include + pinctrl@3440000 { + compatible = "qcom,sm8450-lpass-lpi-pinctrl"; + reg = <0x3440000 0x20000>, + <0x34d0000 0x10000>; + clocks = <&q6afecc LPASS_HW_MACRO_VOTE LPASS_CLK_ATTRIBUTE_COUPLE_NO>, + <&q6afecc LPASS_HW_DCODEC_VOTE LPASS_CLK_ATTRIBUTE_COUPLE_NO>; + clock-names = "core", "audio"; + gpio-controller; + #gpio-cells = <2>; + gpio-ranges = <&lpi_tlmm 0 0 23>; + }; -- GitLab From ec1652fc4d56660c33850176d06b3f1a02796946 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla
Date: Wed, 17 Aug 2022 12:38:33 +0100 Subject: [PATCH 0065/2223] pinctrl: qcom: Add sm8450 lpass lpi pinctrl driver Add pinctrl driver to support pin configuration for LPASS (Low Power Audio SubSystem) LPI (Low Power Island) pinctrl on SM8450. This IP is an additional pin control block for Audio Pins on top of the existing SoC Top level pin-controller. Hardware setup looks like: TLMM GPIO[165 - 187] --> LPASS LPI GPIO [0 - 22] This pin controller has some similarities compared to Top level msm SoC Pin controller like 'each pin belongs to a single group' and so on. However this one is intended to control only audio pins in particular, which can not be configured/touched by the Top level SoC pin controller except setting them as gpios. Apart from this, slew rate is also available in this block for certain pins which are connected to SLIMbus or SoundWire Bus. Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220817113833.9625-3-srinivas.kandagatla@linaro.org Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/Kconfig | 9 + drivers/pinctrl/qcom/Makefile | 1 + .../pinctrl/qcom/pinctrl-sm8450-lpass-lpi.c | 240 ++++++++++++++++++ 3 files changed, 250 insertions(+) create mode 100644 drivers/pinctrl/qcom/pinctrl-sm8450-lpass-lpi.c diff --git a/drivers/pinctrl/qcom/Kconfig b/drivers/pinctrl/qcom/Kconfig index f415c13caae04..35e59f940ddb4 100644 --- a/drivers/pinctrl/qcom/Kconfig +++ b/drivers/pinctrl/qcom/Kconfig @@ -390,6 +390,15 @@ config PINCTRL_SM8450 Qualcomm Technologies Inc TLMM block found on the Qualcomm Technologies Inc SM8450 platform. +config PINCTRL_SM8450_LPASS_LPI + tristate "Qualcomm Technologies Inc SM8450 LPASS LPI pin controller driver" + depends on GPIOLIB + depends on PINCTRL_LPASS_LPI + help + This is the pinctrl, pinmux, pinconf and gpiolib driver for the + Qualcomm Technologies Inc LPASS (Low Power Audio SubSystem) LPI + (Low Power Island) found on the Qualcomm Technologies Inc SM8450 platform.
+ config PINCTRL_LPASS_LPI tristate "Qualcomm Technologies Inc LPASS LPI pin controller driver" select PINMUX diff --git a/drivers/pinctrl/qcom/Makefile b/drivers/pinctrl/qcom/Makefile index fbd64853a24db..06e4cddbca68e 100644 --- a/drivers/pinctrl/qcom/Makefile +++ b/drivers/pinctrl/qcom/Makefile @@ -45,4 +45,5 @@ obj-$(CONFIG_PINCTRL_SM8250) += pinctrl-sm8250.o obj-$(CONFIG_PINCTRL_SM8250_LPASS_LPI) += pinctrl-sm8250-lpass-lpi.o obj-$(CONFIG_PINCTRL_SM8350) += pinctrl-sm8350.o obj-$(CONFIG_PINCTRL_SM8450) += pinctrl-sm8450.o +obj-$(CONFIG_PINCTRL_SM8450_LPASS_LPI) += pinctrl-sm8450-lpass-lpi.o obj-$(CONFIG_PINCTRL_LPASS_LPI) += pinctrl-lpass-lpi.o diff --git a/drivers/pinctrl/qcom/pinctrl-sm8450-lpass-lpi.c b/drivers/pinctrl/qcom/pinctrl-sm8450-lpass-lpi.c new file mode 100644 index 0000000000000..c3c8c34148f11 --- /dev/null +++ b/drivers/pinctrl/qcom/pinctrl-sm8450-lpass-lpi.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Linaro Ltd. + */ + +#include +#include +#include + +#include "pinctrl-lpass-lpi.h" + +enum lpass_lpi_functions { + LPI_MUX_dmic1_clk, + LPI_MUX_dmic1_data, + LPI_MUX_dmic2_clk, + LPI_MUX_dmic2_data, + LPI_MUX_dmic3_clk, + LPI_MUX_dmic3_data, + LPI_MUX_dmic4_clk, + LPI_MUX_dmic4_data, + LPI_MUX_i2s1_clk, + LPI_MUX_i2s1_data, + LPI_MUX_i2s1_ws, + LPI_MUX_i2s2_clk, + LPI_MUX_i2s2_data, + LPI_MUX_i2s2_ws, + LPI_MUX_i2s3_clk, + LPI_MUX_i2s3_data, + LPI_MUX_i2s3_ws, + LPI_MUX_i2s4_clk, + LPI_MUX_i2s4_data, + LPI_MUX_i2s4_ws, + LPI_MUX_qua_mi2s_data, + LPI_MUX_qua_mi2s_sclk, + LPI_MUX_qua_mi2s_ws, + LPI_MUX_swr_rx_clk, + LPI_MUX_swr_rx_data, + LPI_MUX_swr_tx_clk, + LPI_MUX_swr_tx_data, + LPI_MUX_wsa_swr_clk, + LPI_MUX_wsa_swr_data, + LPI_MUX_wsa2_swr_clk, + LPI_MUX_wsa2_swr_data, + LPI_MUX_slimbus_clk, + LPI_MUX_slimbus_data, + LPI_MUX_ext_mclk1_a, + LPI_MUX_ext_mclk1_b, + LPI_MUX_ext_mclk1_c, + LPI_MUX_ext_mclk1_d, + LPI_MUX_ext_mclk1_e, + LPI_MUX_gpio, + LPI_MUX__, +}; + +static int gpio0_pins[] = { 0 }; 
+static int gpio1_pins[] = { 1 }; +static int gpio2_pins[] = { 2 }; +static int gpio3_pins[] = { 3 }; +static int gpio4_pins[] = { 4 }; +static int gpio5_pins[] = { 5 }; +static int gpio6_pins[] = { 6 }; +static int gpio7_pins[] = { 7 }; +static int gpio8_pins[] = { 8 }; +static int gpio9_pins[] = { 9 }; +static int gpio10_pins[] = { 10 }; +static int gpio11_pins[] = { 11 }; +static int gpio12_pins[] = { 12 }; +static int gpio13_pins[] = { 13 }; +static int gpio14_pins[] = { 14 }; +static int gpio15_pins[] = { 15 }; +static int gpio16_pins[] = { 16 }; +static int gpio17_pins[] = { 17 }; +static int gpio18_pins[] = { 18 }; +static int gpio19_pins[] = { 19 }; +static int gpio20_pins[] = { 20 }; +static int gpio21_pins[] = { 21 }; +static int gpio22_pins[] = { 22 }; + +static const struct pinctrl_pin_desc sm8450_lpi_pins[] = { + PINCTRL_PIN(0, "gpio0"), + PINCTRL_PIN(1, "gpio1"), + PINCTRL_PIN(2, "gpio2"), + PINCTRL_PIN(3, "gpio3"), + PINCTRL_PIN(4, "gpio4"), + PINCTRL_PIN(5, "gpio5"), + PINCTRL_PIN(6, "gpio6"), + PINCTRL_PIN(7, "gpio7"), + PINCTRL_PIN(8, "gpio8"), + PINCTRL_PIN(9, "gpio9"), + PINCTRL_PIN(10, "gpio10"), + PINCTRL_PIN(11, "gpio11"), + PINCTRL_PIN(12, "gpio12"), + PINCTRL_PIN(13, "gpio13"), + PINCTRL_PIN(14, "gpio14"), + PINCTRL_PIN(15, "gpio15"), + PINCTRL_PIN(16, "gpio16"), + PINCTRL_PIN(17, "gpio17"), + PINCTRL_PIN(18, "gpio18"), + PINCTRL_PIN(19, "gpio19"), + PINCTRL_PIN(20, "gpio20"), + PINCTRL_PIN(21, "gpio21"), + PINCTRL_PIN(22, "gpio22"), +}; + +static const char * const swr_tx_clk_groups[] = { "gpio0" }; +static const char * const swr_tx_data_groups[] = { "gpio1", "gpio2", "gpio14" }; +static const char * const swr_rx_clk_groups[] = { "gpio3" }; +static const char * const swr_rx_data_groups[] = { "gpio4", "gpio5" }; +static const char * const dmic1_clk_groups[] = { "gpio6" }; +static const char * const dmic1_data_groups[] = { "gpio7" }; +static const char * const dmic2_clk_groups[] = { "gpio8" }; +static const char * const
dmic2_data_groups[] = { "gpio9" }; +static const char * const dmic4_clk_groups[] = { "gpio17" }; +static const char * const dmic4_data_groups[] = { "gpio18" }; +static const char * const i2s2_clk_groups[] = { "gpio10" }; +static const char * const i2s2_ws_groups[] = { "gpio11" }; +static const char * const dmic3_clk_groups[] = { "gpio12" }; +static const char * const dmic3_data_groups[] = { "gpio13" }; +static const char * const qua_mi2s_sclk_groups[] = { "gpio0" }; +static const char * const qua_mi2s_ws_groups[] = { "gpio1" }; +static const char * const qua_mi2s_data_groups[] = { "gpio2", "gpio3", "gpio4", "gpio5" }; +static const char * const i2s1_clk_groups[] = { "gpio6" }; +static const char * const i2s1_ws_groups[] = { "gpio7" }; +static const char * const i2s1_data_groups[] = { "gpio8", "gpio9" }; +static const char * const wsa_swr_clk_groups[] = { "gpio10" }; +static const char * const wsa_swr_data_groups[] = { "gpio11" }; +static const char * const wsa2_swr_clk_groups[] = { "gpio15" }; +static const char * const wsa2_swr_data_groups[] = { "gpio16" }; +static const char * const i2s2_data_groups[] = { "gpio15", "gpio16" }; +static const char * const i2s4_ws_groups[] = { "gpio13" }; +static const char * const i2s4_clk_groups[] = { "gpio12" }; +static const char * const i2s4_data_groups[] = { "gpio17", "gpio18" }; +static const char * const slimbus_clk_groups[] = { "gpio19"}; +static const char * const i2s3_clk_groups[] = { "gpio19"}; +static const char * const i2s3_ws_groups[] = { "gpio20"}; +static const char * const i2s3_data_groups[] = { "gpio21", "gpio22"}; +static const char * const slimbus_data_groups[] = { "gpio20"}; +static const char * const ext_mclk1_c_groups[] = { "gpio5" }; +static const char * const ext_mclk1_b_groups[] = { "gpio9" }; +static const char * const ext_mclk1_a_groups[] = { "gpio13" }; +static const char * const ext_mclk1_d_groups[] = { "gpio14" }; +static const char * const ext_mclk1_e_groups[] = { "gpio22" }; + +static const struct 
lpi_pingroup sm8450_groups[] = { + LPI_PINGROUP(0, 0, swr_tx_clk, qua_mi2s_sclk, _, _), + LPI_PINGROUP(1, 2, swr_tx_data, qua_mi2s_ws, _, _), + LPI_PINGROUP(2, 4, swr_tx_data, qua_mi2s_data, _, _), + LPI_PINGROUP(3, 8, swr_rx_clk, qua_mi2s_data, _, _), + LPI_PINGROUP(4, 10, swr_rx_data, qua_mi2s_data, _, _), + LPI_PINGROUP(5, 12, swr_rx_data, ext_mclk1_c, qua_mi2s_data, _), + LPI_PINGROUP(6, LPI_NO_SLEW, dmic1_clk, i2s1_clk, _, _), + LPI_PINGROUP(7, LPI_NO_SLEW, dmic1_data, i2s1_ws, _, _), + LPI_PINGROUP(8, LPI_NO_SLEW, dmic2_clk, i2s1_data, _, _), + LPI_PINGROUP(9, LPI_NO_SLEW, dmic2_data, i2s1_data, ext_mclk1_b, _), + LPI_PINGROUP(10, 16, i2s2_clk, wsa_swr_clk, _, _), + LPI_PINGROUP(11, 18, i2s2_ws, wsa_swr_data, _, _), + LPI_PINGROUP(12, LPI_NO_SLEW, dmic3_clk, i2s4_clk, _, _), + LPI_PINGROUP(13, LPI_NO_SLEW, dmic3_data, i2s4_ws, ext_mclk1_a, _), + LPI_PINGROUP(14, 6, swr_tx_data, ext_mclk1_d, _, _), + LPI_PINGROUP(15, 20, i2s2_data, wsa2_swr_clk, _, _), + LPI_PINGROUP(16, 22, i2s2_data, wsa2_swr_data, _, _), + LPI_PINGROUP(17, LPI_NO_SLEW, dmic4_clk, i2s4_data, _, _), + LPI_PINGROUP(18, LPI_NO_SLEW, dmic4_data, i2s4_data, _, _), + LPI_PINGROUP(19, LPI_NO_SLEW, i2s3_clk, slimbus_clk, _, _), + LPI_PINGROUP(20, LPI_NO_SLEW, i2s3_ws, slimbus_data, _, _), + LPI_PINGROUP(21, LPI_NO_SLEW, i2s3_data, _, _, _), + LPI_PINGROUP(22, LPI_NO_SLEW, i2s3_data, ext_mclk1_e, _, _), +}; + +static const struct lpi_function sm8450_functions[] = { + LPI_FUNCTION(dmic1_clk), + LPI_FUNCTION(dmic1_data), + LPI_FUNCTION(dmic2_clk), + LPI_FUNCTION(dmic2_data), + LPI_FUNCTION(dmic3_clk), + LPI_FUNCTION(dmic3_data), + LPI_FUNCTION(dmic4_clk), + LPI_FUNCTION(dmic4_data), + LPI_FUNCTION(i2s1_clk), + LPI_FUNCTION(i2s1_data), + LPI_FUNCTION(i2s1_ws), + LPI_FUNCTION(i2s2_clk), + LPI_FUNCTION(i2s2_data), + LPI_FUNCTION(i2s2_ws), + LPI_FUNCTION(i2s3_clk), + LPI_FUNCTION(i2s3_data), + LPI_FUNCTION(i2s3_ws), + LPI_FUNCTION(i2s4_clk), + LPI_FUNCTION(i2s4_data), + LPI_FUNCTION(i2s4_ws), + 
LPI_FUNCTION(qua_mi2s_data), + LPI_FUNCTION(qua_mi2s_sclk), + LPI_FUNCTION(qua_mi2s_ws), + LPI_FUNCTION(swr_rx_clk), + LPI_FUNCTION(swr_rx_data), + LPI_FUNCTION(swr_tx_clk), + LPI_FUNCTION(swr_tx_data), + LPI_FUNCTION(slimbus_clk), + LPI_FUNCTION(slimbus_data), + LPI_FUNCTION(wsa_swr_clk), + LPI_FUNCTION(wsa_swr_data), + LPI_FUNCTION(wsa2_swr_clk), + LPI_FUNCTION(wsa2_swr_data), + LPI_FUNCTION(ext_mclk1_a), + LPI_FUNCTION(ext_mclk1_b), + LPI_FUNCTION(ext_mclk1_c), + LPI_FUNCTION(ext_mclk1_d), + LPI_FUNCTION(ext_mclk1_e), +}; + +static const struct lpi_pinctrl_variant_data sm8450_lpi_data = { + .pins = sm8450_lpi_pins, + .npins = ARRAY_SIZE(sm8450_lpi_pins), + .groups = sm8450_groups, + .ngroups = ARRAY_SIZE(sm8450_groups), + .functions = sm8450_functions, + .nfunctions = ARRAY_SIZE(sm8450_functions), +}; + +static const struct of_device_id lpi_pinctrl_of_match[] = { + { + .compatible = "qcom,sm8450-lpass-lpi-pinctrl", + .data = &sm8450_lpi_data, + }, + { } +}; +MODULE_DEVICE_TABLE(of, lpi_pinctrl_of_match); + +static struct platform_driver lpi_pinctrl_driver = { + .driver = { + .name = "qcom-sm8450-lpass-lpi-pinctrl", + .of_match_table = lpi_pinctrl_of_match, + }, + .probe = lpi_pinctrl_probe, + .remove = lpi_pinctrl_remove, +}; + +module_platform_driver(lpi_pinctrl_driver); +MODULE_DESCRIPTION("QTI SM8450 LPI GPIO pin control driver"); +MODULE_LICENSE("GPL"); -- GitLab From 958bb025f5b3138217ffd4479b1877ba53297df9 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Wed, 17 Aug 2022 12:37:46 +0100 Subject: [PATCH 0066/2223] dt-bindings: pinctrl: qcom: Add sc8280xp lpass lpi pinctrl bindings Add device tree binding Documentation details for Qualcomm SC8280XP LPASS(Low Power Audio Sub System) LPI(Low Power Island) pinctrl driver. 
Signed-off-by: Srinivas Kandagatla Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20220817113747.9111-2-srinivas.kandagatla@linaro.org Signed-off-by: Linus Walleij --- .../qcom,sc8280xp-lpass-lpi-pinctrl.yaml | 133 ++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-lpass-lpi-pinctrl.yaml diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-lpass-lpi-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-lpass-lpi-pinctrl.yaml new file mode 100644 index 0000000000000..1f468303bb08b --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-lpass-lpi-pinctrl.yaml @@ -0,0 +1,133 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/qcom,sc8280xp-lpass-lpi-pinctrl.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm Technologies, Inc. Low Power Audio SubSystem (LPASS) + Low Power Island (LPI) TLMM block + +maintainers: + - Srinivas Kandagatla + +description: | + This binding describes the Top Level Mode Multiplexer block found in the + LPASS LPI IP on most Qualcomm SoCs + +properties: + compatible: + const: qcom,sc8280xp-lpass-lpi-pinctrl + + reg: + items: + - description: LPASS LPI TLMM Control and Status registers + - description: LPASS LPI pins SLEW registers + + clocks: + items: + - description: LPASS Core voting clock + - description: LPASS Audio voting clock + + clock-names: + items: + - const: core + - const: audio + + gpio-controller: true + + '#gpio-cells': + description: Specifying the pin number and flags, as defined in + include/dt-bindings/gpio/gpio.h + const: 2 + + gpio-ranges: + maxItems: 1 + +#PIN CONFIGURATION NODES +patternProperties: + '-pins$': + type: object + description: + Pinctrl node's client devices use subnodes for desired pin configuration. + Client device subnodes use below standard properties. 
+ $ref: /schemas/pinctrl/pincfg-node.yaml + + properties: + pins: + description: + List of gpio pins affected by the properties specified in this + subnode. + items: + pattern: "^gpio([0-9]|1[0-8])$" + + function: + enum: [ swr_tx_clk, swr_tx_data, swr_rx_clk, swr_rx_data, + dmic1_clk, dmic1_data, dmic2_clk, dmic2_data, dmic4_clk, + dmic4_data, i2s2_clk, i2s2_ws, dmic3_clk, dmic3_data, + qua_mi2s_sclk, qua_mi2s_ws, qua_mi2s_data, i2s1_clk, i2s1_ws, + i2s1_data, wsa_swr_clk, wsa_swr_data, wsa2_swr_clk, + wsa2_swr_data, i2s2_data, i2s3_clk, i2s3_ws, i2s3_data, + ext_mclk1_c, ext_mclk1_b, ext_mclk1_a ] + description: + Specify the alternative function to be configured for the specified + pins. + + drive-strength: + enum: [2, 4, 6, 8, 10, 12, 14, 16] + default: 2 + description: + Selects the drive strength for the specified pins, in mA. + + slew-rate: + enum: [0, 1, 2, 3] + default: 0 + description: | + 0: No adjustments + 1: Higher Slew rate (faster edges) + 2: Lower Slew rate (slower edges) + 3: Reserved (No adjustments) + + bias-pull-down: true + + bias-pull-up: true + + bias-disable: true + + output-high: true + + output-low: true + + required: + - pins + - function + + additionalProperties: false + +allOf: + - $ref: pinctrl.yaml# + +required: + - compatible + - reg + - clocks + - clock-names + - gpio-controller + - '#gpio-cells' + - gpio-ranges + +additionalProperties: false + +examples: + - | + #include + pinctrl@33c0000 { + compatible = "qcom,sc8280xp-lpass-lpi-pinctrl"; + reg = <0x33c0000 0x20000>, + <0x3550000 0x10000>; + clocks = <&q6afecc LPASS_HW_MACRO_VOTE LPASS_CLK_ATTRIBUTE_COUPLE_NO>, + <&q6afecc LPASS_HW_DCODEC_VOTE LPASS_CLK_ATTRIBUTE_COUPLE_NO>; + clock-names = "core", "audio"; + gpio-controller; + #gpio-cells = <2>; + gpio-ranges = <&lpi_tlmm 0 0 18>; + }; -- GitLab From 67f40373ee7b419374b191cedd63a05afd33a459 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Wed, 17 Aug 2022 12:37:47 +0100 Subject: [PATCH 0067/2223] pinctrl: qcom: Add
sc8280xp lpass lpi pinctrl driver Add pinctrl driver to support pin configuration for LPASS (Low Power Audio SubSystem) LPI (Low Power Island) pinctrl on SC8280XP. This IP is an additional pin control block for Audio Pins on top of the existing SoC Top level pin-controller. Hardware setup looks like: TLMM GPIO[189 - 207] --> LPASS LPI GPIO [0 - 18] This pin controller has some similarities compared to Top level msm SoC Pin controller like 'each pin belongs to a single group' and so on. However this one is intended to control only audio pins in particular, which can not be configured/touched by the Top level SoC pin controller except setting them as gpios. Apart from this, slew rate is also available in this block for certain pins which are connected to SLIMbus or SoundWire Bus. Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220817113747.9111-3-srinivas.kandagatla@linaro.org Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/Kconfig | 9 + drivers/pinctrl/qcom/Makefile | 1 + .../pinctrl/qcom/pinctrl-sc8280xp-lpass-lpi.c | 207 ++++++++++++++++++ 3 files changed, 217 insertions(+) create mode 100644 drivers/pinctrl/qcom/pinctrl-sc8280xp-lpass-lpi.c diff --git a/drivers/pinctrl/qcom/Kconfig b/drivers/pinctrl/qcom/Kconfig index 35e59f940ddb4..2961b5eb8e10a 100644 --- a/drivers/pinctrl/qcom/Kconfig +++ b/drivers/pinctrl/qcom/Kconfig @@ -399,6 +399,15 @@ config PINCTRL_SM8450_LPASS_LPI Qualcomm Technologies Inc LPASS (Low Power Audio SubSystem) LPI (Low Power Island) found on the Qualcomm Technologies Inc SM8450 platform. +config PINCTRL_SC8280XP_LPASS_LPI + tristate "Qualcomm Technologies Inc SC8280XP LPASS LPI pin controller driver" + depends on GPIOLIB + depends on PINCTRL_LPASS_LPI + help + This is the pinctrl, pinmux, pinconf and gpiolib driver for the + Qualcomm Technologies Inc LPASS (Low Power Audio SubSystem) LPI + (Low Power Island) found on the Qualcomm Technologies Inc SC8280XP platform.
+ config PINCTRL_LPASS_LPI tristate "Qualcomm Technologies Inc LPASS LPI pin controller driver" select PINMUX diff --git a/drivers/pinctrl/qcom/Makefile b/drivers/pinctrl/qcom/Makefile index 06e4cddbca68e..8269a1db8794a 100644 --- a/drivers/pinctrl/qcom/Makefile +++ b/drivers/pinctrl/qcom/Makefile @@ -46,4 +46,5 @@ obj-$(CONFIG_PINCTRL_SM8250_LPASS_LPI) += pinctrl-sm8250-lpass-lpi.o obj-$(CONFIG_PINCTRL_SM8350) += pinctrl-sm8350.o obj-$(CONFIG_PINCTRL_SM8450) += pinctrl-sm8450.o obj-$(CONFIG_PINCTRL_SM8450_LPASS_LPI) += pinctrl-sm8450-lpass-lpi.o +obj-$(CONFIG_PINCTRL_SC8280XP_LPASS_LPI) += pinctrl-sc8280xp-lpass-lpi.o obj-$(CONFIG_PINCTRL_LPASS_LPI) += pinctrl-lpass-lpi.o diff --git a/drivers/pinctrl/qcom/pinctrl-sc8280xp-lpass-lpi.c b/drivers/pinctrl/qcom/pinctrl-sc8280xp-lpass-lpi.c new file mode 100644 index 0000000000000..4b9c0beac32ef --- /dev/null +++ b/drivers/pinctrl/qcom/pinctrl-sc8280xp-lpass-lpi.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Linaro Ltd. 
+ */ + +#include +#include +#include + +#include "pinctrl-lpass-lpi.h" + +enum lpass_lpi_functions { + LPI_MUX_dmic1_clk, + LPI_MUX_dmic1_data, + LPI_MUX_dmic2_clk, + LPI_MUX_dmic2_data, + LPI_MUX_dmic3_clk, + LPI_MUX_dmic3_data, + LPI_MUX_dmic4_clk, + LPI_MUX_dmic4_data, + LPI_MUX_i2s1_clk, + LPI_MUX_i2s1_data, + LPI_MUX_i2s1_ws, + LPI_MUX_i2s2_clk, + LPI_MUX_i2s2_data, + LPI_MUX_i2s2_ws, + LPI_MUX_i2s3_clk, + LPI_MUX_i2s3_data, + LPI_MUX_i2s3_ws, + LPI_MUX_qua_mi2s_data, + LPI_MUX_qua_mi2s_sclk, + LPI_MUX_qua_mi2s_ws, + LPI_MUX_swr_rx_clk, + LPI_MUX_swr_rx_data, + LPI_MUX_swr_tx_clk, + LPI_MUX_swr_tx_data, + LPI_MUX_wsa_swr_clk, + LPI_MUX_wsa_swr_data, + LPI_MUX_wsa2_swr_clk, + LPI_MUX_wsa2_swr_data, + LPI_MUX_ext_mclk1_a, + LPI_MUX_ext_mclk1_b, + LPI_MUX_ext_mclk1_c, + LPI_MUX_gpio, + LPI_MUX__, +}; + +static int gpio0_pins[] = { 0 }; +static int gpio1_pins[] = { 1 }; +static int gpio2_pins[] = { 2 }; +static int gpio3_pins[] = { 3 }; +static int gpio4_pins[] = { 4 }; +static int gpio5_pins[] = { 5 }; +static int gpio6_pins[] = { 6 }; +static int gpio7_pins[] = { 7 }; +static int gpio8_pins[] = { 8 }; +static int gpio9_pins[] = { 9 }; +static int gpio10_pins[] = { 10 }; +static int gpio11_pins[] = { 11 }; +static int gpio12_pins[] = { 12 }; +static int gpio13_pins[] = { 13 }; +static int gpio14_pins[] = { 14 }; +static int gpio15_pins[] = { 15 }; +static int gpio16_pins[] = { 16 }; +static int gpio17_pins[] = { 17 }; +static int gpio18_pins[] = { 18 }; + +static const struct pinctrl_pin_desc sc8280xp_lpi_pins[] = { + PINCTRL_PIN(0, "gpio0"), + PINCTRL_PIN(1, "gpio1"), + PINCTRL_PIN(2, "gpio2"), + PINCTRL_PIN(3, "gpio3"), + PINCTRL_PIN(4, "gpio4"), + PINCTRL_PIN(5, "gpio5"), + PINCTRL_PIN(6, "gpio6"), + PINCTRL_PIN(7, "gpio7"), + PINCTRL_PIN(8, "gpio8"), + PINCTRL_PIN(9, "gpio9"), + PINCTRL_PIN(10, "gpio10"), + PINCTRL_PIN(11, "gpio11"), + PINCTRL_PIN(12, "gpio12"), + PINCTRL_PIN(13, "gpio13"), + PINCTRL_PIN(14, "gpio14"), + PINCTRL_PIN(15, "gpio15"), + 
PINCTRL_PIN(16, "gpio16"), + PINCTRL_PIN(17, "gpio17"), + PINCTRL_PIN(18, "gpio18"), +}; + +static const char * const swr_tx_clk_groups[] = { "gpio0" }; +static const char * const swr_tx_data_groups[] = { "gpio1", "gpio2", "gpio14" }; +static const char * const swr_rx_clk_groups[] = { "gpio3" }; +static const char * const swr_rx_data_groups[] = { "gpio4", "gpio5" }; +static const char * const dmic1_clk_groups[] = { "gpio6" }; +static const char * const dmic1_data_groups[] = { "gpio7" }; +static const char * const dmic2_clk_groups[] = { "gpio8" }; +static const char * const dmic2_data_groups[] = { "gpio9" }; +static const char * const dmic4_clk_groups[] = { "gpio17" }; +static const char * const dmic4_data_groups[] = { "gpio18" }; +static const char * const i2s2_clk_groups[] = { "gpio10" }; +static const char * const i2s2_ws_groups[] = { "gpio11" }; +static const char * const dmic3_clk_groups[] = { "gpio12" }; +static const char * const dmic3_data_groups[] = { "gpio13" }; +static const char * const qua_mi2s_sclk_groups[] = { "gpio0" }; +static const char * const qua_mi2s_ws_groups[] = { "gpio1" }; +static const char * const qua_mi2s_data_groups[] = { "gpio2", "gpio3", "gpio4", "gpio5" }; +static const char * const i2s1_clk_groups[] = { "gpio6" }; +static const char * const i2s1_ws_groups[] = { "gpio7" }; +static const char * const i2s1_data_groups[] = { "gpio8", "gpio9" }; +static const char * const wsa_swr_clk_groups[] = { "gpio10" }; +static const char * const wsa_swr_data_groups[] = { "gpio11" }; +static const char * const wsa2_swr_clk_groups[] = { "gpio15" }; +static const char * const wsa2_swr_data_groups[] = { "gpio16" }; +static const char * const i2s2_data_groups[] = { "gpio15", "gpio16" }; +static const char * const i2s3_clk_groups[] = { "gpio12"}; +static const char * const i2s3_ws_groups[] = { "gpio13"}; +static const char * const i2s3_data_groups[] = { "gpio17", "gpio18"}; +static const char * const ext_mclk1_c_groups[] = { "gpio5" }; +static const char 
* const ext_mclk1_b_groups[] = { "gpio9" }; +static const char * const ext_mclk1_a_groups[] = { "gpio13" }; + +static const struct lpi_pingroup sc8280xp_groups[] = { + LPI_PINGROUP(0, 0, swr_tx_clk, qua_mi2s_sclk, _, _), + LPI_PINGROUP(1, 2, swr_tx_data, qua_mi2s_ws, _, _), + LPI_PINGROUP(2, 4, swr_tx_data, qua_mi2s_data, _, _), + LPI_PINGROUP(3, 8, swr_rx_clk, qua_mi2s_data, _, _), + LPI_PINGROUP(4, 10, swr_rx_data, qua_mi2s_data, _, _), + LPI_PINGROUP(5, 12, swr_rx_data, ext_mclk1_c, qua_mi2s_data, _), + LPI_PINGROUP(6, LPI_NO_SLEW, dmic1_clk, i2s1_clk, _, _), + LPI_PINGROUP(7, LPI_NO_SLEW, dmic1_data, i2s1_ws, _, _), + LPI_PINGROUP(8, LPI_NO_SLEW, dmic2_clk, i2s1_data, _, _), + LPI_PINGROUP(9, LPI_NO_SLEW, dmic2_data, i2s1_data, ext_mclk1_b, _), + LPI_PINGROUP(10, 16, i2s2_clk, wsa_swr_clk, _, _), + LPI_PINGROUP(11, 18, i2s2_ws, wsa_swr_data, _, _), + LPI_PINGROUP(12, LPI_NO_SLEW, dmic3_clk, i2s3_clk, _, _), + LPI_PINGROUP(13, LPI_NO_SLEW, dmic3_data, i2s3_ws, ext_mclk1_a, _), + LPI_PINGROUP(14, 6, swr_tx_data, _, _, _), + LPI_PINGROUP(15, 20, i2s2_data, wsa2_swr_clk, _, _), + LPI_PINGROUP(16, 22, i2s2_data, wsa2_swr_data, _, _), + LPI_PINGROUP(17, LPI_NO_SLEW, dmic4_clk, i2s3_data, _, _), + LPI_PINGROUP(18, LPI_NO_SLEW, dmic4_data, i2s3_data, _, _), +}; + +static const struct lpi_function sc8280xp_functions[] = { + LPI_FUNCTION(dmic1_clk), + LPI_FUNCTION(dmic1_data), + LPI_FUNCTION(dmic2_clk), + LPI_FUNCTION(dmic2_data), + LPI_FUNCTION(dmic3_clk), + LPI_FUNCTION(dmic3_data), + LPI_FUNCTION(dmic4_clk), + LPI_FUNCTION(dmic4_data), + LPI_FUNCTION(i2s1_clk), + LPI_FUNCTION(i2s1_data), + LPI_FUNCTION(i2s1_ws), + LPI_FUNCTION(i2s2_clk), + LPI_FUNCTION(i2s2_data), + LPI_FUNCTION(i2s2_ws), + LPI_FUNCTION(i2s3_clk), + LPI_FUNCTION(i2s3_data), + LPI_FUNCTION(i2s3_ws), + LPI_FUNCTION(qua_mi2s_data), + LPI_FUNCTION(qua_mi2s_sclk), + LPI_FUNCTION(qua_mi2s_ws), + LPI_FUNCTION(swr_rx_clk), + LPI_FUNCTION(swr_rx_data), + LPI_FUNCTION(swr_tx_clk), + LPI_FUNCTION(swr_tx_data), + 
LPI_FUNCTION(wsa_swr_clk), + LPI_FUNCTION(wsa_swr_data), + LPI_FUNCTION(wsa2_swr_clk), + LPI_FUNCTION(wsa2_swr_data), + LPI_FUNCTION(ext_mclk1_a), + LPI_FUNCTION(ext_mclk1_b), + LPI_FUNCTION(ext_mclk1_c), +}; + +static const struct lpi_pinctrl_variant_data sc8280xp_lpi_data = { + .pins = sc8280xp_lpi_pins, + .npins = ARRAY_SIZE(sc8280xp_lpi_pins), + .groups = sc8280xp_groups, + .ngroups = ARRAY_SIZE(sc8280xp_groups), + .functions = sc8280xp_functions, + .nfunctions = ARRAY_SIZE(sc8280xp_functions), +}; + +static const struct of_device_id lpi_pinctrl_of_match[] = { + { + .compatible = "qcom,sc8280xp-lpass-lpi-pinctrl", + .data = &sc8280xp_lpi_data, + }, + { } +}; +MODULE_DEVICE_TABLE(of, lpi_pinctrl_of_match); + +static struct platform_driver lpi_pinctrl_driver = { + .driver = { + .name = "qcom-sc8280xp-lpass-lpi-pinctrl", + .of_match_table = lpi_pinctrl_of_match, + }, + .probe = lpi_pinctrl_probe, + .remove = lpi_pinctrl_remove, +}; + +module_platform_driver(lpi_pinctrl_driver); +MODULE_DESCRIPTION("QTI SC8280XP LPI GPIO pin control driver"); +MODULE_LICENSE("GPL"); -- GitLab From 9f1bdd7e822147a481cd75c0b2ac4d0199ac70d3 Mon Sep 17 00:00:00 2001 From: "Hui.Liu" Date: Thu, 18 Aug 2022 15:50:11 +0800 Subject: [PATCH 0068/2223] dt-bindings: pinctrl: mediatek: add support for mt8188 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the pinctrl header file on MediaTek mt8188. Add the new binding document for pinctrl on MediaTek mt8188. Signed-off-by: Hui.Liu Reviewed-by: Rob Herring Reviewed-by: Nícolas F. R. A. 
Prado Link: https://lore.kernel.org/r/20220818075012.20880-2-hui.liu@mediatek.com Signed-off-by: Linus Walleij --- .../pinctrl/mediatek,mt8188-pinctrl.yaml | 226 +++ .../pinctrl/mediatek,mt8188-pinfunc.h | 1280 +++++++++++++++++ 2 files changed, 1506 insertions(+) create mode 100644 Documentation/devicetree/bindings/pinctrl/mediatek,mt8188-pinctrl.yaml create mode 100644 include/dt-bindings/pinctrl/mediatek,mt8188-pinfunc.h diff --git a/Documentation/devicetree/bindings/pinctrl/mediatek,mt8188-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/mediatek,mt8188-pinctrl.yaml new file mode 100644 index 0000000000000..7e750f1e643d0 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/mediatek,mt8188-pinctrl.yaml @@ -0,0 +1,226 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/mediatek,mt8188-pinctrl.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek MT8188 Pin Controller + +maintainers: + - Hui Liu + +description: | + The MediaTek's MT8188 Pin controller is used to control SoC pins. + +properties: + compatible: + const: mediatek,mt8188-pinctrl + + gpio-controller: true + + '#gpio-cells': + description: | + Number of cells in GPIO specifier, should be two. The first cell + is the pin number, the second cell is used to specify optional + parameters which are defined in . 
+ const: 2 + + gpio-ranges: + maxItems: 1 + + gpio-line-names: true + + reg: + items: + - description: gpio registers base address + - description: rm group io configuration registers base address + - description: lt group io configuration registers base address + - description: lm group io configuration registers base address + - description: rt group io configuration registers base address + - description: eint registers base address + + reg-names: + items: + - const: iocfg0 + - const: iocfg_rm + - const: iocfg_lt + - const: iocfg_lm + - const: iocfg_rt + - const: eint + + interrupt-controller: true + + '#interrupt-cells': + const: 2 + + interrupts: + description: The interrupt outputs to sysirq. + maxItems: 1 + + mediatek,rsel-resistance-in-si-unit: + type: boolean + description: | + We provide two methods to select the resistance for I2C when pull up or pull down. + The first is by RSEL definition value, the other is by resistance value(ohm). + This flag is used to identify if the method is resistance(si unit) value. + +# PIN CONFIGURATION NODES +patternProperties: + '-pins$': + type: object + additionalProperties: false + + patternProperties: + '^pins': + type: object + $ref: "/schemas/pinctrl/pincfg-node.yaml" + additionalProperties: false + description: | + A pinctrl node should contain at least one subnode representing the + pinctrl groups available on the machine. Each subnode will list the + pins it needs, and how they should be configured, with regard to muxer + configuration, pullups, drive strength, input enable/disable and + input schmitt. + + properties: + pinmux: + description: | + Integer array, represents gpio pin number and mux setting. + Supported pin numbers and mux settings vary for different SoCs, and are + defined as macros in dt-bindings/pinctrl/mediatek,<soc>-pinfunc.h + directly. 
+ + drive-strength: + enum: [2, 4, 6, 8, 10, 12, 14, 16] + + drive-strength-microamp: + enum: [125, 250, 500, 1000] + + bias-pull-down: + oneOf: + - type: boolean + - enum: [100, 101, 102, 103] + description: mt8188 pull down PUPD/R0/R1 type define value. + - enum: [200, 201, 202, 203, 204, 205, 206, 207] + description: mt8188 pull down RSEL type define value. + - enum: [75000, 5000] + description: mt8188 pull down RSEL type si unit value(ohm). + description: | + For pull down type is normal, it doesn't need to add RSEL & R1R0 define + and resistance value. + For pull down type is PUPD/R0/R1 type, it can add R1R0 define to + set different resistance. It can support "MTK_PUPD_SET_R1R0_00" & + "MTK_PUPD_SET_R1R0_01" & "MTK_PUPD_SET_R1R0_10" & "MTK_PUPD_SET_R1R0_11" + define in mt8188. + For pull down type is RSEL, it can add RSEL define & resistance value(ohm) + to set different resistance by identifying property "mediatek,rsel-resistance-in-si-unit". + It can support "MTK_PULL_SET_RSEL_000" & "MTK_PULL_SET_RSEL_001" + & "MTK_PULL_SET_RSEL_010" & "MTK_PULL_SET_RSEL_011" & "MTK_PULL_SET_RSEL_100" + & "MTK_PULL_SET_RSEL_101" & "MTK_PULL_SET_RSEL_110" & "MTK_PULL_SET_RSEL_111" + define in mt8188. It can also support resistance value(ohm) "75000" & "5000" in mt8188. + + bias-pull-up: + oneOf: + - type: boolean + - enum: [100, 101, 102, 103] + description: mt8188 pull up PUPD/R0/R1 type define value. + - enum: [200, 201, 202, 203, 204, 205, 206, 207] + description: mt8188 pull up RSEL type define value. + - enum: [1000, 1500, 2000, 3000, 4000, 5000, 10000, 75000] + description: mt8188 pull up RSEL type si unit value(ohm). + description: | + For pull up type is normal, it doesn't need to add RSEL & R1R0 define + and resistance value. + For pull up type is PUPD/R0/R1 type, it can add R1R0 define to + set different resistance. It can support "MTK_PUPD_SET_R1R0_00" & + "MTK_PUPD_SET_R1R0_01" & "MTK_PUPD_SET_R1R0_10" & "MTK_PUPD_SET_R1R0_11" + define in mt8188. 
+ For pull up type is RSEL, it can add RSEL define & resistance value(ohm) + to set different resistance by identifying property "mediatek,rsel-resistance-in-si-unit". + It can support "MTK_PULL_SET_RSEL_000" & "MTK_PULL_SET_RSEL_001" + & "MTK_PULL_SET_RSEL_010" & "MTK_PULL_SET_RSEL_011" & "MTK_PULL_SET_RSEL_100" + & "MTK_PULL_SET_RSEL_101" & "MTK_PULL_SET_RSEL_110" & "MTK_PULL_SET_RSEL_111" + define in mt8188. It can also support resistance value(ohm) + "1000" & "1500" & "2000" & "3000" & "4000" & "5000" & "10000" & "75000" in mt8188. + + bias-disable: true + + output-high: true + + output-low: true + + input-enable: true + + input-disable: true + + input-schmitt-enable: true + + input-schmitt-disable: true + + required: + - pinmux + +required: + - compatible + - reg + - interrupts + - interrupt-controller + - '#interrupt-cells' + - gpio-controller + - '#gpio-cells' + - gpio-ranges + +additionalProperties: false + +examples: + - | + #include + #include + + pio: pinctrl@10005000 { + compatible = "mediatek,mt8188-pinctrl"; + reg = <0x10005000 0x1000>, + <0x11c00000 0x1000>, + <0x11e10000 0x1000>, + <0x11e20000 0x1000>, + <0x11ea0000 0x1000>, + <0x1000b000 0x1000>; + reg-names = "iocfg0", "iocfg_rm", + "iocfg_lt", "iocfg_lm", "iocfg_rt", + "eint"; + gpio-controller; + #gpio-cells = <2>; + gpio-ranges = <&pio 0 0 176>; + interrupt-controller; + interrupts = ; + #interrupt-cells = <2>; + + pio-pins { + pins { + pinmux = ; + output-low; + }; + }; + + spi0-pins { + pins-spi { + pinmux = , + , + ; + drive-strength = <6>; + }; + pins-spi-mi { + pinmux = ; + bias-pull-down = ; + }; + }; + + i2c0-pins { + pins { + pinmux = , + ; + bias-disable; + drive-strength-microamp = <1000>; + }; + }; + }; diff --git a/include/dt-bindings/pinctrl/mediatek,mt8188-pinfunc.h b/include/dt-bindings/pinctrl/mediatek,mt8188-pinfunc.h new file mode 100644 index 0000000000000..2688da2f621fd --- /dev/null +++ b/include/dt-bindings/pinctrl/mediatek,mt8188-pinfunc.h @@ -0,0 +1,1280 @@ +/* 
SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (C) 2022 MediaTek Inc. + * Author: Hui Liu + */ + +#ifndef __MEDIATEK_MT8188_PINFUNC_H +#define __MEDIATEK_MT8188_PINFUNC_H + +#include "mt65xx.h" + +#define PINMUX_GPIO0__FUNC_B_GPIO0 (MTK_PIN_NO(0) | 0) +#define PINMUX_GPIO0__FUNC_B0_TP_GPIO0_AO (MTK_PIN_NO(0) | 1) +#define PINMUX_GPIO0__FUNC_O_SPIM5_CSB (MTK_PIN_NO(0) | 2) +#define PINMUX_GPIO0__FUNC_O_UTXD1 (MTK_PIN_NO(0) | 3) +#define PINMUX_GPIO0__FUNC_O_DMIC3_CLK (MTK_PIN_NO(0) | 4) +#define PINMUX_GPIO0__FUNC_B0_I2SIN_MCK (MTK_PIN_NO(0) | 5) +#define PINMUX_GPIO0__FUNC_O_I2SO2_MCK (MTK_PIN_NO(0) | 6) +#define PINMUX_GPIO0__FUNC_B0_DBG_MON_A0 (MTK_PIN_NO(0) | 7) + +#define PINMUX_GPIO1__FUNC_B_GPIO1 (MTK_PIN_NO(1) | 0) +#define PINMUX_GPIO1__FUNC_B0_TP_GPIO1_AO (MTK_PIN_NO(1) | 1) +#define PINMUX_GPIO1__FUNC_O_SPIM5_CLK (MTK_PIN_NO(1) | 2) +#define PINMUX_GPIO1__FUNC_I1_URXD1 (MTK_PIN_NO(1) | 3) +#define PINMUX_GPIO1__FUNC_I0_DMIC3_DAT (MTK_PIN_NO(1) | 4) +#define PINMUX_GPIO1__FUNC_B0_I2SIN_BCK (MTK_PIN_NO(1) | 5) +#define PINMUX_GPIO1__FUNC_B0_I2SO2_BCK (MTK_PIN_NO(1) | 6) +#define PINMUX_GPIO1__FUNC_B0_DBG_MON_A1 (MTK_PIN_NO(1) | 7) + +#define PINMUX_GPIO2__FUNC_B_GPIO2 (MTK_PIN_NO(2) | 0) +#define PINMUX_GPIO2__FUNC_B0_TP_GPIO2_AO (MTK_PIN_NO(2) | 1) +#define PINMUX_GPIO2__FUNC_B0_SPIM5_MOSI (MTK_PIN_NO(2) | 2) +#define PINMUX_GPIO2__FUNC_O_URTS1 (MTK_PIN_NO(2) | 3) +#define PINMUX_GPIO2__FUNC_I0_DMIC3_DAT_R (MTK_PIN_NO(2) | 4) +#define PINMUX_GPIO2__FUNC_B0_I2SIN_WS (MTK_PIN_NO(2) | 5) +#define PINMUX_GPIO2__FUNC_B0_I2SO2_WS (MTK_PIN_NO(2) | 6) +#define PINMUX_GPIO2__FUNC_B0_DBG_MON_A2 (MTK_PIN_NO(2) | 7) + +#define PINMUX_GPIO3__FUNC_B_GPIO3 (MTK_PIN_NO(3) | 0) +#define PINMUX_GPIO3__FUNC_B0_TP_GPIO3_AO (MTK_PIN_NO(3) | 1) +#define PINMUX_GPIO3__FUNC_B0_SPIM5_MISO (MTK_PIN_NO(3) | 2) +#define PINMUX_GPIO3__FUNC_I1_UCTS1 (MTK_PIN_NO(3) | 3) +#define PINMUX_GPIO3__FUNC_O_DMIC4_CLK (MTK_PIN_NO(3) | 4) +#define 
PINMUX_GPIO3__FUNC_I0_I2SIN_D0 (MTK_PIN_NO(3) | 5) +#define PINMUX_GPIO3__FUNC_O_I2SO2_D0 (MTK_PIN_NO(3) | 6) +#define PINMUX_GPIO3__FUNC_B0_DBG_MON_A3 (MTK_PIN_NO(3) | 7) + +#define PINMUX_GPIO4__FUNC_B_GPIO4 (MTK_PIN_NO(4) | 0) +#define PINMUX_GPIO4__FUNC_B0_TP_GPIO4_AO (MTK_PIN_NO(4) | 1) +#define PINMUX_GPIO4__FUNC_I0_SPDIF_IN2 (MTK_PIN_NO(4) | 2) +#define PINMUX_GPIO4__FUNC_O_I2SO1_MCK (MTK_PIN_NO(4) | 3) +#define PINMUX_GPIO4__FUNC_I0_DMIC4_DAT (MTK_PIN_NO(4) | 4) +#define PINMUX_GPIO4__FUNC_I0_I2SIN_D1 (MTK_PIN_NO(4) | 5) +#define PINMUX_GPIO4__FUNC_O_I2SO2_D1 (MTK_PIN_NO(4) | 6) +#define PINMUX_GPIO4__FUNC_B0_DBG_MON_A4 (MTK_PIN_NO(4) | 7) + +#define PINMUX_GPIO5__FUNC_B_GPIO5 (MTK_PIN_NO(5) | 0) +#define PINMUX_GPIO5__FUNC_B0_TP_GPIO5_AO (MTK_PIN_NO(5) | 1) +#define PINMUX_GPIO5__FUNC_I0_SPDIF_IN1 (MTK_PIN_NO(5) | 2) +#define PINMUX_GPIO5__FUNC_O_I2SO1_BCK (MTK_PIN_NO(5) | 3) +#define PINMUX_GPIO5__FUNC_I0_DMIC4_DAT_R (MTK_PIN_NO(5) | 4) +#define PINMUX_GPIO5__FUNC_I0_I2SIN_D2 (MTK_PIN_NO(5) | 5) +#define PINMUX_GPIO5__FUNC_O_I2SO2_D2 (MTK_PIN_NO(5) | 6) +#define PINMUX_GPIO5__FUNC_B0_DBG_MON_A5 (MTK_PIN_NO(5) | 7) + +#define PINMUX_GPIO6__FUNC_B_GPIO6 (MTK_PIN_NO(6) | 0) +#define PINMUX_GPIO6__FUNC_B0_TP_GPIO6_AO (MTK_PIN_NO(6) | 1) +#define PINMUX_GPIO6__FUNC_I0_SPDIF_IN0 (MTK_PIN_NO(6) | 2) +#define PINMUX_GPIO6__FUNC_O_I2SO1_WS (MTK_PIN_NO(6) | 3) +#define PINMUX_GPIO6__FUNC_O_DMIC1_CLK (MTK_PIN_NO(6) | 4) +#define PINMUX_GPIO6__FUNC_I0_I2SIN_D3 (MTK_PIN_NO(6) | 5) +#define PINMUX_GPIO6__FUNC_O_I2SO2_D3 (MTK_PIN_NO(6) | 6) +#define PINMUX_GPIO6__FUNC_B0_MD32_0_GPIO0 (MTK_PIN_NO(6) | 7) + +#define PINMUX_GPIO7__FUNC_B_GPIO7 (MTK_PIN_NO(7) | 0) +#define PINMUX_GPIO7__FUNC_B0_TP_GPIO7_AO (MTK_PIN_NO(7) | 1) +#define PINMUX_GPIO7__FUNC_O_SPIM3_CSB (MTK_PIN_NO(7) | 2) +#define PINMUX_GPIO7__FUNC_B0_TDMIN_MCK (MTK_PIN_NO(7) | 3) +#define PINMUX_GPIO7__FUNC_I0_DMIC1_DAT (MTK_PIN_NO(7) | 4) +#define PINMUX_GPIO7__FUNC_O_CMVREF0 (MTK_PIN_NO(7) | 5) +#define 
PINMUX_GPIO7__FUNC_O_CLKM0 (MTK_PIN_NO(7) | 6) +#define PINMUX_GPIO7__FUNC_B0_DBG_MON_A6 (MTK_PIN_NO(7) | 7) + +#define PINMUX_GPIO8__FUNC_B_GPIO8 (MTK_PIN_NO(8) | 0) +#define PINMUX_GPIO8__FUNC_B0_TP_GPIO0_AO (MTK_PIN_NO(8) | 1) +#define PINMUX_GPIO8__FUNC_O_SPIM3_CLK (MTK_PIN_NO(8) | 2) +#define PINMUX_GPIO8__FUNC_B0_TDMIN_BCK (MTK_PIN_NO(8) | 3) +#define PINMUX_GPIO8__FUNC_I0_DMIC1_DAT_R (MTK_PIN_NO(8) | 4) +#define PINMUX_GPIO8__FUNC_O_CMVREF1 (MTK_PIN_NO(8) | 5) +#define PINMUX_GPIO8__FUNC_O_CLKM1 (MTK_PIN_NO(8) | 6) +#define PINMUX_GPIO8__FUNC_B0_DBG_MON_A7 (MTK_PIN_NO(8) | 7) + +#define PINMUX_GPIO9__FUNC_B_GPIO9 (MTK_PIN_NO(9) | 0) +#define PINMUX_GPIO9__FUNC_B0_TP_GPIO1_AO (MTK_PIN_NO(9) | 1) +#define PINMUX_GPIO9__FUNC_B0_SPIM3_MOSI (MTK_PIN_NO(9) | 2) +#define PINMUX_GPIO9__FUNC_B0_TDMIN_LRCK (MTK_PIN_NO(9) | 3) +#define PINMUX_GPIO9__FUNC_O_DMIC2_CLK (MTK_PIN_NO(9) | 4) +#define PINMUX_GPIO9__FUNC_O_CMFLASH0 (MTK_PIN_NO(9) | 5) +#define PINMUX_GPIO9__FUNC_O_PWM_0 (MTK_PIN_NO(9) | 6) +#define PINMUX_GPIO9__FUNC_B0_DBG_MON_A8 (MTK_PIN_NO(9) | 7) + +#define PINMUX_GPIO10__FUNC_B_GPIO10 (MTK_PIN_NO(10) | 0) +#define PINMUX_GPIO10__FUNC_B0_TP_GPIO2_AO (MTK_PIN_NO(10) | 1) +#define PINMUX_GPIO10__FUNC_B0_SPIM3_MISO (MTK_PIN_NO(10) | 2) +#define PINMUX_GPIO10__FUNC_I0_TDMIN_DI (MTK_PIN_NO(10) | 3) +#define PINMUX_GPIO10__FUNC_I0_DMIC2_DAT (MTK_PIN_NO(10) | 4) +#define PINMUX_GPIO10__FUNC_O_CMFLASH1 (MTK_PIN_NO(10) | 5) +#define PINMUX_GPIO10__FUNC_O_PWM_1 (MTK_PIN_NO(10) | 6) +#define PINMUX_GPIO10__FUNC_B0_DBG_MON_A9 (MTK_PIN_NO(10) | 7) + +#define PINMUX_GPIO11__FUNC_B_GPIO11 (MTK_PIN_NO(11) | 0) +#define PINMUX_GPIO11__FUNC_B0_TP_GPIO3_AO (MTK_PIN_NO(11) | 1) +#define PINMUX_GPIO11__FUNC_O_SPDIF_OUT (MTK_PIN_NO(11) | 2) +#define PINMUX_GPIO11__FUNC_O_I2SO1_D0 (MTK_PIN_NO(11) | 3) +#define PINMUX_GPIO11__FUNC_I0_DMIC2_DAT_R (MTK_PIN_NO(11) | 4) +#define PINMUX_GPIO11__FUNC_I0_DVFSRC_EXT_REQ (MTK_PIN_NO(11) | 5) +#define PINMUX_GPIO11__FUNC_O_CMVREF6 
(MTK_PIN_NO(11) | 6) +#define PINMUX_GPIO11__FUNC_B0_DBG_MON_A10 (MTK_PIN_NO(11) | 7) + +#define PINMUX_GPIO12__FUNC_B_GPIO12 (MTK_PIN_NO(12) | 0) +#define PINMUX_GPIO12__FUNC_B0_TP_GPIO4_AO (MTK_PIN_NO(12) | 1) +#define PINMUX_GPIO12__FUNC_O_SPIM4_CSB (MTK_PIN_NO(12) | 2) +#define PINMUX_GPIO12__FUNC_B1_JTMS_SEL3 (MTK_PIN_NO(12) | 3) +#define PINMUX_GPIO12__FUNC_B1_APU_JTAG_TMS (MTK_PIN_NO(12) | 4) +#define PINMUX_GPIO12__FUNC_I0_VPU_UDI_TMS (MTK_PIN_NO(12) | 5) +#define PINMUX_GPIO12__FUNC_I0_IPU_JTAG_TMS (MTK_PIN_NO(12) | 6) +#define PINMUX_GPIO12__FUNC_I0_HDMITX20_HTPLG (MTK_PIN_NO(12) | 7) + +#define PINMUX_GPIO13__FUNC_B_GPIO13 (MTK_PIN_NO(13) | 0) +#define PINMUX_GPIO13__FUNC_B0_TP_GPIO5_AO (MTK_PIN_NO(13) | 1) +#define PINMUX_GPIO13__FUNC_O_SPIM4_CLK (MTK_PIN_NO(13) | 2) +#define PINMUX_GPIO13__FUNC_I0_JTCK_SEL3 (MTK_PIN_NO(13) | 3) +#define PINMUX_GPIO13__FUNC_I0_APU_JTAG_TCK (MTK_PIN_NO(13) | 4) +#define PINMUX_GPIO13__FUNC_I0_VPU_UDI_TCK (MTK_PIN_NO(13) | 5) +#define PINMUX_GPIO13__FUNC_I0_IPU_JTAG_TCK (MTK_PIN_NO(13) | 6) +#define PINMUX_GPIO13__FUNC_B1_HDMITX20_CEC (MTK_PIN_NO(13) | 7) + +#define PINMUX_GPIO14__FUNC_B_GPIO14 (MTK_PIN_NO(14) | 0) +#define PINMUX_GPIO14__FUNC_B0_TP_GPIO6_AO (MTK_PIN_NO(14) | 1) +#define PINMUX_GPIO14__FUNC_B0_SPIM4_MOSI (MTK_PIN_NO(14) | 2) +#define PINMUX_GPIO14__FUNC_I1_JTDI_SEL3 (MTK_PIN_NO(14) | 3) +#define PINMUX_GPIO14__FUNC_I1_APU_JTAG_TDI (MTK_PIN_NO(14) | 4) +#define PINMUX_GPIO14__FUNC_I0_VPU_UDI_TDI (MTK_PIN_NO(14) | 5) +#define PINMUX_GPIO14__FUNC_I0_IPU_JTAG_TDI (MTK_PIN_NO(14) | 6) +#define PINMUX_GPIO14__FUNC_B1_HDMITX20_SCL (MTK_PIN_NO(14) | 7) + +#define PINMUX_GPIO15__FUNC_B_GPIO15 (MTK_PIN_NO(15) | 0) +#define PINMUX_GPIO15__FUNC_B0_TP_GPIO7_AO (MTK_PIN_NO(15) | 1) +#define PINMUX_GPIO15__FUNC_B0_SPIM4_MISO (MTK_PIN_NO(15) | 2) +#define PINMUX_GPIO15__FUNC_O_JTDO_SEL3 (MTK_PIN_NO(15) | 3) +#define PINMUX_GPIO15__FUNC_O_APU_JTAG_TDO (MTK_PIN_NO(15) | 4) +#define PINMUX_GPIO15__FUNC_O_VPU_UDI_TDO 
(MTK_PIN_NO(15) | 5) +#define PINMUX_GPIO15__FUNC_O_IPU_JTAG_TDO (MTK_PIN_NO(15) | 6) +#define PINMUX_GPIO15__FUNC_B1_HDMITX20_SDA (MTK_PIN_NO(15) | 7) + +#define PINMUX_GPIO16__FUNC_B_GPIO16 (MTK_PIN_NO(16) | 0) +#define PINMUX_GPIO16__FUNC_B0_TP_GPIO0_AO (MTK_PIN_NO(16) | 1) +#define PINMUX_GPIO16__FUNC_O_UTXD3 (MTK_PIN_NO(16) | 2) +#define PINMUX_GPIO16__FUNC_I1_JTRSTn_SEL3 (MTK_PIN_NO(16) | 3) +#define PINMUX_GPIO16__FUNC_I0_APU_JTAG_TRST (MTK_PIN_NO(16) | 4) +#define PINMUX_GPIO16__FUNC_I0_VPU_UDI_NTRST (MTK_PIN_NO(16) | 5) +#define PINMUX_GPIO16__FUNC_I0_IPU_JTAG_TRST (MTK_PIN_NO(16) | 6) +#define PINMUX_GPIO16__FUNC_O_HDMITX20_PWR5V (MTK_PIN_NO(16) | 7) + +#define PINMUX_GPIO17__FUNC_B_GPIO17 (MTK_PIN_NO(17) | 0) +#define PINMUX_GPIO17__FUNC_B0_TP_GPIO1_AO (MTK_PIN_NO(17) | 1) +#define PINMUX_GPIO17__FUNC_I1_URXD3 (MTK_PIN_NO(17) | 2) +#define PINMUX_GPIO17__FUNC_O_CMFLASH2 (MTK_PIN_NO(17) | 3) +#define PINMUX_GPIO17__FUNC_I0_EDP_TX_HPD (MTK_PIN_NO(17) | 4) +#define PINMUX_GPIO17__FUNC_I0_DVFSRC_EXT_REQ (MTK_PIN_NO(17) | 5) +#define PINMUX_GPIO17__FUNC_O_CMVREF7 (MTK_PIN_NO(17) | 6) +#define PINMUX_GPIO17__FUNC_B0_MD32_0_GPIO1 (MTK_PIN_NO(17) | 7) + +#define PINMUX_GPIO18__FUNC_B_GPIO18 (MTK_PIN_NO(18) | 0) +#define PINMUX_GPIO18__FUNC_B0_TP_GPIO2_AO (MTK_PIN_NO(18) | 1) +#define PINMUX_GPIO18__FUNC_O_CMFLASH0 (MTK_PIN_NO(18) | 2) +#define PINMUX_GPIO18__FUNC_O_CMVREF4 (MTK_PIN_NO(18) | 3) +#define PINMUX_GPIO18__FUNC_B0_TDMIN_MCK (MTK_PIN_NO(18) | 4) +#define PINMUX_GPIO18__FUNC_O_UTXD1 (MTK_PIN_NO(18) | 5) +#define PINMUX_GPIO18__FUNC_O_TP_UTXD1_AO (MTK_PIN_NO(18) | 6) +#define PINMUX_GPIO18__FUNC_B0_DBG_MON_A11 (MTK_PIN_NO(18) | 7) + +#define PINMUX_GPIO19__FUNC_B_GPIO19 (MTK_PIN_NO(19) | 0) +#define PINMUX_GPIO19__FUNC_B0_TP_GPIO3_AO (MTK_PIN_NO(19) | 1) +#define PINMUX_GPIO19__FUNC_O_CMFLASH1 (MTK_PIN_NO(19) | 2) +#define PINMUX_GPIO19__FUNC_O_CMVREF5 (MTK_PIN_NO(19) | 3) +#define PINMUX_GPIO19__FUNC_B0_TDMIN_BCK (MTK_PIN_NO(19) | 4) +#define 
PINMUX_GPIO19__FUNC_I1_URXD1 (MTK_PIN_NO(19) | 5) +#define PINMUX_GPIO19__FUNC_I1_TP_URXD1_AO (MTK_PIN_NO(19) | 6) +#define PINMUX_GPIO19__FUNC_B0_DBG_MON_A12 (MTK_PIN_NO(19) | 7) + +#define PINMUX_GPIO20__FUNC_B_GPIO20 (MTK_PIN_NO(20) | 0) +#define PINMUX_GPIO20__FUNC_B0_TP_GPIO4_AO (MTK_PIN_NO(20) | 1) +#define PINMUX_GPIO20__FUNC_O_CMFLASH2 (MTK_PIN_NO(20) | 2) +#define PINMUX_GPIO20__FUNC_O_CLKM2 (MTK_PIN_NO(20) | 3) +#define PINMUX_GPIO20__FUNC_B0_TDMIN_LRCK (MTK_PIN_NO(20) | 4) +#define PINMUX_GPIO20__FUNC_O_URTS1 (MTK_PIN_NO(20) | 5) +#define PINMUX_GPIO20__FUNC_O_TP_URTS1_AO (MTK_PIN_NO(20) | 6) +#define PINMUX_GPIO20__FUNC_B0_DBG_MON_A13 (MTK_PIN_NO(20) | 7) + +#define PINMUX_GPIO21__FUNC_B_GPIO21 (MTK_PIN_NO(21) | 0) +#define PINMUX_GPIO21__FUNC_B0_TP_GPIO5_AO (MTK_PIN_NO(21) | 1) +#define PINMUX_GPIO21__FUNC_O_CMFLASH3 (MTK_PIN_NO(21) | 2) +#define PINMUX_GPIO21__FUNC_O_CLKM3 (MTK_PIN_NO(21) | 3) +#define PINMUX_GPIO21__FUNC_I0_TDMIN_DI (MTK_PIN_NO(21) | 4) +#define PINMUX_GPIO21__FUNC_I1_UCTS1 (MTK_PIN_NO(21) | 5) +#define PINMUX_GPIO21__FUNC_I1_TP_UCTS1_AO (MTK_PIN_NO(21) | 6) +#define PINMUX_GPIO21__FUNC_B0_DBG_MON_A14 (MTK_PIN_NO(21) | 7) + +#define PINMUX_GPIO22__FUNC_B_GPIO22 (MTK_PIN_NO(22) | 0) +#define PINMUX_GPIO22__FUNC_O_CMMCLK0 (MTK_PIN_NO(22) | 1) +#define PINMUX_GPIO22__FUNC_B0_TP_GPIO6_AO (MTK_PIN_NO(22) | 5) +#define PINMUX_GPIO22__FUNC_B0_DBG_MON_A15 (MTK_PIN_NO(22) | 7) + +#define PINMUX_GPIO23__FUNC_B_GPIO23 (MTK_PIN_NO(23) | 0) +#define PINMUX_GPIO23__FUNC_O_CMMCLK1 (MTK_PIN_NO(23) | 1) +#define PINMUX_GPIO23__FUNC_O_PWM_2 (MTK_PIN_NO(23) | 3) +#define PINMUX_GPIO23__FUNC_B1_PCIE_PHY_I2C_SCL (MTK_PIN_NO(23) | 4) +#define PINMUX_GPIO23__FUNC_B0_TP_GPIO7_AO (MTK_PIN_NO(23) | 5) +#define PINMUX_GPIO23__FUNC_I0_DP_TX_HPD (MTK_PIN_NO(23) | 6) +#define PINMUX_GPIO23__FUNC_B0_DBG_MON_A16 (MTK_PIN_NO(23) | 7) + +#define PINMUX_GPIO24__FUNC_B_GPIO24 (MTK_PIN_NO(24) | 0) +#define PINMUX_GPIO24__FUNC_O_CMMCLK2 (MTK_PIN_NO(24) | 1) +#define 
PINMUX_GPIO24__FUNC_O_PWM_3 (MTK_PIN_NO(24) | 3) +#define PINMUX_GPIO24__FUNC_B1_PCIE_PHY_I2C_SDA (MTK_PIN_NO(24) | 4) +#define PINMUX_GPIO24__FUNC_I0_DVFSRC_EXT_REQ (MTK_PIN_NO(24) | 5) +#define PINMUX_GPIO24__FUNC_I0_EDP_TX_HPD (MTK_PIN_NO(24) | 6) +#define PINMUX_GPIO24__FUNC_B0_MD32_0_GPIO2 (MTK_PIN_NO(24) | 7) + +#define PINMUX_GPIO25__FUNC_B_GPIO25 (MTK_PIN_NO(25) | 0) +#define PINMUX_GPIO25__FUNC_O_LCM_RST (MTK_PIN_NO(25) | 1) +#define PINMUX_GPIO25__FUNC_O_LCM1_RST (MTK_PIN_NO(25) | 2) +#define PINMUX_GPIO25__FUNC_I0_DP_TX_HPD (MTK_PIN_NO(25) | 3) + +#define PINMUX_GPIO26__FUNC_B_GPIO26 (MTK_PIN_NO(26) | 0) +#define PINMUX_GPIO26__FUNC_I0_DSI_TE (MTK_PIN_NO(26) | 1) +#define PINMUX_GPIO26__FUNC_I0_DSI1_TE (MTK_PIN_NO(26) | 2) +#define PINMUX_GPIO26__FUNC_I0_EDP_TX_HPD (MTK_PIN_NO(26) | 3) + +#define PINMUX_GPIO27__FUNC_B_GPIO27 (MTK_PIN_NO(27) | 0) +#define PINMUX_GPIO27__FUNC_O_LCM1_RST (MTK_PIN_NO(27) | 1) +#define PINMUX_GPIO27__FUNC_O_LCM_RST (MTK_PIN_NO(27) | 2) +#define PINMUX_GPIO27__FUNC_I0_DP_TX_HPD (MTK_PIN_NO(27) | 3) +#define PINMUX_GPIO27__FUNC_O_CMVREF2 (MTK_PIN_NO(27) | 4) +#define PINMUX_GPIO27__FUNC_O_mbistwriteen_trigger (MTK_PIN_NO(27) | 5) +#define PINMUX_GPIO27__FUNC_O_PWM_2 (MTK_PIN_NO(27) | 6) +#define PINMUX_GPIO27__FUNC_B0_DBG_MON_A17 (MTK_PIN_NO(27) | 7) + +#define PINMUX_GPIO28__FUNC_B_GPIO28 (MTK_PIN_NO(28) | 0) +#define PINMUX_GPIO28__FUNC_I0_DSI1_TE (MTK_PIN_NO(28) | 1) +#define PINMUX_GPIO28__FUNC_I0_DSI_TE (MTK_PIN_NO(28) | 2) +#define PINMUX_GPIO28__FUNC_I0_EDP_TX_HPD (MTK_PIN_NO(28) | 3) +#define PINMUX_GPIO28__FUNC_O_CMVREF3 (MTK_PIN_NO(28) | 4) +#define PINMUX_GPIO28__FUNC_O_mbistreaden_trigger (MTK_PIN_NO(28) | 5) +#define PINMUX_GPIO28__FUNC_O_PWM_3 (MTK_PIN_NO(28) | 6) +#define PINMUX_GPIO28__FUNC_B0_DBG_MON_A18 (MTK_PIN_NO(28) | 7) + +#define PINMUX_GPIO29__FUNC_B_GPIO29 (MTK_PIN_NO(29) | 0) +#define PINMUX_GPIO29__FUNC_O_DISP_PWM0 (MTK_PIN_NO(29) | 1) +#define PINMUX_GPIO29__FUNC_O_DISP_PWM1 (MTK_PIN_NO(29) | 2) + 
+#define PINMUX_GPIO30__FUNC_B_GPIO30 (MTK_PIN_NO(30) | 0) +#define PINMUX_GPIO30__FUNC_O_DISP_PWM1 (MTK_PIN_NO(30) | 1) +#define PINMUX_GPIO30__FUNC_O_DISP_PWM0 (MTK_PIN_NO(30) | 2) +#define PINMUX_GPIO30__FUNC_O_CMFLASH3 (MTK_PIN_NO(30) | 3) +#define PINMUX_GPIO30__FUNC_O_PWM_1 (MTK_PIN_NO(30) | 4) +#define PINMUX_GPIO30__FUNC_B0_DBG_MON_A19 (MTK_PIN_NO(30) | 7) + +#define PINMUX_GPIO31__FUNC_B_GPIO31 (MTK_PIN_NO(31) | 0) +#define PINMUX_GPIO31__FUNC_O_UTXD0 (MTK_PIN_NO(31) | 1) +#define PINMUX_GPIO31__FUNC_O_TP_UTXD1_AO (MTK_PIN_NO(31) | 2) +#define PINMUX_GPIO31__FUNC_O_ADSP_UTXD0 (MTK_PIN_NO(31) | 3) +#define PINMUX_GPIO31__FUNC_O_TP_UTXD2_AO (MTK_PIN_NO(31) | 4) +#define PINMUX_GPIO31__FUNC_O_MD32_0_TXD (MTK_PIN_NO(31) | 5) +#define PINMUX_GPIO31__FUNC_O_MD32_1_TXD (MTK_PIN_NO(31) | 6) +#define PINMUX_GPIO31__FUNC_O_SSPM_UTXD_AO (MTK_PIN_NO(31) | 7) + +#define PINMUX_GPIO32__FUNC_B_GPIO32 (MTK_PIN_NO(32) | 0) +#define PINMUX_GPIO32__FUNC_I1_URXD0 (MTK_PIN_NO(32) | 1) +#define PINMUX_GPIO32__FUNC_I1_TP_URXD1_AO (MTK_PIN_NO(32) | 2) +#define PINMUX_GPIO32__FUNC_I1_ADSP_URXD0 (MTK_PIN_NO(32) | 3) +#define PINMUX_GPIO32__FUNC_I1_TP_URXD2_AO (MTK_PIN_NO(32) | 4) +#define PINMUX_GPIO32__FUNC_I1_MD32_0_RXD (MTK_PIN_NO(32) | 5) +#define PINMUX_GPIO32__FUNC_I1_MD32_1_RXD (MTK_PIN_NO(32) | 6) +#define PINMUX_GPIO32__FUNC_I1_SSPM_URXD_AO (MTK_PIN_NO(32) | 7) + +#define PINMUX_GPIO33__FUNC_B_GPIO33 (MTK_PIN_NO(33) | 0) +#define PINMUX_GPIO33__FUNC_O_UTXD1 (MTK_PIN_NO(33) | 1) +#define PINMUX_GPIO33__FUNC_O_URTS2 (MTK_PIN_NO(33) | 2) +#define PINMUX_GPIO33__FUNC_O_ADSP_UTXD0 (MTK_PIN_NO(33) | 3) +#define PINMUX_GPIO33__FUNC_O_TP_UTXD1_AO (MTK_PIN_NO(33) | 4) +#define PINMUX_GPIO33__FUNC_O_mbistwriteen_trigger (MTK_PIN_NO(33) | 5) +#define PINMUX_GPIO33__FUNC_O_MD32_0_TXD (MTK_PIN_NO(33) | 6) +#define PINMUX_GPIO33__FUNC_O_SSPM_UTXD_AO (MTK_PIN_NO(33) | 7) + +#define PINMUX_GPIO34__FUNC_B_GPIO34 (MTK_PIN_NO(34) | 0) +#define PINMUX_GPIO34__FUNC_I1_URXD1 (MTK_PIN_NO(34) | 
1) +#define PINMUX_GPIO34__FUNC_I1_UCTS2 (MTK_PIN_NO(34) | 2) +#define PINMUX_GPIO34__FUNC_I1_ADSP_URXD0 (MTK_PIN_NO(34) | 3) +#define PINMUX_GPIO34__FUNC_I1_TP_URXD1_AO (MTK_PIN_NO(34) | 4) +#define PINMUX_GPIO34__FUNC_O_mbistreaden_trigger (MTK_PIN_NO(34) | 5) +#define PINMUX_GPIO34__FUNC_I1_MD32_0_RXD (MTK_PIN_NO(34) | 6) +#define PINMUX_GPIO34__FUNC_I1_SSPM_URXD_AO (MTK_PIN_NO(34) | 7) + +#define PINMUX_GPIO35__FUNC_B_GPIO35 (MTK_PIN_NO(35) | 0) +#define PINMUX_GPIO35__FUNC_O_UTXD2 (MTK_PIN_NO(35) | 1) +#define PINMUX_GPIO35__FUNC_O_URTS1 (MTK_PIN_NO(35) | 2) +#define PINMUX_GPIO35__FUNC_O_ADSP_UTXD0 (MTK_PIN_NO(35) | 3) +#define PINMUX_GPIO35__FUNC_O_TP_URTS1_AO (MTK_PIN_NO(35) | 4) +#define PINMUX_GPIO35__FUNC_O_TP_UTXD2_AO (MTK_PIN_NO(35) | 5) +#define PINMUX_GPIO35__FUNC_O_MD32_1_TXD (MTK_PIN_NO(35) | 6) +#define PINMUX_GPIO35__FUNC_B0_DBG_MON_A20 (MTK_PIN_NO(35) | 7) + +#define PINMUX_GPIO36__FUNC_B_GPIO36 (MTK_PIN_NO(36) | 0) +#define PINMUX_GPIO36__FUNC_I1_URXD2 (MTK_PIN_NO(36) | 1) +#define PINMUX_GPIO36__FUNC_I1_UCTS1 (MTK_PIN_NO(36) | 2) +#define PINMUX_GPIO36__FUNC_I1_ADSP_URXD0 (MTK_PIN_NO(36) | 3) +#define PINMUX_GPIO36__FUNC_I1_TP_UCTS1_AO (MTK_PIN_NO(36) | 4) +#define PINMUX_GPIO36__FUNC_I1_TP_URXD2_AO (MTK_PIN_NO(36) | 5) +#define PINMUX_GPIO36__FUNC_I1_MD32_1_RXD (MTK_PIN_NO(36) | 6) +#define PINMUX_GPIO36__FUNC_B0_DBG_MON_A21 (MTK_PIN_NO(36) | 7) + +#define PINMUX_GPIO37__FUNC_B_GPIO37 (MTK_PIN_NO(37) | 0) +#define PINMUX_GPIO37__FUNC_B1_JTMS_SEL1 (MTK_PIN_NO(37) | 1) +#define PINMUX_GPIO37__FUNC_I0_UDI_TMS (MTK_PIN_NO(37) | 2) +#define PINMUX_GPIO37__FUNC_I1_SPM_JTAG_TMS (MTK_PIN_NO(37) | 3) +#define PINMUX_GPIO37__FUNC_I1_ADSP_JTAG0_TMS (MTK_PIN_NO(37) | 4) +#define PINMUX_GPIO37__FUNC_I1_SCP_JTAG0_TMS (MTK_PIN_NO(37) | 5) +#define PINMUX_GPIO37__FUNC_I1_CCU0_JTAG_TMS (MTK_PIN_NO(37) | 6) +#define PINMUX_GPIO37__FUNC_I1_MCUPM_JTAG_TMS (MTK_PIN_NO(37) | 7) + +#define PINMUX_GPIO38__FUNC_B_GPIO38 (MTK_PIN_NO(38) | 0) +#define 
PINMUX_GPIO38__FUNC_I0_JTCK_SEL1 (MTK_PIN_NO(38) | 1) +#define PINMUX_GPIO38__FUNC_I0_UDI_TCK (MTK_PIN_NO(38) | 2) +#define PINMUX_GPIO38__FUNC_I1_SPM_JTAG_TCK (MTK_PIN_NO(38) | 3) +#define PINMUX_GPIO38__FUNC_I0_ADSP_JTAG0_TCK (MTK_PIN_NO(38) | 4) +#define PINMUX_GPIO38__FUNC_I1_SCP_JTAG0_TCK (MTK_PIN_NO(38) | 5) +#define PINMUX_GPIO38__FUNC_I1_CCU0_JTAG_TCK (MTK_PIN_NO(38) | 6) +#define PINMUX_GPIO38__FUNC_I1_MCUPM_JTAG_TCK (MTK_PIN_NO(38) | 7) + +#define PINMUX_GPIO39__FUNC_B_GPIO39 (MTK_PIN_NO(39) | 0) +#define PINMUX_GPIO39__FUNC_I1_JTDI_SEL1 (MTK_PIN_NO(39) | 1) +#define PINMUX_GPIO39__FUNC_I0_UDI_TDI (MTK_PIN_NO(39) | 2) +#define PINMUX_GPIO39__FUNC_I1_SPM_JTAG_TDI (MTK_PIN_NO(39) | 3) +#define PINMUX_GPIO39__FUNC_I1_ADSP_JTAG0_TDI (MTK_PIN_NO(39) | 4) +#define PINMUX_GPIO39__FUNC_I1_SCP_JTAG0_TDI (MTK_PIN_NO(39) | 5) +#define PINMUX_GPIO39__FUNC_I1_CCU0_JTAG_TDI (MTK_PIN_NO(39) | 6) +#define PINMUX_GPIO39__FUNC_I1_MCUPM_JTAG_TDI (MTK_PIN_NO(39) | 7) + +#define PINMUX_GPIO40__FUNC_B_GPIO40 (MTK_PIN_NO(40) | 0) +#define PINMUX_GPIO40__FUNC_O_JTDO_SEL1 (MTK_PIN_NO(40) | 1) +#define PINMUX_GPIO40__FUNC_O_UDI_TDO (MTK_PIN_NO(40) | 2) +#define PINMUX_GPIO40__FUNC_O_SPM_JTAG_TDO (MTK_PIN_NO(40) | 3) +#define PINMUX_GPIO40__FUNC_O_ADSP_JTAG0_TDO (MTK_PIN_NO(40) | 4) +#define PINMUX_GPIO40__FUNC_O_SCP_JTAG0_TDO (MTK_PIN_NO(40) | 5) +#define PINMUX_GPIO40__FUNC_O_CCU0_JTAG_TDO (MTK_PIN_NO(40) | 6) +#define PINMUX_GPIO40__FUNC_O_MCUPM_JTAG_TDO (MTK_PIN_NO(40) | 7) + +#define PINMUX_GPIO41__FUNC_B_GPIO41 (MTK_PIN_NO(41) | 0) +#define PINMUX_GPIO41__FUNC_I1_JTRSTn_SEL1 (MTK_PIN_NO(41) | 1) +#define PINMUX_GPIO41__FUNC_I0_UDI_NTRST (MTK_PIN_NO(41) | 2) +#define PINMUX_GPIO41__FUNC_I0_SPM_JTAG_TRSTN (MTK_PIN_NO(41) | 3) +#define PINMUX_GPIO41__FUNC_I1_ADSP_JTAG0_TRSTN (MTK_PIN_NO(41) | 4) +#define PINMUX_GPIO41__FUNC_I0_SCP_JTAG0_TRSTN (MTK_PIN_NO(41) | 5) +#define PINMUX_GPIO41__FUNC_I1_CCU0_JTAG_TRST (MTK_PIN_NO(41) | 6) +#define PINMUX_GPIO41__FUNC_I0_MCUPM_JTAG_TRSTN 
(MTK_PIN_NO(41) | 7) + +#define PINMUX_GPIO42__FUNC_B_GPIO42 (MTK_PIN_NO(42) | 0) +#define PINMUX_GPIO42__FUNC_B1_KPCOL0 (MTK_PIN_NO(42) | 1) + +#define PINMUX_GPIO43__FUNC_B_GPIO43 (MTK_PIN_NO(43) | 0) +#define PINMUX_GPIO43__FUNC_B1_KPCOL1 (MTK_PIN_NO(43) | 1) +#define PINMUX_GPIO43__FUNC_I0_DP_TX_HPD (MTK_PIN_NO(43) | 2) +#define PINMUX_GPIO43__FUNC_O_CMFLASH2 (MTK_PIN_NO(43) | 3) +#define PINMUX_GPIO43__FUNC_I0_DVFSRC_EXT_REQ (MTK_PIN_NO(43) | 4) +#define PINMUX_GPIO43__FUNC_O_mbistwriteen_trigger (MTK_PIN_NO(43) | 7) + +#define PINMUX_GPIO44__FUNC_B_GPIO44 (MTK_PIN_NO(44) | 0) +#define PINMUX_GPIO44__FUNC_B1_KPROW0 (MTK_PIN_NO(44) | 1) + +#define PINMUX_GPIO45__FUNC_B_GPIO45 (MTK_PIN_NO(45) | 0) +#define PINMUX_GPIO45__FUNC_B1_KPROW1 (MTK_PIN_NO(45) | 1) +#define PINMUX_GPIO45__FUNC_I0_EDP_TX_HPD (MTK_PIN_NO(45) | 2) +#define PINMUX_GPIO45__FUNC_O_CMFLASH3 (MTK_PIN_NO(45) | 3) +#define PINMUX_GPIO45__FUNC_B0_I2SIN_MCK (MTK_PIN_NO(45) | 4) +#define PINMUX_GPIO45__FUNC_O_mbistreaden_trigger (MTK_PIN_NO(45) | 7) + +#define PINMUX_GPIO46__FUNC_B_GPIO46 (MTK_PIN_NO(46) | 0) +#define PINMUX_GPIO46__FUNC_I0_DP_TX_HPD (MTK_PIN_NO(46) | 1) +#define PINMUX_GPIO46__FUNC_O_PWM_0 (MTK_PIN_NO(46) | 2) +#define PINMUX_GPIO46__FUNC_I0_VBUSVALID_2P (MTK_PIN_NO(46) | 3) +#define PINMUX_GPIO46__FUNC_B0_DBG_MON_A22 (MTK_PIN_NO(46) | 7) + +#define PINMUX_GPIO47__FUNC_B_GPIO47 (MTK_PIN_NO(47) | 0) +#define PINMUX_GPIO47__FUNC_I1_WAKEN (MTK_PIN_NO(47) | 1) +#define PINMUX_GPIO47__FUNC_O_GDU_TROOPS_DET0 (MTK_PIN_NO(47) | 6) + +#define PINMUX_GPIO48__FUNC_B_GPIO48 (MTK_PIN_NO(48) | 0) +#define PINMUX_GPIO48__FUNC_O_PERSTN (MTK_PIN_NO(48) | 1) +#define PINMUX_GPIO48__FUNC_O_GDU_TROOPS_DET1 (MTK_PIN_NO(48) | 6) + +#define PINMUX_GPIO49__FUNC_B_GPIO49 (MTK_PIN_NO(49) | 0) +#define PINMUX_GPIO49__FUNC_B1_CLKREQN (MTK_PIN_NO(49) | 1) +#define PINMUX_GPIO49__FUNC_O_GDU_TROOPS_DET2 (MTK_PIN_NO(49) | 6) + +#define PINMUX_GPIO50__FUNC_B_GPIO50 (MTK_PIN_NO(50) | 0) +#define 
PINMUX_GPIO50__FUNC_O_HDMITX20_PWR5V (MTK_PIN_NO(50) | 1) +#define PINMUX_GPIO50__FUNC_I1_IDDIG_1P (MTK_PIN_NO(50) | 3) +#define PINMUX_GPIO50__FUNC_I1_SCP_JTAG1_TMS (MTK_PIN_NO(50) | 4) +#define PINMUX_GPIO50__FUNC_I1_SSPM_JTAG_TMS (MTK_PIN_NO(50) | 5) +#define PINMUX_GPIO50__FUNC_I1_MD32_0_JTAG_TMS (MTK_PIN_NO(50) | 6) +#define PINMUX_GPIO50__FUNC_I1_MD32_1_JTAG_TMS (MTK_PIN_NO(50) | 7) + +#define PINMUX_GPIO51__FUNC_B_GPIO51 (MTK_PIN_NO(51) | 0) +#define PINMUX_GPIO51__FUNC_I0_HDMITX20_HTPLG (MTK_PIN_NO(51) | 1) +#define PINMUX_GPIO51__FUNC_I0_EDP_TX_HPD (MTK_PIN_NO(51) | 2) +#define PINMUX_GPIO51__FUNC_O_USB_DRVVBUS_1P (MTK_PIN_NO(51) | 3) +#define PINMUX_GPIO51__FUNC_I1_SCP_JTAG1_TCK (MTK_PIN_NO(51) | 4) +#define PINMUX_GPIO51__FUNC_I1_SSPM_JTAG_TCK (MTK_PIN_NO(51) | 5) +#define PINMUX_GPIO51__FUNC_I1_MD32_0_JTAG_TCK (MTK_PIN_NO(51) | 6) +#define PINMUX_GPIO51__FUNC_I1_MD32_1_JTAG_TCK (MTK_PIN_NO(51) | 7) + +#define PINMUX_GPIO52__FUNC_B_GPIO52 (MTK_PIN_NO(52) | 0) +#define PINMUX_GPIO52__FUNC_B1_HDMITX20_CEC (MTK_PIN_NO(52) | 1) +#define PINMUX_GPIO52__FUNC_I0_VBUSVALID_1P (MTK_PIN_NO(52) | 3) +#define PINMUX_GPIO52__FUNC_I1_SCP_JTAG1_TDI (MTK_PIN_NO(52) | 4) +#define PINMUX_GPIO52__FUNC_I1_SSPM_JTAG_TDI (MTK_PIN_NO(52) | 5) +#define PINMUX_GPIO52__FUNC_I1_MD32_0_JTAG_TDI (MTK_PIN_NO(52) | 6) +#define PINMUX_GPIO52__FUNC_I1_MD32_1_JTAG_TDI (MTK_PIN_NO(52) | 7) + +#define PINMUX_GPIO53__FUNC_B_GPIO53 (MTK_PIN_NO(53) | 0) +#define PINMUX_GPIO53__FUNC_B1_HDMITX20_SCL (MTK_PIN_NO(53) | 1) +#define PINMUX_GPIO53__FUNC_I1_IDDIG_2P (MTK_PIN_NO(53) | 3) +#define PINMUX_GPIO53__FUNC_O_SCP_JTAG1_TDO (MTK_PIN_NO(53) | 4) +#define PINMUX_GPIO53__FUNC_O_SSPM_JTAG_TDO (MTK_PIN_NO(53) | 5) +#define PINMUX_GPIO53__FUNC_O_MD32_0_JTAG_TDO (MTK_PIN_NO(53) | 6) +#define PINMUX_GPIO53__FUNC_O_MD32_1_JTAG_TDO (MTK_PIN_NO(53) | 7) + +#define PINMUX_GPIO54__FUNC_B_GPIO54 (MTK_PIN_NO(54) | 0) +#define PINMUX_GPIO54__FUNC_B1_HDMITX20_SDA (MTK_PIN_NO(54) | 1) +#define 
PINMUX_GPIO54__FUNC_O_USB_DRVVBUS_2P (MTK_PIN_NO(54) | 3) +#define PINMUX_GPIO54__FUNC_I0_SCP_JTAG1_TRSTN (MTK_PIN_NO(54) | 4) +#define PINMUX_GPIO54__FUNC_I0_SSPM_JTAG_TRSTN (MTK_PIN_NO(54) | 5) +#define PINMUX_GPIO54__FUNC_I1_MD32_0_JTAG_TRST (MTK_PIN_NO(54) | 6) +#define PINMUX_GPIO54__FUNC_I1_MD32_1_JTAG_TRST (MTK_PIN_NO(54) | 7) + +#define PINMUX_GPIO55__FUNC_B_GPIO55 (MTK_PIN_NO(55) | 0) +#define PINMUX_GPIO55__FUNC_B1_SCL0 (MTK_PIN_NO(55) | 1) +#define PINMUX_GPIO55__FUNC_B1_SCP_SCL0 (MTK_PIN_NO(55) | 2) +#define PINMUX_GPIO55__FUNC_B1_SCP_SCL1 (MTK_PIN_NO(55) | 3) +#define PINMUX_GPIO55__FUNC_B1_PCIE_PHY_I2C_SCL (MTK_PIN_NO(55) | 4) + +#define PINMUX_GPIO56__FUNC_B_GPIO56 (MTK_PIN_NO(56) | 0) +#define PINMUX_GPIO56__FUNC_B1_SDA0 (MTK_PIN_NO(56) | 1) +#define PINMUX_GPIO56__FUNC_B1_SCP_SDA0 (MTK_PIN_NO(56) | 2) +#define PINMUX_GPIO56__FUNC_B1_SCP_SDA1 (MTK_PIN_NO(56) | 3) +#define PINMUX_GPIO56__FUNC_B1_PCIE_PHY_I2C_SDA (MTK_PIN_NO(56) | 4) + +#define PINMUX_GPIO57__FUNC_B_GPIO57 (MTK_PIN_NO(57) | 0) +#define PINMUX_GPIO57__FUNC_B1_SCL1 (MTK_PIN_NO(57) | 1) + +#define PINMUX_GPIO58__FUNC_B_GPIO58 (MTK_PIN_NO(58) | 0) +#define PINMUX_GPIO58__FUNC_B1_SDA1 (MTK_PIN_NO(58) | 1) + +#define PINMUX_GPIO59__FUNC_B_GPIO59 (MTK_PIN_NO(59) | 0) +#define PINMUX_GPIO59__FUNC_B1_SCL2 (MTK_PIN_NO(59) | 1) +#define PINMUX_GPIO59__FUNC_B1_SCP_SCL0 (MTK_PIN_NO(59) | 2) +#define PINMUX_GPIO59__FUNC_B1_SCP_SCL1 (MTK_PIN_NO(59) | 3) + +#define PINMUX_GPIO60__FUNC_B_GPIO60 (MTK_PIN_NO(60) | 0) +#define PINMUX_GPIO60__FUNC_B1_SDA2 (MTK_PIN_NO(60) | 1) +#define PINMUX_GPIO60__FUNC_B1_SCP_SDA0 (MTK_PIN_NO(60) | 2) +#define PINMUX_GPIO60__FUNC_B1_SCP_SDA1 (MTK_PIN_NO(60) | 3) + +#define PINMUX_GPIO61__FUNC_B_GPIO61 (MTK_PIN_NO(61) | 0) +#define PINMUX_GPIO61__FUNC_B1_SCL3 (MTK_PIN_NO(61) | 1) +#define PINMUX_GPIO61__FUNC_B1_SCP_SCL0 (MTK_PIN_NO(61) | 2) +#define PINMUX_GPIO61__FUNC_B1_SCP_SCL1 (MTK_PIN_NO(61) | 3) +#define PINMUX_GPIO61__FUNC_B1_PCIE_PHY_I2C_SCL (MTK_PIN_NO(61) | 4) 
+ +#define PINMUX_GPIO62__FUNC_B_GPIO62 (MTK_PIN_NO(62) | 0) +#define PINMUX_GPIO62__FUNC_B1_SDA3 (MTK_PIN_NO(62) | 1) +#define PINMUX_GPIO62__FUNC_B1_SCP_SDA0 (MTK_PIN_NO(62) | 2) +#define PINMUX_GPIO62__FUNC_B1_SCP_SDA1 (MTK_PIN_NO(62) | 3) +#define PINMUX_GPIO62__FUNC_B1_PCIE_PHY_I2C_SDA (MTK_PIN_NO(62) | 4) + +#define PINMUX_GPIO63__FUNC_B_GPIO63 (MTK_PIN_NO(63) | 0) +#define PINMUX_GPIO63__FUNC_B1_SCL4 (MTK_PIN_NO(63) | 1) + +#define PINMUX_GPIO64__FUNC_B_GPIO64 (MTK_PIN_NO(64) | 0) +#define PINMUX_GPIO64__FUNC_B1_SDA4 (MTK_PIN_NO(64) | 1) + +#define PINMUX_GPIO65__FUNC_B_GPIO65 (MTK_PIN_NO(65) | 0) +#define PINMUX_GPIO65__FUNC_B1_SCL5 (MTK_PIN_NO(65) | 1) +#define PINMUX_GPIO65__FUNC_B1_SCP_SCL0 (MTK_PIN_NO(65) | 2) +#define PINMUX_GPIO65__FUNC_B1_SCP_SCL1 (MTK_PIN_NO(65) | 3) + +#define PINMUX_GPIO66__FUNC_B_GPIO66 (MTK_PIN_NO(66) | 0) +#define PINMUX_GPIO66__FUNC_B1_SDA5 (MTK_PIN_NO(66) | 1) +#define PINMUX_GPIO66__FUNC_B1_SCP_SDA0 (MTK_PIN_NO(66) | 2) +#define PINMUX_GPIO66__FUNC_B1_SCP_SDA1 (MTK_PIN_NO(66) | 3) + +#define PINMUX_GPIO67__FUNC_B_GPIO67 (MTK_PIN_NO(67) | 0) +#define PINMUX_GPIO67__FUNC_B1_SCL6 (MTK_PIN_NO(67) | 1) +#define PINMUX_GPIO67__FUNC_B1_SCP_SCL0 (MTK_PIN_NO(67) | 2) +#define PINMUX_GPIO67__FUNC_B1_SCP_SCL1 (MTK_PIN_NO(67) | 3) +#define PINMUX_GPIO67__FUNC_B1_PCIE_PHY_I2C_SCL (MTK_PIN_NO(67) | 4) + +#define PINMUX_GPIO68__FUNC_B_GPIO68 (MTK_PIN_NO(68) | 0) +#define PINMUX_GPIO68__FUNC_B1_SDA6 (MTK_PIN_NO(68) | 1) +#define PINMUX_GPIO68__FUNC_B1_SCP_SDA0 (MTK_PIN_NO(68) | 2) +#define PINMUX_GPIO68__FUNC_B1_SCP_SDA1 (MTK_PIN_NO(68) | 3) +#define PINMUX_GPIO68__FUNC_B1_PCIE_PHY_I2C_SDA (MTK_PIN_NO(68) | 4) + +#define PINMUX_GPIO69__FUNC_B_GPIO69 (MTK_PIN_NO(69) | 0) +#define PINMUX_GPIO69__FUNC_O_SPIM0_CSB (MTK_PIN_NO(69) | 1) +#define PINMUX_GPIO69__FUNC_O_SCP_SPI0_CS (MTK_PIN_NO(69) | 2) +#define PINMUX_GPIO69__FUNC_O_DMIC3_CLK (MTK_PIN_NO(69) | 3) +#define PINMUX_GPIO69__FUNC_B0_MD32_1_GPIO0 (MTK_PIN_NO(69) | 4) +#define 
PINMUX_GPIO69__FUNC_O_CMVREF0 (MTK_PIN_NO(69) | 5) +#define PINMUX_GPIO69__FUNC_O_GDU_SUM_TROOP0_0 (MTK_PIN_NO(69) | 6) +#define PINMUX_GPIO69__FUNC_B0_DBG_MON_A23 (MTK_PIN_NO(69) | 7) + +#define PINMUX_GPIO70__FUNC_B_GPIO70 (MTK_PIN_NO(70) | 0) +#define PINMUX_GPIO70__FUNC_O_SPIM0_CLK (MTK_PIN_NO(70) | 1) +#define PINMUX_GPIO70__FUNC_O_SCP_SPI0_CK (MTK_PIN_NO(70) | 2) +#define PINMUX_GPIO70__FUNC_I0_DMIC3_DAT (MTK_PIN_NO(70) | 3) +#define PINMUX_GPIO70__FUNC_B0_MD32_1_GPIO1 (MTK_PIN_NO(70) | 4) +#define PINMUX_GPIO70__FUNC_O_CMVREF1 (MTK_PIN_NO(70) | 5) +#define PINMUX_GPIO70__FUNC_O_GDU_SUM_TROOP0_1 (MTK_PIN_NO(70) | 6) +#define PINMUX_GPIO70__FUNC_B0_DBG_MON_A24 (MTK_PIN_NO(70) | 7) + +#define PINMUX_GPIO71__FUNC_B_GPIO71 (MTK_PIN_NO(71) | 0) +#define PINMUX_GPIO71__FUNC_B0_SPIM0_MOSI (MTK_PIN_NO(71) | 1) +#define PINMUX_GPIO71__FUNC_O_SCP_SPI0_MO (MTK_PIN_NO(71) | 2) +#define PINMUX_GPIO71__FUNC_I0_DMIC3_DAT_R (MTK_PIN_NO(71) | 3) +#define PINMUX_GPIO71__FUNC_B0_MD32_1_GPIO2 (MTK_PIN_NO(71) | 4) +#define PINMUX_GPIO71__FUNC_O_CMVREF2 (MTK_PIN_NO(71) | 5) +#define PINMUX_GPIO71__FUNC_O_GDU_SUM_TROOP0_2 (MTK_PIN_NO(71) | 6) +#define PINMUX_GPIO71__FUNC_B0_DBG_MON_A25 (MTK_PIN_NO(71) | 7) + +#define PINMUX_GPIO72__FUNC_B_GPIO72 (MTK_PIN_NO(72) | 0) +#define PINMUX_GPIO72__FUNC_B0_SPIM0_MISO (MTK_PIN_NO(72) | 1) +#define PINMUX_GPIO72__FUNC_I0_SCP_SPI0_MI (MTK_PIN_NO(72) | 2) +#define PINMUX_GPIO72__FUNC_O_DMIC4_CLK (MTK_PIN_NO(72) | 3) +#define PINMUX_GPIO72__FUNC_O_CMVREF3 (MTK_PIN_NO(72) | 5) +#define PINMUX_GPIO72__FUNC_O_GDU_SUM_TROOP1_0 (MTK_PIN_NO(72) | 6) +#define PINMUX_GPIO72__FUNC_B0_DBG_MON_A26 (MTK_PIN_NO(72) | 7) + +#define PINMUX_GPIO73__FUNC_B_GPIO73 (MTK_PIN_NO(73) | 0) +#define PINMUX_GPIO73__FUNC_B0_SPIM0_MIO2 (MTK_PIN_NO(73) | 1) +#define PINMUX_GPIO73__FUNC_O_UTXD3 (MTK_PIN_NO(73) | 2) +#define PINMUX_GPIO73__FUNC_I0_DMIC4_DAT (MTK_PIN_NO(73) | 3) +#define PINMUX_GPIO73__FUNC_O_CLKM0 (MTK_PIN_NO(73) | 4) +#define PINMUX_GPIO73__FUNC_O_CMVREF4 
(MTK_PIN_NO(73) | 5) +#define PINMUX_GPIO73__FUNC_O_GDU_SUM_TROOP1_1 (MTK_PIN_NO(73) | 6) +#define PINMUX_GPIO73__FUNC_B0_DBG_MON_A27 (MTK_PIN_NO(73) | 7) + +#define PINMUX_GPIO74__FUNC_B_GPIO74 (MTK_PIN_NO(74) | 0) +#define PINMUX_GPIO74__FUNC_B0_SPIM0_MIO3 (MTK_PIN_NO(74) | 1) +#define PINMUX_GPIO74__FUNC_I1_URXD3 (MTK_PIN_NO(74) | 2) +#define PINMUX_GPIO74__FUNC_I0_DMIC4_DAT_R (MTK_PIN_NO(74) | 3) +#define PINMUX_GPIO74__FUNC_O_CLKM1 (MTK_PIN_NO(74) | 4) +#define PINMUX_GPIO74__FUNC_O_CMVREF5 (MTK_PIN_NO(74) | 5) +#define PINMUX_GPIO74__FUNC_O_GDU_SUM_TROOP1_2 (MTK_PIN_NO(74) | 6) +#define PINMUX_GPIO74__FUNC_B0_DBG_MON_A28 (MTK_PIN_NO(74) | 7) + +#define PINMUX_GPIO75__FUNC_B_GPIO75 (MTK_PIN_NO(75) | 0) +#define PINMUX_GPIO75__FUNC_O_SPIM1_CSB (MTK_PIN_NO(75) | 1) +#define PINMUX_GPIO75__FUNC_O_SCP_SPI1_A_CS (MTK_PIN_NO(75) | 2) +#define PINMUX_GPIO75__FUNC_B0_TDMIN_MCK (MTK_PIN_NO(75) | 3) +#define PINMUX_GPIO75__FUNC_B1_SCP_SCL0 (MTK_PIN_NO(75) | 4) +#define PINMUX_GPIO75__FUNC_O_CMVREF6 (MTK_PIN_NO(75) | 5) +#define PINMUX_GPIO75__FUNC_O_GDU_SUM_TROOP2_0 (MTK_PIN_NO(75) | 6) +#define PINMUX_GPIO75__FUNC_B0_DBG_MON_A29 (MTK_PIN_NO(75) | 7) + +#define PINMUX_GPIO76__FUNC_B_GPIO76 (MTK_PIN_NO(76) | 0) +#define PINMUX_GPIO76__FUNC_O_SPIM1_CLK (MTK_PIN_NO(76) | 1) +#define PINMUX_GPIO76__FUNC_O_SCP_SPI1_A_CK (MTK_PIN_NO(76) | 2) +#define PINMUX_GPIO76__FUNC_B0_TDMIN_BCK (MTK_PIN_NO(76) | 3) +#define PINMUX_GPIO76__FUNC_B1_SCP_SDA0 (MTK_PIN_NO(76) | 4) +#define PINMUX_GPIO76__FUNC_O_CMVREF7 (MTK_PIN_NO(76) | 5) +#define PINMUX_GPIO76__FUNC_O_GDU_SUM_TROOP2_1 (MTK_PIN_NO(76) | 6) +#define PINMUX_GPIO76__FUNC_B0_DBG_MON_A30 (MTK_PIN_NO(76) | 7) + +#define PINMUX_GPIO77__FUNC_B_GPIO77 (MTK_PIN_NO(77) | 0) +#define PINMUX_GPIO77__FUNC_B0_SPIM1_MOSI (MTK_PIN_NO(77) | 1) +#define PINMUX_GPIO77__FUNC_O_SCP_SPI1_A_MO (MTK_PIN_NO(77) | 2) +#define PINMUX_GPIO77__FUNC_B0_TDMIN_LRCK (MTK_PIN_NO(77) | 3) +#define PINMUX_GPIO77__FUNC_B1_SCP_SCL1 (MTK_PIN_NO(77) | 4) +#define 
PINMUX_GPIO77__FUNC_O_GDU_SUM_TROOP2_2 (MTK_PIN_NO(77) | 6) +#define PINMUX_GPIO77__FUNC_B0_DBG_MON_A31 (MTK_PIN_NO(77) | 7) + +#define PINMUX_GPIO78__FUNC_B_GPIO78 (MTK_PIN_NO(78) | 0) +#define PINMUX_GPIO78__FUNC_B0_SPIM1_MISO (MTK_PIN_NO(78) | 1) +#define PINMUX_GPIO78__FUNC_I0_SCP_SPI1_A_MI (MTK_PIN_NO(78) | 2) +#define PINMUX_GPIO78__FUNC_I0_TDMIN_DI (MTK_PIN_NO(78) | 3) +#define PINMUX_GPIO78__FUNC_B1_SCP_SDA1 (MTK_PIN_NO(78) | 4) +#define PINMUX_GPIO78__FUNC_B0_DBG_MON_A32 (MTK_PIN_NO(78) | 7) + +#define PINMUX_GPIO79__FUNC_B_GPIO79 (MTK_PIN_NO(79) | 0) +#define PINMUX_GPIO79__FUNC_O_SPIM2_CSB (MTK_PIN_NO(79) | 1) +#define PINMUX_GPIO79__FUNC_O_SCP_SPI2_CS (MTK_PIN_NO(79) | 2) +#define PINMUX_GPIO79__FUNC_O_I2SO1_MCK (MTK_PIN_NO(79) | 3) +#define PINMUX_GPIO79__FUNC_O_UTXD2 (MTK_PIN_NO(79) | 4) +#define PINMUX_GPIO79__FUNC_O_TP_UTXD2_AO (MTK_PIN_NO(79) | 5) +#define PINMUX_GPIO79__FUNC_B0_PCM_SYNC (MTK_PIN_NO(79) | 6) +#define PINMUX_GPIO79__FUNC_B0_DBG_MON_B0 (MTK_PIN_NO(79) | 7) + +#define PINMUX_GPIO80__FUNC_B_GPIO80 (MTK_PIN_NO(80) | 0) +#define PINMUX_GPIO80__FUNC_O_SPIM2_CLK (MTK_PIN_NO(80) | 1) +#define PINMUX_GPIO80__FUNC_O_SCP_SPI2_CK (MTK_PIN_NO(80) | 2) +#define PINMUX_GPIO80__FUNC_O_I2SO1_BCK (MTK_PIN_NO(80) | 3) +#define PINMUX_GPIO80__FUNC_I1_URXD2 (MTK_PIN_NO(80) | 4) +#define PINMUX_GPIO80__FUNC_I1_TP_URXD2_AO (MTK_PIN_NO(80) | 5) +#define PINMUX_GPIO80__FUNC_B0_PCM_CLK (MTK_PIN_NO(80) | 6) +#define PINMUX_GPIO80__FUNC_B0_DBG_MON_B1 (MTK_PIN_NO(80) | 7) + +#define PINMUX_GPIO81__FUNC_B_GPIO81 (MTK_PIN_NO(81) | 0) +#define PINMUX_GPIO81__FUNC_B0_SPIM2_MOSI (MTK_PIN_NO(81) | 1) +#define PINMUX_GPIO81__FUNC_O_SCP_SPI2_MO (MTK_PIN_NO(81) | 2) +#define PINMUX_GPIO81__FUNC_O_I2SO1_WS (MTK_PIN_NO(81) | 3) +#define PINMUX_GPIO81__FUNC_O_URTS2 (MTK_PIN_NO(81) | 4) +#define PINMUX_GPIO81__FUNC_O_TP_URTS2_AO (MTK_PIN_NO(81) | 5) +#define PINMUX_GPIO81__FUNC_O_PCM_DO (MTK_PIN_NO(81) | 6) +#define PINMUX_GPIO81__FUNC_B0_DBG_MON_B2 (MTK_PIN_NO(81) | 7) + 
+#define PINMUX_GPIO82__FUNC_B_GPIO82 (MTK_PIN_NO(82) | 0) +#define PINMUX_GPIO82__FUNC_B0_SPIM2_MISO (MTK_PIN_NO(82) | 1) +#define PINMUX_GPIO82__FUNC_I0_SCP_SPI2_MI (MTK_PIN_NO(82) | 2) +#define PINMUX_GPIO82__FUNC_O_I2SO1_D0 (MTK_PIN_NO(82) | 3) +#define PINMUX_GPIO82__FUNC_I1_UCTS2 (MTK_PIN_NO(82) | 4) +#define PINMUX_GPIO82__FUNC_I1_TP_UCTS2_AO (MTK_PIN_NO(82) | 5) +#define PINMUX_GPIO82__FUNC_I0_PCM_DI (MTK_PIN_NO(82) | 6) +#define PINMUX_GPIO82__FUNC_B0_DBG_MON_B3 (MTK_PIN_NO(82) | 7) + +#define PINMUX_GPIO83__FUNC_B_GPIO83 (MTK_PIN_NO(83) | 0) +#define PINMUX_GPIO83__FUNC_I1_IDDIG (MTK_PIN_NO(83) | 1) + +#define PINMUX_GPIO84__FUNC_B_GPIO84 (MTK_PIN_NO(84) | 0) +#define PINMUX_GPIO84__FUNC_O_USB_DRVVBUS (MTK_PIN_NO(84) | 1) + +#define PINMUX_GPIO85__FUNC_B_GPIO85 (MTK_PIN_NO(85) | 0) +#define PINMUX_GPIO85__FUNC_I0_VBUSVALID (MTK_PIN_NO(85) | 1) + +#define PINMUX_GPIO86__FUNC_B_GPIO86 (MTK_PIN_NO(86) | 0) +#define PINMUX_GPIO86__FUNC_I1_IDDIG_1P (MTK_PIN_NO(86) | 1) +#define PINMUX_GPIO86__FUNC_O_UTXD1 (MTK_PIN_NO(86) | 2) +#define PINMUX_GPIO86__FUNC_O_URTS2 (MTK_PIN_NO(86) | 3) +#define PINMUX_GPIO86__FUNC_O_PWM_2 (MTK_PIN_NO(86) | 4) +#define PINMUX_GPIO86__FUNC_B0_TP_GPIO4_AO (MTK_PIN_NO(86) | 5) +#define PINMUX_GPIO86__FUNC_O_AUXIF_ST0 (MTK_PIN_NO(86) | 6) +#define PINMUX_GPIO86__FUNC_B0_DBG_MON_B4 (MTK_PIN_NO(86) | 7) + +#define PINMUX_GPIO87__FUNC_B_GPIO87 (MTK_PIN_NO(87) | 0) +#define PINMUX_GPIO87__FUNC_O_USB_DRVVBUS_1P (MTK_PIN_NO(87) | 1) +#define PINMUX_GPIO87__FUNC_I1_URXD1 (MTK_PIN_NO(87) | 2) +#define PINMUX_GPIO87__FUNC_I1_UCTS2 (MTK_PIN_NO(87) | 3) +#define PINMUX_GPIO87__FUNC_O_PWM_3 (MTK_PIN_NO(87) | 4) +#define PINMUX_GPIO87__FUNC_B0_TP_GPIO5_AO (MTK_PIN_NO(87) | 5) +#define PINMUX_GPIO87__FUNC_O_AUXIF_CLK0 (MTK_PIN_NO(87) | 6) +#define PINMUX_GPIO87__FUNC_B0_DBG_MON_B5 (MTK_PIN_NO(87) | 7) + +#define PINMUX_GPIO88__FUNC_B_GPIO88 (MTK_PIN_NO(88) | 0) +#define PINMUX_GPIO88__FUNC_I0_VBUSVALID_1P (MTK_PIN_NO(88) | 1) +#define 
PINMUX_GPIO88__FUNC_O_UTXD2 (MTK_PIN_NO(88) | 2) +#define PINMUX_GPIO88__FUNC_O_URTS1 (MTK_PIN_NO(88) | 3) +#define PINMUX_GPIO88__FUNC_O_CLKM2 (MTK_PIN_NO(88) | 4) +#define PINMUX_GPIO88__FUNC_B0_TP_GPIO6_AO (MTK_PIN_NO(88) | 5) +#define PINMUX_GPIO88__FUNC_O_AUXIF_ST1 (MTK_PIN_NO(88) | 6) +#define PINMUX_GPIO88__FUNC_B0_DBG_MON_B6 (MTK_PIN_NO(88) | 7) + +#define PINMUX_GPIO89__FUNC_B_GPIO89 (MTK_PIN_NO(89) | 0) +#define PINMUX_GPIO89__FUNC_I1_IDDIG_2P (MTK_PIN_NO(89) | 1) +#define PINMUX_GPIO89__FUNC_I1_URXD2 (MTK_PIN_NO(89) | 2) +#define PINMUX_GPIO89__FUNC_I1_UCTS1 (MTK_PIN_NO(89) | 3) +#define PINMUX_GPIO89__FUNC_O_CLKM3 (MTK_PIN_NO(89) | 4) +#define PINMUX_GPIO89__FUNC_B0_TP_GPIO7_AO (MTK_PIN_NO(89) | 5) +#define PINMUX_GPIO89__FUNC_O_AUXIF_CLK1 (MTK_PIN_NO(89) | 6) +#define PINMUX_GPIO89__FUNC_B0_DBG_MON_B7 (MTK_PIN_NO(89) | 7) + +#define PINMUX_GPIO90__FUNC_B_GPIO90 (MTK_PIN_NO(90) | 0) +#define PINMUX_GPIO90__FUNC_O_USB_DRVVBUS_2P (MTK_PIN_NO(90) | 1) +#define PINMUX_GPIO90__FUNC_O_UTXD3 (MTK_PIN_NO(90) | 2) +#define PINMUX_GPIO90__FUNC_O_ADSP_UTXD0 (MTK_PIN_NO(90) | 3) +#define PINMUX_GPIO90__FUNC_O_SSPM_UTXD_AO (MTK_PIN_NO(90) | 4) +#define PINMUX_GPIO90__FUNC_O_MD32_0_TXD (MTK_PIN_NO(90) | 5) +#define PINMUX_GPIO90__FUNC_O_MD32_1_TXD (MTK_PIN_NO(90) | 6) +#define PINMUX_GPIO90__FUNC_B0_DBG_MON_B8 (MTK_PIN_NO(90) | 7) + +#define PINMUX_GPIO91__FUNC_B_GPIO91 (MTK_PIN_NO(91) | 0) +#define PINMUX_GPIO91__FUNC_I0_VBUSVALID_2P (MTK_PIN_NO(91) | 1) +#define PINMUX_GPIO91__FUNC_I1_URXD3 (MTK_PIN_NO(91) | 2) +#define PINMUX_GPIO91__FUNC_I1_ADSP_URXD0 (MTK_PIN_NO(91) | 3) +#define PINMUX_GPIO91__FUNC_I1_SSPM_URXD_AO (MTK_PIN_NO(91) | 4) +#define PINMUX_GPIO91__FUNC_I1_MD32_0_RXD (MTK_PIN_NO(91) | 5) +#define PINMUX_GPIO91__FUNC_I1_MD32_1_RXD (MTK_PIN_NO(91) | 6) +#define PINMUX_GPIO91__FUNC_B0_DBG_MON_B9 (MTK_PIN_NO(91) | 7) + +#define PINMUX_GPIO92__FUNC_B_GPIO92 (MTK_PIN_NO(92) | 0) +#define PINMUX_GPIO92__FUNC_O_PWRAP_SPI0_CSN (MTK_PIN_NO(92) | 1) + +#define 
PINMUX_GPIO93__FUNC_B_GPIO93 (MTK_PIN_NO(93) | 0) +#define PINMUX_GPIO93__FUNC_O_PWRAP_SPI0_CK (MTK_PIN_NO(93) | 1) + +#define PINMUX_GPIO94__FUNC_B_GPIO94 (MTK_PIN_NO(94) | 0) +#define PINMUX_GPIO94__FUNC_B0_PWRAP_SPI0_MO (MTK_PIN_NO(94) | 1) +#define PINMUX_GPIO94__FUNC_B0_PWRAP_SPI0_MI (MTK_PIN_NO(94) | 2) + +#define PINMUX_GPIO95__FUNC_B_GPIO95 (MTK_PIN_NO(95) | 0) +#define PINMUX_GPIO95__FUNC_B0_PWRAP_SPI0_MI (MTK_PIN_NO(95) | 1) +#define PINMUX_GPIO95__FUNC_B0_PWRAP_SPI0_MO (MTK_PIN_NO(95) | 2) + +#define PINMUX_GPIO96__FUNC_B_GPIO96 (MTK_PIN_NO(96) | 0) +#define PINMUX_GPIO96__FUNC_O_SRCLKENA0 (MTK_PIN_NO(96) | 1) + +#define PINMUX_GPIO97__FUNC_B_GPIO97 (MTK_PIN_NO(97) | 0) +#define PINMUX_GPIO97__FUNC_O_SRCLKENA1 (MTK_PIN_NO(97) | 1) + +#define PINMUX_GPIO98__FUNC_B_GPIO98 (MTK_PIN_NO(98) | 0) +#define PINMUX_GPIO98__FUNC_O_SCP_VREQ_VAO (MTK_PIN_NO(98) | 1) +#define PINMUX_GPIO98__FUNC_I0_DVFSRC_EXT_REQ (MTK_PIN_NO(98) | 2) + +#define PINMUX_GPIO99__FUNC_B_GPIO99 (MTK_PIN_NO(99) | 0) +#define PINMUX_GPIO99__FUNC_I0_RTC32K_CK (MTK_PIN_NO(99) | 1) + +#define PINMUX_GPIO100__FUNC_B_GPIO100 (MTK_PIN_NO(100) | 0) +#define PINMUX_GPIO100__FUNC_O_WATCHDOG (MTK_PIN_NO(100) | 1) + +#define PINMUX_GPIO101__FUNC_B_GPIO101 (MTK_PIN_NO(101) | 0) +#define PINMUX_GPIO101__FUNC_O_AUD_CLK_MOSI (MTK_PIN_NO(101) | 1) +#define PINMUX_GPIO101__FUNC_O_I2SO1_MCK (MTK_PIN_NO(101) | 2) +#define PINMUX_GPIO101__FUNC_B0_I2SIN_BCK (MTK_PIN_NO(101) | 3) + +#define PINMUX_GPIO102__FUNC_B_GPIO102 (MTK_PIN_NO(102) | 0) +#define PINMUX_GPIO102__FUNC_O_AUD_SYNC_MOSI (MTK_PIN_NO(102) | 1) +#define PINMUX_GPIO102__FUNC_O_I2SO1_BCK (MTK_PIN_NO(102) | 2) +#define PINMUX_GPIO102__FUNC_B0_I2SIN_WS (MTK_PIN_NO(102) | 3) + +#define PINMUX_GPIO103__FUNC_B_GPIO103 (MTK_PIN_NO(103) | 0) +#define PINMUX_GPIO103__FUNC_O_AUD_DAT_MOSI0 (MTK_PIN_NO(103) | 1) +#define PINMUX_GPIO103__FUNC_O_I2SO1_WS (MTK_PIN_NO(103) | 2) +#define PINMUX_GPIO103__FUNC_I0_I2SIN_D0 (MTK_PIN_NO(103) | 3) + +#define 
PINMUX_GPIO104__FUNC_B_GPIO104 (MTK_PIN_NO(104) | 0) +#define PINMUX_GPIO104__FUNC_O_AUD_DAT_MOSI1 (MTK_PIN_NO(104) | 1) +#define PINMUX_GPIO104__FUNC_O_I2SO1_D0 (MTK_PIN_NO(104) | 2) +#define PINMUX_GPIO104__FUNC_I0_I2SIN_D1 (MTK_PIN_NO(104) | 3) + +#define PINMUX_GPIO105__FUNC_B_GPIO105 (MTK_PIN_NO(105) | 0) +#define PINMUX_GPIO105__FUNC_I0_AUD_DAT_MISO0 (MTK_PIN_NO(105) | 1) +#define PINMUX_GPIO105__FUNC_I0_VOW_DAT_MISO (MTK_PIN_NO(105) | 2) +#define PINMUX_GPIO105__FUNC_I0_I2SIN_D2 (MTK_PIN_NO(105) | 3) + +#define PINMUX_GPIO106__FUNC_B_GPIO106 (MTK_PIN_NO(106) | 0) +#define PINMUX_GPIO106__FUNC_I0_AUD_DAT_MISO1 (MTK_PIN_NO(106) | 1) +#define PINMUX_GPIO106__FUNC_I0_VOW_CLK_MISO (MTK_PIN_NO(106) | 2) +#define PINMUX_GPIO106__FUNC_I0_I2SIN_D3 (MTK_PIN_NO(106) | 3) + +#define PINMUX_GPIO107__FUNC_B_GPIO107 (MTK_PIN_NO(107) | 0) +#define PINMUX_GPIO107__FUNC_B0_I2SIN_MCK (MTK_PIN_NO(107) | 1) +#define PINMUX_GPIO107__FUNC_I0_SPLIN_MCK (MTK_PIN_NO(107) | 2) +#define PINMUX_GPIO107__FUNC_I0_SPDIF_IN0 (MTK_PIN_NO(107) | 3) +#define PINMUX_GPIO107__FUNC_O_CMVREF4 (MTK_PIN_NO(107) | 4) +#define PINMUX_GPIO107__FUNC_O_AUXIF_ST0 (MTK_PIN_NO(107) | 5) +#define PINMUX_GPIO107__FUNC_O_PGD_LV_LSC_PWR0 (MTK_PIN_NO(107) | 6) + +#define PINMUX_GPIO108__FUNC_B_GPIO108 (MTK_PIN_NO(108) | 0) +#define PINMUX_GPIO108__FUNC_B0_I2SIN_BCK (MTK_PIN_NO(108) | 1) +#define PINMUX_GPIO108__FUNC_I0_SPLIN_LRCK (MTK_PIN_NO(108) | 2) +#define PINMUX_GPIO108__FUNC_O_DMIC4_CLK (MTK_PIN_NO(108) | 3) +#define PINMUX_GPIO108__FUNC_O_CMVREF5 (MTK_PIN_NO(108) | 4) +#define PINMUX_GPIO108__FUNC_O_AUXIF_CLK0 (MTK_PIN_NO(108) | 5) +#define PINMUX_GPIO108__FUNC_O_PGD_LV_LSC_PWR1 (MTK_PIN_NO(108) | 6) +#define PINMUX_GPIO108__FUNC_B0_DBG_MON_B10 (MTK_PIN_NO(108) | 7) + +#define PINMUX_GPIO109__FUNC_B_GPIO109 (MTK_PIN_NO(109) | 0) +#define PINMUX_GPIO109__FUNC_B0_I2SIN_WS (MTK_PIN_NO(109) | 1) +#define PINMUX_GPIO109__FUNC_I0_SPLIN_BCK (MTK_PIN_NO(109) | 2) +#define PINMUX_GPIO109__FUNC_I0_DMIC4_DAT 
(MTK_PIN_NO(109) | 3) +#define PINMUX_GPIO109__FUNC_O_CMVREF6 (MTK_PIN_NO(109) | 4) +#define PINMUX_GPIO109__FUNC_O_AUXIF_ST1 (MTK_PIN_NO(109) | 5) +#define PINMUX_GPIO109__FUNC_O_PGD_LV_LSC_PWR2 (MTK_PIN_NO(109) | 6) +#define PINMUX_GPIO109__FUNC_B0_DBG_MON_B11 (MTK_PIN_NO(109) | 7) + +#define PINMUX_GPIO110__FUNC_B_GPIO110 (MTK_PIN_NO(110) | 0) +#define PINMUX_GPIO110__FUNC_I0_I2SIN_D0 (MTK_PIN_NO(110) | 1) +#define PINMUX_GPIO110__FUNC_I0_SPLIN_D0 (MTK_PIN_NO(110) | 2) +#define PINMUX_GPIO110__FUNC_I0_DMIC4_DAT_R (MTK_PIN_NO(110) | 3) +#define PINMUX_GPIO110__FUNC_O_CMVREF7 (MTK_PIN_NO(110) | 4) +#define PINMUX_GPIO110__FUNC_O_AUXIF_CLK1 (MTK_PIN_NO(110) | 5) +#define PINMUX_GPIO110__FUNC_O_PGD_LV_LSC_PWR3 (MTK_PIN_NO(110) | 6) +#define PINMUX_GPIO110__FUNC_B0_DBG_MON_B12 (MTK_PIN_NO(110) | 7) + +#define PINMUX_GPIO111__FUNC_B_GPIO111 (MTK_PIN_NO(111) | 0) +#define PINMUX_GPIO111__FUNC_I0_I2SIN_D1 (MTK_PIN_NO(111) | 1) +#define PINMUX_GPIO111__FUNC_I0_SPLIN_D1 (MTK_PIN_NO(111) | 2) +#define PINMUX_GPIO111__FUNC_O_DMIC3_CLK (MTK_PIN_NO(111) | 3) +#define PINMUX_GPIO111__FUNC_O_SPDIF_OUT (MTK_PIN_NO(111) | 4) +#define PINMUX_GPIO111__FUNC_O_PGD_LV_LSC_PWR4 (MTK_PIN_NO(111) | 6) +#define PINMUX_GPIO111__FUNC_B0_DBG_MON_B13 (MTK_PIN_NO(111) | 7) + +#define PINMUX_GPIO112__FUNC_B_GPIO112 (MTK_PIN_NO(112) | 0) +#define PINMUX_GPIO112__FUNC_I0_I2SIN_D2 (MTK_PIN_NO(112) | 1) +#define PINMUX_GPIO112__FUNC_I0_SPLIN_D2 (MTK_PIN_NO(112) | 2) +#define PINMUX_GPIO112__FUNC_I0_DMIC3_DAT (MTK_PIN_NO(112) | 3) +#define PINMUX_GPIO112__FUNC_B0_TDMIN_MCK (MTK_PIN_NO(112) | 4) +#define PINMUX_GPIO112__FUNC_O_I2SO1_WS (MTK_PIN_NO(112) | 5) +#define PINMUX_GPIO112__FUNC_O_PGD_LV_LSC_PWR5 (MTK_PIN_NO(112) | 6) +#define PINMUX_GPIO112__FUNC_B0_DBG_MON_B14 (MTK_PIN_NO(112) | 7) + +#define PINMUX_GPIO113__FUNC_B_GPIO113 (MTK_PIN_NO(113) | 0) +#define PINMUX_GPIO113__FUNC_I0_I2SIN_D3 (MTK_PIN_NO(113) | 1) +#define PINMUX_GPIO113__FUNC_I0_SPLIN_D3 (MTK_PIN_NO(113) | 2) +#define 
PINMUX_GPIO113__FUNC_I0_DMIC3_DAT_R (MTK_PIN_NO(113) | 3) +#define PINMUX_GPIO113__FUNC_B0_TDMIN_BCK (MTK_PIN_NO(113) | 4) +#define PINMUX_GPIO113__FUNC_O_I2SO1_D0 (MTK_PIN_NO(113) | 5) +#define PINMUX_GPIO113__FUNC_B0_DBG_MON_B15 (MTK_PIN_NO(113) | 7) + +#define PINMUX_GPIO114__FUNC_B_GPIO114 (MTK_PIN_NO(114) | 0) +#define PINMUX_GPIO114__FUNC_O_I2SO2_MCK (MTK_PIN_NO(114) | 1) +#define PINMUX_GPIO114__FUNC_B0_I2SIN_MCK (MTK_PIN_NO(114) | 2) +#define PINMUX_GPIO114__FUNC_I1_MCUPM_JTAG_TMS (MTK_PIN_NO(114) | 3) +#define PINMUX_GPIO114__FUNC_B1_APU_JTAG_TMS (MTK_PIN_NO(114) | 4) +#define PINMUX_GPIO114__FUNC_I1_SCP_JTAG1_TMS (MTK_PIN_NO(114) | 5) +#define PINMUX_GPIO114__FUNC_I1_SPM_JTAG_TMS (MTK_PIN_NO(114) | 6) +#define PINMUX_GPIO114__FUNC_B0_DBG_MON_B16 (MTK_PIN_NO(114) | 7) + +#define PINMUX_GPIO115__FUNC_B_GPIO115 (MTK_PIN_NO(115) | 0) +#define PINMUX_GPIO115__FUNC_B0_I2SO2_BCK (MTK_PIN_NO(115) | 1) +#define PINMUX_GPIO115__FUNC_B0_I2SIN_BCK (MTK_PIN_NO(115) | 2) +#define PINMUX_GPIO115__FUNC_I1_MCUPM_JTAG_TCK (MTK_PIN_NO(115) | 3) +#define PINMUX_GPIO115__FUNC_I0_APU_JTAG_TCK (MTK_PIN_NO(115) | 4) +#define PINMUX_GPIO115__FUNC_I1_SCP_JTAG1_TCK (MTK_PIN_NO(115) | 5) +#define PINMUX_GPIO115__FUNC_I1_SPM_JTAG_TCK (MTK_PIN_NO(115) | 6) +#define PINMUX_GPIO115__FUNC_B0_DBG_MON_B17 (MTK_PIN_NO(115) | 7) + +#define PINMUX_GPIO116__FUNC_B_GPIO116 (MTK_PIN_NO(116) | 0) +#define PINMUX_GPIO116__FUNC_B0_I2SO2_WS (MTK_PIN_NO(116) | 1) +#define PINMUX_GPIO116__FUNC_B0_I2SIN_WS (MTK_PIN_NO(116) | 2) +#define PINMUX_GPIO116__FUNC_I1_MCUPM_JTAG_TDI (MTK_PIN_NO(116) | 3) +#define PINMUX_GPIO116__FUNC_I1_APU_JTAG_TDI (MTK_PIN_NO(116) | 4) +#define PINMUX_GPIO116__FUNC_I1_SCP_JTAG1_TDI (MTK_PIN_NO(116) | 5) +#define PINMUX_GPIO116__FUNC_I1_SPM_JTAG_TDI (MTK_PIN_NO(116) | 6) +#define PINMUX_GPIO116__FUNC_B0_DBG_MON_B18 (MTK_PIN_NO(116) | 7) + +#define PINMUX_GPIO117__FUNC_B_GPIO117 (MTK_PIN_NO(117) | 0) +#define PINMUX_GPIO117__FUNC_O_I2SO2_D0 (MTK_PIN_NO(117) | 1) +#define 
PINMUX_GPIO117__FUNC_I0_I2SIN_D0 (MTK_PIN_NO(117) | 2) +#define PINMUX_GPIO117__FUNC_O_MCUPM_JTAG_TDO (MTK_PIN_NO(117) | 3) +#define PINMUX_GPIO117__FUNC_O_APU_JTAG_TDO (MTK_PIN_NO(117) | 4) +#define PINMUX_GPIO117__FUNC_O_SCP_JTAG1_TDO (MTK_PIN_NO(117) | 5) +#define PINMUX_GPIO117__FUNC_O_SPM_JTAG_TDO (MTK_PIN_NO(117) | 6) +#define PINMUX_GPIO117__FUNC_B0_DBG_MON_B19 (MTK_PIN_NO(117) | 7) + +#define PINMUX_GPIO118__FUNC_B_GPIO118 (MTK_PIN_NO(118) | 0) +#define PINMUX_GPIO118__FUNC_O_I2SO2_D1 (MTK_PIN_NO(118) | 1) +#define PINMUX_GPIO118__FUNC_I0_I2SIN_D1 (MTK_PIN_NO(118) | 2) +#define PINMUX_GPIO118__FUNC_I0_MCUPM_JTAG_TRSTN (MTK_PIN_NO(118) | 3) +#define PINMUX_GPIO118__FUNC_I0_APU_JTAG_TRST (MTK_PIN_NO(118) | 4) +#define PINMUX_GPIO118__FUNC_I0_SCP_JTAG1_TRSTN (MTK_PIN_NO(118) | 5) +#define PINMUX_GPIO118__FUNC_I0_SPM_JTAG_TRSTN (MTK_PIN_NO(118) | 6) +#define PINMUX_GPIO118__FUNC_B0_DBG_MON_B20 (MTK_PIN_NO(118) | 7) + +#define PINMUX_GPIO119__FUNC_B_GPIO119 (MTK_PIN_NO(119) | 0) +#define PINMUX_GPIO119__FUNC_O_I2SO2_D2 (MTK_PIN_NO(119) | 1) +#define PINMUX_GPIO119__FUNC_I0_I2SIN_D2 (MTK_PIN_NO(119) | 2) +#define PINMUX_GPIO119__FUNC_O_UTXD3 (MTK_PIN_NO(119) | 3) +#define PINMUX_GPIO119__FUNC_B0_TDMIN_LRCK (MTK_PIN_NO(119) | 4) +#define PINMUX_GPIO119__FUNC_O_I2SO1_MCK (MTK_PIN_NO(119) | 5) +#define PINMUX_GPIO119__FUNC_O_SSPM_UTXD_AO (MTK_PIN_NO(119) | 6) +#define PINMUX_GPIO119__FUNC_B0_DBG_MON_B21 (MTK_PIN_NO(119) | 7) + +#define PINMUX_GPIO120__FUNC_B_GPIO120 (MTK_PIN_NO(120) | 0) +#define PINMUX_GPIO120__FUNC_O_I2SO2_D3 (MTK_PIN_NO(120) | 1) +#define PINMUX_GPIO120__FUNC_I0_I2SIN_D3 (MTK_PIN_NO(120) | 2) +#define PINMUX_GPIO120__FUNC_I1_URXD3 (MTK_PIN_NO(120) | 3) +#define PINMUX_GPIO120__FUNC_I0_TDMIN_DI (MTK_PIN_NO(120) | 4) +#define PINMUX_GPIO120__FUNC_O_I2SO1_BCK (MTK_PIN_NO(120) | 5) +#define PINMUX_GPIO120__FUNC_I1_SSPM_URXD_AO (MTK_PIN_NO(120) | 6) +#define PINMUX_GPIO120__FUNC_B0_DBG_MON_B22 (MTK_PIN_NO(120) | 7) + +#define 
PINMUX_GPIO121__FUNC_B_GPIO121 (MTK_PIN_NO(121) | 0) +#define PINMUX_GPIO121__FUNC_B0_PCM_CLK (MTK_PIN_NO(121) | 1) +#define PINMUX_GPIO121__FUNC_O_SPIM4_CSB (MTK_PIN_NO(121) | 2) +#define PINMUX_GPIO121__FUNC_O_SCP_SPI1_B_CS (MTK_PIN_NO(121) | 3) +#define PINMUX_GPIO121__FUNC_O_TP_UTXD2_AO (MTK_PIN_NO(121) | 4) +#define PINMUX_GPIO121__FUNC_O_AUXIF_ST0 (MTK_PIN_NO(121) | 5) +#define PINMUX_GPIO121__FUNC_O_PGD_DA_EFUSE_RDY (MTK_PIN_NO(121) | 6) +#define PINMUX_GPIO121__FUNC_B0_DBG_MON_B23 (MTK_PIN_NO(121) | 7) + +#define PINMUX_GPIO122__FUNC_B_GPIO122 (MTK_PIN_NO(122) | 0) +#define PINMUX_GPIO122__FUNC_B0_PCM_SYNC (MTK_PIN_NO(122) | 1) +#define PINMUX_GPIO122__FUNC_O_SPIM4_CLK (MTK_PIN_NO(122) | 2) +#define PINMUX_GPIO122__FUNC_O_SCP_SPI1_B_CK (MTK_PIN_NO(122) | 3) +#define PINMUX_GPIO122__FUNC_I1_TP_URXD2_AO (MTK_PIN_NO(122) | 4) +#define PINMUX_GPIO122__FUNC_O_AUXIF_CLK0 (MTK_PIN_NO(122) | 5) +#define PINMUX_GPIO122__FUNC_O_PGD_DA_EFUSE_RDY_PRE (MTK_PIN_NO(122) | 6) +#define PINMUX_GPIO122__FUNC_B0_DBG_MON_B24 (MTK_PIN_NO(122) | 7) + +#define PINMUX_GPIO123__FUNC_B_GPIO123 (MTK_PIN_NO(123) | 0) +#define PINMUX_GPIO123__FUNC_O_PCM_DO (MTK_PIN_NO(123) | 1) +#define PINMUX_GPIO123__FUNC_B0_SPIM4_MOSI (MTK_PIN_NO(123) | 2) +#define PINMUX_GPIO123__FUNC_O_SCP_SPI1_B_MO (MTK_PIN_NO(123) | 3) +#define PINMUX_GPIO123__FUNC_O_TP_URTS2_AO (MTK_PIN_NO(123) | 4) +#define PINMUX_GPIO123__FUNC_O_AUXIF_ST1 (MTK_PIN_NO(123) | 5) +#define PINMUX_GPIO123__FUNC_O_PGD_DA_PWRGD_RESET (MTK_PIN_NO(123) | 6) +#define PINMUX_GPIO123__FUNC_B0_DBG_MON_B25 (MTK_PIN_NO(123) | 7) + +#define PINMUX_GPIO124__FUNC_B_GPIO124 (MTK_PIN_NO(124) | 0) +#define PINMUX_GPIO124__FUNC_I0_PCM_DI (MTK_PIN_NO(124) | 1) +#define PINMUX_GPIO124__FUNC_B0_SPIM4_MISO (MTK_PIN_NO(124) | 2) +#define PINMUX_GPIO124__FUNC_I0_SCP_SPI1_B_MI (MTK_PIN_NO(124) | 3) +#define PINMUX_GPIO124__FUNC_I1_TP_UCTS2_AO (MTK_PIN_NO(124) | 4) +#define PINMUX_GPIO124__FUNC_O_AUXIF_CLK1 (MTK_PIN_NO(124) | 5) +#define 
PINMUX_GPIO124__FUNC_O_PGD_DA_PWRGD_ENB (MTK_PIN_NO(124) | 6) +#define PINMUX_GPIO124__FUNC_B0_DBG_MON_B26 (MTK_PIN_NO(124) | 7) + +#define PINMUX_GPIO125__FUNC_B_GPIO125 (MTK_PIN_NO(125) | 0) +#define PINMUX_GPIO125__FUNC_O_DMIC1_CLK (MTK_PIN_NO(125) | 1) +#define PINMUX_GPIO125__FUNC_O_SPINOR_CK (MTK_PIN_NO(125) | 2) +#define PINMUX_GPIO125__FUNC_B0_TDMIN_MCK (MTK_PIN_NO(125) | 3) +#define PINMUX_GPIO125__FUNC_O_LVTS_FOUT (MTK_PIN_NO(125) | 6) +#define PINMUX_GPIO125__FUNC_B0_DBG_MON_B27 (MTK_PIN_NO(125) | 7) + +#define PINMUX_GPIO126__FUNC_B_GPIO126 (MTK_PIN_NO(126) | 0) +#define PINMUX_GPIO126__FUNC_I0_DMIC1_DAT (MTK_PIN_NO(126) | 1) +#define PINMUX_GPIO126__FUNC_O_SPINOR_CS (MTK_PIN_NO(126) | 2) +#define PINMUX_GPIO126__FUNC_B0_TDMIN_BCK (MTK_PIN_NO(126) | 3) +#define PINMUX_GPIO126__FUNC_O_LVTS_SDO (MTK_PIN_NO(126) | 6) +#define PINMUX_GPIO126__FUNC_B0_DBG_MON_B28 (MTK_PIN_NO(126) | 7) + +#define PINMUX_GPIO127__FUNC_B_GPIO127 (MTK_PIN_NO(127) | 0) +#define PINMUX_GPIO127__FUNC_I0_DMIC1_DAT_R (MTK_PIN_NO(127) | 1) +#define PINMUX_GPIO127__FUNC_B0_SPINOR_IO0 (MTK_PIN_NO(127) | 2) +#define PINMUX_GPIO127__FUNC_B0_TDMIN_LRCK (MTK_PIN_NO(127) | 3) +#define PINMUX_GPIO127__FUNC_I0_LVTS_26M (MTK_PIN_NO(127) | 6) +#define PINMUX_GPIO127__FUNC_B0_DBG_MON_B29 (MTK_PIN_NO(127) | 7) + +#define PINMUX_GPIO128__FUNC_B_GPIO128 (MTK_PIN_NO(128) | 0) +#define PINMUX_GPIO128__FUNC_O_DMIC2_CLK (MTK_PIN_NO(128) | 1) +#define PINMUX_GPIO128__FUNC_B0_SPINOR_IO1 (MTK_PIN_NO(128) | 2) +#define PINMUX_GPIO128__FUNC_I0_TDMIN_DI (MTK_PIN_NO(128) | 3) +#define PINMUX_GPIO128__FUNC_I0_LVTS_SCF (MTK_PIN_NO(128) | 6) +#define PINMUX_GPIO128__FUNC_B0_DBG_MON_B30 (MTK_PIN_NO(128) | 7) + +#define PINMUX_GPIO129__FUNC_B_GPIO129 (MTK_PIN_NO(129) | 0) +#define PINMUX_GPIO129__FUNC_I0_DMIC2_DAT (MTK_PIN_NO(129) | 1) +#define PINMUX_GPIO129__FUNC_B0_SPINOR_IO2 (MTK_PIN_NO(129) | 2) +#define PINMUX_GPIO129__FUNC_I0_SPDIF_IN1 (MTK_PIN_NO(129) | 3) +#define PINMUX_GPIO129__FUNC_I0_LVTS_SCK 
(MTK_PIN_NO(129) | 6) +#define PINMUX_GPIO129__FUNC_B0_DBG_MON_B31 (MTK_PIN_NO(129) | 7) + +#define PINMUX_GPIO130__FUNC_B_GPIO130 (MTK_PIN_NO(130) | 0) +#define PINMUX_GPIO130__FUNC_I0_DMIC2_DAT_R (MTK_PIN_NO(130) | 1) +#define PINMUX_GPIO130__FUNC_B0_SPINOR_IO3 (MTK_PIN_NO(130) | 2) +#define PINMUX_GPIO130__FUNC_I0_SPDIF_IN2 (MTK_PIN_NO(130) | 3) +#define PINMUX_GPIO130__FUNC_I0_LVTS_SDI (MTK_PIN_NO(130) | 6) +#define PINMUX_GPIO130__FUNC_B0_DBG_MON_B32 (MTK_PIN_NO(130) | 7) + +#define PINMUX_GPIO131__FUNC_B_GPIO131 (MTK_PIN_NO(131) | 0) +#define PINMUX_GPIO131__FUNC_O_DPI_D0 (MTK_PIN_NO(131) | 1) +#define PINMUX_GPIO131__FUNC_O_GBE_TXD3 (MTK_PIN_NO(131) | 2) +#define PINMUX_GPIO131__FUNC_O_DMIC1_CLK (MTK_PIN_NO(131) | 3) +#define PINMUX_GPIO131__FUNC_O_I2SO2_MCK (MTK_PIN_NO(131) | 4) +#define PINMUX_GPIO131__FUNC_B0_TP_GPIO0_AO (MTK_PIN_NO(131) | 5) +#define PINMUX_GPIO131__FUNC_O_SPIM5_CSB (MTK_PIN_NO(131) | 6) +#define PINMUX_GPIO131__FUNC_O_PGD_LV_HSC_PWR0 (MTK_PIN_NO(131) | 7) + +#define PINMUX_GPIO132__FUNC_B_GPIO132 (MTK_PIN_NO(132) | 0) +#define PINMUX_GPIO132__FUNC_O_DPI_D1 (MTK_PIN_NO(132) | 1) +#define PINMUX_GPIO132__FUNC_O_GBE_TXD2 (MTK_PIN_NO(132) | 2) +#define PINMUX_GPIO132__FUNC_I0_DMIC1_DAT (MTK_PIN_NO(132) | 3) +#define PINMUX_GPIO132__FUNC_B0_I2SO2_BCK (MTK_PIN_NO(132) | 4) +#define PINMUX_GPIO132__FUNC_B0_TP_GPIO1_AO (MTK_PIN_NO(132) | 5) +#define PINMUX_GPIO132__FUNC_O_SPIM5_CLK (MTK_PIN_NO(132) | 6) +#define PINMUX_GPIO132__FUNC_O_PGD_LV_HSC_PWR1 (MTK_PIN_NO(132) | 7) + +#define PINMUX_GPIO133__FUNC_B_GPIO133 (MTK_PIN_NO(133) | 0) +#define PINMUX_GPIO133__FUNC_O_DPI_D2 (MTK_PIN_NO(133) | 1) +#define PINMUX_GPIO133__FUNC_O_GBE_TXD1 (MTK_PIN_NO(133) | 2) +#define PINMUX_GPIO133__FUNC_I0_DMIC1_DAT_R (MTK_PIN_NO(133) | 3) +#define PINMUX_GPIO133__FUNC_B0_I2SO2_WS (MTK_PIN_NO(133) | 4) +#define PINMUX_GPIO133__FUNC_B0_TP_GPIO2_AO (MTK_PIN_NO(133) | 5) +#define PINMUX_GPIO133__FUNC_B0_SPIM5_MOSI (MTK_PIN_NO(133) | 6) +#define 
PINMUX_GPIO133__FUNC_O_PGD_LV_HSC_PWR2 (MTK_PIN_NO(133) | 7) + +#define PINMUX_GPIO134__FUNC_B_GPIO134 (MTK_PIN_NO(134) | 0) +#define PINMUX_GPIO134__FUNC_O_DPI_D3 (MTK_PIN_NO(134) | 1) +#define PINMUX_GPIO134__FUNC_O_GBE_TXD0 (MTK_PIN_NO(134) | 2) +#define PINMUX_GPIO134__FUNC_O_DMIC2_CLK (MTK_PIN_NO(134) | 3) +#define PINMUX_GPIO134__FUNC_O_I2SO2_D0 (MTK_PIN_NO(134) | 4) +#define PINMUX_GPIO134__FUNC_B0_TP_GPIO3_AO (MTK_PIN_NO(134) | 5) +#define PINMUX_GPIO134__FUNC_B0_SPIM5_MISO (MTK_PIN_NO(134) | 6) +#define PINMUX_GPIO134__FUNC_O_PGD_LV_HSC_PWR3 (MTK_PIN_NO(134) | 7) + +#define PINMUX_GPIO135__FUNC_B_GPIO135 (MTK_PIN_NO(135) | 0) +#define PINMUX_GPIO135__FUNC_O_DPI_D4 (MTK_PIN_NO(135) | 1) +#define PINMUX_GPIO135__FUNC_I0_GBE_RXD3 (MTK_PIN_NO(135) | 2) +#define PINMUX_GPIO135__FUNC_I0_DMIC2_DAT (MTK_PIN_NO(135) | 3) +#define PINMUX_GPIO135__FUNC_O_I2SO2_D1 (MTK_PIN_NO(135) | 4) +#define PINMUX_GPIO135__FUNC_B0_TP_GPIO4_AO (MTK_PIN_NO(135) | 5) +#define PINMUX_GPIO135__FUNC_I1_WAKEN (MTK_PIN_NO(135) | 6) +#define PINMUX_GPIO135__FUNC_O_PGD_LV_HSC_PWR4 (MTK_PIN_NO(135) | 7) + +#define PINMUX_GPIO136__FUNC_B_GPIO136 (MTK_PIN_NO(136) | 0) +#define PINMUX_GPIO136__FUNC_O_DPI_D5 (MTK_PIN_NO(136) | 1) +#define PINMUX_GPIO136__FUNC_I0_GBE_RXD2 (MTK_PIN_NO(136) | 2) +#define PINMUX_GPIO136__FUNC_I0_DMIC2_DAT_R (MTK_PIN_NO(136) | 3) +#define PINMUX_GPIO136__FUNC_O_I2SO2_D2 (MTK_PIN_NO(136) | 4) +#define PINMUX_GPIO136__FUNC_B0_TP_GPIO5_AO (MTK_PIN_NO(136) | 5) +#define PINMUX_GPIO136__FUNC_O_PERSTN (MTK_PIN_NO(136) | 6) +#define PINMUX_GPIO136__FUNC_O_PGD_LV_HSC_PWR5 (MTK_PIN_NO(136) | 7) + +#define PINMUX_GPIO137__FUNC_B_GPIO137 (MTK_PIN_NO(137) | 0) +#define PINMUX_GPIO137__FUNC_O_DPI_D6 (MTK_PIN_NO(137) | 1) +#define PINMUX_GPIO137__FUNC_I0_GBE_RXD1 (MTK_PIN_NO(137) | 2) +#define PINMUX_GPIO137__FUNC_O_DMIC3_CLK (MTK_PIN_NO(137) | 3) +#define PINMUX_GPIO137__FUNC_O_I2SO2_D3 (MTK_PIN_NO(137) | 4) +#define PINMUX_GPIO137__FUNC_B0_TP_GPIO6_AO (MTK_PIN_NO(137) | 5) 
+#define PINMUX_GPIO137__FUNC_B1_CLKREQN (MTK_PIN_NO(137) | 6) +#define PINMUX_GPIO137__FUNC_O_PWM_0 (MTK_PIN_NO(137) | 7) + +#define PINMUX_GPIO138__FUNC_B_GPIO138 (MTK_PIN_NO(138) | 0) +#define PINMUX_GPIO138__FUNC_O_DPI_D7 (MTK_PIN_NO(138) | 1) +#define PINMUX_GPIO138__FUNC_I0_GBE_RXD0 (MTK_PIN_NO(138) | 2) +#define PINMUX_GPIO138__FUNC_I0_DMIC3_DAT (MTK_PIN_NO(138) | 3) +#define PINMUX_GPIO138__FUNC_O_CLKM2 (MTK_PIN_NO(138) | 4) +#define PINMUX_GPIO138__FUNC_B0_TP_GPIO7_AO (MTK_PIN_NO(138) | 5) +#define PINMUX_GPIO138__FUNC_B0_MD32_0_GPIO0 (MTK_PIN_NO(138) | 7) + +#define PINMUX_GPIO139__FUNC_B_GPIO139 (MTK_PIN_NO(139) | 0) +#define PINMUX_GPIO139__FUNC_O_DPI_D8 (MTK_PIN_NO(139) | 1) +#define PINMUX_GPIO139__FUNC_B0_GBE_TXC (MTK_PIN_NO(139) | 2) +#define PINMUX_GPIO139__FUNC_I0_DMIC3_DAT_R (MTK_PIN_NO(139) | 3) +#define PINMUX_GPIO139__FUNC_O_CLKM3 (MTK_PIN_NO(139) | 4) +#define PINMUX_GPIO139__FUNC_O_TP_UTXD2_AO (MTK_PIN_NO(139) | 5) +#define PINMUX_GPIO139__FUNC_O_UTXD2 (MTK_PIN_NO(139) | 6) +#define PINMUX_GPIO139__FUNC_B0_MD32_0_GPIO1 (MTK_PIN_NO(139) | 7) + +#define PINMUX_GPIO140__FUNC_B_GPIO140 (MTK_PIN_NO(140) | 0) +#define PINMUX_GPIO140__FUNC_O_DPI_D9 (MTK_PIN_NO(140) | 1) +#define PINMUX_GPIO140__FUNC_I0_GBE_RXC (MTK_PIN_NO(140) | 2) +#define PINMUX_GPIO140__FUNC_O_DMIC4_CLK (MTK_PIN_NO(140) | 3) +#define PINMUX_GPIO140__FUNC_O_PWM_2 (MTK_PIN_NO(140) | 4) +#define PINMUX_GPIO140__FUNC_I1_TP_URXD2_AO (MTK_PIN_NO(140) | 5) +#define PINMUX_GPIO140__FUNC_I1_URXD2 (MTK_PIN_NO(140) | 6) +#define PINMUX_GPIO140__FUNC_B0_MD32_0_GPIO2 (MTK_PIN_NO(140) | 7) + +#define PINMUX_GPIO141__FUNC_B_GPIO141 (MTK_PIN_NO(141) | 0) +#define PINMUX_GPIO141__FUNC_O_DPI_D10 (MTK_PIN_NO(141) | 1) +#define PINMUX_GPIO141__FUNC_I0_GBE_RXDV (MTK_PIN_NO(141) | 2) +#define PINMUX_GPIO141__FUNC_I0_DMIC4_DAT (MTK_PIN_NO(141) | 3) +#define PINMUX_GPIO141__FUNC_O_PWM_3 (MTK_PIN_NO(141) | 4) +#define PINMUX_GPIO141__FUNC_O_TP_URTS2_AO (MTK_PIN_NO(141) | 5) +#define 
PINMUX_GPIO141__FUNC_O_URTS2 (MTK_PIN_NO(141) | 6) +#define PINMUX_GPIO141__FUNC_B0_MD32_1_GPIO0 (MTK_PIN_NO(141) | 7) + +#define PINMUX_GPIO142__FUNC_B_GPIO142 (MTK_PIN_NO(142) | 0) +#define PINMUX_GPIO142__FUNC_O_DPI_D11 (MTK_PIN_NO(142) | 1) +#define PINMUX_GPIO142__FUNC_O_GBE_TXEN (MTK_PIN_NO(142) | 2) +#define PINMUX_GPIO142__FUNC_I0_DMIC4_DAT_R (MTK_PIN_NO(142) | 3) +#define PINMUX_GPIO142__FUNC_O_PWM_1 (MTK_PIN_NO(142) | 4) +#define PINMUX_GPIO142__FUNC_I1_TP_UCTS2_AO (MTK_PIN_NO(142) | 5) +#define PINMUX_GPIO142__FUNC_I1_UCTS2 (MTK_PIN_NO(142) | 6) +#define PINMUX_GPIO142__FUNC_B0_MD32_1_GPIO1 (MTK_PIN_NO(142) | 7) + +#define PINMUX_GPIO143__FUNC_B_GPIO143 (MTK_PIN_NO(143) | 0) +#define PINMUX_GPIO143__FUNC_O_DPI_D12 (MTK_PIN_NO(143) | 1) +#define PINMUX_GPIO143__FUNC_O_GBE_MDC (MTK_PIN_NO(143) | 2) +#define PINMUX_GPIO143__FUNC_B0_MD32_0_GPIO0 (MTK_PIN_NO(143) | 3) +#define PINMUX_GPIO143__FUNC_O_CLKM0 (MTK_PIN_NO(143) | 4) +#define PINMUX_GPIO143__FUNC_O_SPIM3_CSB (MTK_PIN_NO(143) | 5) +#define PINMUX_GPIO143__FUNC_O_UTXD1 (MTK_PIN_NO(143) | 6) +#define PINMUX_GPIO143__FUNC_B0_MD32_1_GPIO2 (MTK_PIN_NO(143) | 7) + +#define PINMUX_GPIO144__FUNC_B_GPIO144 (MTK_PIN_NO(144) | 0) +#define PINMUX_GPIO144__FUNC_O_DPI_D13 (MTK_PIN_NO(144) | 1) +#define PINMUX_GPIO144__FUNC_B1_GBE_MDIO (MTK_PIN_NO(144) | 2) +#define PINMUX_GPIO144__FUNC_B0_MD32_0_GPIO1 (MTK_PIN_NO(144) | 3) +#define PINMUX_GPIO144__FUNC_O_CLKM1 (MTK_PIN_NO(144) | 4) +#define PINMUX_GPIO144__FUNC_O_SPIM3_CLK (MTK_PIN_NO(144) | 5) +#define PINMUX_GPIO144__FUNC_I1_URXD1 (MTK_PIN_NO(144) | 6) +#define PINMUX_GPIO144__FUNC_O_PGD_HV_HSC_PWR0 (MTK_PIN_NO(144) | 7) + +#define PINMUX_GPIO145__FUNC_B_GPIO145 (MTK_PIN_NO(145) | 0) +#define PINMUX_GPIO145__FUNC_O_DPI_D14 (MTK_PIN_NO(145) | 1) +#define PINMUX_GPIO145__FUNC_O_GBE_TXER (MTK_PIN_NO(145) | 2) +#define PINMUX_GPIO145__FUNC_B0_MD32_1_GPIO0 (MTK_PIN_NO(145) | 3) +#define PINMUX_GPIO145__FUNC_O_CMFLASH0 (MTK_PIN_NO(145) | 4) +#define 
PINMUX_GPIO145__FUNC_B0_SPIM3_MOSI (MTK_PIN_NO(145) | 5) +#define PINMUX_GPIO145__FUNC_B0_GBE_AUX_PPS2 (MTK_PIN_NO(145) | 6) +#define PINMUX_GPIO145__FUNC_O_PGD_HV_HSC_PWR1 (MTK_PIN_NO(145) | 7) + +#define PINMUX_GPIO146__FUNC_B_GPIO146 (MTK_PIN_NO(146) | 0) +#define PINMUX_GPIO146__FUNC_O_DPI_D15 (MTK_PIN_NO(146) | 1) +#define PINMUX_GPIO146__FUNC_I0_GBE_RXER (MTK_PIN_NO(146) | 2) +#define PINMUX_GPIO146__FUNC_B0_MD32_1_GPIO1 (MTK_PIN_NO(146) | 3) +#define PINMUX_GPIO146__FUNC_O_CMFLASH1 (MTK_PIN_NO(146) | 4) +#define PINMUX_GPIO146__FUNC_B0_SPIM3_MISO (MTK_PIN_NO(146) | 5) +#define PINMUX_GPIO146__FUNC_B0_GBE_AUX_PPS3 (MTK_PIN_NO(146) | 6) +#define PINMUX_GPIO146__FUNC_O_PGD_HV_HSC_PWR2 (MTK_PIN_NO(146) | 7) + +#define PINMUX_GPIO147__FUNC_B_GPIO147 (MTK_PIN_NO(147) | 0) +#define PINMUX_GPIO147__FUNC_O_DPI_HSYNC (MTK_PIN_NO(147) | 1) +#define PINMUX_GPIO147__FUNC_I0_GBE_COL (MTK_PIN_NO(147) | 2) +#define PINMUX_GPIO147__FUNC_O_I2SO1_MCK (MTK_PIN_NO(147) | 3) +#define PINMUX_GPIO147__FUNC_O_CMVREF0 (MTK_PIN_NO(147) | 4) +#define PINMUX_GPIO147__FUNC_O_SPDIF_OUT (MTK_PIN_NO(147) | 5) +#define PINMUX_GPIO147__FUNC_O_URTS1 (MTK_PIN_NO(147) | 6) +#define PINMUX_GPIO147__FUNC_O_PGD_HV_HSC_PWR3 (MTK_PIN_NO(147) | 7) + +#define PINMUX_GPIO148__FUNC_B_GPIO148 (MTK_PIN_NO(148) | 0) +#define PINMUX_GPIO148__FUNC_O_DPI_VSYNC (MTK_PIN_NO(148) | 1) +#define PINMUX_GPIO148__FUNC_I0_GBE_INTR (MTK_PIN_NO(148) | 2) +#define PINMUX_GPIO148__FUNC_O_I2SO1_BCK (MTK_PIN_NO(148) | 3) +#define PINMUX_GPIO148__FUNC_O_CMVREF1 (MTK_PIN_NO(148) | 4) +#define PINMUX_GPIO148__FUNC_I0_SPDIF_IN0 (MTK_PIN_NO(148) | 5) +#define PINMUX_GPIO148__FUNC_I1_UCTS1 (MTK_PIN_NO(148) | 6) +#define PINMUX_GPIO148__FUNC_O_PGD_HV_HSC_PWR4 (MTK_PIN_NO(148) | 7) + +#define PINMUX_GPIO149__FUNC_B_GPIO149 (MTK_PIN_NO(149) | 0) +#define PINMUX_GPIO149__FUNC_O_DPI_DE (MTK_PIN_NO(149) | 1) +#define PINMUX_GPIO149__FUNC_B0_GBE_AUX_PPS0 (MTK_PIN_NO(149) | 2) +#define PINMUX_GPIO149__FUNC_O_I2SO1_WS (MTK_PIN_NO(149) | 
3) +#define PINMUX_GPIO149__FUNC_O_CMVREF2 (MTK_PIN_NO(149) | 4) +#define PINMUX_GPIO149__FUNC_I0_SPDIF_IN1 (MTK_PIN_NO(149) | 5) +#define PINMUX_GPIO149__FUNC_O_UTXD3 (MTK_PIN_NO(149) | 6) +#define PINMUX_GPIO149__FUNC_O_PGD_HV_HSC_PWR5 (MTK_PIN_NO(149) | 7) + +#define PINMUX_GPIO150__FUNC_B_GPIO150 (MTK_PIN_NO(150) | 0) +#define PINMUX_GPIO150__FUNC_O_DPI_CK (MTK_PIN_NO(150) | 1) +#define PINMUX_GPIO150__FUNC_B0_GBE_AUX_PPS1 (MTK_PIN_NO(150) | 2) +#define PINMUX_GPIO150__FUNC_O_I2SO1_D0 (MTK_PIN_NO(150) | 3) +#define PINMUX_GPIO150__FUNC_O_CMVREF3 (MTK_PIN_NO(150) | 4) +#define PINMUX_GPIO150__FUNC_I0_SPDIF_IN2 (MTK_PIN_NO(150) | 5) +#define PINMUX_GPIO150__FUNC_I1_URXD3 (MTK_PIN_NO(150) | 6) + +#define PINMUX_GPIO151__FUNC_B_GPIO151 (MTK_PIN_NO(151) | 0) +#define PINMUX_GPIO151__FUNC_B1_MSDC0_DAT7 (MTK_PIN_NO(151) | 1) + +#define PINMUX_GPIO152__FUNC_B_GPIO152 (MTK_PIN_NO(152) | 0) +#define PINMUX_GPIO152__FUNC_B1_MSDC0_DAT6 (MTK_PIN_NO(152) | 1) + +#define PINMUX_GPIO153__FUNC_B_GPIO153 (MTK_PIN_NO(153) | 0) +#define PINMUX_GPIO153__FUNC_B1_MSDC0_DAT5 (MTK_PIN_NO(153) | 1) + +#define PINMUX_GPIO154__FUNC_B_GPIO154 (MTK_PIN_NO(154) | 0) +#define PINMUX_GPIO154__FUNC_B1_MSDC0_DAT4 (MTK_PIN_NO(154) | 1) + +#define PINMUX_GPIO155__FUNC_B_GPIO155 (MTK_PIN_NO(155) | 0) +#define PINMUX_GPIO155__FUNC_O_MSDC0_RSTB (MTK_PIN_NO(155) | 1) + +#define PINMUX_GPIO156__FUNC_B_GPIO156 (MTK_PIN_NO(156) | 0) +#define PINMUX_GPIO156__FUNC_B1_MSDC0_CMD (MTK_PIN_NO(156) | 1) + +#define PINMUX_GPIO157__FUNC_B_GPIO157 (MTK_PIN_NO(157) | 0) +#define PINMUX_GPIO157__FUNC_B1_MSDC0_CLK (MTK_PIN_NO(157) | 1) + +#define PINMUX_GPIO158__FUNC_B_GPIO158 (MTK_PIN_NO(158) | 0) +#define PINMUX_GPIO158__FUNC_B1_MSDC0_DAT3 (MTK_PIN_NO(158) | 1) + +#define PINMUX_GPIO159__FUNC_B_GPIO159 (MTK_PIN_NO(159) | 0) +#define PINMUX_GPIO159__FUNC_B1_MSDC0_DAT2 (MTK_PIN_NO(159) | 1) + +#define PINMUX_GPIO160__FUNC_B_GPIO160 (MTK_PIN_NO(160) | 0) +#define PINMUX_GPIO160__FUNC_B1_MSDC0_DAT1 (MTK_PIN_NO(160) | 
1) + +#define PINMUX_GPIO161__FUNC_B_GPIO161 (MTK_PIN_NO(161) | 0) +#define PINMUX_GPIO161__FUNC_B1_MSDC0_DAT0 (MTK_PIN_NO(161) | 1) + +#define PINMUX_GPIO162__FUNC_B_GPIO162 (MTK_PIN_NO(162) | 0) +#define PINMUX_GPIO162__FUNC_B0_MSDC0_DSL (MTK_PIN_NO(162) | 1) + +#define PINMUX_GPIO163__FUNC_B_GPIO163 (MTK_PIN_NO(163) | 0) +#define PINMUX_GPIO163__FUNC_B1_MSDC1_CMD (MTK_PIN_NO(163) | 1) +#define PINMUX_GPIO163__FUNC_O_SPDIF_OUT (MTK_PIN_NO(163) | 2) +#define PINMUX_GPIO163__FUNC_I1_MD32_0_JTAG_TMS (MTK_PIN_NO(163) | 3) +#define PINMUX_GPIO163__FUNC_I1_ADSP_JTAG0_TMS (MTK_PIN_NO(163) | 4) +#define PINMUX_GPIO163__FUNC_I1_SCP_JTAG0_TMS (MTK_PIN_NO(163) | 5) +#define PINMUX_GPIO163__FUNC_I1_CCU0_JTAG_TMS (MTK_PIN_NO(163) | 6) +#define PINMUX_GPIO163__FUNC_I0_IPU_JTAG_TMS (MTK_PIN_NO(163) | 7) + +#define PINMUX_GPIO164__FUNC_B_GPIO164 (MTK_PIN_NO(164) | 0) +#define PINMUX_GPIO164__FUNC_B1_MSDC1_CLK (MTK_PIN_NO(164) | 1) +#define PINMUX_GPIO164__FUNC_I0_SPDIF_IN0 (MTK_PIN_NO(164) | 2) +#define PINMUX_GPIO164__FUNC_I1_MD32_0_JTAG_TCK (MTK_PIN_NO(164) | 3) +#define PINMUX_GPIO164__FUNC_I0_ADSP_JTAG0_TCK (MTK_PIN_NO(164) | 4) +#define PINMUX_GPIO164__FUNC_I1_SCP_JTAG0_TCK (MTK_PIN_NO(164) | 5) +#define PINMUX_GPIO164__FUNC_I1_CCU0_JTAG_TCK (MTK_PIN_NO(164) | 6) +#define PINMUX_GPIO164__FUNC_I0_IPU_JTAG_TCK (MTK_PIN_NO(164) | 7) + +#define PINMUX_GPIO165__FUNC_B_GPIO165 (MTK_PIN_NO(165) | 0) +#define PINMUX_GPIO165__FUNC_B1_MSDC1_DAT0 (MTK_PIN_NO(165) | 1) +#define PINMUX_GPIO165__FUNC_I0_SPDIF_IN1 (MTK_PIN_NO(165) | 2) +#define PINMUX_GPIO165__FUNC_I1_MD32_0_JTAG_TDI (MTK_PIN_NO(165) | 3) +#define PINMUX_GPIO165__FUNC_I1_ADSP_JTAG0_TDI (MTK_PIN_NO(165) | 4) +#define PINMUX_GPIO165__FUNC_I1_SCP_JTAG0_TDI (MTK_PIN_NO(165) | 5) +#define PINMUX_GPIO165__FUNC_I1_CCU0_JTAG_TDI (MTK_PIN_NO(165) | 6) +#define PINMUX_GPIO165__FUNC_I0_IPU_JTAG_TDI (MTK_PIN_NO(165) | 7) + +#define PINMUX_GPIO166__FUNC_B_GPIO166 (MTK_PIN_NO(166) | 0) +#define PINMUX_GPIO166__FUNC_B1_MSDC1_DAT1 
(MTK_PIN_NO(166) | 1) +#define PINMUX_GPIO166__FUNC_I0_SPDIF_IN2 (MTK_PIN_NO(166) | 2) +#define PINMUX_GPIO166__FUNC_O_MD32_0_JTAG_TDO (MTK_PIN_NO(166) | 3) +#define PINMUX_GPIO166__FUNC_O_ADSP_JTAG0_TDO (MTK_PIN_NO(166) | 4) +#define PINMUX_GPIO166__FUNC_O_SCP_JTAG0_TDO (MTK_PIN_NO(166) | 5) +#define PINMUX_GPIO166__FUNC_O_CCU0_JTAG_TDO (MTK_PIN_NO(166) | 6) +#define PINMUX_GPIO166__FUNC_O_IPU_JTAG_TDO (MTK_PIN_NO(166) | 7) + +#define PINMUX_GPIO167__FUNC_B_GPIO167 (MTK_PIN_NO(167) | 0) +#define PINMUX_GPIO167__FUNC_B1_MSDC1_DAT2 (MTK_PIN_NO(167) | 1) +#define PINMUX_GPIO167__FUNC_O_PWM_0 (MTK_PIN_NO(167) | 2) +#define PINMUX_GPIO167__FUNC_I1_MD32_0_JTAG_TRST (MTK_PIN_NO(167) | 3) +#define PINMUX_GPIO167__FUNC_I1_ADSP_JTAG0_TRSTN (MTK_PIN_NO(167) | 4) +#define PINMUX_GPIO167__FUNC_I0_SCP_JTAG0_TRSTN (MTK_PIN_NO(167) | 5) +#define PINMUX_GPIO167__FUNC_I1_CCU0_JTAG_TRST (MTK_PIN_NO(167) | 6) +#define PINMUX_GPIO167__FUNC_I0_IPU_JTAG_TRST (MTK_PIN_NO(167) | 7) + +#define PINMUX_GPIO168__FUNC_B_GPIO168 (MTK_PIN_NO(168) | 0) +#define PINMUX_GPIO168__FUNC_B1_MSDC1_DAT3 (MTK_PIN_NO(168) | 1) +#define PINMUX_GPIO168__FUNC_O_PWM_1 (MTK_PIN_NO(168) | 2) +#define PINMUX_GPIO168__FUNC_O_CLKM0 (MTK_PIN_NO(168) | 3) + +#define PINMUX_GPIO169__FUNC_B_GPIO169 (MTK_PIN_NO(169) | 0) +#define PINMUX_GPIO169__FUNC_B1_MSDC2_CMD (MTK_PIN_NO(169) | 1) +#define PINMUX_GPIO169__FUNC_O_LVTS_FOUT (MTK_PIN_NO(169) | 2) +#define PINMUX_GPIO169__FUNC_I1_MD32_1_JTAG_TMS (MTK_PIN_NO(169) | 3) +#define PINMUX_GPIO169__FUNC_I0_UDI_TMS (MTK_PIN_NO(169) | 4) +#define PINMUX_GPIO169__FUNC_I0_VPU_UDI_TMS (MTK_PIN_NO(169) | 5) +#define PINMUX_GPIO169__FUNC_B0_TDMIN_MCK (MTK_PIN_NO(169) | 6) +#define PINMUX_GPIO169__FUNC_I1_SSPM_JTAG_TMS (MTK_PIN_NO(169) | 7) + +#define PINMUX_GPIO170__FUNC_B_GPIO170 (MTK_PIN_NO(170) | 0) +#define PINMUX_GPIO170__FUNC_B1_MSDC2_CLK (MTK_PIN_NO(170) | 1) +#define PINMUX_GPIO170__FUNC_O_LVTS_SDO (MTK_PIN_NO(170) | 2) +#define PINMUX_GPIO170__FUNC_I1_MD32_1_JTAG_TCK 
(MTK_PIN_NO(170) | 3) +#define PINMUX_GPIO170__FUNC_I0_UDI_TCK (MTK_PIN_NO(170) | 4) +#define PINMUX_GPIO170__FUNC_I0_VPU_UDI_TCK (MTK_PIN_NO(170) | 5) +#define PINMUX_GPIO170__FUNC_B0_TDMIN_BCK (MTK_PIN_NO(170) | 6) +#define PINMUX_GPIO170__FUNC_I1_SSPM_JTAG_TCK (MTK_PIN_NO(170) | 7) + +#define PINMUX_GPIO171__FUNC_B_GPIO171 (MTK_PIN_NO(171) | 0) +#define PINMUX_GPIO171__FUNC_B1_MSDC2_DAT0 (MTK_PIN_NO(171) | 1) +#define PINMUX_GPIO171__FUNC_I0_LVTS_26M (MTK_PIN_NO(171) | 2) +#define PINMUX_GPIO171__FUNC_I1_MD32_1_JTAG_TDI (MTK_PIN_NO(171) | 3) +#define PINMUX_GPIO171__FUNC_I0_UDI_TDI (MTK_PIN_NO(171) | 4) +#define PINMUX_GPIO171__FUNC_I0_VPU_UDI_TDI (MTK_PIN_NO(171) | 5) +#define PINMUX_GPIO171__FUNC_B0_TDMIN_LRCK (MTK_PIN_NO(171) | 6) +#define PINMUX_GPIO171__FUNC_I1_SSPM_JTAG_TDI (MTK_PIN_NO(171) | 7) + +#define PINMUX_GPIO172__FUNC_B_GPIO172 (MTK_PIN_NO(172) | 0) +#define PINMUX_GPIO172__FUNC_B1_MSDC2_DAT1 (MTK_PIN_NO(172) | 1) +#define PINMUX_GPIO172__FUNC_I0_LVTS_SCF (MTK_PIN_NO(172) | 2) +#define PINMUX_GPIO172__FUNC_O_MD32_1_JTAG_TDO (MTK_PIN_NO(172) | 3) +#define PINMUX_GPIO172__FUNC_O_UDI_TDO (MTK_PIN_NO(172) | 4) +#define PINMUX_GPIO172__FUNC_O_VPU_UDI_TDO (MTK_PIN_NO(172) | 5) +#define PINMUX_GPIO172__FUNC_I0_TDMIN_DI (MTK_PIN_NO(172) | 6) +#define PINMUX_GPIO172__FUNC_O_SSPM_JTAG_TDO (MTK_PIN_NO(172) | 7) + +#define PINMUX_GPIO173__FUNC_B_GPIO173 (MTK_PIN_NO(173) | 0) +#define PINMUX_GPIO173__FUNC_B1_MSDC2_DAT2 (MTK_PIN_NO(173) | 1) +#define PINMUX_GPIO173__FUNC_I0_LVTS_SCK (MTK_PIN_NO(173) | 2) +#define PINMUX_GPIO173__FUNC_I1_MD32_1_JTAG_TRST (MTK_PIN_NO(173) | 3) +#define PINMUX_GPIO173__FUNC_I0_UDI_NTRST (MTK_PIN_NO(173) | 4) +#define PINMUX_GPIO173__FUNC_I0_VPU_UDI_NTRST (MTK_PIN_NO(173) | 5) +#define PINMUX_GPIO173__FUNC_I0_SSPM_JTAG_TRSTN (MTK_PIN_NO(173) | 7) + +#define PINMUX_GPIO174__FUNC_B_GPIO174 (MTK_PIN_NO(174) | 0) +#define PINMUX_GPIO174__FUNC_B1_MSDC2_DAT3 (MTK_PIN_NO(174) | 1) +#define PINMUX_GPIO174__FUNC_I0_LVTS_SDI (MTK_PIN_NO(174) 
| 2) + +#define PINMUX_GPIO175__FUNC_B_GPIO175 (MTK_PIN_NO(175) | 0) +#define PINMUX_GPIO175__FUNC_B0_SPMI_M_SCL (MTK_PIN_NO(175) | 1) + +#define PINMUX_GPIO176__FUNC_B_GPIO176 (MTK_PIN_NO(176) | 0) +#define PINMUX_GPIO176__FUNC_B0_SPMI_M_SDA (MTK_PIN_NO(176) | 1) + +#endif /* __MEDIATEK_MT8188-PINFUNC_H */ -- GitLab From 11b918d90aebf87b7d317ec95c17b46716f43d57 Mon Sep 17 00:00:00 2001 From: "Hui.Liu" Date: Thu, 18 Aug 2022 15:50:12 +0800 Subject: [PATCH 0069/2223] pinctrl: mediatek: add mt8188 driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add pinctrl driver support for MediaTek SoC mt8188. Signed-off-by: Hui.Liu Reviewed-by: Nícolas F. R. A. Prado Link: https://lore.kernel.org/r/20220818075012.20880-3-hui.liu@mediatek.com Signed-off-by: Linus Walleij --- drivers/pinctrl/mediatek/Kconfig | 12 + drivers/pinctrl/mediatek/Makefile | 1 + drivers/pinctrl/mediatek/pinctrl-mt8188.c | 1673 ++++++++++++ drivers/pinctrl/mediatek/pinctrl-mtk-mt8188.h | 2259 +++++++++++++++++ 4 files changed, 3945 insertions(+) create mode 100644 drivers/pinctrl/mediatek/pinctrl-mt8188.c create mode 100644 drivers/pinctrl/mediatek/pinctrl-mtk-mt8188.h diff --git a/drivers/pinctrl/mediatek/Kconfig b/drivers/pinctrl/mediatek/Kconfig index 1600a2c18eeef..fed02c6fea062 100644 --- a/drivers/pinctrl/mediatek/Kconfig +++ b/drivers/pinctrl/mediatek/Kconfig @@ -162,6 +162,18 @@ config PINCTRL_MT8186 default ARM64 && ARCH_MEDIATEK select PINCTRL_MTK_PARIS +config PINCTRL_MT8188 + bool "MediaTek MT8188 pin control" + depends on OF + depends on ARM64 || COMPILE_TEST + default ARM64 && ARCH_MEDIATEK + select PINCTRL_MTK_PARIS + help + Say yes here to support pin controller and gpio driver + on MediaTek MT8188 SoC. + In MTK platform, we support virtual gpio and use it to + map specific eint which doesn't have real gpio pin. 
+ config PINCTRL_MT8192 bool "Mediatek MT8192 pin control" depends on OF diff --git a/drivers/pinctrl/mediatek/Makefile b/drivers/pinctrl/mediatek/Makefile index c8f226ae36c94..53265404a39d8 100644 --- a/drivers/pinctrl/mediatek/Makefile +++ b/drivers/pinctrl/mediatek/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_PINCTRL_MT8167) += pinctrl-mt8167.o obj-$(CONFIG_PINCTRL_MT8173) += pinctrl-mt8173.o obj-$(CONFIG_PINCTRL_MT8183) += pinctrl-mt8183.o obj-$(CONFIG_PINCTRL_MT8186) += pinctrl-mt8186.o +obj-$(CONFIG_PINCTRL_MT8188) += pinctrl-mt8188.o obj-$(CONFIG_PINCTRL_MT8192) += pinctrl-mt8192.o obj-$(CONFIG_PINCTRL_MT8195) += pinctrl-mt8195.o obj-$(CONFIG_PINCTRL_MT8365) += pinctrl-mt8365.o diff --git a/drivers/pinctrl/mediatek/pinctrl-mt8188.c b/drivers/pinctrl/mediatek/pinctrl-mt8188.c new file mode 100644 index 0000000000000..d0e75c1b4417a --- /dev/null +++ b/drivers/pinctrl/mediatek/pinctrl-mt8188.c @@ -0,0 +1,1673 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 MediaTek Inc. + * Author: Hui Liu + * + */ + +#include +#include "pinctrl-mtk-mt8188.h" +#include "pinctrl-paris.h" + +/* MT8188 has multiple bases to program pin configuration, listed below: + * iocfg[0]:0x10005000, iocfg[1]:0x11c00000, iocfg[2]:0x11e10000, + * iocfg[3]:0x11e20000, iocfg[4]:0x11ea0000 + * i_base is used to indicate which base the pin should be mapped into.
+ */ + +#define PIN_FIELD_BASE(s_pin, e_pin, i_base, s_addr, x_addrs, s_bit, x_bits) \ + PIN_FIELD_CALC(s_pin, e_pin, i_base, s_addr, x_addrs, s_bit, x_bits, \ + 32, 0) + +#define PINS_FIELD_BASE(s_pin, e_pin, i_base, s_addr, x_addrs, s_bit, x_bits) \ + PIN_FIELD_CALC(s_pin, e_pin, i_base, s_addr, x_addrs, s_bit, x_bits, \ + 32, 1) + +static const struct mtk_pin_field_calc mt8188_pin_mode_range[] = { + PIN_FIELD(0, 177, 0x0300, 0x10, 0, 4), +}; + +static const struct mtk_pin_field_calc mt8188_pin_dir_range[] = { + PIN_FIELD(0, 177, 0x0000, 0x10, 0, 1), +}; + +static const struct mtk_pin_field_calc mt8188_pin_di_range[] = { + PIN_FIELD(0, 177, 0x0200, 0x10, 0, 1), +}; + +static const struct mtk_pin_field_calc mt8188_pin_do_range[] = { + PIN_FIELD(0, 177, 0x0100, 0x10, 0, 1), +}; + +static const struct mtk_pin_field_calc mt8188_pin_smt_range[] = { + PIN_FIELD_BASE(0, 0, 1, 0x0170, 0x10, 8, 1), + PIN_FIELD_BASE(1, 1, 1, 0x0170, 0x10, 9, 1), + PIN_FIELD_BASE(2, 2, 1, 0x0170, 0x10, 10, 1), + PIN_FIELD_BASE(3, 3, 1, 0x0170, 0x10, 11, 1), + PIN_FIELD_BASE(4, 4, 1, 0x0170, 0x10, 18, 1), + PIN_FIELD_BASE(5, 5, 1, 0x0170, 0x10, 18, 1), + PIN_FIELD_BASE(6, 6, 1, 0x0170, 0x10, 18, 1), + PIN_FIELD_BASE(7, 7, 1, 0x0170, 0x10, 12, 1), + PIN_FIELD_BASE(8, 8, 1, 0x0170, 0x10, 13, 1), + PIN_FIELD_BASE(9, 9, 1, 0x0170, 0x10, 14, 1), + PIN_FIELD_BASE(10, 10, 1, 0x0170, 0x10, 15, 1), + PIN_FIELD_BASE(11, 11, 1, 0x0170, 0x10, 19, 1), + PIN_FIELD_BASE(12, 12, 2, 0x0160, 0x10, 12, 1), + PIN_FIELD_BASE(13, 13, 2, 0x0160, 0x10, 13, 1), + PIN_FIELD_BASE(14, 14, 2, 0x0160, 0x10, 14, 1), + PIN_FIELD_BASE(15, 15, 2, 0x0160, 0x10, 15, 1), + PIN_FIELD_BASE(16, 16, 3, 0x00d0, 0x10, 10, 1), + PIN_FIELD_BASE(17, 17, 3, 0x00d0, 0x10, 10, 1), + PIN_FIELD_BASE(18, 18, 4, 0x00e0, 0x10, 9, 1), + PIN_FIELD_BASE(19, 19, 4, 0x00e0, 0x10, 9, 1), + PIN_FIELD_BASE(20, 20, 4, 0x00e0, 0x10, 9, 1), + PIN_FIELD_BASE(21, 21, 4, 0x00e0, 0x10, 9, 1), + PIN_FIELD_BASE(22, 22, 4, 0x00e0, 0x10, 0, 1), + 
PIN_FIELD_BASE(23, 23, 4, 0x00e0, 0x10, 1, 1), + PIN_FIELD_BASE(24, 24, 4, 0x00e0, 0x10, 2, 1), + PIN_FIELD_BASE(25, 25, 1, 0x0170, 0x10, 17, 1), + PIN_FIELD_BASE(26, 26, 1, 0x0170, 0x10, 17, 1), + PIN_FIELD_BASE(27, 27, 1, 0x0170, 0x10, 17, 1), + PIN_FIELD_BASE(28, 28, 1, 0x0170, 0x10, 18, 1), + PIN_FIELD_BASE(29, 29, 1, 0x0170, 0x10, 16, 1), + PIN_FIELD_BASE(30, 30, 1, 0x0170, 0x10, 17, 1), + PIN_FIELD_BASE(31, 31, 1, 0x0170, 0x10, 19, 1), + PIN_FIELD_BASE(32, 32, 1, 0x0170, 0x10, 19, 1), + PIN_FIELD_BASE(33, 33, 1, 0x0170, 0x10, 20, 1), + PIN_FIELD_BASE(34, 34, 1, 0x0170, 0x10, 20, 1), + PIN_FIELD_BASE(35, 35, 1, 0x0170, 0x10, 19, 1), + PIN_FIELD_BASE(36, 36, 1, 0x0170, 0x10, 20, 1), + PIN_FIELD_BASE(37, 37, 1, 0x0170, 0x10, 21, 1), + PIN_FIELD_BASE(38, 38, 1, 0x0170, 0x10, 20, 1), + PIN_FIELD_BASE(39, 39, 1, 0x0170, 0x10, 21, 1), + PIN_FIELD_BASE(40, 40, 1, 0x0170, 0x10, 21, 1), + PIN_FIELD_BASE(41, 41, 1, 0x0170, 0x10, 21, 1), + PIN_FIELD_BASE(42, 42, 2, 0x0160, 0x10, 21, 1), + PIN_FIELD_BASE(43, 43, 2, 0x0160, 0x10, 22, 1), + PIN_FIELD_BASE(44, 44, 2, 0x0160, 0x10, 21, 1), + PIN_FIELD_BASE(45, 45, 2, 0x0160, 0x10, 22, 1), + PIN_FIELD_BASE(46, 46, 3, 0x00d0, 0x10, 10, 1), + PIN_FIELD_BASE(47, 47, 1, 0x0170, 0x10, 16, 1), + PIN_FIELD_BASE(48, 48, 1, 0x0170, 0x10, 16, 1), + PIN_FIELD_BASE(49, 49, 1, 0x0170, 0x10, 16, 1), + PIN_FIELD_BASE(50, 50, 3, 0x00d0, 0x10, 10, 1), + PIN_FIELD_BASE(51, 51, 3, 0x00d0, 0x10, 11, 1), + PIN_FIELD_BASE(52, 52, 3, 0x00d0, 0x10, 11, 1), + PIN_FIELD_BASE(53, 53, 3, 0x00d0, 0x10, 11, 1), + PIN_FIELD_BASE(54, 54, 3, 0x00d0, 0x10, 11, 1), + PIN_FIELD_BASE(55, 55, 1, 0x0170, 0x10, 25, 1), + PIN_FIELD_BASE(56, 56, 1, 0x0170, 0x10, 28, 1), + PIN_FIELD_BASE(57, 57, 2, 0x0160, 0x10, 29, 1), + PIN_FIELD_BASE(58, 58, 2, 0x0160, 0x10, 31, 1), + PIN_FIELD_BASE(59, 59, 1, 0x0170, 0x10, 26, 1), + PIN_FIELD_BASE(60, 60, 1, 0x0170, 0x10, 29, 1), + PIN_FIELD_BASE(61, 61, 1, 0x0170, 0x10, 27, 1), + PIN_FIELD_BASE(62, 62, 1, 0x0170, 0x10, 30, 1), + 
PIN_FIELD_BASE(63, 63, 2, 0x0160, 0x10, 30, 1), + PIN_FIELD_BASE(64, 64, 2, 0x0170, 0x10, 0, 1), + PIN_FIELD_BASE(65, 65, 4, 0x00e0, 0x10, 10, 1), + PIN_FIELD_BASE(66, 66, 4, 0x00e0, 0x10, 12, 1), + PIN_FIELD_BASE(67, 67, 4, 0x00e0, 0x10, 11, 1), + PIN_FIELD_BASE(68, 68, 4, 0x00e0, 0x10, 13, 1), + PIN_FIELD_BASE(69, 69, 1, 0x0180, 0x10, 0, 1), + PIN_FIELD_BASE(70, 70, 1, 0x0170, 0x10, 31, 1), + PIN_FIELD_BASE(71, 71, 1, 0x0180, 0x10, 4, 1), + PIN_FIELD_BASE(72, 72, 1, 0x0180, 0x10, 3, 1), + PIN_FIELD_BASE(73, 73, 1, 0x0180, 0x10, 1, 1), + PIN_FIELD_BASE(74, 74, 1, 0x0180, 0x10, 2, 1), + PIN_FIELD_BASE(75, 75, 1, 0x0180, 0x10, 6, 1), + PIN_FIELD_BASE(76, 76, 1, 0x0180, 0x10, 5, 1), + PIN_FIELD_BASE(77, 77, 1, 0x0180, 0x10, 8, 1), + PIN_FIELD_BASE(78, 78, 1, 0x0180, 0x10, 7, 1), + PIN_FIELD_BASE(79, 79, 4, 0x00e0, 0x10, 15, 1), + PIN_FIELD_BASE(80, 80, 4, 0x00e0, 0x10, 14, 1), + PIN_FIELD_BASE(81, 81, 4, 0x00e0, 0x10, 17, 1), + PIN_FIELD_BASE(82, 82, 4, 0x00e0, 0x10, 16, 1), + PIN_FIELD_BASE(83, 83, 2, 0x0160, 0x10, 26, 1), + PIN_FIELD_BASE(84, 84, 2, 0x0160, 0x10, 26, 1), + PIN_FIELD_BASE(85, 85, 2, 0x0160, 0x10, 27, 1), + PIN_FIELD_BASE(86, 86, 2, 0x0160, 0x10, 17, 1), + PIN_FIELD_BASE(87, 87, 2, 0x0160, 0x10, 17, 1), + PIN_FIELD_BASE(88, 88, 2, 0x0160, 0x10, 17, 1), + PIN_FIELD_BASE(89, 89, 2, 0x0160, 0x10, 17, 1), + PIN_FIELD_BASE(90, 90, 2, 0x0160, 0x10, 27, 1), + PIN_FIELD_BASE(91, 91, 2, 0x0160, 0x10, 27, 1), + PIN_FIELD_BASE(92, 92, 2, 0x0160, 0x10, 18, 1), + PIN_FIELD_BASE(93, 93, 2, 0x0160, 0x10, 18, 1), + PIN_FIELD_BASE(94, 94, 2, 0x0160, 0x10, 18, 1), + PIN_FIELD_BASE(95, 95, 2, 0x0160, 0x10, 18, 1), + PIN_FIELD_BASE(96, 96, 2, 0x0160, 0x10, 22, 1), + PIN_FIELD_BASE(97, 97, 2, 0x0160, 0x10, 23, 1), + PIN_FIELD_BASE(98, 98, 2, 0x0160, 0x10, 24, 1), + PIN_FIELD_BASE(99, 99, 2, 0x0160, 0x10, 22, 1), + PIN_FIELD_BASE(100, 100, 2, 0x0160, 0x10, 16, 1), + PIN_FIELD_BASE(101, 101, 2, 0x0160, 0x10, 23, 1), + PIN_FIELD_BASE(102, 102, 2, 0x0160, 0x10, 23, 1), + 
PIN_FIELD_BASE(103, 103, 2, 0x0160, 0x10, 23, 1), + PIN_FIELD_BASE(104, 104, 2, 0x0160, 0x10, 24, 1), + PIN_FIELD_BASE(105, 105, 2, 0x0160, 0x10, 24, 1), + PIN_FIELD_BASE(106, 106, 2, 0x0160, 0x10, 24, 1), + PIN_FIELD_BASE(107, 107, 2, 0x0160, 0x10, 17, 1), + PIN_FIELD_BASE(108, 108, 2, 0x0160, 0x10, 17, 1), + PIN_FIELD_BASE(109, 109, 2, 0x0160, 0x10, 17, 1), + PIN_FIELD_BASE(110, 110, 2, 0x0160, 0x10, 17, 1), + PIN_FIELD_BASE(111, 111, 2, 0x0160, 0x10, 19, 1), + PIN_FIELD_BASE(112, 112, 2, 0x0160, 0x10, 19, 1), + PIN_FIELD_BASE(113, 113, 2, 0x0160, 0x10, 19, 1), + PIN_FIELD_BASE(114, 114, 2, 0x0160, 0x10, 19, 1), + PIN_FIELD_BASE(115, 115, 2, 0x0160, 0x10, 20, 1), + PIN_FIELD_BASE(116, 116, 2, 0x0160, 0x10, 20, 1), + PIN_FIELD_BASE(117, 117, 2, 0x0160, 0x10, 20, 1), + PIN_FIELD_BASE(118, 118, 2, 0x0160, 0x10, 20, 1), + PIN_FIELD_BASE(119, 119, 2, 0x0160, 0x10, 21, 1), + PIN_FIELD_BASE(120, 120, 2, 0x0160, 0x10, 21, 1), + PIN_FIELD_BASE(121, 121, 3, 0x00d0, 0x10, 6, 1), + PIN_FIELD_BASE(122, 122, 3, 0x00d0, 0x10, 9, 1), + PIN_FIELD_BASE(123, 123, 3, 0x00d0, 0x10, 8, 1), + PIN_FIELD_BASE(124, 124, 3, 0x00d0, 0x10, 7, 1), + PIN_FIELD_BASE(125, 125, 2, 0x0160, 0x10, 25, 1), + PIN_FIELD_BASE(126, 126, 2, 0x0160, 0x10, 25, 1), + PIN_FIELD_BASE(127, 127, 2, 0x0160, 0x10, 25, 1), + PIN_FIELD_BASE(128, 128, 2, 0x0160, 0x10, 25, 1), + PIN_FIELD_BASE(129, 129, 2, 0x0160, 0x10, 26, 1), + PIN_FIELD_BASE(130, 130, 2, 0x0160, 0x10, 26, 1), + PIN_FIELD_BASE(131, 131, 1, 0x0170, 0x10, 0, 1), + PIN_FIELD_BASE(132, 132, 1, 0x0170, 0x10, 1, 1), + PIN_FIELD_BASE(133, 133, 1, 0x0170, 0x10, 6, 1), + PIN_FIELD_BASE(134, 134, 1, 0x0170, 0x10, 7, 1), + PIN_FIELD_BASE(135, 135, 1, 0x0170, 0x10, 22, 1), + PIN_FIELD_BASE(136, 136, 1, 0x0170, 0x10, 22, 1), + PIN_FIELD_BASE(137, 137, 1, 0x0170, 0x10, 22, 1), + PIN_FIELD_BASE(138, 138, 1, 0x0170, 0x10, 22, 1), + PIN_FIELD_BASE(139, 139, 1, 0x0170, 0x10, 23, 1), + PIN_FIELD_BASE(140, 140, 1, 0x0170, 0x10, 23, 1), + PIN_FIELD_BASE(141, 141, 1, 
0x0170, 0x10, 23, 1), + PIN_FIELD_BASE(142, 142, 1, 0x0170, 0x10, 23, 1), + PIN_FIELD_BASE(143, 143, 1, 0x0170, 0x10, 2, 1), + PIN_FIELD_BASE(144, 144, 1, 0x0170, 0x10, 3, 1), + PIN_FIELD_BASE(145, 145, 1, 0x0170, 0x10, 4, 1), + PIN_FIELD_BASE(146, 146, 1, 0x0170, 0x10, 5, 1), + PIN_FIELD_BASE(147, 147, 1, 0x0170, 0x10, 24, 1), + PIN_FIELD_BASE(148, 148, 1, 0x0170, 0x10, 24, 1), + PIN_FIELD_BASE(149, 149, 1, 0x0170, 0x10, 24, 1), + PIN_FIELD_BASE(150, 150, 1, 0x0170, 0x10, 24, 1), + PIN_FIELD_BASE(151, 151, 2, 0x0160, 0x10, 9, 1), + PIN_FIELD_BASE(152, 152, 2, 0x0160, 0x10, 8, 1), + PIN_FIELD_BASE(153, 153, 2, 0x0160, 0x10, 7, 1), + PIN_FIELD_BASE(154, 154, 2, 0x0160, 0x10, 6, 1), + PIN_FIELD_BASE(155, 155, 2, 0x0160, 0x10, 11, 1), + PIN_FIELD_BASE(156, 156, 2, 0x0160, 0x10, 1, 1), + PIN_FIELD_BASE(157, 157, 2, 0x0160, 0x10, 0, 1), + PIN_FIELD_BASE(158, 158, 2, 0x0160, 0x10, 5, 1), + PIN_FIELD_BASE(159, 159, 2, 0x0160, 0x10, 4, 1), + PIN_FIELD_BASE(160, 160, 2, 0x0160, 0x10, 3, 1), + PIN_FIELD_BASE(161, 161, 2, 0x0160, 0x10, 2, 1), + PIN_FIELD_BASE(162, 162, 2, 0x0160, 0x10, 10, 1), + PIN_FIELD_BASE(163, 163, 4, 0x00e0, 0x10, 4, 1), + PIN_FIELD_BASE(164, 164, 4, 0x00e0, 0x10, 3, 1), + PIN_FIELD_BASE(165, 165, 4, 0x00e0, 0x10, 5, 1), + PIN_FIELD_BASE(166, 166, 4, 0x00e0, 0x10, 6, 1), + PIN_FIELD_BASE(167, 167, 4, 0x00e0, 0x10, 7, 1), + PIN_FIELD_BASE(168, 168, 4, 0x00e0, 0x10, 8, 1), + PIN_FIELD_BASE(169, 169, 3, 0x00d0, 0x10, 1, 1), + PIN_FIELD_BASE(170, 170, 3, 0x00d0, 0x10, 0, 1), + PIN_FIELD_BASE(171, 171, 3, 0x00d0, 0x10, 2, 1), + PIN_FIELD_BASE(172, 172, 3, 0x00d0, 0x10, 3, 1), + PIN_FIELD_BASE(173, 173, 3, 0x00d0, 0x10, 4, 1), + PIN_FIELD_BASE(174, 174, 3, 0x00d0, 0x10, 5, 1), + PIN_FIELD_BASE(175, 175, 2, 0x0160, 0x10, 28, 1), + PIN_FIELD_BASE(176, 176, 2, 0x0160, 0x10, 28, 1), +}; + +static const struct mtk_pin_field_calc mt8188_pin_ies_range[] = { + PIN_FIELD_BASE(0, 0, 1, 0x0080, 0x10, 26, 1), + PIN_FIELD_BASE(1, 1, 1, 0x0080, 0x10, 27, 1), + 
PIN_FIELD_BASE(2, 2, 1, 0x0080, 0x10, 28, 1), + PIN_FIELD_BASE(3, 3, 1, 0x0080, 0x10, 29, 1), + PIN_FIELD_BASE(4, 4, 1, 0x0080, 0x10, 30, 1), + PIN_FIELD_BASE(5, 5, 1, 0x0080, 0x10, 31, 1), + PIN_FIELD_BASE(6, 6, 1, 0x0090, 0x10, 0, 1), + PIN_FIELD_BASE(7, 7, 1, 0x0090, 0x10, 1, 1), + PIN_FIELD_BASE(8, 8, 1, 0x0090, 0x10, 2, 1), + PIN_FIELD_BASE(9, 9, 1, 0x0090, 0x10, 3, 1), + PIN_FIELD_BASE(10, 10, 1, 0x0090, 0x10, 4, 1), + PIN_FIELD_BASE(11, 11, 1, 0x0090, 0x10, 5, 1), + PIN_FIELD_BASE(12, 12, 2, 0x0070, 0x10, 24, 1), + PIN_FIELD_BASE(13, 13, 2, 0x0070, 0x10, 25, 1), + PIN_FIELD_BASE(14, 14, 2, 0x0070, 0x10, 26, 1), + PIN_FIELD_BASE(15, 15, 2, 0x0070, 0x10, 27, 1), + PIN_FIELD_BASE(16, 16, 3, 0x0040, 0x10, 1, 1), + PIN_FIELD_BASE(17, 17, 3, 0x0040, 0x10, 2, 1), + PIN_FIELD_BASE(18, 18, 4, 0x0050, 0x10, 3, 1), + PIN_FIELD_BASE(19, 19, 4, 0x0050, 0x10, 5, 1), + PIN_FIELD_BASE(20, 20, 4, 0x0050, 0x10, 4, 1), + PIN_FIELD_BASE(21, 21, 4, 0x0050, 0x10, 6, 1), + PIN_FIELD_BASE(22, 22, 4, 0x0050, 0x10, 0, 1), + PIN_FIELD_BASE(23, 23, 4, 0x0050, 0x10, 1, 1), + PIN_FIELD_BASE(24, 24, 4, 0x0050, 0x10, 2, 1), + PIN_FIELD_BASE(25, 25, 1, 0x0080, 0x10, 23, 1), + PIN_FIELD_BASE(26, 26, 1, 0x0080, 0x10, 22, 1), + PIN_FIELD_BASE(27, 27, 1, 0x0080, 0x10, 25, 1), + PIN_FIELD_BASE(28, 28, 1, 0x0080, 0x10, 24, 1), + PIN_FIELD_BASE(29, 29, 1, 0x0080, 0x10, 0, 1), + PIN_FIELD_BASE(30, 30, 1, 0x0080, 0x10, 1, 1), + PIN_FIELD_BASE(31, 31, 1, 0x0090, 0x10, 31, 1), + PIN_FIELD_BASE(32, 32, 1, 0x0090, 0x10, 30, 1), + PIN_FIELD_BASE(33, 33, 1, 0x00a0, 0x10, 1, 1), + PIN_FIELD_BASE(34, 34, 1, 0x00a0, 0x10, 0, 1), + PIN_FIELD_BASE(35, 35, 1, 0x00a0, 0x10, 3, 1), + PIN_FIELD_BASE(36, 36, 1, 0x00a0, 0x10, 2, 1), + PIN_FIELD_BASE(37, 37, 1, 0x0090, 0x10, 9, 1), + PIN_FIELD_BASE(38, 38, 1, 0x0090, 0x10, 6, 1), + PIN_FIELD_BASE(39, 39, 1, 0x0090, 0x10, 7, 1), + PIN_FIELD_BASE(40, 40, 1, 0x0090, 0x10, 8, 1), + PIN_FIELD_BASE(41, 41, 1, 0x0090, 0x10, 10, 1), + PIN_FIELD_BASE(42, 42, 2, 0x0080, 0x10, 
10, 1), + PIN_FIELD_BASE(43, 43, 2, 0x0080, 0x10, 11, 1), + PIN_FIELD_BASE(44, 44, 2, 0x0080, 0x10, 12, 1), + PIN_FIELD_BASE(45, 45, 2, 0x0080, 0x10, 13, 1), + PIN_FIELD_BASE(46, 46, 3, 0x0040, 0x10, 0, 1), + PIN_FIELD_BASE(47, 47, 1, 0x0090, 0x10, 13, 1), + PIN_FIELD_BASE(48, 48, 1, 0x0090, 0x10, 12, 1), + PIN_FIELD_BASE(49, 49, 1, 0x0090, 0x10, 11, 1), + PIN_FIELD_BASE(50, 50, 3, 0x0040, 0x10, 5, 1), + PIN_FIELD_BASE(51, 51, 3, 0x0040, 0x10, 4, 1), + PIN_FIELD_BASE(52, 52, 3, 0x0040, 0x10, 3, 1), + PIN_FIELD_BASE(53, 53, 3, 0x0040, 0x10, 6, 1), + PIN_FIELD_BASE(54, 54, 3, 0x0040, 0x10, 7, 1), + PIN_FIELD_BASE(55, 55, 1, 0x0090, 0x10, 14, 1), + PIN_FIELD_BASE(56, 56, 1, 0x0090, 0x10, 17, 1), + PIN_FIELD_BASE(57, 57, 2, 0x0080, 0x10, 22, 1), + PIN_FIELD_BASE(58, 58, 2, 0x0080, 0x10, 25, 1), + PIN_FIELD_BASE(59, 59, 1, 0x0090, 0x10, 15, 1), + PIN_FIELD_BASE(60, 60, 1, 0x0090, 0x10, 18, 1), + PIN_FIELD_BASE(61, 61, 1, 0x0090, 0x10, 16, 1), + PIN_FIELD_BASE(62, 62, 1, 0x0090, 0x10, 19, 1), + PIN_FIELD_BASE(63, 63, 2, 0x0080, 0x10, 23, 1), + PIN_FIELD_BASE(64, 64, 2, 0x0080, 0x10, 26, 1), + PIN_FIELD_BASE(65, 65, 4, 0x0050, 0x10, 13, 1), + PIN_FIELD_BASE(66, 66, 4, 0x0050, 0x10, 15, 1), + PIN_FIELD_BASE(67, 67, 4, 0x0050, 0x10, 14, 1), + PIN_FIELD_BASE(68, 68, 4, 0x0050, 0x10, 16, 1), + PIN_FIELD_BASE(69, 69, 1, 0x0090, 0x10, 21, 1), + PIN_FIELD_BASE(70, 70, 1, 0x0090, 0x10, 20, 1), + PIN_FIELD_BASE(71, 71, 1, 0x0090, 0x10, 25, 1), + PIN_FIELD_BASE(72, 72, 1, 0x0090, 0x10, 24, 1), + PIN_FIELD_BASE(73, 73, 1, 0x0090, 0x10, 22, 1), + PIN_FIELD_BASE(74, 74, 1, 0x0090, 0x10, 23, 1), + PIN_FIELD_BASE(75, 75, 1, 0x0090, 0x10, 27, 1), + PIN_FIELD_BASE(76, 76, 1, 0x0090, 0x10, 26, 1), + PIN_FIELD_BASE(77, 77, 1, 0x0090, 0x10, 29, 1), + PIN_FIELD_BASE(78, 78, 1, 0x0090, 0x10, 28, 1), + PIN_FIELD_BASE(79, 79, 4, 0x0050, 0x10, 18, 1), + PIN_FIELD_BASE(80, 80, 4, 0x0050, 0x10, 17, 1), + PIN_FIELD_BASE(81, 81, 4, 0x0050, 0x10, 20, 1), + PIN_FIELD_BASE(82, 82, 4, 0x0050, 0x10, 19, 
1), + PIN_FIELD_BASE(83, 83, 2, 0x0080, 0x10, 30, 1), + PIN_FIELD_BASE(84, 84, 2, 0x0080, 0x10, 29, 1), + PIN_FIELD_BASE(85, 85, 2, 0x0080, 0x10, 31, 1), + PIN_FIELD_BASE(86, 86, 2, 0x0090, 0x10, 1, 1), + PIN_FIELD_BASE(87, 87, 2, 0x0090, 0x10, 0, 1), + PIN_FIELD_BASE(88, 88, 2, 0x0090, 0x10, 2, 1), + PIN_FIELD_BASE(89, 89, 2, 0x0090, 0x10, 4, 1), + PIN_FIELD_BASE(90, 90, 2, 0x0090, 0x10, 3, 1), + PIN_FIELD_BASE(91, 91, 2, 0x0090, 0x10, 5, 1), + PIN_FIELD_BASE(92, 92, 2, 0x0080, 0x10, 19, 1), + PIN_FIELD_BASE(93, 93, 2, 0x0080, 0x10, 18, 1), + PIN_FIELD_BASE(94, 94, 2, 0x0080, 0x10, 21, 1), + PIN_FIELD_BASE(95, 95, 2, 0x0080, 0x10, 20, 1), + PIN_FIELD_BASE(96, 96, 2, 0x0080, 0x10, 15, 1), + PIN_FIELD_BASE(97, 97, 2, 0x0080, 0x10, 16, 1), + PIN_FIELD_BASE(98, 98, 2, 0x0080, 0x10, 24, 1), + PIN_FIELD_BASE(99, 99, 2, 0x0080, 0x10, 14, 1), + PIN_FIELD_BASE(100, 100, 2, 0x0080, 0x10, 17, 1), + PIN_FIELD_BASE(101, 101, 2, 0x0070, 0x10, 0, 1), + PIN_FIELD_BASE(102, 102, 2, 0x0070, 0x10, 5, 1), + PIN_FIELD_BASE(103, 103, 2, 0x0070, 0x10, 3, 1), + PIN_FIELD_BASE(104, 104, 2, 0x0070, 0x10, 4, 1), + PIN_FIELD_BASE(105, 105, 2, 0x0070, 0x10, 1, 1), + PIN_FIELD_BASE(106, 106, 2, 0x0070, 0x10, 2, 1), + PIN_FIELD_BASE(107, 107, 2, 0x0080, 0x10, 1, 1), + PIN_FIELD_BASE(108, 108, 2, 0x0070, 0x10, 28, 1), + PIN_FIELD_BASE(109, 109, 2, 0x0080, 0x10, 2, 1), + PIN_FIELD_BASE(110, 110, 2, 0x0070, 0x10, 29, 1), + PIN_FIELD_BASE(111, 111, 2, 0x0070, 0x10, 30, 1), + PIN_FIELD_BASE(112, 112, 2, 0x0070, 0x10, 31, 1), + PIN_FIELD_BASE(113, 113, 2, 0x0080, 0x10, 0, 1), + PIN_FIELD_BASE(114, 114, 2, 0x0080, 0x10, 8, 1), + PIN_FIELD_BASE(115, 115, 2, 0x0080, 0x10, 3, 1), + PIN_FIELD_BASE(116, 116, 2, 0x0080, 0x10, 9, 1), + PIN_FIELD_BASE(117, 117, 2, 0x0080, 0x10, 4, 1), + PIN_FIELD_BASE(118, 118, 2, 0x0080, 0x10, 5, 1), + PIN_FIELD_BASE(119, 119, 2, 0x0080, 0x10, 6, 1), + PIN_FIELD_BASE(120, 120, 2, 0x0080, 0x10, 7, 1), + PIN_FIELD_BASE(121, 121, 3, 0x0040, 0x10, 14, 1), + PIN_FIELD_BASE(122, 
122, 3, 0x0040, 0x10, 17, 1), + PIN_FIELD_BASE(123, 123, 3, 0x0040, 0x10, 16, 1), + PIN_FIELD_BASE(124, 124, 3, 0x0040, 0x10, 15, 1), + PIN_FIELD_BASE(125, 125, 2, 0x0070, 0x10, 6, 1), + PIN_FIELD_BASE(126, 126, 2, 0x0070, 0x10, 7, 1), + PIN_FIELD_BASE(127, 127, 2, 0x0070, 0x10, 8, 1), + PIN_FIELD_BASE(128, 128, 2, 0x0070, 0x10, 9, 1), + PIN_FIELD_BASE(129, 129, 2, 0x0070, 0x10, 10, 1), + PIN_FIELD_BASE(130, 130, 2, 0x0070, 0x10, 11, 1), + PIN_FIELD_BASE(131, 131, 1, 0x0080, 0x10, 3, 1), + PIN_FIELD_BASE(132, 132, 1, 0x0080, 0x10, 4, 1), + PIN_FIELD_BASE(133, 133, 1, 0x0080, 0x10, 11, 1), + PIN_FIELD_BASE(134, 134, 1, 0x0080, 0x10, 12, 1), + PIN_FIELD_BASE(135, 135, 1, 0x0080, 0x10, 13, 1), + PIN_FIELD_BASE(136, 136, 1, 0x0080, 0x10, 14, 1), + PIN_FIELD_BASE(137, 137, 1, 0x0080, 0x10, 15, 1), + PIN_FIELD_BASE(138, 138, 1, 0x0080, 0x10, 16, 1), + PIN_FIELD_BASE(139, 139, 1, 0x0080, 0x10, 17, 1), + PIN_FIELD_BASE(140, 140, 1, 0x0080, 0x10, 18, 1), + PIN_FIELD_BASE(141, 141, 1, 0x0080, 0x10, 5, 1), + PIN_FIELD_BASE(142, 142, 1, 0x0080, 0x10, 6, 1), + PIN_FIELD_BASE(143, 143, 1, 0x0080, 0x10, 7, 1), + PIN_FIELD_BASE(144, 144, 1, 0x0080, 0x10, 8, 1), + PIN_FIELD_BASE(145, 145, 1, 0x0080, 0x10, 9, 1), + PIN_FIELD_BASE(146, 146, 1, 0x0080, 0x10, 10, 1), + PIN_FIELD_BASE(147, 147, 1, 0x0080, 0x10, 20, 1), + PIN_FIELD_BASE(148, 148, 1, 0x0080, 0x10, 21, 1), + PIN_FIELD_BASE(149, 149, 1, 0x0080, 0x10, 19, 1), + PIN_FIELD_BASE(150, 150, 1, 0x0080, 0x10, 2, 1), + PIN_FIELD_BASE(151, 151, 2, 0x0070, 0x10, 21, 1), + PIN_FIELD_BASE(152, 152, 2, 0x0070, 0x10, 20, 1), + PIN_FIELD_BASE(153, 153, 2, 0x0070, 0x10, 19, 1), + PIN_FIELD_BASE(154, 154, 2, 0x0070, 0x10, 18, 1), + PIN_FIELD_BASE(155, 155, 2, 0x0070, 0x10, 23, 1), + PIN_FIELD_BASE(156, 156, 2, 0x0070, 0x10, 13, 1), + PIN_FIELD_BASE(157, 157, 2, 0x0070, 0x10, 12, 1), + PIN_FIELD_BASE(158, 158, 2, 0x0070, 0x10, 17, 1), + PIN_FIELD_BASE(159, 159, 2, 0x0070, 0x10, 16, 1), + PIN_FIELD_BASE(160, 160, 2, 0x0070, 0x10, 15, 1), + 
PIN_FIELD_BASE(161, 161, 2, 0x0070, 0x10, 14, 1), + PIN_FIELD_BASE(162, 162, 2, 0x0070, 0x10, 22, 1), + PIN_FIELD_BASE(163, 163, 4, 0x0050, 0x10, 8, 1), + PIN_FIELD_BASE(164, 164, 4, 0x0050, 0x10, 7, 1), + PIN_FIELD_BASE(165, 165, 4, 0x0050, 0x10, 9, 1), + PIN_FIELD_BASE(166, 166, 4, 0x0050, 0x10, 10, 1), + PIN_FIELD_BASE(167, 167, 4, 0x0050, 0x10, 11, 1), + PIN_FIELD_BASE(168, 168, 4, 0x0050, 0x10, 12, 1), + PIN_FIELD_BASE(169, 169, 3, 0x0040, 0x10, 9, 1), + PIN_FIELD_BASE(170, 170, 3, 0x0040, 0x10, 8, 1), + PIN_FIELD_BASE(171, 171, 3, 0x0040, 0x10, 10, 1), + PIN_FIELD_BASE(172, 172, 3, 0x0040, 0x10, 11, 1), + PIN_FIELD_BASE(173, 173, 3, 0x0040, 0x10, 12, 1), + PIN_FIELD_BASE(174, 174, 3, 0x0040, 0x10, 13, 1), + PIN_FIELD_BASE(175, 175, 2, 0x0080, 0x10, 27, 1), + PIN_FIELD_BASE(176, 176, 2, 0x0080, 0x10, 28, 1), +}; + +static const struct mtk_pin_field_calc mt8188_pin_tdsel_range[] = { + PIN_FIELD_BASE(0, 0, 1, 0x01b0, 0x10, 0, 4), + PIN_FIELD_BASE(1, 1, 1, 0x01b0, 0x10, 4, 4), + PIN_FIELD_BASE(2, 2, 1, 0x01b0, 0x10, 8, 4), + PIN_FIELD_BASE(3, 3, 1, 0x01b0, 0x10, 12, 4), + PIN_FIELD_BASE(4, 4, 1, 0x01c0, 0x10, 16, 4), + PIN_FIELD_BASE(5, 5, 1, 0x01c0, 0x10, 20, 4), + PIN_FIELD_BASE(6, 6, 1, 0x01c0, 0x10, 20, 4), + PIN_FIELD_BASE(7, 7, 1, 0x01b0, 0x10, 16, 4), + PIN_FIELD_BASE(8, 8, 1, 0x01b0, 0x10, 20, 4), + PIN_FIELD_BASE(9, 9, 1, 0x01b0, 0x10, 24, 4), + PIN_FIELD_BASE(10, 10, 1, 0x01b0, 0x10, 28, 4), + PIN_FIELD_BASE(11, 11, 1, 0x01c0, 0x10, 20, 4), + PIN_FIELD_BASE(12, 12, 2, 0x0190, 0x10, 16, 4), + PIN_FIELD_BASE(13, 13, 2, 0x0190, 0x10, 20, 4), + PIN_FIELD_BASE(14, 14, 2, 0x0190, 0x10, 24, 4), + PIN_FIELD_BASE(15, 15, 2, 0x0190, 0x10, 28, 4), + PIN_FIELD_BASE(16, 16, 3, 0x0100, 0x10, 8, 4), + PIN_FIELD_BASE(17, 17, 3, 0x0100, 0x10, 8, 4), + PIN_FIELD_BASE(18, 18, 4, 0x0110, 0x10, 4, 4), + PIN_FIELD_BASE(19, 19, 4, 0x0110, 0x10, 8, 4), + PIN_FIELD_BASE(20, 20, 4, 0x0110, 0x10, 8, 4), + PIN_FIELD_BASE(21, 21, 4, 0x0110, 0x10, 8, 4), + PIN_FIELD_BASE(22, 22, 4, 
0x0100, 0x10, 0, 4), + PIN_FIELD_BASE(23, 23, 4, 0x0100, 0x10, 4, 4), + PIN_FIELD_BASE(24, 24, 4, 0x0100, 0x10, 8, 4), + PIN_FIELD_BASE(25, 25, 1, 0x01c0, 0x10, 8, 4), + PIN_FIELD_BASE(26, 26, 1, 0x01c0, 0x10, 8, 4), + PIN_FIELD_BASE(27, 27, 1, 0x01c0, 0x10, 8, 4), + PIN_FIELD_BASE(28, 28, 1, 0x01c0, 0x10, 12, 4), + PIN_FIELD_BASE(29, 29, 1, 0x01c0, 0x10, 0, 4), + PIN_FIELD_BASE(30, 30, 1, 0x01c0, 0x10, 8, 4), + PIN_FIELD_BASE(31, 31, 1, 0x01c0, 0x10, 20, 4), + PIN_FIELD_BASE(32, 32, 1, 0x01c0, 0x10, 24, 4), + PIN_FIELD_BASE(33, 33, 1, 0x01c0, 0x10, 24, 4), + PIN_FIELD_BASE(34, 34, 1, 0x01c0, 0x10, 28, 4), + PIN_FIELD_BASE(35, 35, 1, 0x01c0, 0x10, 24, 4), + PIN_FIELD_BASE(36, 36, 1, 0x01c0, 0x10, 24, 4), + PIN_FIELD_BASE(37, 37, 1, 0x01c0, 0x10, 28, 4), + PIN_FIELD_BASE(38, 38, 1, 0x01c0, 0x10, 28, 4), + PIN_FIELD_BASE(39, 39, 1, 0x01c0, 0x10, 28, 4), + PIN_FIELD_BASE(40, 40, 1, 0x01d0, 0x10, 0, 4), + PIN_FIELD_BASE(41, 41, 1, 0x01d0, 0x10, 0, 4), + PIN_FIELD_BASE(42, 42, 2, 0x01a0, 0x10, 16, 4), + PIN_FIELD_BASE(43, 43, 2, 0x01a0, 0x10, 20, 4), + PIN_FIELD_BASE(44, 44, 2, 0x01a0, 0x10, 16, 4), + PIN_FIELD_BASE(45, 45, 2, 0x01a0, 0x10, 20, 4), + PIN_FIELD_BASE(46, 46, 3, 0x0100, 0x10, 8, 4), + PIN_FIELD_BASE(47, 47, 1, 0x01c0, 0x10, 0, 4), + PIN_FIELD_BASE(48, 48, 1, 0x01c0, 0x10, 0, 4), + PIN_FIELD_BASE(49, 49, 1, 0x01c0, 0x10, 0, 4), + PIN_FIELD_BASE(50, 50, 3, 0x0100, 0x10, 8, 4), + PIN_FIELD_BASE(51, 51, 3, 0x0100, 0x10, 12, 4), + PIN_FIELD_BASE(52, 52, 3, 0x0100, 0x10, 12, 4), + PIN_FIELD_BASE(53, 53, 3, 0x0100, 0x10, 12, 4), + PIN_FIELD_BASE(54, 54, 3, 0x0100, 0x10, 12, 4), + PIN_FIELD_BASE(55, 55, 1, 0x01c0, 0x10, 12, 4), + PIN_FIELD_BASE(56, 56, 1, 0x01c0, 0x10, 12, 4), + PIN_FIELD_BASE(57, 57, 2, 0x01a0, 0x10, 24, 4), + PIN_FIELD_BASE(58, 58, 2, 0x01a0, 0x10, 24, 4), + PIN_FIELD_BASE(59, 59, 1, 0x01c0, 0x10, 16, 4), + PIN_FIELD_BASE(60, 60, 1, 0x01c0, 0x10, 12, 4), + PIN_FIELD_BASE(61, 61, 1, 0x01c0, 0x10, 16, 4), + PIN_FIELD_BASE(62, 62, 1, 0x01c0, 0x10, 
16, 4), + PIN_FIELD_BASE(63, 63, 2, 0x01a0, 0x10, 20, 4), + PIN_FIELD_BASE(64, 64, 2, 0x01a0, 0x10, 20, 4), + PIN_FIELD_BASE(65, 65, 4, 0x0110, 0x10, 12, 4), + PIN_FIELD_BASE(66, 66, 4, 0x0110, 0x10, 8, 4), + PIN_FIELD_BASE(67, 67, 4, 0x0110, 0x10, 12, 4), + PIN_FIELD_BASE(68, 68, 4, 0x0110, 0x10, 12, 4), + PIN_FIELD_BASE(69, 69, 1, 0x01d0, 0x10, 16, 4), + PIN_FIELD_BASE(70, 70, 1, 0x01d0, 0x10, 12, 4), + PIN_FIELD_BASE(71, 71, 1, 0x01e0, 0x10, 0, 4), + PIN_FIELD_BASE(72, 72, 1, 0x01d0, 0x10, 28, 4), + PIN_FIELD_BASE(73, 73, 1, 0x01d0, 0x10, 20, 4), + PIN_FIELD_BASE(74, 74, 1, 0x01d0, 0x10, 24, 4), + PIN_FIELD_BASE(75, 75, 1, 0x01e0, 0x10, 8, 4), + PIN_FIELD_BASE(76, 76, 1, 0x01e0, 0x10, 4, 4), + PIN_FIELD_BASE(77, 77, 1, 0x01e0, 0x10, 16, 4), + PIN_FIELD_BASE(78, 78, 1, 0x01e0, 0x10, 12, 4), + PIN_FIELD_BASE(79, 79, 4, 0x0110, 0x10, 20, 4), + PIN_FIELD_BASE(80, 80, 4, 0x0110, 0x10, 16, 4), + PIN_FIELD_BASE(81, 81, 4, 0x0110, 0x10, 28, 4), + PIN_FIELD_BASE(82, 82, 4, 0x0110, 0x10, 24, 4), + PIN_FIELD_BASE(83, 83, 2, 0x01b0, 0x10, 8, 4), + PIN_FIELD_BASE(84, 84, 2, 0x01b0, 0x10, 8, 4), + PIN_FIELD_BASE(85, 85, 2, 0x01b0, 0x10, 12, 4), + PIN_FIELD_BASE(86, 86, 2, 0x01a0, 0x10, 0, 4), + PIN_FIELD_BASE(87, 87, 2, 0x01a0, 0x10, 0, 4), + PIN_FIELD_BASE(88, 88, 2, 0x01a0, 0x10, 0, 4), + PIN_FIELD_BASE(89, 89, 2, 0x01a0, 0x10, 0, 4), + PIN_FIELD_BASE(90, 90, 2, 0x01b0, 0x10, 12, 4), + PIN_FIELD_BASE(91, 91, 2, 0x01b0, 0x10, 12, 4), + PIN_FIELD_BASE(92, 92, 2, 0x01a0, 0x10, 4, 4), + PIN_FIELD_BASE(93, 93, 2, 0x01a0, 0x10, 4, 4), + PIN_FIELD_BASE(94, 94, 2, 0x01a0, 0x10, 4, 4), + PIN_FIELD_BASE(95, 95, 2, 0x01a0, 0x10, 4, 4), + PIN_FIELD_BASE(96, 96, 2, 0x01a0, 0x10, 24, 4), + PIN_FIELD_BASE(97, 97, 2, 0x01a0, 0x10, 28, 4), + PIN_FIELD_BASE(98, 98, 2, 0x01b0, 0x10, 0, 4), + PIN_FIELD_BASE(99, 99, 2, 0x01a0, 0x10, 24, 4), + PIN_FIELD_BASE(100, 100, 2, 0x01b0, 0x10, 20, 4), + PIN_FIELD_BASE(101, 101, 2, 0x01a0, 0x10, 28, 4), + PIN_FIELD_BASE(102, 102, 2, 0x01a0, 0x10, 28, 4), 
+ PIN_FIELD_BASE(103, 103, 2, 0x01a0, 0x10, 28, 4), + PIN_FIELD_BASE(104, 104, 2, 0x01b0, 0x10, 0, 4), + PIN_FIELD_BASE(105, 105, 2, 0x01b0, 0x10, 0, 4), + PIN_FIELD_BASE(106, 106, 2, 0x01b0, 0x10, 0, 4), + PIN_FIELD_BASE(107, 107, 2, 0x01a0, 0x10, 0, 4), + PIN_FIELD_BASE(108, 108, 2, 0x01a0, 0x10, 0, 4), + PIN_FIELD_BASE(109, 109, 2, 0x01a0, 0x10, 0, 4), + PIN_FIELD_BASE(110, 110, 2, 0x01a0, 0x10, 0, 4), + PIN_FIELD_BASE(111, 111, 2, 0x01a0, 0x10, 8, 4), + PIN_FIELD_BASE(112, 112, 2, 0x01a0, 0x10, 8, 4), + PIN_FIELD_BASE(113, 113, 2, 0x01a0, 0x10, 8, 4), + PIN_FIELD_BASE(114, 114, 2, 0x01a0, 0x10, 8, 4), + PIN_FIELD_BASE(115, 115, 2, 0x01a0, 0x10, 12, 4), + PIN_FIELD_BASE(116, 116, 2, 0x01a0, 0x10, 12, 4), + PIN_FIELD_BASE(117, 117, 2, 0x01a0, 0x10, 12, 4), + PIN_FIELD_BASE(118, 118, 2, 0x01a0, 0x10, 12, 4), + PIN_FIELD_BASE(119, 119, 2, 0x01a0, 0x10, 16, 4), + PIN_FIELD_BASE(120, 120, 2, 0x01a0, 0x10, 16, 4), + PIN_FIELD_BASE(121, 121, 3, 0x00f0, 0x10, 24, 4), + PIN_FIELD_BASE(122, 122, 3, 0x0100, 0x10, 4, 4), + PIN_FIELD_BASE(123, 123, 3, 0x0100, 0x10, 0, 4), + PIN_FIELD_BASE(124, 124, 3, 0x00f0, 0x10, 28, 4), + PIN_FIELD_BASE(125, 125, 2, 0x01b0, 0x10, 4, 4), + PIN_FIELD_BASE(126, 126, 2, 0x01b0, 0x10, 4, 4), + PIN_FIELD_BASE(127, 127, 2, 0x01b0, 0x10, 4, 4), + PIN_FIELD_BASE(128, 128, 2, 0x01b0, 0x10, 4, 4), + PIN_FIELD_BASE(129, 129, 2, 0x01b0, 0x10, 8, 4), + PIN_FIELD_BASE(130, 130, 2, 0x01b0, 0x10, 8, 4), + PIN_FIELD_BASE(131, 131, 1, 0x01a0, 0x10, 0, 4), + PIN_FIELD_BASE(132, 132, 1, 0x01a0, 0x10, 20, 4), + PIN_FIELD_BASE(133, 133, 1, 0x01a0, 0x10, 24, 4), + PIN_FIELD_BASE(134, 134, 1, 0x01a0, 0x10, 28, 4), + PIN_FIELD_BASE(135, 135, 1, 0x01d0, 0x10, 0, 4), + PIN_FIELD_BASE(136, 136, 1, 0x01d0, 0x10, 0, 4), + PIN_FIELD_BASE(137, 137, 1, 0x01d0, 0x10, 4, 4), + PIN_FIELD_BASE(138, 138, 1, 0x01d0, 0x10, 4, 4), + PIN_FIELD_BASE(139, 139, 1, 0x01d0, 0x10, 4, 4), + PIN_FIELD_BASE(140, 140, 1, 0x01d0, 0x10, 4, 4), + PIN_FIELD_BASE(141, 141, 1, 0x01d0, 0x10, 8, 
4), + PIN_FIELD_BASE(142, 142, 1, 0x01d0, 0x10, 8, 4), + PIN_FIELD_BASE(143, 143, 1, 0x01a0, 0x10, 4, 4), + PIN_FIELD_BASE(144, 144, 1, 0x01a0, 0x10, 8, 4), + PIN_FIELD_BASE(145, 145, 1, 0x01a0, 0x10, 12, 4), + PIN_FIELD_BASE(146, 146, 1, 0x01a0, 0x10, 16, 4), + PIN_FIELD_BASE(147, 147, 1, 0x01d0, 0x10, 8, 4), + PIN_FIELD_BASE(148, 148, 1, 0x01d0, 0x10, 8, 4), + PIN_FIELD_BASE(149, 149, 1, 0x01c0, 0x10, 4, 4), + PIN_FIELD_BASE(150, 150, 1, 0x01c0, 0x10, 4, 4), + PIN_FIELD_BASE(151, 151, 2, 0x0190, 0x10, 4, 4), + PIN_FIELD_BASE(152, 152, 2, 0x0190, 0x10, 0, 4), + PIN_FIELD_BASE(153, 153, 2, 0x0180, 0x10, 28, 4), + PIN_FIELD_BASE(154, 154, 2, 0x0180, 0x10, 24, 4), + PIN_FIELD_BASE(155, 155, 2, 0x0190, 0x10, 12, 4), + PIN_FIELD_BASE(156, 156, 2, 0x0180, 0x10, 4, 4), + PIN_FIELD_BASE(157, 157, 2, 0x0180, 0x10, 0, 4), + PIN_FIELD_BASE(158, 158, 2, 0x0180, 0x10, 20, 4), + PIN_FIELD_BASE(159, 159, 2, 0x0180, 0x10, 16, 4), + PIN_FIELD_BASE(160, 160, 2, 0x0180, 0x10, 12, 4), + PIN_FIELD_BASE(161, 161, 2, 0x0180, 0x10, 8, 4), + PIN_FIELD_BASE(162, 162, 2, 0x0190, 0x10, 8, 4), + PIN_FIELD_BASE(163, 163, 4, 0x0100, 0x10, 16, 4), + PIN_FIELD_BASE(164, 164, 4, 0x0100, 0x10, 12, 4), + PIN_FIELD_BASE(165, 165, 4, 0x0100, 0x10, 20, 4), + PIN_FIELD_BASE(166, 166, 4, 0x0100, 0x10, 24, 4), + PIN_FIELD_BASE(167, 167, 4, 0x0100, 0x10, 28, 4), + PIN_FIELD_BASE(168, 168, 4, 0x0110, 0x10, 0, 4), + PIN_FIELD_BASE(169, 169, 3, 0x00f0, 0x10, 4, 4), + PIN_FIELD_BASE(170, 170, 3, 0x00f0, 0x10, 0, 4), + PIN_FIELD_BASE(171, 171, 3, 0x00f0, 0x10, 8, 4), + PIN_FIELD_BASE(172, 172, 3, 0x00f0, 0x10, 12, 4), + PIN_FIELD_BASE(173, 173, 3, 0x00f0, 0x10, 16, 4), + PIN_FIELD_BASE(174, 174, 3, 0x00f0, 0x10, 20, 4), + PIN_FIELD_BASE(175, 175, 2, 0x01b0, 0x10, 16, 4), + PIN_FIELD_BASE(176, 176, 2, 0x01b0, 0x10, 16, 4), +}; + +static const struct mtk_pin_field_calc mt8188_pin_rdsel_range[] = { + PIN_FIELD_BASE(0, 0, 1, 0x0130, 0x10, 18, 2), + PIN_FIELD_BASE(1, 1, 1, 0x0130, 0x10, 20, 2), + PIN_FIELD_BASE(2, 
2, 1, 0x0130, 0x10, 22, 2), + PIN_FIELD_BASE(3, 3, 1, 0x0130, 0x10, 24, 2), + PIN_FIELD_BASE(4, 4, 1, 0x0140, 0x10, 14, 2), + PIN_FIELD_BASE(5, 5, 1, 0x0140, 0x10, 16, 2), + PIN_FIELD_BASE(6, 6, 1, 0x0140, 0x10, 16, 2), + PIN_FIELD_BASE(7, 7, 1, 0x0130, 0x10, 26, 2), + PIN_FIELD_BASE(8, 8, 1, 0x0130, 0x10, 28, 2), + PIN_FIELD_BASE(9, 9, 1, 0x0130, 0x10, 30, 2), + PIN_FIELD_BASE(10, 10, 1, 0x0140, 0x10, 0, 2), + PIN_FIELD_BASE(11, 11, 1, 0x0140, 0x10, 16, 2), + PIN_FIELD_BASE(12, 12, 2, 0x0130, 0x10, 12, 2), + PIN_FIELD_BASE(13, 13, 2, 0x0130, 0x10, 14, 2), + PIN_FIELD_BASE(14, 14, 2, 0x0130, 0x10, 16, 2), + PIN_FIELD_BASE(15, 15, 2, 0x0130, 0x10, 18, 2), + PIN_FIELD_BASE(16, 16, 3, 0x00b0, 0x10, 14, 2), + PIN_FIELD_BASE(17, 17, 3, 0x00b0, 0x10, 14, 2), + PIN_FIELD_BASE(18, 18, 4, 0x00c0, 0x10, 12, 2), + PIN_FIELD_BASE(19, 19, 4, 0x00c0, 0x10, 12, 2), + PIN_FIELD_BASE(20, 20, 4, 0x00c0, 0x10, 12, 2), + PIN_FIELD_BASE(21, 21, 4, 0x00c0, 0x10, 12, 2), + PIN_FIELD_BASE(22, 22, 4, 0x00b0, 0x10, 0, 2), + PIN_FIELD_BASE(23, 23, 4, 0x00b0, 0x10, 2, 2), + PIN_FIELD_BASE(24, 24, 4, 0x00b0, 0x10, 4, 2), + PIN_FIELD_BASE(25, 25, 1, 0x0140, 0x10, 10, 2), + PIN_FIELD_BASE(26, 26, 1, 0x0140, 0x10, 10, 2), + PIN_FIELD_BASE(27, 27, 1, 0x0140, 0x10, 10, 2), + PIN_FIELD_BASE(28, 28, 1, 0x0140, 0x10, 12, 2), + PIN_FIELD_BASE(29, 29, 1, 0x0140, 0x10, 2, 2), + PIN_FIELD_BASE(30, 30, 1, 0x0140, 0x10, 10, 2), + PIN_FIELD_BASE(31, 31, 1, 0x0140, 0x10, 16, 2), + PIN_FIELD_BASE(32, 32, 1, 0x0140, 0x10, 18, 2), + PIN_FIELD_BASE(33, 33, 1, 0x0140, 0x10, 18, 2), + PIN_FIELD_BASE(34, 34, 1, 0x0140, 0x10, 20, 2), + PIN_FIELD_BASE(35, 35, 1, 0x0140, 0x10, 18, 2), + PIN_FIELD_BASE(36, 36, 1, 0x0140, 0x10, 18, 2), + PIN_FIELD_BASE(37, 37, 1, 0x0140, 0x10, 20, 2), + PIN_FIELD_BASE(38, 38, 1, 0x0140, 0x10, 20, 2), + PIN_FIELD_BASE(39, 39, 1, 0x0140, 0x10, 20, 2), + PIN_FIELD_BASE(40, 40, 1, 0x0140, 0x10, 22, 2), + PIN_FIELD_BASE(41, 41, 1, 0x0140, 0x10, 22, 2), + PIN_FIELD_BASE(42, 42, 2, 0x0130, 
0x10, 30, 2), + PIN_FIELD_BASE(43, 43, 2, 0x0140, 0x10, 0, 2), + PIN_FIELD_BASE(44, 44, 2, 0x0130, 0x10, 30, 2), + PIN_FIELD_BASE(45, 45, 2, 0x0140, 0x10, 0, 2), + PIN_FIELD_BASE(46, 46, 3, 0x00b0, 0x10, 14, 2), + PIN_FIELD_BASE(47, 47, 1, 0x0140, 0x10, 2, 2), + PIN_FIELD_BASE(48, 48, 1, 0x0140, 0x10, 2, 2), + PIN_FIELD_BASE(49, 49, 1, 0x0140, 0x10, 2, 2), + PIN_FIELD_BASE(50, 50, 3, 0x00b0, 0x10, 14, 2), + PIN_FIELD_BASE(51, 51, 3, 0x00b0, 0x10, 16, 2), + PIN_FIELD_BASE(52, 52, 3, 0x00b0, 0x10, 16, 2), + PIN_FIELD_BASE(53, 53, 3, 0x00b0, 0x10, 16, 2), + PIN_FIELD_BASE(54, 54, 3, 0x00b0, 0x10, 16, 2), + PIN_FIELD_BASE(55, 55, 1, 0x0140, 0x10, 12, 2), + PIN_FIELD_BASE(56, 56, 1, 0x0140, 0x10, 12, 2), + PIN_FIELD_BASE(57, 57, 2, 0x0140, 0x10, 2, 2), + PIN_FIELD_BASE(58, 58, 2, 0x0140, 0x10, 2, 2), + PIN_FIELD_BASE(59, 59, 1, 0x0140, 0x10, 14, 2), + PIN_FIELD_BASE(60, 60, 1, 0x0140, 0x10, 12, 2), + PIN_FIELD_BASE(61, 61, 1, 0x0140, 0x10, 14, 2), + PIN_FIELD_BASE(62, 62, 1, 0x0140, 0x10, 14, 2), + PIN_FIELD_BASE(63, 63, 2, 0x0140, 0x10, 0, 2), + PIN_FIELD_BASE(64, 64, 2, 0x0140, 0x10, 0, 2), + PIN_FIELD_BASE(65, 65, 4, 0x00c0, 0x10, 14, 2), + PIN_FIELD_BASE(66, 66, 4, 0x00c0, 0x10, 14, 2), + PIN_FIELD_BASE(67, 67, 4, 0x00c0, 0x10, 14, 2), + PIN_FIELD_BASE(68, 68, 4, 0x00c0, 0x10, 14, 2), + PIN_FIELD_BASE(69, 69, 1, 0x0150, 0x10, 14, 2), + PIN_FIELD_BASE(70, 70, 1, 0x0150, 0x10, 12, 2), + PIN_FIELD_BASE(71, 71, 1, 0x0150, 0x10, 22, 2), + PIN_FIELD_BASE(72, 72, 1, 0x0150, 0x10, 20, 2), + PIN_FIELD_BASE(73, 73, 1, 0x0150, 0x10, 16, 2), + PIN_FIELD_BASE(74, 74, 1, 0x0150, 0x10, 18, 2), + PIN_FIELD_BASE(75, 75, 1, 0x0150, 0x10, 26, 2), + PIN_FIELD_BASE(76, 76, 1, 0x0150, 0x10, 24, 2), + PIN_FIELD_BASE(77, 77, 1, 0x0150, 0x10, 30, 2), + PIN_FIELD_BASE(78, 78, 1, 0x0150, 0x10, 28, 2), + PIN_FIELD_BASE(79, 79, 4, 0x00c0, 0x10, 18, 2), + PIN_FIELD_BASE(80, 80, 4, 0x00c0, 0x10, 16, 2), + PIN_FIELD_BASE(81, 81, 4, 0x00c0, 0x10, 22, 2), + PIN_FIELD_BASE(82, 82, 4, 0x00c0, 0x10, 
20, 2), + PIN_FIELD_BASE(83, 83, 2, 0x0140, 0x10, 10, 2), + PIN_FIELD_BASE(84, 84, 2, 0x0140, 0x10, 10, 2), + PIN_FIELD_BASE(85, 85, 2, 0x0140, 0x10, 12, 2), + PIN_FIELD_BASE(86, 86, 2, 0x0130, 0x10, 20, 2), + PIN_FIELD_BASE(87, 87, 2, 0x0130, 0x10, 20, 2), + PIN_FIELD_BASE(88, 88, 2, 0x0130, 0x10, 20, 2), + PIN_FIELD_BASE(89, 89, 2, 0x0130, 0x10, 20, 2), + PIN_FIELD_BASE(90, 90, 2, 0x0140, 0x10, 12, 2), + PIN_FIELD_BASE(91, 91, 2, 0x0140, 0x10, 12, 2), + PIN_FIELD_BASE(92, 92, 2, 0x0130, 0x10, 22, 2), + PIN_FIELD_BASE(93, 93, 2, 0x0130, 0x10, 22, 2), + PIN_FIELD_BASE(94, 94, 2, 0x0130, 0x10, 22, 2), + PIN_FIELD_BASE(95, 95, 2, 0x0130, 0x10, 22, 2), + PIN_FIELD_BASE(96, 96, 2, 0x0140, 0x10, 2, 2), + PIN_FIELD_BASE(97, 97, 2, 0x0140, 0x10, 4, 2), + PIN_FIELD_BASE(98, 98, 2, 0x0140, 0x10, 6, 2), + PIN_FIELD_BASE(99, 99, 2, 0x0140, 0x10, 2, 2), + PIN_FIELD_BASE(100, 100, 2, 0x0140, 0x10, 16, 2), + PIN_FIELD_BASE(101, 101, 2, 0x0140, 0x10, 4, 2), + PIN_FIELD_BASE(102, 102, 2, 0x0140, 0x10, 4, 2), + PIN_FIELD_BASE(103, 103, 2, 0x0140, 0x10, 4, 2), + PIN_FIELD_BASE(104, 104, 2, 0x0140, 0x10, 6, 2), + PIN_FIELD_BASE(105, 105, 2, 0x0140, 0x10, 6, 2), + PIN_FIELD_BASE(106, 106, 2, 0x0140, 0x10, 6, 2), + PIN_FIELD_BASE(107, 107, 2, 0x0130, 0x10, 20, 2), + PIN_FIELD_BASE(108, 108, 2, 0x0130, 0x10, 20, 2), + PIN_FIELD_BASE(109, 109, 2, 0x0130, 0x10, 20, 2), + PIN_FIELD_BASE(110, 110, 2, 0x0130, 0x10, 20, 2), + PIN_FIELD_BASE(111, 111, 2, 0x0130, 0x10, 24, 2), + PIN_FIELD_BASE(112, 112, 2, 0x0130, 0x10, 24, 2), + PIN_FIELD_BASE(113, 113, 2, 0x0130, 0x10, 24, 2), + PIN_FIELD_BASE(114, 114, 2, 0x0130, 0x10, 24, 2), + PIN_FIELD_BASE(115, 115, 2, 0x0130, 0x10, 28, 2), + PIN_FIELD_BASE(116, 116, 2, 0x0130, 0x10, 28, 2), + PIN_FIELD_BASE(117, 117, 2, 0x0130, 0x10, 28, 2), + PIN_FIELD_BASE(118, 118, 2, 0x0130, 0x10, 28, 2), + PIN_FIELD_BASE(119, 119, 2, 0x0130, 0x10, 30, 2), + PIN_FIELD_BASE(120, 120, 2, 0x0130, 0x10, 30, 2), + PIN_FIELD_BASE(121, 121, 3, 0x00b0, 0x10, 6, 2), + 
PIN_FIELD_BASE(122, 122, 3, 0x00b0, 0x10, 12, 2), + PIN_FIELD_BASE(123, 123, 3, 0x00b0, 0x10, 10, 2), + PIN_FIELD_BASE(124, 124, 3, 0x00b0, 0x10, 8, 2), + PIN_FIELD_BASE(125, 125, 2, 0x0140, 0x10, 8, 2), + PIN_FIELD_BASE(126, 126, 2, 0x0140, 0x10, 8, 2), + PIN_FIELD_BASE(127, 127, 2, 0x0140, 0x10, 8, 2), + PIN_FIELD_BASE(128, 128, 2, 0x0140, 0x10, 8, 2), + PIN_FIELD_BASE(129, 129, 2, 0x0140, 0x10, 10, 2), + PIN_FIELD_BASE(130, 130, 2, 0x0140, 0x10, 10, 2), + PIN_FIELD_BASE(131, 131, 1, 0x0120, 0x10, 0, 6), + PIN_FIELD_BASE(132, 132, 1, 0x0130, 0x10, 0, 6), + PIN_FIELD_BASE(133, 133, 1, 0x0130, 0x10, 6, 6), + PIN_FIELD_BASE(134, 134, 1, 0x0130, 0x10, 12, 6), + PIN_FIELD_BASE(135, 135, 1, 0x0140, 0x10, 24, 6), + PIN_FIELD_BASE(136, 136, 1, 0x0140, 0x10, 24, 6), + PIN_FIELD_BASE(137, 137, 1, 0x0150, 0x10, 0, 6), + PIN_FIELD_BASE(138, 138, 1, 0x0150, 0x10, 0, 6), + PIN_FIELD_BASE(139, 139, 1, 0x0150, 0x10, 0, 6), + PIN_FIELD_BASE(140, 140, 1, 0x0150, 0x10, 0, 6), + PIN_FIELD_BASE(141, 141, 1, 0x0150, 0x10, 6, 6), + PIN_FIELD_BASE(142, 142, 1, 0x0150, 0x10, 6, 6), + PIN_FIELD_BASE(143, 143, 1, 0x0120, 0x10, 6, 6), + PIN_FIELD_BASE(144, 144, 1, 0x0120, 0x10, 12, 6), + PIN_FIELD_BASE(145, 145, 1, 0x0120, 0x10, 18, 6), + PIN_FIELD_BASE(146, 146, 1, 0x0120, 0x10, 24, 6), + PIN_FIELD_BASE(147, 147, 1, 0x0150, 0x10, 6, 6), + PIN_FIELD_BASE(148, 148, 1, 0x0150, 0x10, 6, 6), + PIN_FIELD_BASE(149, 149, 1, 0x0140, 0x10, 4, 6), + PIN_FIELD_BASE(150, 150, 1, 0x0140, 0x10, 4, 6), + PIN_FIELD_BASE(151, 151, 2, 0x0120, 0x10, 24, 6), + PIN_FIELD_BASE(152, 152, 2, 0x0120, 0x10, 18, 6), + PIN_FIELD_BASE(153, 153, 2, 0x0120, 0x10, 12, 6), + PIN_FIELD_BASE(154, 154, 2, 0x0120, 0x10, 6, 6), + PIN_FIELD_BASE(155, 155, 2, 0x0130, 0x10, 6, 6), + PIN_FIELD_BASE(156, 156, 2, 0x0110, 0x10, 6, 6), + PIN_FIELD_BASE(157, 157, 2, 0x0110, 0x10, 0, 6), + PIN_FIELD_BASE(158, 158, 2, 0x0120, 0x10, 0, 6), + PIN_FIELD_BASE(159, 159, 2, 0x0110, 0x10, 24, 6), + PIN_FIELD_BASE(160, 160, 2, 0x0110, 0x10, 18, 
6), + PIN_FIELD_BASE(161, 161, 2, 0x0110, 0x10, 12, 6), + PIN_FIELD_BASE(162, 162, 2, 0x0130, 0x10, 0, 6), + PIN_FIELD_BASE(163, 163, 4, 0x00b0, 0x10, 12, 6), + PIN_FIELD_BASE(164, 164, 4, 0x00b0, 0x10, 6, 6), + PIN_FIELD_BASE(165, 165, 4, 0x00b0, 0x10, 18, 6), + PIN_FIELD_BASE(166, 166, 4, 0x00b0, 0x10, 24, 6), + PIN_FIELD_BASE(167, 167, 4, 0x00c0, 0x10, 0, 6), + PIN_FIELD_BASE(168, 168, 4, 0x00c0, 0x10, 6, 6), + PIN_FIELD_BASE(169, 169, 3, 0x00a0, 0x10, 6, 6), + PIN_FIELD_BASE(170, 170, 3, 0x00a0, 0x10, 0, 6), + PIN_FIELD_BASE(171, 171, 3, 0x00a0, 0x10, 12, 6), + PIN_FIELD_BASE(172, 172, 3, 0x00a0, 0x10, 18, 6), + PIN_FIELD_BASE(173, 173, 3, 0x00a0, 0x10, 24, 6), + PIN_FIELD_BASE(174, 174, 3, 0x00b0, 0x10, 0, 6), + PIN_FIELD_BASE(175, 175, 2, 0x0140, 0x10, 14, 2), + PIN_FIELD_BASE(176, 176, 2, 0x0140, 0x10, 14, 2), +}; + +static const struct mtk_pin_field_calc mt8188_pin_pupd_range[] = { + PIN_FIELD_BASE(42, 42, 2, 0x00c0, 0x10, 12, 1), + PIN_FIELD_BASE(43, 43, 2, 0x00c0, 0x10, 13, 1), + PIN_FIELD_BASE(44, 44, 2, 0x00c0, 0x10, 14, 1), + PIN_FIELD_BASE(45, 45, 2, 0x00c0, 0x10, 15, 1), + PIN_FIELD_BASE(131, 131, 1, 0x00d0, 0x10, 1, 1), + PIN_FIELD_BASE(132, 132, 1, 0x00d0, 0x10, 2, 1), + PIN_FIELD_BASE(133, 133, 1, 0x00d0, 0x10, 9, 1), + PIN_FIELD_BASE(134, 134, 1, 0x00d0, 0x10, 10, 1), + PIN_FIELD_BASE(135, 135, 1, 0x00d0, 0x10, 11, 1), + PIN_FIELD_BASE(136, 136, 1, 0x00d0, 0x10, 12, 1), + PIN_FIELD_BASE(137, 137, 1, 0x00d0, 0x10, 13, 1), + PIN_FIELD_BASE(138, 138, 1, 0x00d0, 0x10, 14, 1), + PIN_FIELD_BASE(139, 139, 1, 0x00d0, 0x10, 15, 1), + PIN_FIELD_BASE(140, 140, 1, 0x00d0, 0x10, 16, 1), + PIN_FIELD_BASE(141, 141, 1, 0x00d0, 0x10, 3, 1), + PIN_FIELD_BASE(142, 142, 1, 0x00d0, 0x10, 4, 1), + PIN_FIELD_BASE(143, 143, 1, 0x00d0, 0x10, 5, 1), + PIN_FIELD_BASE(144, 144, 1, 0x00d0, 0x10, 6, 1), + PIN_FIELD_BASE(145, 145, 1, 0x00d0, 0x10, 7, 1), + PIN_FIELD_BASE(146, 146, 1, 0x00d0, 0x10, 8, 1), + PIN_FIELD_BASE(147, 147, 1, 0x00d0, 0x10, 18, 1), + PIN_FIELD_BASE(148, 
148, 1, 0x00d0, 0x10, 19, 1), + PIN_FIELD_BASE(149, 149, 1, 0x00d0, 0x10, 17, 1), + PIN_FIELD_BASE(150, 150, 1, 0x00d0, 0x10, 0, 1), + PIN_FIELD_BASE(151, 151, 2, 0x00c0, 0x10, 9, 1), + PIN_FIELD_BASE(152, 152, 2, 0x00c0, 0x10, 8, 1), + PIN_FIELD_BASE(153, 153, 2, 0x00c0, 0x10, 7, 1), + PIN_FIELD_BASE(154, 154, 2, 0x00c0, 0x10, 6, 1), + PIN_FIELD_BASE(155, 155, 2, 0x00c0, 0x10, 11, 1), + PIN_FIELD_BASE(156, 156, 2, 0x00c0, 0x10, 1, 1), + PIN_FIELD_BASE(157, 157, 2, 0x00c0, 0x10, 0, 1), + PIN_FIELD_BASE(158, 158, 2, 0x00c0, 0x10, 5, 1), + PIN_FIELD_BASE(159, 159, 2, 0x00c0, 0x10, 4, 1), + PIN_FIELD_BASE(160, 160, 2, 0x00c0, 0x10, 3, 1), + PIN_FIELD_BASE(161, 161, 2, 0x00c0, 0x10, 2, 1), + PIN_FIELD_BASE(162, 162, 2, 0x00c0, 0x10, 10, 1), + PIN_FIELD_BASE(163, 163, 4, 0x0070, 0x10, 1, 1), + PIN_FIELD_BASE(164, 164, 4, 0x0070, 0x10, 0, 1), + PIN_FIELD_BASE(165, 165, 4, 0x0070, 0x10, 2, 1), + PIN_FIELD_BASE(166, 166, 4, 0x0070, 0x10, 3, 1), + PIN_FIELD_BASE(167, 167, 4, 0x0070, 0x10, 4, 1), + PIN_FIELD_BASE(168, 168, 4, 0x0070, 0x10, 5, 1), + PIN_FIELD_BASE(169, 169, 3, 0x0060, 0x10, 1, 1), + PIN_FIELD_BASE(170, 170, 3, 0x0060, 0x10, 0, 1), + PIN_FIELD_BASE(171, 171, 3, 0x0060, 0x10, 2, 1), + PIN_FIELD_BASE(172, 172, 3, 0x0060, 0x10, 3, 1), + PIN_FIELD_BASE(173, 173, 3, 0x0060, 0x10, 4, 1), + PIN_FIELD_BASE(174, 174, 3, 0x0060, 0x10, 5, 1), +}; + +static const struct mtk_pin_field_calc mt8188_pin_r0_range[] = { + PIN_FIELD_BASE(42, 42, 2, 0x00f0, 0x10, 12, 1), + PIN_FIELD_BASE(43, 43, 2, 0x00f0, 0x10, 13, 1), + PIN_FIELD_BASE(44, 44, 2, 0x00f0, 0x10, 14, 1), + PIN_FIELD_BASE(45, 45, 2, 0x00f0, 0x10, 15, 1), + PIN_FIELD_BASE(131, 131, 1, 0x0100, 0x10, 1, 1), + PIN_FIELD_BASE(132, 132, 1, 0x0100, 0x10, 2, 1), + PIN_FIELD_BASE(133, 133, 1, 0x0100, 0x10, 9, 1), + PIN_FIELD_BASE(134, 134, 1, 0x0100, 0x10, 10, 1), + PIN_FIELD_BASE(135, 135, 1, 0x0100, 0x10, 11, 1), + PIN_FIELD_BASE(136, 136, 1, 0x0100, 0x10, 12, 1), + PIN_FIELD_BASE(137, 137, 1, 0x0100, 0x10, 13, 1), + 
PIN_FIELD_BASE(138, 138, 1, 0x0100, 0x10, 14, 1), + PIN_FIELD_BASE(139, 139, 1, 0x0100, 0x10, 15, 1), + PIN_FIELD_BASE(140, 140, 1, 0x0100, 0x10, 16, 1), + PIN_FIELD_BASE(141, 141, 1, 0x0100, 0x10, 3, 1), + PIN_FIELD_BASE(142, 142, 1, 0x0100, 0x10, 4, 1), + PIN_FIELD_BASE(143, 143, 1, 0x0100, 0x10, 5, 1), + PIN_FIELD_BASE(144, 144, 1, 0x0100, 0x10, 6, 1), + PIN_FIELD_BASE(145, 145, 1, 0x0100, 0x10, 7, 1), + PIN_FIELD_BASE(146, 146, 1, 0x0100, 0x10, 8, 1), + PIN_FIELD_BASE(147, 147, 1, 0x0100, 0x10, 18, 1), + PIN_FIELD_BASE(148, 148, 1, 0x0100, 0x10, 19, 1), + PIN_FIELD_BASE(149, 149, 1, 0x0100, 0x10, 17, 1), + PIN_FIELD_BASE(150, 150, 1, 0x0100, 0x10, 0, 1), + PIN_FIELD_BASE(151, 151, 2, 0x00f0, 0x10, 9, 1), + PIN_FIELD_BASE(152, 152, 2, 0x00f0, 0x10, 8, 1), + PIN_FIELD_BASE(153, 153, 2, 0x00f0, 0x10, 7, 1), + PIN_FIELD_BASE(154, 154, 2, 0x00f0, 0x10, 6, 1), + PIN_FIELD_BASE(155, 155, 2, 0x00f0, 0x10, 11, 1), + PIN_FIELD_BASE(156, 156, 2, 0x00f0, 0x10, 1, 1), + PIN_FIELD_BASE(157, 157, 2, 0x00f0, 0x10, 0, 1), + PIN_FIELD_BASE(158, 158, 2, 0x00f0, 0x10, 5, 1), + PIN_FIELD_BASE(159, 159, 2, 0x00f0, 0x10, 4, 1), + PIN_FIELD_BASE(160, 160, 2, 0x00f0, 0x10, 3, 1), + PIN_FIELD_BASE(161, 161, 2, 0x00f0, 0x10, 2, 1), + PIN_FIELD_BASE(162, 162, 2, 0x00f0, 0x10, 10, 1), + PIN_FIELD_BASE(163, 163, 4, 0x0090, 0x10, 1, 1), + PIN_FIELD_BASE(164, 164, 4, 0x0090, 0x10, 0, 1), + PIN_FIELD_BASE(165, 165, 4, 0x0090, 0x10, 2, 1), + PIN_FIELD_BASE(166, 166, 4, 0x0090, 0x10, 3, 1), + PIN_FIELD_BASE(167, 167, 4, 0x0090, 0x10, 4, 1), + PIN_FIELD_BASE(168, 168, 4, 0x0090, 0x10, 5, 1), + PIN_FIELD_BASE(169, 169, 3, 0x0080, 0x10, 1, 1), + PIN_FIELD_BASE(170, 170, 3, 0x0080, 0x10, 0, 1), + PIN_FIELD_BASE(171, 171, 3, 0x0080, 0x10, 2, 1), + PIN_FIELD_BASE(172, 172, 3, 0x0080, 0x10, 3, 1), + PIN_FIELD_BASE(173, 173, 3, 0x0080, 0x10, 4, 1), + PIN_FIELD_BASE(174, 174, 3, 0x0080, 0x10, 5, 1), +}; + +static const struct mtk_pin_field_calc mt8188_pin_r1_range[] = { + PIN_FIELD_BASE(42, 42, 2, 
0x0100, 0x10, 12, 1), + PIN_FIELD_BASE(43, 43, 2, 0x0100, 0x10, 13, 1), + PIN_FIELD_BASE(44, 44, 2, 0x0100, 0x10, 14, 1), + PIN_FIELD_BASE(45, 45, 2, 0x0100, 0x10, 15, 1), + PIN_FIELD_BASE(131, 131, 1, 0x0110, 0x10, 1, 1), + PIN_FIELD_BASE(132, 132, 1, 0x0110, 0x10, 2, 1), + PIN_FIELD_BASE(133, 133, 1, 0x0110, 0x10, 9, 1), + PIN_FIELD_BASE(134, 134, 1, 0x0110, 0x10, 10, 1), + PIN_FIELD_BASE(135, 135, 1, 0x0110, 0x10, 11, 1), + PIN_FIELD_BASE(136, 136, 1, 0x0110, 0x10, 12, 1), + PIN_FIELD_BASE(137, 137, 1, 0x0110, 0x10, 13, 1), + PIN_FIELD_BASE(138, 138, 1, 0x0110, 0x10, 14, 1), + PIN_FIELD_BASE(139, 139, 1, 0x0110, 0x10, 15, 1), + PIN_FIELD_BASE(140, 140, 1, 0x0110, 0x10, 16, 1), + PIN_FIELD_BASE(141, 141, 1, 0x0110, 0x10, 3, 1), + PIN_FIELD_BASE(142, 142, 1, 0x0110, 0x10, 4, 1), + PIN_FIELD_BASE(143, 143, 1, 0x0110, 0x10, 5, 1), + PIN_FIELD_BASE(144, 144, 1, 0x0110, 0x10, 6, 1), + PIN_FIELD_BASE(145, 145, 1, 0x0110, 0x10, 7, 1), + PIN_FIELD_BASE(146, 146, 1, 0x0110, 0x10, 8, 1), + PIN_FIELD_BASE(147, 147, 1, 0x0110, 0x10, 18, 1), + PIN_FIELD_BASE(148, 148, 1, 0x0110, 0x10, 19, 1), + PIN_FIELD_BASE(149, 149, 1, 0x0110, 0x10, 17, 1), + PIN_FIELD_BASE(150, 150, 1, 0x0110, 0x10, 0, 1), + PIN_FIELD_BASE(151, 151, 2, 0x0100, 0x10, 9, 1), + PIN_FIELD_BASE(152, 152, 2, 0x0100, 0x10, 8, 1), + PIN_FIELD_BASE(153, 153, 2, 0x0100, 0x10, 7, 1), + PIN_FIELD_BASE(154, 154, 2, 0x0100, 0x10, 6, 1), + PIN_FIELD_BASE(155, 155, 2, 0x0100, 0x10, 11, 1), + PIN_FIELD_BASE(156, 156, 2, 0x0100, 0x10, 1, 1), + PIN_FIELD_BASE(157, 157, 2, 0x0100, 0x10, 0, 1), + PIN_FIELD_BASE(158, 158, 2, 0x0100, 0x10, 5, 1), + PIN_FIELD_BASE(159, 159, 2, 0x0100, 0x10, 4, 1), + PIN_FIELD_BASE(160, 160, 2, 0x0100, 0x10, 3, 1), + PIN_FIELD_BASE(161, 161, 2, 0x0100, 0x10, 2, 1), + PIN_FIELD_BASE(162, 162, 2, 0x0100, 0x10, 10, 1), + PIN_FIELD_BASE(163, 163, 4, 0x00a0, 0x10, 1, 1), + PIN_FIELD_BASE(164, 164, 4, 0x00a0, 0x10, 0, 1), + PIN_FIELD_BASE(165, 165, 4, 0x00a0, 0x10, 2, 1), + PIN_FIELD_BASE(166, 166, 4, 
0x00a0, 0x10, 3, 1), + PIN_FIELD_BASE(167, 167, 4, 0x00a0, 0x10, 4, 1), + PIN_FIELD_BASE(168, 168, 4, 0x00a0, 0x10, 5, 1), + PIN_FIELD_BASE(169, 169, 3, 0x0090, 0x10, 1, 1), + PIN_FIELD_BASE(170, 170, 3, 0x0090, 0x10, 0, 1), + PIN_FIELD_BASE(171, 171, 3, 0x0090, 0x10, 2, 1), + PIN_FIELD_BASE(172, 172, 3, 0x0090, 0x10, 3, 1), + PIN_FIELD_BASE(173, 173, 3, 0x0090, 0x10, 4, 1), + PIN_FIELD_BASE(174, 174, 3, 0x0090, 0x10, 5, 1), +}; + +static const struct mtk_pin_field_calc mt8188_pin_pu_range[] = { + PIN_FIELD_BASE(0, 0, 1, 0x00e0, 0x10, 6, 1), + PIN_FIELD_BASE(1, 1, 1, 0x00e0, 0x10, 7, 1), + PIN_FIELD_BASE(2, 2, 1, 0x00e0, 0x10, 8, 1), + PIN_FIELD_BASE(3, 3, 1, 0x00e0, 0x10, 9, 1), + PIN_FIELD_BASE(4, 4, 1, 0x00e0, 0x10, 10, 1), + PIN_FIELD_BASE(5, 5, 1, 0x00e0, 0x10, 11, 1), + PIN_FIELD_BASE(6, 6, 1, 0x00e0, 0x10, 12, 1), + PIN_FIELD_BASE(7, 7, 1, 0x00e0, 0x10, 13, 1), + PIN_FIELD_BASE(8, 8, 1, 0x00e0, 0x10, 14, 1), + PIN_FIELD_BASE(9, 9, 1, 0x00e0, 0x10, 15, 1), + PIN_FIELD_BASE(10, 10, 1, 0x00e0, 0x10, 16, 1), + PIN_FIELD_BASE(11, 11, 1, 0x00e0, 0x10, 17, 1), + PIN_FIELD_BASE(12, 12, 2, 0x00d0, 0x10, 12, 1), + PIN_FIELD_BASE(13, 13, 2, 0x00d0, 0x10, 13, 1), + PIN_FIELD_BASE(14, 14, 2, 0x00d0, 0x10, 14, 1), + PIN_FIELD_BASE(15, 15, 2, 0x00d0, 0x10, 15, 1), + PIN_FIELD_BASE(16, 16, 3, 0x0070, 0x10, 1, 1), + PIN_FIELD_BASE(17, 17, 3, 0x0070, 0x10, 2, 1), + PIN_FIELD_BASE(18, 18, 4, 0x0080, 0x10, 3, 1), + PIN_FIELD_BASE(19, 19, 4, 0x0080, 0x10, 5, 1), + PIN_FIELD_BASE(20, 20, 4, 0x0080, 0x10, 4, 1), + PIN_FIELD_BASE(21, 21, 4, 0x0080, 0x10, 6, 1), + PIN_FIELD_BASE(22, 22, 4, 0x0080, 0x10, 0, 1), + PIN_FIELD_BASE(23, 23, 4, 0x0080, 0x10, 1, 1), + PIN_FIELD_BASE(24, 24, 4, 0x0080, 0x10, 2, 1), + PIN_FIELD_BASE(25, 25, 1, 0x00e0, 0x10, 3, 1), + PIN_FIELD_BASE(26, 26, 1, 0x00e0, 0x10, 2, 1), + PIN_FIELD_BASE(27, 27, 1, 0x00e0, 0x10, 5, 1), + PIN_FIELD_BASE(28, 28, 1, 0x00e0, 0x10, 4, 1), + PIN_FIELD_BASE(29, 29, 1, 0x00e0, 0x10, 0, 1), + PIN_FIELD_BASE(30, 30, 1, 0x00e0, 
0x10, 1, 1), + PIN_FIELD_BASE(31, 31, 1, 0x00f0, 0x10, 11, 1), + PIN_FIELD_BASE(32, 32, 1, 0x00f0, 0x10, 10, 1), + PIN_FIELD_BASE(33, 33, 1, 0x00f0, 0x10, 13, 1), + PIN_FIELD_BASE(34, 34, 1, 0x00f0, 0x10, 12, 1), + PIN_FIELD_BASE(35, 35, 1, 0x00f0, 0x10, 15, 1), + PIN_FIELD_BASE(36, 36, 1, 0x00f0, 0x10, 14, 1), + PIN_FIELD_BASE(37, 37, 1, 0x00e0, 0x10, 21, 1), + PIN_FIELD_BASE(38, 38, 1, 0x00e0, 0x10, 18, 1), + PIN_FIELD_BASE(39, 39, 1, 0x00e0, 0x10, 19, 1), + PIN_FIELD_BASE(40, 40, 1, 0x00e0, 0x10, 20, 1), + PIN_FIELD_BASE(41, 41, 1, 0x00e0, 0x10, 22, 1), + PIN_FIELD_BASE(46, 46, 3, 0x0070, 0x10, 0, 1), + PIN_FIELD_BASE(47, 47, 1, 0x00e0, 0x10, 25, 1), + PIN_FIELD_BASE(48, 48, 1, 0x00e0, 0x10, 24, 1), + PIN_FIELD_BASE(49, 49, 1, 0x00e0, 0x10, 23, 1), + PIN_FIELD_BASE(50, 50, 3, 0x0070, 0x10, 5, 1), + PIN_FIELD_BASE(51, 51, 3, 0x0070, 0x10, 4, 1), + PIN_FIELD_BASE(52, 52, 3, 0x0070, 0x10, 3, 1), + PIN_FIELD_BASE(53, 53, 3, 0x0070, 0x10, 6, 1), + PIN_FIELD_BASE(54, 54, 3, 0x0070, 0x10, 7, 1), + PIN_FIELD_BASE(55, 55, 1, 0x00e0, 0x10, 26, 1), + PIN_FIELD_BASE(56, 56, 1, 0x00e0, 0x10, 29, 1), + PIN_FIELD_BASE(57, 57, 2, 0x00e0, 0x10, 6, 1), + PIN_FIELD_BASE(58, 58, 2, 0x00e0, 0x10, 9, 1), + PIN_FIELD_BASE(59, 59, 1, 0x00e0, 0x10, 27, 1), + PIN_FIELD_BASE(60, 60, 1, 0x00e0, 0x10, 30, 1), + PIN_FIELD_BASE(61, 61, 1, 0x00e0, 0x10, 28, 1), + PIN_FIELD_BASE(62, 62, 1, 0x00e0, 0x10, 31, 1), + PIN_FIELD_BASE(63, 63, 2, 0x00e0, 0x10, 7, 1), + PIN_FIELD_BASE(64, 64, 2, 0x00e0, 0x10, 10, 1), + PIN_FIELD_BASE(65, 65, 4, 0x0080, 0x10, 7, 1), + PIN_FIELD_BASE(66, 66, 4, 0x0080, 0x10, 9, 1), + PIN_FIELD_BASE(67, 67, 4, 0x0080, 0x10, 8, 1), + PIN_FIELD_BASE(68, 68, 4, 0x0080, 0x10, 10, 1), + PIN_FIELD_BASE(69, 69, 1, 0x00f0, 0x10, 1, 1), + PIN_FIELD_BASE(70, 70, 1, 0x00f0, 0x10, 0, 1), + PIN_FIELD_BASE(71, 71, 1, 0x00f0, 0x10, 5, 1), + PIN_FIELD_BASE(72, 72, 1, 0x00f0, 0x10, 4, 1), + PIN_FIELD_BASE(73, 73, 1, 0x00f0, 0x10, 2, 1), + PIN_FIELD_BASE(74, 74, 1, 0x00f0, 0x10, 3, 1), + 
PIN_FIELD_BASE(75, 75, 1, 0x00f0, 0x10, 7, 1), + PIN_FIELD_BASE(76, 76, 1, 0x00f0, 0x10, 6, 1), + PIN_FIELD_BASE(77, 77, 1, 0x00f0, 0x10, 9, 1), + PIN_FIELD_BASE(78, 78, 1, 0x00f0, 0x10, 8, 1), + PIN_FIELD_BASE(79, 79, 4, 0x0080, 0x10, 12, 1), + PIN_FIELD_BASE(80, 80, 4, 0x0080, 0x10, 11, 1), + PIN_FIELD_BASE(81, 81, 4, 0x0080, 0x10, 14, 1), + PIN_FIELD_BASE(82, 82, 4, 0x0080, 0x10, 13, 1), + PIN_FIELD_BASE(83, 83, 2, 0x00e0, 0x10, 16, 1), + PIN_FIELD_BASE(84, 84, 2, 0x00e0, 0x10, 15, 1), + PIN_FIELD_BASE(85, 85, 2, 0x00e0, 0x10, 17, 1), + PIN_FIELD_BASE(86, 86, 2, 0x00e0, 0x10, 19, 1), + PIN_FIELD_BASE(87, 87, 2, 0x00e0, 0x10, 18, 1), + PIN_FIELD_BASE(88, 88, 2, 0x00e0, 0x10, 20, 1), + PIN_FIELD_BASE(89, 89, 2, 0x00e0, 0x10, 22, 1), + PIN_FIELD_BASE(90, 90, 2, 0x00e0, 0x10, 21, 1), + PIN_FIELD_BASE(91, 91, 2, 0x00e0, 0x10, 23, 1), + PIN_FIELD_BASE(92, 92, 2, 0x00e0, 0x10, 3, 1), + PIN_FIELD_BASE(93, 93, 2, 0x00e0, 0x10, 2, 1), + PIN_FIELD_BASE(94, 94, 2, 0x00e0, 0x10, 5, 1), + PIN_FIELD_BASE(95, 95, 2, 0x00e0, 0x10, 4, 1), + PIN_FIELD_BASE(96, 96, 2, 0x00d0, 0x10, 31, 1), + PIN_FIELD_BASE(97, 97, 2, 0x00e0, 0x10, 0, 1), + PIN_FIELD_BASE(98, 98, 2, 0x00e0, 0x10, 8, 1), + PIN_FIELD_BASE(99, 99, 2, 0x00d0, 0x10, 30, 1), + PIN_FIELD_BASE(100, 100, 2, 0x00e0, 0x10, 1, 1), + PIN_FIELD_BASE(101, 101, 2, 0x00d0, 0x10, 0, 1), + PIN_FIELD_BASE(102, 102, 2, 0x00d0, 0x10, 5, 1), + PIN_FIELD_BASE(103, 103, 2, 0x00d0, 0x10, 3, 1), + PIN_FIELD_BASE(104, 104, 2, 0x00d0, 0x10, 4, 1), + PIN_FIELD_BASE(105, 105, 2, 0x00d0, 0x10, 1, 1), + PIN_FIELD_BASE(106, 106, 2, 0x00d0, 0x10, 2, 1), + PIN_FIELD_BASE(107, 107, 2, 0x00d0, 0x10, 21, 1), + PIN_FIELD_BASE(108, 108, 2, 0x00d0, 0x10, 16, 1), + PIN_FIELD_BASE(109, 109, 2, 0x00d0, 0x10, 22, 1), + PIN_FIELD_BASE(110, 110, 2, 0x00d0, 0x10, 17, 1), + PIN_FIELD_BASE(111, 111, 2, 0x00d0, 0x10, 18, 1), + PIN_FIELD_BASE(112, 112, 2, 0x00d0, 0x10, 19, 1), + PIN_FIELD_BASE(113, 113, 2, 0x00d0, 0x10, 20, 1), + PIN_FIELD_BASE(114, 114, 2, 0x00d0, 
0x10, 28, 1), + PIN_FIELD_BASE(115, 115, 2, 0x00d0, 0x10, 23, 1), + PIN_FIELD_BASE(116, 116, 2, 0x00d0, 0x10, 29, 1), + PIN_FIELD_BASE(117, 117, 2, 0x00d0, 0x10, 24, 1), + PIN_FIELD_BASE(118, 118, 2, 0x00d0, 0x10, 25, 1), + PIN_FIELD_BASE(119, 119, 2, 0x00d0, 0x10, 26, 1), + PIN_FIELD_BASE(120, 120, 2, 0x00d0, 0x10, 27, 1), + PIN_FIELD_BASE(121, 121, 3, 0x0070, 0x10, 8, 1), + PIN_FIELD_BASE(122, 122, 3, 0x0070, 0x10, 11, 1), + PIN_FIELD_BASE(123, 123, 3, 0x0070, 0x10, 10, 1), + PIN_FIELD_BASE(124, 124, 3, 0x0070, 0x10, 9, 1), + PIN_FIELD_BASE(125, 125, 2, 0x00d0, 0x10, 6, 1), + PIN_FIELD_BASE(126, 126, 2, 0x00d0, 0x10, 7, 1), + PIN_FIELD_BASE(127, 127, 2, 0x00d0, 0x10, 8, 1), + PIN_FIELD_BASE(128, 128, 2, 0x00d0, 0x10, 9, 1), + PIN_FIELD_BASE(129, 129, 2, 0x00d0, 0x10, 10, 1), + PIN_FIELD_BASE(130, 130, 2, 0x00d0, 0x10, 11, 1), + PIN_FIELD_BASE(175, 175, 2, 0x00e0, 0x10, 11, 1), + PIN_FIELD_BASE(176, 176, 2, 0x00e0, 0x10, 12, 1), +}; + +static const struct mtk_pin_field_calc mt8188_pin_pd_range[] = { + PIN_FIELD_BASE(0, 0, 1, 0x00b0, 0x10, 6, 1), + PIN_FIELD_BASE(1, 1, 1, 0x00b0, 0x10, 7, 1), + PIN_FIELD_BASE(2, 2, 1, 0x00b0, 0x10, 8, 1), + PIN_FIELD_BASE(3, 3, 1, 0x00b0, 0x10, 9, 1), + PIN_FIELD_BASE(4, 4, 1, 0x00b0, 0x10, 10, 1), + PIN_FIELD_BASE(5, 5, 1, 0x00b0, 0x10, 11, 1), + PIN_FIELD_BASE(6, 6, 1, 0x00b0, 0x10, 12, 1), + PIN_FIELD_BASE(7, 7, 1, 0x00b0, 0x10, 13, 1), + PIN_FIELD_BASE(8, 8, 1, 0x00b0, 0x10, 14, 1), + PIN_FIELD_BASE(9, 9, 1, 0x00b0, 0x10, 15, 1), + PIN_FIELD_BASE(10, 10, 1, 0x00b0, 0x10, 16, 1), + PIN_FIELD_BASE(11, 11, 1, 0x00b0, 0x10, 17, 1), + PIN_FIELD_BASE(12, 12, 2, 0x00a0, 0x10, 12, 1), + PIN_FIELD_BASE(13, 13, 2, 0x00a0, 0x10, 13, 1), + PIN_FIELD_BASE(14, 14, 2, 0x00a0, 0x10, 14, 1), + PIN_FIELD_BASE(15, 15, 2, 0x00a0, 0x10, 15, 1), + PIN_FIELD_BASE(16, 16, 3, 0x0050, 0x10, 1, 1), + PIN_FIELD_BASE(17, 17, 3, 0x0050, 0x10, 2, 1), + PIN_FIELD_BASE(18, 18, 4, 0x0060, 0x10, 3, 1), + PIN_FIELD_BASE(19, 19, 4, 0x0060, 0x10, 5, 1), + 
PIN_FIELD_BASE(20, 20, 4, 0x0060, 0x10, 4, 1), + PIN_FIELD_BASE(21, 21, 4, 0x0060, 0x10, 6, 1), + PIN_FIELD_BASE(22, 22, 4, 0x0060, 0x10, 0, 1), + PIN_FIELD_BASE(23, 23, 4, 0x0060, 0x10, 1, 1), + PIN_FIELD_BASE(24, 24, 4, 0x0060, 0x10, 2, 1), + PIN_FIELD_BASE(25, 25, 1, 0x00b0, 0x10, 3, 1), + PIN_FIELD_BASE(26, 26, 1, 0x00b0, 0x10, 2, 1), + PIN_FIELD_BASE(27, 27, 1, 0x00b0, 0x10, 5, 1), + PIN_FIELD_BASE(28, 28, 1, 0x00b0, 0x10, 4, 1), + PIN_FIELD_BASE(29, 29, 1, 0x00b0, 0x10, 0, 1), + PIN_FIELD_BASE(30, 30, 1, 0x00b0, 0x10, 1, 1), + PIN_FIELD_BASE(31, 31, 1, 0x00c0, 0x10, 11, 1), + PIN_FIELD_BASE(32, 32, 1, 0x00c0, 0x10, 10, 1), + PIN_FIELD_BASE(33, 33, 1, 0x00c0, 0x10, 13, 1), + PIN_FIELD_BASE(34, 34, 1, 0x00c0, 0x10, 12, 1), + PIN_FIELD_BASE(35, 35, 1, 0x00c0, 0x10, 15, 1), + PIN_FIELD_BASE(36, 36, 1, 0x00c0, 0x10, 14, 1), + PIN_FIELD_BASE(37, 37, 1, 0x00b0, 0x10, 21, 1), + PIN_FIELD_BASE(38, 38, 1, 0x00b0, 0x10, 18, 1), + PIN_FIELD_BASE(39, 39, 1, 0x00b0, 0x10, 19, 1), + PIN_FIELD_BASE(40, 40, 1, 0x00b0, 0x10, 20, 1), + PIN_FIELD_BASE(41, 41, 1, 0x00b0, 0x10, 22, 1), + PIN_FIELD_BASE(46, 46, 3, 0x0050, 0x10, 0, 1), + PIN_FIELD_BASE(47, 47, 1, 0x00b0, 0x10, 25, 1), + PIN_FIELD_BASE(48, 48, 1, 0x00b0, 0x10, 24, 1), + PIN_FIELD_BASE(49, 49, 1, 0x00b0, 0x10, 23, 1), + PIN_FIELD_BASE(50, 50, 3, 0x0050, 0x10, 5, 1), + PIN_FIELD_BASE(51, 51, 3, 0x0050, 0x10, 4, 1), + PIN_FIELD_BASE(52, 52, 3, 0x0050, 0x10, 3, 1), + PIN_FIELD_BASE(53, 53, 3, 0x0050, 0x10, 6, 1), + PIN_FIELD_BASE(54, 54, 3, 0x0050, 0x10, 7, 1), + PIN_FIELD_BASE(55, 55, 1, 0x00b0, 0x10, 26, 1), + PIN_FIELD_BASE(56, 56, 1, 0x00b0, 0x10, 29, 1), + PIN_FIELD_BASE(57, 57, 2, 0x00b0, 0x10, 6, 1), + PIN_FIELD_BASE(58, 58, 2, 0x00b0, 0x10, 9, 1), + PIN_FIELD_BASE(59, 59, 1, 0x00b0, 0x10, 27, 1), + PIN_FIELD_BASE(60, 60, 1, 0x00b0, 0x10, 30, 1), + PIN_FIELD_BASE(61, 61, 1, 0x00b0, 0x10, 28, 1), + PIN_FIELD_BASE(62, 62, 1, 0x00b0, 0x10, 31, 1), + PIN_FIELD_BASE(63, 63, 2, 0x00b0, 0x10, 7, 1), + PIN_FIELD_BASE(64, 
64, 2, 0x00b0, 0x10, 10, 1), + PIN_FIELD_BASE(65, 65, 4, 0x0060, 0x10, 7, 1), + PIN_FIELD_BASE(66, 66, 4, 0x0060, 0x10, 9, 1), + PIN_FIELD_BASE(67, 67, 4, 0x0060, 0x10, 8, 1), + PIN_FIELD_BASE(68, 68, 4, 0x0060, 0x10, 10, 1), + PIN_FIELD_BASE(69, 69, 1, 0x00c0, 0x10, 1, 1), + PIN_FIELD_BASE(70, 70, 1, 0x00c0, 0x10, 0, 1), + PIN_FIELD_BASE(71, 71, 1, 0x00c0, 0x10, 5, 1), + PIN_FIELD_BASE(72, 72, 1, 0x00c0, 0x10, 4, 1), + PIN_FIELD_BASE(73, 73, 1, 0x00c0, 0x10, 2, 1), + PIN_FIELD_BASE(74, 74, 1, 0x00c0, 0x10, 3, 1), + PIN_FIELD_BASE(75, 75, 1, 0x00c0, 0x10, 7, 1), + PIN_FIELD_BASE(76, 76, 1, 0x00c0, 0x10, 6, 1), + PIN_FIELD_BASE(77, 77, 1, 0x00c0, 0x10, 9, 1), + PIN_FIELD_BASE(78, 78, 1, 0x00c0, 0x10, 8, 1), + PIN_FIELD_BASE(79, 79, 4, 0x0060, 0x10, 12, 1), + PIN_FIELD_BASE(80, 80, 4, 0x0060, 0x10, 11, 1), + PIN_FIELD_BASE(81, 81, 4, 0x0060, 0x10, 14, 1), + PIN_FIELD_BASE(82, 82, 4, 0x0060, 0x10, 13, 1), + PIN_FIELD_BASE(83, 83, 2, 0x00b0, 0x10, 16, 1), + PIN_FIELD_BASE(84, 84, 2, 0x00b0, 0x10, 15, 1), + PIN_FIELD_BASE(85, 85, 2, 0x00b0, 0x10, 17, 1), + PIN_FIELD_BASE(86, 86, 2, 0x00b0, 0x10, 19, 1), + PIN_FIELD_BASE(87, 87, 2, 0x00b0, 0x10, 18, 1), + PIN_FIELD_BASE(88, 88, 2, 0x00b0, 0x10, 20, 1), + PIN_FIELD_BASE(89, 89, 2, 0x00b0, 0x10, 22, 1), + PIN_FIELD_BASE(90, 90, 2, 0x00b0, 0x10, 21, 1), + PIN_FIELD_BASE(91, 91, 2, 0x00b0, 0x10, 23, 1), + PIN_FIELD_BASE(92, 92, 2, 0x00b0, 0x10, 3, 1), + PIN_FIELD_BASE(93, 93, 2, 0x00b0, 0x10, 2, 1), + PIN_FIELD_BASE(94, 94, 2, 0x00b0, 0x10, 5, 1), + PIN_FIELD_BASE(95, 95, 2, 0x00b0, 0x10, 4, 1), + PIN_FIELD_BASE(96, 96, 2, 0x00a0, 0x10, 31, 1), + PIN_FIELD_BASE(97, 97, 2, 0x00b0, 0x10, 0, 1), + PIN_FIELD_BASE(98, 98, 2, 0x00b0, 0x10, 8, 1), + PIN_FIELD_BASE(99, 99, 2, 0x00a0, 0x10, 30, 1), + PIN_FIELD_BASE(100, 100, 2, 0x00b0, 0x10, 1, 1), + PIN_FIELD_BASE(101, 101, 2, 0x00a0, 0x10, 0, 1), + PIN_FIELD_BASE(102, 102, 2, 0x00a0, 0x10, 5, 1), + PIN_FIELD_BASE(103, 103, 2, 0x00a0, 0x10, 3, 1), + PIN_FIELD_BASE(104, 104, 2, 
0x00a0, 0x10, 4, 1), + PIN_FIELD_BASE(105, 105, 2, 0x00a0, 0x10, 1, 1), + PIN_FIELD_BASE(106, 106, 2, 0x00a0, 0x10, 2, 1), + PIN_FIELD_BASE(107, 107, 2, 0x00a0, 0x10, 21, 1), + PIN_FIELD_BASE(108, 108, 2, 0x00a0, 0x10, 16, 1), + PIN_FIELD_BASE(109, 109, 2, 0x00a0, 0x10, 22, 1), + PIN_FIELD_BASE(110, 110, 2, 0x00a0, 0x10, 17, 1), + PIN_FIELD_BASE(111, 111, 2, 0x00a0, 0x10, 18, 1), + PIN_FIELD_BASE(112, 112, 2, 0x00a0, 0x10, 19, 1), + PIN_FIELD_BASE(113, 113, 2, 0x00a0, 0x10, 20, 1), + PIN_FIELD_BASE(114, 114, 2, 0x00a0, 0x10, 28, 1), + PIN_FIELD_BASE(115, 115, 2, 0x00a0, 0x10, 23, 1), + PIN_FIELD_BASE(116, 116, 2, 0x00a0, 0x10, 29, 1), + PIN_FIELD_BASE(117, 117, 2, 0x00a0, 0x10, 24, 1), + PIN_FIELD_BASE(118, 118, 2, 0x00a0, 0x10, 25, 1), + PIN_FIELD_BASE(119, 119, 2, 0x00a0, 0x10, 26, 1), + PIN_FIELD_BASE(120, 120, 2, 0x00a0, 0x10, 27, 1), + PIN_FIELD_BASE(121, 121, 3, 0x0050, 0x10, 8, 1), + PIN_FIELD_BASE(122, 122, 3, 0x0050, 0x10, 11, 1), + PIN_FIELD_BASE(123, 123, 3, 0x0050, 0x10, 10, 1), + PIN_FIELD_BASE(124, 124, 3, 0x0050, 0x10, 9, 1), + PIN_FIELD_BASE(125, 125, 2, 0x00a0, 0x10, 6, 1), + PIN_FIELD_BASE(126, 126, 2, 0x00a0, 0x10, 7, 1), + PIN_FIELD_BASE(127, 127, 2, 0x00a0, 0x10, 8, 1), + PIN_FIELD_BASE(128, 128, 2, 0x00a0, 0x10, 9, 1), + PIN_FIELD_BASE(129, 129, 2, 0x00a0, 0x10, 10, 1), + PIN_FIELD_BASE(130, 130, 2, 0x00a0, 0x10, 11, 1), + PIN_FIELD_BASE(175, 175, 2, 0x00b0, 0x10, 11, 1), + PIN_FIELD_BASE(176, 176, 2, 0x00b0, 0x10, 12, 1), +}; + +static const struct mtk_pin_field_calc mt8188_pin_drv_range[] = { + PIN_FIELD_BASE(0, 0, 1, 0x0000, 0x10, 24, 3), + PIN_FIELD_BASE(1, 1, 1, 0x0000, 0x10, 27, 3), + PIN_FIELD_BASE(2, 2, 1, 0x0010, 0x10, 0, 3), + PIN_FIELD_BASE(3, 3, 1, 0x0010, 0x10, 3, 3), + PIN_FIELD_BASE(4, 4, 1, 0x0020, 0x10, 9, 3), + PIN_FIELD_BASE(5, 5, 1, 0x0020, 0x10, 9, 3), + PIN_FIELD_BASE(6, 6, 1, 0x0020, 0x10, 9, 3), + PIN_FIELD_BASE(7, 7, 1, 0x0010, 0x10, 6, 3), + PIN_FIELD_BASE(8, 8, 1, 0x0010, 0x10, 9, 3), + PIN_FIELD_BASE(9, 9, 1, 
0x0010, 0x10, 12, 3), + PIN_FIELD_BASE(10, 10, 1, 0x0010, 0x10, 15, 3), + PIN_FIELD_BASE(11, 11, 1, 0x0020, 0x10, 12, 3), + PIN_FIELD_BASE(12, 12, 2, 0x0010, 0x10, 24, 3), + PIN_FIELD_BASE(13, 13, 2, 0x0010, 0x10, 27, 3), + PIN_FIELD_BASE(14, 14, 2, 0x0020, 0x10, 0, 3), + PIN_FIELD_BASE(15, 15, 2, 0x0020, 0x10, 3, 3), + PIN_FIELD_BASE(16, 16, 3, 0x0010, 0x10, 15, 3), + PIN_FIELD_BASE(17, 17, 3, 0x0010, 0x10, 15, 3), + PIN_FIELD_BASE(18, 18, 4, 0x0000, 0x10, 27, 3), + PIN_FIELD_BASE(19, 19, 4, 0x0000, 0x10, 27, 3), + PIN_FIELD_BASE(20, 20, 4, 0x0000, 0x10, 27, 3), + PIN_FIELD_BASE(21, 21, 4, 0x0000, 0x10, 27, 3), + PIN_FIELD_BASE(22, 22, 4, 0x0000, 0x10, 0, 3), + PIN_FIELD_BASE(23, 23, 4, 0x0000, 0x10, 3, 3), + PIN_FIELD_BASE(24, 24, 4, 0x0000, 0x10, 6, 3), + PIN_FIELD_BASE(25, 25, 1, 0x0020, 0x10, 6, 3), + PIN_FIELD_BASE(26, 26, 1, 0x0020, 0x10, 6, 3), + PIN_FIELD_BASE(27, 27, 1, 0x0020, 0x10, 6, 3), + PIN_FIELD_BASE(28, 28, 1, 0x0020, 0x10, 9, 3), + PIN_FIELD_BASE(29, 29, 1, 0x0020, 0x10, 3, 3), + PIN_FIELD_BASE(30, 30, 1, 0x0020, 0x10, 6, 3), + PIN_FIELD_BASE(31, 31, 1, 0x0020, 0x10, 12, 3), + PIN_FIELD_BASE(32, 32, 1, 0x0020, 0x10, 12, 3), + PIN_FIELD_BASE(33, 33, 1, 0x0020, 0x10, 15, 3), + PIN_FIELD_BASE(34, 34, 1, 0x0020, 0x10, 15, 3), + PIN_FIELD_BASE(35, 35, 1, 0x0020, 0x10, 12, 3), + PIN_FIELD_BASE(36, 36, 1, 0x0020, 0x10, 15, 3), + PIN_FIELD_BASE(37, 37, 1, 0x0010, 0x10, 27, 3), + PIN_FIELD_BASE(38, 38, 1, 0x0010, 0x10, 18, 3), + PIN_FIELD_BASE(39, 39, 1, 0x0010, 0x10, 21, 3), + PIN_FIELD_BASE(40, 40, 1, 0x0010, 0x10, 24, 3), + PIN_FIELD_BASE(41, 41, 1, 0x0020, 0x10, 0, 3), + PIN_FIELD_BASE(42, 42, 2, 0x0020, 0x10, 18, 3), + PIN_FIELD_BASE(43, 43, 2, 0x0020, 0x10, 18, 3), + PIN_FIELD_BASE(44, 44, 2, 0x0020, 0x10, 18, 3), + PIN_FIELD_BASE(45, 45, 2, 0x0020, 0x10, 21, 3), + PIN_FIELD_BASE(46, 46, 3, 0x0010, 0x10, 15, 3), + PIN_FIELD_BASE(47, 47, 1, 0x0020, 0x10, 3, 3), + PIN_FIELD_BASE(48, 48, 1, 0x0020, 0x10, 3, 3), + PIN_FIELD_BASE(49, 49, 1, 0x0020, 0x10, 
3, 3), + PIN_FIELD_BASE(50, 50, 3, 0x0000, 0x10, 6, 3), + PIN_FIELD_BASE(51, 51, 3, 0x0000, 0x10, 3, 3), + PIN_FIELD_BASE(52, 52, 3, 0x0000, 0x10, 0, 3), + PIN_FIELD_BASE(53, 53, 3, 0x0000, 0x10, 9, 3), + PIN_FIELD_BASE(54, 54, 3, 0x0000, 0x10, 12, 3), + PIN_FIELD_BASE(55, 55, 1, 0x0020, 0x10, 27, 3), + PIN_FIELD_BASE(56, 56, 1, 0x0030, 0x10, 6, 3), + PIN_FIELD_BASE(57, 57, 2, 0x0030, 0x10, 9, 3), + PIN_FIELD_BASE(58, 58, 2, 0x0030, 0x10, 15, 3), + PIN_FIELD_BASE(59, 59, 1, 0x0030, 0x10, 0, 3), + PIN_FIELD_BASE(60, 60, 1, 0x0030, 0x10, 9, 3), + PIN_FIELD_BASE(61, 61, 1, 0x0030, 0x10, 3, 3), + PIN_FIELD_BASE(62, 62, 1, 0x0030, 0x10, 12, 3), + PIN_FIELD_BASE(63, 63, 2, 0x0030, 0x10, 12, 3), + PIN_FIELD_BASE(64, 64, 2, 0x0030, 0x10, 18, 3), + PIN_FIELD_BASE(65, 65, 4, 0x0010, 0x10, 0, 3), + PIN_FIELD_BASE(66, 66, 4, 0x0010, 0x10, 6, 3), + PIN_FIELD_BASE(67, 67, 4, 0x0010, 0x10, 3, 3), + PIN_FIELD_BASE(68, 68, 4, 0x0010, 0x10, 9, 3), + PIN_FIELD_BASE(69, 69, 1, 0x0030, 0x10, 18, 3), + PIN_FIELD_BASE(70, 70, 1, 0x0030, 0x10, 15, 3), + PIN_FIELD_BASE(71, 71, 1, 0x0040, 0x10, 0, 3), + PIN_FIELD_BASE(72, 72, 1, 0x0030, 0x10, 27, 3), + PIN_FIELD_BASE(73, 73, 1, 0x0030, 0x10, 21, 3), + PIN_FIELD_BASE(74, 74, 1, 0x0030, 0x10, 24, 3), + PIN_FIELD_BASE(75, 75, 1, 0x0040, 0x10, 6, 3), + PIN_FIELD_BASE(76, 76, 1, 0x0040, 0x10, 3, 3), + PIN_FIELD_BASE(77, 77, 1, 0x0040, 0x10, 12, 3), + PIN_FIELD_BASE(78, 78, 1, 0x0040, 0x10, 9, 3), + PIN_FIELD_BASE(79, 79, 4, 0x0010, 0x10, 15, 3), + PIN_FIELD_BASE(80, 80, 4, 0x0010, 0x10, 12, 3), + PIN_FIELD_BASE(81, 81, 4, 0x0010, 0x10, 21, 3), + PIN_FIELD_BASE(82, 82, 4, 0x0010, 0x10, 18, 3), + PIN_FIELD_BASE(83, 83, 2, 0x0030, 0x10, 0, 3), + PIN_FIELD_BASE(84, 84, 2, 0x0020, 0x10, 27, 3), + PIN_FIELD_BASE(85, 85, 2, 0x0030, 0x10, 0, 3), + PIN_FIELD_BASE(86, 86, 2, 0x0020, 0x10, 6, 3), + PIN_FIELD_BASE(87, 87, 2, 0x0020, 0x10, 6, 3), + PIN_FIELD_BASE(88, 88, 2, 0x0020, 0x10, 6, 3), + PIN_FIELD_BASE(89, 89, 2, 0x0020, 0x10, 6, 3), + 
PIN_FIELD_BASE(90, 90, 2, 0x0030, 0x10, 0, 3), + PIN_FIELD_BASE(91, 91, 2, 0x0030, 0x10, 0, 3), + PIN_FIELD_BASE(92, 92, 2, 0x0020, 0x10, 9, 3), + PIN_FIELD_BASE(93, 93, 2, 0x0020, 0x10, 9, 3), + PIN_FIELD_BASE(94, 94, 2, 0x0020, 0x10, 9, 3), + PIN_FIELD_BASE(95, 95, 2, 0x0020, 0x10, 9, 3), + PIN_FIELD_BASE(96, 96, 2, 0x0020, 0x10, 21, 3), + PIN_FIELD_BASE(97, 97, 2, 0x0020, 0x10, 21, 3), + PIN_FIELD_BASE(98, 98, 2, 0x0020, 0x10, 24, 3), + PIN_FIELD_BASE(99, 99, 2, 0x0020, 0x10, 21, 3), + PIN_FIELD_BASE(100, 100, 2, 0x0030, 0x10, 6, 3), + PIN_FIELD_BASE(101, 101, 2, 0x0000, 0x10, 0, 3), + PIN_FIELD_BASE(102, 102, 2, 0x0000, 0x10, 15, 3), + PIN_FIELD_BASE(103, 103, 2, 0x0000, 0x10, 9, 3), + PIN_FIELD_BASE(104, 104, 2, 0x0000, 0x10, 12, 3), + PIN_FIELD_BASE(105, 105, 2, 0x0000, 0x10, 3, 3), + PIN_FIELD_BASE(106, 106, 2, 0x0000, 0x10, 6, 3), + PIN_FIELD_BASE(107, 107, 2, 0x0020, 0x10, 6, 3), + PIN_FIELD_BASE(108, 108, 2, 0x0020, 0x10, 6, 3), + PIN_FIELD_BASE(109, 109, 2, 0x0020, 0x10, 6, 3), + PIN_FIELD_BASE(110, 110, 2, 0x0020, 0x10, 6, 3), + PIN_FIELD_BASE(111, 111, 2, 0x0020, 0x10, 15, 3), + PIN_FIELD_BASE(112, 112, 2, 0x0020, 0x10, 15, 3), + PIN_FIELD_BASE(113, 113, 2, 0x0020, 0x10, 15, 3), + PIN_FIELD_BASE(114, 114, 2, 0x0020, 0x10, 12, 3), + PIN_FIELD_BASE(115, 115, 2, 0x0020, 0x10, 12, 3), + PIN_FIELD_BASE(116, 116, 2, 0x0020, 0x10, 12, 3), + PIN_FIELD_BASE(117, 117, 2, 0x0020, 0x10, 12, 3), + PIN_FIELD_BASE(118, 118, 2, 0x0020, 0x10, 12, 3), + PIN_FIELD_BASE(119, 119, 2, 0x0020, 0x10, 15, 3), + PIN_FIELD_BASE(120, 120, 2, 0x0020, 0x10, 18, 3), + PIN_FIELD_BASE(121, 121, 3, 0x0010, 0x10, 3, 3), + PIN_FIELD_BASE(122, 122, 3, 0x0010, 0x10, 12, 3), + PIN_FIELD_BASE(123, 123, 3, 0x0010, 0x10, 9, 3), + PIN_FIELD_BASE(124, 124, 3, 0x0010, 0x10, 6, 3), + PIN_FIELD_BASE(125, 125, 2, 0x0020, 0x10, 24, 3), + PIN_FIELD_BASE(126, 126, 2, 0x0020, 0x10, 24, 3), + PIN_FIELD_BASE(127, 127, 2, 0x0020, 0x10, 24, 3), + PIN_FIELD_BASE(128, 128, 2, 0x0020, 0x10, 27, 3), + 
PIN_FIELD_BASE(129, 129, 2, 0x0020, 0x10, 27, 3), + PIN_FIELD_BASE(130, 130, 2, 0x0020, 0x10, 27, 3), + PIN_FIELD_BASE(131, 131, 1, 0x0000, 0x10, 0, 3), + PIN_FIELD_BASE(132, 132, 1, 0x0000, 0x10, 15, 3), + PIN_FIELD_BASE(133, 133, 1, 0x0000, 0x10, 18, 3), + PIN_FIELD_BASE(134, 134, 1, 0x0000, 0x10, 21, 3), + PIN_FIELD_BASE(135, 135, 1, 0x0020, 0x10, 15, 3), + PIN_FIELD_BASE(136, 136, 1, 0x0020, 0x10, 18, 3), + PIN_FIELD_BASE(137, 137, 1, 0x0020, 0x10, 18, 3), + PIN_FIELD_BASE(138, 138, 1, 0x0020, 0x10, 18, 3), + PIN_FIELD_BASE(139, 139, 1, 0x0020, 0x10, 18, 3), + PIN_FIELD_BASE(140, 140, 1, 0x0020, 0x10, 21, 3), + PIN_FIELD_BASE(141, 141, 1, 0x0020, 0x10, 21, 3), + PIN_FIELD_BASE(142, 142, 1, 0x0020, 0x10, 21, 3), + PIN_FIELD_BASE(143, 143, 1, 0x0000, 0x10, 3, 3), + PIN_FIELD_BASE(144, 144, 1, 0x0000, 0x10, 6, 3), + PIN_FIELD_BASE(145, 145, 1, 0x0000, 0x10, 9, 3), + PIN_FIELD_BASE(146, 146, 1, 0x0000, 0x10, 12, 3), + PIN_FIELD_BASE(147, 147, 1, 0x0020, 0x10, 21, 3), + PIN_FIELD_BASE(148, 148, 1, 0x0020, 0x10, 24, 3), + PIN_FIELD_BASE(149, 149, 1, 0x0020, 0x10, 24, 3), + PIN_FIELD_BASE(150, 150, 1, 0x0020, 0x10, 24, 3), + PIN_FIELD_BASE(151, 151, 2, 0x0010, 0x10, 15, 3), + PIN_FIELD_BASE(152, 152, 2, 0x0010, 0x10, 12, 3), + PIN_FIELD_BASE(153, 153, 2, 0x0010, 0x10, 9, 3), + PIN_FIELD_BASE(154, 154, 2, 0x0010, 0x10, 6, 3), + PIN_FIELD_BASE(155, 155, 2, 0x0010, 0x10, 21, 3), + PIN_FIELD_BASE(156, 156, 2, 0x0000, 0x10, 21, 3), + PIN_FIELD_BASE(157, 157, 2, 0x0000, 0x10, 18, 3), + PIN_FIELD_BASE(158, 158, 2, 0x0010, 0x10, 3, 3), + PIN_FIELD_BASE(159, 159, 2, 0x0010, 0x10, 0, 3), + PIN_FIELD_BASE(160, 160, 2, 0x0000, 0x10, 27, 3), + PIN_FIELD_BASE(161, 161, 2, 0x0000, 0x10, 24, 3), + PIN_FIELD_BASE(162, 162, 2, 0x0010, 0x10, 18, 3), + PIN_FIELD_BASE(163, 163, 4, 0x0000, 0x10, 12, 3), + PIN_FIELD_BASE(164, 164, 4, 0x0000, 0x10, 9, 3), + PIN_FIELD_BASE(165, 165, 4, 0x0000, 0x10, 15, 3), + PIN_FIELD_BASE(166, 166, 4, 0x0000, 0x10, 18, 3), + PIN_FIELD_BASE(167, 167, 4, 
0x0000, 0x10, 21, 3), + PIN_FIELD_BASE(168, 168, 4, 0x0000, 0x10, 24, 3), + PIN_FIELD_BASE(169, 169, 3, 0x0000, 0x10, 18, 3), + PIN_FIELD_BASE(170, 170, 3, 0x0000, 0x10, 15, 3), + PIN_FIELD_BASE(171, 171, 3, 0x0000, 0x10, 21, 3), + PIN_FIELD_BASE(172, 172, 3, 0x0000, 0x10, 24, 3), + PIN_FIELD_BASE(173, 173, 3, 0x0000, 0x10, 27, 3), + PIN_FIELD_BASE(174, 174, 3, 0x0010, 0x10, 0, 3), + PIN_FIELD_BASE(175, 175, 2, 0x0030, 0x10, 3, 3), + PIN_FIELD_BASE(176, 176, 2, 0x0030, 0x10, 3, 3), +}; + +static const struct mtk_pin_field_calc mt8188_pin_drv_adv_range[] = { + PIN_FIELD_BASE(53, 53, 3, 0x0020, 0x10, 0, 3), + PIN_FIELD_BASE(54, 54, 3, 0x0020, 0x10, 3, 3), + PIN_FIELD_BASE(55, 55, 1, 0x0060, 0x10, 0, 3), + PIN_FIELD_BASE(56, 56, 1, 0x0060, 0x10, 9, 3), + PIN_FIELD_BASE(57, 57, 2, 0x0050, 0x10, 0, 3), + PIN_FIELD_BASE(58, 58, 2, 0x0050, 0x10, 6, 3), + PIN_FIELD_BASE(59, 59, 1, 0x0060, 0x10, 3, 3), + PIN_FIELD_BASE(60, 60, 1, 0x0060, 0x10, 12, 3), + PIN_FIELD_BASE(61, 61, 1, 0x0060, 0x10, 6, 3), + PIN_FIELD_BASE(62, 62, 1, 0x0060, 0x10, 15, 3), + PIN_FIELD_BASE(63, 63, 2, 0x0050, 0x10, 3, 3), + PIN_FIELD_BASE(64, 64, 2, 0x0050, 0x10, 9, 3), + PIN_FIELD_BASE(65, 65, 4, 0x0030, 0x10, 0, 3), + PIN_FIELD_BASE(66, 66, 4, 0x0030, 0x10, 6, 3), + PIN_FIELD_BASE(67, 67, 4, 0x0030, 0x10, 3, 3), + PIN_FIELD_BASE(68, 68, 4, 0x0030, 0x10, 9, 3), + PIN_FIELD_BASE(175, 175, 2, 0x0050, 0x10, 12, 3), + PIN_FIELD_BASE(176, 176, 2, 0x0050, 0x10, 15, 3), +}; + +static const struct mtk_pin_field_calc mt8188_pin_rsel_range[] = { + PIN_FIELD_BASE(53, 53, 3, 0x00c0, 0x10, 0, 3), + PIN_FIELD_BASE(54, 54, 3, 0x00c0, 0x10, 3, 3), + PIN_FIELD_BASE(55, 55, 1, 0x0160, 0x10, 0, 3), + PIN_FIELD_BASE(56, 56, 1, 0x0160, 0x10, 9, 3), + PIN_FIELD_BASE(57, 57, 2, 0x0150, 0x10, 0, 3), + PIN_FIELD_BASE(58, 58, 2, 0x0150, 0x10, 6, 3), + PIN_FIELD_BASE(59, 59, 1, 0x0160, 0x10, 3, 3), + PIN_FIELD_BASE(60, 60, 1, 0x0160, 0x10, 12, 3), + PIN_FIELD_BASE(61, 61, 1, 0x0160, 0x10, 6, 3), + PIN_FIELD_BASE(62, 62, 1, 
0x0160, 0x10, 15, 3), + PIN_FIELD_BASE(63, 63, 2, 0x0150, 0x10, 3, 3), + PIN_FIELD_BASE(64, 64, 2, 0x0150, 0x10, 9, 3), + PIN_FIELD_BASE(65, 65, 4, 0x00d0, 0x10, 0, 3), + PIN_FIELD_BASE(66, 66, 4, 0x00d0, 0x10, 6, 3), + PIN_FIELD_BASE(67, 67, 4, 0x00d0, 0x10, 3, 3), + PIN_FIELD_BASE(68, 68, 4, 0x00d0, 0x10, 9, 3), + PIN_FIELD_BASE(175, 175, 2, 0x0150, 0x10, 12, 3), + PIN_FIELD_BASE(176, 176, 2, 0x0150, 0x10, 15, 3), +}; + +static const struct mtk_pin_rsel mt8188_pin_rsel_val_range[] = { + PIN_RSEL(53, 68, 0x0, 75000, 75000), + PIN_RSEL(53, 68, 0x1, 10000, 5000), + PIN_RSEL(53, 68, 0x2, 5000, 75000), + PIN_RSEL(53, 68, 0x3, 4000, 5000), + PIN_RSEL(53, 68, 0x4, 3000, 75000), + PIN_RSEL(53, 68, 0x5, 2000, 5000), + PIN_RSEL(53, 68, 0x6, 1500, 75000), + PIN_RSEL(53, 68, 0x7, 1000, 5000), + PIN_RSEL(175, 176, 0x0, 75000, 75000), + PIN_RSEL(175, 176, 0x1, 10000, 5000), + PIN_RSEL(175, 176, 0x2, 5000, 75000), + PIN_RSEL(175, 176, 0x3, 4000, 5000), + PIN_RSEL(175, 176, 0x4, 3000, 75000), + PIN_RSEL(175, 176, 0x5, 2000, 5000), + PIN_RSEL(175, 176, 0x6, 1500, 75000), + PIN_RSEL(175, 176, 0x7, 1000, 5000), +}; + +static const unsigned int mt8188_pull_type[] = { + MTK_PULL_PU_PD_TYPE, /*0*/ + MTK_PULL_PU_PD_TYPE, /*1*/ + MTK_PULL_PU_PD_TYPE, /*2*/ + MTK_PULL_PU_PD_TYPE, /*3*/ + MTK_PULL_PU_PD_TYPE, /*4*/ + MTK_PULL_PU_PD_TYPE, /*5*/ + MTK_PULL_PU_PD_TYPE, /*6*/ + MTK_PULL_PU_PD_TYPE, /*7*/ + MTK_PULL_PU_PD_TYPE, /*8*/ + MTK_PULL_PU_PD_TYPE, /*9*/ + MTK_PULL_PU_PD_TYPE, /*10*/ + MTK_PULL_PU_PD_TYPE, /*11*/ + MTK_PULL_PU_PD_TYPE, /*12*/ + MTK_PULL_PU_PD_TYPE, /*13*/ + MTK_PULL_PU_PD_TYPE, /*14*/ + MTK_PULL_PU_PD_TYPE, /*15*/ + MTK_PULL_PU_PD_TYPE, /*16*/ + MTK_PULL_PU_PD_TYPE, /*17*/ + MTK_PULL_PU_PD_TYPE, /*18*/ + MTK_PULL_PU_PD_TYPE, /*19*/ + MTK_PULL_PU_PD_TYPE, /*20*/ + MTK_PULL_PU_PD_TYPE, /*21*/ + MTK_PULL_PU_PD_TYPE, /*22*/ + MTK_PULL_PU_PD_TYPE, /*23*/ + MTK_PULL_PU_PD_TYPE, /*24*/ + MTK_PULL_PU_PD_TYPE, /*25*/ + MTK_PULL_PU_PD_TYPE, /*26*/ + MTK_PULL_PU_PD_TYPE, /*27*/ + 
MTK_PULL_PU_PD_TYPE, /*28*/ + MTK_PULL_PU_PD_TYPE, /*29*/ + MTK_PULL_PU_PD_TYPE, /*30*/ + MTK_PULL_PU_PD_TYPE, /*31*/ + MTK_PULL_PU_PD_TYPE, /*32*/ + MTK_PULL_PU_PD_TYPE, /*33*/ + MTK_PULL_PU_PD_TYPE, /*34*/ + MTK_PULL_PU_PD_TYPE, /*35*/ + MTK_PULL_PU_PD_TYPE, /*36*/ + MTK_PULL_PU_PD_TYPE, /*37*/ + MTK_PULL_PU_PD_TYPE, /*38*/ + MTK_PULL_PU_PD_TYPE, /*39*/ + MTK_PULL_PU_PD_TYPE, /*40*/ + MTK_PULL_PU_PD_TYPE, /*41*/ + MTK_PULL_PUPD_R1R0_TYPE, /*42*/ + MTK_PULL_PUPD_R1R0_TYPE, /*43*/ + MTK_PULL_PUPD_R1R0_TYPE, /*44*/ + MTK_PULL_PUPD_R1R0_TYPE, /*45*/ + MTK_PULL_PU_PD_TYPE, /*46*/ + MTK_PULL_PU_PD_TYPE, /*47*/ + MTK_PULL_PU_PD_TYPE, /*48*/ + MTK_PULL_PU_PD_TYPE, /*49*/ + MTK_PULL_PU_PD_TYPE, /*50*/ + MTK_PULL_PU_PD_TYPE, /*51*/ + MTK_PULL_PU_PD_TYPE, /*52*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*53*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*54*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*55*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*56*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*57*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*58*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*59*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*60*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*61*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*62*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*63*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*64*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*65*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*66*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*67*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*68*/ + MTK_PULL_PU_PD_TYPE, /*69*/ + MTK_PULL_PU_PD_TYPE, /*70*/ + MTK_PULL_PU_PD_TYPE, /*71*/ + MTK_PULL_PU_PD_TYPE, /*72*/ + MTK_PULL_PU_PD_TYPE, /*73*/ + MTK_PULL_PU_PD_TYPE, /*74*/ + MTK_PULL_PU_PD_TYPE, /*75*/ + MTK_PULL_PU_PD_TYPE, /*76*/ + MTK_PULL_PU_PD_TYPE, /*77*/ + MTK_PULL_PU_PD_TYPE, /*78*/ + MTK_PULL_PU_PD_TYPE, /*79*/ + MTK_PULL_PU_PD_TYPE, /*80*/ + MTK_PULL_PU_PD_TYPE, /*81*/ + MTK_PULL_PU_PD_TYPE, /*82*/ + MTK_PULL_PU_PD_TYPE, /*83*/ + MTK_PULL_PU_PD_TYPE, /*84*/ + MTK_PULL_PU_PD_TYPE, /*85*/ + MTK_PULL_PU_PD_TYPE, /*86*/ + MTK_PULL_PU_PD_TYPE, /*87*/ + MTK_PULL_PU_PD_TYPE, /*88*/ + MTK_PULL_PU_PD_TYPE, /*89*/ + MTK_PULL_PU_PD_TYPE, /*90*/ + 
MTK_PULL_PU_PD_TYPE, /*91*/ + MTK_PULL_PU_PD_TYPE, /*92*/ + MTK_PULL_PU_PD_TYPE, /*93*/ + MTK_PULL_PU_PD_TYPE, /*94*/ + MTK_PULL_PU_PD_TYPE, /*95*/ + MTK_PULL_PU_PD_TYPE, /*96*/ + MTK_PULL_PU_PD_TYPE, /*97*/ + MTK_PULL_PU_PD_TYPE, /*98*/ + MTK_PULL_PU_PD_TYPE, /*99*/ + MTK_PULL_PU_PD_TYPE, /*100*/ + MTK_PULL_PU_PD_TYPE, /*101*/ + MTK_PULL_PU_PD_TYPE, /*102*/ + MTK_PULL_PU_PD_TYPE, /*103*/ + MTK_PULL_PU_PD_TYPE, /*104*/ + MTK_PULL_PU_PD_TYPE, /*105*/ + MTK_PULL_PU_PD_TYPE, /*106*/ + MTK_PULL_PU_PD_TYPE, /*107*/ + MTK_PULL_PU_PD_TYPE, /*108*/ + MTK_PULL_PU_PD_TYPE, /*109*/ + MTK_PULL_PU_PD_TYPE, /*110*/ + MTK_PULL_PU_PD_TYPE, /*111*/ + MTK_PULL_PU_PD_TYPE, /*112*/ + MTK_PULL_PU_PD_TYPE, /*113*/ + MTK_PULL_PU_PD_TYPE, /*114*/ + MTK_PULL_PU_PD_TYPE, /*115*/ + MTK_PULL_PU_PD_TYPE, /*116*/ + MTK_PULL_PU_PD_TYPE, /*117*/ + MTK_PULL_PU_PD_TYPE, /*118*/ + MTK_PULL_PU_PD_TYPE, /*119*/ + MTK_PULL_PU_PD_TYPE, /*120*/ + MTK_PULL_PU_PD_TYPE, /*121*/ + MTK_PULL_PU_PD_TYPE, /*122*/ + MTK_PULL_PU_PD_TYPE, /*123*/ + MTK_PULL_PU_PD_TYPE, /*124*/ + MTK_PULL_PU_PD_TYPE, /*125*/ + MTK_PULL_PU_PD_TYPE, /*126*/ + MTK_PULL_PU_PD_TYPE, /*127*/ + MTK_PULL_PU_PD_TYPE, /*128*/ + MTK_PULL_PU_PD_TYPE, /*129*/ + MTK_PULL_PU_PD_TYPE, /*130*/ + MTK_PULL_PUPD_R1R0_TYPE, /*131*/ + MTK_PULL_PUPD_R1R0_TYPE, /*132*/ + MTK_PULL_PUPD_R1R0_TYPE, /*133*/ + MTK_PULL_PUPD_R1R0_TYPE, /*134*/ + MTK_PULL_PUPD_R1R0_TYPE, /*135*/ + MTK_PULL_PUPD_R1R0_TYPE, /*136*/ + MTK_PULL_PUPD_R1R0_TYPE, /*137*/ + MTK_PULL_PUPD_R1R0_TYPE, /*138*/ + MTK_PULL_PUPD_R1R0_TYPE, /*139*/ + MTK_PULL_PUPD_R1R0_TYPE, /*140*/ + MTK_PULL_PUPD_R1R0_TYPE, /*141*/ + MTK_PULL_PUPD_R1R0_TYPE, /*142*/ + MTK_PULL_PUPD_R1R0_TYPE, /*143*/ + MTK_PULL_PUPD_R1R0_TYPE, /*144*/ + MTK_PULL_PUPD_R1R0_TYPE, /*145*/ + MTK_PULL_PUPD_R1R0_TYPE, /*146*/ + MTK_PULL_PUPD_R1R0_TYPE, /*147*/ + MTK_PULL_PUPD_R1R0_TYPE, /*148*/ + MTK_PULL_PUPD_R1R0_TYPE, /*149*/ + MTK_PULL_PUPD_R1R0_TYPE, /*150*/ + MTK_PULL_PUPD_R1R0_TYPE, /*151*/ + MTK_PULL_PUPD_R1R0_TYPE, /*152*/ 
+ MTK_PULL_PUPD_R1R0_TYPE, /*153*/ + MTK_PULL_PUPD_R1R0_TYPE, /*154*/ + MTK_PULL_PUPD_R1R0_TYPE, /*155*/ + MTK_PULL_PUPD_R1R0_TYPE, /*156*/ + MTK_PULL_PUPD_R1R0_TYPE, /*157*/ + MTK_PULL_PUPD_R1R0_TYPE, /*158*/ + MTK_PULL_PUPD_R1R0_TYPE, /*159*/ + MTK_PULL_PUPD_R1R0_TYPE, /*160*/ + MTK_PULL_PUPD_R1R0_TYPE, /*161*/ + MTK_PULL_PUPD_R1R0_TYPE, /*162*/ + MTK_PULL_PUPD_R1R0_TYPE, /*163*/ + MTK_PULL_PUPD_R1R0_TYPE, /*164*/ + MTK_PULL_PUPD_R1R0_TYPE, /*165*/ + MTK_PULL_PUPD_R1R0_TYPE, /*166*/ + MTK_PULL_PUPD_R1R0_TYPE, /*167*/ + MTK_PULL_PUPD_R1R0_TYPE, /*168*/ + MTK_PULL_PUPD_R1R0_TYPE, /*169*/ + MTK_PULL_PUPD_R1R0_TYPE, /*170*/ + MTK_PULL_PUPD_R1R0_TYPE, /*171*/ + MTK_PULL_PUPD_R1R0_TYPE, /*172*/ + MTK_PULL_PUPD_R1R0_TYPE, /*173*/ + MTK_PULL_PUPD_R1R0_TYPE, /*174*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*175*/ + MTK_PULL_PU_PD_RSEL_TYPE, /*176*/ +}; + +static const struct mtk_pin_reg_calc mt8188_reg_cals[PINCTRL_PIN_REG_MAX] = { + [PINCTRL_PIN_REG_MODE] = MTK_RANGE(mt8188_pin_mode_range), + [PINCTRL_PIN_REG_DIR] = MTK_RANGE(mt8188_pin_dir_range), + [PINCTRL_PIN_REG_DI] = MTK_RANGE(mt8188_pin_di_range), + [PINCTRL_PIN_REG_DO] = MTK_RANGE(mt8188_pin_do_range), + [PINCTRL_PIN_REG_SMT] = MTK_RANGE(mt8188_pin_smt_range), + [PINCTRL_PIN_REG_IES] = MTK_RANGE(mt8188_pin_ies_range), + [PINCTRL_PIN_REG_TDSEL] = MTK_RANGE(mt8188_pin_tdsel_range), + [PINCTRL_PIN_REG_RDSEL] = MTK_RANGE(mt8188_pin_rdsel_range), + [PINCTRL_PIN_REG_PUPD] = MTK_RANGE(mt8188_pin_pupd_range), + [PINCTRL_PIN_REG_R0] = MTK_RANGE(mt8188_pin_r0_range), + [PINCTRL_PIN_REG_R1] = MTK_RANGE(mt8188_pin_r1_range), + [PINCTRL_PIN_REG_PU] = MTK_RANGE(mt8188_pin_pu_range), + [PINCTRL_PIN_REG_PD] = MTK_RANGE(mt8188_pin_pd_range), + [PINCTRL_PIN_REG_DRV] = MTK_RANGE(mt8188_pin_drv_range), + [PINCTRL_PIN_REG_DRV_ADV] = MTK_RANGE(mt8188_pin_drv_adv_range), + [PINCTRL_PIN_REG_RSEL] = MTK_RANGE(mt8188_pin_rsel_range), +}; + +static const char * const mt8188_pinctrl_register_base_name[] = { + "iocfg0", "iocfg_rm", "iocfg_lt", 
"iocfg_lm", "iocfg_rt", +}; + +static const struct mtk_eint_hw mt8188_eint_hw = { + .port_mask = 0xf, + .ports = 7, + .ap_num = 225, + .db_cnt = 32, +}; + +static const struct mtk_pin_soc mt8188_data = { + .reg_cal = mt8188_reg_cals, + .pins = mtk_pins_mt8188, + .npins = ARRAY_SIZE(mtk_pins_mt8188), + .ngrps = ARRAY_SIZE(mtk_pins_mt8188), + .eint_hw = &mt8188_eint_hw, + .nfuncs = 8, + .gpio_m = 0, + .base_names = mt8188_pinctrl_register_base_name, + .nbase_names = ARRAY_SIZE(mt8188_pinctrl_register_base_name), + .pull_type = mt8188_pull_type, + .pin_rsel = mt8188_pin_rsel_val_range, + .npin_rsel = ARRAY_SIZE(mt8188_pin_rsel_val_range), + .bias_set_combo = mtk_pinconf_bias_set_combo, + .bias_get_combo = mtk_pinconf_bias_get_combo, + .drive_set = mtk_pinconf_drive_set_rev1, + .drive_get = mtk_pinconf_drive_get_rev1, + .adv_drive_set = mtk_pinconf_adv_drive_set_raw, + .adv_drive_get = mtk_pinconf_adv_drive_get_raw, +}; + +static const struct of_device_id mt8188_pinctrl_of_match[] = { + { .compatible = "mediatek,mt8188-pinctrl", .data = &mt8188_data }, + { } +}; + +static struct platform_driver mt8188_pinctrl_driver = { + .driver = { + .name = "mt8188-pinctrl", + .of_match_table = mt8188_pinctrl_of_match, + .pm = &mtk_paris_pinctrl_pm_ops + }, + .probe = mtk_paris_pinctrl_probe, +}; + +static int __init mt8188_pinctrl_init(void) +{ + return platform_driver_register(&mt8188_pinctrl_driver); +} + +arch_initcall(mt8188_pinctrl_init); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MediaTek MT8188 Pinctrl Driver"); diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-mt8188.h b/drivers/pinctrl/mediatek/pinctrl-mtk-mt8188.h new file mode 100644 index 0000000000000..a487323748e27 --- /dev/null +++ b/drivers/pinctrl/mediatek/pinctrl-mtk-mt8188.h @@ -0,0 +1,2259 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022 MediaTek Inc. 
+ * Author: Hui Liu + * + */ + +#ifndef __PINCTRL_MTK_MT8188_H +#define __PINCTRL_MTK_MT8188_H + +#include "pinctrl-paris.h" + +static const struct mtk_pin_desc mtk_pins_mt8188[] = { + MTK_PIN( + 0, "GPIO0", + MTK_EINT_FUNCTION(0, 0), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO0"), + MTK_FUNCTION(1, "B0_TP_GPIO0_AO"), + MTK_FUNCTION(2, "O_SPIM5_CSB"), + MTK_FUNCTION(3, "O_UTXD1"), + MTK_FUNCTION(4, "O_DMIC3_CLK"), + MTK_FUNCTION(5, "B0_I2SIN_MCK"), + MTK_FUNCTION(6, "O_I2SO2_MCK"), + MTK_FUNCTION(7, "B0_DBG_MON_A0") + ), + + MTK_PIN( + 1, "GPIO1", + MTK_EINT_FUNCTION(0, 1), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO1"), + MTK_FUNCTION(1, "B0_TP_GPIO1_AO"), + MTK_FUNCTION(2, "O_SPIM5_CLK"), + MTK_FUNCTION(3, "I1_URXD1"), + MTK_FUNCTION(4, "I0_DMIC3_DAT"), + MTK_FUNCTION(5, "B0_I2SIN_BCK"), + MTK_FUNCTION(6, "B0_I2SO2_BCK"), + MTK_FUNCTION(7, "B0_DBG_MON_A1") + ), + + MTK_PIN( + 2, "GPIO2", + MTK_EINT_FUNCTION(0, 2), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO2"), + MTK_FUNCTION(1, "B0_TP_GPIO2_AO"), + MTK_FUNCTION(2, "B0_SPIM5_MOSI"), + MTK_FUNCTION(3, "O_URTS1"), + MTK_FUNCTION(4, "I0_DMIC3_DAT_R"), + MTK_FUNCTION(5, "B0_I2SIN_WS"), + MTK_FUNCTION(6, "B0_I2SO2_WS"), + MTK_FUNCTION(7, "B0_DBG_MON_A2") + ), + + MTK_PIN( + 3, "GPIO3", + MTK_EINT_FUNCTION(0, 3), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO3"), + MTK_FUNCTION(1, "B0_TP_GPIO3_AO"), + MTK_FUNCTION(2, "B0_SPIM5_MISO"), + MTK_FUNCTION(3, "I1_UCTS1"), + MTK_FUNCTION(4, "O_DMIC4_CLK"), + MTK_FUNCTION(5, "I0_I2SIN_D0"), + MTK_FUNCTION(6, "O_I2SO2_D0"), + MTK_FUNCTION(7, "B0_DBG_MON_A3") + ), + + MTK_PIN( + 4, "GPIO4", + MTK_EINT_FUNCTION(0, 4), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO4"), + MTK_FUNCTION(1, "B0_TP_GPIO4_AO"), + MTK_FUNCTION(2, "I0_SPDIF_IN2"), + MTK_FUNCTION(3, "O_I2SO1_MCK"), + MTK_FUNCTION(4, "I0_DMIC4_DAT"), + MTK_FUNCTION(5, "I0_I2SIN_D1"), + MTK_FUNCTION(6, "O_I2SO2_D1"), + MTK_FUNCTION(7, "B0_DBG_MON_A4") + ), + + MTK_PIN( + 5, "GPIO5", + MTK_EINT_FUNCTION(0, 5), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO5"), + 
MTK_FUNCTION(1, "B0_TP_GPIO5_AO"), + MTK_FUNCTION(2, "I0_SPDIF_IN1"), + MTK_FUNCTION(3, "O_I2SO1_BCK"), + MTK_FUNCTION(4, "I0_DMIC4_DAT_R"), + MTK_FUNCTION(5, "I0_I2SIN_D2"), + MTK_FUNCTION(6, "O_I2SO2_D2"), + MTK_FUNCTION(7, "B0_DBG_MON_A5") + ), + + MTK_PIN( + 6, "GPIO6", + MTK_EINT_FUNCTION(0, 6), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO6"), + MTK_FUNCTION(1, "B0_TP_GPIO6_AO"), + MTK_FUNCTION(2, "I0_SPDIF_IN0"), + MTK_FUNCTION(3, "O_I2SO1_WS"), + MTK_FUNCTION(4, "O_DMIC1_CLK"), + MTK_FUNCTION(5, "I0_I2SIN_D3"), + MTK_FUNCTION(6, "O_I2SO2_D3"), + MTK_FUNCTION(7, "B0_MD32_0_GPIO0") + ), + + MTK_PIN( + 7, "GPIO7", + MTK_EINT_FUNCTION(0, 7), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO7"), + MTK_FUNCTION(1, "B0_TP_GPIO7_AO"), + MTK_FUNCTION(2, "O_SPIM3_CSB"), + MTK_FUNCTION(3, "B0_TDMIN_MCK"), + MTK_FUNCTION(4, "I0_DMIC1_DAT"), + MTK_FUNCTION(5, "O_CMVREF0"), + MTK_FUNCTION(6, "O_CLKM0"), + MTK_FUNCTION(7, "B0_DBG_MON_A6") + ), + + MTK_PIN( + 8, "GPIO8", + MTK_EINT_FUNCTION(0, 8), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO8"), + MTK_FUNCTION(1, "B0_TP_GPIO0_AO"), + MTK_FUNCTION(2, "O_SPIM3_CLK"), + MTK_FUNCTION(3, "B0_TDMIN_BCK"), + MTK_FUNCTION(4, "I0_DMIC1_DAT_R"), + MTK_FUNCTION(5, "O_CMVREF1"), + MTK_FUNCTION(6, "O_CLKM1"), + MTK_FUNCTION(7, "B0_DBG_MON_A7") + ), + + MTK_PIN( + 9, "GPIO9", + MTK_EINT_FUNCTION(0, 9), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO9"), + MTK_FUNCTION(1, "B0_TP_GPIO1_AO"), + MTK_FUNCTION(2, "B0_SPIM3_MOSI"), + MTK_FUNCTION(3, "B0_TDMIN_LRCK"), + MTK_FUNCTION(4, "O_DMIC2_CLK"), + MTK_FUNCTION(5, "O_CMFLASH0"), + MTK_FUNCTION(6, "O_PWM_0"), + MTK_FUNCTION(7, "B0_DBG_MON_A8") + ), + + MTK_PIN( + 10, "GPIO10", + MTK_EINT_FUNCTION(0, 10), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO10"), + MTK_FUNCTION(1, "B0_TP_GPIO2_AO"), + MTK_FUNCTION(2, "B0_SPIM3_MISO"), + MTK_FUNCTION(3, "I0_TDMIN_DI"), + MTK_FUNCTION(4, "I0_DMIC2_DAT"), + MTK_FUNCTION(5, "O_CMFLASH1"), + MTK_FUNCTION(6, "O_PWM_1"), + MTK_FUNCTION(7, "B0_DBG_MON_A9") + ), + + MTK_PIN( + 11, "GPIO11", + 
MTK_EINT_FUNCTION(0, 11), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO11"), + MTK_FUNCTION(1, "B0_TP_GPIO3_AO"), + MTK_FUNCTION(2, "O_SPDIF_OUT"), + MTK_FUNCTION(3, "O_I2SO1_D0"), + MTK_FUNCTION(4, "I0_DMIC2_DAT_R"), + MTK_FUNCTION(5, "I0_DVFSRC_EXT_REQ"), + MTK_FUNCTION(6, "O_CMVREF6"), + MTK_FUNCTION(7, "B0_DBG_MON_A10") + ), + + MTK_PIN( + 12, "GPIO12", + MTK_EINT_FUNCTION(0, 12), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO12"), + MTK_FUNCTION(1, "B0_TP_GPIO4_AO"), + MTK_FUNCTION(2, "O_SPIM4_CSB"), + MTK_FUNCTION(3, "B1_JTMS_SEL3"), + MTK_FUNCTION(4, "B1_APU_JTAG_TMS"), + MTK_FUNCTION(5, "I0_VPU_UDI_TMS"), + MTK_FUNCTION(6, "I0_IPU_JTAG_TMS"), + MTK_FUNCTION(7, "I0_HDMITX20_HTPLG") + ), + + MTK_PIN( + 13, "GPIO13", + MTK_EINT_FUNCTION(0, 13), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO13"), + MTK_FUNCTION(1, "B0_TP_GPIO5_AO"), + MTK_FUNCTION(2, "O_SPIM4_CLK"), + MTK_FUNCTION(3, "I0_JTCK_SEL3"), + MTK_FUNCTION(4, "I0_APU_JTAG_TCK"), + MTK_FUNCTION(5, "I0_VPU_UDI_TCK"), + MTK_FUNCTION(6, "I0_IPU_JTAG_TCK"), + MTK_FUNCTION(7, "B1_HDMITX20_CEC") + ), + + MTK_PIN( + 14, "GPIO14", + MTK_EINT_FUNCTION(0, 14), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO14"), + MTK_FUNCTION(1, "B0_TP_GPIO6_AO"), + MTK_FUNCTION(2, "B0_SPIM4_MOSI"), + MTK_FUNCTION(3, "I1_JTDI_SEL3"), + MTK_FUNCTION(4, "I1_APU_JTAG_TDI"), + MTK_FUNCTION(5, "I0_VPU_UDI_TDI"), + MTK_FUNCTION(6, "I0_IPU_JTAG_TDI"), + MTK_FUNCTION(7, "B1_HDMITX20_SCL") + ), + + MTK_PIN( + 15, "GPIO15", + MTK_EINT_FUNCTION(0, 15), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO15"), + MTK_FUNCTION(1, "B0_TP_GPIO7_AO"), + MTK_FUNCTION(2, "B0_SPIM4_MISO"), + MTK_FUNCTION(3, "O_JTDO_SEL3"), + MTK_FUNCTION(4, "O_APU_JTAG_TDO"), + MTK_FUNCTION(5, "O_VPU_UDI_TDO"), + MTK_FUNCTION(6, "O_IPU_JTAG_TDO"), + MTK_FUNCTION(7, "B1_HDMITX20_SDA") + ), + + MTK_PIN( + 16, "GPIO16", + MTK_EINT_FUNCTION(0, 16), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO16"), + MTK_FUNCTION(1, "B0_TP_GPIO0_AO"), + MTK_FUNCTION(2, "O_UTXD3"), + MTK_FUNCTION(3, "I1_JTRSTn_SEL3"), + MTK_FUNCTION(4, 
"I0_APU_JTAG_TRST"), + MTK_FUNCTION(5, "I0_VPU_UDI_NTRST"), + MTK_FUNCTION(6, "I0_IPU_JTAG_TRST"), + MTK_FUNCTION(7, "O_HDMITX20_PWR5V") + ), + + MTK_PIN( + 17, "GPIO17", + MTK_EINT_FUNCTION(0, 17), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO17"), + MTK_FUNCTION(1, "B0_TP_GPIO1_AO"), + MTK_FUNCTION(2, "I1_URXD3"), + MTK_FUNCTION(3, "O_CMFLASH2"), + MTK_FUNCTION(4, "I0_EDP_TX_HPD"), + MTK_FUNCTION(5, "I0_DVFSRC_EXT_REQ"), + MTK_FUNCTION(6, "O_CMVREF7"), + MTK_FUNCTION(7, "B0_MD32_0_GPIO1") + ), + + MTK_PIN( + 18, "GPIO18", + MTK_EINT_FUNCTION(0, 18), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO18"), + MTK_FUNCTION(1, "B0_TP_GPIO2_AO"), + MTK_FUNCTION(2, "O_CMFLASH0"), + MTK_FUNCTION(3, "O_CMVREF4"), + MTK_FUNCTION(4, "B0_TDMIN_MCK"), + MTK_FUNCTION(5, "O_UTXD1"), + MTK_FUNCTION(6, "O_TP_UTXD1_AO"), + MTK_FUNCTION(7, "B0_DBG_MON_A11") + ), + + MTK_PIN( + 19, "GPIO19", + MTK_EINT_FUNCTION(0, 19), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO19"), + MTK_FUNCTION(1, "B0_TP_GPIO3_AO"), + MTK_FUNCTION(2, "O_CMFLASH1"), + MTK_FUNCTION(3, "O_CMVREF5"), + MTK_FUNCTION(4, "B0_TDMIN_BCK"), + MTK_FUNCTION(5, "I1_URXD1"), + MTK_FUNCTION(6, "I1_TP_URXD1_AO"), + MTK_FUNCTION(7, "B0_DBG_MON_A12") + ), + + MTK_PIN( + 20, "GPIO20", + MTK_EINT_FUNCTION(0, 20), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO20"), + MTK_FUNCTION(1, "B0_TP_GPIO4_AO"), + MTK_FUNCTION(2, "O_CMFLASH2"), + MTK_FUNCTION(3, "O_CLKM2"), + MTK_FUNCTION(4, "B0_TDMIN_LRCK"), + MTK_FUNCTION(5, "O_URTS1"), + MTK_FUNCTION(6, "O_TP_URTS1_AO"), + MTK_FUNCTION(7, "B0_DBG_MON_A13") + ), + + MTK_PIN( + 21, "GPIO21", + MTK_EINT_FUNCTION(0, 21), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO21"), + MTK_FUNCTION(1, "B0_TP_GPIO5_AO"), + MTK_FUNCTION(2, "O_CMFLASH3"), + MTK_FUNCTION(3, "O_CLKM3"), + MTK_FUNCTION(4, "I0_TDMIN_DI"), + MTK_FUNCTION(5, "I1_UCTS1"), + MTK_FUNCTION(6, "I1_TP_UCTS1_AO"), + MTK_FUNCTION(7, "B0_DBG_MON_A14") + ), + + MTK_PIN( + 22, "GPIO22", + MTK_EINT_FUNCTION(0, 22), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO22"), + MTK_FUNCTION(1, 
"O_CMMCLK0"), + MTK_FUNCTION(5, "B0_TP_GPIO6_AO"), + MTK_FUNCTION(7, "B0_DBG_MON_A15") + ), + + MTK_PIN( + 23, "GPIO23", + MTK_EINT_FUNCTION(0, 23), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO23"), + MTK_FUNCTION(1, "O_CMMCLK1"), + MTK_FUNCTION(3, "O_PWM_2"), + MTK_FUNCTION(4, "B1_PCIE_PHY_I2C_SCL"), + MTK_FUNCTION(5, "B0_TP_GPIO7_AO"), + MTK_FUNCTION(6, "I0_DP_TX_HPD"), + MTK_FUNCTION(7, "B0_DBG_MON_A16") + ), + + MTK_PIN( + 24, "GPIO24", + MTK_EINT_FUNCTION(0, 24), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO24"), + MTK_FUNCTION(1, "O_CMMCLK2"), + MTK_FUNCTION(3, "O_PWM_3"), + MTK_FUNCTION(4, "B1_PCIE_PHY_I2C_SDA"), + MTK_FUNCTION(5, "I0_DVFSRC_EXT_REQ"), + MTK_FUNCTION(6, "I0_EDP_TX_HPD"), + MTK_FUNCTION(7, "B0_MD32_0_GPIO2") + ), + + MTK_PIN( + 25, "GPIO25", + MTK_EINT_FUNCTION(0, 25), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO25"), + MTK_FUNCTION(1, "O_LCM_RST"), + MTK_FUNCTION(2, "O_LCM1_RST"), + MTK_FUNCTION(3, "I0_DP_TX_HPD") + ), + + MTK_PIN( + 26, "GPIO26", + MTK_EINT_FUNCTION(0, 26), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO26"), + MTK_FUNCTION(1, "I0_DSI_TE"), + MTK_FUNCTION(2, "I0_DSI1_TE"), + MTK_FUNCTION(3, "I0_EDP_TX_HPD") + ), + + MTK_PIN( + 27, "GPIO27", + MTK_EINT_FUNCTION(0, 27), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO27"), + MTK_FUNCTION(1, "O_LCM1_RST"), + MTK_FUNCTION(2, "O_LCM_RST"), + MTK_FUNCTION(3, "I0_DP_TX_HPD"), + MTK_FUNCTION(4, "O_CMVREF2"), + MTK_FUNCTION(5, "O_mbistwriteen_trigger"), + MTK_FUNCTION(6, "O_PWM_2"), + MTK_FUNCTION(7, "B0_DBG_MON_A17") + ), + + MTK_PIN( + 28, "GPIO28", + MTK_EINT_FUNCTION(0, 28), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO28"), + MTK_FUNCTION(1, "I0_DSI1_TE"), + MTK_FUNCTION(2, "I0_DSI_TE"), + MTK_FUNCTION(3, "I0_EDP_TX_HPD"), + MTK_FUNCTION(4, "O_CMVREF3"), + MTK_FUNCTION(5, "O_mbistreaden_trigger"), + MTK_FUNCTION(6, "O_PWM_3"), + MTK_FUNCTION(7, "B0_DBG_MON_A18") + ), + + MTK_PIN( + 29, "GPIO29", + MTK_EINT_FUNCTION(0, 29), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO29"), + MTK_FUNCTION(1, "O_DISP_PWM0"), + MTK_FUNCTION(2, 
"O_DISP_PWM1") + ), + + MTK_PIN( + 30, "GPIO30", + MTK_EINT_FUNCTION(0, 30), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO30"), + MTK_FUNCTION(1, "O_DISP_PWM1"), + MTK_FUNCTION(2, "O_DISP_PWM0"), + MTK_FUNCTION(3, "O_CMFLASH3"), + MTK_FUNCTION(4, "O_PWM_1"), + MTK_FUNCTION(7, "B0_DBG_MON_A19") + ), + + MTK_PIN( + 31, "GPIO31", + MTK_EINT_FUNCTION(0, 31), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO31"), + MTK_FUNCTION(1, "O_UTXD0"), + MTK_FUNCTION(2, "O_TP_UTXD1_AO"), + MTK_FUNCTION(3, "O_ADSP_UTXD0"), + MTK_FUNCTION(4, "O_TP_UTXD2_AO"), + MTK_FUNCTION(5, "O_MD32_0_TXD"), + MTK_FUNCTION(6, "O_MD32_1_TXD"), + MTK_FUNCTION(7, "O_SSPM_UTXD_AO") + ), + + MTK_PIN( + 32, "GPIO32", + MTK_EINT_FUNCTION(0, 32), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO32"), + MTK_FUNCTION(1, "I1_URXD0"), + MTK_FUNCTION(2, "I1_TP_URXD1_AO"), + MTK_FUNCTION(3, "I1_ADSP_URXD0"), + MTK_FUNCTION(4, "I1_TP_URXD2_AO"), + MTK_FUNCTION(5, "I1_MD32_0_RXD"), + MTK_FUNCTION(6, "I1_MD32_1_RXD"), + MTK_FUNCTION(7, "I1_SSPM_URXD_AO") + ), + + MTK_PIN( + 33, "GPIO33", + MTK_EINT_FUNCTION(0, 33), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO33"), + MTK_FUNCTION(1, "O_UTXD1"), + MTK_FUNCTION(2, "O_URTS2"), + MTK_FUNCTION(3, "O_ADSP_UTXD0"), + MTK_FUNCTION(4, "O_TP_UTXD1_AO"), + MTK_FUNCTION(5, "O_mbistwriteen_trigger"), + MTK_FUNCTION(6, "O_MD32_0_TXD"), + MTK_FUNCTION(7, "O_SSPM_UTXD_AO") + ), + + MTK_PIN( + 34, "GPIO34", + MTK_EINT_FUNCTION(0, 34), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO34"), + MTK_FUNCTION(1, "I1_URXD1"), + MTK_FUNCTION(2, "I1_UCTS2"), + MTK_FUNCTION(3, "I1_ADSP_URXD0"), + MTK_FUNCTION(4, "I1_TP_URXD1_AO"), + MTK_FUNCTION(5, "O_mbistreaden_trigger"), + MTK_FUNCTION(6, "I1_MD32_0_RXD"), + MTK_FUNCTION(7, "I1_SSPM_URXD_AO") + ), + + MTK_PIN( + 35, "GPIO35", + MTK_EINT_FUNCTION(0, 35), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO35"), + MTK_FUNCTION(1, "O_UTXD2"), + MTK_FUNCTION(2, "O_URTS1"), + MTK_FUNCTION(3, "O_ADSP_UTXD0"), + MTK_FUNCTION(4, "O_TP_URTS1_AO"), + MTK_FUNCTION(5, "O_TP_UTXD2_AO"), + MTK_FUNCTION(6, 
"O_MD32_1_TXD"), + MTK_FUNCTION(7, "B0_DBG_MON_A20") + ), + + MTK_PIN( + 36, "GPIO36", + MTK_EINT_FUNCTION(0, 36), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO36"), + MTK_FUNCTION(1, "I1_URXD2"), + MTK_FUNCTION(2, "I1_UCTS1"), + MTK_FUNCTION(3, "I1_ADSP_URXD0"), + MTK_FUNCTION(4, "I1_TP_UCTS1_AO"), + MTK_FUNCTION(5, "I1_TP_URXD2_AO"), + MTK_FUNCTION(6, "I1_MD32_1_RXD"), + MTK_FUNCTION(7, "B0_DBG_MON_A21") + ), + + MTK_PIN( + 37, "GPIO37", + MTK_EINT_FUNCTION(0, 37), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO37"), + MTK_FUNCTION(1, "B1_JTMS_SEL1"), + MTK_FUNCTION(2, "I0_UDI_TMS"), + MTK_FUNCTION(3, "I1_SPM_JTAG_TMS"), + MTK_FUNCTION(4, "I1_ADSP_JTAG0_TMS"), + MTK_FUNCTION(5, "I1_SCP_JTAG0_TMS"), + MTK_FUNCTION(6, "I1_CCU0_JTAG_TMS"), + MTK_FUNCTION(7, "I1_MCUPM_JTAG_TMS") + ), + + MTK_PIN( + 38, "GPIO38", + MTK_EINT_FUNCTION(0, 38), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO38"), + MTK_FUNCTION(1, "I0_JTCK_SEL1"), + MTK_FUNCTION(2, "I0_UDI_TCK"), + MTK_FUNCTION(3, "I1_SPM_JTAG_TCK"), + MTK_FUNCTION(4, "I0_ADSP_JTAG0_TCK"), + MTK_FUNCTION(5, "I1_SCP_JTAG0_TCK"), + MTK_FUNCTION(6, "I1_CCU0_JTAG_TCK"), + MTK_FUNCTION(7, "I1_MCUPM_JTAG_TCK") + ), + + MTK_PIN( + 39, "GPIO39", + MTK_EINT_FUNCTION(0, 39), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO39"), + MTK_FUNCTION(1, "I1_JTDI_SEL1"), + MTK_FUNCTION(2, "I0_UDI_TDI"), + MTK_FUNCTION(3, "I1_SPM_JTAG_TDI"), + MTK_FUNCTION(4, "I1_ADSP_JTAG0_TDI"), + MTK_FUNCTION(5, "I1_SCP_JTAG0_TDI"), + MTK_FUNCTION(6, "I1_CCU0_JTAG_TDI"), + MTK_FUNCTION(7, "I1_MCUPM_JTAG_TDI") + ), + + MTK_PIN( + 40, "GPIO40", + MTK_EINT_FUNCTION(0, 40), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO40"), + MTK_FUNCTION(1, "O_JTDO_SEL1"), + MTK_FUNCTION(2, "O_UDI_TDO"), + MTK_FUNCTION(3, "O_SPM_JTAG_TDO"), + MTK_FUNCTION(4, "O_ADSP_JTAG0_TDO"), + MTK_FUNCTION(5, "O_SCP_JTAG0_TDO"), + MTK_FUNCTION(6, "O_CCU0_JTAG_TDO"), + MTK_FUNCTION(7, "O_MCUPM_JTAG_TDO") + ), + + MTK_PIN( + 41, "GPIO41", + MTK_EINT_FUNCTION(0, 41), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO41"), + MTK_FUNCTION(1, 
"I1_JTRSTn_SEL1"), + MTK_FUNCTION(2, "I0_UDI_NTRST"), + MTK_FUNCTION(3, "I0_SPM_JTAG_TRSTN"), + MTK_FUNCTION(4, "I1_ADSP_JTAG0_TRSTN"), + MTK_FUNCTION(5, "I0_SCP_JTAG0_TRSTN"), + MTK_FUNCTION(6, "I1_CCU0_JTAG_TRST"), + MTK_FUNCTION(7, "I0_MCUPM_JTAG_TRSTN") + ), + + MTK_PIN( + 42, "GPIO42", + MTK_EINT_FUNCTION(0, 42), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO42"), + MTK_FUNCTION(1, "B1_KPCOL0") + ), + + MTK_PIN( + 43, "GPIO43", + MTK_EINT_FUNCTION(0, 43), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO43"), + MTK_FUNCTION(1, "B1_KPCOL1"), + MTK_FUNCTION(2, "I0_DP_TX_HPD"), + MTK_FUNCTION(3, "O_CMFLASH2"), + MTK_FUNCTION(4, "I0_DVFSRC_EXT_REQ"), + MTK_FUNCTION(7, "O_mbistwriteen_trigger") + ), + + MTK_PIN( + 44, "GPIO44", + MTK_EINT_FUNCTION(0, 44), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO44"), + MTK_FUNCTION(1, "B1_KPROW0") + ), + + MTK_PIN( + 45, "GPIO45", + MTK_EINT_FUNCTION(0, 45), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO45"), + MTK_FUNCTION(1, "B1_KPROW1"), + MTK_FUNCTION(2, "I0_EDP_TX_HPD"), + MTK_FUNCTION(3, "O_CMFLASH3"), + MTK_FUNCTION(4, "B0_I2SIN_MCK"), + MTK_FUNCTION(7, "O_mbistreaden_trigger") + ), + + MTK_PIN( + 46, "GPIO46", + MTK_EINT_FUNCTION(0, 46), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO46"), + MTK_FUNCTION(1, "I0_DP_TX_HPD"), + MTK_FUNCTION(2, "O_PWM_0"), + MTK_FUNCTION(3, "I0_VBUSVALID_2P"), + MTK_FUNCTION(7, "B0_DBG_MON_A22") + ), + + MTK_PIN( + 47, "GPIO47", + MTK_EINT_FUNCTION(0, 47), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO47"), + MTK_FUNCTION(1, "I1_WAKEN"), + MTK_FUNCTION(6, "O_GDU_TROOPS_DET0") + ), + + MTK_PIN( + 48, "GPIO48", + MTK_EINT_FUNCTION(0, 48), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO48"), + MTK_FUNCTION(1, "O_PERSTN"), + MTK_FUNCTION(6, "O_GDU_TROOPS_DET1") + ), + + MTK_PIN( + 49, "GPIO49", + MTK_EINT_FUNCTION(0, 49), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO49"), + MTK_FUNCTION(1, "B1_CLKREQN"), + MTK_FUNCTION(6, "O_GDU_TROOPS_DET2") + ), + + MTK_PIN( + 50, "GPIO50", + MTK_EINT_FUNCTION(0, 50), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO50"), + MTK_FUNCTION(1, 
"O_HDMITX20_PWR5V"), + MTK_FUNCTION(3, "I1_IDDIG_1P"), + MTK_FUNCTION(4, "I1_SCP_JTAG1_TMS"), + MTK_FUNCTION(5, "I1_SSPM_JTAG_TMS"), + MTK_FUNCTION(6, "I1_MD32_0_JTAG_TMS"), + MTK_FUNCTION(7, "I1_MD32_1_JTAG_TMS") + ), + + MTK_PIN( + 51, "GPIO51", + MTK_EINT_FUNCTION(0, 51), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO51"), + MTK_FUNCTION(1, "I0_HDMITX20_HTPLG"), + MTK_FUNCTION(2, "I0_EDP_TX_HPD"), + MTK_FUNCTION(3, "O_USB_DRVVBUS_1P"), + MTK_FUNCTION(4, "I1_SCP_JTAG1_TCK"), + MTK_FUNCTION(5, "I1_SSPM_JTAG_TCK"), + MTK_FUNCTION(6, "I1_MD32_0_JTAG_TCK"), + MTK_FUNCTION(7, "I1_MD32_1_JTAG_TCK") + ), + + MTK_PIN( + 52, "GPIO52", + MTK_EINT_FUNCTION(0, 52), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO52"), + MTK_FUNCTION(1, "B1_HDMITX20_CEC"), + MTK_FUNCTION(3, "I0_VBUSVALID_1P"), + MTK_FUNCTION(4, "I1_SCP_JTAG1_TDI"), + MTK_FUNCTION(5, "I1_SSPM_JTAG_TDI"), + MTK_FUNCTION(6, "I1_MD32_0_JTAG_TDI"), + MTK_FUNCTION(7, "I1_MD32_1_JTAG_TDI") + ), + + MTK_PIN( + 53, "GPIO53", + MTK_EINT_FUNCTION(0, 53), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO53"), + MTK_FUNCTION(1, "B1_HDMITX20_SCL"), + MTK_FUNCTION(3, "I1_IDDIG_2P"), + MTK_FUNCTION(4, "O_SCP_JTAG1_TDO"), + MTK_FUNCTION(5, "O_SSPM_JTAG_TDO"), + MTK_FUNCTION(6, "O_MD32_0_JTAG_TDO"), + MTK_FUNCTION(7, "O_MD32_1_JTAG_TDO") + ), + + MTK_PIN( + 54, "GPIO54", + MTK_EINT_FUNCTION(0, 54), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO54"), + MTK_FUNCTION(1, "B1_HDMITX20_SDA"), + MTK_FUNCTION(3, "O_USB_DRVVBUS_2P"), + MTK_FUNCTION(4, "I0_SCP_JTAG1_TRSTN"), + MTK_FUNCTION(5, "I0_SSPM_JTAG_TRSTN"), + MTK_FUNCTION(6, "I1_MD32_0_JTAG_TRST"), + MTK_FUNCTION(7, "I1_MD32_1_JTAG_TRST") + ), + + MTK_PIN( + 55, "GPIO55", + MTK_EINT_FUNCTION(0, 55), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO55"), + MTK_FUNCTION(1, "B1_SCL0"), + MTK_FUNCTION(2, "B1_SCP_SCL0"), + MTK_FUNCTION(3, "B1_SCP_SCL1"), + MTK_FUNCTION(4, "B1_PCIE_PHY_I2C_SCL") + ), + + MTK_PIN( + 56, "GPIO56", + MTK_EINT_FUNCTION(0, 56), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO56"), + MTK_FUNCTION(1, "B1_SDA0"), + 
MTK_FUNCTION(2, "B1_SCP_SDA0"), + MTK_FUNCTION(3, "B1_SCP_SDA1"), + MTK_FUNCTION(4, "B1_PCIE_PHY_I2C_SDA") + ), + + MTK_PIN( + 57, "GPIO57", + MTK_EINT_FUNCTION(0, 57), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO57"), + MTK_FUNCTION(1, "B1_SCL1") + ), + + MTK_PIN( + 58, "GPIO58", + MTK_EINT_FUNCTION(0, 58), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO58"), + MTK_FUNCTION(1, "B1_SDA1") + ), + + MTK_PIN( + 59, "GPIO59", + MTK_EINT_FUNCTION(0, 59), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO59"), + MTK_FUNCTION(1, "B1_SCL2"), + MTK_FUNCTION(2, "B1_SCP_SCL0"), + MTK_FUNCTION(3, "B1_SCP_SCL1") + ), + + MTK_PIN( + 60, "GPIO60", + MTK_EINT_FUNCTION(0, 60), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO60"), + MTK_FUNCTION(1, "B1_SDA2"), + MTK_FUNCTION(2, "B1_SCP_SDA0"), + MTK_FUNCTION(3, "B1_SCP_SDA1") + ), + + MTK_PIN( + 61, "GPIO61", + MTK_EINT_FUNCTION(0, 61), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO61"), + MTK_FUNCTION(1, "B1_SCL3"), + MTK_FUNCTION(2, "B1_SCP_SCL0"), + MTK_FUNCTION(3, "B1_SCP_SCL1"), + MTK_FUNCTION(4, "B1_PCIE_PHY_I2C_SCL") + ), + + MTK_PIN( + 62, "GPIO62", + MTK_EINT_FUNCTION(0, 62), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO62"), + MTK_FUNCTION(1, "B1_SDA3"), + MTK_FUNCTION(2, "B1_SCP_SDA0"), + MTK_FUNCTION(3, "B1_SCP_SDA1"), + MTK_FUNCTION(4, "B1_PCIE_PHY_I2C_SDA") + ), + + MTK_PIN( + 63, "GPIO63", + MTK_EINT_FUNCTION(0, 63), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO63"), + MTK_FUNCTION(1, "B1_SCL4") + ), + + MTK_PIN( + 64, "GPIO64", + MTK_EINT_FUNCTION(0, 64), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO64"), + MTK_FUNCTION(1, "B1_SDA4") + ), + + MTK_PIN( + 65, "GPIO65", + MTK_EINT_FUNCTION(0, 65), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO65"), + MTK_FUNCTION(1, "B1_SCL5"), + MTK_FUNCTION(2, "B1_SCP_SCL0"), + MTK_FUNCTION(3, "B1_SCP_SCL1") + ), + + MTK_PIN( + 66, "GPIO66", + MTK_EINT_FUNCTION(0, 66), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO66"), + MTK_FUNCTION(1, "B1_SDA5"), + MTK_FUNCTION(2, "B1_SCP_SDA0"), + MTK_FUNCTION(3, "B1_SCP_SDA1") + ), + + MTK_PIN( + 67, "GPIO67", + MTK_EINT_FUNCTION(0, 
67), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO67"), + MTK_FUNCTION(1, "B1_SCL6"), + MTK_FUNCTION(2, "B1_SCP_SCL0"), + MTK_FUNCTION(3, "B1_SCP_SCL1"), + MTK_FUNCTION(4, "B1_PCIE_PHY_I2C_SCL") + ), + + MTK_PIN( + 68, "GPIO68", + MTK_EINT_FUNCTION(0, 68), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO68"), + MTK_FUNCTION(1, "B1_SDA6"), + MTK_FUNCTION(2, "B1_SCP_SDA0"), + MTK_FUNCTION(3, "B1_SCP_SDA1"), + MTK_FUNCTION(4, "B1_PCIE_PHY_I2C_SDA") + ), + + MTK_PIN( + 69, "GPIO69", + MTK_EINT_FUNCTION(0, 69), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO69"), + MTK_FUNCTION(1, "O_SPIM0_CSB"), + MTK_FUNCTION(2, "O_SCP_SPI0_CS"), + MTK_FUNCTION(3, "O_DMIC3_CLK"), + MTK_FUNCTION(4, "B0_MD32_1_GPIO0"), + MTK_FUNCTION(5, "O_CMVREF0"), + MTK_FUNCTION(6, "O_GDU_SUM_TROOP0_0"), + MTK_FUNCTION(7, "B0_DBG_MON_A23") + ), + + MTK_PIN( + 70, "GPIO70", + MTK_EINT_FUNCTION(0, 70), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO70"), + MTK_FUNCTION(1, "O_SPIM0_CLK"), + MTK_FUNCTION(2, "O_SCP_SPI0_CK"), + MTK_FUNCTION(3, "I0_DMIC3_DAT"), + MTK_FUNCTION(4, "B0_MD32_1_GPIO1"), + MTK_FUNCTION(5, "O_CMVREF1"), + MTK_FUNCTION(6, "O_GDU_SUM_TROOP0_1"), + MTK_FUNCTION(7, "B0_DBG_MON_A24") + ), + + MTK_PIN( + 71, "GPIO71", + MTK_EINT_FUNCTION(0, 71), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO71"), + MTK_FUNCTION(1, "B0_SPIM0_MOSI"), + MTK_FUNCTION(2, "O_SCP_SPI0_MO"), + MTK_FUNCTION(3, "I0_DMIC3_DAT_R"), + MTK_FUNCTION(4, "B0_MD32_1_GPIO2"), + MTK_FUNCTION(5, "O_CMVREF2"), + MTK_FUNCTION(6, "O_GDU_SUM_TROOP0_2"), + MTK_FUNCTION(7, "B0_DBG_MON_A25") + ), + + MTK_PIN( + 72, "GPIO72", + MTK_EINT_FUNCTION(0, 72), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO72"), + MTK_FUNCTION(1, "B0_SPIM0_MISO"), + MTK_FUNCTION(2, "I0_SCP_SPI0_MI"), + MTK_FUNCTION(3, "O_DMIC4_CLK"), + MTK_FUNCTION(5, "O_CMVREF3"), + MTK_FUNCTION(6, "O_GDU_SUM_TROOP1_0"), + MTK_FUNCTION(7, "B0_DBG_MON_A26") + ), + + MTK_PIN( + 73, "GPIO73", + MTK_EINT_FUNCTION(0, 73), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO73"), + MTK_FUNCTION(1, "B0_SPIM0_MIO2"), + MTK_FUNCTION(2, "O_UTXD3"), + 
MTK_FUNCTION(3, "I0_DMIC4_DAT"), + MTK_FUNCTION(4, "O_CLKM0"), + MTK_FUNCTION(5, "O_CMVREF4"), + MTK_FUNCTION(6, "O_GDU_SUM_TROOP1_1"), + MTK_FUNCTION(7, "B0_DBG_MON_A27") + ), + + MTK_PIN( + 74, "GPIO74", + MTK_EINT_FUNCTION(0, 74), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO74"), + MTK_FUNCTION(1, "B0_SPIM0_MIO3"), + MTK_FUNCTION(2, "I1_URXD3"), + MTK_FUNCTION(3, "I0_DMIC4_DAT_R"), + MTK_FUNCTION(4, "O_CLKM1"), + MTK_FUNCTION(5, "O_CMVREF5"), + MTK_FUNCTION(6, "O_GDU_SUM_TROOP1_2"), + MTK_FUNCTION(7, "B0_DBG_MON_A28") + ), + + MTK_PIN( + 75, "GPIO75", + MTK_EINT_FUNCTION(0, 75), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO75"), + MTK_FUNCTION(1, "O_SPIM1_CSB"), + MTK_FUNCTION(2, "O_SCP_SPI1_A_CS"), + MTK_FUNCTION(3, "B0_TDMIN_MCK"), + MTK_FUNCTION(4, "B1_SCP_SCL0"), + MTK_FUNCTION(5, "O_CMVREF6"), + MTK_FUNCTION(6, "O_GDU_SUM_TROOP2_0"), + MTK_FUNCTION(7, "B0_DBG_MON_A29") + ), + + MTK_PIN( + 76, "GPIO76", + MTK_EINT_FUNCTION(0, 76), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO76"), + MTK_FUNCTION(1, "O_SPIM1_CLK"), + MTK_FUNCTION(2, "O_SCP_SPI1_A_CK"), + MTK_FUNCTION(3, "B0_TDMIN_BCK"), + MTK_FUNCTION(4, "B1_SCP_SDA0"), + MTK_FUNCTION(5, "O_CMVREF7"), + MTK_FUNCTION(6, "O_GDU_SUM_TROOP2_1"), + MTK_FUNCTION(7, "B0_DBG_MON_A30") + ), + + MTK_PIN( + 77, "GPIO77", + MTK_EINT_FUNCTION(0, 77), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO77"), + MTK_FUNCTION(1, "B0_SPIM1_MOSI"), + MTK_FUNCTION(2, "O_SCP_SPI1_A_MO"), + MTK_FUNCTION(3, "B0_TDMIN_LRCK"), + MTK_FUNCTION(4, "B1_SCP_SCL1"), + MTK_FUNCTION(6, "O_GDU_SUM_TROOP2_2"), + MTK_FUNCTION(7, "B0_DBG_MON_A31") + ), + + MTK_PIN( + 78, "GPIO78", + MTK_EINT_FUNCTION(0, 78), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO78"), + MTK_FUNCTION(1, "B0_SPIM1_MISO"), + MTK_FUNCTION(2, "I0_SCP_SPI1_A_MI"), + MTK_FUNCTION(3, "I0_TDMIN_DI"), + MTK_FUNCTION(4, "B1_SCP_SDA1"), + MTK_FUNCTION(7, "B0_DBG_MON_A32") + ), + + MTK_PIN( + 79, "GPIO79", + MTK_EINT_FUNCTION(0, 79), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO79"), + MTK_FUNCTION(1, "O_SPIM2_CSB"), + MTK_FUNCTION(2, 
"O_SCP_SPI2_CS"), + MTK_FUNCTION(3, "O_I2SO1_MCK"), + MTK_FUNCTION(4, "O_UTXD2"), + MTK_FUNCTION(5, "O_TP_UTXD2_AO"), + MTK_FUNCTION(6, "B0_PCM_SYNC"), + MTK_FUNCTION(7, "B0_DBG_MON_B0") + ), + + MTK_PIN( + 80, "GPIO80", + MTK_EINT_FUNCTION(0, 80), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO80"), + MTK_FUNCTION(1, "O_SPIM2_CLK"), + MTK_FUNCTION(2, "O_SCP_SPI2_CK"), + MTK_FUNCTION(3, "O_I2SO1_BCK"), + MTK_FUNCTION(4, "I1_URXD2"), + MTK_FUNCTION(5, "I1_TP_URXD2_AO"), + MTK_FUNCTION(6, "B0_PCM_CLK"), + MTK_FUNCTION(7, "B0_DBG_MON_B1") + ), + + MTK_PIN( + 81, "GPIO81", + MTK_EINT_FUNCTION(0, 81), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO81"), + MTK_FUNCTION(1, "B0_SPIM2_MOSI"), + MTK_FUNCTION(2, "O_SCP_SPI2_MO"), + MTK_FUNCTION(3, "O_I2SO1_WS"), + MTK_FUNCTION(4, "O_URTS2"), + MTK_FUNCTION(5, "O_TP_URTS2_AO"), + MTK_FUNCTION(6, "O_PCM_DO"), + MTK_FUNCTION(7, "B0_DBG_MON_B2") + ), + + MTK_PIN( + 82, "GPIO82", + MTK_EINT_FUNCTION(0, 82), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO82"), + MTK_FUNCTION(1, "B0_SPIM2_MISO"), + MTK_FUNCTION(2, "I0_SCP_SPI2_MI"), + MTK_FUNCTION(3, "O_I2SO1_D0"), + MTK_FUNCTION(4, "I1_UCTS2"), + MTK_FUNCTION(5, "I1_TP_UCTS2_AO"), + MTK_FUNCTION(6, "I0_PCM_DI"), + MTK_FUNCTION(7, "B0_DBG_MON_B3") + ), + + MTK_PIN( + 83, "GPIO83", + MTK_EINT_FUNCTION(0, 83), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO83"), + MTK_FUNCTION(1, "I1_IDDIG") + ), + + MTK_PIN( + 84, "GPIO84", + MTK_EINT_FUNCTION(0, 84), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO84"), + MTK_FUNCTION(1, "O_USB_DRVVBUS") + ), + + MTK_PIN( + 85, "GPIO85", + MTK_EINT_FUNCTION(0, 85), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO85"), + MTK_FUNCTION(1, "I0_VBUSVALID") + ), + + MTK_PIN( + 86, "GPIO86", + MTK_EINT_FUNCTION(0, 86), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO86"), + MTK_FUNCTION(1, "I1_IDDIG_1P"), + MTK_FUNCTION(2, "O_UTXD1"), + MTK_FUNCTION(3, "O_URTS2"), + MTK_FUNCTION(4, "O_PWM_2"), + MTK_FUNCTION(5, "B0_TP_GPIO4_AO"), + MTK_FUNCTION(6, "O_AUXIF_ST0"), + MTK_FUNCTION(7, "B0_DBG_MON_B4") + ), + + MTK_PIN( + 87, 
"GPIO87", + MTK_EINT_FUNCTION(0, 87), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO87"), + MTK_FUNCTION(1, "O_USB_DRVVBUS_1P"), + MTK_FUNCTION(2, "I1_URXD1"), + MTK_FUNCTION(3, "I1_UCTS2"), + MTK_FUNCTION(4, "O_PWM_3"), + MTK_FUNCTION(5, "B0_TP_GPIO5_AO"), + MTK_FUNCTION(6, "O_AUXIF_CLK0"), + MTK_FUNCTION(7, "B0_DBG_MON_B5") + ), + + MTK_PIN( + 88, "GPIO88", + MTK_EINT_FUNCTION(0, 88), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO88"), + MTK_FUNCTION(1, "I0_VBUSVALID_1P"), + MTK_FUNCTION(2, "O_UTXD2"), + MTK_FUNCTION(3, "O_URTS1"), + MTK_FUNCTION(4, "O_CLKM2"), + MTK_FUNCTION(5, "B0_TP_GPIO6_AO"), + MTK_FUNCTION(6, "O_AUXIF_ST1"), + MTK_FUNCTION(7, "B0_DBG_MON_B6") + ), + + MTK_PIN( + 89, "GPIO89", + MTK_EINT_FUNCTION(0, 89), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO89"), + MTK_FUNCTION(1, "I1_IDDIG_2P"), + MTK_FUNCTION(2, "I1_URXD2"), + MTK_FUNCTION(3, "I1_UCTS1"), + MTK_FUNCTION(4, "O_CLKM3"), + MTK_FUNCTION(5, "B0_TP_GPIO7_AO"), + MTK_FUNCTION(6, "O_AUXIF_CLK1"), + MTK_FUNCTION(7, "B0_DBG_MON_B7") + ), + + MTK_PIN( + 90, "GPIO90", + MTK_EINT_FUNCTION(0, 90), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO90"), + MTK_FUNCTION(1, "O_USB_DRVVBUS_2P"), + MTK_FUNCTION(2, "O_UTXD3"), + MTK_FUNCTION(3, "O_ADSP_UTXD0"), + MTK_FUNCTION(4, "O_SSPM_UTXD_AO"), + MTK_FUNCTION(5, "O_MD32_0_TXD"), + MTK_FUNCTION(6, "O_MD32_1_TXD"), + MTK_FUNCTION(7, "B0_DBG_MON_B8") + ), + + MTK_PIN( + 91, "GPIO91", + MTK_EINT_FUNCTION(0, 91), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO91"), + MTK_FUNCTION(1, "I0_VBUSVALID_2P"), + MTK_FUNCTION(2, "I1_URXD3"), + MTK_FUNCTION(3, "I1_ADSP_URXD0"), + MTK_FUNCTION(4, "I1_SSPM_URXD_AO"), + MTK_FUNCTION(5, "I1_MD32_0_RXD"), + MTK_FUNCTION(6, "I1_MD32_1_RXD"), + MTK_FUNCTION(7, "B0_DBG_MON_B9") + ), + + MTK_PIN( + 92, "GPIO92", + MTK_EINT_FUNCTION(0, 92), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO92"), + MTK_FUNCTION(1, "O_PWRAP_SPI0_CSN") + ), + + MTK_PIN( + 93, "GPIO93", + MTK_EINT_FUNCTION(0, 93), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO93"), + MTK_FUNCTION(1, "O_PWRAP_SPI0_CK") + ), + + 
MTK_PIN( + 94, "GPIO94", + MTK_EINT_FUNCTION(0, 94), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO94"), + MTK_FUNCTION(1, "B0_PWRAP_SPI0_MO"), + MTK_FUNCTION(2, "B0_PWRAP_SPI0_MI") + ), + + MTK_PIN( + 95, "GPIO95", + MTK_EINT_FUNCTION(0, 95), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO95"), + MTK_FUNCTION(1, "B0_PWRAP_SPI0_MI"), + MTK_FUNCTION(2, "B0_PWRAP_SPI0_MO") + ), + + MTK_PIN( + 96, "GPIO96", + MTK_EINT_FUNCTION(0, 96), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO96"), + MTK_FUNCTION(1, "O_SRCLKENA0") + ), + + MTK_PIN( + 97, "GPIO97", + MTK_EINT_FUNCTION(0, 97), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO97"), + MTK_FUNCTION(1, "O_SRCLKENA1") + ), + + MTK_PIN( + 98, "GPIO98", + MTK_EINT_FUNCTION(0, 98), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO98"), + MTK_FUNCTION(1, "O_SCP_VREQ_VAO"), + MTK_FUNCTION(2, "I0_DVFSRC_EXT_REQ") + ), + + MTK_PIN( + 99, "GPIO99", + MTK_EINT_FUNCTION(0, 99), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO99"), + MTK_FUNCTION(1, "I0_RTC32K_CK") + ), + + MTK_PIN( + 100, "GPIO100", + MTK_EINT_FUNCTION(0, 100), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO100"), + MTK_FUNCTION(1, "O_WATCHDOG") + ), + + MTK_PIN( + 101, "GPIO101", + MTK_EINT_FUNCTION(0, 101), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO101"), + MTK_FUNCTION(1, "O_AUD_CLK_MOSI"), + MTK_FUNCTION(2, "O_I2SO1_MCK"), + MTK_FUNCTION(3, "B0_I2SIN_BCK") + ), + + MTK_PIN( + 102, "GPIO102", + MTK_EINT_FUNCTION(0, 102), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO102"), + MTK_FUNCTION(1, "O_AUD_SYNC_MOSI"), + MTK_FUNCTION(2, "O_I2SO1_BCK"), + MTK_FUNCTION(3, "B0_I2SIN_WS") + ), + + MTK_PIN( + 103, "GPIO103", + MTK_EINT_FUNCTION(0, 103), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO103"), + MTK_FUNCTION(1, "O_AUD_DAT_MOSI0"), + MTK_FUNCTION(2, "O_I2SO1_WS"), + MTK_FUNCTION(3, "I0_I2SIN_D0") + ), + + MTK_PIN( + 104, "GPIO104", + MTK_EINT_FUNCTION(0, 104), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO104"), + MTK_FUNCTION(1, "O_AUD_DAT_MOSI1"), + MTK_FUNCTION(2, "O_I2SO1_D0"), + MTK_FUNCTION(3, "I0_I2SIN_D1") + ), + + MTK_PIN( + 105, "GPIO105", + 
MTK_EINT_FUNCTION(0, 105), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO105"), + MTK_FUNCTION(1, "I0_AUD_DAT_MISO0"), + MTK_FUNCTION(2, "I0_VOW_DAT_MISO"), + MTK_FUNCTION(3, "I0_I2SIN_D2") + ), + + MTK_PIN( + 106, "GPIO106", + MTK_EINT_FUNCTION(0, 106), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO106"), + MTK_FUNCTION(1, "I0_AUD_DAT_MISO1"), + MTK_FUNCTION(2, "I0_VOW_CLK_MISO"), + MTK_FUNCTION(3, "I0_I2SIN_D3") + ), + + MTK_PIN( + 107, "GPIO107", + MTK_EINT_FUNCTION(0, 107), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO107"), + MTK_FUNCTION(1, "B0_I2SIN_MCK"), + MTK_FUNCTION(2, "I0_SPLIN_MCK"), + MTK_FUNCTION(3, "I0_SPDIF_IN0"), + MTK_FUNCTION(4, "O_CMVREF4"), + MTK_FUNCTION(5, "O_AUXIF_ST0"), + MTK_FUNCTION(6, "O_PGD_LV_LSC_PWR0") + ), + + MTK_PIN( + 108, "GPIO108", + MTK_EINT_FUNCTION(0, 108), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO108"), + MTK_FUNCTION(1, "B0_I2SIN_BCK"), + MTK_FUNCTION(2, "I0_SPLIN_LRCK"), + MTK_FUNCTION(3, "O_DMIC4_CLK"), + MTK_FUNCTION(4, "O_CMVREF5"), + MTK_FUNCTION(5, "O_AUXIF_CLK0"), + MTK_FUNCTION(6, "O_PGD_LV_LSC_PWR1"), + MTK_FUNCTION(7, "B0_DBG_MON_B10") + ), + + MTK_PIN( + 109, "GPIO109", + MTK_EINT_FUNCTION(0, 109), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO109"), + MTK_FUNCTION(1, "B0_I2SIN_WS"), + MTK_FUNCTION(2, "I0_SPLIN_BCK"), + MTK_FUNCTION(3, "I0_DMIC4_DAT"), + MTK_FUNCTION(4, "O_CMVREF6"), + MTK_FUNCTION(5, "O_AUXIF_ST1"), + MTK_FUNCTION(6, "O_PGD_LV_LSC_PWR2"), + MTK_FUNCTION(7, "B0_DBG_MON_B11") + ), + + MTK_PIN( + 110, "GPIO110", + MTK_EINT_FUNCTION(0, 110), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO110"), + MTK_FUNCTION(1, "I0_I2SIN_D0"), + MTK_FUNCTION(2, "I0_SPLIN_D0"), + MTK_FUNCTION(3, "I0_DMIC4_DAT_R"), + MTK_FUNCTION(4, "O_CMVREF7"), + MTK_FUNCTION(5, "O_AUXIF_CLK1"), + MTK_FUNCTION(6, "O_PGD_LV_LSC_PWR3"), + MTK_FUNCTION(7, "B0_DBG_MON_B12") + ), + + MTK_PIN( + 111, "GPIO111", + MTK_EINT_FUNCTION(0, 111), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO111"), + MTK_FUNCTION(1, "I0_I2SIN_D1"), + MTK_FUNCTION(2, "I0_SPLIN_D1"), + MTK_FUNCTION(3, "O_DMIC3_CLK"), 
+ MTK_FUNCTION(4, "O_SPDIF_OUT"), + MTK_FUNCTION(6, "O_PGD_LV_LSC_PWR4"), + MTK_FUNCTION(7, "B0_DBG_MON_B13") + ), + + MTK_PIN( + 112, "GPIO112", + MTK_EINT_FUNCTION(0, 112), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO112"), + MTK_FUNCTION(1, "I0_I2SIN_D2"), + MTK_FUNCTION(2, "I0_SPLIN_D2"), + MTK_FUNCTION(3, "I0_DMIC3_DAT"), + MTK_FUNCTION(4, "B0_TDMIN_MCK"), + MTK_FUNCTION(5, "O_I2SO1_WS"), + MTK_FUNCTION(6, "O_PGD_LV_LSC_PWR5"), + MTK_FUNCTION(7, "B0_DBG_MON_B14") + ), + + MTK_PIN( + 113, "GPIO113", + MTK_EINT_FUNCTION(0, 113), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO113"), + MTK_FUNCTION(1, "I0_I2SIN_D3"), + MTK_FUNCTION(2, "I0_SPLIN_D3"), + MTK_FUNCTION(3, "I0_DMIC3_DAT_R"), + MTK_FUNCTION(4, "B0_TDMIN_BCK"), + MTK_FUNCTION(5, "O_I2SO1_D0"), + MTK_FUNCTION(7, "B0_DBG_MON_B15") + ), + + MTK_PIN( + 114, "GPIO114", + MTK_EINT_FUNCTION(0, 114), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO114"), + MTK_FUNCTION(1, "O_I2SO2_MCK"), + MTK_FUNCTION(2, "B0_I2SIN_MCK"), + MTK_FUNCTION(3, "I1_MCUPM_JTAG_TMS"), + MTK_FUNCTION(4, "B1_APU_JTAG_TMS"), + MTK_FUNCTION(5, "I1_SCP_JTAG1_TMS"), + MTK_FUNCTION(6, "I1_SPM_JTAG_TMS"), + MTK_FUNCTION(7, "B0_DBG_MON_B16") + ), + + MTK_PIN( + 115, "GPIO115", + MTK_EINT_FUNCTION(0, 115), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO115"), + MTK_FUNCTION(1, "B0_I2SO2_BCK"), + MTK_FUNCTION(2, "B0_I2SIN_BCK"), + MTK_FUNCTION(3, "I1_MCUPM_JTAG_TCK"), + MTK_FUNCTION(4, "I0_APU_JTAG_TCK"), + MTK_FUNCTION(5, "I1_SCP_JTAG1_TCK"), + MTK_FUNCTION(6, "I1_SPM_JTAG_TCK"), + MTK_FUNCTION(7, "B0_DBG_MON_B17") + ), + + MTK_PIN( + 116, "GPIO116", + MTK_EINT_FUNCTION(0, 116), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO116"), + MTK_FUNCTION(1, "B0_I2SO2_WS"), + MTK_FUNCTION(2, "B0_I2SIN_WS"), + MTK_FUNCTION(3, "I1_MCUPM_JTAG_TDI"), + MTK_FUNCTION(4, "I1_APU_JTAG_TDI"), + MTK_FUNCTION(5, "I1_SCP_JTAG1_TDI"), + MTK_FUNCTION(6, "I1_SPM_JTAG_TDI"), + MTK_FUNCTION(7, "B0_DBG_MON_B18") + ), + + MTK_PIN( + 117, "GPIO117", + MTK_EINT_FUNCTION(0, 117), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO117"), 
+ MTK_FUNCTION(1, "O_I2SO2_D0"), + MTK_FUNCTION(2, "I0_I2SIN_D0"), + MTK_FUNCTION(3, "O_MCUPM_JTAG_TDO"), + MTK_FUNCTION(4, "O_APU_JTAG_TDO"), + MTK_FUNCTION(5, "O_SCP_JTAG1_TDO"), + MTK_FUNCTION(6, "O_SPM_JTAG_TDO"), + MTK_FUNCTION(7, "B0_DBG_MON_B19") + ), + + MTK_PIN( + 118, "GPIO118", + MTK_EINT_FUNCTION(0, 118), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO118"), + MTK_FUNCTION(1, "O_I2SO2_D1"), + MTK_FUNCTION(2, "I0_I2SIN_D1"), + MTK_FUNCTION(3, "I0_MCUPM_JTAG_TRSTN"), + MTK_FUNCTION(4, "I0_APU_JTAG_TRST"), + MTK_FUNCTION(5, "I0_SCP_JTAG1_TRSTN"), + MTK_FUNCTION(6, "I0_SPM_JTAG_TRSTN"), + MTK_FUNCTION(7, "B0_DBG_MON_B20") + ), + + MTK_PIN( + 119, "GPIO119", + MTK_EINT_FUNCTION(0, 119), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO119"), + MTK_FUNCTION(1, "O_I2SO2_D2"), + MTK_FUNCTION(2, "I0_I2SIN_D2"), + MTK_FUNCTION(3, "O_UTXD3"), + MTK_FUNCTION(4, "B0_TDMIN_LRCK"), + MTK_FUNCTION(5, "O_I2SO1_MCK"), + MTK_FUNCTION(6, "O_SSPM_UTXD_AO"), + MTK_FUNCTION(7, "B0_DBG_MON_B21") + ), + + MTK_PIN( + 120, "GPIO120", + MTK_EINT_FUNCTION(0, 120), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO120"), + MTK_FUNCTION(1, "O_I2SO2_D3"), + MTK_FUNCTION(2, "I0_I2SIN_D3"), + MTK_FUNCTION(3, "I1_URXD3"), + MTK_FUNCTION(4, "I0_TDMIN_DI"), + MTK_FUNCTION(5, "O_I2SO1_BCK"), + MTK_FUNCTION(6, "I1_SSPM_URXD_AO"), + MTK_FUNCTION(7, "B0_DBG_MON_B22") + ), + + MTK_PIN( + 121, "GPIO121", + MTK_EINT_FUNCTION(0, 121), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO121"), + MTK_FUNCTION(1, "B0_PCM_CLK"), + MTK_FUNCTION(2, "O_SPIM4_CSB"), + MTK_FUNCTION(3, "O_SCP_SPI1_B_CS"), + MTK_FUNCTION(4, "O_TP_UTXD2_AO"), + MTK_FUNCTION(5, "O_AUXIF_ST0"), + MTK_FUNCTION(6, "O_PGD_DA_EFUSE_RDY"), + MTK_FUNCTION(7, "B0_DBG_MON_B23") + ), + + MTK_PIN( + 122, "GPIO122", + MTK_EINT_FUNCTION(0, 122), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO122"), + MTK_FUNCTION(1, "B0_PCM_SYNC"), + MTK_FUNCTION(2, "O_SPIM4_CLK"), + MTK_FUNCTION(3, "O_SCP_SPI1_B_CK"), + MTK_FUNCTION(4, "I1_TP_URXD2_AO"), + MTK_FUNCTION(5, "O_AUXIF_CLK0"), + MTK_FUNCTION(6, 
"O_PGD_DA_EFUSE_RDY_PRE"), + MTK_FUNCTION(7, "B0_DBG_MON_B24") + ), + + MTK_PIN( + 123, "GPIO123", + MTK_EINT_FUNCTION(0, 123), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO123"), + MTK_FUNCTION(1, "O_PCM_DO"), + MTK_FUNCTION(2, "B0_SPIM4_MOSI"), + MTK_FUNCTION(3, "O_SCP_SPI1_B_MO"), + MTK_FUNCTION(4, "O_TP_URTS2_AO"), + MTK_FUNCTION(5, "O_AUXIF_ST1"), + MTK_FUNCTION(6, "O_PGD_DA_PWRGD_RESET"), + MTK_FUNCTION(7, "B0_DBG_MON_B25") + ), + + MTK_PIN( + 124, "GPIO124", + MTK_EINT_FUNCTION(0, 124), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO124"), + MTK_FUNCTION(1, "I0_PCM_DI"), + MTK_FUNCTION(2, "B0_SPIM4_MISO"), + MTK_FUNCTION(3, "I0_SCP_SPI1_B_MI"), + MTK_FUNCTION(4, "I1_TP_UCTS2_AO"), + MTK_FUNCTION(5, "O_AUXIF_CLK1"), + MTK_FUNCTION(6, "O_PGD_DA_PWRGD_ENB"), + MTK_FUNCTION(7, "B0_DBG_MON_B26") + ), + + MTK_PIN( + 125, "GPIO125", + MTK_EINT_FUNCTION(0, 125), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO125"), + MTK_FUNCTION(1, "O_DMIC1_CLK"), + MTK_FUNCTION(2, "O_SPINOR_CK"), + MTK_FUNCTION(3, "B0_TDMIN_MCK"), + MTK_FUNCTION(6, "O_LVTS_FOUT"), + MTK_FUNCTION(7, "B0_DBG_MON_B27") + ), + + MTK_PIN( + 126, "GPIO126", + MTK_EINT_FUNCTION(0, 126), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO126"), + MTK_FUNCTION(1, "I0_DMIC1_DAT"), + MTK_FUNCTION(2, "O_SPINOR_CS"), + MTK_FUNCTION(3, "B0_TDMIN_BCK"), + MTK_FUNCTION(6, "O_LVTS_SDO"), + MTK_FUNCTION(7, "B0_DBG_MON_B28") + ), + + MTK_PIN( + 127, "GPIO127", + MTK_EINT_FUNCTION(0, 127), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO127"), + MTK_FUNCTION(1, "I0_DMIC1_DAT_R"), + MTK_FUNCTION(2, "B0_SPINOR_IO0"), + MTK_FUNCTION(3, "B0_TDMIN_LRCK"), + MTK_FUNCTION(6, "I0_LVTS_26M"), + MTK_FUNCTION(7, "B0_DBG_MON_B29") + ), + + MTK_PIN( + 128, "GPIO128", + MTK_EINT_FUNCTION(0, 128), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO128"), + MTK_FUNCTION(1, "O_DMIC2_CLK"), + MTK_FUNCTION(2, "B0_SPINOR_IO1"), + MTK_FUNCTION(3, "I0_TDMIN_DI"), + MTK_FUNCTION(6, "I0_LVTS_SCF"), + MTK_FUNCTION(7, "B0_DBG_MON_B30") + ), + + MTK_PIN( + 129, "GPIO129", + MTK_EINT_FUNCTION(0, 129), + 
DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO129"), + MTK_FUNCTION(1, "I0_DMIC2_DAT"), + MTK_FUNCTION(2, "B0_SPINOR_IO2"), + MTK_FUNCTION(3, "I0_SPDIF_IN1"), + MTK_FUNCTION(6, "I0_LVTS_SCK"), + MTK_FUNCTION(7, "B0_DBG_MON_B31") + ), + + MTK_PIN( + 130, "GPIO130", + MTK_EINT_FUNCTION(0, 130), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO130"), + MTK_FUNCTION(1, "I0_DMIC2_DAT_R"), + MTK_FUNCTION(2, "B0_SPINOR_IO3"), + MTK_FUNCTION(3, "I0_SPDIF_IN2"), + MTK_FUNCTION(6, "I0_LVTS_SDI"), + MTK_FUNCTION(7, "B0_DBG_MON_B32") + ), + + MTK_PIN( + 131, "GPIO131", + MTK_EINT_FUNCTION(0, 131), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO131"), + MTK_FUNCTION(1, "O_DPI_D0"), + MTK_FUNCTION(2, "O_GBE_TXD3"), + MTK_FUNCTION(3, "O_DMIC1_CLK"), + MTK_FUNCTION(4, "O_I2SO2_MCK"), + MTK_FUNCTION(5, "B0_TP_GPIO0_AO"), + MTK_FUNCTION(6, "O_SPIM5_CSB"), + MTK_FUNCTION(7, "O_PGD_LV_HSC_PWR0") + ), + + MTK_PIN( + 132, "GPIO132", + MTK_EINT_FUNCTION(0, 132), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO132"), + MTK_FUNCTION(1, "O_DPI_D1"), + MTK_FUNCTION(2, "O_GBE_TXD2"), + MTK_FUNCTION(3, "I0_DMIC1_DAT"), + MTK_FUNCTION(4, "B0_I2SO2_BCK"), + MTK_FUNCTION(5, "B0_TP_GPIO1_AO"), + MTK_FUNCTION(6, "O_SPIM5_CLK"), + MTK_FUNCTION(7, "O_PGD_LV_HSC_PWR1") + ), + + MTK_PIN( + 133, "GPIO133", + MTK_EINT_FUNCTION(0, 133), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO133"), + MTK_FUNCTION(1, "O_DPI_D2"), + MTK_FUNCTION(2, "O_GBE_TXD1"), + MTK_FUNCTION(3, "I0_DMIC1_DAT_R"), + MTK_FUNCTION(4, "B0_I2SO2_WS"), + MTK_FUNCTION(5, "B0_TP_GPIO2_AO"), + MTK_FUNCTION(6, "B0_SPIM5_MOSI"), + MTK_FUNCTION(7, "O_PGD_LV_HSC_PWR2") + ), + + MTK_PIN( + 134, "GPIO134", + MTK_EINT_FUNCTION(0, 134), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO134"), + MTK_FUNCTION(1, "O_DPI_D3"), + MTK_FUNCTION(2, "O_GBE_TXD0"), + MTK_FUNCTION(3, "O_DMIC2_CLK"), + MTK_FUNCTION(4, "O_I2SO2_D0"), + MTK_FUNCTION(5, "B0_TP_GPIO3_AO"), + MTK_FUNCTION(6, "B0_SPIM5_MISO"), + MTK_FUNCTION(7, "O_PGD_LV_HSC_PWR3") + ), + + MTK_PIN( + 135, "GPIO135", + MTK_EINT_FUNCTION(0, 135), + DRV_GRP4, + 
MTK_FUNCTION(0, "B_GPIO135"), + MTK_FUNCTION(1, "O_DPI_D4"), + MTK_FUNCTION(2, "I0_GBE_RXD3"), + MTK_FUNCTION(3, "I0_DMIC2_DAT"), + MTK_FUNCTION(4, "O_I2SO2_D1"), + MTK_FUNCTION(5, "B0_TP_GPIO4_AO"), + MTK_FUNCTION(6, "I1_WAKEN"), + MTK_FUNCTION(7, "O_PGD_LV_HSC_PWR4") + ), + + MTK_PIN( + 136, "GPIO136", + MTK_EINT_FUNCTION(0, 136), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO136"), + MTK_FUNCTION(1, "O_DPI_D5"), + MTK_FUNCTION(2, "I0_GBE_RXD2"), + MTK_FUNCTION(3, "I0_DMIC2_DAT_R"), + MTK_FUNCTION(4, "O_I2SO2_D2"), + MTK_FUNCTION(5, "B0_TP_GPIO5_AO"), + MTK_FUNCTION(6, "O_PERSTN"), + MTK_FUNCTION(7, "O_PGD_LV_HSC_PWR5") + ), + + MTK_PIN( + 137, "GPIO137", + MTK_EINT_FUNCTION(0, 137), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO137"), + MTK_FUNCTION(1, "O_DPI_D6"), + MTK_FUNCTION(2, "I0_GBE_RXD1"), + MTK_FUNCTION(3, "O_DMIC3_CLK"), + MTK_FUNCTION(4, "O_I2SO2_D3"), + MTK_FUNCTION(5, "B0_TP_GPIO6_AO"), + MTK_FUNCTION(6, "B1_CLKREQN"), + MTK_FUNCTION(7, "O_PWM_0") + ), + + MTK_PIN( + 138, "GPIO138", + MTK_EINT_FUNCTION(0, 138), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO138"), + MTK_FUNCTION(1, "O_DPI_D7"), + MTK_FUNCTION(2, "I0_GBE_RXD0"), + MTK_FUNCTION(3, "I0_DMIC3_DAT"), + MTK_FUNCTION(4, "O_CLKM2"), + MTK_FUNCTION(5, "B0_TP_GPIO7_AO"), + MTK_FUNCTION(7, "B0_MD32_0_GPIO0") + ), + + MTK_PIN( + 139, "GPIO139", + MTK_EINT_FUNCTION(0, 139), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO139"), + MTK_FUNCTION(1, "O_DPI_D8"), + MTK_FUNCTION(2, "B0_GBE_TXC"), + MTK_FUNCTION(3, "I0_DMIC3_DAT_R"), + MTK_FUNCTION(4, "O_CLKM3"), + MTK_FUNCTION(5, "O_TP_UTXD2_AO"), + MTK_FUNCTION(6, "O_UTXD2"), + MTK_FUNCTION(7, "B0_MD32_0_GPIO1") + ), + + MTK_PIN( + 140, "GPIO140", + MTK_EINT_FUNCTION(0, 140), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO140"), + MTK_FUNCTION(1, "O_DPI_D9"), + MTK_FUNCTION(2, "I0_GBE_RXC"), + MTK_FUNCTION(3, "O_DMIC4_CLK"), + MTK_FUNCTION(4, "O_PWM_2"), + MTK_FUNCTION(5, "I1_TP_URXD2_AO"), + MTK_FUNCTION(6, "I1_URXD2"), + MTK_FUNCTION(7, "B0_MD32_0_GPIO2") + ), + + MTK_PIN( + 141, "GPIO141", + 
MTK_EINT_FUNCTION(0, 141), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO141"), + MTK_FUNCTION(1, "O_DPI_D10"), + MTK_FUNCTION(2, "I0_GBE_RXDV"), + MTK_FUNCTION(3, "I0_DMIC4_DAT"), + MTK_FUNCTION(4, "O_PWM_3"), + MTK_FUNCTION(5, "O_TP_URTS2_AO"), + MTK_FUNCTION(6, "O_URTS2"), + MTK_FUNCTION(7, "B0_MD32_1_GPIO0") + ), + + MTK_PIN( + 142, "GPIO142", + MTK_EINT_FUNCTION(0, 142), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO142"), + MTK_FUNCTION(1, "O_DPI_D11"), + MTK_FUNCTION(2, "O_GBE_TXEN"), + MTK_FUNCTION(3, "I0_DMIC4_DAT_R"), + MTK_FUNCTION(4, "O_PWM_1"), + MTK_FUNCTION(5, "I1_TP_UCTS2_AO"), + MTK_FUNCTION(6, "I1_UCTS2"), + MTK_FUNCTION(7, "B0_MD32_1_GPIO1") + ), + + MTK_PIN( + 143, "GPIO143", + MTK_EINT_FUNCTION(0, 143), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO143"), + MTK_FUNCTION(1, "O_DPI_D12"), + MTK_FUNCTION(2, "O_GBE_MDC"), + MTK_FUNCTION(3, "B0_MD32_0_GPIO0"), + MTK_FUNCTION(4, "O_CLKM0"), + MTK_FUNCTION(5, "O_SPIM3_CSB"), + MTK_FUNCTION(6, "O_UTXD1"), + MTK_FUNCTION(7, "B0_MD32_1_GPIO2") + ), + + MTK_PIN( + 144, "GPIO144", + MTK_EINT_FUNCTION(0, 144), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO144"), + MTK_FUNCTION(1, "O_DPI_D13"), + MTK_FUNCTION(2, "B1_GBE_MDIO"), + MTK_FUNCTION(3, "B0_MD32_0_GPIO1"), + MTK_FUNCTION(4, "O_CLKM1"), + MTK_FUNCTION(5, "O_SPIM3_CLK"), + MTK_FUNCTION(6, "I1_URXD1"), + MTK_FUNCTION(7, "O_PGD_HV_HSC_PWR0") + ), + + MTK_PIN( + 145, "GPIO145", + MTK_EINT_FUNCTION(0, 145), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO145"), + MTK_FUNCTION(1, "O_DPI_D14"), + MTK_FUNCTION(2, "O_GBE_TXER"), + MTK_FUNCTION(3, "B0_MD32_1_GPIO0"), + MTK_FUNCTION(4, "O_CMFLASH0"), + MTK_FUNCTION(5, "B0_SPIM3_MOSI"), + MTK_FUNCTION(6, "B0_GBE_AUX_PPS2"), + MTK_FUNCTION(7, "O_PGD_HV_HSC_PWR1") + ), + + MTK_PIN( + 146, "GPIO146", + MTK_EINT_FUNCTION(0, 146), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO146"), + MTK_FUNCTION(1, "O_DPI_D15"), + MTK_FUNCTION(2, "I0_GBE_RXER"), + MTK_FUNCTION(3, "B0_MD32_1_GPIO1"), + MTK_FUNCTION(4, "O_CMFLASH1"), + MTK_FUNCTION(5, "B0_SPIM3_MISO"), + MTK_FUNCTION(6, 
"B0_GBE_AUX_PPS3"), + MTK_FUNCTION(7, "O_PGD_HV_HSC_PWR2") + ), + + MTK_PIN( + 147, "GPIO147", + MTK_EINT_FUNCTION(0, 147), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO147"), + MTK_FUNCTION(1, "O_DPI_HSYNC"), + MTK_FUNCTION(2, "I0_GBE_COL"), + MTK_FUNCTION(3, "O_I2SO1_MCK"), + MTK_FUNCTION(4, "O_CMVREF0"), + MTK_FUNCTION(5, "O_SPDIF_OUT"), + MTK_FUNCTION(6, "O_URTS1"), + MTK_FUNCTION(7, "O_PGD_HV_HSC_PWR3") + ), + + MTK_PIN( + 148, "GPIO148", + MTK_EINT_FUNCTION(0, 148), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO148"), + MTK_FUNCTION(1, "O_DPI_VSYNC"), + MTK_FUNCTION(2, "I0_GBE_INTR"), + MTK_FUNCTION(3, "O_I2SO1_BCK"), + MTK_FUNCTION(4, "O_CMVREF1"), + MTK_FUNCTION(5, "I0_SPDIF_IN0"), + MTK_FUNCTION(6, "I1_UCTS1"), + MTK_FUNCTION(7, "O_PGD_HV_HSC_PWR4") + ), + + MTK_PIN( + 149, "GPIO149", + MTK_EINT_FUNCTION(0, 149), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO149"), + MTK_FUNCTION(1, "O_DPI_DE"), + MTK_FUNCTION(2, "B0_GBE_AUX_PPS0"), + MTK_FUNCTION(3, "O_I2SO1_WS"), + MTK_FUNCTION(4, "O_CMVREF2"), + MTK_FUNCTION(5, "I0_SPDIF_IN1"), + MTK_FUNCTION(6, "O_UTXD3"), + MTK_FUNCTION(7, "O_PGD_HV_HSC_PWR5") + ), + + MTK_PIN( + 150, "GPIO150", + MTK_EINT_FUNCTION(0, 150), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO150"), + MTK_FUNCTION(1, "O_DPI_CK"), + MTK_FUNCTION(2, "B0_GBE_AUX_PPS1"), + MTK_FUNCTION(3, "O_I2SO1_D0"), + MTK_FUNCTION(4, "O_CMVREF3"), + MTK_FUNCTION(5, "I0_SPDIF_IN2"), + MTK_FUNCTION(6, "I1_URXD3") + ), + + MTK_PIN( + 151, "GPIO151", + MTK_EINT_FUNCTION(0, 151), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO151"), + MTK_FUNCTION(1, "B1_MSDC0_DAT7") + ), + + MTK_PIN( + 152, "GPIO152", + MTK_EINT_FUNCTION(0, 152), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO152"), + MTK_FUNCTION(1, "B1_MSDC0_DAT6") + ), + + MTK_PIN( + 153, "GPIO153", + MTK_EINT_FUNCTION(0, 153), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO153"), + MTK_FUNCTION(1, "B1_MSDC0_DAT5") + ), + + MTK_PIN( + 154, "GPIO154", + MTK_EINT_FUNCTION(0, 154), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO154"), + MTK_FUNCTION(1, "B1_MSDC0_DAT4") + ), + + 
MTK_PIN( + 155, "GPIO155", + MTK_EINT_FUNCTION(0, 155), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO155"), + MTK_FUNCTION(1, "O_MSDC0_RSTB") + ), + + MTK_PIN( + 156, "GPIO156", + MTK_EINT_FUNCTION(0, 156), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO156"), + MTK_FUNCTION(1, "B1_MSDC0_CMD") + ), + + MTK_PIN( + 157, "GPIO157", + MTK_EINT_FUNCTION(0, 157), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO157"), + MTK_FUNCTION(1, "B1_MSDC0_CLK") + ), + + MTK_PIN( + 158, "GPIO158", + MTK_EINT_FUNCTION(0, 158), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO158"), + MTK_FUNCTION(1, "B1_MSDC0_DAT3") + ), + + MTK_PIN( + 159, "GPIO159", + MTK_EINT_FUNCTION(0, 159), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO159"), + MTK_FUNCTION(1, "B1_MSDC0_DAT2") + ), + + MTK_PIN( + 160, "GPIO160", + MTK_EINT_FUNCTION(0, 160), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO160"), + MTK_FUNCTION(1, "B1_MSDC0_DAT1") + ), + + MTK_PIN( + 161, "GPIO161", + MTK_EINT_FUNCTION(0, 161), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO161"), + MTK_FUNCTION(1, "B1_MSDC0_DAT0") + ), + + MTK_PIN( + 162, "GPIO162", + MTK_EINT_FUNCTION(0, 162), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO162"), + MTK_FUNCTION(1, "B0_MSDC0_DSL") + ), + + MTK_PIN( + 163, "GPIO163", + MTK_EINT_FUNCTION(0, 163), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO163"), + MTK_FUNCTION(1, "B1_MSDC1_CMD"), + MTK_FUNCTION(2, "O_SPDIF_OUT"), + MTK_FUNCTION(3, "I1_MD32_0_JTAG_TMS"), + MTK_FUNCTION(4, "I1_ADSP_JTAG0_TMS"), + MTK_FUNCTION(5, "I1_SCP_JTAG0_TMS"), + MTK_FUNCTION(6, "I1_CCU0_JTAG_TMS"), + MTK_FUNCTION(7, "I0_IPU_JTAG_TMS") + ), + + MTK_PIN( + 164, "GPIO164", + MTK_EINT_FUNCTION(0, 164), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO164"), + MTK_FUNCTION(1, "B1_MSDC1_CLK"), + MTK_FUNCTION(2, "I0_SPDIF_IN0"), + MTK_FUNCTION(3, "I1_MD32_0_JTAG_TCK"), + MTK_FUNCTION(4, "I0_ADSP_JTAG0_TCK"), + MTK_FUNCTION(5, "I1_SCP_JTAG0_TCK"), + MTK_FUNCTION(6, "I1_CCU0_JTAG_TCK"), + MTK_FUNCTION(7, "I0_IPU_JTAG_TCK") + ), + + MTK_PIN( + 165, "GPIO165", + MTK_EINT_FUNCTION(0, 165), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO165"), + 
MTK_FUNCTION(1, "B1_MSDC1_DAT0"), + MTK_FUNCTION(2, "I0_SPDIF_IN1"), + MTK_FUNCTION(3, "I1_MD32_0_JTAG_TDI"), + MTK_FUNCTION(4, "I1_ADSP_JTAG0_TDI"), + MTK_FUNCTION(5, "I1_SCP_JTAG0_TDI"), + MTK_FUNCTION(6, "I1_CCU0_JTAG_TDI"), + MTK_FUNCTION(7, "I0_IPU_JTAG_TDI") + ), + + MTK_PIN( + 166, "GPIO166", + MTK_EINT_FUNCTION(0, 166), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO166"), + MTK_FUNCTION(1, "B1_MSDC1_DAT1"), + MTK_FUNCTION(2, "I0_SPDIF_IN2"), + MTK_FUNCTION(3, "O_MD32_0_JTAG_TDO"), + MTK_FUNCTION(4, "O_ADSP_JTAG0_TDO"), + MTK_FUNCTION(5, "O_SCP_JTAG0_TDO"), + MTK_FUNCTION(6, "O_CCU0_JTAG_TDO"), + MTK_FUNCTION(7, "O_IPU_JTAG_TDO") + ), + + MTK_PIN( + 167, "GPIO167", + MTK_EINT_FUNCTION(0, 167), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO167"), + MTK_FUNCTION(1, "B1_MSDC1_DAT2"), + MTK_FUNCTION(2, "O_PWM_0"), + MTK_FUNCTION(3, "I1_MD32_0_JTAG_TRST"), + MTK_FUNCTION(4, "I1_ADSP_JTAG0_TRSTN"), + MTK_FUNCTION(5, "I0_SCP_JTAG0_TRSTN"), + MTK_FUNCTION(6, "I1_CCU0_JTAG_TRST"), + MTK_FUNCTION(7, "I0_IPU_JTAG_TRST") + ), + + MTK_PIN( + 168, "GPIO168", + MTK_EINT_FUNCTION(0, 168), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO168"), + MTK_FUNCTION(1, "B1_MSDC1_DAT3"), + MTK_FUNCTION(2, "O_PWM_1"), + MTK_FUNCTION(3, "O_CLKM0") + ), + + MTK_PIN( + 169, "GPIO169", + MTK_EINT_FUNCTION(0, 169), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO169"), + MTK_FUNCTION(1, "B1_MSDC2_CMD"), + MTK_FUNCTION(2, "O_LVTS_FOUT"), + MTK_FUNCTION(3, "I1_MD32_1_JTAG_TMS"), + MTK_FUNCTION(4, "I0_UDI_TMS"), + MTK_FUNCTION(5, "I0_VPU_UDI_TMS"), + MTK_FUNCTION(6, "B0_TDMIN_MCK"), + MTK_FUNCTION(7, "I1_SSPM_JTAG_TMS") + ), + + MTK_PIN( + 170, "GPIO170", + MTK_EINT_FUNCTION(0, 170), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO170"), + MTK_FUNCTION(1, "B1_MSDC2_CLK"), + MTK_FUNCTION(2, "O_LVTS_SDO"), + MTK_FUNCTION(3, "I1_MD32_1_JTAG_TCK"), + MTK_FUNCTION(4, "I0_UDI_TCK"), + MTK_FUNCTION(5, "I0_VPU_UDI_TCK"), + MTK_FUNCTION(6, "B0_TDMIN_BCK"), + MTK_FUNCTION(7, "I1_SSPM_JTAG_TCK") + ), + + MTK_PIN( + 171, "GPIO171", + MTK_EINT_FUNCTION(0, 
171), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO171"), + MTK_FUNCTION(1, "B1_MSDC2_DAT0"), + MTK_FUNCTION(2, "I0_LVTS_26M"), + MTK_FUNCTION(3, "I1_MD32_1_JTAG_TDI"), + MTK_FUNCTION(4, "I0_UDI_TDI"), + MTK_FUNCTION(5, "I0_VPU_UDI_TDI"), + MTK_FUNCTION(6, "B0_TDMIN_LRCK"), + MTK_FUNCTION(7, "I1_SSPM_JTAG_TDI") + ), + + MTK_PIN( + 172, "GPIO172", + MTK_EINT_FUNCTION(0, 172), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO172"), + MTK_FUNCTION(1, "B1_MSDC2_DAT1"), + MTK_FUNCTION(2, "I0_LVTS_SCF"), + MTK_FUNCTION(3, "O_MD32_1_JTAG_TDO"), + MTK_FUNCTION(4, "O_UDI_TDO"), + MTK_FUNCTION(5, "O_VPU_UDI_TDO"), + MTK_FUNCTION(6, "I0_TDMIN_DI"), + MTK_FUNCTION(7, "O_SSPM_JTAG_TDO") + ), + + MTK_PIN( + 173, "GPIO173", + MTK_EINT_FUNCTION(0, 173), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO173"), + MTK_FUNCTION(1, "B1_MSDC2_DAT2"), + MTK_FUNCTION(2, "I0_LVTS_SCK"), + MTK_FUNCTION(3, "I1_MD32_1_JTAG_TRST"), + MTK_FUNCTION(4, "I0_UDI_NTRST"), + MTK_FUNCTION(5, "I0_VPU_UDI_NTRST"), + MTK_FUNCTION(7, "I0_SSPM_JTAG_TRSTN") + ), + + MTK_PIN( + 174, "GPIO174", + MTK_EINT_FUNCTION(0, 174), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO174"), + MTK_FUNCTION(1, "B1_MSDC2_DAT3"), + MTK_FUNCTION(2, "I0_LVTS_SDI") + ), + + MTK_PIN( + 175, "GPIO175", + MTK_EINT_FUNCTION(0, 175), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO175"), + MTK_FUNCTION(1, "B0_SPMI_M_SCL") + ), + + MTK_PIN( + 176, "GPIO176", + MTK_EINT_FUNCTION(0, 176), + DRV_GRP4, + MTK_FUNCTION(0, "B_GPIO176"), + MTK_FUNCTION(1, "B0_SPMI_M_SDA") + ), + + MTK_PIN( + 177, "GPIO177", + MTK_EINT_FUNCTION(0, 212), + DRV_FIXED, + MTK_FUNCTION(0, NULL) + ), + + MTK_PIN( + 178, "GPIO178", + MTK_EINT_FUNCTION(0, 213), + DRV_FIXED, + MTK_FUNCTION(0, NULL) + ), + + MTK_PIN( + 179, "GPIO179", + MTK_EINT_FUNCTION(0, 214), + DRV_FIXED, + MTK_FUNCTION(0, NULL) + ), + + MTK_PIN( + 180, "GPIO180", + MTK_EINT_FUNCTION(0, 215), + DRV_FIXED, + MTK_FUNCTION(0, NULL) + ), + + MTK_PIN( + 181, "GPIO181", + MTK_EINT_FUNCTION(0, 216), + DRV_FIXED, + MTK_FUNCTION(0, NULL) + ), + + MTK_PIN( + 182, 
"GPIO182", + MTK_EINT_FUNCTION(0, 217), + DRV_FIXED, + MTK_FUNCTION(0, NULL) + ), + + MTK_PIN( + 183, "GPIO183", + MTK_EINT_FUNCTION(0, 218), + DRV_FIXED, + MTK_FUNCTION(0, NULL) + ), + + MTK_PIN( + 184, "GPIO184", + MTK_EINT_FUNCTION(0, 219), + DRV_FIXED, + MTK_FUNCTION(0, NULL) + ), + + MTK_PIN( + 185, "GPIO185", + MTK_EINT_FUNCTION(0, 220), + DRV_FIXED, + MTK_FUNCTION(0, NULL) + ), + + MTK_PIN( + 186, "GPIO186", + MTK_EINT_FUNCTION(0, 221), + DRV_FIXED, + MTK_FUNCTION(0, NULL) + ), + + MTK_PIN( + 187, "GPIO187", + MTK_EINT_FUNCTION(0, 222), + DRV_FIXED, + MTK_FUNCTION(0, NULL) + ), + + MTK_PIN( + 188, "GPIO188", + MTK_EINT_FUNCTION(0, 223), + DRV_FIXED, + MTK_FUNCTION(0, NULL) + ), + + MTK_PIN( + 189, "GPIO189", + MTK_EINT_FUNCTION(0, 224), + DRV_FIXED, + MTK_FUNCTION(0, NULL) + ) +}; + +#endif /* __PINCTRL__MTK_MT8188_H */ -- GitLab From 0684bc79cd52edca88e430b177f06d980aed5779 Mon Sep 17 00:00:00 2001 From: Allen-KH Cheng Date: Fri, 19 Aug 2022 20:06:49 +0800 Subject: [PATCH 0070/2223] dt-bindings: pinctrl: mt8186: Fix 'reg-names' for pinctrl nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mt8186 contains 8 GPIO physical address bases that correspond to the 'reg-names' of the pinctrl driver. The 'reg-names' entries in bindings are ordered incorrectly, though. The system crashes due of an erroneous address when the regulator initializes. We fix the 'reg-names' for the pinctrl nodes and the pinctrl-mt8186 example in bindings. Fixes: 338e953f1bd1 ("dt-bindings: pinctrl: mt8186: add pinctrl file and binding document") Co-developed-by: Guodong Liu Signed-off-by: Guodong Liu Signed-off-by: Allen-KH Cheng Acked-by: Krzysztof Kozlowski Reviewed-by: Nícolas F. R. A. 
Prado Link: https://lore.kernel.org/r/20220819120649.21523-1-allen-kh.cheng@mediatek.com Signed-off-by: Linus Walleij --- .../bindings/pinctrl/pinctrl-mt8186.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-mt8186.yaml b/Documentation/devicetree/bindings/pinctrl/pinctrl-mt8186.yaml index 1eeb885ce0c6b..26573a793b576 100644 --- a/Documentation/devicetree/bindings/pinctrl/pinctrl-mt8186.yaml +++ b/Documentation/devicetree/bindings/pinctrl/pinctrl-mt8186.yaml @@ -41,12 +41,12 @@ properties: Gpio base register names. items: - const: iocfg0 - - const: iocfg_bm - - const: iocfg_bl - - const: iocfg_br + - const: iocfg_lt - const: iocfg_lm + - const: iocfg_lb + - const: iocfg_bl - const: iocfg_rb - - const: iocfg_tl + - const: iocfg_rt - const: eint interrupt-controller: true @@ -235,9 +235,9 @@ examples: <0x10002A00 0x0200>, <0x10002c00 0x0200>, <0x1000b000 0x1000>; - reg-names = "iocfg0", "iocfg_bm", "iocfg_bl", - "iocfg_br", "iocfg_lm", "iocfg_rb", - "iocfg_tl", "eint"; + reg-names = "iocfg0", "iocfg_lt", "iocfg_lm", + "iocfg_lb", "iocfg_bl", "iocfg_rb", + "iocfg_rt", "eint"; gpio-controller; #gpio-cells = <2>; gpio-ranges = <&pio 0 0 185>; -- GitLab From c412a97cf6c5253fcf4ae5545be5775b2417d61b Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 22 Aug 2022 11:30:12 -0500 Subject: [PATCH 0071/2223] gfs2: Use TRY lock in gfs2_inode_lookup for UNLINKED inodes Before this patch, delete_work_func() would check for the GLF_DEMOTE flag on the iopen glock and if set, it would perform special processing. However, there was a race whereby the GLF_DEMOTE flag could be set by another process after the check. Then when it called gfs2_lookup_by_inum() which calls gfs2_inode_lookup(), it tried to lock the iopen glock in SH mode, but the GLF_DEMOTE flag prevented the request from being granted. 
But the iopen glock could never be demoted because that happens when the inode is evicted, and the evict was never completed because of the failed lookup. To fix that, change function gfs2_inode_lookup() so that when GFS2_BLKST_UNLINKED inodes are searched, it uses the LM_FLAG_TRY flag for the iopen glock. If the locking request fails, fail gfs2_inode_lookup() with -EAGAIN so that delete_work_func() can retry the operation later. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 8 +++++--- fs/gfs2/inode.c | 10 ++++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 41b6c89e4bf7d..57dcfd05b3620 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1018,16 +1018,18 @@ static void delete_work_func(struct work_struct *work) if (gfs2_queue_delete_work(gl, 5 * HZ)) return; } - goto out; } inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, GFS2_BLKST_UNLINKED); - if (!IS_ERR_OR_NULL(inode)) { + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -EAGAIN && + (gfs2_queue_delete_work(gl, 5 * HZ))) + return; + } else { d_prune_aliases(inode); iput(inode); } -out: gfs2_glock_put(gl); } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index c8ec876f33ea3..56ded979988ca 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -130,6 +130,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_glock *io_gl; + int extra_flags = 0; error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); @@ -141,9 +142,12 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (unlikely(error)) goto fail; - if (blktype != GFS2_BLKST_UNLINKED) + if (blktype == GFS2_BLKST_UNLINKED) + extra_flags |= LM_FLAG_TRY; + else gfs2_cancel_delete_work(io_gl); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, + 
GL_EXACT | extra_flags, &ip->i_iopen_gh); gfs2_glock_put(io_gl); if (unlikely(error)) @@ -210,6 +214,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, return inode; fail: + if (error == GLR_TRYFAILED) + error = -EAGAIN; if (gfs2_holder_initialized(&ip->i_iopen_gh)) gfs2_glock_dq_uninit(&ip->i_iopen_gh); if (gfs2_holder_initialized(&i_gh)) -- GitLab From 04133b607a78f2fd3daadbe5519513942b0f3a05 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 18 Aug 2022 13:32:36 -0500 Subject: [PATCH 0072/2223] gfs2: Prevent double iput for journal on error When a gfs2 file system is withdrawn it does iput on its journal to allow recovery from another cluster node. If it's unable to get a replacement inode for whatever reason, the journal descriptor would still be pointing at the evicted inode. So when unmount clears out the list of journals, it would do a second iput referencing the pointer. To avoid this, set the inode pointer to NULL. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/util.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 8241029a2a5d2..95c79a3ec1612 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -204,6 +204,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) * exception code in glock_dq. */ iput(inode); + sdp->sd_jdesc->jd_inode = NULL; /* * Wait until the journal inode's glock is freed. This allows try locks * on other nodes to be successful, otherwise we remain the owner of -- GitLab From 053640a73838400dca23087d66a9c0db579adafb Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 18 Aug 2022 13:32:37 -0500 Subject: [PATCH 0073/2223] gfs2: Dequeue waiters when withdrawn When a withdraw occurs, ordinary (not system) glocks may not be granted anymore. Later, when the file system is unmounted, gfs2_gl_hash_clear() tries to clear out all the glocks, but these un-grantable pending waiters prevent some glocks from being freed. 
So the unmount hangs, at least for its ten-minute timeout period. This patch takes measures to remove any pending waiters from the glocks that will never be granted. This allows the unmount to proceed in a reasonable period of time. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 14 ++++++++++++++ fs/gfs2/glock.h | 1 + fs/gfs2/util.c | 5 +++++ 3 files changed, 20 insertions(+) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 57dcfd05b3620..858616afcae66 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -2196,6 +2196,20 @@ static void dump_glock_func(struct gfs2_glock *gl) dump_glock(NULL, gl, true); } +static void withdraw_dq(struct gfs2_glock *gl) +{ + spin_lock(&gl->gl_lockref.lock); + if (!__lockref_is_dead(&gl->gl_lockref) && + glock_blocked_by_withdraw(gl)) + do_error(gl, LM_OUT_ERROR); /* remove pending waiters */ + spin_unlock(&gl->gl_lockref.lock); +} + +void gfs2_gl_dq_holders(struct gfs2_sbd *sdp) +{ + glock_hash_walk(withdraw_dq, sdp); +} + /** * gfs2_gl_hash_clear - Empty out the glock hash table * @sdp: the filesystem diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 5aed8b500cf5a..0199a3dcb1140 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -274,6 +274,7 @@ extern void gfs2_cancel_delete_work(struct gfs2_glock *gl); extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl); extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp); extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); +extern void gfs2_gl_dq_holders(struct gfs2_sbd *sdp); extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl); extern void gfs2_glock_free(struct gfs2_glock *gl); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 95c79a3ec1612..88185a3415040 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -164,6 +164,11 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) } if (!ret) gfs2_make_fs_ro(sdp); + /* + * Dequeue any pending non-system glock holders that can 
no + * longer be granted because the file system is withdrawn. + */ + gfs2_gl_dq_holders(sdp); gfs2_freeze_unlock(&freeze_gh); } -- GitLab From 86934198eefa10a71f35162b06c44c36d85b98ba Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 18 Aug 2022 13:32:38 -0500 Subject: [PATCH 0074/2223] gfs2: Clear flags when withdraw prevents xmote There are a couple places in function do_xmote where normal processing is circumvented due to withdraws in progress. However, since we bypass most of do_xmote() we bypass telling dlm to lock the dlm lock, which means dlm will never respond with a completion callback. Since the completion callback ordinarily clears GLF_LOCK, this patch changes function do_xmote to handle those situations more gracefully so the file system may be unmounted after withdraw. A very similar situation happens with the GLF_DEMOTE_IN_PROGRESS flag, which is cleared by function finish_xmote(). Since the withdraw causes us to skip the majority of do_xmote, it therefore also skips the call to finish_xmote() so the DEMOTE_IN_PROGRESS flag needs to be cleared manually. 
Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 858616afcae66..dca2cbf0338c8 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -59,6 +59,8 @@ typedef void (*glock_examiner) (struct gfs2_glock * gl); static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); static void __gfs2_glock_dq(struct gfs2_holder *gh); +static void handle_callback(struct gfs2_glock *gl, unsigned int state, + unsigned long delay, bool remote); static struct dentry *gfs2_root; static struct workqueue_struct *glock_workqueue; @@ -730,7 +732,8 @@ static bool is_system_glock(struct gfs2_glock *gl) * */ -static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, + unsigned int target) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { @@ -741,7 +744,8 @@ __acquires(&gl->gl_lockref.lock) if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) && gh && !(gh->gh_flags & LM_FLAG_NOEXP)) - return; + goto skip_inval; + lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | LM_FLAG_PRIORITY); GLOCK_BUG_ON(gl, gl->gl_state == target); @@ -826,6 +830,20 @@ skip_inval: (target != LM_ST_UNLOCKED || test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags))) { if (!is_system_glock(gl)) { + handle_callback(gl, LM_ST_UNLOCKED, 0, false); /* sets demote */ + /* + * Ordinarily, we would call dlm and its callback would call + * finish_xmote, which would call state_change() to the new state. + * Since we withdrew, we won't call dlm, so call state_change + * manually, but to the UNLOCKED state we desire. + */ + state_change(gl, LM_ST_UNLOCKED); + /* + * We skip telling dlm to do the locking, so we won't get a + * reply that would otherwise clear GLF_LOCK. So we clear it here. 
+ */ + clear_bit(GLF_LOCK, &gl->gl_flags); + clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD); goto out; } else { -- GitLab From 0e1fa5155a364de7d3de770eb382980933376699 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Sat, 6 Aug 2022 18:53:01 +1000 Subject: [PATCH 0075/2223] MAINTAINERS: Add Mahesh J Salgaonkar as EEH maintainer Update EEH entry: - Russell: lacks time to maintain EEH. - Oliver: lacks time & hardware to do actual maintenance, but happy to field questions and review things. - Mahesh: glad to take over EEH maintenance. [bhelgaas: commit log, add Mahesh, make Oliver reviewer] Link: https://lore.kernel.org/r/20220806085301.25142-1-ruscur@russell.cc Signed-off-by: Russell Currey Signed-off-by: Bjorn Helgaas Acked-by: Michael Ellerman --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index f60dfac7661c4..51def5ac94624 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15696,8 +15696,8 @@ F: drivers/pci/endpoint/ F: tools/pci/ PCI ENHANCED ERROR HANDLING (EEH) FOR POWERPC -M: Russell Currey -M: Oliver O'Halloran +M: Mahesh J Salgaonkar +R: Oliver O'Halloran L: linuxppc-dev@lists.ozlabs.org S: Supported F: Documentation/PCI/pci-error-recovery.rst -- GitLab From bbe2a5d87602ce0ac206e9f41fca9bd76d75da11 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Fri, 26 Aug 2022 15:26:50 +1000 Subject: [PATCH 0076/2223] pinctrl: fixup for "i2c: Make remove callback return void" Fix up the build. 
Signed-off-by: Stephen Rothwell Link: https://lore.kernel.org/r/20220826152650.2c55e482@canb.auug.org.au Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index a29df0920f4f4..05791212822e5 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -1352,14 +1352,12 @@ err_exit: return ret; } -static int cy8c95x0_remove(struct i2c_client *client) +static void cy8c95x0_remove(struct i2c_client *client) { struct cy8c95x0_pinctrl *chip = i2c_get_clientdata(client); if (!IS_ERR_OR_NULL(chip->regulator)) regulator_disable(chip->regulator); - - return 0; } static struct i2c_driver cy8c95x0_driver = { -- GitLab From 76e55d938c5bfd2b28ee868fe071181cce5353ad Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 23 Aug 2022 18:07:52 -0500 Subject: [PATCH 0077/2223] pinctrl: amd: Pick some different unicode symbols Feedback from Kent had shown some better selections for symbols to use for pinctrl-amd debugfs output. Adopt some of those instead. 
Fixes: e8129a076a50 ("pinctrl: amd: Use unicode for debugfs output") Suggested-by: Kent Gibson Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20220823230753.14799-1-mario.limonciello@amd.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-amd.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 4691a33bc374f..fda41907c4f17 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -246,7 +246,7 @@ static void amd_gpio_dbg_show(struct seq_file *s, struct gpio_chip *gc) } seq_printf(s, "GPIO bank%d\n", bank); for (; i < pin_num; i++) { - seq_printf(s, "📌%d\t", i); + seq_printf(s, "#%d\t", i); raw_spin_lock_irqsave(&gpio_dev->lock, flags); pin_reg = readl(gpio_dev->base + i * 4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); @@ -278,32 +278,32 @@ static void amd_gpio_dbg_show(struct seq_file *s, struct gpio_chip *gc) } if (pin_reg & BIT(INTERRUPT_MASK_OFF)) - interrupt_mask = "-"; + interrupt_mask = "😛"; else - interrupt_mask = "+"; - seq_printf(s, "int %s (🎭 %s)| active-%s| %s-🔫| ", + interrupt_mask = "😷"; + seq_printf(s, "int %s (%s)| active-%s| %s-⚡| ", interrupt_enable, interrupt_mask, active_level, level_trig); if (pin_reg & BIT(WAKE_CNTRL_OFF_S0I3)) - wake_cntrl0 = "+"; + wake_cntrl0 = "⏰"; else - wake_cntrl0 = "∅"; - seq_printf(s, "S0i3 🌅 %s| ", wake_cntrl0); + wake_cntrl0 = " ∅"; + seq_printf(s, "S0i3 %s| ", wake_cntrl0); if (pin_reg & BIT(WAKE_CNTRL_OFF_S3)) - wake_cntrl1 = "+"; + wake_cntrl1 = "⏰"; else - wake_cntrl1 = "∅"; - seq_printf(s, "S3 🌅 %s| ", wake_cntrl1); + wake_cntrl1 = " ∅"; + seq_printf(s, "S3 %s| ", wake_cntrl1); if (pin_reg & BIT(WAKE_CNTRL_OFF_S4)) - wake_cntrl2 = "+"; + wake_cntrl2 = "⏰"; else - wake_cntrl2 = "∅"; - seq_printf(s, "S4/S5 🌅 %s| ", wake_cntrl2); + wake_cntrl2 = " ∅"; + seq_printf(s, "S4/S5 %s| ", wake_cntrl2); if (pin_reg & BIT(PULL_UP_ENABLE_OFF)) { 
pull_up_enable = "+"; @@ -367,7 +367,7 @@ static void amd_gpio_dbg_show(struct seq_file *s, struct gpio_chip *gc) debounce_enable = " ∅"; } snprintf(debounce_value, sizeof(debounce_value), "%u", time * unit); - seq_printf(s, "debounce %s (⏰ %sus)| ", debounce_enable, debounce_value); + seq_printf(s, "debounce %s (🕑 %sus)| ", debounce_enable, debounce_value); seq_printf(s, " 0x%x\n", pin_reg); } } -- GitLab From 204c0300c4e99707e9fb6e57840aa1127060e63f Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 26 Aug 2022 15:12:17 +0200 Subject: [PATCH 0078/2223] gfs2: Switch from strlcpy to strscpy Switch from strlcpy to strscpy and make sure that @count is the size of the smaller of the source and destination buffers. This prevents reading beyond the end of the source buffer when the source string isn't null terminated. Found by a modified version of syzkaller. Suggested-by: Wolfram Sang Signed-off-by: Andreas Gruenbacher --- fs/gfs2/ops_fstype.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 549879929c847..236b59ef93b68 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -381,8 +381,10 @@ static int init_names(struct gfs2_sbd *sdp, int silent) if (!table[0]) table = sdp->sd_vfs->s_id; - strlcpy(sdp->sd_proto_name, proto, GFS2_FSNAME_LEN); - strlcpy(sdp->sd_table_name, table, GFS2_FSNAME_LEN); + BUILD_BUG_ON(GFS2_LOCKNAME_LEN > GFS2_FSNAME_LEN); + + strscpy(sdp->sd_proto_name, proto, GFS2_LOCKNAME_LEN); + strscpy(sdp->sd_table_name, table, GFS2_LOCKNAME_LEN); table = sdp->sd_table_name; while ((table = strchr(table, '/'))) @@ -1439,13 +1441,13 @@ static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param) switch (o) { case Opt_lockproto: - strlcpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN); + strscpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN); break; case Opt_locktable: - strlcpy(args->ar_locktable, param->string, 
GFS2_LOCKNAME_LEN); + strscpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN); break; case Opt_hostdata: - strlcpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN); + strscpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN); break; case Opt_spectator: args->ar_spectator = 1; -- GitLab From 9194e0f88a74d98f98b33183e6dda87c3753dd71 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 23 Aug 2022 09:56:37 -0500 Subject: [PATCH 0079/2223] dt-bindings: pinctrl: Add missing (unevaluated|additional)Properties on child nodes In order to ensure only documented properties are present, node schemas must have unevaluatedProperties or additionalProperties set to false (typically). Signed-off-by: Rob Herring Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220823145649.3118479-6-robh@kernel.org Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/brcm,bcm4908-pinctrl.yaml | 1 + .../devicetree/bindings/pinctrl/intel,pinctrl-keembay.yaml | 1 + .../devicetree/bindings/pinctrl/intel,pinctrl-thunderbay.yaml | 1 + .../devicetree/bindings/pinctrl/marvell,ac5-pinctrl.yaml | 1 + .../devicetree/bindings/pinctrl/mediatek,mt6779-pinctrl.yaml | 2 ++ .../devicetree/bindings/pinctrl/nuvoton,wpcm450-pinctrl.yaml | 1 + .../devicetree/bindings/pinctrl/renesas,rza1-ports.yaml | 1 + Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml | 3 +++ .../devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml | 3 +++ .../devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml | 1 + 10 files changed, 15 insertions(+) diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,bcm4908-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/brcm,bcm4908-pinctrl.yaml index 175a992f15e16..8a9fb9b433ca1 100644 --- a/Documentation/devicetree/bindings/pinctrl/brcm,bcm4908-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/brcm,bcm4908-pinctrl.yaml @@ -23,6 +23,7 @@ patternProperties: '-pins$': type: object $ref: pinmux-node.yaml# + additionalProperties: 
false properties: function: diff --git a/Documentation/devicetree/bindings/pinctrl/intel,pinctrl-keembay.yaml b/Documentation/devicetree/bindings/pinctrl/intel,pinctrl-keembay.yaml index 5e99d79499b49..846651ff77c91 100644 --- a/Documentation/devicetree/bindings/pinctrl/intel,pinctrl-keembay.yaml +++ b/Documentation/devicetree/bindings/pinctrl/intel,pinctrl-keembay.yaml @@ -44,6 +44,7 @@ properties: patternProperties: '^gpio@[0-9a-f]*$': type: object + additionalProperties: false description: Child nodes can be specified to contain pin configuration information, diff --git a/Documentation/devicetree/bindings/pinctrl/intel,pinctrl-thunderbay.yaml b/Documentation/devicetree/bindings/pinctrl/intel,pinctrl-thunderbay.yaml index 0ec476248f216..6f30b5337ca25 100644 --- a/Documentation/devicetree/bindings/pinctrl/intel,pinctrl-thunderbay.yaml +++ b/Documentation/devicetree/bindings/pinctrl/intel,pinctrl-thunderbay.yaml @@ -42,6 +42,7 @@ properties: patternProperties: '^gpio@[0-9a-f]*$': type: object + additionalProperties: false description: Child nodes can be specified to contain pin configuration information, diff --git a/Documentation/devicetree/bindings/pinctrl/marvell,ac5-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/marvell,ac5-pinctrl.yaml index a651b2744caf3..491f67e7cc4fa 100644 --- a/Documentation/devicetree/bindings/pinctrl/marvell,ac5-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/marvell,ac5-pinctrl.yaml @@ -24,6 +24,7 @@ patternProperties: '-pins$': type: object $ref: pinmux-node.yaml# + additionalProperties: false properties: marvell,function: diff --git a/Documentation/devicetree/bindings/pinctrl/mediatek,mt6779-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/mediatek,mt6779-pinctrl.yaml index e7601c0f5a695..840f649e36ce3 100644 --- a/Documentation/devicetree/bindings/pinctrl/mediatek,mt6779-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/mediatek,mt6779-pinctrl.yaml @@ -76,6 +76,8 @@ required: 
patternProperties: '-[0-9]*$': type: object + additionalProperties: false + patternProperties: '-pins*$': type: object diff --git a/Documentation/devicetree/bindings/pinctrl/nuvoton,wpcm450-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/nuvoton,wpcm450-pinctrl.yaml index 7a11beb8f222d..7b7f840ffc4cf 100644 --- a/Documentation/devicetree/bindings/pinctrl/nuvoton,wpcm450-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/nuvoton,wpcm450-pinctrl.yaml @@ -30,6 +30,7 @@ patternProperties: "^gpio@[0-7]$": type: object + additionalProperties: false description: Eight GPIO banks (gpio@0 to gpio@7), that each contain between 14 and 18 diff --git a/Documentation/devicetree/bindings/pinctrl/renesas,rza1-ports.yaml b/Documentation/devicetree/bindings/pinctrl/renesas,rza1-ports.yaml index 8ed4b98a16289..9083040c996ab 100644 --- a/Documentation/devicetree/bindings/pinctrl/renesas,rza1-ports.yaml +++ b/Documentation/devicetree/bindings/pinctrl/renesas,rza1-ports.yaml @@ -41,6 +41,7 @@ required: patternProperties: "^gpio-[0-9]*$": type: object + additionalProperties: false description: Each port of the r7s72100 pin controller hardware is itself a GPIO diff --git a/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml index 3a65c66ca71d2..d006a940c7c64 100644 --- a/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml @@ -97,6 +97,9 @@ patternProperties: additionalProperties: false "^(initial|sleep)-state$": + type: object + additionalProperties: false + patternProperties: "^(pin-[a-z0-9-]+|[a-z0-9-]+-pin)$": $ref: samsung,pinctrl-pins-cfg.yaml diff --git a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml index d35dcc4f02421..53c952d93ea28 100644 --- a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml +++ 
b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml @@ -115,9 +115,12 @@ patternProperties: '-[0-9]*$': type: object + additionalProperties: false + patternProperties: '^pins': type: object + additionalProperties: false description: | A pinctrl node should contain at least one subnode representing the pinctrl group available on the machine. Each subnode will list the diff --git a/Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml index 306524885a2b8..98b4663f9766c 100644 --- a/Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml @@ -36,6 +36,7 @@ patternProperties: pins it needs, and how they should be configured, with regard to muxer configuration, pullups, drive strength. $ref: "pinmux-node.yaml" + additionalProperties: false properties: function: -- GitLab From 1ebfe7e36182a658819e4ded44d38d4033c8bbfb Mon Sep 17 00:00:00 2001 From: Jilin Yuan Date: Thu, 25 Aug 2022 20:41:34 +0800 Subject: [PATCH 0080/2223] pinctrl: nuvoton: Use 'unsigned int' instead of just 'unsigned'. 'unsigned int' should be clearer than 'unsigned'. 
Signed-off-by: Jilin Yuan Link: https://lore.kernel.org/r/20220825124134.30242-1-yuanjilin@cdjrlc.com Signed-off-by: Linus Walleij --- drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c index 64d8a568b3dbd..1c4e89b046de1 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c @@ -81,11 +81,11 @@ struct npcm7xx_gpio { int irq; struct irq_chip irq_chip; u32 pinctrl_id; - int (*direction_input)(struct gpio_chip *chip, unsigned offset); - int (*direction_output)(struct gpio_chip *chip, unsigned offset, + int (*direction_input)(struct gpio_chip *chip, unsigned int offset); + int (*direction_output)(struct gpio_chip *chip, unsigned int offset, int value); - int (*request)(struct gpio_chip *chip, unsigned offset); - void (*free)(struct gpio_chip *chip, unsigned offset); + int (*request)(struct gpio_chip *chip, unsigned int offset); + void (*free)(struct gpio_chip *chip, unsigned int offset); }; struct npcm7xx_pinctrl { -- GitLab From 2b96f92ca4257c05e352f61742839b451e293949 Mon Sep 17 00:00:00 2001 From: Josef Johansson Date: Mon, 14 Feb 2022 11:07:47 +0100 Subject: [PATCH 0081/2223] PCI/MSI: Correct 'can_mask' test in msi_add_msi_desc() 71020a3c0dff4 ("PCI/MSI: Use msi_add_msi_desc()") inadvertently reversed the sense of "msi_attrib.can_mask" in one use: - if (entry->pci.msi_attrib.can_mask) { - addr = pci_msix_desc_addr(entry); - entry->pci.msix_ctrl = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL); + if (!desc.pci.msi_attrib.can_mask) { + addr = pci_msix_desc_addr(&desc); + desc.pci.msix_ctrl = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL); Restore the original test. 
[bhelgaas: commit log] Fixes: 71020a3c0dff4 ("PCI/MSI: Use msi_add_msi_desc()") Link: https://lore.kernel.org/r/d818f9c9-a432-213e-4152-eaff3b7da52e@oderland.se Signed-off-by: Josef Johansson Signed-off-by: Bjorn Helgaas Reviewed-by: Jason Gunthorpe --- drivers/pci/msi/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c index 9037a7827eca7..fdd2ec09651e9 100644 --- a/drivers/pci/msi/msi.c +++ b/drivers/pci/msi/msi.c @@ -526,7 +526,7 @@ static int msix_setup_msi_descs(struct pci_dev *dev, void __iomem *base, desc.pci.msi_attrib.can_mask = !pci_msi_ignore_mask && !desc.pci.msi_attrib.is_virtual; - if (!desc.pci.msi_attrib.can_mask) { + if (desc.pci.msi_attrib.can_mask) { addr = pci_msix_desc_addr(&desc); desc.pci.msix_ctrl = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL); } -- GitLab From 423511ec23e2a6fa7830ed76b0283268e795d09d Mon Sep 17 00:00:00 2001 From: Will McVicker Date: Thu, 25 Aug 2022 23:54:02 +0000 Subject: [PATCH 0082/2223] PCI: dwc: Drop dependency on ZONE_DMA32 Re-work the msi_msg DMA allocation logic to use dmam_alloc_coherent() which uses the coherent DMA mask to try to return an allocation within the DMA mask limits. With that, we now can drop the msi_page parameter in struct dw_pcie_rp. This allows kernel configurations that disable ZONE_DMA32 to continue supporting a 32-bit DMA mask. Without this patch, the PCIe host device will fail to probe when ZONE_DMA32 is disabled. Link: https://lore.kernel.org/r/20220825235404.4132818-2-willmcvicker@google.com Fixes: 35797e672ff0 ("PCI: dwc: Fix MSI msi_msg DMA mapping") Reported-by: Isaac J. 
Manjarres Signed-off-by: Will McVicker Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Jingoo Han --- .../pci/controller/dwc/pcie-designware-host.c | 28 +++++-------------- drivers/pci/controller/dwc/pcie-designware.h | 1 - 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index 7746f94a715f5..39f3b37d4033c 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -267,15 +267,6 @@ static void dw_pcie_free_msi(struct dw_pcie_rp *pp) irq_domain_remove(pp->msi_domain); irq_domain_remove(pp->irq_domain); - - if (pp->msi_data) { - struct dw_pcie *pci = to_dw_pcie_from_pp(pp); - struct device *dev = pci->dev; - - dma_unmap_page(dev, pp->msi_data, PAGE_SIZE, DMA_FROM_DEVICE); - if (pp->msi_page) - __free_page(pp->msi_page); - } } static void dw_pcie_msi_init(struct dw_pcie_rp *pp) @@ -336,6 +327,7 @@ static int dw_pcie_msi_host_init(struct dw_pcie_rp *pp) struct dw_pcie *pci = to_dw_pcie_from_pp(pp); struct device *dev = pci->dev; struct platform_device *pdev = to_platform_device(dev); + u64 *msi_vaddr; int ret; u32 ctrl, num_ctrls; @@ -375,22 +367,16 @@ static int dw_pcie_msi_host_init(struct dw_pcie_rp *pp) dw_chained_msi_isr, pp); } - ret = dma_set_mask(dev, DMA_BIT_MASK(32)); + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32)); if (ret) dev_warn(dev, "Failed to set DMA mask to 32-bit. 
Devices with only 32-bit MSI support may not work properly\n"); - pp->msi_page = alloc_page(GFP_DMA32); - pp->msi_data = dma_map_page(dev, pp->msi_page, 0, - PAGE_SIZE, DMA_FROM_DEVICE); - ret = dma_mapping_error(dev, pp->msi_data); - if (ret) { - dev_err(pci->dev, "Failed to map MSI data\n"); - __free_page(pp->msi_page); - pp->msi_page = NULL; - pp->msi_data = 0; + msi_vaddr = dmam_alloc_coherent(dev, sizeof(u64), &pp->msi_data, + GFP_KERNEL); + if (!msi_vaddr) { + dev_err(dev, "Failed to alloc and map MSI data\n"); dw_pcie_free_msi(pp); - - return ret; + return -ENOMEM; } return 0; diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index 09b887093a84f..a871ae7eb59ec 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -243,7 +243,6 @@ struct dw_pcie_rp { struct irq_domain *irq_domain; struct irq_domain *msi_domain; dma_addr_t msi_data; - struct page *msi_page; struct irq_chip *msi_irq_chip; u32 num_vectors; u32 irq_mask[MAX_MSI_CTRLS]; -- GitLab From 80dc113aaa47c0d1dfd01f708d4d0c083022121b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 11 Aug 2022 15:53:34 -0700 Subject: [PATCH 0083/2223] f2fs: LFS mode does not support ATGC ATGC is using SSR which violates LFS mode used by zoned device. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2451623c05a7a..fe462484f5fa0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1342,6 +1342,11 @@ default_check: return -EINVAL; } + if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) { + f2fs_err(sbi, "LFS not compatible with ATGC"); + return -EINVAL; + } + if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; -- GitLab From 605b0a778aa2599aa902ae639b8e9937c74b869b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 12 Aug 2022 22:49:50 -0700 Subject: [PATCH 0084/2223] f2fs: fix wrong continue condition in GC We should decrease the frozen counter. Cc: stable@vger.kernel.org Fixes: 325163e9892b ("f2fs: add gc_urgent_high_remaining sysfs node") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6da21d405ce1e..45f90e3c46d4b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -97,14 +97,10 @@ static int gc_thread_func(void *data) */ if (sbi->gc_mode == GC_URGENT_HIGH) { spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_limited) { - if (!sbi->gc_urgent_high_remaining) { - sbi->gc_urgent_high_limited = false; - spin_unlock(&sbi->gc_urgent_high_lock); - sbi->gc_mode = GC_NORMAL; - continue; - } - sbi->gc_urgent_high_remaining--; + if (sbi->gc_urgent_high_limited && + !sbi->gc_urgent_high_remaining--) { + sbi->gc_urgent_high_limited = false; + sbi->gc_mode = GC_NORMAL; } spin_unlock(&sbi->gc_urgent_high_lock); } -- GitLab From b87846bd61c7c09560617da416208a5454530d57 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Aug 2022 15:33:00 -0700 Subject: [PATCH 0085/2223] f2fs: use memcpy_{to,from}_page() where possible This is simpler, and as a side effect it replaces several uses of kmap_atomic() 
with its recommended replacement kmap_local_page(). Signed-off-by: Eric Biggers Reviewed-by: Fabio M. De Francesco Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 15 ++++----------- fs/f2fs/super.c | 11 ++--------- fs/f2fs/verity.c | 10 ++-------- 3 files changed, 8 insertions(+), 28 deletions(-) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index bf46a7dfbea2f..73da933180369 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -64,7 +64,6 @@ bool f2fs_may_inline_dentry(struct inode *inode) void f2fs_do_read_inline_data(struct page *page, struct page *ipage) { struct inode *inode = page->mapping->host; - void *src_addr, *dst_addr; if (PageUptodate(page)) return; @@ -74,11 +73,8 @@ void f2fs_do_read_inline_data(struct page *page, struct page *ipage) zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(inode, ipage); - dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); - flush_dcache_page(page); - kunmap_atomic(dst_addr); + memcpy_to_page(page, 0, inline_data_addr(inode, ipage), + MAX_INLINE_DATA(inode)); if (!PageUptodate(page)) SetPageUptodate(page); } @@ -246,7 +242,6 @@ out: int f2fs_write_inline_data(struct inode *inode, struct page *page) { - void *src_addr, *dst_addr; struct dnode_of_data dn; int err; @@ -263,10 +258,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_bug_on(F2FS_I_SB(inode), page->index); f2fs_wait_on_page_writeback(dn.inode_page, NODE, true, true); - src_addr = kmap_atomic(page); - dst_addr = inline_data_addr(inode, dn.inode_page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); - kunmap_atomic(src_addr); + memcpy_from_page(inline_data_addr(inode, dn.inode_page), + page, 0, MAX_INLINE_DATA(inode)); set_page_dirty(dn.inode_page); f2fs_clear_page_cache_dirty_tag(page); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fe462484f5fa0..e910f0e39d764 100644 --- a/fs/f2fs/super.c 
+++ b/fs/f2fs/super.c @@ -2470,7 +2470,6 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, size_t toread; loff_t i_size = i_size_read(inode); struct page *page; - char *kaddr; if (off > i_size) return 0; @@ -2503,9 +2502,7 @@ repeat: return -EIO; } - kaddr = kmap_atomic(page); - memcpy(data, kaddr + offset, tocopy); - kunmap_atomic(kaddr); + memcpy_from_page(data, page, offset, tocopy); f2fs_put_page(page, 1); offset = 0; @@ -2527,7 +2524,6 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, size_t towrite = len; struct page *page; void *fsdata = NULL; - char *kaddr; int err = 0; int tocopy; @@ -2546,10 +2542,7 @@ retry: break; } - kaddr = kmap_atomic(page); - memcpy(kaddr + offset, data, tocopy); - kunmap_atomic(kaddr); - flush_dcache_page(page); + memcpy_to_page(page, offset, data, tocopy); a_ops->write_end(NULL, mapping, off, tocopy, tocopy, page, fsdata); diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 7b8f2b41c29b1..97ec60f39d696 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -47,16 +47,13 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count, size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); struct page *page; - void *addr; page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, NULL); if (IS_ERR(page)) return PTR_ERR(page); - addr = kmap_atomic(page); - memcpy(buf, addr + offset_in_page(pos), n); - kunmap_atomic(addr); + memcpy_from_page(buf, page, offset_in_page(pos), n); put_page(page); @@ -85,16 +82,13 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, PAGE_SIZE - offset_in_page(pos)); struct page *page; void *fsdata; - void *addr; int res; res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); if (res) return res; - addr = kmap_atomic(page); - memcpy(addr + offset_in_page(pos), buf, n); - kunmap_atomic(addr); + memcpy_to_page(page, offset_in_page(pos), buf, n); res = aops->write_end(NULL, mapping, pos, n, n, 
page, fsdata); if (res < 0) -- GitLab From 34a23525601a16f625b48c3bb0a67fbc795810b3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Aug 2022 11:04:41 +0800 Subject: [PATCH 0086/2223] f2fs: iostat: support accounting compressed IO Previously, only FS_CDATA_READ_IO type IO was accounted for compressed files. This patch adds accounting for more IO types on compressed files: - APP_BUFFERED_CDATA_IO - APP_MAPPED_CDATA_IO - FS_CDATA_IO - APP_BUFFERED_CDATA_READ_IO - APP_MAPPED_CDATA_READ_IO Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 ++- fs/f2fs/data.c | 8 ++-- fs/f2fs/f2fs.h | 5 +++ fs/f2fs/file.c | 16 ++++---- fs/f2fs/gc.c | 12 +++--- fs/f2fs/iostat.c | 74 ++++++++++++++++++++++++++----------- fs/f2fs/iostat.h | 4 +- fs/f2fs/node.c | 2 +- fs/f2fs/segment.c | 11 +++--- include/trace/events/f2fs.h | 24 +++++++++--- 10 files changed, 109 insertions(+), 52 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8259e0fa97e1f..7de48e791920c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -89,7 +89,7 @@ repeat: return ERR_PTR(err); } - f2fs_update_iostat(sbi, FS_META_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE); lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -276,7 +276,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, f2fs_put_page(page, err ?
1 : 0); if (!err) - f2fs_update_iostat(sbi, FS_META_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, + F2FS_BLKSIZE); } out: blk_finish_plug(&plug); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aa3ccddfa0376..0869fbbb5516f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1083,7 +1083,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, } ClearPageError(page); inc_page_count(sbi, F2FS_RD_DATA); - f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); __submit_bio(sbi, bio, DATA); return 0; } @@ -2122,7 +2122,8 @@ submit_and_realloc: goto submit_and_realloc; inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); - f2fs_update_iostat(F2FS_I_SB(inode), FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, + F2FS_BLKSIZE); ClearPageError(page); *last_block_in_bio = block_nr; goto out; @@ -2270,8 +2271,7 @@ submit_and_realloc: refcount_inc(&dic->refcnt); inc_page_count(sbi, F2FS_RD_DATA); - f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); - f2fs_update_iostat(sbi, FS_CDATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); ClearPageError(page); *last_block_in_bio = blkaddr; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3c7cdb70fe2ef..809419a258ed2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1158,7 +1158,10 @@ enum iostat_type { APP_BUFFERED_IO, /* app buffered write IOs */ APP_WRITE_IO, /* app write IOs */ APP_MAPPED_IO, /* app mapped IOs */ + APP_BUFFERED_CDATA_IO, /* app buffered write IOs on compressed file */ + APP_MAPPED_CDATA_IO, /* app mapped write IOs on compressed file */ FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_CDATA_IO, /* data IOs from kworker/fsync/reclaimer on compressed file */ FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ FS_META_IO, /* meta IOs from kworker/reclaimer */ FS_GC_DATA_IO, /* data IOs from forground 
gc */ @@ -1172,6 +1175,8 @@ enum iostat_type { APP_BUFFERED_READ_IO, /* app buffered read IOs */ APP_READ_IO, /* app read IOs */ APP_MAPPED_READ_IO, /* app mapped read IOs */ + APP_BUFFERED_CDATA_READ_IO, /* app buffered read IOs on compressed file */ + APP_MAPPED_CDATA_READ_IO, /* app mapped read IOs on compressed file */ FS_DATA_READ_IO, /* data read IOs */ FS_GDATA_READ_IO, /* data read IOs from background gc */ FS_CDATA_READ_IO, /* compressed data read IOs */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ce4905a073b3c..771f1f7f3690c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -43,8 +43,8 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) ret = filemap_fault(vmf); if (!ret) - f2fs_update_iostat(F2FS_I_SB(inode), APP_MAPPED_READ_IO, - F2FS_BLKSIZE); + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_MAPPED_READ_IO, F2FS_BLKSIZE); trace_f2fs_filemap_fault(inode, vmf->pgoff, (unsigned long)ret); @@ -154,7 +154,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (!PageUptodate(page)) SetPageUptodate(page); - f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE); f2fs_update_time(sbi, REQ_TIME); trace_f2fs_vm_page_mkwrite(page, DATA); @@ -4212,7 +4212,7 @@ static int f2fs_dio_read_end_io(struct kiocb *iocb, ssize_t size, int error, dec_page_count(sbi, F2FS_DIO_READ); if (error) return error; - f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, size); + f2fs_update_iostat(sbi, NULL, APP_DIRECT_READ_IO, size); return 0; } @@ -4301,7 +4301,8 @@ skip_read_trace: } else { ret = filemap_read(iocb, to, 0); if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_READ_IO, ret); } if (trace_f2fs_dataread_end_enabled()) trace_f2fs_dataread_end(inode, pos, ret); @@ -4418,7 +4419,8 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, if (ret > 0) { iocb->ki_pos += ret; - 
f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_IO, ret); + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_IO, ret); } return ret; } @@ -4431,7 +4433,7 @@ static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, dec_page_count(sbi, F2FS_DIO_WRITE); if (error) return error; - f2fs_update_iostat(sbi, APP_DIRECT_IO, size); + f2fs_update_iostat(sbi, NULL, APP_DIRECT_IO, size); return 0; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 45f90e3c46d4b..2a3816c20f846 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1202,8 +1202,8 @@ got_it: f2fs_put_page(fio.encrypted_page, 0); f2fs_put_page(page, 1); - f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); - f2fs_update_iostat(sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_GDATA_READ_IO, F2FS_BLKSIZE); return 0; put_encrypted_page: @@ -1303,8 +1303,10 @@ static int move_data_block(struct inode *inode, block_t bidx, goto up_out; } - f2fs_update_iostat(fio.sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); - f2fs_update_iostat(fio.sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, inode, FS_DATA_READ_IO, + F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, NULL, FS_GDATA_READ_IO, + F2FS_BLKSIZE); lock_page(mpage); if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) || @@ -1356,7 +1358,7 @@ static int move_data_block(struct inode *inode, block_t bidx, goto put_page_out; } - f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, NULL, FS_GC_DATA_IO, F2FS_BLKSIZE); f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index d84c5f6cc09d7..3166a8939ed4f 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -31,55 +31,65 @@ int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) /* print app write IOs */ seq_puts(seq, "[WRITE]\n"); - seq_printf(seq, "app buffered: %-16llu\n", + 
seq_printf(seq, "app buffered data: %-16llu\n", sbi->rw_iostat[APP_BUFFERED_IO]); - seq_printf(seq, "app direct: %-16llu\n", + seq_printf(seq, "app direct data: %-16llu\n", sbi->rw_iostat[APP_DIRECT_IO]); - seq_printf(seq, "app mapped: %-16llu\n", + seq_printf(seq, "app mapped data: %-16llu\n", sbi->rw_iostat[APP_MAPPED_IO]); + seq_printf(seq, "app buffered cdata: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_CDATA_IO]); + seq_printf(seq, "app mapped cdata: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_CDATA_IO]); /* print fs write IOs */ - seq_printf(seq, "fs data: %-16llu\n", + seq_printf(seq, "fs data: %-16llu\n", sbi->rw_iostat[FS_DATA_IO]); - seq_printf(seq, "fs node: %-16llu\n", + seq_printf(seq, "fs cdata: %-16llu\n", + sbi->rw_iostat[FS_CDATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", sbi->rw_iostat[FS_NODE_IO]); - seq_printf(seq, "fs meta: %-16llu\n", + seq_printf(seq, "fs meta: %-16llu\n", sbi->rw_iostat[FS_META_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", + seq_printf(seq, "fs gc data: %-16llu\n", sbi->rw_iostat[FS_GC_DATA_IO]); - seq_printf(seq, "fs gc node: %-16llu\n", + seq_printf(seq, "fs gc node: %-16llu\n", sbi->rw_iostat[FS_GC_NODE_IO]); - seq_printf(seq, "fs cp data: %-16llu\n", + seq_printf(seq, "fs cp data: %-16llu\n", sbi->rw_iostat[FS_CP_DATA_IO]); - seq_printf(seq, "fs cp node: %-16llu\n", + seq_printf(seq, "fs cp node: %-16llu\n", sbi->rw_iostat[FS_CP_NODE_IO]); - seq_printf(seq, "fs cp meta: %-16llu\n", + seq_printf(seq, "fs cp meta: %-16llu\n", sbi->rw_iostat[FS_CP_META_IO]); /* print app read IOs */ seq_puts(seq, "[READ]\n"); - seq_printf(seq, "app buffered: %-16llu\n", + seq_printf(seq, "app buffered data: %-16llu\n", sbi->rw_iostat[APP_BUFFERED_READ_IO]); - seq_printf(seq, "app direct: %-16llu\n", + seq_printf(seq, "app direct data: %-16llu\n", sbi->rw_iostat[APP_DIRECT_READ_IO]); - seq_printf(seq, "app mapped: %-16llu\n", + seq_printf(seq, "app mapped data: %-16llu\n", sbi->rw_iostat[APP_MAPPED_READ_IO]); + seq_printf(seq, "app buffered 
cdata: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO]); + seq_printf(seq, "app mapped cdata: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO]); /* print fs read IOs */ - seq_printf(seq, "fs data: %-16llu\n", + seq_printf(seq, "fs data: %-16llu\n", sbi->rw_iostat[FS_DATA_READ_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", + seq_printf(seq, "fs gc data: %-16llu\n", sbi->rw_iostat[FS_GDATA_READ_IO]); - seq_printf(seq, "fs compr_data: %-16llu\n", + seq_printf(seq, "fs cdata: %-16llu\n", sbi->rw_iostat[FS_CDATA_READ_IO]); - seq_printf(seq, "fs node: %-16llu\n", + seq_printf(seq, "fs node: %-16llu\n", sbi->rw_iostat[FS_NODE_READ_IO]); - seq_printf(seq, "fs meta: %-16llu\n", + seq_printf(seq, "fs meta: %-16llu\n", sbi->rw_iostat[FS_META_READ_IO]); /* print other IOs */ seq_puts(seq, "[OTHER]\n"); - seq_printf(seq, "fs discard: %-16llu\n", + seq_printf(seq, "fs discard: %-16llu\n", sbi->rw_iostat[FS_DISCARD]); return 0; @@ -159,7 +169,7 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) spin_unlock_irq(&sbi->iostat_lat_lock); } -void f2fs_update_iostat(struct f2fs_sb_info *sbi, +void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes) { unsigned long flags; @@ -176,6 +186,28 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO) sbi->rw_iostat[APP_READ_IO] += io_bytes; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (inode && f2fs_compressed_file(inode)) { + if (type == APP_BUFFERED_IO) + sbi->rw_iostat[APP_BUFFERED_CDATA_IO] += io_bytes; + + if (type == APP_BUFFERED_READ_IO) + sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO] += io_bytes; + + if (type == APP_MAPPED_READ_IO) + sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO] += io_bytes; + + if (type == APP_MAPPED_IO) + sbi->rw_iostat[APP_MAPPED_CDATA_IO] += io_bytes; + + if (type == FS_DATA_READ_IO) + sbi->rw_iostat[FS_CDATA_READ_IO] += io_bytes; + + if (type == FS_DATA_IO) + 
sbi->rw_iostat[FS_CDATA_IO] += io_bytes; + } +#endif + spin_unlock_irqrestore(&sbi->iostat_lock, flags); f2fs_record_iostat(sbi); diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h index 22a2d01f57ef3..2c048307b6e0b 100644 --- a/fs/f2fs/iostat.h +++ b/fs/f2fs/iostat.h @@ -31,7 +31,7 @@ struct iostat_lat_info { extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset); extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi); -extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, +extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes); struct bio_iostat_ctx { @@ -65,7 +65,7 @@ extern void f2fs_destroy_iostat_processing(void); extern int f2fs_init_iostat(struct f2fs_sb_info *sbi); extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi); #else -static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes) {} static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {} static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e06a0c478b39a..2484285be3ad9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1369,7 +1369,7 @@ static int read_node_page(struct page *page, blk_opf_t op_flags) err = f2fs_submit_page_bio(&fio); if (!err) - f2fs_update_iostat(sbi, FS_NODE_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_NODE_READ_IO, F2FS_BLKSIZE); return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0de21f82d7bc8..a5054725d0b63 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1171,7 +1171,7 @@ submit: atomic_inc(&dcc->issued_discard); - f2fs_update_iostat(sbi, FS_DISCARD, 1); + f2fs_update_iostat(sbi, NULL, FS_DISCARD, 1); lstart += len; start += len; @@ -3388,7 +3388,7 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info 
*sbi, struct page *page, f2fs_submit_page_write(&fio); stat_inc_meta_count(sbi, page->index); - f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, io_type, F2FS_BLKSIZE); } void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -3398,7 +3398,7 @@ void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(fio->sbi, NULL, fio->io_type, F2FS_BLKSIZE); } void f2fs_outplace_write_data(struct dnode_of_data *dn, @@ -3412,7 +3412,7 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); - f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, dn->inode, fio->io_type, F2FS_BLKSIZE); } int f2fs_inplace_write_data(struct f2fs_io_info *fio) @@ -3453,7 +3453,8 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) if (!err) { f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(fio->sbi, fio->page->mapping->host, + fio->io_type, F2FS_BLKSIZE); } return err; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index f1e9222377368..b262985f0c3a2 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1823,7 +1823,10 @@ TRACE_EVENT(f2fs_iostat, __field(unsigned long long, app_bio) __field(unsigned long long, app_wio) __field(unsigned long long, app_mio) + __field(unsigned long long, app_bcdio) + __field(unsigned long long, app_mcdio) __field(unsigned long long, fs_dio) + __field(unsigned long long, fs_cdio) __field(unsigned long long, fs_nio) __field(unsigned long long, fs_mio) __field(unsigned long long, fs_gc_dio) @@ -1835,6 +1838,8 @@ TRACE_EVENT(f2fs_iostat, __field(unsigned long long, app_brio) __field(unsigned long long, app_rio) 
__field(unsigned long long, app_mrio) + __field(unsigned long long, app_bcrio) + __field(unsigned long long, app_mcrio) __field(unsigned long long, fs_drio) __field(unsigned long long, fs_gdrio) __field(unsigned long long, fs_cdrio) @@ -1849,7 +1854,10 @@ TRACE_EVENT(f2fs_iostat, __entry->app_bio = iostat[APP_BUFFERED_IO]; __entry->app_wio = iostat[APP_WRITE_IO]; __entry->app_mio = iostat[APP_MAPPED_IO]; + __entry->app_bcdio = iostat[APP_BUFFERED_CDATA_IO]; + __entry->app_mcdio = iostat[APP_MAPPED_CDATA_IO]; __entry->fs_dio = iostat[FS_DATA_IO]; + __entry->fs_cdio = iostat[FS_CDATA_IO]; __entry->fs_nio = iostat[FS_NODE_IO]; __entry->fs_mio = iostat[FS_META_IO]; __entry->fs_gc_dio = iostat[FS_GC_DATA_IO]; @@ -1861,6 +1869,8 @@ TRACE_EVENT(f2fs_iostat, __entry->app_brio = iostat[APP_BUFFERED_READ_IO]; __entry->app_rio = iostat[APP_READ_IO]; __entry->app_mrio = iostat[APP_MAPPED_READ_IO]; + __entry->app_bcrio = iostat[APP_BUFFERED_CDATA_READ_IO]; + __entry->app_mcrio = iostat[APP_MAPPED_CDATA_READ_IO]; __entry->fs_drio = iostat[FS_DATA_READ_IO]; __entry->fs_gdrio = iostat[FS_GDATA_READ_IO]; __entry->fs_cdrio = iostat[FS_CDATA_READ_IO]; @@ -1870,20 +1880,24 @@ TRACE_EVENT(f2fs_iostat, ), TP_printk("dev = (%d,%d), " - "app [write=%llu (direct=%llu, buffered=%llu), mapped=%llu], " - "fs [data=%llu, node=%llu, meta=%llu, discard=%llu], " + "app [write=%llu (direct=%llu, buffered=%llu), mapped=%llu, " + "compr(buffered=%llu, mapped=%llu)], " + "fs [data=%llu, cdata=%llu, node=%llu, meta=%llu, discard=%llu], " "gc [data=%llu, node=%llu], " "cp [data=%llu, node=%llu, meta=%llu], " "app [read=%llu (direct=%llu, buffered=%llu), mapped=%llu], " - "fs [data=%llu, (gc_data=%llu, compr_data=%llu), " + "compr(buffered=%llu, mapped=%llu)], " + "fs [data=%llu, (gc_data=%llu, cdata=%llu), " "node=%llu, meta=%llu]", show_dev(__entry->dev), __entry->app_wio, __entry->app_dio, - __entry->app_bio, __entry->app_mio, __entry->fs_dio, + __entry->app_bio, __entry->app_mio, __entry->app_bcdio, 
+ __entry->app_mcdio, __entry->fs_dio, __entry->fs_cdio, __entry->fs_nio, __entry->fs_mio, __entry->fs_discard, __entry->fs_gc_dio, __entry->fs_gc_nio, __entry->fs_cp_dio, __entry->fs_cp_nio, __entry->fs_cp_mio, __entry->app_rio, __entry->app_drio, __entry->app_brio, - __entry->app_mrio, __entry->fs_drio, __entry->fs_gdrio, + __entry->app_mrio, __entry->app_bcrio, __entry->app_mcrio, + __entry->fs_drio, __entry->fs_gdrio, __entry->fs_cdrio, __entry->fs_nrio, __entry->fs_mrio) ); -- GitLab From 265576181b4afda8c60ae85261f55a8430419884 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Aug 2022 11:06:00 +0800 Subject: [PATCH 0087/2223] f2fs: remove gc_urgent_high_limited for cleanup Remove redundant sbi->gc_urgent_high_limited. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - fs/f2fs/gc.c | 8 ++++---- fs/f2fs/sysfs.c | 1 - 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 809419a258ed2..6770210aae704 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1731,7 +1731,6 @@ struct f2fs_sb_info { unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ spinlock_t gc_urgent_high_lock; - bool gc_urgent_high_limited; /* indicates having limited trial count */ unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ /* for skip statistic */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 2a3816c20f846..fd400d148afb2 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -97,10 +97,10 @@ static int gc_thread_func(void *data) */ if (sbi->gc_mode == GC_URGENT_HIGH) { spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_limited && - !sbi->gc_urgent_high_remaining--) { - sbi->gc_urgent_high_limited = false; - sbi->gc_mode = GC_NORMAL; + if (sbi->gc_urgent_high_remaining) { + sbi->gc_urgent_high_remaining--; + if (!sbi->gc_urgent_high_remaining) + sbi->gc_mode = GC_NORMAL; } spin_unlock(&sbi->gc_urgent_high_lock); } 
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index eba5fb1629d71..39ebf0ad133a9 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -527,7 +527,6 @@ out: if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { spin_lock(&sbi->gc_urgent_high_lock); - sbi->gc_urgent_high_limited = t != 0; sbi->gc_urgent_high_remaining = t; spin_unlock(&sbi->gc_urgent_high_lock); -- GitLab From 2baedb9f93c42d35016c3c2e3015d67fbcb058b0 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 30 Apr 2022 11:47:40 +0300 Subject: [PATCH 0088/2223] PCI: qcom-ep: Add MODULE_DEVICE_TABLE Add MODULE_DEVICE_TABLE to enable module autoloading for respective device. Link: https://lore.kernel.org/r/20220430084740.3769925-1-dmitry.baryshkov@linaro.org Fixes: f55fee56a631 ("PCI: qcom-ep: Add Qualcomm PCIe Endpoint controller driver") Signed-off-by: Dmitry Baryshkov Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/pcie-qcom-ep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c index ec99116ad05c8..4c87167861fd6 100644 --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -704,6 +704,7 @@ static const struct of_device_id qcom_pcie_ep_match[] = { { .compatible = "qcom,sdx55-pcie-ep", }, { } }; +MODULE_DEVICE_TABLE(of, qcom_pcie_ep_match); static struct platform_driver qcom_pcie_ep_driver = { .probe = qcom_pcie_ep_probe, -- GitLab From c0f1bc4e91c52be73ae1a5e6fd53371f5a7f0333 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sun, 14 Aug 2022 00:50:19 -0500 Subject: [PATCH 0089/2223] memblock tests: add command line help option Add a help command line option to the help message. Add the help option to the short and long options so it will be recognized as a valid option. 
Usage: $ ./main -h Or: $ ./main --help Reviewed-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/0f3b93a79de78c0da1ca90f74fe35e9a85c7cf93.1660451025.git.remckee0@gmail.com --- tools/testing/memblock/tests/common.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/memblock/tests/common.c b/tools/testing/memblock/tests/common.c index e43b2676af816..76a8ad818f3a7 100644 --- a/tools/testing/memblock/tests/common.c +++ b/tools/testing/memblock/tests/common.c @@ -14,14 +14,16 @@ static struct test_memory memory_block; static const char __maybe_unused *prefixes[PREFIXES_MAX]; static int __maybe_unused nr_prefixes; -static const char *short_opts = "mv"; +static const char *short_opts = "hmv"; static const struct option long_opts[] = { + {"help", 0, NULL, 'h'}, {"movable-node", 0, NULL, 'm'}, {"verbose", 0, NULL, 'v'}, {NULL, 0, NULL, 0} }; static const char * const help_opts[] = { + "display this help message and exit", "disallow allocations from regions marked as hotplugged\n\t\t\t" "by simulating enabling the \"movable_node\" kernel\n\t\t\t" "parameter", -- GitLab From 61ebea2ba19826ce6dff8686b72ecbea8269f6cc Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sun, 14 Aug 2022 00:50:20 -0500 Subject: [PATCH 0090/2223] memblock tests: update reference to obsolete build option in comments The VERBOSE build option was replaced with the --verbose runtime option, but the comments describing the ASSERT_*() macros still refer to the VERBOSE build option. Update these comments so that they refer to the --verbose runtime option. 
Reviewed-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/5f8a4c2bde34cc029282c68d47eda982d950f421.1660451025.git.remckee0@gmail.com --- tools/testing/memblock/tests/common.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/testing/memblock/tests/common.h b/tools/testing/memblock/tests/common.h index 3e7f23d341d79..d396e5423a8e1 100644 --- a/tools/testing/memblock/tests/common.h +++ b/tools/testing/memblock/tests/common.h @@ -16,7 +16,8 @@ * ASSERT_EQ(): * Check the condition * @_expected == @_seen - * If false, print failed test message (if in VERBOSE mode) and then assert + * If false, print failed test message (if running with --verbose) and then + * assert. */ #define ASSERT_EQ(_expected, _seen) do { \ if ((_expected) != (_seen)) \ @@ -28,7 +29,8 @@ * ASSERT_NE(): * Check the condition * @_expected != @_seen - * If false, print failed test message (if in VERBOSE mode) and then assert + * If false, print failed test message (if running with --verbose) and then + * assert. */ #define ASSERT_NE(_expected, _seen) do { \ if ((_expected) == (_seen)) \ @@ -40,7 +42,8 @@ * ASSERT_LT(): * Check the condition * @_expected < @_seen - * If false, print failed test message (if in VERBOSE mode) and then assert + * If false, print failed test message (if running with --verbose) and then + * assert. */ #define ASSERT_LT(_expected, _seen) do { \ if ((_expected) >= (_seen)) \ -- GitLab From ac76d803c4f6c2a32c9c7436d14467e099fd2bfa Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 27 Aug 2022 00:42:43 -0500 Subject: [PATCH 0091/2223] memblock tests: update tests to check if memblock_alloc zeroed memory Add an assert in memblock_alloc() tests where allocation is expected to occur. The assert checks whether the entire chunk of allocated memory is cleared. The current memblock_alloc() tests do not check whether the allocated memory was zeroed. 
memblock_alloc() should zero the allocated memory since it is a wrapper for memblock_alloc_try_nid(). Reviewed-by: Shaoqin Huang Reviewed-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/83ffb941b65074f40eb14552f8bfe5b71fe50abd.1661578349.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_api.c | 23 +++++++++++++++++++++++ tools/testing/memblock/tests/common.c | 7 +++++++ tools/testing/memblock/tests/common.h | 12 ++++++++++++ 3 files changed, 42 insertions(+) diff --git a/tools/testing/memblock/tests/alloc_api.c b/tools/testing/memblock/tests/alloc_api.c index a14f38eb8a890..c97da91a98d66 100644 --- a/tools/testing/memblock/tests/alloc_api.c +++ b/tools/testing/memblock/tests/alloc_api.c @@ -22,6 +22,8 @@ static int alloc_top_down_simple_check(void) allocated_ptr = memblock_alloc(size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, size); + ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, expected_start); @@ -80,6 +82,8 @@ static int alloc_top_down_disjoint_check(void) allocated_ptr = memblock_alloc(r2_size, alignment); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + ASSERT_EQ(rgn1->size, r1.size); ASSERT_EQ(rgn1->base, r1.base); @@ -125,6 +129,8 @@ static int alloc_top_down_before_check(void) allocated_ptr = memblock_alloc(r2_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - total_size); @@ -173,6 +179,8 @@ static int alloc_top_down_after_check(void) allocated_ptr = memblock_alloc(r2_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, r1.base - r2_size); @@ -223,6 +231,8 @@ static int alloc_top_down_second_fit_check(void) allocated_ptr = memblock_alloc(r3_size, SMP_CACHE_BYTES); 
ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + ASSERT_EQ(rgn->size, r2.size + r3_size); ASSERT_EQ(rgn->base, r2.base - r3_size); @@ -277,6 +287,8 @@ static int alloc_in_between_generic_check(void) allocated_ptr = memblock_alloc(r3_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, r1.base - r2.size - r3_size); @@ -418,6 +430,8 @@ static int alloc_limited_space_generic_check(void) allocated_ptr = memblock_alloc(available_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, available_size); + ASSERT_EQ(rgn->size, MEM_SIZE); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -472,6 +486,8 @@ static int alloc_bottom_up_simple_check(void) allocated_ptr = memblock_alloc(SZ_2, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, SZ_2); + ASSERT_EQ(rgn->size, SZ_2); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -528,6 +544,7 @@ static int alloc_bottom_up_disjoint_check(void) allocated_ptr = memblock_alloc(r2_size, alignment); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); ASSERT_EQ(rgn1->size, r1.size); ASSERT_EQ(rgn1->base, r1.base); @@ -571,6 +588,8 @@ static int alloc_bottom_up_before_check(void) allocated_ptr = memblock_alloc(r1_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r1_size); + ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -618,6 +637,8 @@ static int alloc_bottom_up_after_check(void) allocated_ptr = memblock_alloc(r2_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, r1.base); @@ -669,6 +690,8 @@ static int alloc_bottom_up_second_fit_check(void) allocated_ptr = memblock_alloc(r3_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + 
ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + ASSERT_EQ(rgn->size, r2.size + r3_size); ASSERT_EQ(rgn->base, r2.base); diff --git a/tools/testing/memblock/tests/common.c b/tools/testing/memblock/tests/common.c index 76a8ad818f3a7..eec6901081af3 100644 --- a/tools/testing/memblock/tests/common.c +++ b/tools/testing/memblock/tests/common.c @@ -60,16 +60,23 @@ void reset_memblock_attributes(void) memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE; } +static inline void fill_memblock(void) +{ + memset(memory_block.base, 1, MEM_SIZE); +} + void setup_memblock(void) { reset_memblock_regions(); memblock_add((phys_addr_t)memory_block.base, MEM_SIZE); + fill_memblock(); } void dummy_physical_memory_init(void) { memory_block.base = malloc(MEM_SIZE); assert(memory_block.base); + fill_memblock(); } void dummy_physical_memory_cleanup(void) diff --git a/tools/testing/memblock/tests/common.h b/tools/testing/memblock/tests/common.h index d396e5423a8e1..93e559780890e 100644 --- a/tools/testing/memblock/tests/common.h +++ b/tools/testing/memblock/tests/common.h @@ -51,6 +51,18 @@ assert((_expected) < (_seen)); \ } while (0) +/** + * ASSERT_MEM_EQ(): + * Check that the first @_size bytes of @_seen are all equal to @_expected. + * If false, print failed test message (if running with --verbose) and then + * assert. + */ +#define ASSERT_MEM_EQ(_seen, _expected, _size) do { \ + for (int _i = 0; _i < (_size); _i++) { \ + ASSERT_EQ(((char *)_seen)[_i], (_expected)); \ + } \ +} while (0) + #define PREFIX_PUSH() prefix_push(__func__) /* -- GitLab From 25b9defb5bc4aee8beb51ded07838e12745426f9 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 27 Aug 2022 00:42:44 -0500 Subject: [PATCH 0092/2223] memblock tests: update zeroed memory check for memblock_alloc_* tests Update the assert in memblock_alloc_try_nid() and memblock_alloc_from() tests that checks whether the memory is cleared so that it checks the entire chunk of allocated memory instead of just the first byte. 
Reviewed-by: David Hildenbrand Reviewed-by: Shaoqin Huang Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/24b3271751756100142e65b75284d43b4d30c9b7.1661578349.git.remckee0@gmail.com --- .../memblock/tests/alloc_helpers_api.c | 8 +-- tools/testing/memblock/tests/alloc_nid_api.c | 72 +++++-------------- 2 files changed, 20 insertions(+), 60 deletions(-) diff --git a/tools/testing/memblock/tests/alloc_helpers_api.c b/tools/testing/memblock/tests/alloc_helpers_api.c index 1069b4bdd5fdd..f1c7d6f170b62 100644 --- a/tools/testing/memblock/tests/alloc_helpers_api.c +++ b/tools/testing/memblock/tests/alloc_helpers_api.c @@ -19,7 +19,6 @@ static int alloc_from_simple_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -31,10 +30,9 @@ static int alloc_from_simple_generic_check(void) min_addr = memblock_end_of_DRAM() - SMP_CACHE_BYTES; allocated_ptr = memblock_alloc_from(size, SMP_CACHE_BYTES, min_addr); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr); @@ -66,7 +64,6 @@ static int alloc_from_misaligned_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -79,10 +76,9 @@ static int alloc_from_misaligned_generic_check(void) min_addr = memblock_end_of_DRAM() - (SMP_CACHE_BYTES * 2 - 1); allocated_ptr = memblock_alloc_from(size, SMP_CACHE_BYTES, min_addr); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - SMP_CACHE_BYTES); diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c index 255fd514e9f5d..a069534c459e5 100644 --- 
a/tools/testing/memblock/tests/alloc_nid_api.c +++ b/tools/testing/memblock/tests/alloc_nid_api.c @@ -19,7 +19,6 @@ static int alloc_try_nid_top_down_simple_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -35,11 +34,10 @@ static int alloc_try_nid_top_down_simple_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, max_addr - size); @@ -74,7 +72,6 @@ static int alloc_try_nid_top_down_end_misaligned_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -91,11 +88,10 @@ static int alloc_try_nid_top_down_end_misaligned_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, max_addr - size - misalign); @@ -128,7 +124,6 @@ static int alloc_try_nid_exact_address_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -144,11 +139,10 @@ static int alloc_try_nid_exact_address_generic_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr); @@ -183,7 +177,6 @@ static int alloc_try_nid_top_down_narrow_range_check(void) { struct memblock_region *rgn = 
&memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -198,10 +191,9 @@ static int alloc_try_nid_top_down_narrow_range_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, max_addr - size); @@ -277,7 +269,6 @@ static int alloc_try_nid_min_reserved_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -298,10 +289,9 @@ static int alloc_try_nid_min_reserved_generic_check(void) allocated_ptr = memblock_alloc_try_nid(r2_size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, reserved_base); @@ -332,7 +322,6 @@ static int alloc_try_nid_max_reserved_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -351,10 +340,9 @@ static int alloc_try_nid_max_reserved_generic_check(void) allocated_ptr = memblock_alloc_try_nid(r2_size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, min_addr); @@ -389,7 +377,6 @@ static int alloc_try_nid_top_down_reserved_with_space_check(void) struct memblock_region *rgn1 = &memblock.reserved.regions[1]; struct memblock_region *rgn2 = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; struct region r1, r2; PREFIX_PUSH(); @@ -417,10 +404,9 @@ static int alloc_try_nid_top_down_reserved_with_space_check(void) allocated_ptr = 
memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); ASSERT_EQ(rgn1->size, r1.size + r3_size); ASSERT_EQ(rgn1->base, max_addr - r3_size); @@ -456,7 +442,6 @@ static int alloc_try_nid_reserved_full_merge_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; struct region r1, r2; PREFIX_PUSH(); @@ -483,10 +468,9 @@ static int alloc_try_nid_reserved_full_merge_generic_check(void) allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, r2.base); @@ -522,7 +506,6 @@ static int alloc_try_nid_top_down_reserved_no_space_check(void) struct memblock_region *rgn1 = &memblock.reserved.regions[1]; struct memblock_region *rgn2 = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; struct region r1, r2; PREFIX_PUSH(); @@ -550,10 +533,9 @@ static int alloc_try_nid_top_down_reserved_no_space_check(void) allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); ASSERT_EQ(rgn1->size, r1.size); ASSERT_EQ(rgn1->base, r1.base); @@ -634,7 +616,6 @@ static int alloc_try_nid_top_down_cap_max_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -649,10 +630,9 @@ static int alloc_try_nid_top_down_cap_max_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + 
ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - size); @@ -674,7 +654,6 @@ static int alloc_try_nid_top_down_cap_min_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -689,10 +668,9 @@ static int alloc_try_nid_top_down_cap_min_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - size); @@ -723,7 +701,6 @@ static int alloc_try_nid_bottom_up_simple_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -740,11 +717,10 @@ static int alloc_try_nid_bottom_up_simple_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr); @@ -779,7 +755,6 @@ static int alloc_try_nid_bottom_up_start_misaligned_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -797,11 +772,10 @@ static int alloc_try_nid_bottom_up_start_misaligned_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr + (SMP_CACHE_BYTES - misalign)); @@ -836,7 +810,6 @@ static int alloc_try_nid_bottom_up_narrow_range_check(void) { struct memblock_region 
*rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -852,10 +825,9 @@ static int alloc_try_nid_bottom_up_narrow_range_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -890,7 +862,6 @@ static int alloc_try_nid_bottom_up_reserved_with_space_check(void) struct memblock_region *rgn1 = &memblock.reserved.regions[1]; struct memblock_region *rgn2 = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; struct region r1, r2; PREFIX_PUSH(); @@ -919,10 +890,9 @@ static int alloc_try_nid_bottom_up_reserved_with_space_check(void) allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); ASSERT_EQ(rgn1->size, r1.size); ASSERT_EQ(rgn1->base, max_addr); @@ -964,7 +934,6 @@ static int alloc_try_nid_bottom_up_reserved_no_space_check(void) struct memblock_region *rgn2 = &memblock.reserved.regions[1]; struct memblock_region *rgn3 = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; struct region r1, r2; PREFIX_PUSH(); @@ -993,10 +962,9 @@ static int alloc_try_nid_bottom_up_reserved_no_space_check(void) allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); ASSERT_EQ(rgn3->size, r3_size); ASSERT_EQ(rgn3->base, memblock_start_of_DRAM()); @@ -1024,7 +992,6 @@ static int alloc_try_nid_bottom_up_cap_max_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; 
PREFIX_PUSH(); @@ -1040,10 +1007,9 @@ static int alloc_try_nid_bottom_up_cap_max_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr); @@ -1065,7 +1031,6 @@ static int alloc_try_nid_bottom_up_cap_min_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -1081,10 +1046,9 @@ static int alloc_try_nid_bottom_up_cap_min_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); -- GitLab From fb2e97fe853ff515df473d4acec6707816e05d87 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 27 Aug 2022 00:42:45 -0500 Subject: [PATCH 0093/2223] memblock tests: add labels to verbose output for generic alloc tests Generic tests for memblock_alloc*() functions do not use separate functions for testing top-down and bottom-up allocation directions. Therefore, the function name that is displayed in the verbose testing output does not include the allocation direction. Add an additional prefix when running generic tests for memblock_alloc*() functions that indicates which allocation direction is set. The prefix will be displayed when the tests are run in verbose mode. 
Reviewed-by: David Hildenbrand Reviewed-by: Shaoqin Huang Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/fb76a42253d2a196a7daea29dd8121a69904f58e.1661578349.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_api.c | 36 +++++++------------ .../memblock/tests/alloc_helpers_api.c | 12 +++---- tools/testing/memblock/tests/alloc_nid_api.c | 36 +++++++------------ tools/testing/memblock/tests/common.h | 16 +++++++++ 4 files changed, 44 insertions(+), 56 deletions(-) diff --git a/tools/testing/memblock/tests/alloc_api.c b/tools/testing/memblock/tests/alloc_api.c index c97da91a98d66..de3405634f8a6 100644 --- a/tools/testing/memblock/tests/alloc_api.c +++ b/tools/testing/memblock/tests/alloc_api.c @@ -751,10 +751,8 @@ static int alloc_after_check(void) static int alloc_in_between_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_in_between_generic_check(); - memblock_set_bottom_up(true); - alloc_in_between_generic_check(); + run_top_down(alloc_in_between_generic_check); + run_bottom_up(alloc_in_between_generic_check); return 0; } @@ -773,10 +771,8 @@ static int alloc_second_fit_check(void) static int alloc_small_gaps_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_small_gaps_generic_check(); - memblock_set_bottom_up(true); - alloc_small_gaps_generic_check(); + run_top_down(alloc_small_gaps_generic_check); + run_bottom_up(alloc_small_gaps_generic_check); return 0; } @@ -784,10 +780,8 @@ static int alloc_small_gaps_check(void) static int alloc_all_reserved_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_all_reserved_generic_check(); - memblock_set_bottom_up(true); - alloc_all_reserved_generic_check(); + run_top_down(alloc_all_reserved_generic_check); + run_bottom_up(alloc_all_reserved_generic_check); return 0; } @@ -795,10 +789,8 @@ static int alloc_all_reserved_check(void) 
static int alloc_no_space_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_no_space_generic_check(); - memblock_set_bottom_up(true); - alloc_no_space_generic_check(); + run_top_down(alloc_no_space_generic_check); + run_bottom_up(alloc_no_space_generic_check); return 0; } @@ -806,10 +798,8 @@ static int alloc_no_space_check(void) static int alloc_limited_space_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_limited_space_generic_check(); - memblock_set_bottom_up(true); - alloc_limited_space_generic_check(); + run_top_down(alloc_limited_space_generic_check); + run_bottom_up(alloc_limited_space_generic_check); return 0; } @@ -817,10 +807,8 @@ static int alloc_limited_space_check(void) static int alloc_no_memory_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_no_memory_generic_check(); - memblock_set_bottom_up(true); - alloc_no_memory_generic_check(); + run_top_down(alloc_no_memory_generic_check); + run_bottom_up(alloc_no_memory_generic_check); return 0; } diff --git a/tools/testing/memblock/tests/alloc_helpers_api.c b/tools/testing/memblock/tests/alloc_helpers_api.c index f1c7d6f170b62..06577bd0e349b 100644 --- a/tools/testing/memblock/tests/alloc_helpers_api.c +++ b/tools/testing/memblock/tests/alloc_helpers_api.c @@ -357,10 +357,8 @@ static int alloc_from_bottom_up_min_addr_cap_check(void) static int alloc_from_simple_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_from_simple_generic_check(); - memblock_set_bottom_up(true); - alloc_from_simple_generic_check(); + run_top_down(alloc_from_simple_generic_check); + run_bottom_up(alloc_from_simple_generic_check); return 0; } @@ -368,10 +366,8 @@ static int alloc_from_simple_check(void) static int alloc_from_misaligned_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - 
alloc_from_misaligned_generic_check(); - memblock_set_bottom_up(true); - alloc_from_misaligned_generic_check(); + run_top_down(alloc_from_misaligned_generic_check); + run_bottom_up(alloc_from_misaligned_generic_check); return 0; } diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c index a069534c459e5..9324d706ee3ab 100644 --- a/tools/testing/memblock/tests/alloc_nid_api.c +++ b/tools/testing/memblock/tests/alloc_nid_api.c @@ -1142,10 +1142,8 @@ static int alloc_try_nid_cap_min_check(void) static int alloc_try_nid_min_reserved_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_try_nid_min_reserved_generic_check(); - memblock_set_bottom_up(true); - alloc_try_nid_min_reserved_generic_check(); + run_top_down(alloc_try_nid_min_reserved_generic_check); + run_bottom_up(alloc_try_nid_min_reserved_generic_check); return 0; } @@ -1153,10 +1151,8 @@ static int alloc_try_nid_min_reserved_check(void) static int alloc_try_nid_max_reserved_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_try_nid_max_reserved_generic_check(); - memblock_set_bottom_up(true); - alloc_try_nid_max_reserved_generic_check(); + run_top_down(alloc_try_nid_max_reserved_generic_check); + run_bottom_up(alloc_try_nid_max_reserved_generic_check); return 0; } @@ -1164,10 +1160,8 @@ static int alloc_try_nid_max_reserved_check(void) static int alloc_try_nid_exact_address_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_try_nid_exact_address_generic_check(); - memblock_set_bottom_up(true); - alloc_try_nid_exact_address_generic_check(); + run_top_down(alloc_try_nid_exact_address_generic_check); + run_bottom_up(alloc_try_nid_exact_address_generic_check); return 0; } @@ -1175,10 +1169,8 @@ static int alloc_try_nid_exact_address_check(void) static int alloc_try_nid_reserved_full_merge_check(void) { test_print("\tRunning 
%s...\n", __func__); - memblock_set_bottom_up(false); - alloc_try_nid_reserved_full_merge_generic_check(); - memblock_set_bottom_up(true); - alloc_try_nid_reserved_full_merge_generic_check(); + run_top_down(alloc_try_nid_reserved_full_merge_generic_check); + run_bottom_up(alloc_try_nid_reserved_full_merge_generic_check); return 0; } @@ -1186,10 +1178,8 @@ static int alloc_try_nid_reserved_full_merge_check(void) static int alloc_try_nid_reserved_all_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_try_nid_reserved_all_generic_check(); - memblock_set_bottom_up(true); - alloc_try_nid_reserved_all_generic_check(); + run_top_down(alloc_try_nid_reserved_all_generic_check); + run_bottom_up(alloc_try_nid_reserved_all_generic_check); return 0; } @@ -1197,10 +1187,8 @@ static int alloc_try_nid_reserved_all_check(void) static int alloc_try_nid_low_max_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_try_nid_low_max_generic_check(); - memblock_set_bottom_up(true); - alloc_try_nid_low_max_generic_check(); + run_top_down(alloc_try_nid_low_max_generic_check); + run_bottom_up(alloc_try_nid_low_max_generic_check); return 0; } diff --git a/tools/testing/memblock/tests/common.h b/tools/testing/memblock/tests/common.h index 93e559780890e..c53f9c365714c 100644 --- a/tools/testing/memblock/tests/common.h +++ b/tools/testing/memblock/tests/common.h @@ -100,4 +100,20 @@ static inline void test_pass_pop(void) prefix_pop(); } +static inline void run_top_down(int (*func)()) +{ + memblock_set_bottom_up(false); + prefix_push("top-down"); + func(); + prefix_pop(); +} + +static inline void run_bottom_up(int (*func)()) +{ + memblock_set_bottom_up(true); + prefix_push("bottom-up"); + func(); + prefix_pop(); +} + #endif -- GitLab From 21a233f68afe55aafa8b79705c97f7a1d37be3e1 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 27 Aug 2022 00:42:46 -0500 Subject: [PATCH 0094/2223] memblock tests: 
add additional tests for basic api and memblock_alloc Add tests for memblock_add(), memblock_reserve(), memblock_remove(), memblock_free(), and memblock_alloc() for the following test scenarios. memblock_add() and memblock_reserve(): - add/reserve a memory block in the gap between two existing memory blocks, and check that the blocks are merged into one region - try to add/reserve memblock regions that extend past PHYS_ADDR_MAX memblock_remove() and memblock_free(): - remove/free a region when it is the only available region + These tests ensure that the first region is overwritten with a "dummy" region when the last remaining region of that type is removed or freed. - remove/free() a region that overlaps with two existing regions of the relevant type - try to remove/free memblock regions that extend past PHYS_ADDR_MAX memblock_alloc(): - try to allocate a region that is larger than the total size of available memory (memblock.memory) Reviewed-by: David Hildenbrand Reviewed-by: Shaoqin Huang Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/c23c0393c5b9a53fe7f676996913c629495e9727.1661578349.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_api.c | 44 ++ tools/testing/memblock/tests/basic_api.c | 499 +++++++++++++++++++++++ 2 files changed, 543 insertions(+) diff --git a/tools/testing/memblock/tests/alloc_api.c b/tools/testing/memblock/tests/alloc_api.c index de3405634f8a6..e20e326d636fa 100644 --- a/tools/testing/memblock/tests/alloc_api.c +++ b/tools/testing/memblock/tests/alloc_api.c @@ -469,6 +469,40 @@ static int alloc_no_memory_generic_check(void) return 0; } +/* + * A test that tries to allocate a region that is larger than the total size of + * available memory (memblock.memory): + * + * +-----------------------------------+ + * | new | + * +-----------------------------------+ + * | | + * | | + * +---------------------------------+ + * + * Expect no allocation to happen. 
+ */ +static int alloc_too_large_generic_check(void) +{ + struct memblock_region *rgn = &memblock.reserved.regions[0]; + void *allocated_ptr = NULL; + + PREFIX_PUSH(); + + setup_memblock(); + + allocated_ptr = memblock_alloc(MEM_SIZE + SZ_2, SMP_CACHE_BYTES); + + ASSERT_EQ(allocated_ptr, NULL); + ASSERT_EQ(rgn->size, 0); + ASSERT_EQ(rgn->base, 0); + ASSERT_EQ(memblock.reserved.total_size, 0); + + test_pass_pop(); + + return 0; +} + /* * A simple test that tries to allocate a small memory region. * Expect to allocate an aligned region at the beginning of the available @@ -813,6 +847,15 @@ static int alloc_no_memory_check(void) return 0; } +static int alloc_too_large_check(void) +{ + test_print("\tRunning %s...\n", __func__); + run_top_down(alloc_too_large_generic_check); + run_bottom_up(alloc_too_large_generic_check); + + return 0; +} + int memblock_alloc_checks(void) { const char *func_testing = "memblock_alloc"; @@ -835,6 +878,7 @@ int memblock_alloc_checks(void) alloc_no_space_check(); alloc_limited_space_check(); alloc_no_memory_check(); + alloc_too_large_check(); dummy_physical_memory_cleanup(); diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index 66f46f261e668..ea79396e46111 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -326,6 +326,102 @@ static int memblock_add_twice_check(void) return 0; } +/* + * A test that tries to add two memory blocks that don't overlap with one + * another and then add a third memory block in the space between the first two: + * + * | +--------+--------+--------+ | + * | | r1 | r3 | r2 | | + * +--------+--------+--------+--------+--+ + * + * Expect to merge the three entries into one region that starts at r1.base + * and has size of r1.size + r2.size + r3.size. The region counter and total + * size of the available memory are updated. 
+ */ +static int memblock_add_between_check(void) +{ + struct memblock_region *rgn; + phys_addr_t total_size; + + rgn = &memblock.memory.regions[0]; + + struct region r1 = { + .base = SZ_1G, + .size = SZ_8K + }; + struct region r2 = { + .base = SZ_1G + SZ_16K, + .size = SZ_8K + }; + struct region r3 = { + .base = SZ_1G + SZ_8K, + .size = SZ_8K + }; + + PREFIX_PUSH(); + + total_size = r1.size + r2.size + r3.size; + + reset_memblock_regions(); + memblock_add(r1.base, r1.size); + memblock_add(r2.base, r2.size); + memblock_add(r3.base, r3.size); + + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, total_size); + + test_pass_pop(); + + return 0; +} + +/* + * A simple test that tries to add a memory block r when r extends past + * PHYS_ADDR_MAX: + * + * +--------+ + * | r | + * +--------+ + * | +----+ + * | | rgn| + * +----------------------------+----+ + * + * Expect to add a memory block of size PHYS_ADDR_MAX - r.base. Expect the + * total size of available memory and the counter to be updated. 
+ */ +static int memblock_add_near_max_check(void) +{ + struct memblock_region *rgn; + phys_addr_t total_size; + + rgn = &memblock.memory.regions[0]; + + struct region r = { + .base = PHYS_ADDR_MAX - SZ_1M, + .size = SZ_2M + }; + + PREFIX_PUSH(); + + total_size = PHYS_ADDR_MAX - r.base; + + reset_memblock_regions(); + memblock_add(r.base, r.size); + + ASSERT_EQ(rgn->base, r.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, total_size); + + test_pass_pop(); + + return 0; +} + static int memblock_add_checks(void) { prefix_reset(); @@ -339,6 +435,8 @@ static int memblock_add_checks(void) memblock_add_overlap_bottom_check(); memblock_add_within_check(); memblock_add_twice_check(); + memblock_add_between_check(); + memblock_add_near_max_check(); prefix_pop(); @@ -604,6 +702,102 @@ static int memblock_reserve_twice_check(void) return 0; } +/* + * A test that tries to mark two memory blocks that don't overlap as reserved + * and then reserve a third memory block in the space between the first two: + * + * | +--------+--------+--------+ | + * | | r1 | r3 | r2 | | + * +--------+--------+--------+--------+--+ + * + * Expect to merge the three entries into one reserved region that starts at + * r1.base and has size of r1.size + r2.size + r3.size. The region counter and + * total for memblock.reserved are updated. 
+ */ +static int memblock_reserve_between_check(void) +{ + struct memblock_region *rgn; + phys_addr_t total_size; + + rgn = &memblock.reserved.regions[0]; + + struct region r1 = { + .base = SZ_1G, + .size = SZ_8K + }; + struct region r2 = { + .base = SZ_1G + SZ_16K, + .size = SZ_8K + }; + struct region r3 = { + .base = SZ_1G + SZ_8K, + .size = SZ_8K + }; + + PREFIX_PUSH(); + + total_size = r1.size + r2.size + r3.size; + + reset_memblock_regions(); + memblock_reserve(r1.base, r1.size); + memblock_reserve(r2.base, r2.size); + memblock_reserve(r3.base, r3.size); + + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); + + return 0; +} + +/* + * A simple test that tries to reserve a memory block r when r extends past + * PHYS_ADDR_MAX: + * + * +--------+ + * | r | + * +--------+ + * | +----+ + * | | rgn| + * +----------------------------+----+ + * + * Expect to reserve a memory block of size PHYS_ADDR_MAX - r.base. Expect the + * total size of reserved memory and the counter to be updated. 
+ */ +static int memblock_reserve_near_max_check(void) +{ + struct memblock_region *rgn; + phys_addr_t total_size; + + rgn = &memblock.reserved.regions[0]; + + struct region r = { + .base = PHYS_ADDR_MAX - SZ_1M, + .size = SZ_2M + }; + + PREFIX_PUSH(); + + total_size = PHYS_ADDR_MAX - r.base; + + reset_memblock_regions(); + memblock_reserve(r.base, r.size); + + ASSERT_EQ(rgn->base, r.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); + + return 0; +} + static int memblock_reserve_checks(void) { prefix_reset(); @@ -616,6 +810,8 @@ static int memblock_reserve_checks(void) memblock_reserve_overlap_bottom_check(); memblock_reserve_within_check(); memblock_reserve_twice_check(); + memblock_reserve_between_check(); + memblock_reserve_near_max_check(); prefix_pop(); @@ -887,6 +1083,155 @@ static int memblock_remove_within_check(void) return 0; } +/* + * A simple test that tries to remove a region r1 from the array of + * available memory regions when r1 is the only available region. + * Expect to add a memory block r1 and then remove r1 so that a dummy + * region is added. The region counter stays the same, and the total size + * is updated. 
+ */ +static int memblock_remove_only_region_check(void) +{ + struct memblock_region *rgn; + + rgn = &memblock.memory.regions[0]; + + struct region r1 = { + .base = SZ_2K, + .size = SZ_4K + }; + + PREFIX_PUSH(); + + reset_memblock_regions(); + memblock_add(r1.base, r1.size); + memblock_remove(r1.base, r1.size); + + ASSERT_EQ(rgn->base, 0); + ASSERT_EQ(rgn->size, 0); + + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, 0); + + test_pass_pop(); + + return 0; +} + +/* + * A simple test that tries to remove a region r2 from the array of available + * memory regions when r2 extends past PHYS_ADDR_MAX: + * + * +--------+ + * | r2 | + * +--------+ + * | +---+....+ + * | |rgn| | + * +------------------------+---+----+ + * + * Expect that only the portion between PHYS_ADDR_MAX and r2.base is removed. + * Expect the total size of available memory to be updated and the counter to + * not be updated. + */ +static int memblock_remove_near_max_check(void) +{ + struct memblock_region *rgn; + phys_addr_t total_size; + + rgn = &memblock.memory.regions[0]; + + struct region r1 = { + .base = PHYS_ADDR_MAX - SZ_2M, + .size = SZ_2M + }; + + struct region r2 = { + .base = PHYS_ADDR_MAX - SZ_1M, + .size = SZ_2M + }; + + PREFIX_PUSH(); + + total_size = r1.size - (PHYS_ADDR_MAX - r2.base); + + reset_memblock_regions(); + memblock_add(r1.base, r1.size); + memblock_remove(r2.base, r2.size); + + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, total_size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to remove a region r3 that overlaps with two existing + * regions r1 and r2: + * + * +----------------+ + * | r3 | + * +----------------+ + * | +----+.....
........+--------+ + * | | |r1 : : |r2 | | + * +----+----+----+---+-------+--------+-----+ + * + * Expect that only the intersections of r1 with r3 and r2 with r3 are removed + * from the available memory pool. Expect the total size of available memory to + * be updated and the counter to not be updated. + */ +static int memblock_remove_overlap_two_check(void) +{ + struct memblock_region *rgn1, *rgn2; + phys_addr_t new_r1_size, new_r2_size, r2_end, r3_end, total_size; + + rgn1 = &memblock.memory.regions[0]; + rgn2 = &memblock.memory.regions[1]; + + struct region r1 = { + .base = SZ_16M, + .size = SZ_32M + }; + struct region r2 = { + .base = SZ_64M, + .size = SZ_64M + }; + struct region r3 = { + .base = SZ_32M, + .size = SZ_64M + }; + + PREFIX_PUSH(); + + r2_end = r2.base + r2.size; + r3_end = r3.base + r3.size; + new_r1_size = r3.base - r1.base; + new_r2_size = r2_end - r3_end; + total_size = new_r1_size + new_r2_size; + + reset_memblock_regions(); + memblock_add(r1.base, r1.size); + memblock_add(r2.base, r2.size); + memblock_remove(r3.base, r3.size); + + ASSERT_EQ(rgn1->base, r1.base); + ASSERT_EQ(rgn1->size, new_r1_size); + + ASSERT_EQ(rgn2->base, r3_end); + ASSERT_EQ(rgn2->size, new_r2_size); + + ASSERT_EQ(memblock.memory.cnt, 2); + ASSERT_EQ(memblock.memory.total_size, total_size); + + test_pass_pop(); + + return 0; +} + static int memblock_remove_checks(void) { prefix_reset(); @@ -898,6 +1243,9 @@ static int memblock_remove_checks(void) memblock_remove_overlap_top_check(); memblock_remove_overlap_bottom_check(); memblock_remove_within_check(); + memblock_remove_only_region_check(); + memblock_remove_near_max_check(); + memblock_remove_overlap_two_check(); prefix_pop(); @@ -1163,6 +1511,154 @@ static int memblock_free_within_check(void) return 0; } +/* + * A simple test that tries to free a memory block r1 that was marked + * earlier as reserved when r1 is the only available region. 
+ * Expect to reserve a memory block r1 and then free r1 so that r1 is + * overwritten with a dummy region. The region counter stays the same, + * and the total size is updated. + */ +static int memblock_free_only_region_check(void) +{ + struct memblock_region *rgn; + + rgn = &memblock.reserved.regions[0]; + + struct region r1 = { + .base = SZ_2K, + .size = SZ_4K + }; + + PREFIX_PUSH(); + + reset_memblock_regions(); + memblock_reserve(r1.base, r1.size); + memblock_free((void *)r1.base, r1.size); + + ASSERT_EQ(rgn->base, 0); + ASSERT_EQ(rgn->size, 0); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, 0); + + test_pass_pop(); + + return 0; +} + +/* + * A simple test that tries to free a region r2 when r2 extends past PHYS_ADDR_MAX: + * + * +--------+ + * | r2 | + * +--------+ + * | +---+....+ + * | |rgn| | + * +------------------------+---+----+ + * + * Expect that only the portion between PHYS_ADDR_MAX and r2.base is freed. + * Expect the total size of reserved memory to be updated and the counter to + * not be updated. + */ +static int memblock_free_near_max_check(void) +{ + struct memblock_region *rgn; + phys_addr_t total_size; + + rgn = &memblock.reserved.regions[0]; + + struct region r1 = { + .base = PHYS_ADDR_MAX - SZ_2M, + .size = SZ_2M + }; + + struct region r2 = { + .base = PHYS_ADDR_MAX - SZ_1M, + .size = SZ_2M + }; + + PREFIX_PUSH(); + + total_size = r1.size - (PHYS_ADDR_MAX - r2.base); + + reset_memblock_regions(); + memblock_reserve(r1.base, r1.size); + memblock_free((void *)r2.base, r2.size); + + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to free a reserved region r3 that overlaps with two + * existing reserved regions r1 and r2: + * + * +----------------+ + * | r3 | + * +----------------+ + * | +----+.....
........+--------+ + * | | |r1 : : |r2 | | + * +----+----+----+---+-------+--------+-----+ + * + * Expect that only the intersections of r1 with r3 and r2 with r3 are freed + * from the collection of reserved memory. Expect the total size of reserved + * memory to be updated and the counter to not be updated. + */ +static int memblock_free_overlap_two_check(void) +{ + struct memblock_region *rgn1, *rgn2; + phys_addr_t new_r1_size, new_r2_size, r2_end, r3_end, total_size; + + rgn1 = &memblock.reserved.regions[0]; + rgn2 = &memblock.reserved.regions[1]; + + struct region r1 = { + .base = SZ_16M, + .size = SZ_32M + }; + struct region r2 = { + .base = SZ_64M, + .size = SZ_64M + }; + struct region r3 = { + .base = SZ_32M, + .size = SZ_64M + }; + + PREFIX_PUSH(); + + r2_end = r2.base + r2.size; + r3_end = r3.base + r3.size; + new_r1_size = r3.base - r1.base; + new_r2_size = r2_end - r3_end; + total_size = new_r1_size + new_r2_size; + + reset_memblock_regions(); + memblock_reserve(r1.base, r1.size); + memblock_reserve(r2.base, r2.size); + memblock_free((void *)r3.base, r3.size); + + ASSERT_EQ(rgn1->base, r1.base); + ASSERT_EQ(rgn1->size, new_r1_size); + + ASSERT_EQ(rgn2->base, r3_end); + ASSERT_EQ(rgn2->size, new_r2_size); + + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); + + return 0; +} + static int memblock_free_checks(void) { prefix_reset(); @@ -1174,6 +1670,9 @@ static int memblock_free_checks(void) memblock_free_overlap_top_check(); memblock_free_overlap_bottom_check(); memblock_free_within_check(); + memblock_free_only_region_check(); + memblock_free_near_max_check(); + memblock_free_overlap_two_check(); prefix_pop(); -- GitLab From deee033e0f8ea66a9f4acfc1eb069fdef3013bec Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 27 Aug 2022 00:42:47 -0500 Subject: [PATCH 0095/2223] memblock tests: update alloc_api to test memblock_alloc_raw Update memblock_alloc() tests so that they test either 
memblock_alloc() or memblock_alloc_raw() depending on the value of alloc_test_flags. Run through all the existing tests in memblock_alloc_api twice: once for memblock_alloc() and once for memblock_alloc_raw(). When the tests run memblock_alloc(), they test that the entire memory region is zero. When the tests run memblock_alloc_raw(), they test that the entire memory region is nonzero. The content of the memory region is initialized to nonzero, and we expect it to remain unchanged if running memblock_alloc_raw(). Reviewed-by: Shaoqin Huang Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/5a7cfb2f807ee2cb53ee77f9f5c910107b253d6e.1661578349.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_api.c | 91 +++++++++++++++--------- tools/testing/memblock/tests/common.h | 27 +++++++ 2 files changed, 85 insertions(+), 33 deletions(-) diff --git a/tools/testing/memblock/tests/alloc_api.c b/tools/testing/memblock/tests/alloc_api.c index e20e326d636fa..36dd7e254cce9 100644 --- a/tools/testing/memblock/tests/alloc_api.c +++ b/tools/testing/memblock/tests/alloc_api.c @@ -1,6 +1,22 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "alloc_api.h" +static int alloc_test_flags = TEST_F_NONE; + +static inline const char * const get_memblock_alloc_name(int flags) +{ + if (flags & TEST_F_RAW) + return "memblock_alloc_raw"; + return "memblock_alloc"; +} + +static inline void *run_memblock_alloc(phys_addr_t size, phys_addr_t align) +{ + if (alloc_test_flags & TEST_F_RAW) + return memblock_alloc_raw(size, align); + return memblock_alloc(size, align); +} + /* * A simple test that tries to allocate a small memory region. * Expect to allocate an aligned region near the end of the available memory. 
@@ -19,10 +35,10 @@ static int alloc_top_down_simple_check(void) expected_start = memblock_end_of_DRAM() - SMP_CACHE_BYTES; - allocated_ptr = memblock_alloc(size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, expected_start); @@ -79,10 +95,10 @@ static int alloc_top_down_disjoint_check(void) memblock_reserve(r1.base, r1.size); - allocated_ptr = memblock_alloc(r2_size, alignment); + allocated_ptr = run_memblock_alloc(r2_size, alignment); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + assert_mem_content(allocated_ptr, r2_size, alloc_test_flags); ASSERT_EQ(rgn1->size, r1.size); ASSERT_EQ(rgn1->base, r1.base); @@ -126,10 +142,10 @@ static int alloc_top_down_before_check(void) memblock_reserve(memblock_end_of_DRAM() - total_size, r1_size); - allocated_ptr = memblock_alloc(r2_size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(r2_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + assert_mem_content(allocated_ptr, r2_size, alloc_test_flags); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - total_size); @@ -176,10 +192,10 @@ static int alloc_top_down_after_check(void) memblock_reserve(r1.base, r1.size); - allocated_ptr = memblock_alloc(r2_size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(r2_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + assert_mem_content(allocated_ptr, r2_size, alloc_test_flags); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, r1.base - r2_size); @@ -228,10 +244,10 @@ static int alloc_top_down_second_fit_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc(r3_size, SMP_CACHE_BYTES); + 
allocated_ptr = run_memblock_alloc(r3_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + assert_mem_content(allocated_ptr, r3_size, alloc_test_flags); ASSERT_EQ(rgn->size, r2.size + r3_size); ASSERT_EQ(rgn->base, r2.base - r3_size); @@ -284,10 +300,10 @@ static int alloc_in_between_generic_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc(r3_size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(r3_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + assert_mem_content(allocated_ptr, r3_size, alloc_test_flags); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, r1.base - r2.size - r3_size); @@ -332,7 +348,7 @@ static int alloc_small_gaps_generic_check(void) region_end += gap_size + region_size; } - allocated_ptr = memblock_alloc(region_size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(region_size, SMP_CACHE_BYTES); ASSERT_EQ(allocated_ptr, NULL); @@ -356,7 +372,7 @@ static int alloc_all_reserved_generic_check(void) /* Simulate full memory */ memblock_reserve(memblock_start_of_DRAM(), MEM_SIZE); - allocated_ptr = memblock_alloc(SZ_256, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(SZ_256, SMP_CACHE_BYTES); ASSERT_EQ(allocated_ptr, NULL); @@ -392,7 +408,7 @@ static int alloc_no_space_generic_check(void) /* Simulate almost-full memory */ memblock_reserve(memblock_start_of_DRAM(), reserved_size); - allocated_ptr = memblock_alloc(SZ_1K, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(SZ_1K, SMP_CACHE_BYTES); ASSERT_EQ(allocated_ptr, NULL); @@ -427,10 +443,10 @@ static int alloc_limited_space_generic_check(void) /* Simulate almost-full memory */ memblock_reserve(memblock_start_of_DRAM(), reserved_size); - allocated_ptr = memblock_alloc(available_size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(available_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); 
- ASSERT_MEM_EQ(allocated_ptr, 0, available_size); + assert_mem_content(allocated_ptr, available_size, alloc_test_flags); ASSERT_EQ(rgn->size, MEM_SIZE); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -457,7 +473,7 @@ static int alloc_no_memory_generic_check(void) reset_memblock_regions(); - allocated_ptr = memblock_alloc(SZ_1K, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(SZ_1K, SMP_CACHE_BYTES); ASSERT_EQ(allocated_ptr, NULL); ASSERT_EQ(rgn->size, 0); @@ -491,7 +507,7 @@ static int alloc_too_large_generic_check(void) setup_memblock(); - allocated_ptr = memblock_alloc(MEM_SIZE + SZ_2, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(MEM_SIZE + SZ_2, SMP_CACHE_BYTES); ASSERT_EQ(allocated_ptr, NULL); ASSERT_EQ(rgn->size, 0); @@ -517,10 +533,10 @@ static int alloc_bottom_up_simple_check(void) setup_memblock(); - allocated_ptr = memblock_alloc(SZ_2, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(SZ_2, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, SZ_2); + assert_mem_content(allocated_ptr, SZ_2, alloc_test_flags); ASSERT_EQ(rgn->size, SZ_2); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -575,10 +591,10 @@ static int alloc_bottom_up_disjoint_check(void) memblock_reserve(r1.base, r1.size); - allocated_ptr = memblock_alloc(r2_size, alignment); + allocated_ptr = run_memblock_alloc(r2_size, alignment); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + assert_mem_content(allocated_ptr, r2_size, alloc_test_flags); ASSERT_EQ(rgn1->size, r1.size); ASSERT_EQ(rgn1->base, r1.base); @@ -619,10 +635,10 @@ static int alloc_bottom_up_before_check(void) memblock_reserve(memblock_start_of_DRAM() + r1_size, r2_size); - allocated_ptr = memblock_alloc(r1_size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(r1_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r1_size); + assert_mem_content(allocated_ptr, r1_size, alloc_test_flags); 
ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -668,10 +684,10 @@ static int alloc_bottom_up_after_check(void) memblock_reserve(r1.base, r1.size); - allocated_ptr = memblock_alloc(r2_size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(r2_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + assert_mem_content(allocated_ptr, r2_size, alloc_test_flags); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, r1.base); @@ -721,10 +737,10 @@ static int alloc_bottom_up_second_fit_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc(r3_size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(r3_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + assert_mem_content(allocated_ptr, r3_size, alloc_test_flags); ASSERT_EQ(rgn->size, r2.size + r3_size); ASSERT_EQ(rgn->base, r2.base); @@ -856,13 +872,14 @@ static int alloc_too_large_check(void) return 0; } -int memblock_alloc_checks(void) +static int memblock_alloc_checks_internal(int flags) { - const char *func_testing = "memblock_alloc"; + const char *func = get_memblock_alloc_name(flags); + alloc_test_flags = flags; prefix_reset(); - prefix_push(func_testing); - test_print("Running %s tests...\n", func_testing); + prefix_push(func); + test_print("Running %s tests...\n", func); reset_memblock_attributes(); dummy_physical_memory_init(); @@ -886,3 +903,11 @@ int memblock_alloc_checks(void) return 0; } + +int memblock_alloc_checks(void) +{ + memblock_alloc_checks_internal(TEST_F_NONE); + memblock_alloc_checks_internal(TEST_F_RAW); + + return 0; +} diff --git a/tools/testing/memblock/tests/common.h b/tools/testing/memblock/tests/common.h index c53f9c365714c..78128e109a95c 100644 --- a/tools/testing/memblock/tests/common.h +++ b/tools/testing/memblock/tests/common.h @@ -12,6 +12,13 @@ #define MEM_SIZE SZ_16K +enum 
test_flags { + /* No special request. */ + TEST_F_NONE = 0x0, + /* Perform raw allocations (no zeroing of memory). */ + TEST_F_RAW = 0x1, +}; + /** * ASSERT_EQ(): * Check the condition @@ -63,6 +70,18 @@ } \ } while (0) +/** + * ASSERT_MEM_NE(): + * Check that none of the first @_size bytes of @_seen are equal to @_expected. + * If false, print failed test message (if running with --verbose) and then + * assert. + */ +#define ASSERT_MEM_NE(_seen, _expected, _size) do { \ + for (int _i = 0; _i < (_size); _i++) { \ + ASSERT_NE(((char *)_seen)[_i], (_expected)); \ + } \ +} while (0) + #define PREFIX_PUSH() prefix_push(__func__) /* @@ -116,4 +135,12 @@ static inline void run_bottom_up(int (*func)()) prefix_pop(); } +static inline void assert_mem_content(void *mem, int size, int flags) +{ + if (flags & TEST_F_RAW) + ASSERT_MEM_NE(mem, 0, size); + else + ASSERT_MEM_EQ(mem, 0, size); +} + #endif -- GitLab From ae544fd62c14265dc663a65b3f9c6c5a6134098a Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 27 Aug 2022 00:42:48 -0500 Subject: [PATCH 0096/2223] memblock tests: update alloc_nid_api to test memblock_alloc_try_nid_raw Update memblock_alloc_try_nid() tests so that they test either memblock_alloc_try_nid() or memblock_alloc_try_nid_raw() depending on the value of alloc_nid_test_flags. Run through all the existing tests in alloc_nid_api twice: once for memblock_alloc_try_nid() and once for memblock_alloc_try_nid_raw(). When the tests run memblock_alloc_try_nid(), they test that the entire memory region is zero. When the tests run memblock_alloc_try_nid_raw(), they test that the entire memory region is nonzero. The content of the memory region is initialized to nonzero, and we expect it to remain unchanged if running memblock_alloc_try_nid_raw(). 
Reviewed-by: Shaoqin Huang Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/6fa8938f67872841c10a00afb042947d1d280a04.1661578349.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_nid_api.c | 180 ++++++++++++------- 1 file changed, 111 insertions(+), 69 deletions(-) diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c index 9324d706ee3ab..32b3c1594fddd 100644 --- a/tools/testing/memblock/tests/alloc_nid_api.c +++ b/tools/testing/memblock/tests/alloc_nid_api.c @@ -1,6 +1,26 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "alloc_nid_api.h" +static int alloc_nid_test_flags = TEST_F_NONE; + +static inline const char * const get_memblock_alloc_try_nid_name(int flags) +{ + if (flags & TEST_F_RAW) + return "memblock_alloc_try_nid_raw"; + return "memblock_alloc_try_nid"; +} + +static inline void *run_memblock_alloc_try_nid(phys_addr_t size, + phys_addr_t align, + phys_addr_t min_addr, + phys_addr_t max_addr, int nid) +{ + if (alloc_nid_test_flags & TEST_F_RAW) + return memblock_alloc_try_nid_raw(size, align, min_addr, + max_addr, nid); + return memblock_alloc_try_nid(size, align, min_addr, max_addr, nid); +} + /* * A simple test that tries to allocate a memory region within min_addr and * max_addr range: @@ -32,12 +52,13 @@ static int alloc_try_nid_top_down_simple_check(void) min_addr = memblock_start_of_DRAM() + SMP_CACHE_BYTES * 2; max_addr = min_addr + SZ_512; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, max_addr - size); @@ -86,12 +107,13 @@ static int 
alloc_try_nid_top_down_end_misaligned_check(void) min_addr = memblock_start_of_DRAM() + SMP_CACHE_BYTES * 2; max_addr = min_addr + SZ_512 + misalign; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, max_addr - size - misalign); @@ -137,12 +159,13 @@ static int alloc_try_nid_exact_address_generic_check(void) min_addr = memblock_start_of_DRAM() + SMP_CACHE_BYTES; max_addr = min_addr + size; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr); @@ -189,11 +212,12 @@ static int alloc_try_nid_top_down_narrow_range_check(void) min_addr = memblock_start_of_DRAM() + SZ_512; max_addr = min_addr + SMP_CACHE_BYTES; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, max_addr - size); @@ -241,8 +265,9 @@ static int alloc_try_nid_low_max_generic_check(void) min_addr = memblock_start_of_DRAM(); max_addr = min_addr + SMP_CACHE_BYTES; - allocated_ptr = memblock_alloc_try_nid(size, 
SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_EQ(allocated_ptr, NULL); @@ -287,11 +312,12 @@ static int alloc_try_nid_min_reserved_generic_check(void) memblock_reserve(reserved_base, r1_size); - allocated_ptr = memblock_alloc_try_nid(r2_size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(r2_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + assert_mem_content(allocated_ptr, r2_size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, reserved_base); @@ -338,11 +364,12 @@ static int alloc_try_nid_max_reserved_generic_check(void) memblock_reserve(max_addr, r1_size); - allocated_ptr = memblock_alloc_try_nid(r2_size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(r2_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + assert_mem_content(allocated_ptr, r2_size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, min_addr); @@ -402,11 +429,12 @@ static int alloc_try_nid_top_down_reserved_with_space_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + assert_mem_content(allocated_ptr, r3_size, alloc_nid_test_flags); ASSERT_EQ(rgn1->size, r1.size + r3_size); ASSERT_EQ(rgn1->base, max_addr - r3_size); @@ -466,11 +494,12 @@ static int alloc_try_nid_reserved_full_merge_generic_check(void) 
memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + assert_mem_content(allocated_ptr, r3_size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, r2.base); @@ -531,11 +560,12 @@ static int alloc_try_nid_top_down_reserved_no_space_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + assert_mem_content(allocated_ptr, r3_size, alloc_nid_test_flags); ASSERT_EQ(rgn1->size, r1.size); ASSERT_EQ(rgn1->base, r1.base); @@ -597,8 +627,9 @@ static int alloc_try_nid_reserved_all_generic_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_EQ(allocated_ptr, NULL); @@ -628,11 +659,12 @@ static int alloc_try_nid_top_down_cap_max_check(void) min_addr = memblock_end_of_DRAM() - SZ_1K; max_addr = memblock_end_of_DRAM() + SZ_256; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); 
ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - size); @@ -666,11 +698,12 @@ static int alloc_try_nid_top_down_cap_min_check(void) min_addr = memblock_start_of_DRAM() - SZ_256; max_addr = memblock_end_of_DRAM(); - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - size); @@ -714,13 +747,13 @@ static int alloc_try_nid_bottom_up_simple_check(void) min_addr = memblock_start_of_DRAM() + SMP_CACHE_BYTES * 2; max_addr = min_addr + SZ_512; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr); @@ -769,13 +802,13 @@ static int alloc_try_nid_bottom_up_start_misaligned_check(void) min_addr = memblock_start_of_DRAM() + misalign; max_addr = min_addr + SZ_512; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr + (SMP_CACHE_BYTES - misalign)); @@ -822,12 +855,12 @@ static int alloc_try_nid_bottom_up_narrow_range_check(void) min_addr = 
memblock_start_of_DRAM() + SZ_512; max_addr = min_addr + SMP_CACHE_BYTES; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -887,12 +920,12 @@ static int alloc_try_nid_bottom_up_reserved_with_space_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + assert_mem_content(allocated_ptr, r3_size, alloc_nid_test_flags); ASSERT_EQ(rgn1->size, r1.size); ASSERT_EQ(rgn1->base, max_addr); @@ -959,12 +992,12 @@ static int alloc_try_nid_bottom_up_reserved_no_space_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + assert_mem_content(allocated_ptr, r3_size, alloc_nid_test_flags); ASSERT_EQ(rgn3->size, r3_size); ASSERT_EQ(rgn3->base, memblock_start_of_DRAM()); @@ -1004,12 +1037,12 @@ static int alloc_try_nid_bottom_up_cap_max_check(void) min_addr = memblock_start_of_DRAM() + SZ_1K; max_addr = memblock_end_of_DRAM() + SZ_256; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = 
run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr); @@ -1043,12 +1076,12 @@ static int alloc_try_nid_bottom_up_cap_min_check(void) min_addr = memblock_start_of_DRAM(); max_addr = memblock_end_of_DRAM() - SZ_256; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -1193,13 +1226,14 @@ static int alloc_try_nid_low_max_check(void) return 0; } -int memblock_alloc_nid_checks(void) +static int memblock_alloc_nid_checks_internal(int flags) { - const char *func_testing = "memblock_alloc_try_nid"; + const char *func = get_memblock_alloc_try_nid_name(flags); + alloc_nid_test_flags = flags; prefix_reset(); - prefix_push(func_testing); - test_print("Running %s tests...\n", func_testing); + prefix_push(func); + test_print("Running %s tests...\n", func); reset_memblock_attributes(); dummy_physical_memory_init(); @@ -1225,3 +1259,11 @@ int memblock_alloc_nid_checks(void) return 0; } + +int memblock_alloc_nid_checks(void) +{ + memblock_alloc_nid_checks_internal(TEST_F_NONE); + memblock_alloc_nid_checks_internal(TEST_F_RAW); + + return 0; +} -- GitLab From a541c6d428f775efcfe25236062c96b59e31b57a Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 27 Aug 2022 00:42:49 -0500 Subject: [PATCH 0097/2223] memblock tests: add tests for memblock_*bottom_up functions Add simple tests for memblock_set_bottom_up() and memblock_bottom_up(). 
Reviewed-by: David Hildenbrand Reviewed-by: Shaoqin Huang Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/b03701d2faeaf00f7184e4b72903de4e5e939437.1661578349.git.remckee0@gmail.com --- tools/testing/memblock/tests/basic_api.c | 45 ++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index ea79396e46111..c7490291c4851 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -1679,6 +1679,50 @@ static int memblock_free_checks(void) return 0; } +static int memblock_set_bottom_up_check(void) +{ + prefix_push("memblock_set_bottom_up"); + + memblock_set_bottom_up(false); + ASSERT_EQ(memblock.bottom_up, false); + memblock_set_bottom_up(true); + ASSERT_EQ(memblock.bottom_up, true); + + reset_memblock_attributes(); + test_pass_pop(); + + return 0; +} + +static int memblock_bottom_up_check(void) +{ + prefix_push("memblock_bottom_up"); + + memblock_set_bottom_up(false); + ASSERT_EQ(memblock_bottom_up(), memblock.bottom_up); + ASSERT_EQ(memblock_bottom_up(), false); + memblock_set_bottom_up(true); + ASSERT_EQ(memblock_bottom_up(), memblock.bottom_up); + ASSERT_EQ(memblock_bottom_up(), true); + + reset_memblock_attributes(); + test_pass_pop(); + + return 0; +} + +static int memblock_bottom_up_checks(void) +{ + test_print("Running memblock_*bottom_up tests...\n"); + + prefix_reset(); + memblock_set_bottom_up_check(); + prefix_reset(); + memblock_bottom_up_check(); + + return 0; +} + int memblock_basic_checks(void) { memblock_initialization_check(); @@ -1686,6 +1730,7 @@ int memblock_basic_checks(void) memblock_reserve_checks(); memblock_remove_checks(); memblock_free_checks(); + memblock_bottom_up_checks(); return 0; } -- GitLab From dcd45ad2ad784c35bfba8ae93c285574bc2a8a1e Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 27 Aug 2022 00:42:50 -0500 Subject: [PATCH 0098/2223] 
memblock tests: add tests for memblock_trim_memory Add tests for memblock_trim_memory() for the following scenarios: - all regions aligned - one unaligned region that is smaller than the alignment - one unaligned region that is unaligned at the base - one unaligned region that is unaligned at the end Reviewed-by: Shaoqin Huang Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/0e5f55154a3b66581e04ba3717978795cbc08a5b.1661578349.git.remckee0@gmail.com --- tools/testing/memblock/tests/basic_api.c | 223 +++++++++++++++++++++++ 1 file changed, 223 insertions(+) diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index c7490291c4851..a13a57ba0815f 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -8,6 +8,7 @@ #define FUNC_RESERVE "memblock_reserve" #define FUNC_REMOVE "memblock_remove" #define FUNC_FREE "memblock_free" +#define FUNC_TRIM "memblock_trim_memory" static int memblock_initialization_check(void) { @@ -1723,6 +1724,227 @@ static int memblock_bottom_up_checks(void) return 0; } +/* + * A test that tries to trim memory when both ends of the memory region are + * aligned. Expect that the memory will not be trimmed. Expect the counter to + * not be updated. + */ +static int memblock_trim_memory_aligned_check(void) +{ + struct memblock_region *rgn; + const phys_addr_t alignment = SMP_CACHE_BYTES; + + rgn = &memblock.memory.regions[0]; + + struct region r = { + .base = alignment, + .size = alignment * 4 + }; + + PREFIX_PUSH(); + + reset_memblock_regions(); + memblock_add(r.base, r.size); + memblock_trim_memory(alignment); + + ASSERT_EQ(rgn->base, r.base); + ASSERT_EQ(rgn->size, r.size); + + ASSERT_EQ(memblock.memory.cnt, 1); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to trim memory when there are two available regions, r1 and + * r2. 
Region r1 is aligned on both ends and region r2 is unaligned on one end + * and smaller than the alignment: + * + * alignment + * |--------| + * | +-----------------+ +------+ | + * | | r1 | | r2 | | + * +--------+-----------------+--------+------+---+ + * ^ ^ ^ ^ ^ + * |________|________|________| | + * | Unaligned address + * Aligned addresses + * + * Expect that r1 will not be trimmed and r2 will be removed. Expect the + * counter to be updated. + */ +static int memblock_trim_memory_too_small_check(void) +{ + struct memblock_region *rgn; + const phys_addr_t alignment = SMP_CACHE_BYTES; + + rgn = &memblock.memory.regions[0]; + + struct region r1 = { + .base = alignment, + .size = alignment * 2 + }; + struct region r2 = { + .base = alignment * 4, + .size = alignment - SZ_2 + }; + + PREFIX_PUSH(); + + reset_memblock_regions(); + memblock_add(r1.base, r1.size); + memblock_add(r2.base, r2.size); + memblock_trim_memory(alignment); + + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, r1.size); + + ASSERT_EQ(memblock.memory.cnt, 1); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to trim memory when there are two available regions, r1 and + * r2. Region r1 is aligned on both ends and region r2 is unaligned at the base + * and aligned at the end: + * + * Unaligned address + * | + * v + * | +-----------------+ +---------------+ | + * | | r1 | | r2 | | + * +--------+-----------------+----------+---------------+---+ + * ^ ^ ^ ^ ^ ^ + * |________|________|________|________|________| + * | + * Aligned addresses + * + * Expect that r1 will not be trimmed and r2 will be trimmed at the base. + * Expect the counter to not be updated. 
+ */ +static int memblock_trim_memory_unaligned_base_check(void) +{ + struct memblock_region *rgn1, *rgn2; + const phys_addr_t alignment = SMP_CACHE_BYTES; + phys_addr_t offset = SZ_2; + phys_addr_t new_r2_base, new_r2_size; + + rgn1 = &memblock.memory.regions[0]; + rgn2 = &memblock.memory.regions[1]; + + struct region r1 = { + .base = alignment, + .size = alignment * 2 + }; + struct region r2 = { + .base = alignment * 4 + offset, + .size = alignment * 2 - offset + }; + + PREFIX_PUSH(); + + new_r2_base = r2.base + (alignment - offset); + new_r2_size = r2.size - (alignment - offset); + + reset_memblock_regions(); + memblock_add(r1.base, r1.size); + memblock_add(r2.base, r2.size); + memblock_trim_memory(alignment); + + ASSERT_EQ(rgn1->base, r1.base); + ASSERT_EQ(rgn1->size, r1.size); + + ASSERT_EQ(rgn2->base, new_r2_base); + ASSERT_EQ(rgn2->size, new_r2_size); + + ASSERT_EQ(memblock.memory.cnt, 2); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to trim memory when there are two available regions, r1 and + * r2. Region r1 is aligned on both ends and region r2 is aligned at the base + * and unaligned at the end: + * + * Unaligned address + * | + * v + * | +-----------------+ +---------------+ | + * | | r1 | | r2 | | + * +--------+-----------------+--------+---------------+---+ + * ^ ^ ^ ^ ^ ^ + * |________|________|________|________|________| + * | + * Aligned addresses + * + * Expect that r1 will not be trimmed and r2 will be trimmed at the end. + * Expect the counter to not be updated. 
+ */ +static int memblock_trim_memory_unaligned_end_check(void) +{ + struct memblock_region *rgn1, *rgn2; + const phys_addr_t alignment = SMP_CACHE_BYTES; + phys_addr_t offset = SZ_2; + phys_addr_t new_r2_size; + + rgn1 = &memblock.memory.regions[0]; + rgn2 = &memblock.memory.regions[1]; + + struct region r1 = { + .base = alignment, + .size = alignment * 2 + }; + struct region r2 = { + .base = alignment * 4, + .size = alignment * 2 - offset + }; + + PREFIX_PUSH(); + + new_r2_size = r2.size - (alignment - offset); + + reset_memblock_regions(); + memblock_add(r1.base, r1.size); + memblock_add(r2.base, r2.size); + memblock_trim_memory(alignment); + + ASSERT_EQ(rgn1->base, r1.base); + ASSERT_EQ(rgn1->size, r1.size); + + ASSERT_EQ(rgn2->base, r2.base); + ASSERT_EQ(rgn2->size, new_r2_size); + + ASSERT_EQ(memblock.memory.cnt, 2); + + test_pass_pop(); + + return 0; +} + +static int memblock_trim_memory_checks(void) +{ + prefix_reset(); + prefix_push(FUNC_TRIM); + test_print("Running %s tests...\n", FUNC_TRIM); + + memblock_trim_memory_aligned_check(); + memblock_trim_memory_too_small_check(); + memblock_trim_memory_unaligned_base_check(); + memblock_trim_memory_unaligned_end_check(); + + prefix_pop(); + + return 0; +} + int memblock_basic_checks(void) { memblock_initialization_check(); @@ -1731,6 +1953,7 @@ int memblock_basic_checks(void) memblock_remove_checks(); memblock_free_checks(); memblock_bottom_up_checks(); + memblock_trim_memory_checks(); return 0; } -- GitLab From c5872d6a04d24b7de095fe446896c35cb7bae465 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Aug 2022 14:14:53 -0700 Subject: [PATCH 0099/2223] Input: clps711x-keypad - get rid of OF_GPIO dependency There is no such dependency in the driver, but it's implicitly used to have OF property APIs available. Replace that by device property API and get rid of OF_GPIO dependency. 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220830182839.47965-1-andriy.shevchenko@linux.intel.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/Kconfig | 2 +- drivers/input/keyboard/clps711x-keypad.c | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index a20ee693b22b5..2c6cef222f9c7 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -186,7 +186,7 @@ config KEYBOARD_QT2160 config KEYBOARD_CLPS711X tristate "CLPS711X Keypad support" - depends on OF_GPIO && (ARCH_CLPS711X || COMPILE_TEST) + depends on ARCH_CLPS711X || COMPILE_TEST select INPUT_MATRIXKMAP help Say Y here to enable the matrix keypad on the Cirrus Logic diff --git a/drivers/input/keyboard/clps711x-keypad.c b/drivers/input/keyboard/clps711x-keypad.c index 939c88655fc02..4c1a3e611edd7 100644 --- a/drivers/input/keyboard/clps711x-keypad.c +++ b/drivers/input/keyboard/clps711x-keypad.c @@ -6,9 +6,11 @@ */ #include +#include #include -#include +#include #include +#include #include #include #include @@ -86,7 +88,6 @@ static int clps711x_keypad_probe(struct platform_device *pdev) { struct clps711x_keypad_data *priv; struct device *dev = &pdev->dev; - struct device_node *np = dev->of_node; struct input_dev *input; u32 poll_interval; int i, err; @@ -95,11 +96,11 @@ static int clps711x_keypad_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - priv->syscon = syscon_regmap_lookup_by_phandle(np, "syscon"); + priv->syscon = syscon_regmap_lookup_by_phandle(dev->of_node, "syscon"); if (IS_ERR(priv->syscon)) return PTR_ERR(priv->syscon); - priv->row_count = of_gpio_named_count(np, "row-gpios"); + priv->row_count = gpiod_count(dev, "row"); if (priv->row_count < 1) return -EINVAL; @@ -119,7 +120,7 @@ static int clps711x_keypad_probe(struct platform_device *pdev) return PTR_ERR(data->desc); } - err = of_property_read_u32(np, "poll-interval", 
&poll_interval); + err = device_property_read_u32(dev, "poll-interval", &poll_interval); if (err) return err; @@ -143,7 +144,7 @@ static int clps711x_keypad_probe(struct platform_device *pdev) return err; input_set_capability(input, EV_MSC, MSC_SCAN); - if (of_property_read_bool(np, "autorepeat")) + if (device_property_read_bool(dev, "autorepeat")) __set_bit(EV_REP, input->evbit); /* Set all columns to low */ -- GitLab From f8f7f47d576f7f5d44ef9237f356bd6d42002614 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Aug 2022 14:15:18 -0700 Subject: [PATCH 0100/2223] Input: matrix_keypad - replace of_gpio_named_count() by gpiod_count() As a preparation to unexport of_gpio_named_count(), convert the driver to use gpiod_count() instead. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220830183552.50695-1-andriy.shevchenko@linux.intel.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/matrix_keypad.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/input/keyboard/matrix_keypad.c b/drivers/input/keyboard/matrix_keypad.c index 30924b57058f2..63f078f2bc4ae 100644 --- a/drivers/input/keyboard/matrix_keypad.c +++ b/drivers/input/keyboard/matrix_keypad.c @@ -416,9 +416,9 @@ matrix_keypad_parse_dt(struct device *dev) return ERR_PTR(-ENOMEM); } - pdata->num_row_gpios = nrow = of_gpio_named_count(np, "row-gpios"); - pdata->num_col_gpios = ncol = of_gpio_named_count(np, "col-gpios"); - if (nrow <= 0 || ncol <= 0) { + pdata->num_row_gpios = nrow = gpiod_count(dev, "row"); + pdata->num_col_gpios = ncol = gpiod_count(dev, "col"); + if (nrow < 0 || ncol < 0) { dev_err(dev, "number of keypad rows/columns not specified\n"); return ERR_PTR(-EINVAL); } -- GitLab From 9d2b2e83ef277b9c7b8852e8717140daa373ccf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 30 Aug 2022 20:54:10 -0700 Subject: [PATCH 0101/2223] Input: adp5588-keys - support gpi key events as 'gpio keys' MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change replaces the support for GPIs as key event generators. Instead of reporting the events directly, we add a gpio based irqchip so that these events can be consumed by keys defined in the gpio-keys driver (as its goal is indeed for keys on GPIOs capable of generating interrupts). With this, the gpio-adp5588 driver can also be dropped. The basic idea is that all the pins that are not being used as part of the keymap matrix can be possibly requested as GPIOs by gpio-keys (it's also fine to use these pins as plain interrupts though that's not really the point). Since the gpiochip now also has irqchip capabilities, we should only remove it after we free the device interrupt (otherwise we could, in theory, be handling GPI interrupts while the gpiochip is concurrently removed). Thus the call 'adp5588_gpio_add()' is moved and since the setup phase also needs to come before making the gpios visible, we also need to move 'adp5588_setup()'. While at it, always select GPIOLIB so that we don't need to use #ifdef guards. Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220829131553.690063-2-nuno.sa@analog.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/Kconfig | 2 + drivers/input/keyboard/adp5588-keys.c | 274 +++++++++++++------------- include/linux/platform_data/adp5588.h | 2 - 3 files changed, 144 insertions(+), 134 deletions(-) diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index 2c6cef222f9c7..e445e760a41a2 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -40,6 +40,8 @@ config KEYBOARD_ADP5520 config KEYBOARD_ADP5588 tristate "ADP5588/87 I2C QWERTY Keypad and IO Expander" depends on I2C + select GPIOLIB + select GPIOLIB_IRQCHIP help Say Y here if you want to use a ADP5588/87 attached to your system I2C bus. 
diff --git a/drivers/input/keyboard/adp5588-keys.c b/drivers/input/keyboard/adp5588-keys.c index e2719737360a1..f5f7ddfe68beb 100644 --- a/drivers/input/keyboard/adp5588-keys.c +++ b/drivers/input/keyboard/adp5588-keys.c @@ -40,21 +40,21 @@ #define WA_DELAYED_READOUT_REVID(rev) ((rev) < 4) #define WA_DELAYED_READOUT_TIME 25 +#define ADP5588_INVALID_HWIRQ (~0UL) + struct adp5588_kpad { struct i2c_client *client; struct input_dev *input; ktime_t irq_time; unsigned long delay; unsigned short keycode[ADP5588_KEYMAPSIZE]; - const struct adp5588_gpi_map *gpimap; - unsigned short gpimapsize; -#ifdef CONFIG_GPIOLIB unsigned char gpiomap[ADP5588_MAXGPIO]; struct gpio_chip gc; struct mutex gpio_lock; /* Protect cached dir, dat_out */ u8 dat_out[3]; u8 dir[3]; -#endif + u8 int_en[3]; + u8 irq_mask[3]; }; static int adp5588_read(struct i2c_client *client, u8 reg) @@ -72,7 +72,6 @@ static int adp5588_write(struct i2c_client *client, u8 reg, u8 val) return i2c_smbus_write_byte_data(client, reg, val); } -#ifdef CONFIG_GPIOLIB static int adp5588_gpio_get_value(struct gpio_chip *chip, unsigned off) { struct adp5588_kpad *kpad = gpiochip_get_data(chip); @@ -171,9 +170,6 @@ static int adp5588_build_gpiomap(struct adp5588_kpad *kpad, for (i = 0; i < pdata->cols; i++) pin_used[i + GPI_PIN_COL_BASE - GPI_PIN_BASE] = true; - for (i = 0; i < kpad->gpimapsize; i++) - pin_used[kpad->gpimap[i].pin - GPI_PIN_BASE] = true; - for (i = 0; i < ADP5588_MAXGPIO; i++) if (!pin_used[i]) kpad->gpiomap[n_unused++] = i; @@ -196,11 +192,79 @@ static void adp5588_gpio_do_teardown(void *_kpad) dev_warn(&kpad->client->dev, "teardown failed %d\n", error); } +static void adp5588_irq_bus_lock(struct irq_data *d) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct adp5588_kpad *kpad = gpiochip_get_data(gc); + + mutex_lock(&kpad->gpio_lock); +} + +static void adp5588_irq_bus_sync_unlock(struct irq_data *d) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct adp5588_kpad *kpad = 
gpiochip_get_data(gc); + int i; + + for (i = 0; i <= ADP5588_BANK(ADP5588_MAXGPIO); i++) { + if (kpad->int_en[i] ^ kpad->irq_mask[i]) { + kpad->int_en[i] = kpad->irq_mask[i]; + adp5588_write(kpad->client, GPI_EM1 + i, kpad->int_en[i]); + } + } + + mutex_unlock(&kpad->gpio_lock); +} + +static void adp5588_irq_mask(struct irq_data *d) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct adp5588_kpad *kpad = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); + unsigned long real_irq = kpad->gpiomap[hwirq]; + + kpad->irq_mask[ADP5588_BANK(real_irq)] &= ~ADP5588_BIT(real_irq); + gpiochip_disable_irq(gc, hwirq); +} + +static void adp5588_irq_unmask(struct irq_data *d) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct adp5588_kpad *kpad = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); + unsigned long real_irq = kpad->gpiomap[hwirq]; + + gpiochip_enable_irq(gc, hwirq); + kpad->irq_mask[ADP5588_BANK(real_irq)] |= ADP5588_BIT(real_irq); +} + +static int adp5588_irq_set_type(struct irq_data *d, unsigned int type) +{ + if (!(type & IRQ_TYPE_EDGE_BOTH)) + return -EINVAL; + + irq_set_handler_locked(d, handle_edge_irq); + + return 0; +} + +static const struct irq_chip adp5588_irq_chip = { + .name = "adp5588", + .irq_mask = adp5588_irq_mask, + .irq_unmask = adp5588_irq_unmask, + .irq_bus_lock = adp5588_irq_bus_lock, + .irq_bus_sync_unlock = adp5588_irq_bus_sync_unlock, + .irq_set_type = adp5588_irq_set_type, + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; + static int adp5588_gpio_add(struct adp5588_kpad *kpad) { struct device *dev = &kpad->client->dev; const struct adp5588_kpad_platform_data *pdata = dev_get_platdata(dev); const struct adp5588_gpio_platform_data *gpio_data = pdata->gpio_data; + struct gpio_irq_chip *girq; int i, error; if (!gpio_data) @@ -212,6 +276,7 @@ static int adp5588_gpio_add(struct adp5588_kpad *kpad) return 0; } + kpad->gc.parent = 
&kpad->client->dev; kpad->gc.direction_input = adp5588_gpio_direction_input; kpad->gc.direction_output = adp5588_gpio_direction_output; kpad->gc.get = adp5588_gpio_get_value; @@ -223,6 +288,11 @@ static int adp5588_gpio_add(struct adp5588_kpad *kpad) kpad->gc.owner = THIS_MODULE; kpad->gc.names = gpio_data->names; + girq = &kpad->gc.irq; + gpio_irq_chip_set_chip(girq, &adp5588_irq_chip); + girq->handler = handle_bad_irq; + girq->threaded = true; + mutex_init(&kpad->gpio_lock); error = devm_gpiochip_add_data(dev, &kpad->gc, kpad); @@ -255,35 +325,73 @@ static int adp5588_gpio_add(struct adp5588_kpad *kpad) return 0; } -#else -static inline int adp5588_gpio_add(struct adp5588_kpad *kpad) +static unsigned long adp5588_gpiomap_get_hwirq(struct device *dev, + const u8 *map, unsigned int gpio, + unsigned int ngpios) { - return 0; + unsigned int hwirq; + + for (hwirq = 0; hwirq < ngpios; hwirq++) + if (map[hwirq] == gpio) + return hwirq; + + /* should never happen */ + dev_warn_ratelimited(dev, "could not find the hwirq for gpio(%u)\n", gpio); + + return ADP5588_INVALID_HWIRQ; +} + +static void adp5588_gpio_irq_handle(struct adp5588_kpad *kpad, int key_val, + int key_press) +{ + unsigned int irq, gpio = key_val - GPI_PIN_BASE, irq_type; + struct i2c_client *client = kpad->client; + struct irq_data *irqd; + unsigned long hwirq; + + hwirq = adp5588_gpiomap_get_hwirq(&client->dev, kpad->gpiomap, + gpio, kpad->gc.ngpio); + if (hwirq == ADP5588_INVALID_HWIRQ) { + dev_err(&client->dev, "Could not get hwirq for key(%u)\n", key_val); + return; + } + + irq = irq_find_mapping(kpad->gc.irq.domain, hwirq); + if (!irq) + return; + + irqd = irq_get_irq_data(irq); + if (!irqd) { + dev_err(&client->dev, "Could not get irq(%u) data\n", irq); + return; + } + + irq_type = irqd_get_trigger_type(irqd); + + /* + * Default is active low which means key_press is asserted on + * the falling edge. 
+ */ + if ((irq_type & IRQ_TYPE_EDGE_RISING && !key_press) || + (irq_type & IRQ_TYPE_EDGE_FALLING && key_press)) + handle_nested_irq(irq); } -#endif static void adp5588_report_events(struct adp5588_kpad *kpad, int ev_cnt) { - int i, j; + int i; for (i = 0; i < ev_cnt; i++) { int key = adp5588_read(kpad->client, Key_EVENTA + i); int key_val = key & KEY_EV_MASK; + int key_press = key & KEY_EV_PRESSED; - if (key_val >= GPI_PIN_BASE && key_val <= GPI_PIN_END) { - for (j = 0; j < kpad->gpimapsize; j++) { - if (key_val == kpad->gpimap[j].pin) { - input_report_switch(kpad->input, - kpad->gpimap[j].sw_evt, - key & KEY_EV_PRESSED); - break; - } - } - } else { + if (key_val >= GPI_PIN_BASE && key_val <= GPI_PIN_END) + /* gpio line used as IRQ source */ + adp5588_gpio_irq_handle(kpad, key_val, key_press); + else input_report_key(kpad->input, - kpad->keycode[key_val - 1], - key & KEY_EV_PRESSED); - } + kpad->keycode[key_val - 1], key_press); } } @@ -341,7 +449,6 @@ static int adp5588_setup(struct i2c_client *client) dev_get_platdata(&client->dev); const struct adp5588_gpio_platform_data *gpio_data = pdata->gpio_data; int i, ret; - unsigned char evt_mode1 = 0, evt_mode2 = 0, evt_mode3 = 0; ret = adp5588_write(client, KP_GPIO1, KP_SEL(pdata->rows)); ret |= adp5588_write(client, KP_GPIO2, KP_SEL(pdata->cols) & 0xFF); @@ -356,23 +463,6 @@ static int adp5588_setup(struct i2c_client *client) for (i = 0; i < KEYP_MAX_EVENT; i++) ret |= adp5588_read(client, Key_EVENTA); - for (i = 0; i < pdata->gpimapsize; i++) { - unsigned short pin = pdata->gpimap[i].pin; - - if (pin <= GPI_PIN_ROW_END) { - evt_mode1 |= (1 << (pin - GPI_PIN_ROW_BASE)); - } else { - evt_mode2 |= ((1 << (pin - GPI_PIN_COL_BASE)) & 0xFF); - evt_mode3 |= ((1 << (pin - GPI_PIN_COL_BASE)) >> 8); - } - } - - if (pdata->gpimapsize) { - ret |= adp5588_write(client, GPI_EM1, evt_mode1); - ret |= adp5588_write(client, GPI_EM2, evt_mode2); - ret |= adp5588_write(client, GPI_EM3, evt_mode3); - } - if (gpio_data) { for (i = 0; i 
<= ADP5588_BANK(ADP5588_MAXGPIO); i++) { int pull_mask = gpio_data->pullup_dis_mask; @@ -399,44 +489,6 @@ static int adp5588_setup(struct i2c_client *client) return 0; } -static void adp5588_report_switch_state(struct adp5588_kpad *kpad) -{ - int gpi_stat1 = adp5588_read(kpad->client, GPIO_DAT_STAT1); - int gpi_stat2 = adp5588_read(kpad->client, GPIO_DAT_STAT2); - int gpi_stat3 = adp5588_read(kpad->client, GPIO_DAT_STAT3); - int gpi_stat_tmp, pin_loc; - int i; - - for (i = 0; i < kpad->gpimapsize; i++) { - unsigned short pin = kpad->gpimap[i].pin; - - if (pin <= GPI_PIN_ROW_END) { - gpi_stat_tmp = gpi_stat1; - pin_loc = pin - GPI_PIN_ROW_BASE; - } else if ((pin - GPI_PIN_COL_BASE) < 8) { - gpi_stat_tmp = gpi_stat2; - pin_loc = pin - GPI_PIN_COL_BASE; - } else { - gpi_stat_tmp = gpi_stat3; - pin_loc = pin - GPI_PIN_COL_BASE - 8; - } - - if (gpi_stat_tmp < 0) { - dev_err(&kpad->client->dev, - "Can't read GPIO_DAT_STAT switch %d default to OFF\n", - pin); - gpi_stat_tmp = 0; - } - - input_report_switch(kpad->input, - kpad->gpimap[i].sw_evt, - !(gpi_stat_tmp & (1 << pin_loc))); - } - - input_sync(kpad->input); -} - - static int adp5588_probe(struct i2c_client *client, const struct i2c_device_id *id) { @@ -469,37 +521,6 @@ static int adp5588_probe(struct i2c_client *client, return -EINVAL; } - if (!pdata->gpimap && pdata->gpimapsize) { - dev_err(&client->dev, "invalid gpimap from pdata\n"); - return -EINVAL; - } - - if (pdata->gpimapsize > ADP5588_GPIMAPSIZE_MAX) { - dev_err(&client->dev, "invalid gpimapsize\n"); - return -EINVAL; - } - - for (i = 0; i < pdata->gpimapsize; i++) { - unsigned short pin = pdata->gpimap[i].pin; - - if (pin < GPI_PIN_BASE || pin > GPI_PIN_END) { - dev_err(&client->dev, "invalid gpi pin data\n"); - return -EINVAL; - } - - if (pin <= GPI_PIN_ROW_END) { - if (pin - GPI_PIN_ROW_BASE + 1 <= pdata->rows) { - dev_err(&client->dev, "invalid gpi row data\n"); - return -EINVAL; - } - } else { - if (pin - GPI_PIN_COL_BASE + 1 <= pdata->cols) { - 
dev_err(&client->dev, "invalid gpi col data\n"); - return -EINVAL; - } - } - } - if (!client->irq) { dev_err(&client->dev, "no IRQ?\n"); return -EINVAL; @@ -541,9 +562,6 @@ static int adp5588_probe(struct i2c_client *client, memcpy(kpad->keycode, pdata->keymap, pdata->keymapsize * input->keycodesize); - kpad->gpimap = pdata->gpimap; - kpad->gpimapsize = pdata->gpimapsize; - /* setup input device */ __set_bit(EV_KEY, input->evbit); @@ -555,11 +573,6 @@ static int adp5588_probe(struct i2c_client *client, __set_bit(kpad->keycode[i], input->keybit); __clear_bit(KEY_RESERVED, input->keybit); - if (kpad->gpimapsize) - __set_bit(EV_SW, input->evbit); - for (i = 0; i < kpad->gpimapsize; i++) - __set_bit(kpad->gpimap[i].sw_evt, input->swbit); - error = input_register_device(input); if (error) { dev_err(&client->dev, "unable to register input device: %d\n", @@ -567,6 +580,14 @@ static int adp5588_probe(struct i2c_client *client, return error; } + error = adp5588_setup(client); + if (error) + return error; + + error = adp5588_gpio_add(kpad); + if (error) + return error; + error = devm_request_threaded_irq(&client->dev, client->irq, adp5588_hard_irq, adp5588_thread_irq, IRQF_TRIGGER_FALLING | IRQF_ONESHOT, @@ -577,17 +598,6 @@ static int adp5588_probe(struct i2c_client *client, return error; } - error = adp5588_setup(client); - if (error) - return error; - - if (kpad->gpimapsize) - adp5588_report_switch_state(kpad); - - error = adp5588_gpio_add(kpad); - if (error) - return error; - dev_info(&client->dev, "Rev.%d keypad, irq %d\n", revid, client->irq); return 0; } diff --git a/include/linux/platform_data/adp5588.h b/include/linux/platform_data/adp5588.h index 6d3f7d911a92e..82170ec8c266c 100644 --- a/include/linux/platform_data/adp5588.h +++ b/include/linux/platform_data/adp5588.h @@ -147,8 +147,6 @@ struct adp5588_kpad_platform_data { unsigned en_keylock:1; /* Enable Key Lock feature */ unsigned short unlock_key1; /* Unlock Key 1 */ unsigned short unlock_key2; /* Unlock Key 2 
*/ - const struct adp5588_gpi_map *gpimap; - unsigned short gpimapsize; const struct adp5588_gpio_platform_data *gpio_data; }; -- GitLab From 5ddc896088b02bcf07ec4f16ae75db43b35b0bbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 30 Aug 2022 20:55:47 -0700 Subject: [PATCH 0102/2223] gpio: gpio-adp5588: drop the driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With commit 9d2b2e83ef27 ("Input: adp5588-keys - support gpi key events as 'gpio keys'") the irqchip functionality is directly supported in the input driver as the main goal of these pins is to be used as gpio keys. Hence, this driver can be removed. Signed-off-by: Nuno Sá Acked-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220829131553.690063-3-nuno.sa@analog.com Signed-off-by: Dmitry Torokhov --- MAINTAINERS | 1 - drivers/gpio/Kconfig | 14 -- drivers/gpio/Makefile | 1 - drivers/gpio/gpio-adp5588.c | 446 ------------------------------------ 4 files changed, 462 deletions(-) delete mode 100644 drivers/gpio/gpio-adp5588.c diff --git a/MAINTAINERS b/MAINTAINERS index 711bcd4f6269b..8404c18e6bcf1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -556,7 +556,6 @@ M: Michael Hennerich S: Supported W: http://wiki.analog.com/ADP5588 W: https://ez.analog.com/linux-software-drivers -F: drivers/gpio/gpio-adp5588.c F: drivers/input/keyboard/adp5588-keys.c ADP8860 BACKLIGHT DRIVER (ADP8860/ADP8861/ADP8863) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 0642f579196f2..3055ed2e115ab 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -985,20 +985,6 @@ endmenu menu "I2C GPIO expanders" depends on I2C -config GPIO_ADP5588 - tristate "ADP5588 I2C GPIO expander" - help - This option enables support for 18 GPIOs found - on Analog Devices ADP5588 GPIO Expanders. 
- -config GPIO_ADP5588_IRQ - bool "Interrupt controller support for ADP5588" - depends on GPIO_ADP5588=y - select GPIOLIB_IRQCHIP - help - Say yes here to enable the adp5588 to be used as an interrupt - controller. It requires the driver to be built in the kernel. - config GPIO_ADNP tristate "Avionic Design N-bit GPIO expander" depends on OF_GPIO diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index a0985d30f51bb..5b890a695f829 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -25,7 +25,6 @@ obj-$(CONFIG_GPIO_74X164) += gpio-74x164.o obj-$(CONFIG_GPIO_74XX_MMIO) += gpio-74xx-mmio.o obj-$(CONFIG_GPIO_ADNP) += gpio-adnp.o obj-$(CONFIG_GPIO_ADP5520) += gpio-adp5520.o -obj-$(CONFIG_GPIO_ADP5588) += gpio-adp5588.o obj-$(CONFIG_GPIO_AGGREGATOR) += gpio-aggregator.o obj-$(CONFIG_GPIO_ALTERA_A10SR) += gpio-altera-a10sr.o obj-$(CONFIG_GPIO_ALTERA) += gpio-altera.o diff --git a/drivers/gpio/gpio-adp5588.c b/drivers/gpio/gpio-adp5588.c deleted file mode 100644 index 9b562dbbd7332..0000000000000 --- a/drivers/gpio/gpio-adp5588.c +++ /dev/null @@ -1,446 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * GPIO Chip driver for Analog Devices - * ADP5588/ADP5587 I/O Expander and QWERTY Keypad Controller - * - * Copyright 2009-2010 Analog Devices Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * Early pre 4.0 Silicon required to delay readout by at least 25ms, - * since the Event Counter Register updated 25ms after the interrupt - * asserted. 
- */ -#define WA_DELAYED_READOUT_REVID(rev) ((rev) < 4) - -struct adp5588_gpio { - struct i2c_client *client; - struct gpio_chip gpio_chip; - struct mutex lock; /* protect cached dir, dat_out */ - /* protect serialized access to the interrupt controller bus */ - struct mutex irq_lock; - uint8_t dat_out[3]; - uint8_t dir[3]; - uint8_t int_lvl_low[3]; - uint8_t int_lvl_high[3]; - uint8_t int_en[3]; - uint8_t irq_mask[3]; - uint8_t int_input_en[3]; -}; - -static int adp5588_gpio_read(struct i2c_client *client, u8 reg) -{ - int ret = i2c_smbus_read_byte_data(client, reg); - - if (ret < 0) - dev_err(&client->dev, "Read Error\n"); - - return ret; -} - -static int adp5588_gpio_write(struct i2c_client *client, u8 reg, u8 val) -{ - int ret = i2c_smbus_write_byte_data(client, reg, val); - - if (ret < 0) - dev_err(&client->dev, "Write Error\n"); - - return ret; -} - -static int adp5588_gpio_get_value(struct gpio_chip *chip, unsigned off) -{ - struct adp5588_gpio *dev = gpiochip_get_data(chip); - unsigned bank = ADP5588_BANK(off); - unsigned bit = ADP5588_BIT(off); - int val; - - mutex_lock(&dev->lock); - - if (dev->dir[bank] & bit) - val = dev->dat_out[bank]; - else - val = adp5588_gpio_read(dev->client, GPIO_DAT_STAT1 + bank); - - mutex_unlock(&dev->lock); - - return !!(val & bit); -} - -static void adp5588_gpio_set_value(struct gpio_chip *chip, - unsigned off, int val) -{ - unsigned bank, bit; - struct adp5588_gpio *dev = gpiochip_get_data(chip); - - bank = ADP5588_BANK(off); - bit = ADP5588_BIT(off); - - mutex_lock(&dev->lock); - if (val) - dev->dat_out[bank] |= bit; - else - dev->dat_out[bank] &= ~bit; - - adp5588_gpio_write(dev->client, GPIO_DAT_OUT1 + bank, - dev->dat_out[bank]); - mutex_unlock(&dev->lock); -} - -static int adp5588_gpio_direction_input(struct gpio_chip *chip, unsigned off) -{ - int ret; - unsigned bank; - struct adp5588_gpio *dev = gpiochip_get_data(chip); - - bank = ADP5588_BANK(off); - - mutex_lock(&dev->lock); - dev->dir[bank] &= ~ADP5588_BIT(off); - 
ret = adp5588_gpio_write(dev->client, GPIO_DIR1 + bank, dev->dir[bank]); - mutex_unlock(&dev->lock); - - return ret; -} - -static int adp5588_gpio_direction_output(struct gpio_chip *chip, - unsigned off, int val) -{ - int ret; - unsigned bank, bit; - struct adp5588_gpio *dev = gpiochip_get_data(chip); - - bank = ADP5588_BANK(off); - bit = ADP5588_BIT(off); - - mutex_lock(&dev->lock); - dev->dir[bank] |= bit; - - if (val) - dev->dat_out[bank] |= bit; - else - dev->dat_out[bank] &= ~bit; - - ret = adp5588_gpio_write(dev->client, GPIO_DAT_OUT1 + bank, - dev->dat_out[bank]); - ret |= adp5588_gpio_write(dev->client, GPIO_DIR1 + bank, - dev->dir[bank]); - mutex_unlock(&dev->lock); - - return ret; -} - -#ifdef CONFIG_GPIO_ADP5588_IRQ - -static void adp5588_irq_bus_lock(struct irq_data *d) -{ - struct gpio_chip *gc = irq_data_get_irq_chip_data(d); - struct adp5588_gpio *dev = gpiochip_get_data(gc); - - mutex_lock(&dev->irq_lock); -} - - /* - * genirq core code can issue chip->mask/unmask from atomic context. - * This doesn't work for slow busses where an access needs to sleep. - * bus_sync_unlock() is therefore called outside the atomic context, - * syncs the current irq mask state with the slow external controller - * and unlocks the bus. 
- */ - -static void adp5588_irq_bus_sync_unlock(struct irq_data *d) -{ - struct gpio_chip *gc = irq_data_get_irq_chip_data(d); - struct adp5588_gpio *dev = gpiochip_get_data(gc); - int i; - - for (i = 0; i <= ADP5588_BANK(ADP5588_MAXGPIO); i++) { - if (dev->int_input_en[i]) { - mutex_lock(&dev->lock); - dev->dir[i] &= ~dev->int_input_en[i]; - dev->int_input_en[i] = 0; - adp5588_gpio_write(dev->client, GPIO_DIR1 + i, - dev->dir[i]); - mutex_unlock(&dev->lock); - } - - if (dev->int_en[i] ^ dev->irq_mask[i]) { - dev->int_en[i] = dev->irq_mask[i]; - adp5588_gpio_write(dev->client, GPI_EM1 + i, - dev->int_en[i]); - } - } - - mutex_unlock(&dev->irq_lock); -} - -static void adp5588_irq_mask(struct irq_data *d) -{ - struct gpio_chip *gc = irq_data_get_irq_chip_data(d); - struct adp5588_gpio *dev = gpiochip_get_data(gc); - - dev->irq_mask[ADP5588_BANK(d->hwirq)] &= ~ADP5588_BIT(d->hwirq); -} - -static void adp5588_irq_unmask(struct irq_data *d) -{ - struct gpio_chip *gc = irq_data_get_irq_chip_data(d); - struct adp5588_gpio *dev = gpiochip_get_data(gc); - - dev->irq_mask[ADP5588_BANK(d->hwirq)] |= ADP5588_BIT(d->hwirq); -} - -static int adp5588_irq_set_type(struct irq_data *d, unsigned int type) -{ - struct gpio_chip *gc = irq_data_get_irq_chip_data(d); - struct adp5588_gpio *dev = gpiochip_get_data(gc); - uint16_t gpio = d->hwirq; - unsigned bank, bit; - - bank = ADP5588_BANK(gpio); - bit = ADP5588_BIT(gpio); - - dev->int_lvl_low[bank] &= ~bit; - dev->int_lvl_high[bank] &= ~bit; - - if (type & IRQ_TYPE_EDGE_BOTH || type & IRQ_TYPE_LEVEL_HIGH) - dev->int_lvl_high[bank] |= bit; - - if (type & IRQ_TYPE_EDGE_BOTH || type & IRQ_TYPE_LEVEL_LOW) - dev->int_lvl_low[bank] |= bit; - - dev->int_input_en[bank] |= bit; - - return 0; -} - -static struct irq_chip adp5588_irq_chip = { - .name = "adp5588", - .irq_mask = adp5588_irq_mask, - .irq_unmask = adp5588_irq_unmask, - .irq_bus_lock = adp5588_irq_bus_lock, - .irq_bus_sync_unlock = adp5588_irq_bus_sync_unlock, - .irq_set_type = 
adp5588_irq_set_type, -}; - -static irqreturn_t adp5588_irq_handler(int irq, void *devid) -{ - struct adp5588_gpio *dev = devid; - int status = adp5588_gpio_read(dev->client, INT_STAT); - - if (status & ADP5588_KE_INT) { - int ev_cnt = adp5588_gpio_read(dev->client, KEY_LCK_EC_STAT); - - if (ev_cnt > 0) { - int i; - - for (i = 0; i < (ev_cnt & ADP5588_KEC); i++) { - int key = adp5588_gpio_read(dev->client, - Key_EVENTA + i); - /* GPIN events begin at 97, - * bit 7 indicates logic level - */ - int gpio = (key & 0x7f) - 97; - int lvl = key & (1 << 7); - int bank = ADP5588_BANK(gpio); - int bit = ADP5588_BIT(gpio); - - if ((lvl && dev->int_lvl_high[bank] & bit) || - (!lvl && dev->int_lvl_low[bank] & bit)) - handle_nested_irq(irq_find_mapping( - dev->gpio_chip.irq.domain, gpio)); - } - } - } - - adp5588_gpio_write(dev->client, INT_STAT, status); /* Status is W1C */ - - return IRQ_HANDLED; -} - - -static int adp5588_irq_init_hw(struct gpio_chip *gc) -{ - struct adp5588_gpio *dev = gpiochip_get_data(gc); - /* Enable IRQs after registering chip */ - adp5588_gpio_write(dev->client, CFG, - ADP5588_AUTO_INC | ADP5588_INT_CFG | ADP5588_KE_IEN); - - return 0; -} - -static int adp5588_irq_setup(struct adp5588_gpio *dev) -{ - struct i2c_client *client = dev->client; - int ret; - struct adp5588_gpio_platform_data *pdata = - dev_get_platdata(&client->dev); - struct gpio_irq_chip *girq; - - adp5588_gpio_write(client, CFG, ADP5588_AUTO_INC); - adp5588_gpio_write(client, INT_STAT, -1); /* status is W1C */ - - mutex_init(&dev->irq_lock); - - ret = devm_request_threaded_irq(&client->dev, client->irq, - NULL, adp5588_irq_handler, IRQF_ONESHOT - | IRQF_TRIGGER_FALLING | IRQF_SHARED, - dev_name(&client->dev), dev); - if (ret) { - dev_err(&client->dev, "failed to request irq %d\n", - client->irq); - return ret; - } - - /* This will be registered in the call to devm_gpiochip_add_data() */ - girq = &dev->gpio_chip.irq; - girq->chip = &adp5588_irq_chip; - /* This will let us handle the parent 
IRQ in the driver */ - girq->parent_handler = NULL; - girq->num_parents = 0; - girq->parents = NULL; - girq->first = pdata ? pdata->irq_base : 0; - girq->default_type = IRQ_TYPE_NONE; - girq->handler = handle_simple_irq; - girq->init_hw = adp5588_irq_init_hw; - girq->threaded = true; - - return 0; -} - -#else -static int adp5588_irq_setup(struct adp5588_gpio *dev) -{ - struct i2c_client *client = dev->client; - dev_warn(&client->dev, "interrupt support not compiled in\n"); - - return 0; -} - -#endif /* CONFIG_GPIO_ADP5588_IRQ */ - -static int adp5588_gpio_probe(struct i2c_client *client) -{ - struct adp5588_gpio_platform_data *pdata = - dev_get_platdata(&client->dev); - struct adp5588_gpio *dev; - struct gpio_chip *gc; - int ret, i, revid; - unsigned int pullup_dis_mask = 0; - - if (!i2c_check_functionality(client->adapter, - I2C_FUNC_SMBUS_BYTE_DATA)) { - dev_err(&client->dev, "SMBUS Byte Data not Supported\n"); - return -EIO; - } - - dev = devm_kzalloc(&client->dev, sizeof(*dev), GFP_KERNEL); - if (!dev) - return -ENOMEM; - - dev->client = client; - - gc = &dev->gpio_chip; - gc->direction_input = adp5588_gpio_direction_input; - gc->direction_output = adp5588_gpio_direction_output; - gc->get = adp5588_gpio_get_value; - gc->set = adp5588_gpio_set_value; - gc->can_sleep = true; - gc->base = -1; - gc->parent = &client->dev; - - if (pdata) { - gc->base = pdata->gpio_start; - gc->names = pdata->names; - pullup_dis_mask = pdata->pullup_dis_mask; - } - - gc->ngpio = ADP5588_MAXGPIO; - gc->label = client->name; - gc->owner = THIS_MODULE; - - mutex_init(&dev->lock); - - ret = adp5588_gpio_read(dev->client, DEV_ID); - if (ret < 0) - return ret; - - revid = ret & ADP5588_DEVICE_ID_MASK; - - for (i = 0, ret = 0; i <= ADP5588_BANK(ADP5588_MAXGPIO); i++) { - dev->dat_out[i] = adp5588_gpio_read(client, GPIO_DAT_OUT1 + i); - dev->dir[i] = adp5588_gpio_read(client, GPIO_DIR1 + i); - ret |= adp5588_gpio_write(client, KP_GPIO1 + i, 0); - ret |= adp5588_gpio_write(client, GPIO_PULL1 
+ i, - (pullup_dis_mask >> (8 * i)) & 0xFF); - ret |= adp5588_gpio_write(client, GPIO_INT_EN1 + i, 0); - if (ret) - return ret; - } - - if (client->irq) { - if (WA_DELAYED_READOUT_REVID(revid)) { - dev_warn(&client->dev, "GPIO int not supported\n"); - } else { - ret = adp5588_irq_setup(dev); - if (ret) - return ret; - } - } - - ret = devm_gpiochip_add_data(&client->dev, &dev->gpio_chip, dev); - if (ret) - return ret; - - i2c_set_clientdata(client, dev); - - return 0; -} - -static void adp5588_gpio_remove(struct i2c_client *client) -{ - struct adp5588_gpio *dev = i2c_get_clientdata(client); - - if (dev->client->irq) - free_irq(dev->client->irq, dev); -} - -static const struct i2c_device_id adp5588_gpio_id[] = { - { "adp5588-gpio" }, - {} -}; -MODULE_DEVICE_TABLE(i2c, adp5588_gpio_id); - -static const struct of_device_id adp5588_gpio_of_id[] = { - { .compatible = "adi,adp5588-gpio" }, - {} -}; -MODULE_DEVICE_TABLE(of, adp5588_gpio_of_id); - -static struct i2c_driver adp5588_gpio_driver = { - .driver = { - .name = "adp5588-gpio", - .of_match_table = adp5588_gpio_of_id, - }, - .probe_new = adp5588_gpio_probe, - .remove = adp5588_gpio_remove, - .id_table = adp5588_gpio_id, -}; - -module_i2c_driver(adp5588_gpio_driver); - -MODULE_AUTHOR("Michael Hennerich "); -MODULE_DESCRIPTION("GPIO ADP5588 Driver"); -MODULE_LICENSE("GPL"); -- GitLab From e960309ce31865713051854d38740575a6bc0a60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 30 Aug 2022 20:58:26 -0700 Subject: [PATCH 0103/2223] Input: adp5588-keys - bail out on returned error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't continue in code paths after some error is found. It makes no sense to do any other device configuration if a previous one failed. 
Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220829131553.690063-4-nuno.sa@analog.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/adp5588-keys.c | 56 ++++++++++++++++++--------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/drivers/input/keyboard/adp5588-keys.c b/drivers/input/keyboard/adp5588-keys.c index f5f7ddfe68beb..2452ea4128b3f 100644 --- a/drivers/input/keyboard/adp5588-keys.c +++ b/drivers/input/keyboard/adp5588-keys.c @@ -147,9 +147,13 @@ static int adp5588_gpio_direction_output(struct gpio_chip *chip, ret = adp5588_write(kpad->client, GPIO_DAT_OUT1 + bank, kpad->dat_out[bank]); - ret |= adp5588_write(kpad->client, GPIO_DIR1 + bank, + if (ret) + goto out_unlock; + + ret = adp5588_write(kpad->client, GPIO_DIR1 + bank, kpad->dir[bank]); +out_unlock: mutex_unlock(&kpad->gpio_lock); return ret; @@ -451,42 +455,58 @@ static int adp5588_setup(struct i2c_client *client) int i, ret; ret = adp5588_write(client, KP_GPIO1, KP_SEL(pdata->rows)); - ret |= adp5588_write(client, KP_GPIO2, KP_SEL(pdata->cols) & 0xFF); - ret |= adp5588_write(client, KP_GPIO3, KP_SEL(pdata->cols) >> 8); + if (ret) + return ret; + + ret = adp5588_write(client, KP_GPIO2, KP_SEL(pdata->cols) & 0xFF); + if (ret) + return ret; + + ret = adp5588_write(client, KP_GPIO3, KP_SEL(pdata->cols) >> 8); + if (ret) + return ret; if (pdata->en_keylock) { - ret |= adp5588_write(client, UNLOCK1, pdata->unlock_key1); - ret |= adp5588_write(client, UNLOCK2, pdata->unlock_key2); - ret |= adp5588_write(client, KEY_LCK_EC_STAT, ADP5588_K_LCK_EN); + ret = adp5588_write(client, UNLOCK1, pdata->unlock_key1); + if (ret) + return ret; + + ret = adp5588_write(client, UNLOCK2, pdata->unlock_key2); + if (ret) + return ret; + + ret = adp5588_write(client, KEY_LCK_EC_STAT, ADP5588_K_LCK_EN); + if (ret) + return ret; } - for (i = 0; i < KEYP_MAX_EVENT; i++) - ret |= adp5588_read(client, Key_EVENTA); + for (i = 0; i < KEYP_MAX_EVENT; i++) { + ret = 
adp5588_read(client, Key_EVENTA); + if (ret) + return ret; + } if (gpio_data) { for (i = 0; i <= ADP5588_BANK(ADP5588_MAXGPIO); i++) { int pull_mask = gpio_data->pullup_dis_mask; - ret |= adp5588_write(client, GPIO_PULL1 + i, + ret = adp5588_write(client, GPIO_PULL1 + i, (pull_mask >> (8 * i)) & 0xFF); + if (ret) + return ret; } } - ret |= adp5588_write(client, INT_STAT, + ret = adp5588_write(client, INT_STAT, ADP5588_CMP2_INT | ADP5588_CMP1_INT | ADP5588_OVR_FLOW_INT | ADP5588_K_LCK_INT | ADP5588_GPI_INT | ADP5588_KE_INT); /* Status is W1C */ + if (ret) + return ret; - ret |= adp5588_write(client, CFG, ADP5588_INT_CFG | + return adp5588_write(client, CFG, ADP5588_INT_CFG | ADP5588_OVR_FLOW_IEN | ADP5588_KE_IEN); - - if (ret < 0) { - dev_err(&client->dev, "Write Error\n"); - return ret; - } - - return 0; } static int adp5588_probe(struct i2c_client *client, -- GitLab From 6704a86283b7e79ff7ae36d388466428f6672962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 30 Aug 2022 21:00:14 -0700 Subject: [PATCH 0104/2223] Input: adp5588-keys - add support for fw properties MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use firmware properties (eg: OF) to get the device specific configuration. This change just replaces the platform data since there was no platform using it and so, it makes no sense having both. Special note to the PULL-UP disable setting that is now supported as part of the gpio subsystem (using 'set_config()' callback). 
Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220829131553.690063-5-nuno.sa@analog.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/Kconfig | 1 + drivers/input/keyboard/adp5588-keys.c | 395 +++++++++++++++++++------- include/linux/platform_data/adp5588.h | 169 ----------- 3 files changed, 289 insertions(+), 276 deletions(-) delete mode 100644 include/linux/platform_data/adp5588.h diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index e445e760a41a2..8b0281c4f3c52 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -42,6 +42,7 @@ config KEYBOARD_ADP5588 depends on I2C select GPIOLIB select GPIOLIB_IRQCHIP + select INPUT_MATRIXKMAP help Say Y here if you want to use a ADP5588/87 attached to your system I2C bus. diff --git a/drivers/input/keyboard/adp5588-keys.c b/drivers/input/keyboard/adp5588-keys.c index 2452ea4128b3f..77d538ed45974 100644 --- a/drivers/input/keyboard/adp5588-keys.c +++ b/drivers/input/keyboard/adp5588-keys.c @@ -13,16 +13,149 @@ #include #include #include +#include #include #include #include #include +#include +#include #include #include #include #include -#include +#define DEV_ID 0x00 /* Device ID */ +#define CFG 0x01 /* Configuration Register1 */ +#define INT_STAT 0x02 /* Interrupt Status Register */ +#define KEY_LCK_EC_STAT 0x03 /* Key Lock and Event Counter Register */ +#define Key_EVENTA 0x04 /* Key Event Register A */ +#define Key_EVENTB 0x05 /* Key Event Register B */ +#define Key_EVENTC 0x06 /* Key Event Register C */ +#define Key_EVENTD 0x07 /* Key Event Register D */ +#define Key_EVENTE 0x08 /* Key Event Register E */ +#define Key_EVENTF 0x09 /* Key Event Register F */ +#define Key_EVENTG 0x0A /* Key Event Register G */ +#define Key_EVENTH 0x0B /* Key Event Register H */ +#define Key_EVENTI 0x0C /* Key Event Register I */ +#define Key_EVENTJ 0x0D /* Key Event Register J */ +#define KP_LCK_TMR 0x0E /* Keypad Lock1 to Lock2 Timer 
*/ +#define UNLOCK1 0x0F /* Unlock Key1 */ +#define UNLOCK2 0x10 /* Unlock Key2 */ +#define GPIO_INT_STAT1 0x11 /* GPIO Interrupt Status */ +#define GPIO_INT_STAT2 0x12 /* GPIO Interrupt Status */ +#define GPIO_INT_STAT3 0x13 /* GPIO Interrupt Status */ +#define GPIO_DAT_STAT1 0x14 /* GPIO Data Status, Read twice to clear */ +#define GPIO_DAT_STAT2 0x15 /* GPIO Data Status, Read twice to clear */ +#define GPIO_DAT_STAT3 0x16 /* GPIO Data Status, Read twice to clear */ +#define GPIO_DAT_OUT1 0x17 /* GPIO DATA OUT */ +#define GPIO_DAT_OUT2 0x18 /* GPIO DATA OUT */ +#define GPIO_DAT_OUT3 0x19 /* GPIO DATA OUT */ +#define GPIO_INT_EN1 0x1A /* GPIO Interrupt Enable */ +#define GPIO_INT_EN2 0x1B /* GPIO Interrupt Enable */ +#define GPIO_INT_EN3 0x1C /* GPIO Interrupt Enable */ +#define KP_GPIO1 0x1D /* Keypad or GPIO Selection */ +#define KP_GPIO2 0x1E /* Keypad or GPIO Selection */ +#define KP_GPIO3 0x1F /* Keypad or GPIO Selection */ +#define GPI_EM1 0x20 /* GPI Event Mode 1 */ +#define GPI_EM2 0x21 /* GPI Event Mode 2 */ +#define GPI_EM3 0x22 /* GPI Event Mode 3 */ +#define GPIO_DIR1 0x23 /* GPIO Data Direction */ +#define GPIO_DIR2 0x24 /* GPIO Data Direction */ +#define GPIO_DIR3 0x25 /* GPIO Data Direction */ +#define GPIO_INT_LVL1 0x26 /* GPIO Edge/Level Detect */ +#define GPIO_INT_LVL2 0x27 /* GPIO Edge/Level Detect */ +#define GPIO_INT_LVL3 0x28 /* GPIO Edge/Level Detect */ +#define Debounce_DIS1 0x29 /* Debounce Disable */ +#define Debounce_DIS2 0x2A /* Debounce Disable */ +#define Debounce_DIS3 0x2B /* Debounce Disable */ +#define GPIO_PULL1 0x2C /* GPIO Pull Disable */ +#define GPIO_PULL2 0x2D /* GPIO Pull Disable */ +#define GPIO_PULL3 0x2E /* GPIO Pull Disable */ +#define CMP_CFG_STAT 0x30 /* Comparator Configuration and Status Register */ +#define CMP_CONFG_SENS1 0x31 /* Sensor1 Comparator Configuration Register */ +#define CMP_CONFG_SENS2 0x32 /* L2 Light Sensor Reference Level, Output Falling for Sensor 1 */ +#define CMP1_LVL2_TRIP 0x33 /* L2 Light 
Sensor Hysteresis (Active when Output Rising) for Sensor 1 */ +#define CMP1_LVL2_HYS 0x34 /* L3 Light Sensor Reference Level, Output Falling For Sensor 1 */ +#define CMP1_LVL3_TRIP 0x35 /* L3 Light Sensor Hysteresis (Active when Output Rising) For Sensor 1 */ +#define CMP1_LVL3_HYS 0x36 /* Sensor 2 Comparator Configuration Register */ +#define CMP2_LVL2_TRIP 0x37 /* L2 Light Sensor Reference Level, Output Falling for Sensor 2 */ +#define CMP2_LVL2_HYS 0x38 /* L2 Light Sensor Hysteresis (Active when Output Rising) for Sensor 2 */ +#define CMP2_LVL3_TRIP 0x39 /* L3 Light Sensor Reference Level, Output Falling For Sensor 2 */ +#define CMP2_LVL3_HYS 0x3A /* L3 Light Sensor Hysteresis (Active when Output Rising) For Sensor 2 */ +#define CMP1_ADC_DAT_R1 0x3B /* Comparator 1 ADC data Register1 */ +#define CMP1_ADC_DAT_R2 0x3C /* Comparator 1 ADC data Register2 */ +#define CMP2_ADC_DAT_R1 0x3D /* Comparator 2 ADC data Register1 */ +#define CMP2_ADC_DAT_R2 0x3E /* Comparator 2 ADC data Register2 */ + +#define ADP5588_DEVICE_ID_MASK 0xF + + /* Configuration Register1 */ +#define ADP5588_AUTO_INC (1 << 7) +#define ADP5588_GPIEM_CFG (1 << 6) +#define ADP5588_OVR_FLOW_M (1 << 5) +#define ADP5588_INT_CFG (1 << 4) +#define ADP5588_OVR_FLOW_IEN (1 << 3) +#define ADP5588_K_LCK_IM (1 << 2) +#define ADP5588_GPI_IEN (1 << 1) +#define ADP5588_KE_IEN (1 << 0) + +/* Interrupt Status Register */ +#define ADP5588_CMP2_INT (1 << 5) +#define ADP5588_CMP1_INT (1 << 4) +#define ADP5588_OVR_FLOW_INT (1 << 3) +#define ADP5588_K_LCK_INT (1 << 2) +#define ADP5588_GPI_INT (1 << 1) +#define ADP5588_KE_INT (1 << 0) + +/* Key Lock and Event Counter Register */ +#define ADP5588_K_LCK_EN (1 << 6) +#define ADP5588_LCK21 0x30 +#define ADP5588_KEC 0xF + +#define ADP5588_MAXGPIO 18 +#define ADP5588_BANK(offs) ((offs) >> 3) +#define ADP5588_BIT(offs) (1u << ((offs) & 0x7)) + +/* Put one of these structures in i2c_board_info platform_data */ + +/* + * 128 so it fits matrix-keymap maximum number of keys when 
the full + * 10cols * 8rows are used. + */ +#define ADP5588_KEYMAPSIZE 128 + +#define GPI_PIN_ROW0 97 +#define GPI_PIN_ROW1 98 +#define GPI_PIN_ROW2 99 +#define GPI_PIN_ROW3 100 +#define GPI_PIN_ROW4 101 +#define GPI_PIN_ROW5 102 +#define GPI_PIN_ROW6 103 +#define GPI_PIN_ROW7 104 +#define GPI_PIN_COL0 105 +#define GPI_PIN_COL1 106 +#define GPI_PIN_COL2 107 +#define GPI_PIN_COL3 108 +#define GPI_PIN_COL4 109 +#define GPI_PIN_COL5 110 +#define GPI_PIN_COL6 111 +#define GPI_PIN_COL7 112 +#define GPI_PIN_COL8 113 +#define GPI_PIN_COL9 114 + +#define GPI_PIN_ROW_BASE GPI_PIN_ROW0 +#define GPI_PIN_ROW_END GPI_PIN_ROW7 +#define GPI_PIN_COL_BASE GPI_PIN_COL0 +#define GPI_PIN_COL_END GPI_PIN_COL9 + +#define GPI_PIN_BASE GPI_PIN_ROW_BASE +#define GPI_PIN_END GPI_PIN_COL_END + +#define ADP5588_ROWS_MAX (GPI_PIN_ROW7 - GPI_PIN_ROW0 + 1) +#define ADP5588_COLS_MAX (GPI_PIN_COL9 - GPI_PIN_COL0 + 1) + +#define ADP5588_GPIMAPSIZE_MAX (GPI_PIN_END - GPI_PIN_BASE + 1) /* Key Event Register xy */ #define KEY_EV_PRESSED (1 << 7) @@ -47,6 +180,11 @@ struct adp5588_kpad { struct input_dev *input; ktime_t irq_time; unsigned long delay; + u32 row_shift; + u32 rows; + u32 cols; + u32 unlock_keys[2]; + int nkeys_unlock; unsigned short keycode[ADP5588_KEYMAPSIZE]; unsigned char gpiomap[ADP5588_MAXGPIO]; struct gpio_chip gc; @@ -55,6 +193,7 @@ struct adp5588_kpad { u8 dir[3]; u8 int_en[3]; u8 irq_mask[3]; + u8 pull_dis[3]; }; static int adp5588_read(struct i2c_client *client, u8 reg) @@ -111,6 +250,41 @@ static void adp5588_gpio_set_value(struct gpio_chip *chip, mutex_unlock(&kpad->gpio_lock); } +static int adp5588_gpio_set_config(struct gpio_chip *chip, unsigned int off, + unsigned long config) +{ + struct adp5588_kpad *kpad = gpiochip_get_data(chip); + unsigned int bank = ADP5588_BANK(kpad->gpiomap[off]); + unsigned int bit = ADP5588_BIT(kpad->gpiomap[off]); + bool pull_disable; + int ret; + + switch (pinconf_to_config_param(config)) { + case PIN_CONFIG_BIAS_PULL_UP: + pull_disable = false; 
+ break; + case PIN_CONFIG_BIAS_DISABLE: + pull_disable = true; + break; + default: + return -ENOTSUPP; + } + + mutex_lock(&kpad->gpio_lock); + + if (pull_disable) + kpad->pull_dis[bank] |= bit; + else + kpad->pull_dis[bank] &= ~bit; + + ret = adp5588_write(kpad->client, GPIO_PULL1 + bank, + kpad->pull_dis[bank]); + + mutex_unlock(&kpad->gpio_lock); + + return ret; +} + static int adp5588_gpio_direction_input(struct gpio_chip *chip, unsigned off) { struct adp5588_kpad *kpad = gpiochip_get_data(chip); @@ -159,8 +333,7 @@ out_unlock: return ret; } -static int adp5588_build_gpiomap(struct adp5588_kpad *kpad, - const struct adp5588_kpad_platform_data *pdata) +static int adp5588_build_gpiomap(struct adp5588_kpad *kpad) { bool pin_used[ADP5588_MAXGPIO]; int n_unused = 0; @@ -168,10 +341,10 @@ static int adp5588_build_gpiomap(struct adp5588_kpad *kpad, memset(pin_used, 0, sizeof(pin_used)); - for (i = 0; i < pdata->rows; i++) + for (i = 0; i < kpad->rows; i++) pin_used[i] = true; - for (i = 0; i < pdata->cols; i++) + for (i = 0; i < kpad->cols; i++) pin_used[i + GPI_PIN_COL_BASE - GPI_PIN_BASE] = true; for (i = 0; i < ADP5588_MAXGPIO; i++) @@ -181,21 +354,6 @@ static int adp5588_build_gpiomap(struct adp5588_kpad *kpad, return n_unused; } -static void adp5588_gpio_do_teardown(void *_kpad) -{ - struct adp5588_kpad *kpad = _kpad; - struct device *dev = &kpad->client->dev; - const struct adp5588_kpad_platform_data *pdata = dev_get_platdata(dev); - const struct adp5588_gpio_platform_data *gpio_data = pdata->gpio_data; - int error; - - error = gpio_data->teardown(kpad->client, - kpad->gc.base, kpad->gc.ngpio, - gpio_data->context); - if (error) - dev_warn(&kpad->client->dev, "teardown failed %d\n", error); -} - static void adp5588_irq_bus_lock(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); @@ -266,15 +424,10 @@ static const struct irq_chip adp5588_irq_chip = { static int adp5588_gpio_add(struct adp5588_kpad *kpad) { struct device *dev =
&kpad->client->dev; - const struct adp5588_kpad_platform_data *pdata = dev_get_platdata(dev); - const struct adp5588_gpio_platform_data *gpio_data = pdata->gpio_data; struct gpio_irq_chip *girq; int i, error; - if (!gpio_data) - return 0; - - kpad->gc.ngpio = adp5588_build_gpiomap(kpad, pdata); + kpad->gc.ngpio = adp5588_build_gpiomap(kpad); if (kpad->gc.ngpio == 0) { dev_info(dev, "No unused gpios left to export\n"); return 0; @@ -285,12 +438,12 @@ static int adp5588_gpio_add(struct adp5588_kpad *kpad) kpad->gc.direction_output = adp5588_gpio_direction_output; kpad->gc.get = adp5588_gpio_get_value; kpad->gc.set = adp5588_gpio_set_value; + kpad->gc.set_config = adp5588_gpio_set_config; kpad->gc.can_sleep = 1; - kpad->gc.base = gpio_data->gpio_start; + kpad->gc.base = -1; kpad->gc.label = kpad->client->name; kpad->gc.owner = THIS_MODULE; - kpad->gc.names = gpio_data->names; girq = &kpad->gc.irq; gpio_irq_chip_set_chip(girq, &adp5588_irq_chip); @@ -309,21 +462,7 @@ static int adp5588_gpio_add(struct adp5588_kpad *kpad) kpad->dat_out[i] = adp5588_read(kpad->client, GPIO_DAT_OUT1 + i); kpad->dir[i] = adp5588_read(kpad->client, GPIO_DIR1 + i); - } - - if (gpio_data->setup) { - error = gpio_data->setup(kpad->client, - kpad->gc.base, kpad->gc.ngpio, - gpio_data->context); - if (error) - dev_warn(dev, "setup failed: %d\n", error); - } - - if (gpio_data->teardown) { - error = devm_add_action(dev, adp5588_gpio_do_teardown, kpad); - if (error) - dev_warn(dev, "failed to schedule teardown: %d\n", - error); + kpad->pull_dis[i] = adp5588_read(kpad->client, GPIO_PULL1 + i); } return 0; @@ -390,12 +529,21 @@ static void adp5588_report_events(struct adp5588_kpad *kpad, int ev_cnt) int key_val = key & KEY_EV_MASK; int key_press = key & KEY_EV_PRESSED; - if (key_val >= GPI_PIN_BASE && key_val <= GPI_PIN_END) + if (key_val >= GPI_PIN_BASE && key_val <= GPI_PIN_END) { /* gpio line used as IRQ source */ adp5588_gpio_irq_handle(kpad, key_val, key_press); - else + } else { + int row = 
(key_val - 1) / ADP5588_COLS_MAX; + int col = (key_val - 1) % ADP5588_COLS_MAX; + int code = MATRIX_SCAN_CODE(row, col, kpad->row_shift); + + dev_dbg_ratelimited(&kpad->client->dev, + "report key(%d) r(%d) c(%d) code(%d)\n", + key_val, row, col, kpad->keycode[code]); + input_report_key(kpad->input, - kpad->keycode[key_val - 1], key_press); + kpad->keycode[code], key_press); + } } } @@ -447,34 +595,30 @@ static irqreturn_t adp5588_thread_irq(int irq, void *handle) return IRQ_HANDLED; } -static int adp5588_setup(struct i2c_client *client) +static int adp5588_setup(struct adp5588_kpad *kpad) { - const struct adp5588_kpad_platform_data *pdata = - dev_get_platdata(&client->dev); - const struct adp5588_gpio_platform_data *gpio_data = pdata->gpio_data; + struct i2c_client *client = kpad->client; int i, ret; - ret = adp5588_write(client, KP_GPIO1, KP_SEL(pdata->rows)); + ret = adp5588_write(client, KP_GPIO1, KP_SEL(kpad->rows)); if (ret) return ret; - ret = adp5588_write(client, KP_GPIO2, KP_SEL(pdata->cols) & 0xFF); + ret = adp5588_write(client, KP_GPIO2, KP_SEL(kpad->cols) & 0xFF); if (ret) return ret; - ret = adp5588_write(client, KP_GPIO3, KP_SEL(pdata->cols) >> 8); + ret = adp5588_write(client, KP_GPIO3, KP_SEL(kpad->cols) >> 8); if (ret) return ret; - if (pdata->en_keylock) { - ret = adp5588_write(client, UNLOCK1, pdata->unlock_key1); - if (ret) - return ret; - - ret = adp5588_write(client, UNLOCK2, pdata->unlock_key2); + for (i = 0; i < kpad->nkeys_unlock; i++) { + ret = adp5588_write(client, UNLOCK1 + i, kpad->unlock_keys[i]); if (ret) return ret; + } + if (kpad->nkeys_unlock) { ret = adp5588_write(client, KEY_LCK_EC_STAT, ADP5588_K_LCK_EN); if (ret) return ret; @@ -486,17 +630,6 @@ static int adp5588_setup(struct i2c_client *client) return ret; } - if (gpio_data) { - for (i = 0; i <= ADP5588_BANK(ADP5588_MAXGPIO); i++) { - int pull_mask = gpio_data->pullup_dis_mask; - - ret = adp5588_write(client, GPIO_PULL1 + i, - (pull_mask >> (8 * i)) & 0xFF); - if (ret) - 
return ret; - } - } - ret = adp5588_write(client, INT_STAT, ADP5588_CMP2_INT | ADP5588_CMP1_INT | ADP5588_OVR_FLOW_INT | ADP5588_K_LCK_INT | @@ -509,15 +642,84 @@ static int adp5588_setup(struct i2c_client *client) ADP5588_KE_IEN); } +static int adp5588_fw_parse(struct adp5588_kpad *kpad) +{ + struct i2c_client *client = kpad->client; + int ret, i; + + ret = matrix_keypad_parse_properties(&client->dev, &kpad->rows, + &kpad->cols); + if (ret) + return ret; + + if (kpad->rows > ADP5588_ROWS_MAX || kpad->cols > ADP5588_COLS_MAX) { + dev_err(&client->dev, "Invalid nr of rows(%u) or cols(%u)\n", + kpad->rows, kpad->cols); + return -EINVAL; + } + + ret = matrix_keypad_build_keymap(NULL, NULL, kpad->rows, kpad->cols, + kpad->keycode, kpad->input); + if (ret) + return ret; + + kpad->row_shift = get_count_order(kpad->cols); + + if (device_property_read_bool(&client->dev, "autorepeat")) + __set_bit(EV_REP, kpad->input->evbit); + + kpad->nkeys_unlock = device_property_count_u32(&client->dev, + "adi,unlock-keys"); + if (kpad->nkeys_unlock <= 0) { + /* so that we don't end up enabling key lock */ + kpad->nkeys_unlock = 0; + return 0; + } + + if (kpad->nkeys_unlock > ARRAY_SIZE(kpad->unlock_keys)) { + dev_err(&client->dev, "number of unlock keys(%d) > (%zu)\n", + kpad->nkeys_unlock, ARRAY_SIZE(kpad->unlock_keys)); + return -EINVAL; + } + + ret = device_property_read_u32_array(&client->dev, "adi,unlock-keys", + kpad->unlock_keys, + kpad->nkeys_unlock); + if (ret) + return ret; + + for (i = 0; i < kpad->nkeys_unlock; i++) { + /* + * Even though it should be possible (as stated in the datasheet) + * to use GPIs (which are part of the keys event) as unlock keys, + * it was not working at all and was leading to overflow events + * at some point. Hence, for now, let's just allow keys which are + * part of keypad matrix to be used and if a reliable way of + * using GPIs is found, this condition can be removed/lightened. 
+ */ + if (kpad->unlock_keys[i] >= kpad->cols * kpad->rows) { + dev_err(&client->dev, "Invalid unlock key(%d)\n", + kpad->unlock_keys[i]); + return -EINVAL; + } + + /* + * Firmware properties keys start from 0 but on the device they + * start from 1. + */ + kpad->unlock_keys[i] += 1; + } + + return 0; +} + static int adp5588_probe(struct i2c_client *client, const struct i2c_device_id *id) { struct adp5588_kpad *kpad; - const struct adp5588_kpad_platform_data *pdata = - dev_get_platdata(&client->dev); struct input_dev *input; unsigned int revid; - int ret, i; + int ret; int error; if (!i2c_check_functionality(client->adapter, @@ -526,21 +728,6 @@ static int adp5588_probe(struct i2c_client *client, return -EIO; } - if (!pdata) { - dev_err(&client->dev, "no platform data?\n"); - return -EINVAL; - } - - if (!pdata->rows || !pdata->cols || !pdata->keymap) { - dev_err(&client->dev, "no rows, cols or keymap from pdata\n"); - return -EINVAL; - } - - if (pdata->keymapsize != ADP5588_KEYMAPSIZE) { - dev_err(&client->dev, "invalid keymapsize\n"); - return -EINVAL; - } - if (!client->irq) { dev_err(&client->dev, "no IRQ?\n"); return -EINVAL; @@ -557,6 +744,10 @@ static int adp5588_probe(struct i2c_client *client, kpad->client = client; kpad->input = input; + error = adp5588_fw_parse(kpad); + if (error) + return error; + ret = adp5588_read(client, DEV_ID); if (ret < 0) return ret; @@ -575,24 +766,6 @@ static int adp5588_probe(struct i2c_client *client, input->id.product = 0x0001; input->id.version = revid; - input->keycodesize = sizeof(kpad->keycode[0]); - input->keycodemax = pdata->keymapsize; - input->keycode = kpad->keycode; - - memcpy(kpad->keycode, pdata->keymap, - pdata->keymapsize * input->keycodesize); - - /* setup input device */ - __set_bit(EV_KEY, input->evbit); - - if (pdata->repeat) - __set_bit(EV_REP, input->evbit); - - for (i = 0; i < input->keycodemax; i++) - if (kpad->keycode[i] <= KEY_MAX) - __set_bit(kpad->keycode[i], input->keybit); - 
__clear_bit(KEY_RESERVED, input->keybit); - error = input_register_device(input); if (error) { dev_err(&client->dev, "unable to register input device: %d\n", @@ -600,7 +773,7 @@ static int adp5588_probe(struct i2c_client *client, return error; } - error = adp5588_setup(client); + error = adp5588_setup(kpad); if (error) return error; @@ -656,9 +829,17 @@ static const struct i2c_device_id adp5588_id[] = { }; MODULE_DEVICE_TABLE(i2c, adp5588_id); +static const struct of_device_id adp5588_of_match[] = { + { .compatible = "adi,adp5588" }, + { .compatible = "adi,adp5587" }, + {} +}; +MODULE_DEVICE_TABLE(of, adp5588_of_match); + static struct i2c_driver adp5588_driver = { .driver = { .name = KBUILD_MODNAME, + .of_match_table = adp5588_of_match, .pm = &adp5588_dev_pm_ops, }, .probe = adp5588_probe, diff --git a/include/linux/platform_data/adp5588.h b/include/linux/platform_data/adp5588.h deleted file mode 100644 index 82170ec8c266c..0000000000000 --- a/include/linux/platform_data/adp5588.h +++ /dev/null @@ -1,169 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Analog Devices ADP5588 I/O Expander and QWERTY Keypad Controller - * - * Copyright 2009-2010 Analog Devices Inc. 
- */ - -#ifndef _ADP5588_H -#define _ADP5588_H - -#define DEV_ID 0x00 /* Device ID */ -#define CFG 0x01 /* Configuration Register1 */ -#define INT_STAT 0x02 /* Interrupt Status Register */ -#define KEY_LCK_EC_STAT 0x03 /* Key Lock and Event Counter Register */ -#define Key_EVENTA 0x04 /* Key Event Register A */ -#define Key_EVENTB 0x05 /* Key Event Register B */ -#define Key_EVENTC 0x06 /* Key Event Register C */ -#define Key_EVENTD 0x07 /* Key Event Register D */ -#define Key_EVENTE 0x08 /* Key Event Register E */ -#define Key_EVENTF 0x09 /* Key Event Register F */ -#define Key_EVENTG 0x0A /* Key Event Register G */ -#define Key_EVENTH 0x0B /* Key Event Register H */ -#define Key_EVENTI 0x0C /* Key Event Register I */ -#define Key_EVENTJ 0x0D /* Key Event Register J */ -#define KP_LCK_TMR 0x0E /* Keypad Lock1 to Lock2 Timer */ -#define UNLOCK1 0x0F /* Unlock Key1 */ -#define UNLOCK2 0x10 /* Unlock Key2 */ -#define GPIO_INT_STAT1 0x11 /* GPIO Interrupt Status */ -#define GPIO_INT_STAT2 0x12 /* GPIO Interrupt Status */ -#define GPIO_INT_STAT3 0x13 /* GPIO Interrupt Status */ -#define GPIO_DAT_STAT1 0x14 /* GPIO Data Status, Read twice to clear */ -#define GPIO_DAT_STAT2 0x15 /* GPIO Data Status, Read twice to clear */ -#define GPIO_DAT_STAT3 0x16 /* GPIO Data Status, Read twice to clear */ -#define GPIO_DAT_OUT1 0x17 /* GPIO DATA OUT */ -#define GPIO_DAT_OUT2 0x18 /* GPIO DATA OUT */ -#define GPIO_DAT_OUT3 0x19 /* GPIO DATA OUT */ -#define GPIO_INT_EN1 0x1A /* GPIO Interrupt Enable */ -#define GPIO_INT_EN2 0x1B /* GPIO Interrupt Enable */ -#define GPIO_INT_EN3 0x1C /* GPIO Interrupt Enable */ -#define KP_GPIO1 0x1D /* Keypad or GPIO Selection */ -#define KP_GPIO2 0x1E /* Keypad or GPIO Selection */ -#define KP_GPIO3 0x1F /* Keypad or GPIO Selection */ -#define GPI_EM1 0x20 /* GPI Event Mode 1 */ -#define GPI_EM2 0x21 /* GPI Event Mode 2 */ -#define GPI_EM3 0x22 /* GPI Event Mode 3 */ -#define GPIO_DIR1 0x23 /* GPIO Data Direction */ -#define GPIO_DIR2 0x24 /* GPIO 
Data Direction */ -#define GPIO_DIR3 0x25 /* GPIO Data Direction */ -#define GPIO_INT_LVL1 0x26 /* GPIO Edge/Level Detect */ -#define GPIO_INT_LVL2 0x27 /* GPIO Edge/Level Detect */ -#define GPIO_INT_LVL3 0x28 /* GPIO Edge/Level Detect */ -#define Debounce_DIS1 0x29 /* Debounce Disable */ -#define Debounce_DIS2 0x2A /* Debounce Disable */ -#define Debounce_DIS3 0x2B /* Debounce Disable */ -#define GPIO_PULL1 0x2C /* GPIO Pull Disable */ -#define GPIO_PULL2 0x2D /* GPIO Pull Disable */ -#define GPIO_PULL3 0x2E /* GPIO Pull Disable */ -#define CMP_CFG_STAT 0x30 /* Comparator Configuration and Status Register */ -#define CMP_CONFG_SENS1 0x31 /* Sensor1 Comparator Configuration Register */ -#define CMP_CONFG_SENS2 0x32 /* L2 Light Sensor Reference Level, Output Falling for Sensor 1 */ -#define CMP1_LVL2_TRIP 0x33 /* L2 Light Sensor Hysteresis (Active when Output Rising) for Sensor 1 */ -#define CMP1_LVL2_HYS 0x34 /* L3 Light Sensor Reference Level, Output Falling For Sensor 1 */ -#define CMP1_LVL3_TRIP 0x35 /* L3 Light Sensor Hysteresis (Active when Output Rising) For Sensor 1 */ -#define CMP1_LVL3_HYS 0x36 /* Sensor 2 Comparator Configuration Register */ -#define CMP2_LVL2_TRIP 0x37 /* L2 Light Sensor Reference Level, Output Falling for Sensor 2 */ -#define CMP2_LVL2_HYS 0x38 /* L2 Light Sensor Hysteresis (Active when Output Rising) for Sensor 2 */ -#define CMP2_LVL3_TRIP 0x39 /* L3 Light Sensor Reference Level, Output Falling For Sensor 2 */ -#define CMP2_LVL3_HYS 0x3A /* L3 Light Sensor Hysteresis (Active when Output Rising) For Sensor 2 */ -#define CMP1_ADC_DAT_R1 0x3B /* Comparator 1 ADC data Register1 */ -#define CMP1_ADC_DAT_R2 0x3C /* Comparator 1 ADC data Register2 */ -#define CMP2_ADC_DAT_R1 0x3D /* Comparator 2 ADC data Register1 */ -#define CMP2_ADC_DAT_R2 0x3E /* Comparator 2 ADC data Register2 */ - -#define ADP5588_DEVICE_ID_MASK 0xF - - /* Configuration Register1 */ -#define ADP5588_AUTO_INC (1 << 7) -#define ADP5588_GPIEM_CFG (1 << 6) -#define 
ADP5588_OVR_FLOW_M (1 << 5) -#define ADP5588_INT_CFG (1 << 4) -#define ADP5588_OVR_FLOW_IEN (1 << 3) -#define ADP5588_K_LCK_IM (1 << 2) -#define ADP5588_GPI_IEN (1 << 1) -#define ADP5588_KE_IEN (1 << 0) - -/* Interrupt Status Register */ -#define ADP5588_CMP2_INT (1 << 5) -#define ADP5588_CMP1_INT (1 << 4) -#define ADP5588_OVR_FLOW_INT (1 << 3) -#define ADP5588_K_LCK_INT (1 << 2) -#define ADP5588_GPI_INT (1 << 1) -#define ADP5588_KE_INT (1 << 0) - -/* Key Lock and Event Counter Register */ -#define ADP5588_K_LCK_EN (1 << 6) -#define ADP5588_LCK21 0x30 -#define ADP5588_KEC 0xF - -#define ADP5588_MAXGPIO 18 -#define ADP5588_BANK(offs) ((offs) >> 3) -#define ADP5588_BIT(offs) (1u << ((offs) & 0x7)) - -/* Put one of these structures in i2c_board_info platform_data */ - -#define ADP5588_KEYMAPSIZE 80 - -#define GPI_PIN_ROW0 97 -#define GPI_PIN_ROW1 98 -#define GPI_PIN_ROW2 99 -#define GPI_PIN_ROW3 100 -#define GPI_PIN_ROW4 101 -#define GPI_PIN_ROW5 102 -#define GPI_PIN_ROW6 103 -#define GPI_PIN_ROW7 104 -#define GPI_PIN_COL0 105 -#define GPI_PIN_COL1 106 -#define GPI_PIN_COL2 107 -#define GPI_PIN_COL3 108 -#define GPI_PIN_COL4 109 -#define GPI_PIN_COL5 110 -#define GPI_PIN_COL6 111 -#define GPI_PIN_COL7 112 -#define GPI_PIN_COL8 113 -#define GPI_PIN_COL9 114 - -#define GPI_PIN_ROW_BASE GPI_PIN_ROW0 -#define GPI_PIN_ROW_END GPI_PIN_ROW7 -#define GPI_PIN_COL_BASE GPI_PIN_COL0 -#define GPI_PIN_COL_END GPI_PIN_COL9 - -#define GPI_PIN_BASE GPI_PIN_ROW_BASE -#define GPI_PIN_END GPI_PIN_COL_END - -#define ADP5588_GPIMAPSIZE_MAX (GPI_PIN_END - GPI_PIN_BASE + 1) - -struct adp5588_gpi_map { - unsigned short pin; - unsigned short sw_evt; -}; - -struct adp5588_kpad_platform_data { - int rows; /* Number of rows */ - int cols; /* Number of columns */ - const unsigned short *keymap; /* Pointer to keymap */ - unsigned short keymapsize; /* Keymap size */ - unsigned repeat:1; /* Enable key repeat */ - unsigned en_keylock:1; /* Enable Key Lock feature */ - unsigned short unlock_key1; /* 
Unlock Key 1 */ - unsigned short unlock_key2; /* Unlock Key 2 */ - const struct adp5588_gpio_platform_data *gpio_data; -}; - -struct i2c_client; /* forward declaration */ - -struct adp5588_gpio_platform_data { - int gpio_start; /* GPIO Chip base # */ - const char *const *names; - unsigned irq_base; /* interrupt base # */ - unsigned pullup_dis_mask; /* Pull-Up Disable Mask */ - int (*setup)(struct i2c_client *client, - unsigned gpio, unsigned ngpio, - void *context); - int (*teardown)(struct i2c_client *client, - unsigned gpio, unsigned ngpio, - void *context); - void *context; -}; - -#endif -- GitLab From 81ce5b77417ac9623d1c6270a2f75a0a3a734d0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 30 Aug 2022 21:08:18 -0700 Subject: [PATCH 0105/2223] dt-bindings: input: adp5588: add bindings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add device tree bindings for the adp5588-keys driver. Signed-off-by: Nuno Sá Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220829131553.690063-6-nuno.sa@analog.com Signed-off-by: Dmitry Torokhov --- .../bindings/input/adi,adp5588.yaml | 111 ++++++++++++++++++ MAINTAINERS | 1 + 2 files changed, 112 insertions(+) create mode 100644 Documentation/devicetree/bindings/input/adi,adp5588.yaml diff --git a/Documentation/devicetree/bindings/input/adi,adp5588.yaml b/Documentation/devicetree/bindings/input/adi,adp5588.yaml new file mode 100644 index 0000000000000..26ea66834ae24 --- /dev/null +++ b/Documentation/devicetree/bindings/input/adi,adp5588.yaml @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/adi,adp5588.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Analog Devices ADP5588 Keypad Controller + +maintainers: + - Nuno Sá + +description: | + Analog Devices Mobile I/O Expander and QWERTY Keypad Controller + 
https://www.analog.com/media/en/technical-documentation/data-sheets/ADP5588.pdf + +allOf: + - $ref: matrix-keymap.yaml# + - $ref: input.yaml# + +properties: + compatible: + enum: + - adi,adp5587 + - adi,adp5588 + + reg: + maxItems: 1 + + vcc-supply: + description: Supply Voltage Input + + reset-gpios: + description: + If specified, it will be asserted during driver probe. As the line is + active low, it should be marked GPIO_ACTIVE_LOW. + maxItems: 1 + + interrupts: + maxItems: 1 + + gpio-controller: + description: + This property applies if either keypad,num-rows is lower than 8 or + keypad,num-columns is lower than 10. + + '#gpio-cells': + const: 2 + + interrupt-controller: + description: + This property applies if either keypad,num-rows is lower than 8 or + keypad,num-columns is lower than 10. + + '#interrupt-cells': + const: 2 + + adi,unlock-keys: + description: + Specifies a maximum of 2 keys that can be used to unlock the keypad. + If this property is set, the keyboard will be locked and only unlocked + after these keys are pressed. If only one key is set, a double click is + needed to unlock the keypad. Each value of this property must be lower + than keypad,num-rows * keypad,num-columns.
+ $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 1 + maxItems: 2 + +required: + - compatible + - reg + - interrupts + - keypad,num-rows + - keypad,num-columns + - linux,keymap + +unevaluatedProperties: false + +examples: + - | + #include <dt-bindings/interrupt-controller/irq.h> + #include <dt-bindings/input/input.h> + #include <dt-bindings/gpio/gpio.h> + i2c { + #address-cells = <1>; + #size-cells = <0>; + + keys@34 { + compatible = "adi,adp5588"; + reg = <0x34>; + + vcc-supply = <&vcc>; + interrupts = <21 IRQ_TYPE_EDGE_FALLING>; + interrupt-parent = <&gpio>; + reset-gpios = <&gpio 20 GPIO_ACTIVE_LOW>; + + keypad,num-rows = <1>; + keypad,num-columns = <9>; + linux,keymap = < + MATRIX_KEY(0x00, 0x00, KEY_1) + MATRIX_KEY(0x00, 0x01, KEY_2) + MATRIX_KEY(0x00, 0x02, KEY_3) + MATRIX_KEY(0x00, 0x03, KEY_4) + MATRIX_KEY(0x00, 0x04, KEY_5) + MATRIX_KEY(0x00, 0x05, KEY_6) + MATRIX_KEY(0x00, 0x06, KEY_7) + MATRIX_KEY(0x00, 0x07, KEY_8) + MATRIX_KEY(0x00, 0x08, KEY_9) + >; + }; + }; +... diff --git a/MAINTAINERS b/MAINTAINERS index 8404c18e6bcf1..aa71df8b699ba 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -556,6 +556,7 @@ M: Michael Hennerich S: Supported W: http://wiki.analog.com/ADP5588 W: https://ez.analog.com/linux-software-drivers +F: Documentation/devicetree/bindings/input/adi,adp5588.yaml F: drivers/input/keyboard/adp5588-keys.c ADP8860 BACKLIGHT DRIVER (ADP8860/ADP8861/ADP8863) -- GitLab From 0063aecc61e1beec09611d830f4aae5a90a96c54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 30 Aug 2022 21:09:32 -0700 Subject: [PATCH 0106/2223] Input: adp5588-keys - do not check for irq presence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's no need for an extra check for 'client-irq'. Just let it fail when calling 'request_irq()'.
Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220829131553.690063-7-nuno.sa@analog.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/adp5588-keys.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/input/keyboard/adp5588-keys.c b/drivers/input/keyboard/adp5588-keys.c index 77d538ed45974..9ff35910fc5da 100644 --- a/drivers/input/keyboard/adp5588-keys.c +++ b/drivers/input/keyboard/adp5588-keys.c @@ -728,11 +728,6 @@ static int adp5588_probe(struct i2c_client *client, return -EIO; } - if (!client->irq) { - dev_err(&client->dev, "no IRQ?\n"); - return -EINVAL; - } - kpad = devm_kzalloc(&client->dev, sizeof(*kpad), GFP_KERNEL); if (!kpad) return -ENOMEM; -- GitLab From e22d21d31f5d0ade2fa80c44d615ee488b723063 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 30 Aug 2022 21:09:54 -0700 Subject: [PATCH 0107/2223] Input: adp5588-keys - fix coding style warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Just some code cleanup regarding coding style. With the introduction of the bits.h macros changes in the code are indeed introduced. Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220829131553.690063-8-nuno.sa@analog.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/adp5588-keys.c | 98 +++++++++++++-------------- 1 file changed, 48 insertions(+), 50 deletions(-) diff --git a/drivers/input/keyboard/adp5588-keys.c b/drivers/input/keyboard/adp5588-keys.c index 9ff35910fc5da..565123e5894bb 100644 --- a/drivers/input/keyboard/adp5588-keys.c +++ b/drivers/input/keyboard/adp5588-keys.c @@ -8,6 +8,7 @@ * Copyright (C) 2008-2010 Analog Devices Inc. 
*/ +#include #include #include #include @@ -29,16 +30,16 @@ #define CFG 0x01 /* Configuration Register1 */ #define INT_STAT 0x02 /* Interrupt Status Register */ #define KEY_LCK_EC_STAT 0x03 /* Key Lock and Event Counter Register */ -#define Key_EVENTA 0x04 /* Key Event Register A */ -#define Key_EVENTB 0x05 /* Key Event Register B */ -#define Key_EVENTC 0x06 /* Key Event Register C */ -#define Key_EVENTD 0x07 /* Key Event Register D */ -#define Key_EVENTE 0x08 /* Key Event Register E */ -#define Key_EVENTF 0x09 /* Key Event Register F */ -#define Key_EVENTG 0x0A /* Key Event Register G */ -#define Key_EVENTH 0x0B /* Key Event Register H */ -#define Key_EVENTI 0x0C /* Key Event Register I */ -#define Key_EVENTJ 0x0D /* Key Event Register J */ +#define KEY_EVENTA 0x04 /* Key Event Register A */ +#define KEY_EVENTB 0x05 /* Key Event Register B */ +#define KEY_EVENTC 0x06 /* Key Event Register C */ +#define KEY_EVENTD 0x07 /* Key Event Register D */ +#define KEY_EVENTE 0x08 /* Key Event Register E */ +#define KEY_EVENTF 0x09 /* Key Event Register F */ +#define KEY_EVENTG 0x0A /* Key Event Register G */ +#define KEY_EVENTH 0x0B /* Key Event Register H */ +#define KEY_EVENTI 0x0C /* Key Event Register I */ +#define KEY_EVENTJ 0x0D /* Key Event Register J */ #define KP_LCK_TMR 0x0E /* Keypad Lock1 to Lock2 Timer */ #define UNLOCK1 0x0F /* Unlock Key1 */ #define UNLOCK2 0x10 /* Unlock Key2 */ @@ -66,9 +67,9 @@ #define GPIO_INT_LVL1 0x26 /* GPIO Edge/Level Detect */ #define GPIO_INT_LVL2 0x27 /* GPIO Edge/Level Detect */ #define GPIO_INT_LVL3 0x28 /* GPIO Edge/Level Detect */ -#define Debounce_DIS1 0x29 /* Debounce Disable */ -#define Debounce_DIS2 0x2A /* Debounce Disable */ -#define Debounce_DIS3 0x2B /* Debounce Disable */ +#define DEBOUNCE_DIS1 0x29 /* Debounce Disable */ +#define DEBOUNCE_DIS2 0x2A /* Debounce Disable */ +#define DEBOUNCE_DIS3 0x2B /* Debounce Disable */ #define GPIO_PULL1 0x2C /* GPIO Pull Disable */ #define GPIO_PULL2 0x2D /* GPIO Pull Disable */ 
#define GPIO_PULL3 0x2E /* GPIO Pull Disable */ @@ -91,27 +92,27 @@ #define ADP5588_DEVICE_ID_MASK 0xF /* Configuration Register1 */ -#define ADP5588_AUTO_INC (1 << 7) -#define ADP5588_GPIEM_CFG (1 << 6) -#define ADP5588_OVR_FLOW_M (1 << 5) -#define ADP5588_INT_CFG (1 << 4) -#define ADP5588_OVR_FLOW_IEN (1 << 3) -#define ADP5588_K_LCK_IM (1 << 2) -#define ADP5588_GPI_IEN (1 << 1) -#define ADP5588_KE_IEN (1 << 0) +#define ADP5588_AUTO_INC BIT(7) +#define ADP5588_GPIEM_CFG BIT(6) +#define ADP5588_OVR_FLOW_M BIT(5) +#define ADP5588_INT_CFG BIT(4) +#define ADP5588_OVR_FLOW_IEN BIT(3) +#define ADP5588_K_LCK_IM BIT(2) +#define ADP5588_GPI_IEN BIT(1) +#define ADP5588_KE_IEN BIT(0) /* Interrupt Status Register */ -#define ADP5588_CMP2_INT (1 << 5) -#define ADP5588_CMP1_INT (1 << 4) -#define ADP5588_OVR_FLOW_INT (1 << 3) -#define ADP5588_K_LCK_INT (1 << 2) -#define ADP5588_GPI_INT (1 << 1) -#define ADP5588_KE_INT (1 << 0) +#define ADP5588_CMP2_INT BIT(5) +#define ADP5588_CMP1_INT BIT(4) +#define ADP5588_OVR_FLOW_INT BIT(3) +#define ADP5588_K_LCK_INT BIT(2) +#define ADP5588_GPI_INT BIT(1) +#define ADP5588_KE_INT BIT(0) /* Key Lock and Event Counter Register */ -#define ADP5588_K_LCK_EN (1 << 6) +#define ADP5588_K_LCK_EN BIT(6) #define ADP5588_LCK21 0x30 -#define ADP5588_KEC 0xF +#define ADP5588_KEC GENMASK(3, 0) #define ADP5588_MAXGPIO 18 #define ADP5588_BANK(offs) ((offs) >> 3) @@ -158,10 +159,10 @@ #define ADP5588_GPIMAPSIZE_MAX (GPI_PIN_END - GPI_PIN_BASE + 1) /* Key Event Register xy */ -#define KEY_EV_PRESSED (1 << 7) -#define KEY_EV_MASK (0x7F) +#define KEY_EV_PRESSED BIT(7) +#define KEY_EV_MASK GENMASK(6, 0) -#define KP_SEL(x) (0xFFFF >> (16 - x)) /* 2^x-1 */ +#define KP_SEL(x) (BIT(x) - 1) /* 2^x-1 */ #define KEYP_MAX_EVENT 10 @@ -211,7 +212,7 @@ static int adp5588_write(struct i2c_client *client, u8 reg, u8 val) return i2c_smbus_write_byte_data(client, reg, val); } -static int adp5588_gpio_get_value(struct gpio_chip *chip, unsigned off) +static int 
adp5588_gpio_get_value(struct gpio_chip *chip, unsigned int off) { struct adp5588_kpad *kpad = gpiochip_get_data(chip); unsigned int bank = ADP5588_BANK(kpad->gpiomap[off]); @@ -231,7 +232,7 @@ static int adp5588_gpio_get_value(struct gpio_chip *chip, unsigned off) } static void adp5588_gpio_set_value(struct gpio_chip *chip, - unsigned off, int val) + unsigned int off, int val) { struct adp5588_kpad *kpad = gpiochip_get_data(chip); unsigned int bank = ADP5588_BANK(kpad->gpiomap[off]); @@ -244,8 +245,7 @@ static void adp5588_gpio_set_value(struct gpio_chip *chip, else kpad->dat_out[bank] &= ~bit; - adp5588_write(kpad->client, GPIO_DAT_OUT1 + bank, - kpad->dat_out[bank]); + adp5588_write(kpad->client, GPIO_DAT_OUT1 + bank, kpad->dat_out[bank]); mutex_unlock(&kpad->gpio_lock); } @@ -285,7 +285,7 @@ static int adp5588_gpio_set_config(struct gpio_chip *chip, unsigned int off, return ret; } -static int adp5588_gpio_direction_input(struct gpio_chip *chip, unsigned off) +static int adp5588_gpio_direction_input(struct gpio_chip *chip, unsigned int off) { struct adp5588_kpad *kpad = gpiochip_get_data(chip); unsigned int bank = ADP5588_BANK(kpad->gpiomap[off]); @@ -303,7 +303,7 @@ static int adp5588_gpio_direction_input(struct gpio_chip *chip, unsigned off) } static int adp5588_gpio_direction_output(struct gpio_chip *chip, - unsigned off, int val) + unsigned int off, int val) { struct adp5588_kpad *kpad = gpiochip_get_data(chip); unsigned int bank = ADP5588_BANK(kpad->gpiomap[off]); @@ -320,12 +320,11 @@ static int adp5588_gpio_direction_output(struct gpio_chip *chip, kpad->dat_out[bank] &= ~bit; ret = adp5588_write(kpad->client, GPIO_DAT_OUT1 + bank, - kpad->dat_out[bank]); + kpad->dat_out[bank]); if (ret) goto out_unlock; - ret = adp5588_write(kpad->client, GPIO_DIR1 + bank, - kpad->dir[bank]); + ret = adp5588_write(kpad->client, GPIO_DIR1 + bank, kpad->dir[bank]); out_unlock: mutex_unlock(&kpad->gpio_lock); @@ -525,7 +524,7 @@ static void adp5588_report_events(struct 
adp5588_kpad *kpad, int ev_cnt) int i; for (i = 0; i < ev_cnt; i++) { - int key = adp5588_read(kpad->client, Key_EVENTA + i); + int key = adp5588_read(kpad->client, KEY_EVENTA + i); int key_val = key & KEY_EV_MASK; int key_press = key & KEY_EV_PRESSED; @@ -625,21 +624,20 @@ static int adp5588_setup(struct adp5588_kpad *kpad) } for (i = 0; i < KEYP_MAX_EVENT; i++) { - ret = adp5588_read(client, Key_EVENTA); + ret = adp5588_read(client, KEY_EVENTA); if (ret) return ret; } ret = adp5588_write(client, INT_STAT, - ADP5588_CMP2_INT | ADP5588_CMP1_INT | - ADP5588_OVR_FLOW_INT | ADP5588_K_LCK_INT | - ADP5588_GPI_INT | ADP5588_KE_INT); /* Status is W1C */ + ADP5588_CMP2_INT | ADP5588_CMP1_INT | + ADP5588_OVR_FLOW_INT | ADP5588_K_LCK_INT | + ADP5588_GPI_INT | ADP5588_KE_INT); /* Status is W1C */ if (ret) return ret; return adp5588_write(client, CFG, ADP5588_INT_CFG | - ADP5588_OVR_FLOW_IEN | - ADP5588_KE_IEN); + ADP5588_OVR_FLOW_IEN | ADP5588_KE_IEN); } static int adp5588_fw_parse(struct adp5588_kpad *kpad) @@ -723,7 +721,7 @@ static int adp5588_probe(struct i2c_client *client, int error; if (!i2c_check_functionality(client->adapter, - I2C_FUNC_SMBUS_BYTE_DATA)) { + I2C_FUNC_SMBUS_BYTE_DATA)) { dev_err(&client->dev, "SMBUS Byte Data not Supported\n"); return -EIO; } @@ -747,7 +745,7 @@ static int adp5588_probe(struct i2c_client *client, if (ret < 0) return ret; - revid = (u8) ret & ADP5588_DEVICE_ID_MASK; + revid = ret & ADP5588_DEVICE_ID_MASK; if (WA_DELAYED_READOUT_REVID(revid)) kpad->delay = msecs_to_jiffies(WA_DELAYED_READOUT_TIME); -- GitLab From cfacae58646462c5afedc8b42ad72a0968678e7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 30 Aug 2022 21:12:54 -0700 Subject: [PATCH 0108/2223] Input: adp5588-keys - add optional reset gpio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Optionally reset the device during probe. 
Signed-off-by: Nuno Sá Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220829131553.690063-9-nuno.sa@analog.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/adp5588-keys.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/input/keyboard/adp5588-keys.c b/drivers/input/keyboard/adp5588-keys.c index 565123e5894bb..950abc1c25a33 100644 --- a/drivers/input/keyboard/adp5588-keys.c +++ b/drivers/input/keyboard/adp5588-keys.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -716,6 +717,7 @@ static int adp5588_probe(struct i2c_client *client, { struct adp5588_kpad *kpad; struct input_dev *input; + struct gpio_desc *gpio; unsigned int revid; int ret; int error; @@ -741,6 +743,16 @@ static int adp5588_probe(struct i2c_client *client, if (error) return error; + gpio = devm_gpiod_get_optional(&client->dev, "reset", GPIOD_OUT_HIGH); + if (IS_ERR(gpio)) + return PTR_ERR(gpio); + + if (gpio) { + fsleep(30); + gpiod_set_value_cansleep(gpio, 0); + fsleep(60); + } + ret = adp5588_read(client, DEV_ID); if (ret < 0) return ret; -- GitLab From 73d4a5423ecee8e108056134c53f82c7a95a90d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 30 Aug 2022 21:13:10 -0700 Subject: [PATCH 0109/2223] Input: adp5588-keys - add regulator support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support feeding VCC through a regulator. 
Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220829131553.690063-10-nuno.sa@analog.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/adp5588-keys.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/input/keyboard/adp5588-keys.c b/drivers/input/keyboard/adp5588-keys.c index 950abc1c25a33..1db6b28db7bad 100644 --- a/drivers/input/keyboard/adp5588-keys.c +++ b/drivers/input/keyboard/adp5588-keys.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -712,12 +713,18 @@ static int adp5588_fw_parse(struct adp5588_kpad *kpad) return 0; } +static void adp5588_disable_regulator(void *reg) +{ + regulator_disable(reg); +} + static int adp5588_probe(struct i2c_client *client, const struct i2c_device_id *id) { struct adp5588_kpad *kpad; struct input_dev *input; struct gpio_desc *gpio; + struct regulator *vcc; unsigned int revid; int ret; int error; @@ -743,6 +750,19 @@ static int adp5588_probe(struct i2c_client *client, if (error) return error; + vcc = devm_regulator_get(&client->dev, "vcc"); + if (IS_ERR(vcc)) + return PTR_ERR(vcc); + + error = regulator_enable(vcc); + if (error) + return error; + + error = devm_add_action_or_reset(&client->dev, + adp5588_disable_regulator, vcc); + if (error) + return error; + gpio = devm_gpiod_get_optional(&client->dev, "reset", GPIOD_OUT_HIGH); if (IS_ERR(gpio)) return PTR_ERR(gpio); -- GitLab From 4f35adaee07d182a4a7ef6b960c614ff3c5b4090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 30 Aug 2022 21:15:03 -0700 Subject: [PATCH 0110/2223] Input: adp5588-keys - use new PM macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the new PM macros (DEFINE_SIMPLE_DEV_PM_OPS() and pm_sleep_ptr()), the compiler has visibility to see that the functions are not used when !CONFIG_PM and hence, remove the dead code. As such, there's no need for '__maybe_unused'. 
Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220829131553.690063-11-nuno.sa@analog.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/adp5588-keys.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/input/keyboard/adp5588-keys.c b/drivers/input/keyboard/adp5588-keys.c index 1db6b28db7bad..7cd83c8e71108 100644 --- a/drivers/input/keyboard/adp5588-keys.c +++ b/drivers/input/keyboard/adp5588-keys.c @@ -827,7 +827,7 @@ static void adp5588_remove(struct i2c_client *client) /* all resources will be freed by devm */ } -static int __maybe_unused adp5588_suspend(struct device *dev) +static int adp5588_suspend(struct device *dev) { struct i2c_client *client = to_i2c_client(dev); @@ -836,7 +836,7 @@ static int __maybe_unused adp5588_suspend(struct device *dev) return 0; } -static int __maybe_unused adp5588_resume(struct device *dev) +static int adp5588_resume(struct device *dev) { struct i2c_client *client = to_i2c_client(dev); @@ -845,7 +845,7 @@ static int __maybe_unused adp5588_resume(struct device *dev) return 0; } -static SIMPLE_DEV_PM_OPS(adp5588_dev_pm_ops, adp5588_suspend, adp5588_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(adp5588_dev_pm_ops, adp5588_suspend, adp5588_resume); static const struct i2c_device_id adp5588_id[] = { { "adp5588-keys", 0 }, @@ -865,7 +865,7 @@ static struct i2c_driver adp5588_driver = { .driver = { .name = KBUILD_MODNAME, .of_match_table = adp5588_of_match, - .pm = &adp5588_dev_pm_ops, + .pm = pm_sleep_ptr(&adp5588_dev_pm_ops), }, .probe = adp5588_probe, .remove = adp5588_remove, -- GitLab From cf517fef601b9dde151f0afc27164d13bf1fd907 Mon Sep 17 00:00:00 2001 From: Billy Tsai Date: Thu, 18 Aug 2022 18:18:39 +0800 Subject: [PATCH 0111/2223] pinctrl: aspeed: Force to disable the function's signal When the driver wants to disable the signal of the function, it doesn't need to query the state of the mux function's signal on a pin. 
The following scenario misses disabling the signal: Ball | Default | P0 Signal | P0 Expression | Other -----+---------+-----------+-----------------------------+---------- E21 GPIOG0 SD2CLK SCU4B4[16]=1 & SCU450[1]=1 GPIOG0 -----+---------+-----------+-----------------------------+---------- B22 GPIOG1 SD2CMD SCU4B4[17]=1 & SCU450[1]=1 GPIOG1 -----+---------+-----------+-----------------------------+---------- Assume the register state is as follows: SCU4B4[16] == 1 & SCU4B4[17] == 1 & SCU450[1]==1 After the driver sets Ball E21 to GPIOG0: SCU4B4[16] == 0 & SCU4B4[17] == 1 & SCU450[1]==0 When the driver then wants to set Ball B22 to GPIOG1, the SD2CMD condition evaluates to false, so SCU4B4[17] is not cleared. Signed-off-by: Billy Tsai Acked-by: Andrew Jeffery Link: https://lore.kernel.org/r/20220818101839.28860-1-billy_tsai@aspeedtech.com Signed-off-by: Linus Walleij --- drivers/pinctrl/aspeed/pinctrl-aspeed.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/pinctrl/aspeed/pinctrl-aspeed.c b/drivers/pinctrl/aspeed/pinctrl-aspeed.c index 83d47ff1cea8f..a30912a92f057 100644 --- a/drivers/pinctrl/aspeed/pinctrl-aspeed.c +++ b/drivers/pinctrl/aspeed/pinctrl-aspeed.c @@ -92,19 +92,10 @@ static int aspeed_sig_expr_enable(struct aspeed_pinmux_data *ctx, static int aspeed_sig_expr_disable(struct aspeed_pinmux_data *ctx, const struct aspeed_sig_expr *expr) { - int ret; - pr_debug("Disabling signal %s for %s\n", expr->signal, expr->function); - ret = aspeed_sig_expr_eval(ctx, expr, true); - if (ret < 0) - return ret; - - if (ret) - return aspeed_sig_expr_set(ctx, expr, false); - - return 0; + return aspeed_sig_expr_set(ctx, expr, false); } /** -- GitLab From 3160b37e5cb695e866e06c3fdbc385846b569294 Mon Sep 17 00:00:00 2001 From: Basavaraj Natikar Date: Tue, 30 Aug 2022 16:35:25 +0530 Subject: [PATCH 0112/2223] pinctrl: amd: change dev_warn to dev_dbg for additional feature support Use dev_dbg instead of dev_warn for 
additional support of pinmux feature. Signed-off-by: Basavaraj Natikar Link: https://lore.kernel.org/r/20220830110525.1933198-1-Basavaraj.Natikar@amd.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index fda41907c4f17..ecf65237b2632 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -1051,13 +1051,13 @@ static void amd_get_iomux_res(struct amd_gpio *gpio_dev) index = device_property_match_string(dev, "pinctrl-resource-names", "iomux"); if (index < 0) { - dev_warn(dev, "failed to get iomux index\n"); + dev_dbg(dev, "iomux not supported\n"); goto out_no_pinmux; } gpio_dev->iomux_base = devm_platform_ioremap_resource(gpio_dev->pdev, index); if (IS_ERR(gpio_dev->iomux_base)) { - dev_warn(dev, "Failed to get iomux %d io resource\n", index); + dev_dbg(dev, "iomux not supported %d io resource\n", index); goto out_no_pinmux; } -- GitLab From 87c2a29a6bf1a078d82427d42a2480a61814f8e3 Mon Sep 17 00:00:00 2001 From: Francesco Dolcini Date: Tue, 30 Aug 2022 16:27:27 +0200 Subject: [PATCH 0113/2223] pinctrl: imx8m: kconfig: Depends on SOC_IMX8M Change the PINCTRL_IMX8M* dependency from just ARCH_MXC to SOC_IMX8M, as is done for the other PINCTRL_IMX* Kconfig entries. This avoids polluting the config when SOC_IMX8M is not enabled. 
Signed-off-by: Francesco Dolcini Reviewed-by: Fabio Estevam Link: https://lore.kernel.org/r/20220830142727.313080-1-francesco.dolcini@toradex.com Signed-off-by: Linus Walleij --- drivers/pinctrl/freescale/Kconfig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pinctrl/freescale/Kconfig b/drivers/pinctrl/freescale/Kconfig index d96b1130efd39..365fcff8e470d 100644 --- a/drivers/pinctrl/freescale/Kconfig +++ b/drivers/pinctrl/freescale/Kconfig @@ -119,28 +119,28 @@ config PINCTRL_IMX7ULP config PINCTRL_IMX8MM tristate "IMX8MM pinctrl driver" - depends on ARCH_MXC + depends on SOC_IMX8M select PINCTRL_IMX help Say Y here to enable the imx8mm pinctrl driver config PINCTRL_IMX8MN tristate "IMX8MN pinctrl driver" - depends on ARCH_MXC + depends on SOC_IMX8M select PINCTRL_IMX help Say Y here to enable the imx8mn pinctrl driver config PINCTRL_IMX8MP tristate "IMX8MP pinctrl driver" - depends on ARCH_MXC + depends on SOC_IMX8M select PINCTRL_IMX help Say Y here to enable the imx8mp pinctrl driver config PINCTRL_IMX8MQ tristate "IMX8MQ pinctrl driver" - depends on ARCH_MXC + depends on SOC_IMX8M select PINCTRL_IMX help Say Y here to enable the imx8mq pinctrl driver -- GitLab From 8a32cff217b7a0f1ab3b744fc9cd0626f08f7f15 Mon Sep 17 00:00:00 2001 From: Max Krummenacher Date: Wed, 31 Aug 2022 10:16:25 -0700 Subject: [PATCH 0114/2223] Input: colibri-vf50-ts - don't depend on VF610_ADC Any IIO ADC can be used with the driver, so do not depend on VF610_ADC. 
Signed-off-by: Max Krummenacher Signed-off-by: Francesco Dolcini Link: https://lore.kernel.org/r/20220712101619.326120-2-francesco.dolcini@toradex.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig index 2d70c945b20a9..dc90a3ea51eed 100644 --- a/drivers/input/touchscreen/Kconfig +++ b/drivers/input/touchscreen/Kconfig @@ -1335,7 +1335,7 @@ config TOUCHSCREEN_ZFORCE config TOUCHSCREEN_COLIBRI_VF50 tristate "Toradex Colibri on board touchscreen driver" - depends on IIO && VF610_ADC + depends on IIO depends on GPIOLIB || COMPILE_TEST help Say Y here if you have a Colibri VF50 and plan to use -- GitLab From a212f5ca5718d6f8c246d90e231aa76beb05bc23 Mon Sep 17 00:00:00 2001 From: Max Krummenacher Date: Wed, 31 Aug 2022 10:17:01 -0700 Subject: [PATCH 0115/2223] dt-bindings: input: colibri-vf50-ts: Improve documentation Clarify properties definition, drop unused pinctrl-2 state 'gpio'. 
Signed-off-by: Max Krummenacher Signed-off-by: Francesco Dolcini Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220712101619.326120-3-francesco.dolcini@toradex.com Signed-off-by: Dmitry Torokhov --- .../input/touchscreen/colibri-vf50-ts.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/devicetree/bindings/input/touchscreen/colibri-vf50-ts.txt b/Documentation/devicetree/bindings/input/touchscreen/colibri-vf50-ts.txt index 2e1490a8fe74e..ca304357c374a 100644 --- a/Documentation/devicetree/bindings/input/touchscreen/colibri-vf50-ts.txt +++ b/Documentation/devicetree/bindings/input/touchscreen/colibri-vf50-ts.txt @@ -3,15 +3,16 @@ Required Properties: - compatible must be toradex,vf50-touchscreen - io-channels: adc channels being used by the Colibri VF50 module + IIO ADC for Y-, X-, Y+, X+ connections - xp-gpios: FET gate driver for input of X+ - xm-gpios: FET gate driver for input of X- - yp-gpios: FET gate driver for input of Y+ - ym-gpios: FET gate driver for input of Y- -- interrupts: pen irq interrupt for touch detection -- pinctrl-names: "idle", "default", "gpios" -- pinctrl-0: pinctrl node for pen/touch detection state pinmux +- interrupts: pen irq interrupt for touch detection, signal from X plate +- pinctrl-names: "idle", "default" +- pinctrl-0: pinctrl node for pen/touch detection, pinctrl must provide + pull-up resistor on X+, X-. 
- pinctrl-1: pinctrl node for X/Y and pressure measurement (ADC) state pinmux -- pinctrl-2: pinctrl node for gpios functioning as FET gate drivers - vf50-ts-min-pressure: pressure level at which to stop measuring X/Y values Example: @@ -26,9 +27,8 @@ Example: ym-gpios = <&gpio0 4 GPIO_ACTIVE_HIGH>; interrupt-parent = <&gpio0>; interrupts = <8 IRQ_TYPE_LEVEL_LOW>; - pinctrl-names = "idle","default","gpios"; - pinctrl-0 = <&pinctrl_touchctrl_idle>; - pinctrl-1 = <&pinctrl_touchctrl_default>; - pinctrl-2 = <&pinctrl_touchctrl_gpios>; + pinctrl-names = "idle","default"; + pinctrl-0 = <&pinctrl_touchctrl_idle>, <&pinctrl_touchctrl_gpios>; + pinctrl-1 = <&pinctrl_touchctrl_default>, <&pinctrl_touchctrl_gpios>; vf50-ts-min-pressure = <200>; }; -- GitLab From ed3d5bd20dcdfdbe110feeabf120cba7bd329ad8 Mon Sep 17 00:00:00 2001 From: ChiYuan Huang Date: Fri, 12 Aug 2022 12:36:39 -0700 Subject: [PATCH 0116/2223] Input: rt5120 - add power key support Add RT5120 PMIC power key support. Signed-off-by: ChiYuan Huang Link: https://lore.kernel.org/r/1660100142-32493-4-git-send-email-u0084500@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/Kconfig | 9 +++ drivers/input/misc/Makefile | 1 + drivers/input/misc/rt5120-pwrkey.c | 120 +++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+) create mode 100644 drivers/input/misc/rt5120-pwrkey.c diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig index 968240288c61c..9f088900f863b 100644 --- a/drivers/input/misc/Kconfig +++ b/drivers/input/misc/Kconfig @@ -909,6 +909,15 @@ config INPUT_SC27XX_VIBRA To compile this driver as a module, choose M here. The module will be called sc27xx_vibra. +config INPUT_RT5120_PWRKEY + tristate "RT5120 PMIC power key support" + depends on MFD_RT5120 || COMPILE_TEST + help + This enables support for RT5120 PMIC power key driver. + + To compile this driver as a module, choose M here. the module will + be called rt5120-pwrkey. 
+ config INPUT_STPMIC1_ONKEY tristate "STPMIC1 PMIC Onkey support" depends on MFD_STPMIC1 diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile index 9eea13e98d480..6abefc41037b5 100644 --- a/drivers/input/misc/Makefile +++ b/drivers/input/misc/Makefile @@ -70,6 +70,7 @@ obj-$(CONFIG_INPUT_RAVE_SP_PWRBUTTON) += rave-sp-pwrbutton.o obj-$(CONFIG_INPUT_RB532_BUTTON) += rb532_button.o obj-$(CONFIG_INPUT_REGULATOR_HAPTIC) += regulator-haptic.o obj-$(CONFIG_INPUT_RETU_PWRBUTTON) += retu-pwrbutton.o +obj-$(CONFIG_INPUT_RT5120_PWRKEY) += rt5120-pwrkey.o obj-$(CONFIG_INPUT_AXP20X_PEK) += axp20x-pek.o obj-$(CONFIG_INPUT_GPIO_ROTARY_ENCODER) += rotary_encoder.o obj-$(CONFIG_INPUT_RK805_PWRKEY) += rk805-pwrkey.o diff --git a/drivers/input/misc/rt5120-pwrkey.c b/drivers/input/misc/rt5120-pwrkey.c new file mode 100644 index 0000000000000..8a8c1aeeed050 --- /dev/null +++ b/drivers/input/misc/rt5120-pwrkey.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 Richtek Technology Corp. 
+ * Author: ChiYuan Huang + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define RT5120_REG_INTSTAT 0x1E +#define RT5120_PWRKEYSTAT_MASK BIT(7) + +struct rt5120_priv { + struct regmap *regmap; + struct input_dev *input; +}; + +static irqreturn_t rt5120_pwrkey_handler(int irq, void *devid) +{ + struct rt5120_priv *priv = devid; + unsigned int stat; + int error; + + error = regmap_read(priv->regmap, RT5120_REG_INTSTAT, &stat); + if (error) + return IRQ_NONE; + + input_report_key(priv->input, KEY_POWER, + !(stat & RT5120_PWRKEYSTAT_MASK)); + input_sync(priv->input); + + return IRQ_HANDLED; +} + +static int rt5120_pwrkey_probe(struct platform_device *pdev) +{ + struct rt5120_priv *priv; + struct device *dev = &pdev->dev; + int press_irq, release_irq; + int error; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->regmap = dev_get_regmap(dev->parent, NULL); + if (!priv->regmap) { + dev_err(dev, "Failed to init regmap\n"); + return -ENODEV; + } + + press_irq = platform_get_irq_byname(pdev, "pwrkey-press"); + if (press_irq < 0) + return press_irq; + + release_irq = platform_get_irq_byname(pdev, "pwrkey-release"); + if (release_irq < 0) + return release_irq; + + /* Make input device be device resource managed */ + priv->input = devm_input_allocate_device(dev); + if (!priv->input) + return -ENOMEM; + + priv->input->name = "rt5120_pwrkey"; + priv->input->phys = "rt5120_pwrkey/input0"; + priv->input->id.bustype = BUS_I2C; + input_set_capability(priv->input, EV_KEY, KEY_POWER); + + error = input_register_device(priv->input); + if (error) { + dev_err(dev, "Failed to register input device: %d\n", error); + return error; + } + + error = devm_request_threaded_irq(dev, press_irq, + NULL, rt5120_pwrkey_handler, + 0, "pwrkey-press", priv); + if (error) { + dev_err(dev, + "Failed to register pwrkey press irq: %d\n", error); + return error; + } + + error = devm_request_threaded_irq(dev, 
release_irq, + NULL, rt5120_pwrkey_handler, + 0, "pwrkey-release", priv); + if (error) { + dev_err(dev, + "Failed to register pwrkey release irq: %d\n", error); + return error; + } + + return 0; +} + +static const struct of_device_id r5120_pwrkey_match_table[] = { + { .compatible = "richtek,rt5120-pwrkey" }, + {} +}; +MODULE_DEVICE_TABLE(of, r5120_pwrkey_match_table); + +static struct platform_driver rt5120_pwrkey_driver = { + .driver = { + .name = "rt5120-pwrkey", + .of_match_table = r5120_pwrkey_match_table, + }, + .probe = rt5120_pwrkey_probe, +}; +module_platform_driver(rt5120_pwrkey_driver); + +MODULE_AUTHOR("ChiYuan Huang "); +MODULE_DESCRIPTION("Richtek RT5120 power key driver"); +MODULE_LICENSE("GPL"); -- GitLab From e34a0425b8ef524355811e7408dc1d53d08dc538 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 26 Aug 2022 16:34:01 -0300 Subject: [PATCH 0117/2223] vfio/pci: Split linux/vfio_pci_core.h The header in include/linux should have only the exported interface for other vfio_pci modules to use. Internal definitions for vfio_pci.ko should be in a "priv" header alongside the .c files. Move the internal declarations out of vfio_pci_core.h. They either move to vfio_pci_priv.h or to the C file that is the only user. 
Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/1-v2-1bd95d72f298+e0e-vfio_pci_priv_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci.c | 2 +- drivers/vfio/pci/vfio_pci_config.c | 2 +- drivers/vfio/pci/vfio_pci_core.c | 19 +++- drivers/vfio/pci/vfio_pci_igd.c | 2 +- drivers/vfio/pci/vfio_pci_intrs.c | 16 +++- drivers/vfio/pci/vfio_pci_priv.h | 106 +++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_rdwr.c | 2 +- drivers/vfio/pci/vfio_pci_zdev.c | 2 +- include/linux/vfio_pci_core.h | 134 +---------------------------- 9 files changed, 145 insertions(+), 140 deletions(-) create mode 100644 drivers/vfio/pci/vfio_pci_priv.h diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 4d1a97415a27b..d9b5c03f8d5b2 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -25,7 +25,7 @@ #include #include -#include +#include "vfio_pci_priv.h" #define DRIVER_AUTHOR "Alex Williamson " #define DRIVER_DESC "VFIO PCI - User Level meta-driver" diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 442d3ba4122b2..5f43b28075eec 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -26,7 +26,7 @@ #include #include -#include +#include "vfio_pci_priv.h" /* Fake capability ID for standard config space */ #define PCI_CAP_ID_BASIC 0 diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index c8d3b0450fb35..04180a0836cc9 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -28,7 +28,7 @@ #include #include -#include +#include "vfio_pci_priv.h" #define DRIVER_AUTHOR "Alex Williamson " #define DRIVER_DESC "core driver for VFIO based PCI devices" @@ -41,6 +41,23 @@ static bool disable_idle_d3; static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex); static LIST_HEAD(vfio_pci_sriov_pfs); +struct vfio_pci_dummy_resource { + struct resource 
resource; + int index; + struct list_head res_next; +}; + +struct vfio_pci_vf_token { + struct mutex lock; + uuid_t uuid; + int users; +}; + +struct vfio_pci_mmap_vma { + struct vm_area_struct *vma; + struct list_head vma_next; +}; + static inline bool vfio_vga_disabled(void) { #ifdef CONFIG_VFIO_PCI_VGA diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index 352c725ccf181..8177e9a1da3bf 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -15,7 +15,7 @@ #include #include -#include +#include "vfio_pci_priv.h" #define OPREGION_SIGNATURE "IntelGraphicsMem" #define OPREGION_SIZE (8 * 1024) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 6069a11fb51ac..32d014421c1f6 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -20,7 +20,21 @@ #include #include -#include +#include "vfio_pci_priv.h" + +#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) +#define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX) +#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) +#define irq_is(vdev, type) (vdev->irq_type == type) + +struct vfio_pci_irq_ctx { + struct eventfd_ctx *trigger; + struct virqfd *unmask; + struct virqfd *mask; + char *name; + bool masked; + struct irq_bypass_producer producer; +}; /* * INTx diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h new file mode 100644 index 0000000000000..ac701f05bef02 --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef VFIO_PCI_PRIV_H +#define VFIO_PCI_PRIV_H + +#include + +/* Special capability IDs predefined access */ +#define PCI_CAP_ID_INVALID 0xFF /* default raw access */ +#define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ + +/* Cap maximum number of ioeventfds per device (arbitrary) */ +#define VFIO_PCI_IOEVENTFD_MAX 1000 + +struct 
vfio_pci_ioeventfd { + struct list_head next; + struct vfio_pci_core_device *vdev; + struct virqfd *virqfd; + void __iomem *addr; + uint64_t data; + loff_t pos; + int bar; + int count; + bool test_mem; +}; + +#define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX) + +void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); +void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); + +int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, + unsigned index, unsigned start, unsigned count, + void *data); + +ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); + +ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); + +#ifdef CONFIG_VFIO_PCI_VGA +ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); +#else +static inline ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite) +{ + return -EINVAL; +} +#endif + +long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, + uint64_t data, int count, int fd); + +int vfio_pci_init_perm_bits(void); +void vfio_pci_uninit_perm_bits(void); + +int vfio_config_init(struct vfio_pci_core_device *vdev); +void vfio_config_free(struct vfio_pci_core_device *vdev); + +int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, + pci_power_t state); + +bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); +void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev); +u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); +void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, + u16 cmd); + +#ifdef CONFIG_VFIO_PCI_IGD +int vfio_pci_igd_init(struct vfio_pci_core_device *vdev); +#else +static inline int vfio_pci_igd_init(struct 
vfio_pci_core_device *vdev) +{ + return -ENODEV; +} +#endif + +#ifdef CONFIG_VFIO_PCI_ZDEV_KVM +int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps); +int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev); +void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev); +#else +static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) +{ + return -ENODEV; +} + +static inline int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev) +{ + return 0; +} + +static inline void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev) +{} +#endif + +static inline bool vfio_pci_is_vga(struct pci_dev *pdev) +{ + return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; +} + +#endif diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 82ac1569deb05..d5e9883c1eee1 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -17,7 +17,7 @@ #include #include -#include +#include "vfio_pci_priv.h" #ifdef __LITTLE_ENDIAN #define vfio_ioread64 ioread64 diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c index e163aa9f61444..0bff24f0d4d71 100644 --- a/drivers/vfio/pci/vfio_pci_zdev.c +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -15,7 +15,7 @@ #include #include -#include +#include "vfio_pci_priv.h" /* * Add the Base PCI Function information to the device info region. 
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 5579ece4347bd..9d18b832e61a0 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -20,39 +20,10 @@ #define VFIO_PCI_CORE_H #define VFIO_PCI_OFFSET_SHIFT 40 - #define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) -/* Special capability IDs predefined access */ -#define PCI_CAP_ID_INVALID 0xFF /* default raw access */ -#define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ - -/* Cap maximum number of ioeventfds per device (arbitrary) */ -#define VFIO_PCI_IOEVENTFD_MAX 1000 - -struct vfio_pci_ioeventfd { - struct list_head next; - struct vfio_pci_core_device *vdev; - struct virqfd *virqfd; - void __iomem *addr; - uint64_t data; - loff_t pos; - int bar; - int count; - bool test_mem; -}; - -struct vfio_pci_irq_ctx { - struct eventfd_ctx *trigger; - struct virqfd *unmask; - struct virqfd *mask; - char *name; - bool masked; - struct irq_bypass_producer producer; -}; - struct vfio_pci_core_device; struct vfio_pci_region; @@ -78,23 +49,6 @@ struct vfio_pci_region { u32 flags; }; -struct vfio_pci_dummy_resource { - struct resource resource; - int index; - struct list_head res_next; -}; - -struct vfio_pci_vf_token { - struct mutex lock; - uuid_t uuid; - int users; -}; - -struct vfio_pci_mmap_vma { - struct vm_area_struct *vma; - struct list_head vma_next; -}; - struct vfio_pci_core_device { struct vfio_device vdev; struct pci_dev *pdev; @@ -141,92 +95,11 @@ struct vfio_pci_core_device { struct rw_semaphore memory_lock; }; -#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) -#define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX) -#define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX) -#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) 
-#define irq_is(vdev, type) (vdev->irq_type == type) - -void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); -void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); - -int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, - uint32_t flags, unsigned index, - unsigned start, unsigned count, void *data); - -ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, - char __user *buf, size_t count, - loff_t *ppos, bool iswrite); - -ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, - size_t count, loff_t *ppos, bool iswrite); - -#ifdef CONFIG_VFIO_PCI_VGA -ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, - size_t count, loff_t *ppos, bool iswrite); -#else -static inline ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, - char __user *buf, size_t count, - loff_t *ppos, bool iswrite) -{ - return -EINVAL; -} -#endif - -long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, - uint64_t data, int count, int fd); - -int vfio_pci_init_perm_bits(void); -void vfio_pci_uninit_perm_bits(void); - -int vfio_config_init(struct vfio_pci_core_device *vdev); -void vfio_config_free(struct vfio_pci_core_device *vdev); - +/* Will be exported for vfio pci drivers usage */ int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, unsigned int type, unsigned int subtype, const struct vfio_pci_regops *ops, size_t size, u32 flags, void *data); - -int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, - pci_power_t state); - -bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); -void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev); -u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); -void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, - u16 cmd); - -#ifdef CONFIG_VFIO_PCI_IGD -int vfio_pci_igd_init(struct vfio_pci_core_device *vdev); -#else -static inline int vfio_pci_igd_init(struct 
vfio_pci_core_device *vdev) -{ - return -ENODEV; -} -#endif - -#ifdef CONFIG_VFIO_PCI_ZDEV_KVM -int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, - struct vfio_info_cap *caps); -int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev); -void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev); -#else -static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, - struct vfio_info_cap *caps) -{ - return -ENODEV; -} - -static inline int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev) -{ - return 0; -} - -static inline void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev) -{} -#endif - -/* Will be exported for vfio pci drivers usage */ void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, bool is_disable_idle_d3); void vfio_pci_core_close_device(struct vfio_device *core_vdev); @@ -256,9 +129,4 @@ void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state); -static inline bool vfio_pci_is_vga(struct pci_dev *pdev) -{ - return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; -} - #endif /* VFIO_PCI_CORE_H */ -- GitLab From 1e979ef5df8b7b604a625343a179b812a7984068 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 26 Aug 2022 16:34:02 -0300 Subject: [PATCH 0118/2223] vfio/pci: Rename vfio_pci_register_dev_region() As this is part of the vfio_pci_core component it should be called vfio_pci_core_register_dev_region() like everything else exported from this module. 
Suggested-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/2-v2-1bd95d72f298+e0e-vfio_pci_priv_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 10 +++++----- drivers/vfio/pci/vfio_pci_igd.c | 6 +++--- include/linux/vfio_pci_core.h | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 04180a0836cc9..84279b6941bc2 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -662,10 +662,10 @@ static int msix_mmappable_cap(struct vfio_pci_core_device *vdev, return vfio_info_add_capability(caps, &header, sizeof(header)); } -int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, - unsigned int type, unsigned int subtype, - const struct vfio_pci_regops *ops, - size_t size, u32 flags, void *data) +int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype, + const struct vfio_pci_regops *ops, + size_t size, u32 flags, void *data) { struct vfio_pci_region *region; @@ -687,7 +687,7 @@ int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, return 0; } -EXPORT_SYMBOL_GPL(vfio_pci_register_dev_region); +EXPORT_SYMBOL_GPL(vfio_pci_core_register_dev_region); long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index 8177e9a1da3bf..5e6ca59269548 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -257,7 +257,7 @@ static int vfio_pci_igd_opregion_init(struct vfio_pci_core_device *vdev) } } - ret = vfio_pci_register_dev_region(vdev, + ret = vfio_pci_core_register_dev_region(vdev, PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &vfio_pci_igd_regops, size, 
VFIO_REGION_INFO_FLAG_READ, opregionvbt); @@ -402,7 +402,7 @@ static int vfio_pci_igd_cfg_init(struct vfio_pci_core_device *vdev) return -EINVAL; } - ret = vfio_pci_register_dev_region(vdev, + ret = vfio_pci_core_register_dev_region(vdev, PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, &vfio_pci_igd_cfg_regops, host_bridge->cfg_size, @@ -422,7 +422,7 @@ static int vfio_pci_igd_cfg_init(struct vfio_pci_core_device *vdev) return -EINVAL; } - ret = vfio_pci_register_dev_region(vdev, + ret = vfio_pci_core_register_dev_region(vdev, PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, &vfio_pci_igd_cfg_regops, lpc_bridge->cfg_size, diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 9d18b832e61a0..e5cf0d3313a69 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -96,10 +96,10 @@ struct vfio_pci_core_device { }; /* Will be exported for vfio pci drivers usage */ -int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, - unsigned int type, unsigned int subtype, - const struct vfio_pci_regops *ops, - size_t size, u32 flags, void *data); +int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype, + const struct vfio_pci_regops *ops, + size_t size, u32 flags, void *data); void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, bool is_disable_idle_d3); void vfio_pci_core_close_device(struct vfio_device *core_vdev); -- GitLab From c462a8c5d98877b76cf229d3d605d2a865aa9c9e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 26 Aug 2022 16:34:03 -0300 Subject: [PATCH 0119/2223] vfio/pci: Simplify the is_intx/msi/msix/etc defines Only three of these are actually used, simplify to three inline functions, and open code the if statement in vfio_pci_config.c. 
Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/3-v2-1bd95d72f298+e0e-vfio_pci_priv_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_config.c | 2 +- drivers/vfio/pci/vfio_pci_intrs.c | 22 +++++++++++++++++----- drivers/vfio/pci/vfio_pci_priv.h | 2 -- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 5f43b28075eec..4a350421c5f62 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -1166,7 +1166,7 @@ static int vfio_msi_config_write(struct vfio_pci_core_device *vdev, int pos, flags = le16_to_cpu(*pflags); /* MSI is enabled via ioctl */ - if (!is_msi(vdev)) + if (vdev->irq_type != VFIO_PCI_MSI_IRQ_INDEX) flags &= ~PCI_MSI_FLAGS_ENABLE; /* Check queue size */ diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 32d014421c1f6..8cb987ef3c476 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -22,11 +22,6 @@ #include "vfio_pci_priv.h" -#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) -#define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX) -#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) -#define irq_is(vdev, type) (vdev->irq_type == type) - struct vfio_pci_irq_ctx { struct eventfd_ctx *trigger; struct virqfd *unmask; @@ -36,6 +31,23 @@ struct vfio_pci_irq_ctx { struct irq_bypass_producer producer; }; +static bool irq_is(struct vfio_pci_core_device *vdev, int type) +{ + return vdev->irq_type == type; +} + +static bool is_intx(struct vfio_pci_core_device *vdev) +{ + return vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX; +} + +static bool is_irq_none(struct vfio_pci_core_device *vdev) +{ + return !(vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX || + vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX || + vdev->irq_type == 
VFIO_PCI_MSIX_IRQ_INDEX); +} + /* * INTx */ diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index ac701f05bef02..4830fb01a1caa 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -23,8 +23,6 @@ struct vfio_pci_ioeventfd { bool test_mem; }; -#define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX) - void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); -- GitLab From 16f4cbd9e156193312db19a68fe37c09854f77a8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 31 Aug 2022 17:15:56 -0300 Subject: [PATCH 0120/2223] vfio-pci: Fix vfio_pci_ioeventfd() to return int This only returns 0 or -ERRNO, it should return int like all the other ioctl dispatch functions. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-0f9e632d54fb+d6-vfio_ioctl_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_priv.h | 4 ++-- drivers/vfio/pci/vfio_pci_rdwr.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 4830fb01a1caa..58b8d34c162cd 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -48,8 +48,8 @@ static inline ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, } #endif -long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, - uint64_t data, int count, int fd); +int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, + uint64_t data, int count, int fd); int vfio_pci_init_perm_bits(void); void vfio_pci_uninit_perm_bits(void); diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index d5e9883c1eee1..e352a033b4aef 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -412,8 +412,8 @@ static void vfio_pci_ioeventfd_thread(void *opaque, void 
*unused) vfio_pci_ioeventfd_do_write(ioeventfd, ioeventfd->test_mem); } -long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, - uint64_t data, int count, int fd) +int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, + uint64_t data, int count, int fd) { struct pci_dev *pdev = vdev->pdev; loff_t pos = offset & VFIO_PCI_OFFSET_MASK; -- GitLab From 2ecf3b58ed7bc52ad58e02bb1596130fa6e6da53 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 31 Aug 2022 17:15:57 -0300 Subject: [PATCH 0121/2223] vfio-pci: Break up vfio_pci_core_ioctl() into one function per ioctl 500 lines is a bit long for a single function, move the bodies of each ioctl into separate functions and leave behind a switch statement to dispatch them. This patch just adds the function declarations and does not fix the indenting. The next patch will restore the indenting. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v2-0f9e632d54fb+d6-vfio_ioctl_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 97 ++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 29 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 84279b6941bc2..85b9720e77d28 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -689,21 +689,15 @@ int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, } EXPORT_SYMBOL_GPL(vfio_pci_core_register_dev_region); -long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) +static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, + void __user *arg) { - struct vfio_pci_core_device *vdev = - container_of(core_vdev, struct vfio_pci_core_device, vdev); - unsigned long minsz; - - if (cmd == VFIO_DEVICE_GET_INFO) { + unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs); struct vfio_device_info info; struct 
vfio_info_cap caps = { .buf = NULL, .size = 0 }; unsigned long capsz; int ret; - minsz = offsetofend(struct vfio_device_info, num_irqs); - /* For backward compatibility, cannot require this */ capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); @@ -752,15 +746,17 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; +} - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { +static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, + void __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct pci_dev *pdev = vdev->pdev; struct vfio_region_info info; struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; int i, ret; - minsz = offsetofend(struct vfio_region_info, offset); - if (copy_from_user(&info, (void __user *)arg, minsz)) return -EFAULT; @@ -897,12 +893,14 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; +} - } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { +static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, + void __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_irq_info, count); struct vfio_irq_info info; - minsz = offsetofend(struct vfio_irq_info, count); - if (copy_from_user(&info, (void __user *)arg, minsz)) return -EFAULT; @@ -933,15 +931,17 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return copy_to_user((void __user *)arg, &info, minsz) ? 
-EFAULT : 0; +} - } else if (cmd == VFIO_DEVICE_SET_IRQS) { +static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev, + void __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_irq_set, count); struct vfio_irq_set hdr; u8 *data = NULL; int max, ret = 0; size_t data_size = 0; - minsz = offsetofend(struct vfio_irq_set, count); - if (copy_from_user(&hdr, (void __user *)arg, minsz)) return -EFAULT; @@ -968,8 +968,11 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, kfree(data); return ret; +} - } else if (cmd == VFIO_DEVICE_RESET) { +static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, + void __user *arg) +{ int ret; if (!vdev->reset_works) @@ -993,16 +996,20 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, up_write(&vdev->memory_lock); return ret; +} - } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { +static int +vfio_pci_ioctl_get_pci_hot_reset_info(struct vfio_pci_core_device *vdev, + void __user *arg) +{ + unsigned long minsz = + offsetofend(struct vfio_pci_hot_reset_info, count); struct vfio_pci_hot_reset_info hdr; struct vfio_pci_fill_info fill = { 0 }; struct vfio_pci_dependent_device *devices = NULL; bool slot = false; int ret = 0; - minsz = offsetofend(struct vfio_pci_hot_reset_info, count); - if (copy_from_user(&hdr, (void __user *)arg, minsz)) return -EFAULT; @@ -1066,8 +1073,12 @@ reset_info_exit: kfree(devices); return ret; +} - } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { +static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, + void __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count); struct vfio_pci_hot_reset hdr; int32_t *group_fds; struct file **files; @@ -1075,8 +1086,6 @@ reset_info_exit: bool slot = false; int file_idx, count = 0, ret = 0; - minsz = offsetofend(struct vfio_pci_hot_reset, count); - if (copy_from_user(&hdr, (void __user *)arg, minsz)) return -EFAULT; @@ -1160,12 +1169,15 @@ 
hot_reset_release: kfree(files); return ret; - } else if (cmd == VFIO_DEVICE_IOEVENTFD) { +} + +static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev, + void __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_device_ioeventfd, fd); struct vfio_device_ioeventfd ioeventfd; int count; - minsz = offsetofend(struct vfio_device_ioeventfd, fd); - if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) return -EFAULT; @@ -1182,8 +1194,35 @@ hot_reset_release: return vfio_pci_ioeventfd(vdev, ioeventfd.offset, ioeventfd.data, count, ioeventfd.fd); +} + +long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, + unsigned long arg) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + void __user *uarg = (void __user *)arg; + + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + return vfio_pci_ioctl_get_info(vdev, uarg); + case VFIO_DEVICE_GET_IRQ_INFO: + return vfio_pci_ioctl_get_irq_info(vdev, uarg); + case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: + return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg); + case VFIO_DEVICE_GET_REGION_INFO: + return vfio_pci_ioctl_get_region_info(vdev, uarg); + case VFIO_DEVICE_IOEVENTFD: + return vfio_pci_ioctl_ioeventfd(vdev, uarg); + case VFIO_DEVICE_PCI_HOT_RESET: + return vfio_pci_ioctl_pci_hot_reset(vdev, uarg); + case VFIO_DEVICE_RESET: + return vfio_pci_ioctl_reset(vdev, uarg); + case VFIO_DEVICE_SET_IRQS: + return vfio_pci_ioctl_set_irqs(vdev, uarg); + default: + return -ENOTTY; } - return -ENOTTY; } EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl); -- GitLab From ea3fc04d4fad2d31adb8a25115d4bd53b214bfc4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 31 Aug 2022 17:15:58 -0300 Subject: [PATCH 0122/2223] vfio-pci: Re-indent what was vfio_pci_core_ioctl() Done mechanically with: $ git clang-format-14 -i --lines 675:1210 drivers/vfio/pci/vfio_pci_core.c And manually reflow the multi-line comments clang-format doesn't fix. 
Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v2-0f9e632d54fb+d6-vfio_ioctl_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 711 +++++++++++++++---------------- 1 file changed, 349 insertions(+), 362 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 85b9720e77d28..8bff8ab5e807b 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -693,309 +693,300 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, void __user *arg) { unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs); - struct vfio_device_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - unsigned long capsz; - int ret; + struct vfio_device_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + unsigned long capsz; + int ret; - /* For backward compatibility, cannot require this */ - capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); + /* For backward compatibility, cannot require this */ + capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; + if (info.argsz < minsz) + return -EINVAL; - if (info.argsz >= capsz) { - minsz = capsz; - info.cap_offset = 0; - } + if (info.argsz >= capsz) { + minsz = capsz; + info.cap_offset = 0; + } - info.flags = VFIO_DEVICE_FLAGS_PCI; + info.flags = VFIO_DEVICE_FLAGS_PCI; - if (vdev->reset_works) - info.flags |= VFIO_DEVICE_FLAGS_RESET; + if (vdev->reset_works) + info.flags |= VFIO_DEVICE_FLAGS_RESET; - info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; - info.num_irqs = VFIO_PCI_NUM_IRQS; + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; + info.num_irqs = VFIO_PCI_NUM_IRQS; - ret = 
vfio_pci_info_zdev_add_caps(vdev, &caps); - if (ret && ret != -ENODEV) { - pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n"); - return ret; - } + ret = vfio_pci_info_zdev_add_caps(vdev, &caps); + if (ret && ret != -ENODEV) { + pci_warn(vdev->pdev, + "Failed to setup zPCI info capabilities\n"); + return ret; + } - if (caps.size) { - info.flags |= VFIO_DEVICE_FLAGS_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); + if (caps.size) { + info.flags |= VFIO_DEVICE_FLAGS_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + sizeof(info), + caps.buf, caps.size)) { + kfree(caps.buf); + return -EFAULT; } - - kfree(caps.buf); + info.cap_offset = sizeof(info); } - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; + kfree(caps.buf); + } + + return copy_to_user((void __user *)arg, &info, minsz) ? 
-EFAULT : 0; } static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, void __user *arg) { unsigned long minsz = offsetofend(struct vfio_region_info, offset); - struct pci_dev *pdev = vdev->pdev; - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - int i, ret; + struct pci_dev *pdev = vdev->pdev; + struct vfio_region_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + int i, ret; - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; + if (info.argsz < minsz) + return -EINVAL; - switch (info.index) { - case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pdev->cfg_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + switch (info.index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = pdev->cfg_size; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = pci_resource_len(pdev, info.index); + if (!info.size) { + info.flags = 0; break; - case VFIO_PCI_BAR0_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - info.flags = 0; - break; - } + } - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - if (vdev->bar_mmap_supported[info.index]) { - info.flags |= VFIO_REGION_INFO_FLAG_MMAP; - if (info.index == vdev->msix_bar) { - ret = msix_mmappable_cap(vdev, &caps); - if (ret) - return ret; - } + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + if (vdev->bar_mmap_supported[info.index]) { + info.flags |= VFIO_REGION_INFO_FLAG_MMAP; + if (info.index == vdev->msix_bar) { + ret = msix_mmappable_cap(vdev, &caps); + if (ret) + return ret; } + } - break; - case VFIO_PCI_ROM_REGION_INDEX: - { - void __iomem *io; - size_t size; - u16 cmd; + break; + case VFIO_PCI_ROM_REGION_INDEX: { + void __iomem *io; + size_t size; + u16 cmd; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.flags = 0; + + /* Report the BAR size, not the ROM size */ + info.size = pci_resource_len(pdev, info.index); + if (!info.size) { + /* Shadow ROMs appear as PCI option ROMs */ + if (pdev->resource[PCI_ROM_RESOURCE].flags & + IORESOURCE_ROM_SHADOW) + info.size = 0x20000; + else + break; + } - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = 0; + /* + * Is it really there? Enable memory decode for implicit access + * in pci_map_rom(). 
+ */ + cmd = vfio_pci_memory_lock_and_enable(vdev); + io = pci_map_rom(pdev, &size); + if (io) { + info.flags = VFIO_REGION_INFO_FLAG_READ; + pci_unmap_rom(pdev, io); + } else { + info.size = 0; + } + vfio_pci_memory_unlock_and_restore(vdev, cmd); - /* Report the BAR size, not the ROM size */ - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - /* Shadow ROMs appear as PCI option ROMs */ - if (pdev->resource[PCI_ROM_RESOURCE].flags & - IORESOURCE_ROM_SHADOW) - info.size = 0x20000; - else - break; - } + break; + } + case VFIO_PCI_VGA_REGION_INDEX: + if (!vdev->has_vga) + return -EINVAL; - /* - * Is it really there? Enable memory decode for - * implicit access in pci_map_rom(). - */ - cmd = vfio_pci_memory_lock_and_enable(vdev); - io = pci_map_rom(pdev, &size); - if (io) { - info.flags = VFIO_REGION_INFO_FLAG_READ; - pci_unmap_rom(pdev, io); - } else { - info.size = 0; - } - vfio_pci_memory_unlock_and_restore(vdev, cmd); + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0xc0000; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; - break; - } - case VFIO_PCI_VGA_REGION_INDEX: - if (!vdev->has_vga) - return -EINVAL; + break; + default: { + struct vfio_region_info_cap_type cap_type = { + .header.id = VFIO_REGION_INFO_CAP_TYPE, + .header.version = 1 + }; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0xc0000; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - - break; - default: - { - struct vfio_region_info_cap_type cap_type = { - .header.id = VFIO_REGION_INFO_CAP_TYPE, - .header.version = 1 }; + if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) + return -EINVAL; + info.index = array_index_nospec( + info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); - if (info.index >= - VFIO_PCI_NUM_REGIONS + vdev->num_regions) - return -EINVAL; - info.index = array_index_nospec(info.index, - VFIO_PCI_NUM_REGIONS + - vdev->num_regions); + i = info.index - 
VFIO_PCI_NUM_REGIONS; - i = info.index - VFIO_PCI_NUM_REGIONS; + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vdev->region[i].size; + info.flags = vdev->region[i].flags; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vdev->region[i].size; - info.flags = vdev->region[i].flags; + cap_type.type = vdev->region[i].type; + cap_type.subtype = vdev->region[i].subtype; - cap_type.type = vdev->region[i].type; - cap_type.subtype = vdev->region[i].subtype; + ret = vfio_info_add_capability(&caps, &cap_type.header, + sizeof(cap_type)); + if (ret) + return ret; - ret = vfio_info_add_capability(&caps, &cap_type.header, - sizeof(cap_type)); + if (vdev->region[i].ops->add_capability) { + ret = vdev->region[i].ops->add_capability( + vdev, &vdev->region[i], &caps); if (ret) return ret; - - if (vdev->region[i].ops->add_capability) { - ret = vdev->region[i].ops->add_capability(vdev, - &vdev->region[i], &caps); - if (ret) - return ret; - } - } } + } + } - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + sizeof(info), + caps.buf, caps.size)) { + kfree(caps.buf); + return -EFAULT; } - - kfree(caps.buf); + info.cap_offset = sizeof(info); } - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; + kfree(caps.buf); + } + + return copy_to_user((void __user *)arg, &info, minsz) ? 
-EFAULT : 0; } static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, void __user *arg) { unsigned long minsz = offsetofend(struct vfio_irq_info, count); - struct vfio_irq_info info; + struct vfio_irq_info info; - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; - if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) - return -EINVAL; + if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) + return -EINVAL; - switch (info.index) { - case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: - case VFIO_PCI_REQ_IRQ_INDEX: + switch (info.index) { + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: + case VFIO_PCI_REQ_IRQ_INDEX: + break; + case VFIO_PCI_ERR_IRQ_INDEX: + if (pci_is_pcie(vdev->pdev)) break; - case VFIO_PCI_ERR_IRQ_INDEX: - if (pci_is_pcie(vdev->pdev)) - break; - fallthrough; - default: - return -EINVAL; - } + fallthrough; + default: + return -EINVAL; + } - info.flags = VFIO_IRQ_INFO_EVENTFD; + info.flags = VFIO_IRQ_INFO_EVENTFD; - info.count = vfio_pci_get_irq_count(vdev, info.index); + info.count = vfio_pci_get_irq_count(vdev, info.index); - if (info.index == VFIO_PCI_INTX_IRQ_INDEX) - info.flags |= (VFIO_IRQ_INFO_MASKABLE | - VFIO_IRQ_INFO_AUTOMASKED); - else - info.flags |= VFIO_IRQ_INFO_NORESIZE; + if (info.index == VFIO_PCI_INTX_IRQ_INDEX) + info.flags |= + (VFIO_IRQ_INFO_MASKABLE | VFIO_IRQ_INFO_AUTOMASKED); + else + info.flags |= VFIO_IRQ_INFO_NORESIZE; - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; + return copy_to_user((void __user *)arg, &info, minsz) ? 
-EFAULT : 0; } static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev, void __user *arg) { unsigned long minsz = offsetofend(struct vfio_irq_set, count); - struct vfio_irq_set hdr; - u8 *data = NULL; - int max, ret = 0; - size_t data_size = 0; + struct vfio_irq_set hdr; + u8 *data = NULL; + int max, ret = 0; + size_t data_size = 0; - if (copy_from_user(&hdr, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; - max = vfio_pci_get_irq_count(vdev, hdr.index); + max = vfio_pci_get_irq_count(vdev, hdr.index); - ret = vfio_set_irqs_validate_and_prepare(&hdr, max, - VFIO_PCI_NUM_IRQS, &data_size); - if (ret) - return ret; + ret = vfio_set_irqs_validate_and_prepare(&hdr, max, VFIO_PCI_NUM_IRQS, + &data_size); + if (ret) + return ret; - if (data_size) { - data = memdup_user((void __user *)(arg + minsz), - data_size); - if (IS_ERR(data)) - return PTR_ERR(data); - } + if (data_size) { + data = memdup_user((void __user *)(arg + minsz), data_size); + if (IS_ERR(data)) + return PTR_ERR(data); + } - mutex_lock(&vdev->igate); + mutex_lock(&vdev->igate); - ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, - hdr.start, hdr.count, data); + ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, hdr.start, + hdr.count, data); - mutex_unlock(&vdev->igate); - kfree(data); + mutex_unlock(&vdev->igate); + kfree(data); - return ret; + return ret; } static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, void __user *arg) { - int ret; + int ret; - if (!vdev->reset_works) - return -EINVAL; + if (!vdev->reset_works) + return -EINVAL; - vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_zap_and_down_write_memory_lock(vdev); - /* - * This function can be invoked while the power state is non-D0. 
- * If pci_try_reset_function() has been called while the power - * state is non-D0, then pci_try_reset_function() will - * internally set the power state to D0 without vfio driver - * involvement. For the devices which have NoSoftRst-, the - * reset function can cause the PCI config space reset without - * restoring the original state (saved locally in - * 'vdev->pm_save'). - */ - vfio_pci_set_power_state(vdev, PCI_D0); + /* + * This function can be invoked while the power state is non-D0. If + * pci_try_reset_function() has been called while the power state is + * non-D0, then pci_try_reset_function() will internally set the power + * state to D0 without vfio driver involvement. For the devices which + * have NoSoftRst-, the reset function can cause the PCI config space + * reset without restoring the original state (saved locally in + * 'vdev->pm_save'). + */ + vfio_pci_set_power_state(vdev, PCI_D0); - ret = pci_try_reset_function(vdev->pdev); - up_write(&vdev->memory_lock); + ret = pci_try_reset_function(vdev->pdev); + up_write(&vdev->memory_lock); - return ret; + return ret; } static int @@ -1004,196 +995,192 @@ vfio_pci_ioctl_get_pci_hot_reset_info(struct vfio_pci_core_device *vdev, { unsigned long minsz = offsetofend(struct vfio_pci_hot_reset_info, count); - struct vfio_pci_hot_reset_info hdr; - struct vfio_pci_fill_info fill = { 0 }; - struct vfio_pci_dependent_device *devices = NULL; - bool slot = false; - int ret = 0; + struct vfio_pci_hot_reset_info hdr; + struct vfio_pci_fill_info fill = { 0 }; + struct vfio_pci_dependent_device *devices = NULL; + bool slot = false; + int ret = 0; - if (copy_from_user(&hdr, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; - if (hdr.argsz < minsz) - return -EINVAL; + if (hdr.argsz < minsz) + return -EINVAL; - hdr.flags = 0; + hdr.flags = 0; - /* Can we do a slot or bus reset or neither? 
*/ - if (!pci_probe_reset_slot(vdev->pdev->slot)) - slot = true; - else if (pci_probe_reset_bus(vdev->pdev->bus)) - return -ENODEV; + /* Can we do a slot or bus reset or neither? */ + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return -ENODEV; - /* How many devices are affected? */ - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_count_devs, - &fill.max, slot); - if (ret) - return ret; + /* How many devices are affected? */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, + &fill.max, slot); + if (ret) + return ret; - WARN_ON(!fill.max); /* Should always be at least one */ + WARN_ON(!fill.max); /* Should always be at least one */ - /* - * If there's enough space, fill it now, otherwise return - * -ENOSPC and the number of devices affected. - */ - if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { - ret = -ENOSPC; - hdr.count = fill.max; - goto reset_info_exit; - } + /* + * If there's enough space, fill it now, otherwise return -ENOSPC and + * the number of devices affected. + */ + if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { + ret = -ENOSPC; + hdr.count = fill.max; + goto reset_info_exit; + } - devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); - if (!devices) - return -ENOMEM; + devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); + if (!devices) + return -ENOMEM; - fill.devices = devices; + fill.devices = devices; - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_fill_devs, - &fill, slot); + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs, + &fill, slot); - /* - * If a device was removed between counting and filling, - * we may come up short of fill.max. If a device was - * added, we'll have a return of -EAGAIN above. - */ - if (!ret) - hdr.count = fill.cur; + /* + * If a device was removed between counting and filling, we may come up + * short of fill.max. 
If a device was added, we'll have a return of + * -EAGAIN above. + */ + if (!ret) + hdr.count = fill.cur; reset_info_exit: - if (copy_to_user((void __user *)arg, &hdr, minsz)) - ret = -EFAULT; + if (copy_to_user((void __user *)arg, &hdr, minsz)) + ret = -EFAULT; - if (!ret) { - if (copy_to_user((void __user *)(arg + minsz), devices, - hdr.count * sizeof(*devices))) - ret = -EFAULT; - } + if (!ret) { + if (copy_to_user((void __user *)(arg + minsz), devices, + hdr.count * sizeof(*devices))) + ret = -EFAULT; + } - kfree(devices); - return ret; + kfree(devices); + return ret; } static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, void __user *arg) { unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count); - struct vfio_pci_hot_reset hdr; - int32_t *group_fds; - struct file **files; - struct vfio_pci_group_info info; - bool slot = false; - int file_idx, count = 0, ret = 0; - - if (copy_from_user(&hdr, (void __user *)arg, minsz)) - return -EFAULT; + struct vfio_pci_hot_reset hdr; + int32_t *group_fds; + struct file **files; + struct vfio_pci_group_info info; + bool slot = false; + int file_idx, count = 0, ret = 0; - if (hdr.argsz < minsz || hdr.flags) - return -EINVAL; + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; - /* Can we do a slot or bus reset or neither? */ - if (!pci_probe_reset_slot(vdev->pdev->slot)) - slot = true; - else if (pci_probe_reset_bus(vdev->pdev->bus)) - return -ENODEV; + if (hdr.argsz < minsz || hdr.flags) + return -EINVAL; - /* - * We can't let userspace give us an arbitrarily large - * buffer to copy, so verify how many we think there - * could be. Note groups can have multiple devices so - * one group per device is the max. - */ - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_count_devs, - &count, slot); - if (ret) - return ret; + /* Can we do a slot or bus reset or neither? 
*/ + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return -ENODEV; - /* Somewhere between 1 and count is OK */ - if (!hdr.count || hdr.count > count) - return -EINVAL; + /* + * We can't let userspace give us an arbitrarily large buffer to copy, + * so verify how many we think there could be. Note groups can have + * multiple devices so one group per device is the max. + */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, + &count, slot); + if (ret) + return ret; - group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); - files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL); - if (!group_fds || !files) { - kfree(group_fds); - kfree(files); - return -ENOMEM; - } + /* Somewhere between 1 and count is OK */ + if (!hdr.count || hdr.count > count) + return -EINVAL; - if (copy_from_user(group_fds, (void __user *)(arg + minsz), - hdr.count * sizeof(*group_fds))) { - kfree(group_fds); - kfree(files); - return -EFAULT; - } + group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); + files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL); + if (!group_fds || !files) { + kfree(group_fds); + kfree(files); + return -ENOMEM; + } - /* - * For each group_fd, get the group through the vfio external - * user interface and store the group and iommu ID. This - * ensures the group is held across the reset. - */ - for (file_idx = 0; file_idx < hdr.count; file_idx++) { - struct file *file = fget(group_fds[file_idx]); + if (copy_from_user(group_fds, (void __user *)(arg + minsz), + hdr.count * sizeof(*group_fds))) { + kfree(group_fds); + kfree(files); + return -EFAULT; + } - if (!file) { - ret = -EBADF; - break; - } + /* + * For each group_fd, get the group through the vfio external user + * interface and store the group and iommu ID. This ensures the group + * is held across the reset. 
+ */ + for (file_idx = 0; file_idx < hdr.count; file_idx++) { + struct file *file = fget(group_fds[file_idx]); - /* Ensure the FD is a vfio group FD.*/ - if (!vfio_file_iommu_group(file)) { - fput(file); - ret = -EINVAL; - break; - } + if (!file) { + ret = -EBADF; + break; + } - files[file_idx] = file; + /* Ensure the FD is a vfio group FD.*/ + if (!vfio_file_iommu_group(file)) { + fput(file); + ret = -EINVAL; + break; } - kfree(group_fds); + files[file_idx] = file; + } - /* release reference to groups on error */ - if (ret) - goto hot_reset_release; + kfree(group_fds); + + /* release reference to groups on error */ + if (ret) + goto hot_reset_release; - info.count = hdr.count; - info.files = files; + info.count = hdr.count; + info.files = files; - ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); + ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); hot_reset_release: - for (file_idx--; file_idx >= 0; file_idx--) - fput(files[file_idx]); + for (file_idx--; file_idx >= 0; file_idx--) + fput(files[file_idx]); - kfree(files); - return ret; + kfree(files); + return ret; } static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev, void __user *arg) { unsigned long minsz = offsetofend(struct vfio_device_ioeventfd, fd); - struct vfio_device_ioeventfd ioeventfd; - int count; + struct vfio_device_ioeventfd ioeventfd; + int count; - if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) + return -EFAULT; - if (ioeventfd.argsz < minsz) - return -EINVAL; + if (ioeventfd.argsz < minsz) + return -EINVAL; - if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) - return -EINVAL; + if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) + return -EINVAL; - count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; + count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; - if (hweight8(count) != 1 || ioeventfd.fd < -1) - return -EINVAL; + if (hweight8(count) != 1 
|| ioeventfd.fd < -1) + return -EINVAL; - return vfio_pci_ioeventfd(vdev, ioeventfd.offset, - ioeventfd.data, count, ioeventfd.fd); + return vfio_pci_ioeventfd(vdev, ioeventfd.offset, ioeventfd.data, count, + ioeventfd.fd); } long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, -- GitLab From 663eab456e072bbcd02c2516d54b53f7ecd57dd3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 31 Aug 2022 17:15:59 -0300 Subject: [PATCH 0123/2223] vfio-pci: Replace 'void __user *' with proper types in the ioctl functions This makes the code clearer and replaces a few places trying to access a flex array with an actual flex array. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v2-0f9e632d54fb+d6-vfio_ioctl_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 58 +++++++++++++++----------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 8bff8ab5e807b..9273f1ffd0ddd 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -690,7 +690,7 @@ int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, EXPORT_SYMBOL_GPL(vfio_pci_core_register_dev_region); static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, - void __user *arg) + struct vfio_device_info __user *arg) { unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs); struct vfio_device_info info; @@ -701,7 +701,7 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, /* For backward compatibility, cannot require this */ capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); - if (copy_from_user(&info, (void __user *)arg, minsz)) + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) @@ -733,22 +733,21 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, info.argsz = sizeof(info) + 
caps.size; } else { vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + sizeof(info), - caps.buf, caps.size)) { + if (copy_to_user(arg + 1, caps.buf, caps.size)) { kfree(caps.buf); return -EFAULT; } - info.cap_offset = sizeof(info); + info.cap_offset = sizeof(*arg); } kfree(caps.buf); } - return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, - void __user *arg) + struct vfio_region_info __user *arg) { unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct pci_dev *pdev = vdev->pdev; @@ -756,7 +755,7 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; int i, ret; - if (copy_from_user(&info, (void __user *)arg, minsz)) + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) @@ -875,27 +874,26 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, info.cap_offset = 0; } else { vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + sizeof(info), - caps.buf, caps.size)) { + if (copy_to_user(arg + 1, caps.buf, caps.size)) { kfree(caps.buf); return -EFAULT; } - info.cap_offset = sizeof(info); + info.cap_offset = sizeof(*arg); } kfree(caps.buf); } - return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; + return copy_to_user(arg, &info, minsz) ? 
-EFAULT : 0; } static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, - void __user *arg) + struct vfio_irq_info __user *arg) { unsigned long minsz = offsetofend(struct vfio_irq_info, count); struct vfio_irq_info info; - if (copy_from_user(&info, (void __user *)arg, minsz)) + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) @@ -923,11 +921,11 @@ static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, else info.flags |= VFIO_IRQ_INFO_NORESIZE; - return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev, - void __user *arg) + struct vfio_irq_set __user *arg) { unsigned long minsz = offsetofend(struct vfio_irq_set, count); struct vfio_irq_set hdr; @@ -935,7 +933,7 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev, int max, ret = 0; size_t data_size = 0; - if (copy_from_user(&hdr, (void __user *)arg, minsz)) + if (copy_from_user(&hdr, arg, minsz)) return -EFAULT; max = vfio_pci_get_irq_count(vdev, hdr.index); @@ -946,7 +944,7 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev, return ret; if (data_size) { - data = memdup_user((void __user *)(arg + minsz), data_size); + data = memdup_user(&arg->data, data_size); if (IS_ERR(data)) return PTR_ERR(data); } @@ -989,9 +987,9 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, return ret; } -static int -vfio_pci_ioctl_get_pci_hot_reset_info(struct vfio_pci_core_device *vdev, - void __user *arg) +static int vfio_pci_ioctl_get_pci_hot_reset_info( + struct vfio_pci_core_device *vdev, + struct vfio_pci_hot_reset_info __user *arg) { unsigned long minsz = offsetofend(struct vfio_pci_hot_reset_info, count); @@ -1001,7 +999,7 @@ vfio_pci_ioctl_get_pci_hot_reset_info(struct vfio_pci_core_device *vdev, bool slot = false; int ret = 0; - if 
(copy_from_user(&hdr, (void __user *)arg, minsz)) + if (copy_from_user(&hdr, arg, minsz)) return -EFAULT; if (hdr.argsz < minsz) @@ -1051,11 +1049,11 @@ vfio_pci_ioctl_get_pci_hot_reset_info(struct vfio_pci_core_device *vdev, hdr.count = fill.cur; reset_info_exit: - if (copy_to_user((void __user *)arg, &hdr, minsz)) + if (copy_to_user(arg, &hdr, minsz)) ret = -EFAULT; if (!ret) { - if (copy_to_user((void __user *)(arg + minsz), devices, + if (copy_to_user(&arg->devices, devices, hdr.count * sizeof(*devices))) ret = -EFAULT; } @@ -1065,7 +1063,7 @@ reset_info_exit: } static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, - void __user *arg) + struct vfio_pci_hot_reset __user *arg) { unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count); struct vfio_pci_hot_reset hdr; @@ -1075,7 +1073,7 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, bool slot = false; int file_idx, count = 0, ret = 0; - if (copy_from_user(&hdr, (void __user *)arg, minsz)) + if (copy_from_user(&hdr, arg, minsz)) return -EFAULT; if (hdr.argsz < minsz || hdr.flags) @@ -1109,7 +1107,7 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, return -ENOMEM; } - if (copy_from_user(group_fds, (void __user *)(arg + minsz), + if (copy_from_user(group_fds, arg->group_fds, hdr.count * sizeof(*group_fds))) { kfree(group_fds); kfree(files); @@ -1159,13 +1157,13 @@ hot_reset_release: } static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev, - void __user *arg) + struct vfio_device_ioeventfd __user *arg) { unsigned long minsz = offsetofend(struct vfio_device_ioeventfd, fd); struct vfio_device_ioeventfd ioeventfd; int count; - if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) + if (copy_from_user(&ioeventfd, arg, minsz)) return -EFAULT; if (ioeventfd.argsz < minsz) @@ -1214,7 +1212,7 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl); static 
int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags, - void __user *arg, size_t argsz) + uuid_t __user *arg, size_t argsz) { struct vfio_pci_core_device *vdev = container_of(device, struct vfio_pci_core_device, vdev); -- GitLab From 150ee2f9cd9411a3fdbc55cef2fb01349216dbd7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 31 Aug 2022 17:16:00 -0300 Subject: [PATCH 0124/2223] vfio: Fold VFIO_GROUP_GET_DEVICE_FD into vfio_group_get_device_fd() No reason to split it up like this, just have one function to process the ioctl. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v2-0f9e632d54fb+d6-vfio_ioctl_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 7cb56c382c97a..3afef45b8d1a2 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1178,14 +1178,21 @@ err_unassign_container: return ERR_PTR(ret); } -static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) +static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, + char __user *arg) { struct vfio_device *device; struct file *filep; + char *buf; int fdno; int ret; + buf = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(buf)) + return PTR_ERR(buf); + device = vfio_device_get_from_name(group, buf); + kfree(buf); if (IS_ERR(device)) return PTR_ERR(device); @@ -1215,9 +1222,12 @@ static long vfio_group_fops_unl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct vfio_group *group = filep->private_data; + void __user *uarg = (void __user *)arg; long ret = -ENOTTY; switch (cmd) { + case VFIO_GROUP_GET_DEVICE_FD: + return vfio_group_ioctl_get_device_fd(group, uarg); case VFIO_GROUP_GET_STATUS: { struct vfio_group_status status; @@ -1267,18 +1277,6 @@ static long vfio_group_fops_unl_ioctl(struct file *filep, ret = 
vfio_group_unset_container(group); up_write(&group->group_rwsem); break; - case VFIO_GROUP_GET_DEVICE_FD: - { - char *buf; - - buf = strndup_user((const char __user *)arg, PAGE_SIZE); - if (IS_ERR(buf)) - return PTR_ERR(buf); - - ret = vfio_group_get_device_fd(group, buf); - kfree(buf); - break; - } } return ret; -- GitLab From 67671f153e6b5a379623b57881a6cf99b4a6f977 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 31 Aug 2022 17:16:01 -0300 Subject: [PATCH 0125/2223] vfio: Fold VFIO_GROUP_SET_CONTAINER into vfio_group_set_container() No reason to split it up like this, just have one function to process the ioctl. Move the lock into the function as well to avoid having a lockdep annotation. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v2-0f9e632d54fb+d6-vfio_ioctl_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 51 +++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 3afef45b8d1a2..17c44ee81f9fe 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -980,47 +980,54 @@ static int vfio_group_unset_container(struct vfio_group *group) return 0; } -static int vfio_group_set_container(struct vfio_group *group, int container_fd) +static int vfio_group_ioctl_set_container(struct vfio_group *group, + int __user *arg) { struct fd f; struct vfio_container *container; struct vfio_iommu_driver *driver; + int container_fd; int ret = 0; - lockdep_assert_held_write(&group->group_rwsem); - - if (group->container || WARN_ON(group->container_users)) - return -EINVAL; - if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) return -EPERM; + if (get_user(container_fd, arg)) + return -EFAULT; + if (container_fd < 0) + return -EINVAL; f = fdget(container_fd); if (!f.file) return -EBADF; /* Sanity check, is this really our fd? 
*/ if (f.file->f_op != &vfio_fops) { - fdput(f); - return -EINVAL; + ret = -EINVAL; + goto out_fdput; } - container = f.file->private_data; WARN_ON(!container); /* fget ensures we don't race vfio_release */ + down_write(&group->group_rwsem); + + if (group->container || WARN_ON(group->container_users)) { + ret = -EINVAL; + goto out_unlock_group; + } + down_write(&container->group_lock); /* Real groups and fake groups cannot mix */ if (!list_empty(&container->group_list) && container->noiommu != (group->type == VFIO_NO_IOMMU)) { ret = -EPERM; - goto unlock_out; + goto out_unlock_container; } if (group->type == VFIO_IOMMU) { ret = iommu_group_claim_dma_owner(group->iommu_group, f.file); if (ret) - goto unlock_out; + goto out_unlock_container; } driver = container->iommu_driver; @@ -1032,7 +1039,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) if (group->type == VFIO_IOMMU) iommu_group_release_dma_owner( group->iommu_group); - goto unlock_out; + goto out_unlock_container; } } @@ -1044,8 +1051,11 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) /* Get a reference on the container and mark a user within the group */ vfio_container_get(container); -unlock_out: +out_unlock_container: up_write(&container->group_lock); +out_unlock_group: + up_write(&group->group_rwsem); +out_fdput: fdput(f); return ret; } @@ -1258,20 +1268,7 @@ static long vfio_group_fops_unl_ioctl(struct file *filep, break; } case VFIO_GROUP_SET_CONTAINER: - { - int fd; - - if (get_user(fd, (int __user *)arg)) - return -EFAULT; - - if (fd < 0) - return -EINVAL; - - down_write(&group->group_rwsem); - ret = vfio_group_set_container(group, fd); - up_write(&group->group_rwsem); - break; - } + return vfio_group_ioctl_set_container(group, uarg); case VFIO_GROUP_UNSET_CONTAINER: down_write(&group->group_rwsem); ret = vfio_group_unset_container(group); -- GitLab From b3b43590fa276aef824a300b911fe5fb9083dbf5 Mon Sep 17 00:00:00 2001 From: Jason 
Gunthorpe Date: Wed, 31 Aug 2022 17:16:02 -0300 Subject: [PATCH 0126/2223] vfio: Follow the naming pattern for vfio_group_ioctl_unset_container() Make it clear that this is the body of the ioctl. Fold the locking into the function so it is self contained like the other ioctls. Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/7-v2-0f9e632d54fb+d6-vfio_ioctl_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 17c44ee81f9fe..0bb75416acfc4 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -968,16 +968,24 @@ static void __vfio_group_unset_container(struct vfio_group *group) * the group, we know that still exists, therefore the only valid * transition here is 1->0. */ -static int vfio_group_unset_container(struct vfio_group *group) +static int vfio_group_ioctl_unset_container(struct vfio_group *group) { - lockdep_assert_held_write(&group->group_rwsem); + int ret = 0; - if (!group->container) - return -EINVAL; - if (group->container_users != 1) - return -EBUSY; + down_write(&group->group_rwsem); + if (!group->container) { + ret = -EINVAL; + goto out_unlock; + } + if (group->container_users != 1) { + ret = -EBUSY; + goto out_unlock; + } __vfio_group_unset_container(group); - return 0; + +out_unlock: + up_write(&group->group_rwsem); + return ret; } static int vfio_group_ioctl_set_container(struct vfio_group *group, @@ -1270,10 +1278,7 @@ static long vfio_group_fops_unl_ioctl(struct file *filep, case VFIO_GROUP_SET_CONTAINER: return vfio_group_ioctl_set_container(group, uarg); case VFIO_GROUP_UNSET_CONTAINER: - down_write(&group->group_rwsem); - ret = vfio_group_unset_container(group); - up_write(&group->group_rwsem); - break; + return vfio_group_ioctl_unset_container(group); } return ret; -- GitLab From 
99a27c088b9c76d9e0f2a36152ffaf9891b224d3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 31 Aug 2022 17:16:03 -0300 Subject: [PATCH 0127/2223] vfio: Split VFIO_GROUP_GET_STATUS into a function This is the last sizable implementation in vfio_group_fops_unl_ioctl(), move it to a function so vfio_group_fops_unl_ioctl() is emptied out. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v2-0f9e632d54fb+d6-vfio_ioctl_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 61 ++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 0bb75416acfc4..eb714a484662f 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1236,52 +1236,51 @@ err_put_device: return ret; } +static int vfio_group_ioctl_get_status(struct vfio_group *group, + struct vfio_group_status __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_group_status, flags); + struct vfio_group_status status; + + if (copy_from_user(&status, arg, minsz)) + return -EFAULT; + + if (status.argsz < minsz) + return -EINVAL; + + status.flags = 0; + + down_read(&group->group_rwsem); + if (group->container) + status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | + VFIO_GROUP_FLAGS_VIABLE; + else if (!iommu_group_dma_owner_claimed(group->iommu_group)) + status.flags |= VFIO_GROUP_FLAGS_VIABLE; + up_read(&group->group_rwsem); + + if (copy_to_user(arg, &status, minsz)) + return -EFAULT; + return 0; +} + static long vfio_group_fops_unl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct vfio_group *group = filep->private_data; void __user *uarg = (void __user *)arg; - long ret = -ENOTTY; switch (cmd) { case VFIO_GROUP_GET_DEVICE_FD: return vfio_group_ioctl_get_device_fd(group, uarg); case VFIO_GROUP_GET_STATUS: - { - struct vfio_group_status status; - unsigned long minsz; - - minsz = offsetofend(struct 
vfio_group_status, flags); - - if (copy_from_user(&status, (void __user *)arg, minsz)) - return -EFAULT; - - if (status.argsz < minsz) - return -EINVAL; - - status.flags = 0; - - down_read(&group->group_rwsem); - if (group->container) - status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | - VFIO_GROUP_FLAGS_VIABLE; - else if (!iommu_group_dma_owner_claimed(group->iommu_group)) - status.flags |= VFIO_GROUP_FLAGS_VIABLE; - up_read(&group->group_rwsem); - - if (copy_to_user((void __user *)arg, &status, minsz)) - return -EFAULT; - - ret = 0; - break; - } + return vfio_group_ioctl_get_status(group, uarg); case VFIO_GROUP_SET_CONTAINER: return vfio_group_ioctl_set_container(group, uarg); case VFIO_GROUP_UNSET_CONTAINER: return vfio_group_ioctl_unset_container(group); + default: + return -ENOTTY; } - - return ret; } static int vfio_group_fops_open(struct inode *inode, struct file *filep) -- GitLab From 385ecfdfb5d5ad0ff37e20381c70e18af8cf1bdb Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:46 +0530 Subject: [PATCH 0128/2223] vfio: Add the device features for the low power entry and exit This patch adds the following new device features for the low power entry and exit in the header file. The implementation for the same will be added in the subsequent patches. - VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY - VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP - VFIO_DEVICE_FEATURE_LOW_POWER_EXIT For vfio-pci based devices, with the standard PCI PM registers, all power states cannot be achieved. The platform-based power management needs to be involved to go into the lowest power state. For doing low power entry and exit with platform-based power management, these device features can be used. The entry device feature has two variants. These two variants are mainly to support the different behaviour for the low power entry. 
If there is any access for the VFIO device on the host side, then the device will be moved out of the low power state without the user's guest driver involvement. Some devices (for example NVIDIA VGA or 3D controller) require the user's guest driver involvement for each low-power entry. In the first variant, the host can return the device to low power automatically. The device will continue to attempt to reach low power until the low power exit feature is called. In the second variant, if the device exits low power due to an access, the host kernel will signal the user via the provided eventfd and will not return the device to low power without a subsequent call to one of the low power entry features. A call to the low power exit feature is optional if the user provided eventfd is signaled. These device features only support VFIO_DEVICE_FEATURE_SET and VFIO_DEVICE_FEATURE_PROBE operations. Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-2-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 56 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 733a1cddde30a..76a173f973de6 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -986,6 +986,62 @@ enum vfio_device_mig_state { VFIO_DEVICE_STATE_RUNNING_P2P = 5, }; +/* + * Upon VFIO_DEVICE_FEATURE_SET, allow the device to be moved into a low power + * state with the platform-based power management. Device use of lower power + * states depends on factors managed by the runtime power management core, + * including system level support and coordinating support among dependent + * devices. Enabling device low power entry does not guarantee lower power + * usage by the device, nor is a mechanism provided through this feature to + * know the current power state of the device. 
If any device access happens + * (either from the host or through the vfio uAPI) when the device is in the + * low power state, then the host will move the device out of the low power + * state as necessary prior to the access. Once the access is completed, the + * device may re-enter the low power state. For single shot low power support + * with wake-up notification, see + * VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP below. Access to mmap'd + * device regions is disabled on LOW_POWER_ENTRY and may only be resumed after + * calling LOW_POWER_EXIT. + */ +#define VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY 3 + +/* + * This device feature has the same behavior as + * VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY with the exception that the user + * provides an eventfd for wake-up notification. When the device moves out of + * the low power state for the wake-up, the host will not allow the device to + * re-enter a low power state without a subsequent user call to one of the low + * power entry device feature IOCTLs. Access to mmap'd device regions is + * disabled on LOW_POWER_ENTRY_WITH_WAKEUP and may only be resumed after the + * low power exit. The low power exit can happen either through LOW_POWER_EXIT + * or through any other access (where the wake-up notification has been + * generated). The access to mmap'd device regions will not trigger low power + * exit. + * + * The notification through the provided eventfd will be generated only when + * the device has entered and is resumed from a low power state after + * calling this device feature IOCTL. A device that has not entered low power + * state, as managed through the runtime power management core, will not + * generate a notification through the provided eventfd on access. Calling the + * LOW_POWER_EXIT feature is optional in the case where notification has been + * signaled on the provided eventfd that a resume from low power has occurred. 
+ */ +struct vfio_device_low_power_entry_with_wakeup { + __s32 wakeup_eventfd; + __u32 reserved; +}; + +#define VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP 4 + +/* + * Upon VFIO_DEVICE_FEATURE_SET, disallow use of device low power states as + * previously enabled via VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY or + * VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP device features. + * This device feature IOCTL may itself generate a wakeup eventfd notification + * in the latter case if the device had previously entered a low power state. + */ +#define VFIO_DEVICE_FEATURE_LOW_POWER_EXIT 5 + /* -------- API for Type1 VFIO IOMMU -------- */ /** -- GitLab From 8e5c6995113d201addd651dc2db8e11c93ce639f Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:47 +0530 Subject: [PATCH 0129/2223] vfio: Increment the runtime PM usage count during IOCTL call The vfio-pci based drivers will have runtime power management support where the user can put the device into the low power state and then PCI devices can go into the D3cold state. If the device is in the low power state and the user issues any IOCTL, then the device should be moved out of the low power state first. Once the IOCTL is serviced, then it can go into the low power state again. The runtime PM framework manages this with help of usage count. One option was to add the runtime PM related API's inside vfio-pci driver but some IOCTL (like VFIO_DEVICE_FEATURE) can follow a different path and more IOCTL can be added in the future. Also, the runtime PM will be added for vfio-pci based drivers variant currently, but the other VFIO based drivers can use the same in the future. So, this patch adds the runtime calls runtime-related API in the top-level IOCTL function itself. For the VFIO drivers which do not have runtime power management support currently, the runtime PM API's won't be invoked. 
Only for vfio-pci based drivers currently, the runtime PM API's will be invoked to increment and decrement the usage count. In the vfio-pci drivers also, the variant drivers can opt-out by incrementing the usage count during device-open. The pm_runtime_resume_and_get() checks the device current status and will return early if the device is already in the ACTIVE state. Taking this usage count incremented while servicing IOCTL will make sure that the user won't put the device into the low power state when any other IOCTL is being serviced in parallel. Let's consider the following scenario: 1. Some other IOCTL is called. 2. The user has opened another device instance and called the IOCTL for low power entry. 3. The low power entry IOCTL moves the device into the low power state. 4. The other IOCTL finishes. If we don't keep the usage count incremented then the device access will happen between step 3 and 4 while the device has already gone into the low power state. The pm_runtime_resume_and_get() will be the first call so its error should not be propagated to user space directly. For example, if pm_runtime_resume_and_get() can return -EINVAL for the cases where the user has passed the correct argument. So the pm_runtime_resume_and_get() errors have been masked behind -EIO. Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-3-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 52 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index eb714a484662f..5edc49748013d 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "vfio.h" #define DRIVER_VERSION "0.3" @@ -1353,6 +1354,39 @@ static const struct file_operations vfio_group_fops = { .release = vfio_group_fops_release, }; +/* + * Wrapper around pm_runtime_resume_and_get(). 
+ * Return error code on failure or 0 on success. + */ +static inline int vfio_device_pm_runtime_get(struct vfio_device *device) +{ + struct device *dev = device->dev; + + if (dev->driver && dev->driver->pm) { + int ret; + + ret = pm_runtime_resume_and_get(dev); + if (ret) { + dev_info_ratelimited(dev, + "vfio: runtime resume failed %d\n", ret); + return -EIO; + } + } + + return 0; +} + +/* + * Wrapper around pm_runtime_put(). + */ +static inline void vfio_device_pm_runtime_put(struct vfio_device *device) +{ + struct device *dev = device->dev; + + if (dev->driver && dev->driver->pm) + pm_runtime_put(dev); +} + /* * VFIO Device fd */ @@ -1673,15 +1707,27 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct vfio_device *device = filep->private_data; + int ret; + + ret = vfio_device_pm_runtime_get(device); + if (ret) + return ret; switch (cmd) { case VFIO_DEVICE_FEATURE: - return vfio_ioctl_device_feature(device, (void __user *)arg); + ret = vfio_ioctl_device_feature(device, (void __user *)arg); + break; + default: if (unlikely(!device->ops->ioctl)) - return -EINVAL; - return device->ops->ioctl(device, cmd, arg); + ret = -EINVAL; + else + ret = device->ops->ioctl(device, cmd, arg); + break; } + + vfio_device_pm_runtime_put(device); + return ret; } static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, -- GitLab From 4813724c4b76b62f8e6b60dd5655633d0db1c9a8 Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:48 +0530 Subject: [PATCH 0130/2223] vfio/pci: Mask INTx during runtime suspend This patch adds INTx handling during runtime suspend/resume. All the suspend/resume related code for the user to put the device into the low power state will be added in subsequent patches. The INTx lines may be shared among devices. Whenever any INTx interrupt comes for the VFIO devices, then vfio_intx_handler() will be called for each device sharing the interrupt. 
Inside vfio_intx_handler(), it calls pci_check_and_mask_intx() and checks if the interrupt has been generated for the current device. Now, if the device is already in the D3cold state, then the config space cannot be read. An attempt to read config space in the D3cold state can cause system unresponsiveness on some systems. To prevent this, mask INTx in the runtime suspend callback, and unmask it in the runtime resume callback. If INTx has already been masked, then no handling is needed in the runtime suspend/resume callbacks. 'pm_intx_masked' tracks this, and vfio_pci_intx_mask() has been updated to return true if the INTx vfio_pci_irq_ctx.masked value is changed inside this function. For a runtime suspend that is triggered when the VFIO device has no user, the 'irq_type' will be VFIO_PCI_NUM_IRQS and these callbacks won't do anything. MSI/MSI-X interrupts are not shared, so similar handling should not be needed for MSI/MSI-X. vfio_msihandler() triggers eventfd_signal() without doing any device-specific config access. When the user performs any config access or IOCTL after receiving the eventfd notification, the device will be moved to the D0 state first before servicing any request. Another option was to check the 'pm_intx_masked' flag inside vfio_intx_handler() instead of masking the interrupts. This flag is set inside the runtime_suspend callback, but the device can be in a non-D3cold state (for example, if the user has disabled D3cold explicitly via sysfs, D3cold is not supported on the platform, etc.). Also, in the D3cold-supported case, the device will be in D0 until the PCI core moves the device into D3cold. In this case, there is a possibility that the device can generate an interrupt. Adding a check in the IRQ handler will not clear the IRQ status and the interrupt line will still be asserted. This can cause interrupt flooding.
Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-4-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 38 +++++++++++++++++++++++++++---- drivers/vfio/pci/vfio_pci_intrs.c | 6 ++++- drivers/vfio/pci/vfio_pci_priv.h | 2 +- include/linux/vfio_pci_core.h | 1 + 4 files changed, 41 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 9273f1ffd0ddd..207ede189c2af 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -277,16 +277,46 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat return ret; } +#ifdef CONFIG_PM +static int vfio_pci_core_runtime_suspend(struct device *dev) +{ + struct vfio_pci_core_device *vdev = dev_get_drvdata(dev); + + /* + * If INTx is enabled, then mask INTx before going into the runtime + * suspended state and unmask the same in the runtime resume. + * If INTx has already been masked by the user, then + * vfio_pci_intx_mask() will return false and in that case, INTx + * should not be unmasked in the runtime resume. + */ + vdev->pm_intx_masked = ((vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) && + vfio_pci_intx_mask(vdev)); + + return 0; +} + +static int vfio_pci_core_runtime_resume(struct device *dev) +{ + struct vfio_pci_core_device *vdev = dev_get_drvdata(dev); + + if (vdev->pm_intx_masked) + vfio_pci_intx_unmask(vdev); + + return 0; +} +#endif /* CONFIG_PM */ + /* - * The dev_pm_ops needs to be provided to make pci-driver runtime PM working, - * so use structure without any callbacks. - * * The pci-driver core runtime PM routines always save the device state * before going into suspended state. If the device is going into low power * state with only with runtime PM ops, then no explicit handling is needed * for the devices which have NoSoftRst-. 
*/ -static const struct dev_pm_ops vfio_pci_core_pm_ops = { }; +static const struct dev_pm_ops vfio_pci_core_pm_ops = { + SET_RUNTIME_PM_OPS(vfio_pci_core_runtime_suspend, + vfio_pci_core_runtime_resume, + NULL) +}; int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) { diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 8cb987ef3c476..40c3d7cf163f6 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -59,10 +59,12 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused) eventfd_signal(vdev->ctx[0].trigger, 1); } -void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) +/* Returns true if the INTx vfio_pci_irq_ctx.masked value is changed. */ +bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; unsigned long flags; + bool masked_changed = false; spin_lock_irqsave(&vdev->irqlock, flags); @@ -86,9 +88,11 @@ void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) disable_irq_nosync(pdev->irq); vdev->ctx[0].masked = true; + masked_changed = true; } spin_unlock_irqrestore(&vdev->irqlock, flags); + return masked_changed; } /* diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 58b8d34c162cd..5e4fa69aee16c 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -23,7 +23,7 @@ struct vfio_pci_ioeventfd { bool test_mem; }; -void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); +bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index e5cf0d3313a69..a0f1f36e42a20 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -78,6 +78,7 @@ struct vfio_pci_core_device { bool needs_reset; bool nointx; bool needs_pm_restore; + 
bool pm_intx_masked; struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; -- GitLab From cc2742fe3660cc6500021d3da8f937d326392dbd Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:49 +0530 Subject: [PATCH 0131/2223] vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY/EXIT Currently, if the runtime power management is enabled for vfio-pci based devices in the guest OS, then the guest OS will do the register write for PCI_PM_CTRL register. This write request will be handled in vfio_pm_config_write() where it will do the actual register write of PCI_PM_CTRL register. With this, the maximum D3hot state can be achieved for low power. If we can use the runtime PM framework, then we can achieve the D3cold state (on the supported systems) which will help in saving maximum power. 1. D3cold state can't be achieved by writing PCI standard PM config registers. This patch implements the following newly added low power related device features: - VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY - VFIO_DEVICE_FEATURE_LOW_POWER_EXIT The VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY feature will allow the device to make use of low power platform states on the host while the VFIO_DEVICE_FEATURE_LOW_POWER_EXIT will prevent further use of those power states. 2. The vfio-pci driver uses runtime PM framework for low power entry and exit. On the platforms where D3cold state is supported, the runtime PM framework will put the device into D3cold otherwise, D3hot or some other power state will be used. There are various cases where the device will not go into the runtime suspended state. For example, - The runtime power management is disabled on the host side for the device. - The user keeps the device busy after calling LOW_POWER_ENTRY. - There are dependent devices that are still in runtime active state. For these cases, the device will be in the same power state that has been configured by the user through PCI_PM_CTRL register. 3. 
The hypervisors can implement virtual ACPI methods. For example, in guest linux OS if PCI device ACPI node has _PR3 and _PR0 power resources with _ON/_OFF method, then guest linux OS invokes the _OFF method during D3cold transition and then _ON during D0 transition. The hypervisor can tap these virtual ACPI calls and then call the low power device feature IOCTL. 4. The 'pm_runtime_engaged' flag tracks the entry and exit to runtime PM. This flag is protected with 'memory_lock' semaphore. 5. All the config and other region access are wrapped under pm_runtime_resume_and_get() and pm_runtime_put(). So, if any device access happens while the device is in the runtime suspended state, then the device will be resumed first before access. Once the access has been finished, then the device will again go into the runtime suspended state. 6. The memory region access through mmap will not be allowed in the low power state. Since __vfio_pci_memory_enabled() is a common function, so check for 'pm_runtime_engaged' has been added explicitly in vfio_pci_mmap_fault() to block only mmap'ed access. Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-5-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 153 +++++++++++++++++++++++++++++-- include/linux/vfio_pci_core.h | 1 + 2 files changed, 146 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 207ede189c2af..9c612162653fb 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -277,11 +277,100 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat return ret; } +static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev) +{ + /* + * The vdev power related flags are protected with 'memory_lock' + * semaphore. 
+ */ + vfio_pci_zap_and_down_write_memory_lock(vdev); + if (vdev->pm_runtime_engaged) { + up_write(&vdev->memory_lock); + return -EINVAL; + } + + vdev->pm_runtime_engaged = true; + pm_runtime_put_noidle(&vdev->pdev->dev); + up_write(&vdev->memory_lock); + + return 0; +} + +static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, + void __user *arg, size_t argsz) +{ + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + int ret; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); + if (ret != 1) + return ret; + + /* + * Inside vfio_pci_runtime_pm_entry(), only the runtime PM usage count + * will be decremented. The pm_runtime_put() will be invoked again + * while returning from the ioctl and then the device can go into + * runtime suspended state. + */ + return vfio_pci_runtime_pm_entry(vdev); +} + +static void __vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) +{ + if (vdev->pm_runtime_engaged) { + vdev->pm_runtime_engaged = false; + pm_runtime_get_noresume(&vdev->pdev->dev); + } +} + +static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) +{ + /* + * The vdev power related flags are protected with 'memory_lock' + * semaphore. + */ + down_write(&vdev->memory_lock); + __vfio_pci_runtime_pm_exit(vdev); + up_write(&vdev->memory_lock); +} + +static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags, + void __user *arg, size_t argsz) +{ + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + int ret; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); + if (ret != 1) + return ret; + + /* + * The device is always in the active state here due to pm wrappers + * around ioctls. 
+ */ + vfio_pci_runtime_pm_exit(vdev); + return 0; +} + #ifdef CONFIG_PM static int vfio_pci_core_runtime_suspend(struct device *dev) { struct vfio_pci_core_device *vdev = dev_get_drvdata(dev); + down_write(&vdev->memory_lock); + /* + * The user can move the device into D3hot state before invoking + * power management IOCTL. Move the device into D0 state here and then + * the pci-driver core runtime PM suspend function will move the device + * into the low power state. Also, for the devices which have + * NoSoftRst-, it will help in restoring the original state + * (saved locally in 'vdev->pm_save'). + */ + vfio_pci_set_power_state(vdev, PCI_D0); + up_write(&vdev->memory_lock); + /* * If INTx is enabled, then mask INTx before going into the runtime * suspended state and unmask the same in the runtime resume. @@ -418,6 +507,18 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) /* * This function can be invoked while the power state is non-D0. + * This non-D0 power state can be with or without runtime PM. + * vfio_pci_runtime_pm_exit() will internally increment the usage + * count corresponding to pm_runtime_put() called during low power + * feature entry and then pm_runtime_resume() will wake up the device, + * if the device has already gone into the suspended state. Otherwise, + * the vfio_pci_set_power_state() will change the device power state + * to D0. + */ + vfio_pci_runtime_pm_exit(vdev); + pm_runtime_resume(&pdev->dev); + + /* * This function calls __pci_reset_function_locked() which internally * can use pci_pm_reset() for the function reset. pci_pm_reset() will * fail if the power state is non-D0. 
Also, for the devices which @@ -1273,6 +1374,10 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz) { switch (flags & VFIO_DEVICE_FEATURE_MASK) { + case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY: + return vfio_pci_core_pm_entry(device, flags, arg, argsz); + case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT: + return vfio_pci_core_pm_exit(device, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: return vfio_pci_core_feature_token(device, flags, arg, argsz); default: @@ -1285,31 +1390,47 @@ static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + int ret; if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; + ret = pm_runtime_resume_and_get(&vdev->pdev->dev); + if (ret) { + pci_info_ratelimited(vdev->pdev, "runtime resume failed %d\n", + ret); + return -EIO; + } + switch (index) { case VFIO_PCI_CONFIG_REGION_INDEX: - return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); + ret = vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); + break; case VFIO_PCI_ROM_REGION_INDEX: if (iswrite) - return -EINVAL; - return vfio_pci_bar_rw(vdev, buf, count, ppos, false); + ret = -EINVAL; + else + ret = vfio_pci_bar_rw(vdev, buf, count, ppos, false); + break; case VFIO_PCI_BAR0_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: - return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); + ret = vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); + break; case VFIO_PCI_VGA_REGION_INDEX: - return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); + ret = vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); + break; + default: index -= VFIO_PCI_NUM_REGIONS; - return vdev->region[index].ops->rw(vdev, buf, + ret = vdev->region[index].ops->rw(vdev, buf, count, ppos, iswrite); + break; } - return -EINVAL; + pm_runtime_put(&vdev->pdev->dev); + return ret; } ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, @@ -1504,7 +1625,11 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) mutex_lock(&vdev->vma_lock); down_read(&vdev->memory_lock); - if (!__vfio_pci_memory_enabled(vdev)) { + /* + * Memory region cannot be accessed if the low power feature is engaged + * or memory access is disabled. + */ + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) { ret = VM_FAULT_SIGBUS; goto up_out; } @@ -2219,6 +2344,15 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, goto err_unlock; } + /* + * Some of the devices in the dev_set can be in the runtime suspended + * state. Increment the usage count for all the devices in the dev_set + * before reset and decrement the same after reset. 
+ */ + ret = vfio_pci_dev_set_pm_runtime_get(dev_set); + if (ret) + goto err_unlock; + list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { /* * Test whether all the affected devices are contained by the @@ -2274,6 +2408,9 @@ err_undo: else mutex_unlock(&cur->vma_lock); } + + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) + pm_runtime_put(&cur->pdev->dev); err_unlock: mutex_unlock(&dev_set->lock); return ret; diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index a0f1f36e42a20..1025d53fde0bf 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -79,6 +79,7 @@ struct vfio_pci_core_device { bool nointx; bool needs_pm_restore; bool pm_intx_masked; + bool pm_runtime_engaged; struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; -- GitLab From 453e6c98fd2bdae0df9adbc86af8d8bf1164edd5 Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:50 +0530 Subject: [PATCH 0132/2223] vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP This patch implements VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP device feature. In the VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY, if there is any access for the VFIO device on the host side, then the device will be moved out of the low power state without the user's guest driver involvement. Once the device access has been finished, then the host can move the device again into low power state. With the low power entry happened through VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP, the device will not be moved back into the low power state and a notification will be sent to the user by triggering wakeup eventfd. vfio_pci_core_pm_entry() will be called for both the variants of low power feature entry so add an extra argument for wakeup eventfd context and store locally in 'struct vfio_pci_core_device'. 
For the entry happened without wakeup eventfd, all the exit related handling will be done by the LOW_POWER_EXIT device feature only. When the LOW_POWER_EXIT will be called, then the vfio core layer vfio_device_pm_runtime_get() will increment the usage count and will resume the device. In the driver runtime_resume callback, the 'pm_wake_eventfd_ctx' will be NULL. Then vfio_pci_core_pm_exit() will call vfio_pci_runtime_pm_exit() and all the exit related handling will be done. For the entry happened with wakeup eventfd, in the driver resume callback, eventfd will be triggered and all the exit related handling will be done. When vfio_pci_runtime_pm_exit() will be called by vfio_pci_core_pm_exit(), then it will return early. But if the runtime suspend has not happened on the host side, then all the exit related handling will be done in vfio_pci_core_pm_exit() only. Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-6-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 63 ++++++++++++++++++++++++++++++-- include/linux/vfio_pci_core.h | 1 + 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 9c612162653fb..0d4b49f06b149 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -277,7 +277,8 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat return ret; } -static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev) +static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, + struct eventfd_ctx *efdctx) { /* * The vdev power related flags are protected with 'memory_lock' @@ -290,6 +291,7 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev) } vdev->pm_runtime_engaged = true; + vdev->pm_wake_eventfd_ctx = efdctx; pm_runtime_put_noidle(&vdev->pdev->dev); up_write(&vdev->memory_lock); @@ -313,7 +315,40 @@ static int 
vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, * while returning from the ioctl and then the device can go into * runtime suspended state. */ - return vfio_pci_runtime_pm_entry(vdev); + return vfio_pci_runtime_pm_entry(vdev, NULL); +} + +static int vfio_pci_core_pm_entry_with_wakeup( + struct vfio_device *device, u32 flags, + struct vfio_device_low_power_entry_with_wakeup __user *arg, + size_t argsz) +{ + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + struct vfio_device_low_power_entry_with_wakeup entry; + struct eventfd_ctx *efdctx; + int ret; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, + sizeof(entry)); + if (ret != 1) + return ret; + + if (copy_from_user(&entry, arg, sizeof(entry))) + return -EFAULT; + + if (entry.wakeup_eventfd < 0) + return -EINVAL; + + efdctx = eventfd_ctx_fdget(entry.wakeup_eventfd); + if (IS_ERR(efdctx)) + return PTR_ERR(efdctx); + + ret = vfio_pci_runtime_pm_entry(vdev, efdctx); + if (ret) + eventfd_ctx_put(efdctx); + + return ret; } static void __vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) @@ -321,6 +356,11 @@ static void __vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) if (vdev->pm_runtime_engaged) { vdev->pm_runtime_engaged = false; pm_runtime_get_noresume(&vdev->pdev->dev); + + if (vdev->pm_wake_eventfd_ctx) { + eventfd_ctx_put(vdev->pm_wake_eventfd_ctx); + vdev->pm_wake_eventfd_ctx = NULL; + } } } @@ -348,7 +388,10 @@ static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags, /* * The device is always in the active state here due to pm wrappers - * around ioctls. + * around ioctls. If the device had entered a low power state and + * pm_wake_eventfd_ctx is valid, vfio_pci_core_runtime_resume() has + * already signaled the eventfd and exited low power mode itself. + * pm_runtime_engaged protects the redundant call here. 
*/ vfio_pci_runtime_pm_exit(vdev); return 0; @@ -388,6 +431,17 @@ static int vfio_pci_core_runtime_resume(struct device *dev) { struct vfio_pci_core_device *vdev = dev_get_drvdata(dev); + /* + * Resume with a pm_wake_eventfd_ctx signals the eventfd and exit + * low power mode. + */ + down_write(&vdev->memory_lock); + if (vdev->pm_wake_eventfd_ctx) { + eventfd_signal(vdev->pm_wake_eventfd_ctx, 1); + __vfio_pci_runtime_pm_exit(vdev); + } + up_write(&vdev->memory_lock); + if (vdev->pm_intx_masked) vfio_pci_intx_unmask(vdev); @@ -1376,6 +1430,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, switch (flags & VFIO_DEVICE_FEATURE_MASK) { case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY: return vfio_pci_core_pm_entry(device, flags, arg, argsz); + case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP: + return vfio_pci_core_pm_entry_with_wakeup(device, flags, + arg, argsz); case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT: return vfio_pci_core_pm_exit(device, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 1025d53fde0bf..089b603bcfdca 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -85,6 +85,7 @@ struct vfio_pci_core_device { int ioeventfds_nr; struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; + struct eventfd_ctx *pm_wake_eventfd_ctx; struct list_head dummy_resources_list; struct mutex ioeventfds_lock; struct list_head ioeventfds_list; -- GitLab From 21c13829bc3b786bc5336470df023ae54e41d230 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 16 Aug 2022 16:13:04 -0300 Subject: [PATCH 0133/2223] vfio: Remove vfio_group dev_counter This counts the number of devices attached to a vfio_group, ie the number of items in the group->device_list. It is only read in vfio_pin_pages(), as some kind of protection against limitations in type1. 
However, with all the code cleanups in this area, now that vfio_pin_pages() accepts a vfio_device directly it is redundant. All drivers are already calling vfio_register_emulated_iommu_dev() which directly creates a group specifically for the device and thus it is guaranteed that there is a singleton group. Leave a note in the comment about this requirement and remove the logic. Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/0-v2-d4374a7bf0c9+c4-vfio_dev_counter_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 5edc49748013d..77264d836d520 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -75,7 +75,6 @@ struct vfio_group { struct list_head vfio_next; struct list_head container_next; enum vfio_group_type type; - unsigned int dev_counter; struct rw_semaphore group_rwsem; struct kvm *kvm; struct file *opened_file; @@ -609,7 +608,6 @@ static int __vfio_register_dev(struct vfio_device *device, mutex_lock(&group->device_lock); list_add(&device->group_next, &group->device_list); - group->dev_counter++; mutex_unlock(&group->device_lock); return 0; @@ -697,7 +695,6 @@ void vfio_unregister_group_dev(struct vfio_device *device) mutex_lock(&group->device_lock); list_del(&device->group_next); - group->dev_counter--; mutex_unlock(&group->device_lock); if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) @@ -1991,6 +1988,9 @@ EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); * @prot [in] : protection flags * @pages[out] : array of host pages * Return error or number of pages pinned. + * + * A driver may only call this function if the vfio_device was created + * by vfio_register_emulated_iommu_dev(). 
*/ int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, int npage, int prot, struct page **pages) @@ -2006,9 +2006,6 @@ int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) return -E2BIG; - if (group->dev_counter > 1) - return -EINVAL; - /* group->container cannot change while a vfio device is open */ container = group->container; driver = container->iommu_driver; -- GitLab From 245898eb9275ce31942cff95d0bdc7412ad3d589 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 31 Aug 2022 09:59:43 +0100 Subject: [PATCH 0134/2223] hisi_acc_vfio_pci: Correct the function prefix for hssi_acc_drvdata() Commit 91be0bd6c6cf ("vfio/pci: Have all VFIO PCI drivers store the vfio_pci_core_device in drvdata") introduced a helper function to retrieve the drvdata but used "hssi" instead of "hisi" for the function prefix. Correct that and, while at it, move the function a bit further down so that it is close to the other hisi_ prefixed functions. No functional changes.
Signed-off-by: Shameer Kolothum Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220831085943.993-1-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index ea762e28c1cc6..258cae0863eae 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -337,14 +337,6 @@ static int vf_qm_cache_wb(struct hisi_qm *qm) return 0; } -static struct hisi_acc_vf_core_device *hssi_acc_drvdata(struct pci_dev *pdev) -{ - struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); - - return container_of(core_device, struct hisi_acc_vf_core_device, - core_device); -} - static void vf_qm_fun_reset(struct hisi_acc_vf_core_device *hisi_acc_vdev, struct hisi_qm *qm) { @@ -552,6 +544,14 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev, return 0; } +static struct hisi_acc_vf_core_device *hisi_acc_drvdata(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + + return container_of(core_device, struct hisi_acc_vf_core_device, + core_device); +} + /* Check the PF's RAS state and Function INT state */ static int hisi_acc_check_int_state(struct hisi_acc_vf_core_device *hisi_acc_vdev) @@ -970,7 +970,7 @@ hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev, static void hisi_acc_vf_pci_aer_reset_done(struct pci_dev *pdev) { - struct hisi_acc_vf_core_device *hisi_acc_vdev = hssi_acc_drvdata(pdev); + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev); if (hisi_acc_vdev->core_device.vdev.migration_flags != VFIO_MIGRATION_STOP_COPY) @@ -1301,7 +1301,7 @@ out_free: static void hisi_acc_vfio_pci_remove(struct pci_dev *pdev) { - struct 
hisi_acc_vf_core_device *hisi_acc_vdev = hssi_acc_drvdata(pdev); + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev); vfio_pci_core_unregister_device(&hisi_acc_vdev->core_device); vfio_pci_core_uninit_device(&hisi_acc_vdev->core_device); -- GitLab From f23b373f30fc9a8e77dec82d2a3d583c1eef155e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Aug 2022 20:58:50 +0300 Subject: [PATCH 0135/2223] pinctrl: mcp23s08: Drop assignment of default number of OF cells The GPIO library code will assign default value for number of OF cells, no need to repeat this in the driver. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220830175850.44770-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-mcp23s08.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/pinctrl/pinctrl-mcp23s08.c b/drivers/pinctrl/pinctrl-mcp23s08.c index 695236636d057..5f356edfd0fd5 100644 --- a/drivers/pinctrl/pinctrl-mcp23s08.c +++ b/drivers/pinctrl/pinctrl-mcp23s08.c @@ -549,9 +549,6 @@ int mcp23s08_probe_one(struct mcp23s08 *mcp, struct device *dev, mcp->chip.get = mcp23s08_get; mcp->chip.direction_output = mcp23s08_direction_output; mcp->chip.set = mcp23s08_set; -#ifdef CONFIG_OF_GPIO - mcp->chip.of_gpio_n_cells = 2; -#endif mcp->chip.base = base; mcp->chip.can_sleep = true; -- GitLab From 7fec8c9ceeedbe29be64c1b0a0610d40de39fcf8 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 31 Aug 2022 16:56:34 +0300 Subject: [PATCH 0136/2223] pinctrl: at91: use kernel-doc style for documentation of at91_gpio_chip Use kernel-doc style for documentation of struct at91_gpio_chip. 
Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20220831135636.3176406-2-claudiu.beznea@microchip.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-at91.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/drivers/pinctrl/pinctrl-at91.c b/drivers/pinctrl/pinctrl-at91.c index 5634fa063ebfe..0b78fd48fd026 100644 --- a/drivers/pinctrl/pinctrl-at91.c +++ b/drivers/pinctrl/pinctrl-at91.c @@ -33,16 +33,28 @@ struct at91_pinctrl_mux_ops; +/** + * struct at91_gpio_chip: at91 gpio chip + * @chip: gpio chip + * @range: gpio range + * @next: bank sharing same clock + * @pioc_hwirq: PIO bank interrupt identifier on AIC + * @pioc_virq: PIO bank Linux virtual interrupt + * @pioc_idx: PIO bank index + * @regbase: PIO bank virtual address + * @clock: associated clock + * @ops: at91 pinctrl mux ops + */ struct at91_gpio_chip { struct gpio_chip chip; struct pinctrl_gpio_range range; - struct at91_gpio_chip *next; /* Bank sharing same clock */ - int pioc_hwirq; /* PIO bank interrupt identifier on AIC */ - int pioc_virq; /* PIO bank Linux virtual interrupt */ - int pioc_idx; /* PIO bank index */ - void __iomem *regbase; /* PIO bank virtual address */ - struct clk *clock; /* associated clock */ - const struct at91_pinctrl_mux_ops *ops; /* ops */ + struct at91_gpio_chip *next; + int pioc_hwirq; + int pioc_virq; + int pioc_idx; + void __iomem *regbase; + struct clk *clock; + const struct at91_pinctrl_mux_ops *ops; }; static struct at91_gpio_chip *gpio_chips[MAX_GPIO_BANKS]; -- GitLab From a575207583676298f3999d41d86d81f7172fe950 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 31 Aug 2022 16:56:35 +0300 Subject: [PATCH 0137/2223] pinctrl: at91: move gpio suspend/resume calls to driver's context Move gpio suspend/resume execution local to driver and let it execute as close as possible to the moment the machine specific PM code is executed (by setting it to .noirq member of dev_pm_ops). 
With this the at91_pinctrl_gpio_suspend()/at91_pinctrl_gpio_resume() calls were removed from arch/arm/mach-at91/pm.c and also a header has been removed. The patch has been checked on sama5d3_xplained, sam9x60ek, sama5d2_xplained, sama7g5ek boards. Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20220831135636.3176406-3-claudiu.beznea@microchip.com Signed-off-by: Linus Walleij --- arch/arm/mach-at91/pm.c | 15 ------- drivers/pinctrl/pinctrl-at91.c | 79 ++++++++++++++++------------------ include/soc/at91/pm.h | 16 ------- 3 files changed, 36 insertions(+), 74 deletions(-) delete mode 100644 include/soc/at91/pm.h diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index df6d673e83d56..a695710142dba 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -19,8 +19,6 @@ #include #include -#include - #include #include #include @@ -624,16 +622,6 @@ static int at91_pm_enter(suspend_state_t state) if (ret) return ret; -#ifdef CONFIG_PINCTRL_AT91 - /* - * FIXME: this is needed to communicate between the pinctrl driver and - * the PM implementation in the machine. Possibly part of the PM - * implementation should be moved down into the pinctrl driver and get - * called as part of the generic suspend/resume path. 
- */ - at91_pinctrl_gpio_suspend(); -#endif - switch (state) { case PM_SUSPEND_MEM: case PM_SUSPEND_STANDBY: @@ -658,9 +646,6 @@ static int at91_pm_enter(suspend_state_t state) } error: -#ifdef CONFIG_PINCTRL_AT91 - at91_pinctrl_gpio_resume(); -#endif at91_pm_config_quirks(false); return 0; } diff --git a/drivers/pinctrl/pinctrl-at91.c b/drivers/pinctrl/pinctrl-at91.c index 0b78fd48fd026..631a6289c2b61 100644 --- a/drivers/pinctrl/pinctrl-at91.c +++ b/drivers/pinctrl/pinctrl-at91.c @@ -22,8 +22,7 @@ #include /* Since we request GPIOs from ourself */ #include - -#include +#include #include "pinctrl-at91.h" #include "core.h" @@ -44,6 +43,9 @@ struct at91_pinctrl_mux_ops; * @regbase: PIO bank virtual address * @clock: associated clock * @ops: at91 pinctrl mux ops + * @wakeups: wakeup interrupts + * @backups: interrupts disabled in suspend + * @id: gpio chip identifier */ struct at91_gpio_chip { struct gpio_chip chip; @@ -55,6 +57,9 @@ struct at91_gpio_chip { void __iomem *regbase; struct clk *clock; const struct at91_pinctrl_mux_ops *ops; + u32 wakeups; + u32 backups; + u32 id; }; static struct at91_gpio_chip *gpio_chips[MAX_GPIO_BANKS]; @@ -1627,70 +1632,51 @@ static void gpio_irq_ack(struct irq_data *d) /* the interrupt is already cleared before by reading ISR */ } -static u32 wakeups[MAX_GPIO_BANKS]; -static u32 backups[MAX_GPIO_BANKS]; - static int gpio_irq_set_wake(struct irq_data *d, unsigned state) { struct at91_gpio_chip *at91_gpio = irq_data_get_irq_chip_data(d); - unsigned bank = at91_gpio->pioc_idx; unsigned mask = 1 << d->hwirq; - if (unlikely(bank >= MAX_GPIO_BANKS)) - return -EINVAL; - if (state) - wakeups[bank] |= mask; + at91_gpio->wakeups |= mask; else - wakeups[bank] &= ~mask; + at91_gpio->wakeups &= ~mask; irq_set_irq_wake(at91_gpio->pioc_virq, state); return 0; } -void at91_pinctrl_gpio_suspend(void) +static int at91_gpio_suspend(struct device *dev) { - int i; + struct at91_gpio_chip *at91_chip = dev_get_drvdata(dev); + void __iomem *pio = 
at91_chip->regbase; - for (i = 0; i < gpio_banks; i++) { - void __iomem *pio; + at91_chip->backups = readl_relaxed(pio + PIO_IMR); + writel_relaxed(at91_chip->backups, pio + PIO_IDR); + writel_relaxed(at91_chip->wakeups, pio + PIO_IER); - if (!gpio_chips[i]) - continue; - - pio = gpio_chips[i]->regbase; - - backups[i] = readl_relaxed(pio + PIO_IMR); - writel_relaxed(backups[i], pio + PIO_IDR); - writel_relaxed(wakeups[i], pio + PIO_IER); + if (!at91_chip->wakeups) + clk_disable_unprepare(at91_chip->clock); + else + printk(KERN_DEBUG "GPIO-%c may wake for %08x\n", + 'A' + at91_chip->id, at91_chip->wakeups); - if (!wakeups[i]) - clk_disable_unprepare(gpio_chips[i]->clock); - else - printk(KERN_DEBUG "GPIO-%c may wake for %08x\n", - 'A'+i, wakeups[i]); - } + return 0; } -void at91_pinctrl_gpio_resume(void) +static int at91_gpio_resume(struct device *dev) { - int i; + struct at91_gpio_chip *at91_chip = dev_get_drvdata(dev); + void __iomem *pio = at91_chip->regbase; - for (i = 0; i < gpio_banks; i++) { - void __iomem *pio; - - if (!gpio_chips[i]) - continue; + if (!at91_chip->wakeups) + clk_prepare_enable(at91_chip->clock); - pio = gpio_chips[i]->regbase; + writel_relaxed(at91_chip->wakeups, pio + PIO_IDR); + writel_relaxed(at91_chip->backups, pio + PIO_IER); - if (!wakeups[i]) - clk_prepare_enable(gpio_chips[i]->clock); - - writel_relaxed(wakeups[i], pio + PIO_IDR); - writel_relaxed(backups[i], pio + PIO_IER); - } + return 0; } static void gpio_irq_handler(struct irq_desc *desc) @@ -1872,6 +1858,7 @@ static int at91_gpio_probe(struct platform_device *pdev) } at91_chip->chip = at91_gpio_template; + at91_chip->id = alias_idx; chip = &at91_chip->chip; chip->label = dev_name(&pdev->dev); @@ -1917,6 +1904,7 @@ static int at91_gpio_probe(struct platform_device *pdev) goto gpiochip_add_err; gpio_chips[alias_idx] = at91_chip; + platform_set_drvdata(pdev, at91_chip); gpio_banks = max(gpio_banks, alias_idx + 1); dev_info(&pdev->dev, "at address %p\n", at91_chip->regbase); @@ 
-1932,10 +1920,15 @@ err: return ret; } +static const struct dev_pm_ops at91_gpio_pm_ops = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(at91_gpio_suspend, at91_gpio_resume) +}; + static struct platform_driver at91_gpio_driver = { .driver = { .name = "gpio-at91", .of_match_table = at91_gpio_of_match, + .pm = pm_ptr(&at91_gpio_pm_ops), }, .probe = at91_gpio_probe, }; diff --git a/include/soc/at91/pm.h b/include/soc/at91/pm.h deleted file mode 100644 index 7a41e53a3ffa3..0000000000000 --- a/include/soc/at91/pm.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Atmel Power Management - * - * Copyright (C) 2020 Atmel - * - * Author: Lee Jones - */ - -#ifndef __SOC_ATMEL_PM_H -#define __SOC_ATMEL_PM_H - -void at91_pinctrl_gpio_suspend(void); -void at91_pinctrl_gpio_resume(void); - -#endif /* __SOC_ATMEL_PM_H */ -- GitLab From 42eae17d56079591f945e409a4159e750ccc57df Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 31 Aug 2022 16:56:36 +0300 Subject: [PATCH 0138/2223] pinctrl: at91: use dev_dbg() instead of printk() Use dev_dbg() instead of printk(KERN_DEBUG) to avoid the following checkpatch.pl warning: "Prefer [subsystem eg: netdev]_dbg([subsystem]dev, ... then dev_dbg(dev, ... then pr_debug(... to printk(KERN_DEBUG ...". 
Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20220831135636.3176406-4-claudiu.beznea@microchip.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-at91.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/pinctrl-at91.c b/drivers/pinctrl/pinctrl-at91.c index 631a6289c2b61..81dbffab621fb 100644 --- a/drivers/pinctrl/pinctrl-at91.c +++ b/drivers/pinctrl/pinctrl-at91.c @@ -1659,8 +1659,8 @@ static int at91_gpio_suspend(struct device *dev) if (!at91_chip->wakeups) clk_disable_unprepare(at91_chip->clock); else - printk(KERN_DEBUG "GPIO-%c may wake for %08x\n", - 'A' + at91_chip->id, at91_chip->wakeups); + dev_dbg(dev, "GPIO-%c may wake for %08x\n", + 'A' + at91_chip->id, at91_chip->wakeups); return 0; } -- GitLab From 1074e1d23a5c201b6558878a09f1d2b7c9506835 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 31 Aug 2022 16:55:12 +0300 Subject: [PATCH 0139/2223] pinctrl: pistachio: Switch to use fwnode instead of of_node GPIO library now accepts fwnode as a firmware node, so switch the driver to use it.
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220831135512.78407-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-pistachio.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/pinctrl/pinctrl-pistachio.c b/drivers/pinctrl/pinctrl-pistachio.c index 5de691c630b4f..940ed3fff63a8 100644 --- a/drivers/pinctrl/pinctrl-pistachio.c +++ b/drivers/pinctrl/pinctrl-pistachio.c @@ -10,13 +10,13 @@ #include #include #include -#include -#include +#include #include #include #include #include #include +#include #include #include @@ -1347,46 +1347,45 @@ static struct pistachio_gpio_bank pistachio_gpio_banks[] = { static int pistachio_gpio_register(struct pistachio_pinctrl *pctl) { - struct device_node *node = pctl->dev->of_node; struct pistachio_gpio_bank *bank; unsigned int i; int irq, ret = 0; for (i = 0; i < pctl->nbanks; i++) { char child_name[sizeof("gpioXX")]; - struct device_node *child; + struct fwnode_handle *child; struct gpio_irq_chip *girq; snprintf(child_name, sizeof(child_name), "gpio%d", i); - child = of_get_child_by_name(node, child_name); + child = device_get_named_child_node(pctl->dev, child_name); if (!child) { dev_err(pctl->dev, "No node for bank %u\n", i); ret = -ENODEV; goto err; } - if (!of_find_property(child, "gpio-controller", NULL)) { + if (!fwnode_property_present(child, "gpio-controller")) { + fwnode_handle_put(child); dev_err(pctl->dev, "No gpio-controller property for bank %u\n", i); - of_node_put(child); ret = -ENODEV; goto err; } - irq = irq_of_parse_and_map(child, 0); - if (!irq) { + ret = fwnode_irq_get(child, 0); + if (ret < 0) { + fwnode_handle_put(child); dev_err(pctl->dev, "No IRQ for bank %u\n", i); - of_node_put(child); - ret = -EINVAL; goto err; } + irq = ret; bank = &pctl->gpio_banks[i]; bank->pctl = pctl; bank->base = pctl->base + GPIO_BANK_BASE(i); bank->gpio_chip.parent = pctl->dev; - bank->gpio_chip.of_node = child; + 
bank->gpio_chip.fwnode = child; girq = &bank->gpio_chip.irq; girq->chip = &bank->irq_chip; -- GitLab From c99e3ac632f9dfa4e363cf370dea7467ebb0f367 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 2 Sep 2022 22:11:17 -0700 Subject: [PATCH 0140/2223] Input: atkbd - switch to using dev_groups for driver-specific attributes The driver core now has the ability to handle the creation and removal of device-specific sysfs files, let's use it instead of registering and unregistering attributes by hand. Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20220903051119.1332808-1-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/atkbd.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/input/keyboard/atkbd.c b/drivers/input/keyboard/atkbd.c index d4131236d18c5..246958795f606 100644 --- a/drivers/input/keyboard/atkbd.c +++ b/drivers/input/keyboard/atkbd.c @@ -323,11 +323,13 @@ static umode_t atkbd_attr_is_visible(struct kobject *kobj, return attr->mode; } -static struct attribute_group atkbd_attribute_group = { +static const struct attribute_group atkbd_attribute_group = { .attrs = atkbd_attributes, .is_visible = atkbd_attr_is_visible, }; +__ATTRIBUTE_GROUPS(atkbd_attribute); + static const unsigned int xl_table[] = { ATKBD_RET_BAT, ATKBD_RET_ERR, ATKBD_RET_ACK, ATKBD_RET_NAK, ATKBD_RET_HANJA, ATKBD_RET_HANGEUL, @@ -922,8 +924,6 @@ static void atkbd_disconnect(struct serio *serio) { struct atkbd *atkbd = serio_get_drvdata(serio); - sysfs_remove_group(&serio->dev.kobj, &atkbd_attribute_group); - atkbd_disable(atkbd); input_unregister_device(atkbd->dev); @@ -1271,21 +1271,16 @@ static int atkbd_connect(struct serio *serio, struct serio_driver *drv) atkbd_set_keycode_table(atkbd); atkbd_set_device_attrs(atkbd); - err = sysfs_create_group(&serio->dev.kobj, &atkbd_attribute_group); - if (err) - goto fail3; - atkbd_enable(atkbd); if (serio->write) atkbd_activate(atkbd); err = 
input_register_device(atkbd->dev); if (err) - goto fail4; + goto fail3; return 0; - fail4: sysfs_remove_group(&serio->dev.kobj, &atkbd_attribute_group); fail3: serio_close(serio); fail2: serio_set_drvdata(serio, NULL); fail1: input_free_device(dev); @@ -1378,7 +1373,8 @@ MODULE_DEVICE_TABLE(serio, atkbd_serio_ids); static struct serio_driver atkbd_drv = { .driver = { - .name = "atkbd", + .name = "atkbd", + .dev_groups = atkbd_attribute_groups, }, .description = DRIVER_DESC, .id_table = atkbd_serio_ids, -- GitLab From fd30a4ba81f94e7297a3fb3c0d83879a4e4b2591 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 2 Sep 2022 22:11:18 -0700 Subject: [PATCH 0141/2223] Input: psmouse - switch to using dev_groups for driver-specific attributes The driver core now has the ability to handle the creation and removal of device-specific sysfs files, let's use it instead of registering and unregistering attributes by hand. Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20220903051119.1332808-2-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/psmouse-base.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/drivers/input/mouse/psmouse-base.c b/drivers/input/mouse/psmouse-base.c index 0b4a3039f312f..c9a7e87b273ed 100644 --- a/drivers/input/mouse/psmouse-base.c +++ b/drivers/input/mouse/psmouse-base.c @@ -94,7 +94,7 @@ PSMOUSE_DEFINE_ATTR(resync_time, S_IWUSR | S_IRUGO, (void *) offsetof(struct psmouse, resync_time), psmouse_show_int_attr, psmouse_set_int_attr); -static struct attribute *psmouse_attributes[] = { +static struct attribute *psmouse_dev_attrs[] = { &psmouse_attr_protocol.dattr.attr, &psmouse_attr_rate.dattr.attr, &psmouse_attr_resolution.dattr.attr, @@ -103,9 +103,7 @@ static struct attribute *psmouse_attributes[] = { NULL }; -static const struct attribute_group psmouse_attribute_group = { - .attrs = psmouse_attributes, -}; +ATTRIBUTE_GROUPS(psmouse_dev); /* * psmouse_mutex 
protects all operations changing state of mouse @@ -1481,8 +1479,6 @@ static void psmouse_disconnect(struct serio *serio) struct psmouse *psmouse = serio_get_drvdata(serio); struct psmouse *parent = NULL; - sysfs_remove_group(&serio->dev.kobj, &psmouse_attribute_group); - mutex_lock(&psmouse_mutex); psmouse_set_state(psmouse, PSMOUSE_CMD_MODE); @@ -1647,10 +1643,6 @@ static int psmouse_connect(struct serio *serio, struct serio_driver *drv) if (parent && parent->pt_activate) parent->pt_activate(parent); - error = sysfs_create_group(&serio->dev.kobj, &psmouse_attribute_group); - if (error) - goto err_pt_deactivate; - /* * PS/2 devices having SMBus companions should stay disabled * on PS/2 side, in order to have SMBus part operable. @@ -1666,13 +1658,6 @@ static int psmouse_connect(struct serio *serio, struct serio_driver *drv) mutex_unlock(&psmouse_mutex); return retval; - err_pt_deactivate: - if (parent && parent->pt_deactivate) - parent->pt_deactivate(parent); - if (input_dev) { - input_unregister_device(input_dev); - input_dev = NULL; /* so we don't try to free it below */ - } err_protocol_disconnect: if (psmouse->disconnect) psmouse->disconnect(psmouse); @@ -1791,7 +1776,8 @@ MODULE_DEVICE_TABLE(serio, psmouse_serio_ids); static struct serio_driver psmouse_drv = { .driver = { - .name = "psmouse", + .name = "psmouse", + .dev_groups = psmouse_dev_groups, }, .description = DRIVER_DESC, .id_table = psmouse_serio_ids, -- GitLab From f4e7a254299bcdfe7bced700a7d96690b1b9a6f2 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 2 Sep 2022 22:11:19 -0700 Subject: [PATCH 0142/2223] Input: aiptek - switch to using dev_groups for driver-specific attributes The driver core now has the ability to handle the creation and removal of device-specific sysfs files, let's use it instead of registering and unregistering attributes by hand. 
Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20220903051119.1332808-3-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/tablet/aiptek.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/drivers/input/tablet/aiptek.c b/drivers/input/tablet/aiptek.c index 24ec4844a5c3e..baabc51547b83 100644 --- a/drivers/input/tablet/aiptek.c +++ b/drivers/input/tablet/aiptek.c @@ -1617,7 +1617,7 @@ static ssize_t show_firmwareCode(struct device *dev, struct device_attribute *at static DEVICE_ATTR(firmware_code, S_IRUGO, show_firmwareCode, NULL); -static struct attribute *aiptek_attributes[] = { +static struct attribute *aiptek_dev_attrs[] = { &dev_attr_size.attr, &dev_attr_pointer_mode.attr, &dev_attr_coordinate_mode.attr, @@ -1641,9 +1641,7 @@ static struct attribute *aiptek_attributes[] = { NULL }; -static const struct attribute_group aiptek_attribute_group = { - .attrs = aiptek_attributes, -}; +ATTRIBUTE_GROUPS(aiptek_dev); /*********************************************************************** * This routine is called when a tablet has been identified. 
It basically @@ -1842,26 +1840,16 @@ aiptek_probe(struct usb_interface *intf, const struct usb_device_id *id) */ usb_set_intfdata(intf, aiptek); - /* Set up the sysfs files - */ - err = sysfs_create_group(&intf->dev.kobj, &aiptek_attribute_group); - if (err) { - dev_warn(&intf->dev, "cannot create sysfs group err: %d\n", - err); - goto fail3; - } - /* Register the tablet as an Input Device */ err = input_register_device(aiptek->inputdev); if (err) { dev_warn(&intf->dev, "input_register_device returned err: %d\n", err); - goto fail4; + goto fail3; } return 0; - fail4: sysfs_remove_group(&intf->dev.kobj, &aiptek_attribute_group); fail3: usb_free_urb(aiptek->urb); fail2: usb_free_coherent(usbdev, AIPTEK_PACKET_LENGTH, aiptek->data, aiptek->data_dma); @@ -1886,7 +1874,6 @@ static void aiptek_disconnect(struct usb_interface *intf) */ usb_kill_urb(aiptek->urb); input_unregister_device(aiptek->inputdev); - sysfs_remove_group(&intf->dev.kobj, &aiptek_attribute_group); usb_free_urb(aiptek->urb); usb_free_coherent(interface_to_usbdev(intf), AIPTEK_PACKET_LENGTH, @@ -1900,6 +1887,7 @@ static struct usb_driver aiptek_driver = { .probe = aiptek_probe, .disconnect = aiptek_disconnect, .id_table = aiptek_ids, + .dev_groups = aiptek_dev_groups, }; module_usb_driver(aiptek_driver); -- GitLab From 35e49953c31d85d5d942af611d5b9090b0dc8cfa Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 3 Sep 2022 23:23:37 -0500 Subject: [PATCH 0143/2223] memblock tests: remove 'cleared' from comment blocks The tests in alloc_nid_api can now run either memblock_alloc_try_nid() or memblock_alloc_try_nid_raw(). The comment blocks for these tests should not refer to a 'cleared' region since that only applies to memblock_alloc_try_nid(). Remove 'cleared' from the comment blocks so that the comments are accurate for either memblock function. 
Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/e8be24137e54e9f81a06af969ded82b319114d7a.1662264347.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_nid_api.c | 22 ++++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c index 32b3c1594fddd..d89741dd3d464 100644 --- a/tools/testing/memblock/tests/alloc_nid_api.c +++ b/tools/testing/memblock/tests/alloc_nid_api.c @@ -33,7 +33,7 @@ static inline void *run_memblock_alloc_try_nid(phys_addr_t size, * | | * min_addr max_addr * - * Expect to allocate a cleared region that ends at max_addr. + * Expect to allocate a region that ends at max_addr. */ static int alloc_try_nid_top_down_simple_check(void) { @@ -87,7 +87,7 @@ static int alloc_try_nid_top_down_simple_check(void) * Aligned address * boundary * - * Expect to allocate a cleared, aligned region that ends before max_addr. + * Expect to allocate an aligned region that ends before max_addr. */ static int alloc_try_nid_top_down_end_misaligned_check(void) { @@ -139,7 +139,7 @@ static int alloc_try_nid_top_down_end_misaligned_check(void) * | | * min_addr max_addr * - * Expect to allocate a cleared region that starts at min_addr and ends at + * Expect to allocate a region that starts at min_addr and ends at * max_addr, given that min_addr is aligned. */ static int alloc_try_nid_exact_address_generic_check(void) @@ -193,7 +193,7 @@ static int alloc_try_nid_exact_address_generic_check(void) * address | * boundary min_add * - * Expect to drop the lower limit and allocate a cleared memory region which + * Expect to drop the lower limit and allocate a memory region which * ends at max_addr (if the address is aligned). 
*/ static int alloc_try_nid_top_down_narrow_range_check(void) @@ -641,7 +641,7 @@ static int alloc_try_nid_reserved_all_generic_check(void) /* * A test that tries to allocate a memory region, where max_addr is * bigger than the end address of the available memory. Expect to allocate - * a cleared region that ends before the end of the memory. + * a region that ends before the end of the memory. */ static int alloc_try_nid_top_down_cap_max_check(void) { @@ -680,7 +680,7 @@ static int alloc_try_nid_top_down_cap_max_check(void) /* * A test that tries to allocate a memory region, where min_addr is * smaller than the start address of the available memory. Expect to allocate - * a cleared region that ends before the end of the memory. + * a region that ends before the end of the memory. */ static int alloc_try_nid_top_down_cap_min_check(void) { @@ -728,7 +728,7 @@ static int alloc_try_nid_top_down_cap_min_check(void) * | | * min_addr max_addr * - * Expect to allocate a cleared region that ends before max_addr. + * Expect to allocate a region that ends before max_addr. */ static int alloc_try_nid_bottom_up_simple_check(void) { @@ -782,7 +782,7 @@ static int alloc_try_nid_bottom_up_simple_check(void) * Aligned address * boundary * - * Expect to allocate a cleared, aligned region that ends before max_addr. + * Expect to allocate an aligned region that ends before max_addr. */ static int alloc_try_nid_bottom_up_start_misaligned_check(void) { @@ -836,7 +836,7 @@ static int alloc_try_nid_bottom_up_start_misaligned_check(void) * | * min_add * - * Expect to drop the lower limit and allocate a cleared memory region which + * Expect to drop the lower limit and allocate a memory region which * starts at the beginning of the available memory. 
*/ static int alloc_try_nid_bottom_up_narrow_range_check(void) @@ -1019,7 +1019,7 @@ static int alloc_try_nid_bottom_up_reserved_no_space_check(void) /* * A test that tries to allocate a memory region, where max_addr is * bigger than the end address of the available memory. Expect to allocate - * a cleared region that starts at the min_addr + * a region that starts at the min_addr. */ static int alloc_try_nid_bottom_up_cap_max_check(void) { @@ -1058,7 +1058,7 @@ static int alloc_try_nid_bottom_up_cap_max_check(void) /* * A test that tries to allocate a memory region, where min_addr is * smaller than the start address of the available memory. Expect to allocate - * a cleared region at the beginning of the available memory. + * a region at the beginning of the available memory. */ static int alloc_try_nid_bottom_up_cap_min_check(void) { -- GitLab From 42c3ba86581896be8dd7fb88ed075b600fd57fa1 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 3 Sep 2022 23:24:06 -0500 Subject: [PATCH 0144/2223] memblock_tests: move variable declarations to single block Move variable declarations to a single block at the beginning of each testing function. 
Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/e61431e73977f305fdd027bca99d1dc119e96d84.1662264355.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_api.c | 57 ++++--------- .../memblock/tests/alloc_helpers_api.c | 32 ++------ tools/testing/memblock/tests/alloc_nid_api.c | 80 +++++-------------- 3 files changed, 42 insertions(+), 127 deletions(-) diff --git a/tools/testing/memblock/tests/alloc_api.c b/tools/testing/memblock/tests/alloc_api.c index 36dd7e254cce9..68f1a75cd72c4 100644 --- a/tools/testing/memblock/tests/alloc_api.c +++ b/tools/testing/memblock/tests/alloc_api.c @@ -25,12 +25,10 @@ static int alloc_top_down_simple_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_2; phys_addr_t expected_start; + PREFIX_PUSH(); setup_memblock(); expected_start = memblock_end_of_DRAM() - SMP_CACHE_BYTES; @@ -76,15 +74,13 @@ static int alloc_top_down_disjoint_check(void) struct memblock_region *rgn2 = &memblock.reserved.regions[0]; struct region r1; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t r2_size = SZ_16; /* Use custom alignment */ phys_addr_t alignment = SMP_CACHE_BYTES * 2; phys_addr_t total_size; phys_addr_t expected_start; + PREFIX_PUSH(); setup_memblock(); r1.base = memblock_end_of_DRAM() - SZ_2; @@ -128,9 +124,6 @@ static int alloc_top_down_before_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - /* * The first region ends at the aligned address to test region merging */ @@ -138,6 +131,7 @@ static int alloc_top_down_before_check(void) phys_addr_t r2_size = SZ_512; phys_addr_t total_size = r1_size + r2_size; + PREFIX_PUSH(); setup_memblock(); memblock_reserve(memblock_end_of_DRAM() - total_size, r1_size); @@ -174,12 +168,10 @@ static int alloc_top_down_after_check(void) struct memblock_region *rgn = 
&memblock.reserved.regions[0]; struct region r1; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t r2_size = SZ_512; phys_addr_t total_size; + PREFIX_PUSH(); setup_memblock(); /* @@ -225,12 +217,10 @@ static int alloc_top_down_second_fit_check(void) struct memblock_region *rgn = &memblock.reserved.regions[0]; struct region r1, r2; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t r3_size = SZ_1K; phys_addr_t total_size; + PREFIX_PUSH(); setup_memblock(); r1.base = memblock_end_of_DRAM() - SZ_512; @@ -276,9 +266,6 @@ static int alloc_in_between_generic_check(void) struct memblock_region *rgn = &memblock.reserved.regions[0]; struct region r1, r2; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t gap_size = SMP_CACHE_BYTES; phys_addr_t r3_size = SZ_64; /* @@ -287,6 +274,7 @@ static int alloc_in_between_generic_check(void) phys_addr_t rgn_size = (MEM_SIZE - (2 * gap_size + r3_size)) / 2; phys_addr_t total_size; + PREFIX_PUSH(); setup_memblock(); r1.size = rgn_size; @@ -332,13 +320,11 @@ static int alloc_in_between_generic_check(void) static int alloc_small_gaps_generic_check(void) { void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t region_size = SZ_1K; phys_addr_t gap_size = SZ_256; phys_addr_t region_end; + PREFIX_PUSH(); setup_memblock(); region_end = memblock_start_of_DRAM(); @@ -366,7 +352,6 @@ static int alloc_all_reserved_generic_check(void) void *allocated_ptr = NULL; PREFIX_PUSH(); - setup_memblock(); /* Simulate full memory */ @@ -397,14 +382,12 @@ static int alloc_all_reserved_generic_check(void) static int alloc_no_space_generic_check(void) { void *allocated_ptr = NULL; + phys_addr_t available_size = SZ_256; + phys_addr_t reserved_size = MEM_SIZE - available_size; PREFIX_PUSH(); - setup_memblock(); - phys_addr_t available_size = SZ_256; - phys_addr_t reserved_size = MEM_SIZE - available_size; - /* Simulate almost-full memory */ memblock_reserve(memblock_start_of_DRAM(), reserved_size); @@ -432,12 +415,10 @@ static 
int alloc_limited_space_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t available_size = SZ_256; phys_addr_t reserved_size = MEM_SIZE - available_size; + PREFIX_PUSH(); setup_memblock(); /* Simulate almost-full memory */ @@ -504,7 +485,6 @@ static int alloc_too_large_generic_check(void) void *allocated_ptr = NULL; PREFIX_PUSH(); - setup_memblock(); allocated_ptr = run_memblock_alloc(MEM_SIZE + SZ_2, SMP_CACHE_BYTES); @@ -530,7 +510,6 @@ static int alloc_bottom_up_simple_check(void) void *allocated_ptr = NULL; PREFIX_PUSH(); - setup_memblock(); allocated_ptr = run_memblock_alloc(SZ_2, SMP_CACHE_BYTES); @@ -572,15 +551,13 @@ static int alloc_bottom_up_disjoint_check(void) struct memblock_region *rgn2 = &memblock.reserved.regions[1]; struct region r1; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t r2_size = SZ_16; /* Use custom alignment */ phys_addr_t alignment = SMP_CACHE_BYTES * 2; phys_addr_t total_size; phys_addr_t expected_start; + PREFIX_PUSH(); setup_memblock(); r1.base = memblock_start_of_DRAM() + SZ_2; @@ -624,13 +601,11 @@ static int alloc_bottom_up_before_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t r1_size = SZ_512; phys_addr_t r2_size = SZ_128; phys_addr_t total_size = r1_size + r2_size; + PREFIX_PUSH(); setup_memblock(); memblock_reserve(memblock_start_of_DRAM() + r1_size, r2_size); @@ -666,12 +641,10 @@ static int alloc_bottom_up_after_check(void) struct memblock_region *rgn = &memblock.reserved.regions[0]; struct region r1; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t r2_size = SZ_512; phys_addr_t total_size; + PREFIX_PUSH(); setup_memblock(); /* @@ -718,12 +691,10 @@ static int alloc_bottom_up_second_fit_check(void) struct memblock_region *rgn = &memblock.reserved.regions[1]; struct region r1, r2; void *allocated_ptr = NULL; - - 
PREFIX_PUSH(); - phys_addr_t r3_size = SZ_1K; phys_addr_t total_size; + PREFIX_PUSH(); setup_memblock(); r1.base = memblock_start_of_DRAM(); diff --git a/tools/testing/memblock/tests/alloc_helpers_api.c b/tools/testing/memblock/tests/alloc_helpers_api.c index 06577bd0e349b..3ef9486da8a09 100644 --- a/tools/testing/memblock/tests/alloc_helpers_api.c +++ b/tools/testing/memblock/tests/alloc_helpers_api.c @@ -19,12 +19,10 @@ static int alloc_from_simple_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_16; phys_addr_t min_addr; + PREFIX_PUSH(); setup_memblock(); min_addr = memblock_end_of_DRAM() - SMP_CACHE_BYTES; @@ -64,12 +62,10 @@ static int alloc_from_misaligned_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_32; phys_addr_t min_addr; + PREFIX_PUSH(); setup_memblock(); /* A misaligned address */ @@ -113,12 +109,10 @@ static int alloc_from_top_down_high_addr_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_32; phys_addr_t min_addr; + PREFIX_PUSH(); setup_memblock(); /* The address is too close to the end of the memory */ @@ -158,14 +152,12 @@ static int alloc_from_top_down_no_space_above_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t r1_size = SZ_64; phys_addr_t r2_size = SZ_2; phys_addr_t total_size = r1_size + r2_size; phys_addr_t min_addr; + PREFIX_PUSH(); setup_memblock(); min_addr = memblock_end_of_DRAM() - SMP_CACHE_BYTES * 2; @@ -197,13 +189,11 @@ static int alloc_from_top_down_min_addr_cap_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t r1_size = SZ_64; phys_addr_t min_addr; 
phys_addr_t start_addr; + PREFIX_PUSH(); setup_memblock(); start_addr = (phys_addr_t)memblock_start_of_DRAM(); @@ -245,12 +235,10 @@ static int alloc_from_bottom_up_high_addr_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_32; phys_addr_t min_addr; + PREFIX_PUSH(); setup_memblock(); /* The address is too close to the end of the memory */ @@ -289,13 +277,11 @@ static int alloc_from_bottom_up_no_space_above_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t r1_size = SZ_64; phys_addr_t min_addr; phys_addr_t r2_size; + PREFIX_PUSH(); setup_memblock(); min_addr = memblock_start_of_DRAM() + SZ_128; @@ -327,13 +313,11 @@ static int alloc_from_bottom_up_min_addr_cap_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t r1_size = SZ_64; phys_addr_t min_addr; phys_addr_t start_addr; + PREFIX_PUSH(); setup_memblock(); start_addr = (phys_addr_t)memblock_start_of_DRAM(); diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c index d89741dd3d464..db5daa50fa72e 100644 --- a/tools/testing/memblock/tests/alloc_nid_api.c +++ b/tools/testing/memblock/tests/alloc_nid_api.c @@ -39,14 +39,12 @@ static int alloc_try_nid_top_down_simple_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_128; phys_addr_t min_addr; phys_addr_t max_addr; phys_addr_t rgn_end; + PREFIX_PUSH(); setup_memblock(); min_addr = memblock_start_of_DRAM() + SMP_CACHE_BYTES * 2; @@ -93,15 +91,13 @@ static int alloc_try_nid_top_down_end_misaligned_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_128; phys_addr_t misalign = 
SZ_2; phys_addr_t min_addr; phys_addr_t max_addr; phys_addr_t rgn_end; + PREFIX_PUSH(); setup_memblock(); min_addr = memblock_start_of_DRAM() + SMP_CACHE_BYTES * 2; @@ -146,14 +142,12 @@ static int alloc_try_nid_exact_address_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_1K; phys_addr_t min_addr; phys_addr_t max_addr; phys_addr_t rgn_end; + PREFIX_PUSH(); setup_memblock(); min_addr = memblock_start_of_DRAM() + SMP_CACHE_BYTES; @@ -200,13 +194,11 @@ static int alloc_try_nid_top_down_narrow_range_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_256; phys_addr_t min_addr; phys_addr_t max_addr; + PREFIX_PUSH(); setup_memblock(); min_addr = memblock_start_of_DRAM() + SZ_512; @@ -253,13 +245,11 @@ static int alloc_try_nid_top_down_narrow_range_check(void) static int alloc_try_nid_low_max_generic_check(void) { void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_1K; phys_addr_t min_addr; phys_addr_t max_addr; + PREFIX_PUSH(); setup_memblock(); min_addr = memblock_start_of_DRAM(); @@ -294,9 +284,6 @@ static int alloc_try_nid_min_reserved_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t r1_size = SZ_128; phys_addr_t r2_size = SZ_64; phys_addr_t total_size = r1_size + r2_size; @@ -304,6 +291,7 @@ static int alloc_try_nid_min_reserved_generic_check(void) phys_addr_t max_addr; phys_addr_t reserved_base; + PREFIX_PUSH(); setup_memblock(); max_addr = memblock_end_of_DRAM(); @@ -348,15 +336,13 @@ static int alloc_try_nid_max_reserved_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t r1_size = SZ_64; phys_addr_t r2_size = SZ_128; phys_addr_t total_size = r1_size + r2_size; 
phys_addr_t min_addr; phys_addr_t max_addr; + PREFIX_PUSH(); setup_memblock(); max_addr = memblock_end_of_DRAM() - r1_size; @@ -405,15 +391,13 @@ static int alloc_try_nid_top_down_reserved_with_space_check(void) struct memblock_region *rgn2 = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; struct region r1, r2; - - PREFIX_PUSH(); - phys_addr_t r3_size = SZ_64; phys_addr_t gap_size = SMP_CACHE_BYTES; phys_addr_t total_size; phys_addr_t max_addr; phys_addr_t min_addr; + PREFIX_PUSH(); setup_memblock(); r1.base = memblock_end_of_DRAM() - SMP_CACHE_BYTES * 2; @@ -471,14 +455,12 @@ static int alloc_try_nid_reserved_full_merge_generic_check(void) struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; struct region r1, r2; - - PREFIX_PUSH(); - phys_addr_t r3_size = SZ_64; phys_addr_t total_size; phys_addr_t max_addr; phys_addr_t min_addr; + PREFIX_PUSH(); setup_memblock(); r1.base = memblock_end_of_DRAM() - SMP_CACHE_BYTES * 2; @@ -536,15 +518,13 @@ static int alloc_try_nid_top_down_reserved_no_space_check(void) struct memblock_region *rgn2 = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; struct region r1, r2; - - PREFIX_PUSH(); - phys_addr_t r3_size = SZ_256; phys_addr_t gap_size = SMP_CACHE_BYTES; phys_addr_t total_size; phys_addr_t max_addr; phys_addr_t min_addr; + PREFIX_PUSH(); setup_memblock(); r1.base = memblock_end_of_DRAM() - SMP_CACHE_BYTES * 2; @@ -605,14 +585,12 @@ static int alloc_try_nid_reserved_all_generic_check(void) { void *allocated_ptr = NULL; struct region r1, r2; - - PREFIX_PUSH(); - phys_addr_t r3_size = SZ_256; phys_addr_t gap_size = SMP_CACHE_BYTES; phys_addr_t max_addr; phys_addr_t min_addr; + PREFIX_PUSH(); setup_memblock(); r1.base = memblock_end_of_DRAM() - SMP_CACHE_BYTES; @@ -647,13 +625,11 @@ static int alloc_try_nid_top_down_cap_max_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_256; 
phys_addr_t min_addr; phys_addr_t max_addr; + PREFIX_PUSH(); setup_memblock(); min_addr = memblock_end_of_DRAM() - SZ_1K; @@ -686,13 +662,11 @@ static int alloc_try_nid_top_down_cap_min_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_1K; phys_addr_t min_addr; phys_addr_t max_addr; + PREFIX_PUSH(); setup_memblock(); min_addr = memblock_start_of_DRAM() - SZ_256; @@ -734,14 +708,12 @@ static int alloc_try_nid_bottom_up_simple_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_128; phys_addr_t min_addr; phys_addr_t max_addr; phys_addr_t rgn_end; + PREFIX_PUSH(); setup_memblock(); min_addr = memblock_start_of_DRAM() + SMP_CACHE_BYTES * 2; @@ -788,15 +760,13 @@ static int alloc_try_nid_bottom_up_start_misaligned_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_128; phys_addr_t misalign = SZ_2; phys_addr_t min_addr; phys_addr_t max_addr; phys_addr_t rgn_end; + PREFIX_PUSH(); setup_memblock(); min_addr = memblock_start_of_DRAM() + misalign; @@ -843,13 +813,11 @@ static int alloc_try_nid_bottom_up_narrow_range_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_256; phys_addr_t min_addr; phys_addr_t max_addr; + PREFIX_PUSH(); setup_memblock(); min_addr = memblock_start_of_DRAM() + SZ_512; @@ -896,15 +864,13 @@ static int alloc_try_nid_bottom_up_reserved_with_space_check(void) struct memblock_region *rgn2 = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; struct region r1, r2; - - PREFIX_PUSH(); - phys_addr_t r3_size = SZ_64; phys_addr_t gap_size = SMP_CACHE_BYTES; phys_addr_t total_size; phys_addr_t max_addr; phys_addr_t min_addr; + PREFIX_PUSH(); setup_memblock(); r1.base = 
memblock_end_of_DRAM() - SMP_CACHE_BYTES * 2; @@ -968,15 +934,13 @@ static int alloc_try_nid_bottom_up_reserved_no_space_check(void) struct memblock_region *rgn3 = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; struct region r1, r2; - - PREFIX_PUSH(); - phys_addr_t r3_size = SZ_256; phys_addr_t gap_size = SMP_CACHE_BYTES; phys_addr_t total_size; phys_addr_t max_addr; phys_addr_t min_addr; + PREFIX_PUSH(); setup_memblock(); r1.base = memblock_end_of_DRAM() - SMP_CACHE_BYTES * 2; @@ -1025,13 +989,11 @@ static int alloc_try_nid_bottom_up_cap_max_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_256; phys_addr_t min_addr; phys_addr_t max_addr; + PREFIX_PUSH(); setup_memblock(); min_addr = memblock_start_of_DRAM() + SZ_1K; @@ -1064,13 +1026,11 @@ static int alloc_try_nid_bottom_up_cap_min_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - - PREFIX_PUSH(); - phys_addr_t size = SZ_1K; phys_addr_t min_addr; phys_addr_t max_addr; + PREFIX_PUSH(); setup_memblock(); min_addr = memblock_start_of_DRAM(); -- GitLab From ba5829c6543fbcf0b31854cc6970b9012ff71279 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Sun, 4 Sep 2022 11:43:14 -0500 Subject: [PATCH 0145/2223] ipmi:ipmb: Fix a vague comment and a typo Sending an IPMI response message gets a response to the response, but the comment saying that just said "response response", which is hard to understand. Also fix an obvious typo.
Reported-by: Shaomin Deng Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_ipmb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/char/ipmi/ipmi_ipmb.c b/drivers/char/ipmi/ipmi_ipmb.c index ab19b4b3317ec..1019946abe4e8 100644 --- a/drivers/char/ipmi/ipmi_ipmb.c +++ b/drivers/char/ipmi/ipmi_ipmb.c @@ -218,8 +218,8 @@ static void ipmi_ipmb_send_response(struct ipmi_ipmb_dev *iidev, { if ((msg->data[0] >> 2) & 1) { /* - * It's a response being sent, we needto return a - * response response. Fake a send msg command + * It's a response being sent, we need to return a + * response to the response. Fake a send msg command * response with channel 0. This will always be ipmb * direct. */ -- GitLab From 4a13796aeb84c94e1883d4f93904a94284f8e5ea Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Mon, 5 Sep 2022 15:13:00 +0800 Subject: [PATCH 0146/2223] pinctrl: berlin: fix spelling typo in comment Fix spelling typo in comment. Reported-by: k2ci Signed-off-by: Jiangshan Yi Link: https://lore.kernel.org/r/20220905071300.1832105-1-13667453960@163.com Signed-off-by: Linus Walleij --- drivers/pinctrl/berlin/berlin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/berlin/berlin.c b/drivers/pinctrl/berlin/berlin.c index a073eedd71aa5..1e427ea4d31bc 100644 --- a/drivers/pinctrl/berlin/berlin.c +++ b/drivers/pinctrl/berlin/berlin.c @@ -209,7 +209,7 @@ static int berlin_pinctrl_build_state(struct platform_device *pdev) for (i = 0; i < pctrl->desc->ngroups; i++) { desc_group = pctrl->desc->groups + i; - /* compute the maxiumum number of functions a group can have */ + /* compute the maximum number of functions a group can have */ max_functions += 1 << (desc_group->bit_width + 1); } -- GitLab From 94b22e125175e0c57d044c18d122ad5991348ca3 Mon Sep 17 00:00:00 2001 From: Francesco Dolcini Date: Mon, 5 Sep 2022 20:15:17 -0700 Subject: [PATCH 0147/2223] dt-bindings: input: touchscreen: stmpe: Remove node name requirement 
STMPE driver does not require a specific node name anymore, only the compatible is checked, update binding according to this. Signed-off-by: Francesco Dolcini Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220712163345.445811-6-francesco.dolcini@toradex.com Signed-off-by: Dmitry Torokhov --- Documentation/devicetree/bindings/input/touchscreen/stmpe.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/input/touchscreen/stmpe.txt b/Documentation/devicetree/bindings/input/touchscreen/stmpe.txt index c549924603d26..238b51555c047 100644 --- a/Documentation/devicetree/bindings/input/touchscreen/stmpe.txt +++ b/Documentation/devicetree/bindings/input/touchscreen/stmpe.txt @@ -54,8 +54,7 @@ Optional properties common with MFD (deprecated): 1 -> 3.25 MHz 2 || 3 -> 6.5 MHz -Node name must be stmpe_touchscreen and should be child node of stmpe node to -which it belongs. +Node should be child node of stmpe node to which it belongs. Note that common ADC settings of stmpe_touchscreen (child) will take precedence over the settings done in MFD. 
-- GitLab From b382c5e37344883dc97525d05f1f6b788f549985 Mon Sep 17 00:00:00 2001 From: Pavel Rojtberg Date: Thu, 18 Aug 2022 17:44:08 +0200 Subject: [PATCH 0148/2223] Input: xpad - add supported devices as contributed on github This is based on multiple commits at https://github.com/paroj/xpad Cc: stable@vger.kernel.org Signed-off-by: Jasper Poppe Signed-off-by: Jeremy Palmer Signed-off-by: Ruineka Signed-off-by: Cleber de Mattos Casali Signed-off-by: Kyle Gospodnetich Signed-off-by: Pavel Rojtberg Link: https://lore.kernel.org/r/20220818154411.510308-2-rojtberg@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 18190b529bca3..5af07ded98fc2 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -113,6 +113,8 @@ static const struct xpad_device { u8 xtype; } xpad_device[] = { { 0x0079, 0x18d4, "GPD Win 2 X-Box Controller", 0, XTYPE_XBOX360 }, + { 0x03eb, 0xff01, "Wooting One (Legacy)", 0, XTYPE_XBOX360 }, + { 0x03eb, 0xff02, "Wooting Two (Legacy)", 0, XTYPE_XBOX360 }, { 0x044f, 0x0f00, "Thrustmaster Wheel", 0, XTYPE_XBOX }, { 0x044f, 0x0f03, "Thrustmaster Wheel", 0, XTYPE_XBOX }, { 0x044f, 0x0f07, "Thrustmaster, Inc. 
Controller", 0, XTYPE_XBOX }, @@ -244,6 +246,7 @@ static const struct xpad_device { { 0x0f0d, 0x0063, "Hori Real Arcade Pro Hayabusa (USA) Xbox One", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOXONE }, { 0x0f0d, 0x0067, "HORIPAD ONE", 0, XTYPE_XBOXONE }, { 0x0f0d, 0x0078, "Hori Real Arcade Pro V Kai Xbox One", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOXONE }, + { 0x0f0d, 0x00c5, "Hori Fighting Commander ONE", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOXONE }, { 0x0f30, 0x010b, "Philips Recoil", 0, XTYPE_XBOX }, { 0x0f30, 0x0202, "Joytech Advanced Controller", 0, XTYPE_XBOX }, { 0x0f30, 0x8888, "BigBen XBMiniPad Controller", 0, XTYPE_XBOX }, @@ -260,6 +263,7 @@ static const struct xpad_device { { 0x1430, 0x8888, "TX6500+ Dance Pad (first generation)", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX }, { 0x1430, 0xf801, "RedOctane Controller", 0, XTYPE_XBOX360 }, { 0x146b, 0x0601, "BigBen Interactive XBOX 360 Controller", 0, XTYPE_XBOX360 }, + { 0x146b, 0x0604, "Bigben Interactive DAIJA Arcade Stick", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOX360 }, { 0x1532, 0x0037, "Razer Sabertooth", 0, XTYPE_XBOX360 }, { 0x1532, 0x0a00, "Razer Atrox Arcade Stick", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOXONE }, { 0x1532, 0x0a03, "Razer Wildcat", 0, XTYPE_XBOXONE }, @@ -325,6 +329,7 @@ static const struct xpad_device { { 0x24c6, 0x5502, "Hori Fighting Stick VX Alt", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOX360 }, { 0x24c6, 0x5503, "Hori Fighting Edge", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOX360 }, { 0x24c6, 0x5506, "Hori SOULCALIBUR V Stick", 0, XTYPE_XBOX360 }, + { 0x24c6, 0x5510, "Hori Fighting Commander ONE (Xbox 360/PC Mode)", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOX360 }, { 0x24c6, 0x550d, "Hori GEM Xbox controller", 0, XTYPE_XBOX360 }, { 0x24c6, 0x550e, "Hori Real Arcade Pro V Kai 360", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOX360 }, { 0x24c6, 0x551a, "PowerA FUSION Pro Controller", 0, XTYPE_XBOXONE }, @@ -334,6 +339,14 @@ static const struct xpad_device { { 0x24c6, 0x5b03, "Thrustmaster Ferrari 458 Racing Wheel", 0, XTYPE_XBOX360 }, { 0x24c6, 
0x5d04, "Razer Sabertooth", 0, XTYPE_XBOX360 }, { 0x24c6, 0xfafe, "Rock Candy Gamepad for Xbox 360", 0, XTYPE_XBOX360 }, + { 0x2563, 0x058d, "OneXPlayer Gamepad", 0, XTYPE_XBOX360 }, + { 0x2dc8, 0x2000, "8BitDo Pro 2 Wired Controller fox Xbox", 0, XTYPE_XBOXONE }, + { 0x31e3, 0x1100, "Wooting One", 0, XTYPE_XBOX360 }, + { 0x31e3, 0x1200, "Wooting Two", 0, XTYPE_XBOX360 }, + { 0x31e3, 0x1210, "Wooting Lekker", 0, XTYPE_XBOX360 }, + { 0x31e3, 0x1220, "Wooting Two HE", 0, XTYPE_XBOX360 }, + { 0x31e3, 0x1300, "Wooting 60HE (AVR)", 0, XTYPE_XBOX360 }, + { 0x31e3, 0x1310, "Wooting 60HE (ARM)", 0, XTYPE_XBOX360 }, { 0x3285, 0x0607, "Nacon GC-100", 0, XTYPE_XBOX360 }, { 0x3767, 0x0101, "Fanatec Speedster 3 Forceshock Wheel", 0, XTYPE_XBOX }, { 0xffff, 0xffff, "Chinese-made Xbox Controller", 0, XTYPE_XBOX }, @@ -419,6 +432,7 @@ static const signed short xpad_abs_triggers[] = { static const struct usb_device_id xpad_table[] = { { USB_INTERFACE_INFO('X', 'B', 0) }, /* X-Box USB-IF not approved class */ XPAD_XBOX360_VENDOR(0x0079), /* GPD Win 2 Controller */ + XPAD_XBOX360_VENDOR(0x03eb), /* Wooting Keyboards (Legacy) */ XPAD_XBOX360_VENDOR(0x044f), /* Thrustmaster X-Box 360 controllers */ XPAD_XBOX360_VENDOR(0x045e), /* Microsoft X-Box 360 controllers */ XPAD_XBOXONE_VENDOR(0x045e), /* Microsoft X-Box One controllers */ @@ -429,6 +443,7 @@ static const struct usb_device_id xpad_table[] = { { USB_DEVICE(0x0738, 0x4540) }, /* Mad Catz Beat Pad */ XPAD_XBOXONE_VENDOR(0x0738), /* Mad Catz FightStick TE 2 */ XPAD_XBOX360_VENDOR(0x07ff), /* Mad Catz GamePad */ + XPAD_XBOX360_VENDOR(0x0c12), /* Zeroplus X-Box 360 controllers */ XPAD_XBOX360_VENDOR(0x0e6f), /* 0x0e6f X-Box 360 controllers */ XPAD_XBOXONE_VENDOR(0x0e6f), /* 0x0e6f X-Box One controllers */ XPAD_XBOX360_VENDOR(0x0f0d), /* Hori Controllers */ @@ -450,8 +465,12 @@ static const struct usb_device_id xpad_table[] = { XPAD_XBOXONE_VENDOR(0x20d6), /* PowerA Controllers */ XPAD_XBOX360_VENDOR(0x24c6), /* PowerA Controllers */ 
XPAD_XBOXONE_VENDOR(0x24c6), /* PowerA Controllers */ + XPAD_XBOX360_VENDOR(0x2563), /* OneXPlayer Gamepad */ + XPAD_XBOX360_VENDOR(0x260d), /* Dareu H101 */ + XPAD_XBOXONE_VENDOR(0x2dc8), /* 8BitDo Pro 2 Wired Controller for Xbox */ XPAD_XBOXONE_VENDOR(0x2e24), /* Hyperkin Duke X-Box One pad */ XPAD_XBOX360_VENDOR(0x2f24), /* GameSir Controllers */ + XPAD_XBOX360_VENDOR(0x31e3), /* Wooting Keyboards */ XPAD_XBOX360_VENDOR(0x3285), /* Nacon GC-100 */ { } }; -- GitLab From a17b9841152e7f4621619902b347e2cc39c32996 Mon Sep 17 00:00:00 2001 From: Cameron Gutman Date: Thu, 18 Aug 2022 17:44:09 +0200 Subject: [PATCH 0149/2223] Input: xpad - fix wireless 360 controller breaking after suspend Suspending and resuming the system can sometimes cause the out URB to get hung after a reset_resume. This causes LED setting and force feedback to break on resume. To avoid this, just drop the reset_resume callback so the USB core rebinds xpad to the wireless pads on resume if a reset happened. A nice side effect of this change is the LED ring on wireless controllers is now set correctly on system resume. 
Cc: stable@vger.kernel.org Fixes: 4220f7db1e42 ("Input: xpad - workaround dead irq_out after suspend/resume") Signed-off-by: Cameron Gutman Signed-off-by: Pavel Rojtberg Link: https://lore.kernel.org/r/20220818154411.510308-3-rojtberg@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 5af07ded98fc2..3da5fd5b5aaf4 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -1991,7 +1991,6 @@ static struct usb_driver xpad_driver = { .disconnect = xpad_disconnect, .suspend = xpad_suspend, .resume = xpad_resume, - .reset_resume = xpad_resume, .id_table = xpad_table, }; -- GitLab From da7e2128b869a1315b8919ded42b799076279cda Mon Sep 17 00:00:00 2001 From: Santosh De Massari Date: Thu, 18 Aug 2022 17:44:10 +0200 Subject: [PATCH 0150/2223] Input: xpad - Poweroff XBOX360W on mode button long press Newer gamepads turn themselves off when the mode button is held down. For XBOX360W gamepads we must do this in the driver. Do not use BIT() macro for consistency within the file.
Signed-off-by: Santosh De Massari Signed-off-by: Pavel Rojtberg Link: https://lore.kernel.org/r/20220818154411.510308-4-rojtberg@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 3da5fd5b5aaf4..d770221709c08 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -89,6 +89,11 @@ #define XTYPE_XBOXONE 3 #define XTYPE_UNKNOWN 4 +/* Send power-off packet to xpad360w after holding the mode button for this many + * seconds + */ +#define XPAD360W_POWEROFF_TIMEOUT 5 + static bool dpad_to_buttons; module_param(dpad_to_buttons, bool, S_IRUGO); MODULE_PARM_DESC(dpad_to_buttons, "Map D-PAD to buttons rather than axes for unknown pads"); @@ -630,11 +635,13 @@ struct usb_xpad { int pad_nr; /* the order x360 pads were attached */ const char *name; /* name of the device */ struct work_struct work; /* init/remove device from callback */ + time64_t mode_btn_down_ts; }; static int xpad_init_input(struct usb_xpad *xpad); static void xpad_deinit_input(struct usb_xpad *xpad); static void xpadone_ack_mode_report(struct usb_xpad *xpad, u8 seq_num); +static void xpad360w_poweroff_controller(struct usb_xpad *xpad); /* * xpad_process_packet @@ -786,6 +793,23 @@ static void xpad360_process_packet(struct usb_xpad *xpad, struct input_dev *dev, } input_sync(dev); + + /* XBOX360W controllers can't be turned off without driver assistance */ + if (xpad->xtype == XTYPE_XBOX360W) { + if (xpad->mode_btn_down_ts > 0 && xpad->pad_present && + ((ktime_get_seconds() - xpad->mode_btn_down_ts) >= + XPAD360W_POWEROFF_TIMEOUT)) { + xpad360w_poweroff_controller(xpad); + xpad->mode_btn_down_ts = 0; + return; + } + + /* mode button down/up */ + if (data[3] & 0x04) + xpad->mode_btn_down_ts = ktime_get_seconds(); + else + xpad->mode_btn_down_ts = 0; + } } static void xpad_presence_work(struct work_struct *work) -- GitLab 
From e23c69e3324892f7420686b3aaa0403df6cf152c Mon Sep 17 00:00:00 2001 From: Christopher Crockett Date: Thu, 18 Aug 2022 17:44:11 +0200 Subject: [PATCH 0151/2223] Input: xpad - add support for XBOX One Elite paddles An effort has been made to support every official model and firmware version I could track down info on. The following controllers _should_ have working paddles with this PR: - Xbox Elite (**untested**) - Xbox Elite Series 2 on early firmwares (**untested**) - Xbox Elite Series 2 on v4 firmwares (Tested v4.8.1908.0) - Xbox Elite Series 2 on v5 pre-BLE firmwares (**untested**) - Xbox Elite Series 2 on v5 post-BLE firmwares (Tested v5.13.3143.0) This patch also introduces correct handling for the Elite 1 controller and properly suppresses paddle inputs when using a custom profile slot. Starting with firmware v5.11, certain inputs for the Elite 2 were moved to an extra packet that is not enabled by default. We must first manually enable this extra packet in order to correctly process paddle input data with these later firmwares. 
Signed-off-by: Christopher Crockett Signed-off-by: Pavel Rojtberg Tested-by: Bastien Nocera Link: https://lore.kernel.org/r/20220818154411.510308-5-rojtberg@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 254 ++++++++++++++++++++++++++-------- 1 file changed, 193 insertions(+), 61 deletions(-) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index d770221709c08..fceb0d342945b 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -80,6 +80,7 @@ #define MAP_TRIGGERS_TO_BUTTONS (1 << 1) #define MAP_STICKS_TO_NULL (1 << 2) #define MAP_SELECT_BUTTON (1 << 3) +#define MAP_PADDLES (1 << 4) #define DANCEPAD_MAP_CONFIG (MAP_DPAD_TO_BUTTONS | \ MAP_TRIGGERS_TO_BUTTONS | MAP_STICKS_TO_NULL) @@ -94,6 +95,12 @@ */ #define XPAD360W_POWEROFF_TIMEOUT 5 +#define PKT_XB 0 +#define PKT_XBE1 1 +#define PKT_XBE2_FW_OLD 2 +#define PKT_XBE2_FW_5_EARLY 3 +#define PKT_XBE2_FW_5_11 4 + static bool dpad_to_buttons; module_param(dpad_to_buttons, bool, S_IRUGO); MODULE_PARM_DESC(dpad_to_buttons, "Map D-PAD to buttons rather than axes for unknown pads"); @@ -116,6 +123,7 @@ static const struct xpad_device { char *name; u8 mapping; u8 xtype; + u8 packet_type; } xpad_device[] = { { 0x0079, 0x18d4, "GPD Win 2 X-Box Controller", 0, XTYPE_XBOX360 }, { 0x03eb, 0xff01, "Wooting One (Legacy)", 0, XTYPE_XBOX360 }, @@ -135,7 +143,8 @@ static const struct xpad_device { { 0x045e, 0x0291, "Xbox 360 Wireless Receiver (XBOX)", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360W }, { 0x045e, 0x02d1, "Microsoft X-Box One pad", 0, XTYPE_XBOXONE }, { 0x045e, 0x02dd, "Microsoft X-Box One pad (Firmware 2015)", 0, XTYPE_XBOXONE }, - { 0x045e, 0x02e3, "Microsoft X-Box One Elite pad", 0, XTYPE_XBOXONE }, + { 0x045e, 0x02e3, "Microsoft X-Box One Elite pad", MAP_PADDLES, XTYPE_XBOXONE }, + { 0x045e, 0x0b00, "Microsoft X-Box One Elite 2 pad", MAP_PADDLES, XTYPE_XBOXONE }, { 0x045e, 0x02ea, "Microsoft X-Box One S pad", 0, XTYPE_XBOXONE }, { 0x045e, 0x0719, 
"Xbox 360 Wireless Receiver", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360W }, { 0x045e, 0x0b12, "Microsoft Xbox Series S|X Controller", MAP_SELECT_BUTTON, XTYPE_XBOXONE }, @@ -408,6 +417,13 @@ static const signed short xpad_abs_triggers[] = { -1 }; +/* used when the controller has extra paddle buttons */ +static const signed short xpad_btn_paddles[] = { + BTN_TRIGGER_HAPPY5, BTN_TRIGGER_HAPPY6, /* paddle upper right, lower right */ + BTN_TRIGGER_HAPPY7, BTN_TRIGGER_HAPPY8, /* paddle upper left, lower left */ + -1 /* terminating entry */ +}; + /* * Xbox 360 has a vendor-specific class, so we cannot match it with only * USB_INTERFACE_INFO (also specifically refused by USB subsystem), so we @@ -516,6 +532,15 @@ static const u8 xboxone_s_init[] = { 0x05, 0x20, 0x00, 0x0f, 0x06 }; +/* + * This packet is required to get additional input data + * from Xbox One Elite Series 2 (0x045e:0x0b00) pads. + * We mostly do this right now to get paddle data + */ +static const u8 extra_input_packet_init[] = { + 0x4d, 0x10, 0x01, 0x02, 0x07, 0x00 +}; + /* * This packet is required for the Titanfall 2 Xbox One pads * (0x0e6f:0x0165) to finish initialization and for Hori pads @@ -576,6 +601,7 @@ static const struct xboxone_init_packet xboxone_init_packets[] = { XBOXONE_INIT_PKT(0x0000, 0x0000, xboxone_fw2015_init), XBOXONE_INIT_PKT(0x045e, 0x02ea, xboxone_s_init), XBOXONE_INIT_PKT(0x045e, 0x0b00, xboxone_s_init), + XBOXONE_INIT_PKT(0x045e, 0x0b00, extra_input_packet_init), XBOXONE_INIT_PKT(0x0e6f, 0x0000, xboxone_pdp_init1), XBOXONE_INIT_PKT(0x0e6f, 0x0000, xboxone_pdp_init2), XBOXONE_INIT_PKT(0x24c6, 0x541a, xboxone_rumblebegin_init), @@ -632,6 +658,7 @@ struct usb_xpad { int mapping; /* map d-pad to buttons or to axes */ int xtype; /* type of xbox device */ + int packet_type; /* type of the extended packet */ int pad_nr; /* the order x360 pads were attached */ const char *name; /* name of the device */ struct work_struct work; /* init/remove device from callback */ @@ -889,6 +916,7 @@ static 
void xpad360w_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned cha static void xpadone_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char *data) { struct input_dev *dev = xpad->dev; + bool do_sync = false; /* the xbox button has its own special report */ if (data[0] == 0X07) { @@ -901,75 +929,140 @@ static void xpadone_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char xpadone_ack_mode_report(xpad, data[2]); input_report_key(dev, BTN_MODE, data[4] & 0x01); - input_sync(dev); - return; - } - /* check invalid packet */ - else if (data[0] != 0X20) - return; - - /* menu/view buttons */ - input_report_key(dev, BTN_START, data[4] & 0x04); - input_report_key(dev, BTN_SELECT, data[4] & 0x08); - if (xpad->mapping & MAP_SELECT_BUTTON) - input_report_key(dev, KEY_RECORD, data[22] & 0x01); - /* buttons A,B,X,Y */ - input_report_key(dev, BTN_A, data[4] & 0x10); - input_report_key(dev, BTN_B, data[4] & 0x20); - input_report_key(dev, BTN_X, data[4] & 0x40); - input_report_key(dev, BTN_Y, data[4] & 0x80); + do_sync = true; + } else if (data[0] == 0X0C) { + /* Some packet formats force us to use this separate to poll paddle inputs */ + if (xpad->packet_type == PKT_XBE2_FW_5_11) { + /* Mute paddles if controller is in a custom profile slot + * Checked by looking at the active profile slot to + * verify it's the default slot + */ + if (data[19] != 0) + data[18] = 0; - /* digital pad */ - if (xpad->mapping & MAP_DPAD_TO_BUTTONS) { - /* dpad as buttons (left, right, up, down) */ - input_report_key(dev, BTN_TRIGGER_HAPPY1, data[5] & 0x04); - input_report_key(dev, BTN_TRIGGER_HAPPY2, data[5] & 0x08); - input_report_key(dev, BTN_TRIGGER_HAPPY3, data[5] & 0x01); - input_report_key(dev, BTN_TRIGGER_HAPPY4, data[5] & 0x02); - } else { - input_report_abs(dev, ABS_HAT0X, - !!(data[5] & 0x08) - !!(data[5] & 0x04)); - input_report_abs(dev, ABS_HAT0Y, - !!(data[5] & 0x02) - !!(data[5] & 0x01)); - } + /* Elite Series 2 split packet paddle bits */ + input_report_key(dev, 
BTN_TRIGGER_HAPPY5, data[18] & 0x01); + input_report_key(dev, BTN_TRIGGER_HAPPY6, data[18] & 0x02); + input_report_key(dev, BTN_TRIGGER_HAPPY7, data[18] & 0x04); + input_report_key(dev, BTN_TRIGGER_HAPPY8, data[18] & 0x08); - /* TL/TR */ - input_report_key(dev, BTN_TL, data[5] & 0x10); - input_report_key(dev, BTN_TR, data[5] & 0x20); + do_sync = true; + } + } else if (data[0] == 0X20) { /* The main valid packet type for inputs */ + /* menu/view buttons */ + input_report_key(dev, BTN_START, data[4] & 0x04); + input_report_key(dev, BTN_SELECT, data[4] & 0x08); + if (xpad->mapping & MAP_SELECT_BUTTON) + input_report_key(dev, KEY_RECORD, data[22] & 0x01); + + /* buttons A,B,X,Y */ + input_report_key(dev, BTN_A, data[4] & 0x10); + input_report_key(dev, BTN_B, data[4] & 0x20); + input_report_key(dev, BTN_X, data[4] & 0x40); + input_report_key(dev, BTN_Y, data[4] & 0x80); + + /* digital pad */ + if (xpad->mapping & MAP_DPAD_TO_BUTTONS) { + /* dpad as buttons (left, right, up, down) */ + input_report_key(dev, BTN_TRIGGER_HAPPY1, data[5] & 0x04); + input_report_key(dev, BTN_TRIGGER_HAPPY2, data[5] & 0x08); + input_report_key(dev, BTN_TRIGGER_HAPPY3, data[5] & 0x01); + input_report_key(dev, BTN_TRIGGER_HAPPY4, data[5] & 0x02); + } else { + input_report_abs(dev, ABS_HAT0X, + !!(data[5] & 0x08) - !!(data[5] & 0x04)); + input_report_abs(dev, ABS_HAT0Y, + !!(data[5] & 0x02) - !!(data[5] & 0x01)); + } - /* stick press left/right */ - input_report_key(dev, BTN_THUMBL, data[5] & 0x40); - input_report_key(dev, BTN_THUMBR, data[5] & 0x80); + /* TL/TR */ + input_report_key(dev, BTN_TL, data[5] & 0x10); + input_report_key(dev, BTN_TR, data[5] & 0x20); + + /* stick press left/right */ + input_report_key(dev, BTN_THUMBL, data[5] & 0x40); + input_report_key(dev, BTN_THUMBR, data[5] & 0x80); + + if (!(xpad->mapping & MAP_STICKS_TO_NULL)) { + /* left stick */ + input_report_abs(dev, ABS_X, + (__s16) le16_to_cpup((__le16 *)(data + 10))); + input_report_abs(dev, ABS_Y, + ~(__s16) 
le16_to_cpup((__le16 *)(data + 12))); + + /* right stick */ + input_report_abs(dev, ABS_RX, + (__s16) le16_to_cpup((__le16 *)(data + 14))); + input_report_abs(dev, ABS_RY, + ~(__s16) le16_to_cpup((__le16 *)(data + 16))); + } - if (!(xpad->mapping & MAP_STICKS_TO_NULL)) { - /* left stick */ - input_report_abs(dev, ABS_X, - (__s16) le16_to_cpup((__le16 *)(data + 10))); - input_report_abs(dev, ABS_Y, - ~(__s16) le16_to_cpup((__le16 *)(data + 12))); + /* triggers left/right */ + if (xpad->mapping & MAP_TRIGGERS_TO_BUTTONS) { + input_report_key(dev, BTN_TL2, + (__u16) le16_to_cpup((__le16 *)(data + 6))); + input_report_key(dev, BTN_TR2, + (__u16) le16_to_cpup((__le16 *)(data + 8))); + } else { + input_report_abs(dev, ABS_Z, + (__u16) le16_to_cpup((__le16 *)(data + 6))); + input_report_abs(dev, ABS_RZ, + (__u16) le16_to_cpup((__le16 *)(data + 8))); + } - /* right stick */ - input_report_abs(dev, ABS_RX, - (__s16) le16_to_cpup((__le16 *)(data + 14))); - input_report_abs(dev, ABS_RY, - ~(__s16) le16_to_cpup((__le16 *)(data + 16))); - } + /* paddle handling */ + /* based on SDL's SDL_hidapi_xboxone.c */ + if (xpad->mapping & MAP_PADDLES) { + if (xpad->packet_type == PKT_XBE1) { + /* Mute paddles if controller has a custom mapping applied. + * Checked by comparing the current mapping + * config against the factory mapping config + */ + if (memcmp(&data[4], &data[18], 2) != 0) + data[32] = 0; + + /* OG Elite Series Controller paddle bits */ + input_report_key(dev, BTN_TRIGGER_HAPPY5, data[32] & 0x02); + input_report_key(dev, BTN_TRIGGER_HAPPY6, data[32] & 0x08); + input_report_key(dev, BTN_TRIGGER_HAPPY7, data[32] & 0x01); + input_report_key(dev, BTN_TRIGGER_HAPPY8, data[32] & 0x04); + } else if (xpad->packet_type == PKT_XBE2_FW_OLD) { + /* Mute paddles if controller has a custom mapping applied. 
+ * Checked by comparing the current mapping + * config against the factory mapping config + */ + if (data[19] != 0) + data[18] = 0; + + /* Elite Series 2 4.x firmware paddle bits */ + input_report_key(dev, BTN_TRIGGER_HAPPY5, data[18] & 0x01); + input_report_key(dev, BTN_TRIGGER_HAPPY6, data[18] & 0x02); + input_report_key(dev, BTN_TRIGGER_HAPPY7, data[18] & 0x04); + input_report_key(dev, BTN_TRIGGER_HAPPY8, data[18] & 0x08); + } else if (xpad->packet_type == PKT_XBE2_FW_5_EARLY) { + /* Mute paddles if controller has a custom mapping applied. + * Checked by comparing the current mapping + * config against the factory mapping config + */ + if (data[23] != 0) + data[22] = 0; + + /* Elite Series 2 5.x firmware paddle bits + * (before the packet was split) + */ + input_report_key(dev, BTN_TRIGGER_HAPPY5, data[22] & 0x01); + input_report_key(dev, BTN_TRIGGER_HAPPY6, data[22] & 0x02); + input_report_key(dev, BTN_TRIGGER_HAPPY7, data[22] & 0x04); + input_report_key(dev, BTN_TRIGGER_HAPPY8, data[22] & 0x08); + } + } - /* triggers left/right */ - if (xpad->mapping & MAP_TRIGGERS_TO_BUTTONS) { - input_report_key(dev, BTN_TL2, - (__u16) le16_to_cpup((__le16 *)(data + 6))); - input_report_key(dev, BTN_TR2, - (__u16) le16_to_cpup((__le16 *)(data + 8))); - } else { - input_report_abs(dev, ABS_Z, - (__u16) le16_to_cpup((__le16 *)(data + 6))); - input_report_abs(dev, ABS_RZ, - (__u16) le16_to_cpup((__le16 *)(data + 8))); + do_sync = true; } - input_sync(dev); + if (do_sync) + input_sync(dev); } static void xpad_irq_in(struct urb *urb) @@ -1736,6 +1829,12 @@ static int xpad_init_input(struct usb_xpad *xpad) xpad_btn_pad[i]); } + /* set up paddles if the controller has them */ + if (xpad->mapping & MAP_PADDLES) { + for (i = 0; xpad_btn_paddles[i] >= 0; i++) + input_set_capability(input_dev, EV_KEY, xpad_btn_paddles[i]); + } + /* * This should be a simple else block. However historically * xbox360w has mapped DPAD to buttons while xbox360 did not. 
This @@ -1822,6 +1921,7 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id xpad->mapping = xpad_device[i].mapping; xpad->xtype = xpad_device[i].xtype; xpad->name = xpad_device[i].name; + xpad->packet_type = PKT_XB; INIT_WORK(&xpad->work, xpad_presence_work); if (xpad->xtype == XTYPE_UNKNOWN) { @@ -1887,6 +1987,38 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id usb_set_intfdata(intf, xpad); + /* Packet type detection */ + if (le16_to_cpu(udev->descriptor.idVendor) == 0x045e) { /* Microsoft controllers */ + if (le16_to_cpu(udev->descriptor.idProduct) == 0x02e3) { + /* The original elite controller always uses the oldest + * type of extended packet + */ + xpad->packet_type = PKT_XBE1; + } else if (le16_to_cpu(udev->descriptor.idProduct) == 0x0b00) { + /* The elite 2 controller has seen multiple packet + * revisions. These are tied to specific firmware + * versions + */ + if (le16_to_cpu(udev->descriptor.bcdDevice) < 0x0500) { + /* This is the format that the Elite 2 used + * prior to the BLE update + */ + xpad->packet_type = PKT_XBE2_FW_OLD; + } else if (le16_to_cpu(udev->descriptor.bcdDevice) < + 0x050b) { + /* This is the format that the Elite 2 used + * prior to the update that split the packet + */ + xpad->packet_type = PKT_XBE2_FW_5_EARLY; + } else { + /* The split packet format that was introduced + * in firmware v5.11 + */ + xpad->packet_type = PKT_XBE2_FW_5_11; + } + } + } + if (xpad->xtype == XTYPE_XBOX360W) { /* * Submit the int URB immediately rather than waiting for open -- GitLab From f5d620254c978746020a5be09f7b2a84dd9daa48 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Sep 2022 21:26:34 +0300 Subject: [PATCH 0152/2223] pinctrl: cy8c95x0: make irq_chip immutable Since recently, the kernel is nagging about mutable irq_chips: "not an immutable chip, please consider fixing it!" 
Drop the unneeded copy, flag it as IRQCHIP_IMMUTABLE, add the new helper functions and call the appropriate gpiolib functions. Signed-off-by: Andy Shevchenko Tested-by: Patrick Rudolph Link: https://lore.kernel.org/r/20220902182650.83098-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 32 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index 05791212822e5..efc6ba1089fb9 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -90,7 +90,6 @@ MODULE_DEVICE_TABLE(of, cy8c95x0_dt_ids); * @irq_trig_high: I/O bits affected by a high voltage level * @push_pull: I/O bits configured as push pull driver * @shiftmask: Mask used to compensate for Gport2 width - * @irq_chip: IRQ chip configuration * @nport: Number of Gports in this chip * @gpio_chip: gpiolib chip * @driver_data: private driver data @@ -112,7 +111,6 @@ struct cy8c95x0_pinctrl { DECLARE_BITMAP(irq_trig_high, MAX_LINE); DECLARE_BITMAP(push_pull, MAX_LINE); DECLARE_BITMAP(shiftmask, MAX_LINE); - struct irq_chip irq_chip; int nport; struct gpio_chip gpio_chip; unsigned long driver_data; @@ -844,16 +842,20 @@ static void cy8c95x0_irq_mask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); - set_bit(irqd_to_hwirq(d), chip->irq_mask); + set_bit(hwirq, chip->irq_mask); + gpiochip_disable_irq(gc, hwirq); } static void cy8c95x0_irq_unmask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); - clear_bit(irqd_to_hwirq(d), chip->irq_mask); + gpiochip_enable_irq(gc, hwirq); + clear_bit(hwirq, chip->irq_mask); } static void cy8c95x0_irq_bus_lock(struct irq_data *d) @@ -931,6 
+933,18 @@ static void cy8c95x0_irq_shutdown(struct irq_data *d) clear_bit(hwirq, chip->irq_trig_high); } +static const struct irq_chip cy8c95x0_irqchip = { + .name = "cy8c95x0-irq", + .irq_mask = cy8c95x0_irq_mask, + .irq_unmask = cy8c95x0_irq_unmask, + .irq_bus_lock = cy8c95x0_irq_bus_lock, + .irq_bus_sync_unlock = cy8c95x0_irq_bus_sync_unlock, + .irq_set_type = cy8c95x0_irq_set_type, + .irq_shutdown = cy8c95x0_irq_shutdown, + .flags = IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; + static bool cy8c95x0_irq_pending(struct cy8c95x0_pinctrl *chip, unsigned long *pending) { DECLARE_BITMAP(ones, MAX_LINE); @@ -1136,7 +1150,6 @@ static const struct pinconf_ops cy8c95x0_pinconf_ops = { static int cy8c95x0_irq_setup(struct cy8c95x0_pinctrl *chip, int irq) { - struct irq_chip *irq_chip = &chip->irq_chip; struct gpio_irq_chip *girq = &chip->gpio_chip.irq; DECLARE_BITMAP(pending_irqs, MAX_LINE); int ret; @@ -1155,15 +1168,8 @@ static int cy8c95x0_irq_setup(struct cy8c95x0_pinctrl *chip, int irq) /* Mask all interrupts */ bitmap_fill(chip->irq_mask, MAX_LINE); - irq_chip->name = devm_kasprintf(chip->dev, GFP_KERNEL, "%s-irq", chip->name); - irq_chip->irq_mask = cy8c95x0_irq_mask; - irq_chip->irq_unmask = cy8c95x0_irq_unmask; - irq_chip->irq_bus_lock = cy8c95x0_irq_bus_lock; - irq_chip->irq_bus_sync_unlock = cy8c95x0_irq_bus_sync_unlock; - irq_chip->irq_set_type = cy8c95x0_irq_set_type; - irq_chip->irq_shutdown = cy8c95x0_irq_shutdown; + gpio_irq_chip_set_chip(girq, &cy8c95x0_irqchip); - girq->chip = irq_chip; /* This will let us handle the parent IRQ in the driver */ girq->parent_handler = NULL; girq->num_parents = 0; -- GitLab From ad3d55aab4c08cdfc127753c5c5db78218cff4b2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Sep 2022 21:26:35 +0300 Subject: [PATCH 0153/2223] pinctrl: cy8c95x0: Allow IRQ chip core to handle numbering No need to assign first line number for IRQ chip. Let IRQ core to decide. 
Signed-off-by: Andy Shevchenko Tested-by: Patrick Rudolph Link: https://lore.kernel.org/r/20220902182650.83098-2-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index efc6ba1089fb9..529664894e208 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -1177,7 +1177,6 @@ static int cy8c95x0_irq_setup(struct cy8c95x0_pinctrl *chip, int irq) girq->default_type = IRQ_TYPE_NONE; girq->handler = handle_simple_irq; girq->threaded = true; - girq->first = 0; ret = devm_request_threaded_irq(chip->dev, irq, NULL, cy8c95x0_irq_handler, -- GitLab From 43dcf873d48d363d73fda0834c63d02ad177827e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Sep 2022 21:26:37 +0300 Subject: [PATCH 0154/2223] pinctrl: cy8c95x0: Fix return value in cy8c95x0_detect() It's an obvious typo in a never-tested piece of code: successful detection shouldn't return an error. Fix that.
Signed-off-by: Andy Shevchenko Tested-by: Patrick Rudolph Link: https://lore.kernel.org/r/20220902182650.83098-4-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index 529664894e208..2e46446c05ff8 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -1266,7 +1266,7 @@ static int cy8c95x0_detect(struct i2c_client *client, dev_info(&client->dev, "Found a %s chip at 0x%02x.\n", name, client->addr); strscpy(info->type, name, I2C_NAME_SIZE); - return -ENODEV; + return 0; } static int cy8c95x0_probe(struct i2c_client *client) -- GitLab From 641d6cc65dd4b783dd73ecfbface5e7a961c2f11 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Sep 2022 21:26:38 +0300 Subject: [PATCH 0155/2223] pinctrl: cy8c95x0: Fix pin control name to enable more than one MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Cypress GPIO expander is an I²C discrete component. Hence the platform may contain more than one of them. Currently the driver cannot handle this because the same name is used for all chips of a type. Replace it with a device-instance-specific name.
Signed-off-by: Andy Shevchenko Tested-by: Patrick Rudolph Link: https://lore.kernel.org/r/20220902182650.83098-5-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index 2e46446c05ff8..fa3764e768a4d 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -1199,8 +1199,7 @@ static int cy8c95x0_setup_pinctrl(struct cy8c95x0_pinctrl *chip) pd->confops = &cy8c95x0_pinconf_ops; pd->pmxops = &cy8c95x0_pmxops; pd->npins = chip->gpio_chip.ngpio; - pd->name = devm_kasprintf(chip->dev, GFP_KERNEL, "pinctrl-%s", - chip->name); + pd->name = dev_name(chip->dev); pd->pins = cy8c9560_pins; pd->npins = chip->tpin; pd->owner = THIS_MODULE; -- GitLab From 28ce127238f4bd3aaf5b6666f48f8e2a34b81579 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Sep 2022 21:26:39 +0300 Subject: [PATCH 0156/2223] pinctrl: cy8c95x0: Drop unneeded npins assignment The npins field is assigned twice. Remove the first occurrence. 
Signed-off-by: Andy Shevchenko Tested-by: Patrick Rudolph Link: https://lore.kernel.org/r/20220902182650.83098-6-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index fa3764e768a4d..a511044ea60a6 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -1198,7 +1198,6 @@ static int cy8c95x0_setup_pinctrl(struct cy8c95x0_pinctrl *chip) pd->pctlops = &cy8c95x0_pinctrl_ops; pd->confops = &cy8c95x0_pinconf_ops; pd->pmxops = &cy8c95x0_pmxops; - pd->npins = chip->gpio_chip.ngpio; pd->name = dev_name(chip->dev); pd->pins = cy8c9560_pins; pd->npins = chip->tpin; -- GitLab From d86e0344852eb65688c31227ee5a1e081f49e1bd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Sep 2022 21:26:40 +0300 Subject: [PATCH 0157/2223] pinctrl: cy8c95x0: Enable GPIO range Since it's a pin control, GPIO counterpart needs to know the mapping between pin numbering and GPIO numbering. Enable this by calling gpiochip_add_pin_range() at the chip addition time. 
Signed-off-by: Andy Shevchenko Tested-by: Patrick Rudolph Link: https://lore.kernel.org/r/20220902182650.83098-7-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index a511044ea60a6..2e05585c88db2 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -812,7 +812,20 @@ static void cy8c95x0_gpio_set_multiple(struct gpio_chip *gc, cy8c95x0_write_regs_mask(chip, CY8C95X0_OUTPUT, bits, mask); } -static int cy8c95x0_setup_gpiochip(struct cy8c95x0_pinctrl *chip, int ngpio) +static int cy8c95x0_add_pin_ranges(struct gpio_chip *gc) +{ + struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); + struct device *dev = chip->dev; + int ret; + + ret = gpiochip_add_pin_range(gc, dev_name(dev), 0, 0, chip->tpin); + if (ret) + dev_err(dev, "failed to add GPIO pin range\n"); + + return ret; +} + +static int cy8c95x0_setup_gpiochip(struct cy8c95x0_pinctrl *chip) { struct gpio_chip *gc = &chip->gpio_chip; @@ -825,9 +838,10 @@ static int cy8c95x0_setup_gpiochip(struct cy8c95x0_pinctrl *chip, int ngpio) gc->set_multiple = cy8c95x0_gpio_set_multiple; gc->set_config = cy8c95x0_gpio_set_config; gc->can_sleep = true; + gc->add_pin_ranges = cy8c95x0_add_pin_ranges; gc->base = -1; - gc->ngpio = ngpio; + gc->ngpio = chip->tpin; gc->parent = chip->dev; gc->owner = THIS_MODULE; @@ -1339,11 +1353,11 @@ static int cy8c95x0_probe(struct i2c_client *client) goto err_exit; } - ret = cy8c95x0_setup_gpiochip(chip, chip->tpin); + ret = cy8c95x0_setup_pinctrl(chip); if (ret) goto err_exit; - ret = cy8c95x0_setup_pinctrl(chip); + ret = cy8c95x0_setup_gpiochip(chip); if (ret) goto err_exit; -- GitLab From 44c2533366d2259ce861d5e3adfb0237a844ffa4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Sep 2022 21:26:41 +0300 Subject: [PATCH 0158/2223] 
pinctrl: cy8c95x0: Remove device initialization The Cypress CY8C95x0 chips have an internal EEPROM that defines initial configuration. It might be that bootloader or other entity wrote the platform related setup into it. Don't override it in the driver. Signed-off-by: Andy Shevchenko Tested-by: Patrick Rudolph Link: https://lore.kernel.org/r/20220902182650.83098-8-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index 2e05585c88db2..804dce0840f70 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -1224,30 +1224,6 @@ static int cy8c95x0_setup_pinctrl(struct cy8c95x0_pinctrl *chip) return 0; } -static int device_cy8c95x0_init(struct cy8c95x0_pinctrl *chip) -{ - DECLARE_BITMAP(ones, MAX_LINE); - DECLARE_BITMAP(zeros, MAX_LINE); - int ret; - - /* Set all pins to input. This is the POR default. 
*/ - bitmap_fill(ones, MAX_LINE); - ret = cy8c95x0_write_regs_mask(chip, CY8C95X0_DIRECTION, ones, ones); - if (ret) { - dev_err(chip->dev, "Failed to set pins to input\n"); - return ret; - } - - bitmap_zero(zeros, MAX_LINE); - ret = cy8c95x0_write_regs_mask(chip, CY8C95X0_INVERT, zeros, ones); - if (ret) { - dev_err(chip->dev, "Failed to set polarity inversion\n"); - return ret; - } - - return 0; -} - static int cy8c95x0_detect(struct i2c_client *client, struct i2c_board_info *info) { @@ -1343,10 +1319,6 @@ static int cy8c95x0_probe(struct i2c_client *client) bitmap_set(chip->shiftmask, 0, 20); mutex_init(&chip->i2c_lock); - ret = device_cy8c95x0_init(chip); - if (ret) - goto err_exit; - if (client->irq) { ret = cy8c95x0_irq_setup(chip, client->irq); if (ret) -- GitLab From a416bfb7d595b423cf50af58f1ea9263f233fbd5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Sep 2022 21:26:42 +0300 Subject: [PATCH 0159/2223] pinctrl: cy8c95x0: Remove useless conditionals The pin control framework checks pin boundaries before calling the respective driver's callbacks. Hence no need to check for pin boundaries, the respective conditionals won't be ever true. 
Signed-off-by: Andy Shevchenko Tested-by: Patrick Rudolph Link: https://lore.kernel.org/r/20220902182650.83098-9-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index 804dce0840f70..fef735bea648b 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -1040,14 +1040,6 @@ static int cy8c95x0_pinctrl_get_group_pins(struct pinctrl_dev *pctldev, const unsigned int **pins, unsigned int *num_pins) { - struct cy8c95x0_pinctrl *chip = pinctrl_dev_get_drvdata(pctldev); - - if (group >= chip->tpin) { - *pins = NULL; - *num_pins = 0; - return 0; - } - *pins = &cy8c9560_pins[group].number; *num_pins = 1; return 0; @@ -1115,9 +1107,6 @@ static int cy8c95x0_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, { struct cy8c95x0_pinctrl *chip = pinctrl_dev_get_drvdata(pctldev); - if (group >= chip->tpin) - return -EINVAL; - return cy8c95x0_pinmux_cfg(chip, selector, group); } @@ -1144,9 +1133,6 @@ static int cy8c95x0_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, int ret = 0; int i; - if (WARN_ON(pin >= chip->tpin)) - return -EINVAL; - for (i = 0; i < num_configs; i++) { ret = cy8c95x0_gpio_set_pincfg(chip, pin, configs[i]); if (ret) -- GitLab From 1fa3df901f2c80d6a597abe462725e5a374b2795 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Sep 2022 21:26:43 +0300 Subject: [PATCH 0160/2223] pinctrl: cy8c95x0: Remove custom ->set_config() Since we have pin configuration getter and setter provided, there is no need to duplicate that in the custom ->set_config(). Instead, switch to gpiochip_generic_config(). 
Signed-off-by: Andy Shevchenko Tested-by: Patrick Rudolph Link: https://lore.kernel.org/r/20220902182650.83098-10-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index fef735bea648b..204a53d6c4c98 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -772,30 +772,6 @@ out: return ret; } -static int cy8c95x0_gpio_set_config(struct gpio_chip *gc, unsigned int offset, - unsigned long config) -{ - struct cy8c95x0_pinctrl *chip = gpiochip_get_data(gc); - unsigned long arg = pinconf_to_config_argument(config); - - switch (pinconf_to_config_param(config)) { - case PIN_CONFIG_INPUT_ENABLE: - return cy8c95x0_gpio_direction_input(gc, offset); - case PIN_CONFIG_OUTPUT: - return cy8c95x0_gpio_direction_output(gc, offset, arg); - case PIN_CONFIG_MODE_PWM: - case PIN_CONFIG_BIAS_PULL_UP: - case PIN_CONFIG_BIAS_PULL_DOWN: - case PIN_CONFIG_BIAS_DISABLE: - case PIN_CONFIG_DRIVE_OPEN_DRAIN: - case PIN_CONFIG_DRIVE_OPEN_SOURCE: - case PIN_CONFIG_DRIVE_PUSH_PULL: - return cy8c95x0_gpio_set_pincfg(chip, offset, config); - default: - return -ENOTSUPP; - } -} - static int cy8c95x0_gpio_get_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { @@ -836,7 +812,7 @@ static int cy8c95x0_setup_gpiochip(struct cy8c95x0_pinctrl *chip) gc->get_direction = cy8c95x0_gpio_get_direction; gc->get_multiple = cy8c95x0_gpio_get_multiple; gc->set_multiple = cy8c95x0_gpio_set_multiple; - gc->set_config = cy8c95x0_gpio_set_config; + gc->set_config = gpiochip_generic_config, gc->can_sleep = true; gc->add_pin_ranges = cy8c95x0_add_pin_ranges; -- GitLab From c3e4095287afbc8954928ae79849766194d364ac Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Sep 2022 21:26:44 +0300 Subject: [PATCH 0161/2223] pinctrl: cy8c95x0: Use 
'default' in all switch-cases Move the default values to the 'default' case in the switches. Signed-off-by: Andy Shevchenko Tested-by: Patrick Rudolph Link: https://lore.kernel.org/r/20220902182650.83098-11-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index 204a53d6c4c98..c714c438f6413 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -279,9 +279,9 @@ static bool cy8c95x0_readable_register(struct device *dev, unsigned int reg) switch (reg) { case 0x24 ... 0x27: return false; + default: + return true; } - - return true; } static bool cy8c95x0_writeable_register(struct device *dev, unsigned int reg) @@ -293,9 +293,9 @@ static bool cy8c95x0_writeable_register(struct device *dev, unsigned int reg) return false; case 0x24 ... 0x27: return false; + default: + return true; } - - return true; } static bool cy8c95x0_volatile_register(struct device *dev, unsigned int reg) @@ -325,9 +325,9 @@ static bool cy8c95x0_precious_register(struct device *dev, unsigned int reg) switch (reg) { case CY8C95X0_INTSTATUS_(0) ... CY8C95X0_INTSTATUS_(7): return true; + default: + return false; } - - return false; } static const struct reg_default cy8c95x0_reg_defaults[] = { @@ -1255,6 +1255,8 @@ static int cy8c95x0_probe(struct i2c_client *client) case 60: strscpy(chip->name, cy8c95x0_id[2].name, I2C_NAME_SIZE); break; + default: + return -ENODEV; } reg = devm_regulator_get(&client->dev, "vdd"); -- GitLab From f12352f334c28a85e738f9caa31a0c5b07febd20 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Sep 2022 21:26:45 +0300 Subject: [PATCH 0162/2223] pinctrl: cy8c95x0: Implement ->pin_dbg_show() The introduced callback ->pin_dbg_show() is useful for debugging. 
Signed-off-by: Andy Shevchenko Tested-by: Patrick Rudolph Link: https://lore.kernel.org/r/20220902182650.83098-12-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 40 ++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index c714c438f6413..e1900db54c16a 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -1021,25 +1021,49 @@ static int cy8c95x0_pinctrl_get_group_pins(struct pinctrl_dev *pctldev, return 0; } +static const char *cy8c95x0_get_fname(unsigned int selector) +{ + if (selector == 0) + return "gpio"; + else + return "pwm"; +} + +static void cy8c95x0_pin_dbg_show(struct pinctrl_dev *pctldev, struct seq_file *s, + unsigned int pin) +{ + struct cy8c95x0_pinctrl *chip = pinctrl_dev_get_drvdata(pctldev); + DECLARE_BITMAP(mask, MAX_LINE); + DECLARE_BITMAP(pwm, MAX_LINE); + + bitmap_zero(mask, MAX_LINE); + __set_bit(pin, mask); + + if (cy8c95x0_read_regs_mask(chip, CY8C95X0_PWMSEL, pwm, mask)) { + seq_puts(s, "not available"); + return; + } + + seq_printf(s, "MODE:%s", cy8c95x0_get_fname(test_bit(pin, pwm))); +} + static const struct pinctrl_ops cy8c95x0_pinctrl_ops = { .get_groups_count = cy8c95x0_pinctrl_get_groups_count, .get_group_name = cy8c95x0_pinctrl_get_group_name, .get_group_pins = cy8c95x0_pinctrl_get_group_pins, .dt_node_to_map = pinconf_generic_dt_node_to_map_pin, .dt_free_map = pinconf_generic_dt_free_map, + .pin_dbg_show = cy8c95x0_pin_dbg_show, }; -static int cy8c95x0_get_functions_count(struct pinctrl_dev *pctldev) +static const char *cy8c95x0_get_functions_name(struct pinctrl_dev *pctldev, unsigned int selector) { - return 2; + return cy8c95x0_get_fname(selector); } -static const char *cy8c95x0_get_fname(struct pinctrl_dev *pctldev, unsigned int selector) +static int cy8c95x0_get_functions_count(struct pinctrl_dev *pctldev) { - if (selector == 
0) - return "gpio"; - else - return "pwm"; + return 2; } static int cy8c95x0_get_groups(struct pinctrl_dev *pctldev, unsigned int selector, @@ -1088,7 +1112,7 @@ static int cy8c95x0_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, static const struct pinmux_ops cy8c95x0_pmxops = { .get_functions_count = cy8c95x0_get_functions_count, - .get_function_name = cy8c95x0_get_fname, + .get_function_name = cy8c95x0_get_functions_name, .get_function_groups = cy8c95x0_get_groups, .set_mux = cy8c95x0_set_mux, .strict = true, -- GitLab From 8586466e4f11a5879a7c0df5d25da6c6a7d7c672 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Sep 2022 21:26:46 +0300 Subject: [PATCH 0163/2223] pinctrl: cy8c95x0: Make use of device properties Convert the module to be property provider agnostic and allow it to be used on non-OF platforms. Add mod_devicetable.h include. Signed-off-by: Andy Shevchenko Tested-by: Patrick Rudolph Link: https://lore.kernel.org/r/20220902182650.83098-13-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/Kconfig | 2 +- drivers/pinctrl/pinctrl-cy8c95x0.c | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/pinctrl/Kconfig b/drivers/pinctrl/Kconfig index fc0e529e633ff..c09562fbb1b75 100644 --- a/drivers/pinctrl/Kconfig +++ b/drivers/pinctrl/Kconfig @@ -137,7 +137,7 @@ config PINCTRL_BM1880 config PINCTRL_CY8C95X0 tristate "Cypress CY8C95X0 I2C pinctrl and GPIO driver" - depends on I2C && OF + depends on I2C select GPIOLIB select GPIOLIB_IRQCHIP select PINMUX diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index e1900db54c16a..e0f99c82f6214 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -13,15 +13,16 @@ #include #include #include +#include #include -#include -#include +#include +#include +#include + #include #include #include #include -#include -#include /* Fast access registers */ #define 
CY8C95X0_INPUT 0x00 @@ -1051,8 +1052,10 @@ static const struct pinctrl_ops cy8c95x0_pinctrl_ops = { .get_groups_count = cy8c95x0_pinctrl_get_groups_count, .get_group_name = cy8c95x0_pinctrl_get_group_name, .get_group_pins = cy8c95x0_pinctrl_get_group_pins, +#ifdef CONFIG_OF .dt_node_to_map = pinconf_generic_dt_node_to_map_pin, .dt_free_map = pinconf_generic_dt_free_map, +#endif .pin_dbg_show = cy8c95x0_pin_dbg_show, }; @@ -1256,9 +1259,8 @@ static int cy8c95x0_probe(struct i2c_client *client) chip->dev = &client->dev; /* Set the device type */ - if (client->dev.of_node) - chip->driver_data = (unsigned long)of_device_get_match_data(&client->dev); - else + chip->driver_data = (unsigned long)device_get_match_data(&client->dev); + if (!chip->driver_data) chip->driver_data = i2c_match_id(cy8c95x0_id, client)->driver_data; if (!chip->driver_data) -- GitLab From 618a43ff1f37603164ac82cfa0734a39c079a3e9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Sep 2022 21:26:47 +0300 Subject: [PATCH 0164/2223] pinctrl: cy8c95x0: support ACPI device found on Galileo Gen1 Add support of the expander found on Intel Galileo Gen1 board. The platform information comes from ACPI. 
Signed-off-by: Andy Shevchenko Tested-by: Patrick Rudolph Link: https://lore.kernel.org/r/20220902182650.83098-14-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index e0f99c82f6214..18a9f5a8ab2de 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -1339,10 +1339,17 @@ static void cy8c95x0_remove(struct i2c_client *client) regulator_disable(chip->regulator); } +static const struct acpi_device_id cy8c95x0_acpi_ids[] = { + { "INT3490", 40, }, + { } +}; +MODULE_DEVICE_TABLE(acpi, cy8c95x0_acpi_ids); + static struct i2c_driver cy8c95x0_driver = { .driver = { .name = "cy8c95x0-pinctrl", .of_match_table = cy8c95x0_dt_ids, + .acpi_match_table = cy8c95x0_acpi_ids, }, .probe_new = cy8c95x0_probe, .remove = cy8c95x0_remove, -- GitLab From 785b1bd8546eb0d9e70bd4d859583c33aaceeb9b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Sep 2022 21:26:48 +0300 Subject: [PATCH 0165/2223] pinctrl: cy8c95x0: Override IRQ for one of the expanders on Galileo Gen 1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ACPI table on Intel Galileo Gen 1 has wrong pin number for IRQ resource of the I²C GPIO expander. Since we know what that number is and luckily have GPIO bases fixed for SoC's controllers, we may use a simple DMI quirk to match the platform and retrieve GpioInt() pin on it for the expander in question. 
Signed-off-by: Andy Shevchenko Tested-by: Patrick Rudolph Link: https://lore.kernel.org/r/20220902182650.83098-15-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 48 ++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index 18a9f5a8ab2de..a427909501820 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -7,7 +7,9 @@ * Author: Naresh Solanki */ +#include #include +#include #include #include #include @@ -73,6 +75,46 @@ static const struct of_device_id cy8c95x0_dt_ids[] = { MODULE_DEVICE_TABLE(of, cy8c95x0_dt_ids); +static const struct acpi_gpio_params cy8c95x0_irq_gpios = { 0, 0, true }; + +static const struct acpi_gpio_mapping cy8c95x0_acpi_irq_gpios[] = { + { "irq-gpios", &cy8c95x0_irq_gpios, 1, ACPI_GPIO_QUIRK_ABSOLUTE_NUMBER }, + { } +}; + +static int cy8c95x0_acpi_get_irq(struct device *dev) +{ + int ret; + + ret = devm_acpi_dev_add_driver_gpios(dev, cy8c95x0_acpi_irq_gpios); + if (ret) + dev_warn(dev, "can't add GPIO ACPI mapping\n"); + + ret = acpi_dev_gpio_irq_get_by(ACPI_COMPANION(dev), "irq-gpios", 0); + if (ret < 0) + return ret; + + dev_info(dev, "ACPI interrupt quirk (IRQ %d)\n", ret); + return ret; +} + +static const struct dmi_system_id cy8c95x0_dmi_acpi_irq_info[] = { + { + /* + * On Intel Galileo Gen 1 board the IRQ pin is provided + * as an absolute number instead of being relative. + * Since first controller (gpio-sch.c) and second + * (gpio-dwapb.c) are at the fixed bases, we may safely + * refer to the number in the global space to get an IRQ + * out of it. 
+ */ + .matches = { + DMI_EXACT_MATCH(DMI_BOARD_NAME, "Galileo"), + }, + }, + {} +}; + #define MAX_BANK 8 #define BANK_SZ 8 #define MAX_LINE (MAX_BANK * BANK_SZ) @@ -1309,6 +1351,12 @@ static int cy8c95x0_probe(struct i2c_client *client) bitmap_set(chip->shiftmask, 0, 20); mutex_init(&chip->i2c_lock); + if (dmi_first_match(cy8c95x0_dmi_acpi_irq_info)) { + ret = cy8c95x0_acpi_get_irq(&client->dev); + if (ret > 0) + client->irq = ret; + } + if (client->irq) { ret = cy8c95x0_irq_setup(chip, client->irq); if (ret) -- GitLab From 9540a8360673350526615e1dfe4993ba06de0f15 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Sep 2022 21:26:49 +0300 Subject: [PATCH 0166/2223] pinctrl: cy8c95x0: use bits.h macros for all masks Make use of the GENMASK() (far less error-prone, far more concise). Signed-off-by: Andy Shevchenko Tested-by: Patrick Rudolph Link: https://lore.kernel.org/r/20220902182650.83098-16-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index a427909501820..c8f86c3f526f8 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -374,14 +374,14 @@ static bool cy8c95x0_precious_register(struct device *dev, unsigned int reg) } static const struct reg_default cy8c95x0_reg_defaults[] = { - { CY8C95X0_OUTPUT_(0), 0xff }, - { CY8C95X0_OUTPUT_(1), 0xff }, - { CY8C95X0_OUTPUT_(2), 0xff }, - { CY8C95X0_OUTPUT_(3), 0xff }, - { CY8C95X0_OUTPUT_(4), 0xff }, - { CY8C95X0_OUTPUT_(5), 0xff }, - { CY8C95X0_OUTPUT_(6), 0xff }, - { CY8C95X0_OUTPUT_(7), 0xff }, + { CY8C95X0_OUTPUT_(0), GENMASK(7, 0) }, + { CY8C95X0_OUTPUT_(1), GENMASK(7, 0) }, + { CY8C95X0_OUTPUT_(2), GENMASK(7, 0) }, + { CY8C95X0_OUTPUT_(3), GENMASK(7, 0) }, + { CY8C95X0_OUTPUT_(4), GENMASK(7, 0) }, + { CY8C95X0_OUTPUT_(5), GENMASK(7, 0) }, + { 
CY8C95X0_OUTPUT_(6), GENMASK(7, 0) }, + { CY8C95X0_OUTPUT_(7), GENMASK(7, 0) }, { CY8C95X0_PORTSEL, 0 }, { CY8C95X0_PWMSEL, 0 }, }; @@ -1268,7 +1268,7 @@ static int cy8c95x0_detect(struct i2c_client *client, ret = i2c_smbus_read_byte_data(client, CY8C95X0_DEVID); if (ret < 0) return ret; - switch (ret & 0xf0) { + switch (ret & GENMASK(7, 4)) { case 0x20: name = cy8c95x0_id[0].name; break; -- GitLab From 63e23304488f25f7193d5868b6cef02cf3a05e66 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Sep 2022 21:26:50 +0300 Subject: [PATCH 0167/2223] pinctrl: cy8c95x0: Correct comment style In a few comments the style is not aligned with the rest. Correct them. While at it, drop unneeded blank lines and deduplicate 'Author'. Signed-off-by: Andy Shevchenko Tested-by: Patrick Rudolph Link: https://lore.kernel.org/r/20220902182650.83098-17-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 40 +++++++++++++++--------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index c8f86c3f526f8..1335d07822f9a 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -3,8 +3,8 @@ * CY8C95X0 20/40/60 pin I2C GPIO port expander with interrupt support * * Copyright (C) 2022 9elements GmbH - * Author: Patrick Rudolph - * Author: Naresh Solanki + * Authors: Patrick Rudolph + * Naresh Solanki */ #include @@ -37,7 +37,7 @@ /* Port Select configures the port */ #define CY8C95X0_PORTSEL 0x18 -/* port settings, write PORTSEL first */ +/* Port settings, write PORTSEL first */ #define CY8C95X0_INTMASK 0x19 #define CY8C95X0_PWMSEL 0x1A #define CY8C95X0_INVERT 0x1B @@ -72,7 +72,6 @@ static const struct of_device_id cy8c95x0_dt_ids[] = { { .compatible = "cypress,cy8c9560", .data = OF_CY8C95X(60), }, { } }; - MODULE_DEVICE_TABLE(of, cy8c95x0_dt_ids); static const struct acpi_gpio_params cy8c95x0_irq_gpios = { 0, 0, 
true }; @@ -429,7 +428,7 @@ static int cy8c95x0_write_regs_mask(struct cy8c95x0_pinctrl *chip, int reg, continue; switch (reg) { - /* muxed registers */ + /* Muxed registers */ case CY8C95X0_INTMASK: case CY8C95X0_PWMSEL: case CY8C95X0_INVERT: @@ -446,7 +445,7 @@ static int cy8c95x0_write_regs_mask(struct cy8c95x0_pinctrl *chip, int reg, goto out; off = reg; break; - /* direct access registers */ + /* Direct access registers */ case CY8C95X0_INPUT: case CY8C95X0_OUTPUT: case CY8C95X0_INTSTATUS: @@ -500,7 +499,7 @@ static int cy8c95x0_read_regs_mask(struct cy8c95x0_pinctrl *chip, int reg, continue; switch (reg) { - /* muxed registers */ + /* Muxed registers */ case CY8C95X0_INTMASK: case CY8C95X0_PWMSEL: case CY8C95X0_INVERT: @@ -517,7 +516,7 @@ static int cy8c95x0_read_regs_mask(struct cy8c95x0_pinctrl *chip, int reg, goto out; off = reg; break; - /* direct access registers */ + /* Direct access registers */ case CY8C95X0_INPUT: case CY8C95X0_OUTPUT: case CY8C95X0_INTSTATUS: @@ -592,18 +591,18 @@ static int cy8c95x0_gpio_direction_output(struct gpio_chip *gc, u8 bit = cypress_get_pin_mask(chip, off); int ret; - /* set output level */ + /* Set output level */ ret = regmap_write_bits(chip->regmap, outreg, bit, val ? bit : 0); if (ret) return ret; mutex_lock(&chip->i2c_lock); - /* select port */ + /* Select port... */ ret = regmap_write(chip->regmap, CY8C95X0_PORTSEL, port); if (ret) goto out; - /* then direction */ + /* ...then direction */ ret = regmap_write_bits(chip->regmap, CY8C95X0_DIRECTION, bit, 0); out: @@ -624,7 +623,7 @@ static int cy8c95x0_gpio_get_value(struct gpio_chip *gc, unsigned int off) if (ret < 0) { /* * NOTE: - * diagnostic already emitted; that's all we should + * Diagnostic already emitted; that's all we should * do unless gpio_*_value_cansleep() calls become different * from their nonsleeping siblings (and report faults). 
*/ @@ -687,7 +686,7 @@ static int cy8c95x0_gpio_get_pincfg(struct cy8c95x0_pinctrl *chip, mutex_lock(&chip->i2c_lock); - /* select port */ + /* Select port */ ret = regmap_write(chip->regmap, CY8C95X0_PORTSEL, port); if (ret < 0) goto out; @@ -742,7 +741,8 @@ static int cy8c95x0_gpio_get_pincfg(struct cy8c95x0_pinctrl *chip, ret = -ENOTSUPP; goto out; } - /* Writing 1 to one of the drive mode registers will automatically + /* + * Writing 1 to one of the drive mode registers will automatically * clear conflicting set bits in the other drive mode registers. */ ret = regmap_read(chip->regmap, reg, &reg_val); @@ -768,7 +768,7 @@ static int cy8c95x0_gpio_set_pincfg(struct cy8c95x0_pinctrl *chip, mutex_lock(&chip->i2c_lock); - /* select port */ + /* Select port */ ret = regmap_write(chip->regmap, CY8C95X0_PORTSEL, port); if (ret < 0) goto out; @@ -805,7 +805,8 @@ static int cy8c95x0_gpio_set_pincfg(struct cy8c95x0_pinctrl *chip, ret = -ENOTSUPP; goto out; } - /* Writing 1 to one of the drive mode registers will automatically + /* + * Writing 1 to one of the drive mode registers will automatically * clear conflicting set bits in the other drive mode registers. 
*/ ret = regmap_write_bits(chip->regmap, reg, bit, bit); @@ -1130,7 +1131,7 @@ static int cy8c95x0_pinmux_cfg(struct cy8c95x0_pinctrl *chip, u8 bit = cypress_get_pin_mask(chip, off); int ret; - /* select port */ + /* Select port */ ret = regmap_write(chip->regmap, CY8C95X0_PORTSEL, port); if (ret < 0) return ret; @@ -1247,11 +1248,12 @@ static int cy8c95x0_setup_pinctrl(struct cy8c95x0_pinctrl *chip) pd->pins = cy8c9560_pins; pd->npins = chip->tpin; pd->owner = THIS_MODULE; - chip->pctldev = devm_pinctrl_register(chip->dev, pd, chip); + chip->pctldev = devm_pinctrl_register(chip->dev, pd, chip); if (IS_ERR(chip->pctldev)) return dev_err_probe(chip->dev, PTR_ERR(chip->pctldev), "can't register controller\n"); + return 0; } @@ -1304,7 +1306,6 @@ static int cy8c95x0_probe(struct i2c_client *client) chip->driver_data = (unsigned long)device_get_match_data(&client->dev); if (!chip->driver_data) chip->driver_data = i2c_match_id(cy8c95x0_id, client)->driver_data; - if (!chip->driver_data) return -ENODEV; @@ -1404,7 +1405,6 @@ static struct i2c_driver cy8c95x0_driver = { .id_table = cy8c95x0_id, .detect = cy8c95x0_detect, }; - module_i2c_driver(cy8c95x0_driver); MODULE_AUTHOR("Patrick Rudolph "); -- GitLab From 71e268e3426d2a1a4fcf3d88079d1d977fd034e0 Mon Sep 17 00:00:00 2001 From: Francesco Dolcini Date: Tue, 6 Sep 2022 00:44:08 +0200 Subject: [PATCH 0168/2223] pinctrl: imx8m: kconfig: Fix build error on test compile PINCTRL_IMX depends on OF, however the dependency is missed when selected by PINCTRL_IMX8M* (it does not follow the indirect 'select' statements), select it explicitly. 
Cc: Arnd Bergmann Cc: Linus Walleij Reported-by: kernel test robot Link: https://lore.kernel.org/all/202209050605.fezJUgFH-lkp@intel.com/ Fixes: 87c2a29a6bf1 ("pinctrl: imx8m: kconfig: Depends on SOC_IMX8M") Signed-off-by: Francesco Dolcini Reviewed-by: Jacky Bai Link: https://lore.kernel.org/r/20220905224408.346425-1-francesco.dolcini@toradex.com Signed-off-by: Linus Walleij --- drivers/pinctrl/freescale/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/pinctrl/freescale/Kconfig b/drivers/pinctrl/freescale/Kconfig index 365fcff8e470d..7a32f77792d9a 100644 --- a/drivers/pinctrl/freescale/Kconfig +++ b/drivers/pinctrl/freescale/Kconfig @@ -119,6 +119,7 @@ config PINCTRL_IMX7ULP config PINCTRL_IMX8MM tristate "IMX8MM pinctrl driver" + depends on OF depends on SOC_IMX8M select PINCTRL_IMX help @@ -126,6 +127,7 @@ config PINCTRL_IMX8MM config PINCTRL_IMX8MN tristate "IMX8MN pinctrl driver" + depends on OF depends on SOC_IMX8M select PINCTRL_IMX help @@ -133,6 +135,7 @@ config PINCTRL_IMX8MN config PINCTRL_IMX8MP tristate "IMX8MP pinctrl driver" + depends on OF depends on SOC_IMX8M select PINCTRL_IMX help @@ -140,6 +143,7 @@ config PINCTRL_IMX8MP config PINCTRL_IMX8MQ tristate "IMX8MQ pinctrl driver" + depends on OF depends on SOC_IMX8M select PINCTRL_IMX help -- GitLab From f1509dad5dbf480e3f19fbd99e586d919adf55fe Mon Sep 17 00:00:00 2001 From: Iskren Chernev Date: Sat, 3 Sep 2022 20:41:45 +0300 Subject: [PATCH 0169/2223] dt-bindings: pinctrl: qcom: sm6115: Add reserved ranges Ideally this and similar common properties will be inherited so you won't need to paste them in every pinctrl binding. 
Signed-off-by: Iskren Chernev Reviewed-by: Caleb Connolly Link: https://lore.kernel.org/r/20220903174150.3566935-5-iskren.chernev@gmail.com Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml index a7a2bb8bff467..d8443811767db 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml @@ -49,6 +49,8 @@ properties: gpio-ranges: maxItems: 1 + gpio-reserved-ranges: true + wakeup-parent: true #PIN CONFIGURATION NODES -- GitLab From 8c943137c00a773ece8d324862910d53832a97a1 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 5 Sep 2022 21:51:02 +0300 Subject: [PATCH 0170/2223] pinctrl: ingenic: Switch to use fwnode instead of of_node GPIO library now accepts fwnode as a firmware node, so switch the driver to use it. 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220905185102.74056-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-ingenic.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/pinctrl/pinctrl-ingenic.c b/drivers/pinctrl/pinctrl-ingenic.c index 3a9ee9c8af116..7e732076dedf0 100644 --- a/drivers/pinctrl/pinctrl-ingenic.c +++ b/drivers/pinctrl/pinctrl-ingenic.c @@ -12,14 +12,14 @@ #include #include #include -#include -#include -#include +#include +#include #include #include #include #include #include +#include #include #include #include @@ -4152,7 +4152,7 @@ static const struct of_device_id ingenic_gpio_of_matches[] __initconst = { }; static int __init ingenic_gpio_probe(struct ingenic_pinctrl *jzpc, - struct device_node *node) + struct fwnode_handle *fwnode) { struct ingenic_gpio_chip *jzgc; struct device *dev = jzpc->dev; @@ -4160,7 +4160,7 @@ static int __init ingenic_gpio_probe(struct ingenic_pinctrl *jzpc, unsigned int bank; int err; - err = of_property_read_u32(node, "reg", &bank); + err = fwnode_property_read_u32(fwnode, "reg", &bank); if (err) { dev_err(dev, "Cannot read \"reg\" property: %i\n", err); return err; @@ -4185,7 +4185,7 @@ static int __init ingenic_gpio_probe(struct ingenic_pinctrl *jzpc, jzgc->gc.ngpio = 32; jzgc->gc.parent = dev; - jzgc->gc.of_node = node; + jzgc->gc.fwnode = fwnode; jzgc->gc.owner = THIS_MODULE; jzgc->gc.set = ingenic_gpio_set; @@ -4196,9 +4196,12 @@ static int __init ingenic_gpio_probe(struct ingenic_pinctrl *jzpc, jzgc->gc.request = gpiochip_generic_request; jzgc->gc.free = gpiochip_generic_free; - jzgc->irq = irq_of_parse_and_map(node, 0); - if (!jzgc->irq) + err = fwnode_irq_get(fwnode, 0); + if (err < 0) + return err; + if (!err) return -EINVAL; + jzgc->irq = err; girq = &jzgc->gc.irq; gpio_irq_chip_set_chip(girq, &ingenic_gpio_irqchip); @@ -4227,12 +4230,12 @@ static int __init ingenic_pinctrl_probe(struct 
platform_device *pdev) struct pinctrl_desc *pctl_desc; void __iomem *base; const struct ingenic_chip_info *chip_info; - struct device_node *node; struct regmap_config regmap_config; + struct fwnode_handle *fwnode; unsigned int i; int err; - chip_info = of_device_get_match_data(dev); + chip_info = device_get_match_data(dev); if (!chip_info) { dev_err(dev, "Unsupported SoC\n"); return -EINVAL; @@ -4319,11 +4322,11 @@ static int __init ingenic_pinctrl_probe(struct platform_device *pdev) dev_set_drvdata(dev, jzpc->map); - for_each_child_of_node(dev->of_node, node) { - if (of_match_node(ingenic_gpio_of_matches, node)) { - err = ingenic_gpio_probe(jzpc, node); + device_for_each_child_node(dev, fwnode) { + if (of_match_node(ingenic_gpio_of_matches, to_of_node(fwnode))) { + err = ingenic_gpio_probe(jzpc, fwnode); if (err) { - of_node_put(node); + fwnode_handle_put(fwnode); return err; } } -- GitLab From 6323f916686d4e9d299a53114b28ecaf833422cd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 6 Sep 2022 14:50:21 +0300 Subject: [PATCH 0171/2223] pinctrl: microchip-sgpio: Correct the fwnode_irq_get() return value check fwnode_irq_get() may return all possible signed values, such as Linux error code. Fix the code to handle this properly. 
Fixes: be2dc859abd4 ("pinctrl: pinctrl-microchip-sgpio: Add irq support (for sparx5)") Signed-off-by: Andy Shevchenko Reviewed-by: Michael Walle Link: https://lore.kernel.org/r/20220906115021.8661-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-microchip-sgpio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/pinctrl-microchip-sgpio.c b/drivers/pinctrl/pinctrl-microchip-sgpio.c index 6f55bf7d5e054..0771b743a940d 100644 --- a/drivers/pinctrl/pinctrl-microchip-sgpio.c +++ b/drivers/pinctrl/pinctrl-microchip-sgpio.c @@ -864,9 +864,10 @@ static int microchip_sgpio_register_bank(struct device *dev, gc->can_sleep = !bank->is_input; if (bank->is_input && priv->properties->flags & SGPIO_FLAGS_HAS_IRQ) { - int irq = fwnode_irq_get(fwnode, 0); + int irq; - if (irq) { + irq = fwnode_irq_get(fwnode, 0); + if (irq > 0) { struct gpio_irq_chip *girq = &gc->irq; gpio_irq_chip_set_chip(girq, &microchip_sgpio_irqchip); -- GitLab From 827eb27ec2e508e1ef5dc36d29db73cbae1ccb40 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 5 Sep 2022 21:00:34 +0300 Subject: [PATCH 0172/2223] pinctrl: meson: Switch to use fwnode instead of of_node GPIO library now accepts fwnode as a firmware node, so switch the driver to use it. 
Signed-off-by: Andy Shevchenko Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20220905180034.73132-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/meson/pinctrl-meson.c | 7 +++---- drivers/pinctrl/meson/pinctrl-meson.h | 4 +++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/pinctrl/meson/pinctrl-meson.c b/drivers/pinctrl/meson/pinctrl-meson.c index cc2cd73ff8f98..530f3f934e196 100644 --- a/drivers/pinctrl/meson/pinctrl-meson.c +++ b/drivers/pinctrl/meson/pinctrl-meson.c @@ -608,6 +608,7 @@ static int meson_gpiolib_register(struct meson_pinctrl *pc) pc->chip.label = pc->data->name; pc->chip.parent = pc->dev; + pc->chip.fwnode = pc->fwnode; pc->chip.request = gpiochip_generic_request; pc->chip.free = gpiochip_generic_free; pc->chip.set_config = gpiochip_generic_config; @@ -619,8 +620,6 @@ static int meson_gpiolib_register(struct meson_pinctrl *pc) pc->chip.base = -1; pc->chip.ngpio = pc->data->num_pins; pc->chip.can_sleep = false; - pc->chip.of_node = pc->of_node; - pc->chip.of_gpio_n_cells = 2; ret = gpiochip_add_data(&pc->chip, pc); if (ret) { @@ -678,8 +677,8 @@ static int meson_pinctrl_parse_dt(struct meson_pinctrl *pc) return -EINVAL; } - gpio_np = to_of_node(gpiochip_node_get_first(pc->dev)); - pc->of_node = gpio_np; + pc->fwnode = gpiochip_node_get_first(pc->dev); + gpio_np = to_of_node(pc->fwnode); pc->reg_mux = meson_map_resource(pc, gpio_np, "mux"); if (IS_ERR_OR_NULL(pc->reg_mux)) { diff --git a/drivers/pinctrl/meson/pinctrl-meson.h b/drivers/pinctrl/meson/pinctrl-meson.h index b197827027bd0..34fc4e8612e47 100644 --- a/drivers/pinctrl/meson/pinctrl-meson.h +++ b/drivers/pinctrl/meson/pinctrl-meson.h @@ -12,6 +12,8 @@ #include #include +struct fwnode_handle; + struct meson_pinctrl; /** @@ -131,7 +133,7 @@ struct meson_pinctrl { struct regmap *reg_gpio; struct regmap *reg_ds; struct gpio_chip chip; - struct device_node *of_node; + struct fwnode_handle *fwnode; }; #define FUNCTION(fn) \ 
-- GitLab From 1a41d1e5c8e5c5850a15abf3d18f610e9310b8ef Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 30 Aug 2022 14:52:32 +0530 Subject: [PATCH 0173/2223] pinctrl: qcom: spmi-gpio: Make irqchip immutable The irqchip implementation used inside the gpiochips are not supposed to be changed during runtime. So let's make the one inside the spmi-gpio gpiochip immutable. This fixes the below warning during boot: gpio gpiochip0: (c440000.spmi:pmic@0:gpio@c000): not an immutable chip, please consider fixing it! Acked-by: Marc Zyngier Signed-off-by: Manivannan Sadhasivam Reviewed-by: Johan Hovold Link: https://lore.kernel.org/r/20220830092232.168561-1-manivannan.sadhasivam@linaro.org [switched two lines as indicated by Johan] Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-spmi-gpio.c | 38 +++++++++++++++++------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c index ccaf40a9c0e6b..8ba3d5021f0b8 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c @@ -171,7 +171,6 @@ struct pmic_gpio_state { struct regmap *map; struct pinctrl_dev *ctrl; struct gpio_chip chip; - struct irq_chip irq; u8 usid; u8 pid_base; }; @@ -985,6 +984,33 @@ static int pmic_gpio_populate_parent_fwspec(struct gpio_chip *chip, return 0; } +static void pmic_gpio_irq_mask(struct irq_data *data) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(data); + + irq_chip_mask_parent(data); + gpiochip_disable_irq(gc, data->hwirq); +} + +static void pmic_gpio_irq_unmask(struct irq_data *data) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(data); + + gpiochip_enable_irq(gc, data->hwirq); + irq_chip_unmask_parent(data); +} + +static const struct irq_chip spmi_gpio_irq_chip = { + .name = "spmi-gpio", + .irq_ack = irq_chip_ack_parent, + .irq_mask = pmic_gpio_irq_mask, + .irq_unmask = pmic_gpio_irq_unmask, + .irq_set_type = 
irq_chip_set_type_parent, + .irq_set_wake = irq_chip_set_wake_parent, + .flags = IRQCHIP_IMMUTABLE | IRQCHIP_MASK_ON_SUSPEND, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; + static int pmic_gpio_probe(struct platform_device *pdev) { struct irq_domain *parent_domain; @@ -1078,16 +1104,8 @@ static int pmic_gpio_probe(struct platform_device *pdev) if (!parent_domain) return -ENXIO; - state->irq.name = "spmi-gpio", - state->irq.irq_ack = irq_chip_ack_parent, - state->irq.irq_mask = irq_chip_mask_parent, - state->irq.irq_unmask = irq_chip_unmask_parent, - state->irq.irq_set_type = irq_chip_set_type_parent, - state->irq.irq_set_wake = irq_chip_set_wake_parent, - state->irq.flags = IRQCHIP_MASK_ON_SUSPEND, - girq = &state->chip.irq; - girq->chip = &state->irq; + gpio_irq_chip_set_chip(girq, &spmi_gpio_irq_chip); girq->default_type = IRQ_TYPE_NONE; girq->handler = handle_level_irq; girq->fwnode = of_node_to_fwnode(state->dev->of_node); -- GitLab From 88d60d7d94bfab00abef3596fbf54b460c2a45ec Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 8 Sep 2022 12:43:23 +0300 Subject: [PATCH 0174/2223] pinctrl: pistachio: Correct the fwnode_irq_get() return value check fwnode_irq_get() may return all possible signed values, such as Linux error code or 0. Fix the code to handle this properly. 
Fixes: 1074e1d23a5c ("pinctrl: pistachio: Switch to use fwnode instead of") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220908094323.31965-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-pistachio.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/pinctrl/pinctrl-pistachio.c b/drivers/pinctrl/pinctrl-pistachio.c index 940ed3fff63a8..7ca4ecb6eb8d7 100644 --- a/drivers/pinctrl/pinctrl-pistachio.c +++ b/drivers/pinctrl/pinctrl-pistachio.c @@ -1374,8 +1374,14 @@ static int pistachio_gpio_register(struct pistachio_pinctrl *pctl) ret = fwnode_irq_get(child, 0); if (ret < 0) { + fwnode_handle_put(child); + dev_err(pctl->dev, "Failed to retrieve IRQ for bank %u\n", i); + goto err; + } + if (!ret) { fwnode_handle_put(child); dev_err(pctl->dev, "No IRQ for bank %u\n", i); + ret = -EINVAL; goto err; } irq = ret; -- GitLab From f7ec3f62d7736ed14050716943a9562879155fcc Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 8 Sep 2022 12:43:34 +0200 Subject: [PATCH 0175/2223] media: remove reference to CONFIG_EMBEDDED in MEDIA_SUPPORT_FILTER The config EMBEDDED selects EXPERT, i.e., when EMBEDDED is enabled, EXPERT is usually also enabled. Hence, it is sufficient to have the option MEDIA_SUPPORT_FILTER set to y if !EXPERT. This way, MEDIA_SUPPORT_FILTER does not refer to CONFIG_EMBEDDED anymore and allows us to remove CONFIG_EMBEDDED in the near future. Remove the reference to CONFIG_EMBEDDED in MEDIA_SUPPORT_FILTER. 
Link: https://lore.kernel.org/linux-media/20220908104337.11940-4-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Reviewed-by: Mauro Carvalho Chehab Signed-off-by: Mauro Carvalho Chehab --- drivers/media/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/Kconfig b/drivers/media/Kconfig index ba6592b3dab20..283b78b5766ea 100644 --- a/drivers/media/Kconfig +++ b/drivers/media/Kconfig @@ -24,7 +24,7 @@ if MEDIA_SUPPORT config MEDIA_SUPPORT_FILTER bool "Filter media drivers" - default y if !EMBEDDED && !EXPERT + default y if !EXPERT help Configuring the media subsystem can be complex, as there are hundreds of drivers and other config options. -- GitLab From eab60bbc05a9375145e7b793ca37a1b6ec262887 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 2 Sep 2022 18:07:54 +0200 Subject: [PATCH 0176/2223] vfio/fsl-mc: Fix a typo in a message L and S are swapped in the message. s/VFIO_FLS_MC/VFIO_FSL_MC/ Also use 'ret' instead of 'WARN_ON(ret)' to avoid a duplicated message. 
Signed-off-by: Christophe JAILLET Reviewed-by: Diana Craciun Acked-by: Cornelia Huck Link: https://lore.kernel.org/r/a7c1394346725b7435792628c8d4c06a0a745e0b.1662134821.git.christophe.jaillet@wanadoo.fr Signed-off-by: Alex Williamson --- drivers/vfio/fsl-mc/vfio_fsl_mc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index 3feff729f3ce8..42b344bd7cd5b 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -108,9 +108,9 @@ static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev) /* reset the device before cleaning up the interrupts */ ret = vfio_fsl_mc_reset_device(vdev); - if (WARN_ON(ret)) + if (ret) dev_warn(&mc_cont->dev, - "VFIO_FLS_MC: reset device has failed (%d)\n", ret); + "VFIO_FSL_MC: reset device has failed (%d)\n", ret); vfio_fsl_mc_irqs_cleanup(vdev); -- GitLab From 42ee53f9bfd3e4cf58ae7656e0d11075f5fe8489 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Sep 2022 21:34:41 +0300 Subject: [PATCH 0177/2223] vfio: Introduce DMA logging uAPIs DMA logging allows a device to internally record what DMAs the device is initiating and report them back to userspace. It is part of the VFIO migration infrastructure that allows implementing dirty page tracking during the pre copy phase of live migration. Only DMA WRITEs are logged, and this API is not connected to VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE. This patch introduces the DMA logging involved uAPIs. It uses the FEATURE ioctl with its GET/SET/PROBE options as of below. It exposes a PROBE option to detect if the device supports DMA logging. It exposes a SET option to start device DMA logging in given IOVAs ranges. It exposes a SET option to stop device DMA logging that was previously started. It exposes a GET option to read back and clear the device DMA log. Extra details exist as part of vfio.h per a specific option. 
Signed-off-by: Yishai Hadas Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220908183448.195262-4-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 86 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 76a173f973de6..d7d8e0922376c 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1042,6 +1042,92 @@ struct vfio_device_low_power_entry_with_wakeup { */ #define VFIO_DEVICE_FEATURE_LOW_POWER_EXIT 5 +/* + * Upon VFIO_DEVICE_FEATURE_SET start/stop device DMA logging. + * VFIO_DEVICE_FEATURE_PROBE can be used to detect if the device supports + * DMA logging. + * + * DMA logging allows a device to internally record what DMAs the device is + * initiating and report them back to userspace. It is part of the VFIO + * migration infrastructure that allows implementing dirty page tracking + * during the pre copy phase of live migration. Only DMA WRITEs are logged, + * and this API is not connected to VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE. + * + * When DMA logging is started a range of IOVAs to monitor is provided and the + * device can optimize its logging to cover only the IOVA range given. Each + * DMA that the device initiates inside the range will be logged by the device + * for later retrieval. + * + * page_size is an input that hints what tracking granularity the device + * should try to achieve. If the device cannot do the hinted page size then + * it's the driver choice which page size to pick based on its support. + * On output the device will return the page size it selected. + * + * ranges is a pointer to an array of + * struct vfio_device_feature_dma_logging_range. + * + * The core kernel code guarantees to support by minimum num_ranges that fit + * into a single kernel page. User space can try higher values but should give + * up if the above can't be achieved as of some driver limitations. 
+ * + * A single call to start device DMA logging can be issued and a matching stop + * should follow at the end. Another start is not allowed in the meantime. + */ +struct vfio_device_feature_dma_logging_control { + __aligned_u64 page_size; + __u32 num_ranges; + __u32 __reserved; + __aligned_u64 ranges; +}; + +struct vfio_device_feature_dma_logging_range { + __aligned_u64 iova; + __aligned_u64 length; +}; + +#define VFIO_DEVICE_FEATURE_DMA_LOGGING_START 6 + +/* + * Upon VFIO_DEVICE_FEATURE_SET stop device DMA logging that was started + * by VFIO_DEVICE_FEATURE_DMA_LOGGING_START + */ +#define VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP 7 + +/* + * Upon VFIO_DEVICE_FEATURE_GET read back and clear the device DMA log + * + * Query the device's DMA log for written pages within the given IOVA range. + * During querying the log is cleared for the IOVA range. + * + * bitmap is a pointer to an array of u64s that will hold the output bitmap + * with 1 bit reporting a page_size unit of IOVA. The mapping of IOVA to bits + * is given by: + * bitmap[((addr - iova) / page_size) / 64] & (1ULL << (((addr - iova) / page_size) % 64)) + * + * The input page_size can be any power of two value and does not have to + * match the value given to VFIO_DEVICE_FEATURE_DMA_LOGGING_START. The driver + * will format its internal logging to match the reporting page size, possibly + * by replicating bits if the internal page size is lower than requested. + * + * The LOGGING_REPORT will only set bits in the bitmap and never clear or + * perform any initialization of the user provided bitmap. + * + * If any error is returned userspace should assume that the dirty log is + * corrupted. Error recovery is to consider all memory dirty and try to + * restart the dirty tracking, or to abort/restart the whole migration. + * + * If DMA logging is not enabled, an error will be returned.
+ * + */ +struct vfio_device_feature_dma_logging_report { + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 page_size; + __aligned_u64 bitmap; +}; + +#define VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT 8 + /* -------- API for Type1 VFIO IOMMU -------- */ /** -- GitLab From 58ccf0190d19d9a8a41f8a02b9e06742b58df4a1 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 8 Sep 2022 21:34:42 +0300 Subject: [PATCH 0178/2223] vfio: Add an IOVA bitmap support The new facility adds a bunch of wrappers that abstract how an IOVA range is represented in a bitmap that is granulated by a given page_size. So it translates all the lifting of dealing with user pointers into its corresponding kernel addresses backing said user memory into doing finally the (non-atomic) bitmap ops to change various bits. The formula for the bitmap is: data[(iova / page_size) / 64] & (1ULL << (iova % 64)) Where 64 is the number of bits in a unsigned long (depending on arch) It introduces an IOVA iterator that uses a windowing scheme to minimize the pinning overhead, as opposed to pinning it on demand 4K at a time. Assuming a 4K kernel page and 4K requested page size, we can use a single kernel page to hold 512 page pointers, mapping 2M of bitmap, representing 64G of IOVA space. An example usage of these helpers for a given @base_iova, @page_size, @length and __user @data: bitmap = iova_bitmap_alloc(base_iova, page_size, length, data); if (IS_ERR(bitmap)) return -ENOMEM; ret = iova_bitmap_for_each(bitmap, arg, dirty_reporter_fn); iova_bitmap_free(bitmap); Each iteration of the @dirty_reporter_fn is called with a unique @iova and @length argument, indicating the current range available through the iova_bitmap. 
The @dirty_reporter_fn uses iova_bitmap_set() to mark dirty areas (@iova_length) within that provided range, as following: iova_bitmap_set(bitmap, iova, iova_length); The facility is intended to be used for user bitmaps representing dirtied IOVAs by IOMMU (via IOMMUFD) and PCI Devices (via vfio-pci). Signed-off-by: Joao Martins Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-5-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/Makefile | 6 +- drivers/vfio/iova_bitmap.c | 422 ++++++++++++++++++++++++++++++++++++ include/linux/iova_bitmap.h | 26 +++ 3 files changed, 452 insertions(+), 2 deletions(-) create mode 100644 drivers/vfio/iova_bitmap.c create mode 100644 include/linux/iova_bitmap.h diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 1a32357592e3e..d67c604d0407e 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -1,9 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 vfio_virqfd-y := virqfd.o -vfio-y += vfio_main.o - obj-$(CONFIG_VFIO) += vfio.o + +vfio-y += vfio_main.o \ + iova_bitmap.o \ + obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c new file mode 100644 index 0000000000000..6631e8befe1b2 --- /dev/null +++ b/drivers/vfio/iova_bitmap.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022, Oracle and/or its affiliates. + * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ +#include +#include +#include + +#define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) + +/* + * struct iova_bitmap_map - A bitmap representing an IOVA range + * + * Main data structure for tracking mapped user pages of bitmap data. 
+ * + * For example, for something recording dirty IOVAs, it will be provided a + * struct iova_bitmap structure, as a general structure for iterating the + * total IOVA range. The struct iova_bitmap_map, though, represents the + * subset of said IOVA space that is pinned by its parent structure (struct + * iova_bitmap). + * + * The user does not need to know the exact location of the bits in the bitmap. + * From the user's perspective the only API available is iova_bitmap_set() which + * records the IOVA *range* in the bitmap by setting the corresponding + * bits. + * + * The bitmap is an array of u64 whereas each bit represents an IOVA of + * range of (1 << pgshift). Thus the formula for the bitmap data to be set is: + * + * data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64)) + */ +struct iova_bitmap_map { + /* base IOVA representing bit 0 of the first page */ + unsigned long iova; + + /* page size order that each bit granules to */ + unsigned long pgshift; + + /* page offset of the first user page pinned */ + unsigned long pgoff; + + /* number of pages pinned */ + unsigned long npages; + + /* pinned pages representing the bitmap data */ + struct page **pages; +}; + +/* + * struct iova_bitmap - The IOVA bitmap object + * + * Main data structure for iterating over the bitmap data. + * + * Abstracts the pinning work and iterates in IOVA ranges. + * It uses a windowing scheme and pins the bitmap in relatively + * big ranges e.g. + * + * The bitmap object uses one base page to store all the pinned pages + * pointers related to the bitmap. For sizeof(struct page*) == 8 it stores + * 512 struct page pointers which, if the base page size is 4K, it means + * 2M of bitmap data is pinned at a time. If the iova_bitmap page size is + * also 4K then the range window to iterate is 64G.
+ * + * For example iterating on a total IOVA range of 4G..128G, it will walk + * through this set of ranges: + * + * 4G - 68G-1 (64G) + * 68G - 128G-1 (64G) + * + * An example of the APIs on how to use/iterate over the IOVA bitmap: + * + * bitmap = iova_bitmap_alloc(iova, length, page_size, data); + * if (IS_ERR(bitmap)) + * return PTR_ERR(bitmap); + * + * ret = iova_bitmap_for_each(bitmap, arg, dirty_reporter_fn); + * + * iova_bitmap_free(bitmap); + * + * Each iteration of the @dirty_reporter_fn is called with a unique @iova + * and @length argument, indicating the current range available through the + * iova_bitmap. The @dirty_reporter_fn uses iova_bitmap_set() to mark dirty + * areas (@iova_length) within that provided range, as following: + * + * iova_bitmap_set(bitmap, iova, iova_length); + * + * The internals of the object uses an index @mapped_base_index that indexes + * which u64 word of the bitmap is mapped, up to @mapped_total_index. + * Those keep being incremented until @mapped_total_index is reached while + * mapping up to PAGE_SIZE / sizeof(struct page*) maximum of pages. + * + * The IOVA bitmap is usually located on what tracks DMA mapped ranges or + * some form of IOVA range tracking that co-relates to the user passed + * bitmap. + */ +struct iova_bitmap { + /* IOVA range representing the currently mapped bitmap data */ + struct iova_bitmap_map mapped; + + /* userspace address of the bitmap */ + u64 __user *bitmap; + + /* u64 index that @mapped points to */ + unsigned long mapped_base_index; + + /* how many u64 can we walk in total */ + unsigned long mapped_total_index; + + /* base IOVA of the whole bitmap */ + unsigned long iova; + + /* length of the IOVA range for the whole bitmap */ + size_t length; +}; + +/* + * Converts a relative IOVA to a bitmap index. + * This function provides the index into the u64 array (bitmap::bitmap) + * for a given IOVA offset. 
+ * Relative IOVA means relative to the bitmap::mapped base IOVA + * (stored in mapped::iova). All computations in this file are done using + * relative IOVAs and thus avoid an extra subtraction against mapped::iova. + * The user API iova_bitmap_set() always uses a regular absolute IOVAs. + */ +static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap, + unsigned long iova) +{ + unsigned long pgsize = 1 << bitmap->mapped.pgshift; + + return iova / (BITS_PER_TYPE(*bitmap->bitmap) * pgsize); +} + +/* + * Converts a bitmap index to a *relative* IOVA. + */ +static unsigned long iova_bitmap_index_to_offset(struct iova_bitmap *bitmap, + unsigned long index) +{ + unsigned long pgshift = bitmap->mapped.pgshift; + + return (index * BITS_PER_TYPE(*bitmap->bitmap)) << pgshift; +} + +/* + * Returns the base IOVA of the mapped range. + */ +static unsigned long iova_bitmap_mapped_iova(struct iova_bitmap *bitmap) +{ + unsigned long skip = bitmap->mapped_base_index; + + return bitmap->iova + iova_bitmap_index_to_offset(bitmap, skip); +} + +/* + * Pins the bitmap user pages for the current range window. + * This is internal to IOVA bitmap and called when advancing the + * index (@mapped_base_index) or allocating the bitmap. + */ +static int iova_bitmap_get(struct iova_bitmap *bitmap) +{ + struct iova_bitmap_map *mapped = &bitmap->mapped; + unsigned long npages; + u64 __user *addr; + long ret; + + /* + * @mapped_base_index is the index of the currently mapped u64 words + * that we have access. Anything before @mapped_base_index is not + * mapped. The range @mapped_base_index .. @mapped_total_index-1 is + * mapped but capped at a maximum number of pages. + */ + npages = DIV_ROUND_UP((bitmap->mapped_total_index - + bitmap->mapped_base_index) * + sizeof(*bitmap->bitmap), PAGE_SIZE); + + /* + * We always cap at max number of 'struct page' a base page can fit. + * This is, for example, on x86 means 2M of bitmap data max. 
+ */ + npages = min(npages, PAGE_SIZE / sizeof(struct page *)); + + /* + * Bitmap address to be pinned is calculated via pointer arithmetic + * with bitmap u64 word index. + */ + addr = bitmap->bitmap + bitmap->mapped_base_index; + + ret = pin_user_pages_fast((unsigned long)addr, npages, + FOLL_WRITE, mapped->pages); + if (ret <= 0) + return -EFAULT; + + mapped->npages = (unsigned long)ret; + /* Base IOVA where @pages point to i.e. bit 0 of the first page */ + mapped->iova = iova_bitmap_mapped_iova(bitmap); + + /* + * offset of the page where pinned pages bit 0 is located. + * This handles the case where the bitmap is not PAGE_SIZE + * aligned. + */ + mapped->pgoff = offset_in_page(addr); + return 0; +} + +/* + * Unpins the bitmap user pages and clears @npages + * (un)pinning is abstracted from API user and it's done when advancing + * the index or freeing the bitmap. + */ +static void iova_bitmap_put(struct iova_bitmap *bitmap) +{ + struct iova_bitmap_map *mapped = &bitmap->mapped; + + if (mapped->npages) { + unpin_user_pages(mapped->pages, mapped->npages); + mapped->npages = 0; + } +} + +/** + * iova_bitmap_alloc() - Allocates an IOVA bitmap object + * @iova: Start address of the IOVA range + * @length: Length of the IOVA range + * @page_size: Page size of the IOVA bitmap. It defines what each bit + * granularity represents + * @data: Userspace address of the bitmap + * + * Allocates an IOVA object and initializes all its fields including the + * first user pages of @data. + * + * Return: A pointer to a newly allocated struct iova_bitmap + * or ERR_PTR() on error. 
+ */ +struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length, + unsigned long page_size, u64 __user *data) +{ + struct iova_bitmap_map *mapped; + struct iova_bitmap *bitmap; + int rc; + + bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); + if (!bitmap) + return ERR_PTR(-ENOMEM); + + mapped = &bitmap->mapped; + mapped->pgshift = __ffs(page_size); + bitmap->bitmap = data; + bitmap->mapped_total_index = + iova_bitmap_offset_to_index(bitmap, length - 1) + 1; + bitmap->iova = iova; + bitmap->length = length; + mapped->iova = iova; + mapped->pages = (struct page **)__get_free_page(GFP_KERNEL); + if (!mapped->pages) { + rc = -ENOMEM; + goto err; + } + + rc = iova_bitmap_get(bitmap); + if (rc) + goto err; + return bitmap; + +err: + iova_bitmap_free(bitmap); + return ERR_PTR(rc); +} + +/** + * iova_bitmap_free() - Frees an IOVA bitmap object + * @bitmap: IOVA bitmap to free + * + * It unpins and releases pages array memory and clears any leftover + * state. + */ +void iova_bitmap_free(struct iova_bitmap *bitmap) +{ + struct iova_bitmap_map *mapped = &bitmap->mapped; + + iova_bitmap_put(bitmap); + + if (mapped->pages) { + free_page((unsigned long)mapped->pages); + mapped->pages = NULL; + } + + kfree(bitmap); +} + +/* + * Returns the remaining bitmap indexes from mapped_total_index to process for + * the currently pinned bitmap pages. + */ +static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap) +{ + unsigned long remaining; + + remaining = bitmap->mapped_total_index - bitmap->mapped_base_index; + remaining = min_t(unsigned long, remaining, + (bitmap->mapped.npages << PAGE_SHIFT) / sizeof(*bitmap->bitmap)); + + return remaining; +} + +/* + * Returns the length of the mapped IOVA range. 
+ */ +static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap) +{ + unsigned long max_iova = bitmap->iova + bitmap->length - 1; + unsigned long iova = iova_bitmap_mapped_iova(bitmap); + unsigned long remaining; + + /* + * iova_bitmap_mapped_remaining() returns a number of indexes which + * when converted to IOVA gives us a max length that the bitmap + * pinned data can cover. Afterwards, that is capped to + * only cover the IOVA range in @bitmap::iova .. @bitmap::length. + */ + remaining = iova_bitmap_index_to_offset(bitmap, + iova_bitmap_mapped_remaining(bitmap)); + + if (iova + remaining - 1 > max_iova) + remaining -= ((iova + remaining - 1) - max_iova); + + return remaining; +} + +/* + * Returns true if there's not more data to iterate. + */ +static bool iova_bitmap_done(struct iova_bitmap *bitmap) +{ + return bitmap->mapped_base_index >= bitmap->mapped_total_index; +} + +/* + * Advances to the next range, releases the current pinned + * pages and pins the next set of bitmap pages. + * Returns 0 on success or otherwise errno. + */ +static int iova_bitmap_advance(struct iova_bitmap *bitmap) +{ + unsigned long iova = iova_bitmap_mapped_length(bitmap) - 1; + unsigned long count = iova_bitmap_offset_to_index(bitmap, iova) + 1; + + bitmap->mapped_base_index += count; + + iova_bitmap_put(bitmap); + if (iova_bitmap_done(bitmap)) + return 0; + + /* When advancing the index we pin the next set of bitmap pages */ + return iova_bitmap_get(bitmap); +} + +/** + * iova_bitmap_for_each() - Iterates over the bitmap + * @bitmap: IOVA bitmap to iterate + * @opaque: Additional argument to pass to the callback + * @fn: Function that gets called for each IOVA range + * + * Helper function to iterate over bitmap data representing a portion of IOVA + * space. It hides the complexity of iterating bitmaps and translating the + * mapped bitmap user pages into IOVA ranges to process. 
+ * + * Return: 0 on success, and an error on failure either upon + * iteration or when the callback returns an error. + */ +int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, + iova_bitmap_fn_t fn) +{ + int ret = 0; + + for (; !iova_bitmap_done(bitmap) && !ret; + ret = iova_bitmap_advance(bitmap)) { + ret = fn(bitmap, iova_bitmap_mapped_iova(bitmap), + iova_bitmap_mapped_length(bitmap), opaque); + if (ret) + break; + } + + return ret; +} + +/** + * iova_bitmap_set() - Records an IOVA range in bitmap + * @bitmap: IOVA bitmap + * @iova: IOVA to start + * @length: IOVA range length + * + * Set the bits corresponding to the range [iova .. iova+length-1] in + * the user bitmap. + * + * Return: None; the function returns void. + */ +void iova_bitmap_set(struct iova_bitmap *bitmap, + unsigned long iova, size_t length) +{ + struct iova_bitmap_map *mapped = &bitmap->mapped; + unsigned long offset = (iova - mapped->iova) >> mapped->pgshift; + unsigned long nbits = max_t(unsigned long, 1, length >> mapped->pgshift); + unsigned long page_idx = offset / BITS_PER_PAGE; + unsigned long page_offset = mapped->pgoff; + void *kaddr; + + offset = offset % BITS_PER_PAGE; + + do { + unsigned long size = min(BITS_PER_PAGE - offset, nbits); + + kaddr = kmap_local_page(mapped->pages[page_idx]); + bitmap_set(kaddr + page_offset, offset, size); + kunmap_local(kaddr); + page_offset = offset = 0; + nbits -= size; + page_idx++; + } while (nbits > 0); +} +EXPORT_SYMBOL_GPL(iova_bitmap_set); diff --git a/include/linux/iova_bitmap.h b/include/linux/iova_bitmap.h new file mode 100644 index 0000000000000..c006cf0a25f3d --- /dev/null +++ b/include/linux/iova_bitmap.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022, Oracle and/or its affiliates. + * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES.
All rights reserved + */ +#ifndef _IOVA_BITMAP_H_ +#define _IOVA_BITMAP_H_ + +#include + +struct iova_bitmap; + +typedef int (*iova_bitmap_fn_t)(struct iova_bitmap *bitmap, + unsigned long iova, size_t length, + void *opaque); + +struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length, + unsigned long page_size, + u64 __user *data); +void iova_bitmap_free(struct iova_bitmap *bitmap); +int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, + iova_bitmap_fn_t fn); +void iova_bitmap_set(struct iova_bitmap *bitmap, + unsigned long iova, size_t length); + +#endif -- GitLab From 80c4b92a2dc48cce82a0348add48533db7e07314 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Sep 2022 21:34:43 +0300 Subject: [PATCH 0179/2223] vfio: Introduce the DMA logging feature support Introduce the DMA logging feature support in the vfio core layer. It includes the processing of the device start/stop/report DMA logging UAPIs and calling the relevant driver 'op' to do the work. Specifically, Upon start, the core translates the given input ranges into an interval tree, checks for unexpected overlapping, non aligned ranges and then pass the translated input to the driver for start tracking the given ranges. Upon report, the core translates the given input user space bitmap and page size into an IOVA kernel bitmap iterator. Then it iterates it and call the driver to set the corresponding bits for the dirtied pages in a specific IOVA range. Upon stop, the driver is called to stop the previous started tracking. The next patches from the series will introduce the mlx5 driver implementation for the logging ops. 
Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-6-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/Kconfig | 1 + drivers/vfio/pci/vfio_pci_core.c | 5 + drivers/vfio/vfio_main.c | 175 +++++++++++++++++++++++++++++++ include/linux/vfio.h | 28 ++++- 4 files changed, 207 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 6130d00252ed7..86c381ceb9a1e 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -3,6 +3,7 @@ menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" select IOMMU_API select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) + select INTERVAL_TREE help VFIO provides a framework for secure userspace device drivers. See Documentation/driver-api/vfio.rst for more details. diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 0d4b49f06b149..0a801aee2f2d1 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2128,6 +2128,11 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) return -EINVAL; } + if (vdev->vdev.log_ops && !(vdev->vdev.log_ops->log_start && + vdev->vdev.log_ops->log_stop && + vdev->vdev.log_ops->log_read_and_clear)) + return -EINVAL; + /* * Prevent binding to PFs with VFs enabled, the VFs might be in use * by the host or other users. 
We cannot capture the VFs if they diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 77264d836d520..27d9186f35d5c 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include "vfio.h" #define DRIVER_VERSION "0.3" @@ -1658,6 +1660,167 @@ static int vfio_ioctl_device_feature_migration(struct vfio_device *device, return 0; } +/* Ranges should fit into a single kernel page */ +#define LOG_MAX_RANGES \ + (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range)) + +static int +vfio_ioctl_device_feature_logging_start(struct vfio_device *device, + u32 flags, void __user *arg, + size_t argsz) +{ + size_t minsz = + offsetofend(struct vfio_device_feature_dma_logging_control, + ranges); + struct vfio_device_feature_dma_logging_range __user *ranges; + struct vfio_device_feature_dma_logging_control control; + struct vfio_device_feature_dma_logging_range range; + struct rb_root_cached root = RB_ROOT_CACHED; + struct interval_tree_node *nodes; + u64 iova_end; + u32 nnodes; + int i, ret; + + if (!device->log_ops) + return -ENOTTY; + + ret = vfio_check_feature(flags, argsz, + VFIO_DEVICE_FEATURE_SET, + sizeof(control)); + if (ret != 1) + return ret; + + if (copy_from_user(&control, arg, minsz)) + return -EFAULT; + + nnodes = control.num_ranges; + if (!nnodes) + return -EINVAL; + + if (nnodes > LOG_MAX_RANGES) + return -E2BIG; + + ranges = u64_to_user_ptr(control.ranges); + nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node), + GFP_KERNEL); + if (!nodes) + return -ENOMEM; + + for (i = 0; i < nnodes; i++) { + if (copy_from_user(&range, &ranges[i], sizeof(range))) { + ret = -EFAULT; + goto end; + } + if (!IS_ALIGNED(range.iova, control.page_size) || + !IS_ALIGNED(range.length, control.page_size)) { + ret = -EINVAL; + goto end; + } + + if (check_add_overflow(range.iova, range.length, &iova_end) || + iova_end > ULONG_MAX) { + ret = -EOVERFLOW; + goto end; + } + + 
nodes[i].start = range.iova; + nodes[i].last = range.iova + range.length - 1; + if (interval_tree_iter_first(&root, nodes[i].start, + nodes[i].last)) { + /* Range overlapping */ + ret = -EINVAL; + goto end; + } + interval_tree_insert(nodes + i, &root); + } + + ret = device->log_ops->log_start(device, &root, nnodes, + &control.page_size); + if (ret) + goto end; + + if (copy_to_user(arg, &control, sizeof(control))) { + ret = -EFAULT; + device->log_ops->log_stop(device); + } + +end: + kfree(nodes); + return ret; +} + +static int +vfio_ioctl_device_feature_logging_stop(struct vfio_device *device, + u32 flags, void __user *arg, + size_t argsz) +{ + int ret; + + if (!device->log_ops) + return -ENOTTY; + + ret = vfio_check_feature(flags, argsz, + VFIO_DEVICE_FEATURE_SET, 0); + if (ret != 1) + return ret; + + return device->log_ops->log_stop(device); +} + +static int vfio_device_log_read_and_clear(struct iova_bitmap *iter, + unsigned long iova, size_t length, + void *opaque) +{ + struct vfio_device *device = opaque; + + return device->log_ops->log_read_and_clear(device, iova, length, iter); +} + +static int +vfio_ioctl_device_feature_logging_report(struct vfio_device *device, + u32 flags, void __user *arg, + size_t argsz) +{ + size_t minsz = + offsetofend(struct vfio_device_feature_dma_logging_report, + bitmap); + struct vfio_device_feature_dma_logging_report report; + struct iova_bitmap *iter; + u64 iova_end; + int ret; + + if (!device->log_ops) + return -ENOTTY; + + ret = vfio_check_feature(flags, argsz, + VFIO_DEVICE_FEATURE_GET, + sizeof(report)); + if (ret != 1) + return ret; + + if (copy_from_user(&report, arg, minsz)) + return -EFAULT; + + if (report.page_size < SZ_4K || !is_power_of_2(report.page_size)) + return -EINVAL; + + if (check_add_overflow(report.iova, report.length, &iova_end) || + iova_end > ULONG_MAX) + return -EOVERFLOW; + + iter = iova_bitmap_alloc(report.iova, report.length, + report.page_size, + u64_to_user_ptr(report.bitmap)); + if (IS_ERR(iter)) + 
return PTR_ERR(iter); + + ret = iova_bitmap_for_each(iter, device, + vfio_device_log_read_and_clear); + + iova_bitmap_free(iter); + return ret; +} + static int vfio_ioctl_device_feature(struct vfio_device *device, struct vfio_device_feature __user *arg) { @@ -1691,6 +1854,18 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, return vfio_ioctl_device_feature_mig_device_state( device, feature.flags, arg->data, feature.argsz - minsz); + case VFIO_DEVICE_FEATURE_DMA_LOGGING_START: + return vfio_ioctl_device_feature_logging_start( + device, feature.flags, arg->data, + feature.argsz - minsz); + case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP: + return vfio_ioctl_device_feature_logging_stop( + device, feature.flags, arg->data, + feature.argsz - minsz); + case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT: + return vfio_ioctl_device_feature_logging_report( + device, feature.flags, arg->data, + feature.argsz - minsz); default: if (unlikely(!device->ops->device_feature)) return -EINVAL; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e05ddc6fe6a55..0e28265590916 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -14,6 +14,7 @@ #include #include #include +#include struct kvm; @@ -33,10 +34,11 @@ struct vfio_device { struct device *dev; const struct vfio_device_ops *ops; /* - * mig_ops is a static property of the vfio_device which must be set - * prior to registering the vfio_device. + * mig_ops/log_ops is a static property of the vfio_device which must + * be set prior to registering the vfio_device. */ const struct vfio_migration_ops *mig_ops; + const struct vfio_log_ops *log_ops; struct vfio_group *group; struct vfio_device_set *dev_set; struct list_head dev_set_list; @@ -108,6 +110,28 @@ struct vfio_migration_ops { enum vfio_device_mig_state *curr_state); }; +/** + * @log_start: Optional callback to ask the device start DMA logging. + * @log_stop: Optional callback to ask the device stop DMA logging. 
+ * @log_read_and_clear: Optional callback to ask the device read + * and clear the dirty DMAs in some given range. + * + * The vfio core implementation of the DEVICE_FEATURE_DMA_LOGGING_ set + * of features does not track logging state relative to the device, + * therefore the device implementation of vfio_log_ops must handle + * arbitrary user requests. This includes rejecting subsequent calls + * to log_start without an intervening log_stop, as well as graceful + * handling of log_stop and log_read_and_clear from invalid states. + */ +struct vfio_log_ops { + int (*log_start)(struct vfio_device *device, + struct rb_root_cached *ranges, u32 nnodes, u64 *page_size); + int (*log_stop)(struct vfio_device *device); + int (*log_read_and_clear)(struct vfio_device *device, + unsigned long iova, unsigned long length, + struct iova_bitmap *dirty); +}; + /** * vfio_check_feature - Validate user input for the VFIO_DEVICE_FEATURE ioctl * @flags: Arg from the device_feature op -- GitLab From 79c3cf279926f8db0a606a479944a131e27a39ea Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Sep 2022 21:34:44 +0300 Subject: [PATCH 0180/2223] vfio/mlx5: Init QP based resources for dirty tracking Init QP based resources for dirty tracking to be used upon start logging. It includes: Creating the host and firmware RC QPs, move each of them to its expected state based on the device specification, etc. Creating the relevant resources which are needed by both QPs as of UAR, PD, etc. Creating the host receive side resources as of MKEY, CQ, receive WQEs, etc. The above resources are cleaned-up upon stop logging. The tracker object that will be introduced by next patches will use those resources. 
Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-7-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 595 +++++++++++++++++++++++++++++++++++- drivers/vfio/pci/mlx5/cmd.h | 53 ++++ 2 files changed, 636 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index dd5d7bfe0a498..0a362796d5671 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -7,6 +7,8 @@ static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, u16 *vhca_id); +static void +_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) { @@ -72,19 +74,22 @@ static int mlx5fv_vf_event(struct notifier_block *nb, struct mlx5vf_pci_core_device *mvdev = container_of(nb, struct mlx5vf_pci_core_device, nb); - mutex_lock(&mvdev->state_mutex); switch (event) { case MLX5_PF_NOTIFY_ENABLE_VF: + mutex_lock(&mvdev->state_mutex); mvdev->mdev_detach = false; + mlx5vf_state_mutex_unlock(mvdev); break; case MLX5_PF_NOTIFY_DISABLE_VF: - mlx5vf_disable_fds(mvdev); + mlx5vf_cmd_close_migratable(mvdev); + mutex_lock(&mvdev->state_mutex); mvdev->mdev_detach = true; + mlx5vf_state_mutex_unlock(mvdev); break; default: break; } - mlx5vf_state_mutex_unlock(mvdev); + return 0; } @@ -95,6 +100,7 @@ void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev) mutex_lock(&mvdev->state_mutex); mlx5vf_disable_fds(mvdev); + _mlx5vf_free_page_tracker_resources(mvdev); mlx5vf_state_mutex_unlock(mvdev); } @@ -188,11 +194,13 @@ err_exec: return ret; } -static int _create_state_mkey(struct mlx5_core_dev *mdev, u32 pdn, - struct mlx5_vf_migration_file *migf, u32 *mkey) +static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_recv_buf *recv_buf, + u32 *mkey) { - size_t npages = DIV_ROUND_UP(migf->total_length, PAGE_SIZE); 
- struct sg_dma_page_iter dma_iter; + size_t npages = migf ? DIV_ROUND_UP(migf->total_length, PAGE_SIZE) : + recv_buf->npages; int err = 0, inlen; __be64 *mtt; void *mkc; @@ -209,8 +217,17 @@ static int _create_state_mkey(struct mlx5_core_dev *mdev, u32 pdn, DIV_ROUND_UP(npages, 2)); mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0) - *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); + if (migf) { + struct sg_dma_page_iter dma_iter; + + for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0) + *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); + } else { + int i; + + for (i = 0; i < npages; i++) + *mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]); + } mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); @@ -223,7 +240,8 @@ static int _create_state_mkey(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); - MLX5_SET64(mkc, mkc, len, migf->total_length); + MLX5_SET64(mkc, mkc, len, + migf ? 
migf->total_length : (npages * PAGE_SIZE)); err = mlx5_core_create_mkey(mdev, mkey, in, inlen); kvfree(in); return err; @@ -297,7 +315,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (err) goto err_dma_map; - err = _create_state_mkey(mdev, pdn, migf, &mkey); + err = _create_mkey(mdev, pdn, migf, NULL, &mkey); if (err) goto err_create_mkey; @@ -369,7 +387,7 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (err) goto err_reg; - err = _create_state_mkey(mdev, pdn, migf, &mkey); + err = _create_mkey(mdev, pdn, migf, NULL, &mkey); if (err) goto err_mkey; @@ -391,3 +409,556 @@ end: mutex_unlock(&migf->lock); return err; } + +static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev, + struct mlx5_vhca_cq_buf *buf, int nent, + int cqe_size) +{ + struct mlx5_frag_buf *frag_buf = &buf->frag_buf; + u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0); + u8 log_wq_sz = ilog2(cqe_size); + int err; + + err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf, + mdev->priv.numa_node); + if (err) + return err; + + mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc); + buf->cqe_size = cqe_size; + buf->nent = nent; + return 0; +} + +static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf) +{ + struct mlx5_cqe64 *cqe64; + void *cqe; + int i; + + for (i = 0; i < buf->nent; i++) { + cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i); + cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64; + cqe64->op_own = MLX5_CQE_INVALID << 4; + } +} + +static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev, + struct mlx5_vhca_cq *cq) +{ + mlx5_core_destroy_cq(mdev, &cq->mcq); + mlx5_frag_buf_free(mdev, &cq->buf.frag_buf); + mlx5_db_free(mdev, &cq->db); +} + +static int mlx5vf_create_cq(struct mlx5_core_dev *mdev, + struct mlx5_vhca_page_tracker *tracker, + size_t ncqe) +{ + int cqe_size = cache_line_size() == 128 ? 
128 : 64; + u32 out[MLX5_ST_SZ_DW(create_cq_out)]; + struct mlx5_vhca_cq *cq; + int inlen, err, eqn; + void *cqc, *in; + __be64 *pas; + int vector; + + cq = &tracker->cq; + ncqe = roundup_pow_of_two(ncqe); + err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node); + if (err) + return err; + + cq->ncqe = ncqe; + cq->mcq.set_ci_db = cq->db.db; + cq->mcq.arm_db = cq->db.db + 1; + cq->mcq.cqe_sz = cqe_size; + err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size); + if (err) + goto err_db_free; + + init_cq_frag_buf(&cq->buf); + inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * + cq->buf.frag_buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_buff; + } + + vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev); + err = mlx5_vector2eqn(mdev, vector, &eqn); + if (err) + goto err_vec; + + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe)); + MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); + MLX5_SET(cqc, cqc, uar_page, tracker->uar->index); + MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma); + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); + mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas); + err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out)); + if (err) + goto err_vec; + + kvfree(in); + return 0; + +err_vec: + kvfree(in); +err_buff: + mlx5_frag_buf_free(mdev, &cq->buf.frag_buf); +err_db_free: + mlx5_db_free(mdev, &cq->db); + return err; +} + +static struct mlx5_vhca_qp * +mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev, + struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr) +{ + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; + struct mlx5_vhca_qp *qp; + u8 log_rq_stride; + u8 log_rq_sz; + void *qpc; + int inlen; + void *in; + int err; + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + 
qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr); + log_rq_stride = ilog2(MLX5_SEND_WQE_DS); + log_rq_sz = ilog2(qp->rq.wqe_cnt); + err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node); + if (err) + goto err_free; + + if (max_recv_wr) { + err = mlx5_frag_buf_alloc_node(mdev, + wq_get_byte_sz(log_rq_sz, log_rq_stride), + &qp->buf, mdev->priv.numa_node); + if (err) + goto err_db_free; + mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc); + } + + qp->rq.db = &qp->db.db[MLX5_RCV_DBR]; + inlen = MLX5_ST_SZ_BYTES(create_qp_in) + + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * + qp->buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_in; + } + + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, pd, tracker->pdn); + MLX5_SET(qpc, qpc, uar_page, tracker->uar->index); + MLX5_SET(qpc, qpc, log_page_size, + qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev)); + if (MLX5_CAP_GEN(mdev, cqe_version) == 1) + MLX5_SET(qpc, qpc, user_index, 0xFFFFFF); + MLX5_SET(qpc, qpc, no_sq, 1); + if (max_recv_wr) { + MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn); + MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4); + MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz); + MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ); + MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); + mlx5_fill_page_frag_array(&qp->buf, + (__be64 *)MLX5_ADDR_OF(create_qp_in, + in, pas)); + } else { + MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ); + } + + MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); + err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); + kvfree(in); + if (err) + goto err_in; + + qp->qpn = MLX5_GET(create_qp_out, out, qpn); + return qp; + +err_in: + if (max_recv_wr) + mlx5_frag_buf_free(mdev, &qp->buf); +err_db_free: + mlx5_db_free(mdev, &qp->db); +err_free: + kfree(qp); + return 
ERR_PTR(err); +} + +static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp) +{ + struct mlx5_wqe_data_seg *data; + unsigned int ix; + + WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt); + ix = qp->rq.pc & (qp->rq.wqe_cnt - 1); + data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix); + data->byte_count = cpu_to_be32(qp->max_msg_size); + data->lkey = cpu_to_be32(qp->recv_buf.mkey); + data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset); + qp->rq.pc++; + /* Make sure that descriptors are written before doorbell record. */ + dma_wmb(); + *qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff); +} + +static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev, + struct mlx5_vhca_qp *qp, u32 remote_qpn, + bool host_qp) +{ + u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {}; + u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {}; + u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {}; + void *qpc; + int ret; + + /* Init */ + qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc); + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1); + MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED); + MLX5_SET(qpc, qpc, rre, 1); + MLX5_SET(qpc, qpc, rwe, 1); + MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP); + MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn); + ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in); + if (ret) + return ret; + + if (host_qp) { + struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; + int i; + + for (i = 0; i < qp->rq.wqe_cnt; i++) { + mlx5vf_post_recv(qp); + recv_buf->next_rq_offset += qp->max_msg_size; + } + } + + /* RTR */ + qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc); + MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn); + MLX5_SET(qpc, qpc, mtu, IB_MTU_4096); + MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg)); + MLX5_SET(qpc, qpc, remote_qpn, remote_qpn); + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1); + MLX5_SET(qpc, qpc, primary_address_path.fl, 1); + MLX5_SET(qpc, qpc, min_rnr_nak, 1); + MLX5_SET(init2rtr_qp_in, rtr_in, opcode, 
MLX5_CMD_OP_INIT2RTR_QP); + MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn); + ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in); + if (ret || host_qp) + return ret; + + /* RTS */ + qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc); + MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn); + MLX5_SET(qpc, qpc, retry_count, 7); + MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */ + MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */ + MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP); + MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn); + + return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in); +} + +static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev, + struct mlx5_vhca_qp *qp) +{ + u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; + + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); + mlx5_cmd_exec_in(mdev, destroy_qp, in); + + mlx5_frag_buf_free(mdev, &qp->buf); + mlx5_db_free(mdev, &qp->db); + kfree(qp); +} + +static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf) +{ + int i; + + /* Undo alloc_pages_bulk_array() */ + for (i = 0; i < recv_buf->npages; i++) + __free_page(recv_buf->page_list[i]); + + kvfree(recv_buf->page_list); +} + +static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, + unsigned int npages) +{ + unsigned int filled = 0, done = 0; + int i; + + recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list), + GFP_KERNEL); + if (!recv_buf->page_list) + return -ENOMEM; + + for (;;) { + filled = alloc_pages_bulk_array(GFP_KERNEL, npages - done, + recv_buf->page_list + done); + if (!filled) + goto err; + + done += filled; + if (done == npages) + break; + } + + recv_buf->npages = npages; + return 0; + +err: + for (i = 0; i < npages; i++) { + if (recv_buf->page_list[i]) + __free_page(recv_buf->page_list[i]); + } + + kvfree(recv_buf->page_list); + return -ENOMEM; +} + +static int register_dma_recv_pages(struct mlx5_core_dev *mdev, + struct 
mlx5_vhca_recv_buf *recv_buf) +{ + int i, j; + + recv_buf->dma_addrs = kvcalloc(recv_buf->npages, + sizeof(*recv_buf->dma_addrs), + GFP_KERNEL); + if (!recv_buf->dma_addrs) + return -ENOMEM; + + for (i = 0; i < recv_buf->npages; i++) { + recv_buf->dma_addrs[i] = dma_map_page(mdev->device, + recv_buf->page_list[i], + 0, PAGE_SIZE, + DMA_FROM_DEVICE); + if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i])) + goto error; + } + return 0; + +error: + for (j = 0; j < i; j++) + dma_unmap_single(mdev->device, recv_buf->dma_addrs[j], + PAGE_SIZE, DMA_FROM_DEVICE); + + kvfree(recv_buf->dma_addrs); + return -ENOMEM; +} + +static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev, + struct mlx5_vhca_recv_buf *recv_buf) +{ + int i; + + for (i = 0; i < recv_buf->npages; i++) + dma_unmap_single(mdev->device, recv_buf->dma_addrs[i], + PAGE_SIZE, DMA_FROM_DEVICE); + + kvfree(recv_buf->dma_addrs); +} + +static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev, + struct mlx5_vhca_qp *qp) +{ + struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; + + mlx5_core_destroy_mkey(mdev, recv_buf->mkey); + unregister_dma_recv_pages(mdev, recv_buf); + free_recv_pages(&qp->recv_buf); +} + +static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev, + struct mlx5_vhca_qp *qp, u32 pdn, + u64 rq_size) +{ + unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE); + struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; + int err; + + err = alloc_recv_pages(recv_buf, npages); + if (err < 0) + return err; + + err = register_dma_recv_pages(mdev, recv_buf); + if (err) + goto end; + + err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey); + if (err) + goto err_create_mkey; + + return 0; + +err_create_mkey: + unregister_dma_recv_pages(mdev, recv_buf); +end: + free_recv_pages(recv_buf); + return err; +} + +static void +_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev) +{ + struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; + struct 
mlx5_core_dev *mdev = mvdev->mdev; + + lockdep_assert_held(&mvdev->state_mutex); + + if (!mvdev->log_active) + return; + + WARN_ON(mvdev->mdev_detach); + + mlx5vf_destroy_qp(mdev, tracker->fw_qp); + mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp); + mlx5vf_destroy_qp(mdev, tracker->host_qp); + mlx5vf_destroy_cq(mdev, &tracker->cq); + mlx5_core_dealloc_pd(mdev, tracker->pdn); + mlx5_put_uars_page(mdev, tracker->uar); + mvdev->log_active = false; +} + +int mlx5vf_stop_page_tracker(struct vfio_device *vdev) +{ + struct mlx5vf_pci_core_device *mvdev = container_of( + vdev, struct mlx5vf_pci_core_device, core_device.vdev); + + mutex_lock(&mvdev->state_mutex); + if (!mvdev->log_active) + goto end; + + _mlx5vf_free_page_tracker_resources(mvdev); + mvdev->log_active = false; +end: + mlx5vf_state_mutex_unlock(mvdev); + return 0; +} + +int mlx5vf_start_page_tracker(struct vfio_device *vdev, + struct rb_root_cached *ranges, u32 nnodes, + u64 *page_size) +{ + struct mlx5vf_pci_core_device *mvdev = container_of( + vdev, struct mlx5vf_pci_core_device, core_device.vdev); + struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; + u8 log_tracked_page = ilog2(*page_size); + struct mlx5_vhca_qp *host_qp; + struct mlx5_vhca_qp *fw_qp; + struct mlx5_core_dev *mdev; + u32 max_msg_size = PAGE_SIZE; + u64 rq_size = SZ_2M; + u32 max_recv_wr; + int err; + + mutex_lock(&mvdev->state_mutex); + if (mvdev->mdev_detach) { + err = -ENOTCONN; + goto end; + } + + if (mvdev->log_active) { + err = -EINVAL; + goto end; + } + + mdev = mvdev->mdev; + memset(tracker, 0, sizeof(*tracker)); + tracker->uar = mlx5_get_uars_page(mdev); + if (IS_ERR(tracker->uar)) { + err = PTR_ERR(tracker->uar); + goto end; + } + + err = mlx5_core_alloc_pd(mdev, &tracker->pdn); + if (err) + goto err_uar; + + max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size); + err = mlx5vf_create_cq(mdev, tracker, max_recv_wr); + if (err) + goto err_dealloc_pd; + + host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr); + if 
(IS_ERR(host_qp)) { + err = PTR_ERR(host_qp); + goto err_cq; + } + + host_qp->max_msg_size = max_msg_size; + if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev, + pg_track_log_min_page_size)) { + log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev, + pg_track_log_min_page_size); + } else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev, + pg_track_log_max_page_size)) { + log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev, + pg_track_log_max_page_size); + } + + host_qp->tracked_page_size = (1ULL << log_tracked_page); + err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn, + rq_size); + if (err) + goto err_host_qp; + + fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0); + if (IS_ERR(fw_qp)) { + err = PTR_ERR(fw_qp); + goto err_recv_resources; + } + + err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true); + if (err) + goto err_activate; + + err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false); + if (err) + goto err_activate; + + tracker->host_qp = host_qp; + tracker->fw_qp = fw_qp; + *page_size = host_qp->tracked_page_size; + mvdev->log_active = true; + mlx5vf_state_mutex_unlock(mvdev); + return 0; + +err_activate: + mlx5vf_destroy_qp(mdev, fw_qp); +err_recv_resources: + mlx5vf_free_qp_recv_resources(mdev, host_qp); +err_host_qp: + mlx5vf_destroy_qp(mdev, host_qp); +err_cq: + mlx5vf_destroy_cq(mdev, &tracker->cq); +err_dealloc_pd: + mlx5_core_dealloc_pd(mdev, tracker->pdn); +err_uar: + mlx5_put_uars_page(mdev, tracker->uar); +end: + mlx5vf_state_mutex_unlock(mvdev); + return err; +} diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 8208f4701a908..e71ec017bf04e 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -9,6 +9,8 @@ #include #include #include +#include +#include struct mlx5vf_async_data { struct mlx5_async_work cb_work; @@ -39,6 +41,52 @@ struct mlx5_vf_migration_file { struct mlx5vf_async_data async_data; }; +struct mlx5_vhca_cq_buf { + struct mlx5_frag_buf_ctrl fbc; + struct 
mlx5_frag_buf frag_buf; + int cqe_size; + int nent; +}; + +struct mlx5_vhca_cq { + struct mlx5_vhca_cq_buf buf; + struct mlx5_db db; + struct mlx5_core_cq mcq; + size_t ncqe; +}; + +struct mlx5_vhca_recv_buf { + u32 npages; + struct page **page_list; + dma_addr_t *dma_addrs; + u32 next_rq_offset; + u32 mkey; +}; + +struct mlx5_vhca_qp { + struct mlx5_frag_buf buf; + struct mlx5_db db; + struct mlx5_vhca_recv_buf recv_buf; + u32 tracked_page_size; + u32 max_msg_size; + u32 qpn; + struct { + unsigned int pc; + unsigned int cc; + unsigned int wqe_cnt; + __be32 *db; + struct mlx5_frag_buf_ctrl fbc; + } rq; +}; + +struct mlx5_vhca_page_tracker { + u32 pdn; + struct mlx5_uars_page *uar; + struct mlx5_vhca_cq cq; + struct mlx5_vhca_qp *host_qp; + struct mlx5_vhca_qp *fw_qp; +}; + struct mlx5vf_pci_core_device { struct vfio_pci_core_device core_device; int vf_id; @@ -46,6 +94,7 @@ struct mlx5vf_pci_core_device { u8 migrate_cap:1; u8 deferred_reset:1; u8 mdev_detach:1; + u8 log_active:1; /* protect migration state */ struct mutex state_mutex; enum vfio_device_mig_state mig_state; @@ -53,6 +102,7 @@ struct mlx5vf_pci_core_device { spinlock_t reset_lock; struct mlx5_vf_migration_file *resuming_migf; struct mlx5_vf_migration_file *saving_migf; + struct mlx5_vhca_page_tracker tracker; struct workqueue_struct *cb_wq; struct notifier_block nb; struct mlx5_core_dev *mdev; @@ -73,4 +123,7 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); +int mlx5vf_start_page_tracker(struct vfio_device *vdev, + struct rb_root_cached *ranges, u32 nnodes, u64 *page_size); +int mlx5vf_stop_page_tracker(struct vfio_device *vdev); #endif /* MLX5_VFIO_CMD_H */ -- GitLab From c1d050b0d169fd60c8acef157db53bd4e3141799 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Sep 2022 21:34:45 
+0300 Subject: [PATCH 0181/2223] vfio/mlx5: Create and destroy page tracker object Add support for creating and destroying page tracker object. This object is used to control/report the device dirty pages. As part of creating the tracker need to consider the device capabilities for max ranges and adapt/combine ranges accordingly. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-8-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 147 ++++++++++++++++++++++++++++++++++++ drivers/vfio/pci/mlx5/cmd.h | 1 + 2 files changed, 148 insertions(+) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 0a362796d5671..f1cad96af6abd 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -410,6 +410,148 @@ end: return err; } +static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes, + u32 req_nodes) +{ + struct interval_tree_node *prev, *curr, *comb_start, *comb_end; + unsigned long min_gap; + unsigned long curr_gap; + + /* Special shortcut when a single range is required */ + if (req_nodes == 1) { + unsigned long last; + + curr = comb_start = interval_tree_iter_first(root, 0, ULONG_MAX); + while (curr) { + last = curr->last; + prev = curr; + curr = interval_tree_iter_next(curr, 0, ULONG_MAX); + if (prev != comb_start) + interval_tree_remove(prev, root); + } + comb_start->last = last; + return; + } + + /* Combine ranges which have the smallest gap */ + while (cur_nodes > req_nodes) { + prev = NULL; + min_gap = ULONG_MAX; + curr = interval_tree_iter_first(root, 0, ULONG_MAX); + while (curr) { + if (prev) { + curr_gap = curr->start - prev->last; + if (curr_gap < min_gap) { + min_gap = curr_gap; + comb_start = prev; + comb_end = curr; + } + } + prev = curr; + curr = interval_tree_iter_next(curr, 0, ULONG_MAX); + } + comb_start->last = comb_end->last; + interval_tree_remove(comb_end, root); + cur_nodes--; + } +} + +static int mlx5vf_create_tracker(struct 
mlx5_core_dev *mdev, + struct mlx5vf_pci_core_device *mvdev, + struct rb_root_cached *ranges, u32 nnodes) +{ + int max_num_range = + MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range); + struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; + int record_size = MLX5_ST_SZ_BYTES(page_track_range); + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; + struct interval_tree_node *node = NULL; + u64 total_ranges_len = 0; + u32 num_ranges = nnodes; + u8 log_addr_space_size; + void *range_list_ptr; + void *obj_context; + void *cmd_hdr; + int inlen; + void *in; + int err; + int i; + + if (num_ranges > max_num_range) { + combine_ranges(ranges, nnodes, max_num_range); + num_ranges = max_num_range; + } + + inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) + + record_size * num_ranges; + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in, + general_obj_in_cmd_hdr); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, + MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, + MLX5_OBJ_TYPE_PAGE_TRACK); + obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context); + MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id); + MLX5_SET(page_track, obj_context, track_type, 1); + MLX5_SET(page_track, obj_context, log_page_size, + ilog2(tracker->host_qp->tracked_page_size)); + MLX5_SET(page_track, obj_context, log_msg_size, + ilog2(tracker->host_qp->max_msg_size)); + MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn); + MLX5_SET(page_track, obj_context, num_ranges, num_ranges); + + range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range); + node = interval_tree_iter_first(ranges, 0, ULONG_MAX); + for (i = 0; i < num_ranges; i++) { + void *addr_range_i_base = range_list_ptr + record_size * i; + unsigned long length = node->last - node->start; + + MLX5_SET64(page_track_range, addr_range_i_base, start_address, + node->start); + 
MLX5_SET64(page_track_range, addr_range_i_base, length, length); + total_ranges_len += length; + node = interval_tree_iter_next(node, 0, ULONG_MAX); + } + + WARN_ON(node); + log_addr_space_size = ilog2(total_ranges_len); + if (log_addr_space_size < + (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) || + log_addr_space_size > + (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) { + err = -EOPNOTSUPP; + goto out; + } + + MLX5_SET(page_track, obj_context, log_addr_space_size, + log_addr_space_size); + err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); + if (err) + goto out; + + tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); +out: + kfree(in); + return err; +} + +static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev, + u32 tracker_id) +{ + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id); + + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} + static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev, struct mlx5_vhca_cq_buf *buf, int nent, int cqe_size) @@ -833,6 +975,7 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev) WARN_ON(mvdev->mdev_detach); + mlx5vf_cmd_destroy_tracker(mdev, tracker->id); mlx5vf_destroy_qp(mdev, tracker->fw_qp); mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp); mlx5vf_destroy_qp(mdev, tracker->host_qp); @@ -941,6 +1084,10 @@ int mlx5vf_start_page_tracker(struct vfio_device *vdev, tracker->host_qp = host_qp; tracker->fw_qp = fw_qp; + err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes); + if (err) + goto err_activate; + *page_size = host_qp->tracked_page_size; mvdev->log_active = true; mlx5vf_state_mutex_unlock(mvdev); diff --git a/drivers/vfio/pci/mlx5/cmd.h 
b/drivers/vfio/pci/mlx5/cmd.h index e71ec017bf04e..658925ba5459b 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -80,6 +80,7 @@ struct mlx5_vhca_qp { }; struct mlx5_vhca_page_tracker { + u32 id; u32 pdn; struct mlx5_uars_page *uar; struct mlx5_vhca_cq cq; -- GitLab From 1047797e8ed4bb8c20d1b5b843cf870d00bb0ff7 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Sep 2022 21:34:46 +0300 Subject: [PATCH 0182/2223] vfio/mlx5: Report dirty pages from tracker Report dirty pages from tracker. It includes: Querying for dirty pages in a given IOVA range, this is done by modifying the tracker into the reporting state and supplying the required range. Using the CQ event completion mechanism to be notified once data is ready on the CQ/QP to be processed. Once data is available turn on the corresponding bits in the bit map. This functionality will be used as part of the 'log_read_and_clear' driver callback in the next patches. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-9-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 191 ++++++++++++++++++++++++++++++++++++ drivers/vfio/pci/mlx5/cmd.h | 4 + 2 files changed, 195 insertions(+) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index f1cad96af6abd..fa9ddd9265002 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -5,6 +5,8 @@ #include "cmd.h" +enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 }; + static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, u16 *vhca_id); static void @@ -157,6 +159,7 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; mvdev->core_device.vdev.mig_ops = mig_ops; + init_completion(&mvdev->tracker_comp); end: mlx5_vf_put_core_dev(mvdev->mdev); @@ -552,6 +555,29 @@ static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev, return mlx5_cmd_exec(mdev, in, 
sizeof(in), out, sizeof(out)); } +static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev, + u32 tracker_id, unsigned long iova, + unsigned long length, u32 tracker_state) +{ + u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; + void *obj_context; + void *cmd_hdr; + + cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id); + + obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context); + MLX5_SET64(page_track, obj_context, modify_field_select, 0x3); + MLX5_SET64(page_track, obj_context, range_start_address, iova); + MLX5_SET64(page_track, obj_context, length, length); + MLX5_SET(page_track, obj_context, state, tracker_state); + + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} + static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev, struct mlx5_vhca_cq_buf *buf, int nent, int cqe_size) @@ -593,6 +619,16 @@ static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev, mlx5_db_free(mdev, &cq->db); } +static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq, + struct mlx5_eqe *eqe) +{ + struct mlx5vf_pci_core_device *mvdev = + container_of(mcq, struct mlx5vf_pci_core_device, + tracker.cq.mcq); + + complete(&mvdev->tracker_comp); +} + static int mlx5vf_create_cq(struct mlx5_core_dev *mdev, struct mlx5_vhca_page_tracker *tracker, size_t ncqe) @@ -643,10 +679,13 @@ static int mlx5vf_create_cq(struct mlx5_core_dev *mdev, MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma); pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas); + cq->mcq.comp = mlx5vf_cq_complete; err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out)); if (err) goto err_vec; + mlx5_cq_arm(&cq->mcq, 
MLX5_CQ_DB_REQ_NOT, tracker->uar->map, + cq->mcq.cons_index); kvfree(in); return 0; @@ -1109,3 +1148,155 @@ end: mlx5vf_state_mutex_unlock(mvdev); return err; } + +static void +set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp, + struct iova_bitmap *dirty) +{ + u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry); + u32 nent = size / entry_size; + struct page *page; + u64 addr; + u64 *buf; + int i; + + if (WARN_ON(index >= qp->recv_buf.npages || + (nent > qp->max_msg_size / entry_size))) + return; + + page = qp->recv_buf.page_list[index]; + buf = kmap_local_page(page); + for (i = 0; i < nent; i++) { + addr = MLX5_GET(page_track_report_entry, buf + i, + dirty_address_low); + addr |= (u64)MLX5_GET(page_track_report_entry, buf + i, + dirty_address_high) << 32; + iova_bitmap_set(dirty, addr, qp->tracked_page_size); + } + kunmap_local(buf); +} + +static void +mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe, + struct iova_bitmap *dirty, int *tracker_status) +{ + u32 size; + int ix; + + qp->rq.cc++; + *tracker_status = be32_to_cpu(cqe->immediate) >> 28; + size = be32_to_cpu(cqe->byte_cnt); + ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1); + + /* zero length CQE, no data */ + WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING); + if (size) + set_report_output(size, ix, qp, dirty); + + qp->recv_buf.next_rq_offset = ix * qp->max_msg_size; + mlx5vf_post_recv(qp); +} + +static void *get_cqe(struct mlx5_vhca_cq *cq, int n) +{ + return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n); +} + +static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n) +{ + void *cqe = get_cqe(cq, n & (cq->ncqe - 1)); + struct mlx5_cqe64 *cqe64; + + cqe64 = (cq->mcq.cqe_sz == 64) ? 
cqe : cqe + 64; + + if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) && + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) { + return cqe64; + } else { + return NULL; + } +} + +static int +mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp, + struct iova_bitmap *dirty, int *tracker_status) +{ + struct mlx5_cqe64 *cqe; + u8 opcode; + + cqe = get_sw_cqe(cq, cq->mcq.cons_index); + if (!cqe) + return CQ_EMPTY; + + ++cq->mcq.cons_index; + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + opcode = get_cqe_opcode(cqe); + switch (opcode) { + case MLX5_CQE_RESP_SEND_IMM: + mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status); + return CQ_OK; + default: + return CQ_POLL_ERR; + } +} + +int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova, + unsigned long length, + struct iova_bitmap *dirty) +{ + struct mlx5vf_pci_core_device *mvdev = container_of( + vdev, struct mlx5vf_pci_core_device, core_device.vdev); + struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; + struct mlx5_vhca_cq *cq = &tracker->cq; + struct mlx5_core_dev *mdev; + int poll_err, err; + + mutex_lock(&mvdev->state_mutex); + if (!mvdev->log_active) { + err = -EINVAL; + goto end; + } + + if (mvdev->mdev_detach) { + err = -ENOTCONN; + goto end; + } + + mdev = mvdev->mdev; + err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length, + MLX5_PAGE_TRACK_STATE_REPORTING); + if (err) + goto end; + + tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING; + while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING) { + poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty, + &tracker->status); + if (poll_err == CQ_EMPTY) { + mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map, + cq->mcq.cons_index); + poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, + dirty, &tracker->status); + if (poll_err == CQ_EMPTY) { + wait_for_completion(&mvdev->tracker_comp); + continue; + } + } + if (poll_err == 
CQ_POLL_ERR) { + err = -EIO; + goto end; + } + mlx5_cq_set_ci(&cq->mcq); + } + + if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR) + err = -EIO; + +end: + mlx5vf_state_mutex_unlock(mvdev); + return err; +} diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 658925ba5459b..fa1f9ab4d3d05 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -86,6 +86,7 @@ struct mlx5_vhca_page_tracker { struct mlx5_vhca_cq cq; struct mlx5_vhca_qp *host_qp; struct mlx5_vhca_qp *fw_qp; + int status; }; struct mlx5vf_pci_core_device { @@ -96,6 +97,7 @@ struct mlx5vf_pci_core_device { u8 deferred_reset:1; u8 mdev_detach:1; u8 log_active:1; + struct completion tracker_comp; /* protect migration state */ struct mutex state_mutex; enum vfio_device_mig_state mig_state; @@ -127,4 +129,6 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); int mlx5vf_start_page_tracker(struct vfio_device *vdev, struct rb_root_cached *ranges, u32 nnodes, u64 *page_size); int mlx5vf_stop_page_tracker(struct vfio_device *vdev); +int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova, + unsigned long length, struct iova_bitmap *dirty); #endif /* MLX5_VFIO_CMD_H */ -- GitLab From e295738756ebd0726f1ed51e02f343a37233437b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Sep 2022 21:34:47 +0300 Subject: [PATCH 0183/2223] vfio/mlx5: Manage error scenarios on tracker Handle async error events and health/recovery flow to safely stop the tracker upon error scenarios. 
Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-10-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 61 +++++++++++++++++++++++++++++++++++-- drivers/vfio/pci/mlx5/cmd.h | 2 ++ 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index fa9ddd9265002..3e92b4d92be26 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -70,6 +70,13 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, return 0; } +static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev) +{ + /* Mark the tracker under an error and wake it up if it's running */ + mvdev->tracker.is_err = true; + complete(&mvdev->tracker_comp); +} + static int mlx5fv_vf_event(struct notifier_block *nb, unsigned long event, void *data) { @@ -100,6 +107,8 @@ void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev) if (!mvdev->migrate_cap) return; + /* Must be done outside the lock to let it progress */ + set_tracker_error(mvdev); mutex_lock(&mvdev->state_mutex); mlx5vf_disable_fds(mvdev); _mlx5vf_free_page_tracker_resources(mvdev); @@ -619,6 +628,47 @@ static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev, mlx5_db_free(mdev, &cq->db); } +static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type) +{ + if (type != MLX5_EVENT_TYPE_CQ_ERROR) + return; + + set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device, + tracker.cq.mcq)); +} + +static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type, + void *data) +{ + struct mlx5_vhca_page_tracker *tracker = + mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb); + struct mlx5vf_pci_core_device *mvdev = container_of( + tracker, struct mlx5vf_pci_core_device, tracker); + struct mlx5_eqe *eqe = data; + u8 event_type = (u8)type; + u8 queue_type; + int qp_num; + + switch (event_type) { + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + 
case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + queue_type = eqe->data.qp_srq.type; + if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP) + break; + qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + if (qp_num != tracker->host_qp->qpn && + qp_num != tracker->fw_qp->qpn) + break; + set_tracker_error(mvdev); + break; + default: + break; + } + + return NOTIFY_OK; +} + static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe) { @@ -680,6 +730,7 @@ static int mlx5vf_create_cq(struct mlx5_core_dev *mdev, pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas); cq->mcq.comp = mlx5vf_cq_complete; + cq->mcq.event = mlx5vf_cq_event; err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out)); if (err) goto err_vec; @@ -1014,6 +1065,7 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev) WARN_ON(mvdev->mdev_detach); + mlx5_eq_notifier_unregister(mdev, &tracker->nb); mlx5vf_cmd_destroy_tracker(mdev, tracker->id); mlx5vf_destroy_qp(mdev, tracker->fw_qp); mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp); @@ -1127,6 +1179,8 @@ int mlx5vf_start_page_tracker(struct vfio_device *vdev, if (err) goto err_activate; + MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY); + mlx5_eq_notifier_register(mdev, &tracker->nb); *page_size = host_qp->tracked_page_size; mvdev->log_active = true; mlx5vf_state_mutex_unlock(mvdev); @@ -1273,7 +1327,8 @@ int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova, goto end; tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING; - while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING) { + while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING && + !tracker->is_err) { poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty, &tracker->status); if (poll_err == CQ_EMPTY) { @@ -1294,8 +1349,10 @@ int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova, } if 
(tracker->status == MLX5_PAGE_TRACK_STATE_ERROR) - err = -EIO; + tracker->is_err = true; + if (tracker->is_err) + err = -EIO; end: mlx5vf_state_mutex_unlock(mvdev); return err; diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index fa1f9ab4d3d05..8b0ae40c620c7 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -82,10 +82,12 @@ struct mlx5_vhca_qp { struct mlx5_vhca_page_tracker { u32 id; u32 pdn; + u8 is_err:1; struct mlx5_uars_page *uar; struct mlx5_vhca_cq cq; struct mlx5_vhca_qp *host_qp; struct mlx5_vhca_qp *fw_qp; + struct mlx5_nb nb; int status; }; -- GitLab From f39856aacb078c1c93acef011a37121b17d54fe0 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Sep 2022 21:34:48 +0300 Subject: [PATCH 0184/2223] vfio/mlx5: Set the driver DMA logging callbacks Now that everything is ready set the driver DMA logging callbacks if supported by the device. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-11-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 5 ++++- drivers/vfio/pci/mlx5/cmd.h | 3 ++- drivers/vfio/pci/mlx5/main.c | 9 ++++++++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 3e92b4d92be26..c604b70437a5d 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -126,7 +126,8 @@ void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev) } void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, - const struct vfio_migration_ops *mig_ops) + const struct vfio_migration_ops *mig_ops, + const struct vfio_log_ops *log_ops) { struct pci_dev *pdev = mvdev->core_device.pdev; int ret; @@ -169,6 +170,8 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, VFIO_MIGRATION_P2P; mvdev->core_device.vdev.mig_ops = mig_ops; init_completion(&mvdev->tracker_comp); + if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization)) + 
mvdev->core_device.vdev.log_ops = log_ops; end: mlx5_vf_put_core_dev(mvdev->mdev); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 8b0ae40c620c7..921d5720a1e57 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -118,7 +118,8 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, size_t *state_size); void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, - const struct vfio_migration_ops *mig_ops); + const struct vfio_migration_ops *mig_ops, + const struct vfio_log_ops *log_ops); void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index a9b63d15c5d34..759a5f5f7b3f4 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -579,6 +579,12 @@ static const struct vfio_migration_ops mlx5vf_pci_mig_ops = { .migration_get_state = mlx5vf_pci_get_device_state, }; +static const struct vfio_log_ops mlx5vf_pci_log_ops = { + .log_start = mlx5vf_start_page_tracker, + .log_stop = mlx5vf_stop_page_tracker, + .log_read_and_clear = mlx5vf_tracker_read_and_clear, +}; + static const struct vfio_device_ops mlx5vf_pci_ops = { .name = "mlx5-vfio-pci", .open_device = mlx5vf_pci_open_device, @@ -602,7 +608,8 @@ static int mlx5vf_pci_probe(struct pci_dev *pdev, if (!mvdev) return -ENOMEM; vfio_pci_core_init_device(&mvdev->core_device, pdev, &mlx5vf_pci_ops); - mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops); + mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops, + &mlx5vf_pci_log_ops); dev_set_drvdata(&pdev->dev, &mvdev->core_device); ret = vfio_pci_core_register_device(&mvdev->core_device); if (ret) -- GitLab From e7ed42a44c36351cd064797613d6ae34c0140424 Mon 
Sep 17 00:00:00 2001 From: wangjianli Date: Thu, 8 Sep 2022 12:37:09 -0700 Subject: [PATCH 0185/2223] Input: hgpk - fix repeated word in a comment Delete the redundant word 'to'. Signed-off-by: wangjianli Link: https://lore.kernel.org/r/20220908131043.37099-1-wangjianli@cdjrlc.com Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/hgpk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/mouse/hgpk.c b/drivers/input/mouse/hgpk.c index 523b26a117d6c..3c8310da0b053 100644 --- a/drivers/input/mouse/hgpk.c +++ b/drivers/input/mouse/hgpk.c @@ -884,7 +884,7 @@ static ssize_t hgpk_trigger_recal(struct psmouse *psmouse, void *data, /* * We queue work instead of doing recalibration right here - * to avoid adding locking to to hgpk_force_recalibrate() + * to avoid adding locking to hgpk_force_recalibrate() * since workqueue provides serialization. */ psmouse_queue_work(psmouse, &priv->recalib_wq, 0); -- GitLab From e662d349ab6601ffaadd3403bca2775c8ffc050d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 8 Sep 2022 17:21:34 +0300 Subject: [PATCH 0186/2223] pinctrl: cy8c95x0: Use 'default' in all switch-cases (part 2) Move the default values to the 'default' case in the switches. 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220908142134.59068-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index 1335d07822f9a..79f73d364f3f9 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -357,9 +357,9 @@ static bool cy8c95x0_volatile_register(struct device *dev, unsigned int reg) case CY8C95X0_DRV_PP_SLOW: case CY8C95X0_DRV_HIZ: return true; + default: + return false; } - - return false; } static bool cy8c95x0_precious_register(struct device *dev, unsigned int reg) -- GitLab From c530a3c716b963625e43aa915e0de6b4d1ce8ad9 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:02 +0800 Subject: [PATCH 0187/2223] sched/psi: Fix periodic aggregation shut off We don't want to wake periodic aggregation work back up if the task change is the aggregation worker itself going to sleep, or we'll ping-pong forever. Previously, we would use psi_task_change() in psi_dequeue() when task going to sleep, so this check was put in psi_task_change(). But commit 4117cebf1a9f ("psi: Optimize task switch inside shared cgroups") defer task sleep handling to psi_task_switch(), won't go through psi_task_change() anymore. So this patch move this check to psi_task_switch(). 
Fixes: 4117cebf1a9f ("psi: Optimize task switch inside shared cgroups") Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-2-zhouchengming@bytedance.com --- kernel/sched/psi.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ecb4b4ff4ce0a..39463dcc16bb6 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -796,7 +796,6 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; - bool wake_clock = true; void *iter = NULL; u64 now; @@ -806,19 +805,9 @@ void psi_task_change(struct task_struct *task, int clear, int set) psi_flags_change(task, clear, set); now = cpu_clock(cpu); - /* - * Periodic aggregation shuts off if there is a period of no - * task changes, so we wake it back up if necessary. However, - * don't do this if the task change is the aggregation worker - * itself going to sleep, or we'll ping-pong forever. - */ - if (unlikely((clear & TSK_RUNNING) && - (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_avgs_work)) - wake_clock = false; while ((group = iterate_groups(task, &iter))) - psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_change(group, cpu, clear, set, now, true); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, @@ -854,6 +843,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, if (prev->pid) { int clear = TSK_ONCPU, set = 0; + bool wake_clock = true; /* * When we're going to sleep, psi_dequeue() lets us @@ -867,13 +857,23 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, clear |= TSK_MEMSTALL_RUNNING; if (prev->in_iowait) set |= TSK_IOWAIT; + + /* + * Periodic aggregation shuts off if there is a period of no + * task changes, so we wake it back up if necessary. 
However, + * don't do this if the task change is the aggregation worker + * itself going to sleep, or we'll ping-pong forever. + */ + if (unlikely((prev->flags & PF_WQ_WORKER) && + wq_worker_last_func(prev) == psi_avgs_work)) + wake_clock = false; } psi_flags_change(prev, clear, set); iter = NULL; while ((group = iterate_groups(prev, &iter)) && group != common) - psi_group_change(group, cpu, clear, set, now, true); + psi_group_change(group, cpu, clear, set, now, wake_clock); /* * TSK_ONCPU is handled up to the common ancestor. If we're tasked @@ -882,7 +882,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, if (sleep) { clear &= ~TSK_ONCPU; for (; group; group = iterate_groups(prev, &iter)) - psi_group_change(group, cpu, clear, set, now, true); + psi_group_change(group, cpu, clear, set, now, wake_clock); } } } -- GitLab From 58d8c2586cedb8a67f6f0dffa5eaed0f89135b39 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:03 +0800 Subject: [PATCH 0188/2223] sched/psi: Don't create cgroup PSI files when psi_disabled commit 3958e2d0c34e ("cgroup: make per-cgroup pressure stall tracking configurable") make PSI can be configured to skip per-cgroup stall accounting. And doesn't expose PSI files in cgroup hierarchy. This patch do the same thing when psi_disabled. 
Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-3-zhouchengming@bytedance.com --- kernel/cgroup/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 718a70c01c045..96aefdb064bb3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3780,6 +3780,9 @@ static void cgroup_pressure_release(struct kernfs_open_file *of) bool cgroup_psi_enabled(void) { + if (static_branch_likely(&psi_disabled)) + return false; + return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; } -- GitLab From e2ad8ab04c5cdfc8dc2f382c45d248ab01dee991 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:04 +0800 Subject: [PATCH 0189/2223] sched/psi: Save percpu memory when !psi_cgroups_enabled We won't use cgroup psi_group when !psi_cgroups_enabled, so don't bother to alloc percpu memory and init for it. Also don't need to migrate task PSI stats between cgroups in cgroup_move_task(). 
Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-4-zhouchengming@bytedance.com --- kernel/sched/psi.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 39463dcc16bb6..77d53c03a76fd 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -201,6 +201,7 @@ void __init psi_init(void) { if (!psi_enable) { static_branch_enable(&psi_disabled); + static_branch_disable(&psi_cgroups_enabled); return; } @@ -950,7 +951,7 @@ void psi_memstall_leave(unsigned long *flags) #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return 0; cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); @@ -968,7 +969,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup) void psi_cgroup_free(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return; cancel_delayed_work_sync(&cgroup->psi->avgs_work); @@ -996,7 +997,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) struct rq_flags rf; struct rq *rq; - if (static_branch_likely(&psi_disabled)) { + if (!static_branch_likely(&psi_cgroups_enabled)) { /* * Lame to do this here, but the scheduler cannot be locked * from the outside, so we move cgroups from inside sched/. -- GitLab From d79ddb069c5257a924456eb99b53fc1ea715c0a3 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:05 +0800 Subject: [PATCH 0190/2223] sched/psi: Move private helpers to sched/stats.h This patch move psi_task_change/psi_task_switch declarations out of PSI public header, since they are only needed for implementing the PSI stats tracking in sched/stats.h psi_task_switch is obvious, psi_task_change can't be public helper since it doesn't check psi_disabled static key. 
And there is no any user now, so put it in sched/stats.h too. Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-5-zhouchengming@bytedance.com --- include/linux/psi.h | 4 ---- kernel/sched/stats.h | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/psi.h b/include/linux/psi.h index dd74411ac21d7..fffd229fbf197 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -18,10 +18,6 @@ extern struct psi_group psi_system; void psi_init(void); -void psi_task_change(struct task_struct *task, int clear, int set); -void psi_task_switch(struct task_struct *prev, struct task_struct *next, - bool sleep); - void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index baa839c1ba96d..c39b467ece430 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -107,6 +107,10 @@ __schedstats_from_se(struct sched_entity *se) } #ifdef CONFIG_PSI +void psi_task_change(struct task_struct *task, int clear, int set); +void psi_task_switch(struct task_struct *prev, struct task_struct *next, + bool sleep); + /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, -- GitLab From 65176f59a18d888684525658a1d0b8bf749d24f3 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:06 +0800 Subject: [PATCH 0191/2223] sched/psi: Optimize task switch inside shared cgroups again Way back when PSI_MEM_FULL was accounted from the timer tick, task switching could simply iterate next and prev to the common ancestor to update TSK_ONCPU and be done. Then memstall ticks were replaced with checking curr->in_memstall directly in psi_group_change(). 
That meant that now if the task switch was between a memstall and a !memstall task, we had to iterate through the common ancestors at least ONCE to fix up their state_masks. We added the identical_state filter to make sure the common ancestor elimination was skipped in that case. It seems that was always a little too eager, because it caused us to walk the common ancestors *twice* instead of the required once: the iteration for next could have stopped at the common ancestor; prev could have updated TSK_ONCPU up to the common ancestor, then finish to the root without changing any flags, just to get the new curr->in_memstall into the state_masks. This patch recognizes this and makes it so that we walk to the root exactly once if state_mask needs updating, which is simply catching up on a missed optimization that could have been done in commit 7fae6c8171d2 ("psi: Use ONCPU state tracking machinery to detect reclaim") directly. Apart from this, it's also necessary for the next patch "sched/psi: remove NR_ONCPU task accounting". Suppose we walk the common ancestors twice: (1) psi_group_change(.clear = 0, .set = TSK_ONCPU) (2) psi_group_change(.clear = TSK_ONCPU, .set = 0) We previously used tasks[NR_ONCPU] to record TSK_ONCPU: tasks[NR_ONCPU]++ in (1), then tasks[NR_ONCPU]-- in (2), so tasks[NR_ONCPU] would still be correct. The next patch changes this to use one bit in the state mask to record TSK_ONCPU: the PSI_ONCPU bit would be set in (1) but then cleared in (2), leaving the psi_group_cpu with a task running on the CPU but without the PSI_ONCPU bit set! With this patch, we never walk the common ancestors twice, so we won't have the above problem.
Suggested-by: Johannes Weiner Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-6-zhouchengming@bytedance.com --- kernel/sched/psi.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 77d53c03a76fd..d71dbc2356ffb 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -820,20 +820,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, u64 now = cpu_clock(cpu); if (next->pid) { - bool identical_state; - psi_flags_change(next, 0, TSK_ONCPU); /* - * When switching between tasks that have an identical - * runtime state, the cgroup that contains both tasks - * we reach the first common ancestor. Iterate @next's - * ancestors only until we encounter @prev's ONCPU. + * Set TSK_ONCPU on @next's cgroups. If @next shares any + * ancestors with @prev, those will already have @prev's + * TSK_ONCPU bit set, and we can stop the iteration there. */ - identical_state = prev->psi_flags == next->psi_flags; iter = NULL; while ((group = iterate_groups(next, &iter))) { - if (identical_state && - per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { + if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { common = group; break; } @@ -877,10 +872,12 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, psi_group_change(group, cpu, clear, set, now, wake_clock); /* - * TSK_ONCPU is handled up to the common ancestor. If we're tasked - * with dequeuing too, finish that for the rest of the hierarchy. + * TSK_ONCPU is handled up to the common ancestor. If there are + * any other differences between the two tasks (e.g. prev goes + * to sleep, or only one task is memstall), finish propagating + * those differences all the way up to the root. 
*/ - if (sleep) { + if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; for (; group; group = iterate_groups(prev, &iter)) psi_group_change(group, cpu, clear, set, now, wake_clock); -- GitLab From 71dbdde7914d32e86f01ac1f6e54e964c9dfdbd9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Aug 2022 00:41:07 +0800 Subject: [PATCH 0192/2223] sched/psi: Remove NR_ONCPU task accounting We put all fields updated by the scheduler in the first cacheline of struct psi_group_cpu for performance. Since we want add another PSI_IRQ_FULL to track IRQ/SOFTIRQ pressure, we need to reclaim space first. This patch remove NR_ONCPU task accounting in struct psi_group_cpu, use one bit in state_mask to track instead. Signed-off-by: Johannes Weiner Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Tested-by: Chengming Zhou Link: https://lore.kernel.org/r/20220825164111.29534-7-zhouchengming@bytedance.com --- include/linux/psi_types.h | 16 +++++++-------- kernel/sched/psi.c | 41 ++++++++++++++++++++++++++++----------- 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index c7fe7c0897183..54cb74946db41 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -15,13 +15,6 @@ enum psi_task_count { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, - /* - * This can't have values other than 0 or 1 and could be - * implemented as a bit flag. But for now we still have room - * in the first cacheline of psi_group_cpu, and this way we - * don't have to special case any state tracking for it. - */ - NR_ONCPU, /* * For IO and CPU stalls the presence of running/oncpu tasks * in the domain means a partial rather than a full stall. @@ -32,16 +25,18 @@ enum psi_task_count { * threads and memstall ones. 
*/ NR_MEMSTALL_RUNNING, - NR_PSI_TASK_COUNTS = 5, + NR_PSI_TASK_COUNTS = 4, }; /* Task state bitmasks */ #define TSK_IOWAIT (1 << NR_IOWAIT) #define TSK_MEMSTALL (1 << NR_MEMSTALL) #define TSK_RUNNING (1 << NR_RUNNING) -#define TSK_ONCPU (1 << NR_ONCPU) #define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING) +/* Only one task can be scheduled, no corresponding task count */ +#define TSK_ONCPU (1 << NR_PSI_TASK_COUNTS) + /* Resources that workloads could be stalled on */ enum psi_res { PSI_IO, @@ -68,6 +63,9 @@ enum psi_states { NR_PSI_STATES = 7, }; +/* Use one bit in the state mask to track TSK_ONCPU */ +#define PSI_ONCPU (1 << NR_PSI_STATES) + enum psi_aggregators { PSI_AVGS = 0, PSI_POLL, diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index d71dbc2356ffb..4702a770e2720 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -212,7 +212,7 @@ void __init psi_init(void) group_init(&psi_system); } -static bool test_state(unsigned int *tasks, enum psi_states state) +static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu) { switch (state) { case PSI_IO_SOME: @@ -225,9 +225,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state) return unlikely(tasks[NR_MEMSTALL] && tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); case PSI_CPU_SOME: - return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]); + return unlikely(tasks[NR_RUNNING] > oncpu); case PSI_CPU_FULL: - return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]); + return unlikely(tasks[NR_RUNNING] && !oncpu); case PSI_NONIDLE: return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]; @@ -689,9 +689,9 @@ static void psi_group_change(struct psi_group *group, int cpu, bool wake_clock) { struct psi_group_cpu *groupc; - u32 state_mask = 0; unsigned int t, m; enum psi_states s; + u32 state_mask; groupc = per_cpu_ptr(group->pcpu, cpu); @@ -707,17 +707,36 @@ static void psi_group_change(struct psi_group *group, int cpu, record_times(groupc, now); + /* + * Start with 
TSK_ONCPU, which doesn't have a corresponding + * task count - it's just a boolean flag directly encoded in + * the state mask. Clear, set, or carry the current state if + * no changes are requested. + */ + if (unlikely(clear & TSK_ONCPU)) { + state_mask = 0; + clear &= ~TSK_ONCPU; + } else if (unlikely(set & TSK_ONCPU)) { + state_mask = PSI_ONCPU; + set &= ~TSK_ONCPU; + } else { + state_mask = groupc->state_mask & PSI_ONCPU; + } + + /* + * The rest of the state mask is calculated based on the task + * counts. Update those first, then construct the mask. + */ for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) continue; if (groupc->tasks[t]) { groupc->tasks[t]--; } else if (!psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n", + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - groupc->tasks[3], groupc->tasks[4], - clear, set); + groupc->tasks[3], clear, set); psi_bug = 1; } } @@ -726,9 +745,8 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; - /* Calculate state mask representing active states */ for (s = 0; s < NR_PSI_STATES; s++) { - if (test_state(groupc->tasks, s)) + if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU)) state_mask |= (1 << s); } @@ -740,7 +758,7 @@ static void psi_group_change(struct psi_group *group, int cpu, * task in a cgroup is in_memstall, the corresponding groupc * on that cpu is in PSI_MEM_FULL state. 
*/ - if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)) + if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall)) state_mask |= (1 << PSI_MEM_FULL); groupc->state_mask = state_mask; @@ -828,7 +846,8 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, */ iter = NULL; while ((group = iterate_groups(next, &iter))) { - if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { + if (per_cpu_ptr(group->pcpu, cpu)->state_mask & + PSI_ONCPU) { common = group; break; } -- GitLab From 52b1364ba0b105122d6de0e719b36db705011ac1 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:08 +0800 Subject: [PATCH 0193/2223] sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure Now PSI already tracked workload pressure stall information for CPU, memory and IO. Apart from these, IRQ/SOFTIRQ could have obvious impact on some workload productivity, such as web service workload. When CONFIG_IRQ_TIME_ACCOUNTING, we can get IRQ/SOFTIRQ delta time from update_rq_clock_task(), in which we can record that delta to CPU curr task's cgroups as PSI_IRQ_FULL status. Note we don't use PSI_IRQ_SOME since IRQ/SOFTIRQ always happen in the current task on the CPU, make nothing productive could run even if it were runnable, so we only use PSI_IRQ_FULL. 
Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-8-zhouchengming@bytedance.com --- Documentation/admin-guide/cgroup-v2.rst | 6 ++ include/linux/psi_types.h | 10 +++- kernel/cgroup/cgroup.c | 27 +++++++++ kernel/sched/core.c | 1 + kernel/sched/psi.c | 74 ++++++++++++++++++++++++- kernel/sched/stats.h | 2 + 6 files changed, 116 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index be4a77baf7841..971c418bc7784 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -976,6 +976,12 @@ All cgroup core files are prefixed with "cgroup." killing cgroups is a process directed operation, i.e. it affects the whole thread-group. + irq.pressure + A read-write nested-keyed file. + + Shows pressure stall information for IRQ/SOFTIRQ. See + :ref:`Documentation/accounting/psi.rst <psi>` for details.
+ Controllers =========== diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 54cb74946db41..40c28171cd91a 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -42,7 +42,10 @@ enum psi_res { PSI_IO, PSI_MEM, PSI_CPU, - NR_PSI_RESOURCES = 3, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + PSI_IRQ, +#endif + NR_PSI_RESOURCES, }; /* @@ -58,9 +61,12 @@ enum psi_states { PSI_MEM_FULL, PSI_CPU_SOME, PSI_CPU_FULL, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + PSI_IRQ_FULL, +#endif /* Only per-CPU, to weigh the CPU in the global average: */ PSI_NONIDLE, - NR_PSI_STATES = 7, + NR_PSI_STATES, }; /* Use one bit in the state mask to track TSK_ONCPU */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 96aefdb064bb3..b46d39b662144 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3763,6 +3763,23 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int cgroup_irq_pressure_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? 
&psi_system : cgrp->psi; + + return psi_show(seq, psi, PSI_IRQ); +} + +static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_IRQ); +} +#endif + static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, poll_table *pt) { @@ -5179,6 +5196,16 @@ static struct cftype cgroup_base_files[] = { .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + { + .name = "irq.pressure", + .flags = CFTYPE_PRESSURE, + .seq_show = cgroup_irq_pressure_show, + .write = cgroup_irq_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, +#endif #endif /* CONFIG_PSI */ { } /* terminate */ }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ee28253c9ac0c..7d1ea9240af08 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -708,6 +708,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; + psi_account_irqtime(rq->curr, irq_delta); #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((&paravirt_steal_rq_enabled))) { diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 4702a770e2720..2545a78f82d8d 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -904,6 +904,36 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct task_struct *task, u32 delta) +{ + int cpu = task_cpu(task); + void *iter = NULL; + struct psi_group *group; + struct psi_group_cpu *groupc; + u64 now; + + if (!task->pid) + return; + + now = cpu_clock(cpu); + + while ((group = iterate_groups(task, &iter))) { + groupc = per_cpu_ptr(group->pcpu, cpu); + + write_seqcount_begin(&groupc->seq); + + record_times(groupc, now); + groupc->times[PSI_IRQ_FULL] += delta; + + write_seqcount_end(&groupc->seq); + + if (group->poll_states & (1 <<
PSI_IRQ_FULL)) + psi_schedule_poll_work(group, 1); + } +} +#endif + /** * psi_memstall_enter - mark the beginning of a memory stall section * @flags: flags to handle nested sections @@ -1065,6 +1095,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { + bool only_full = false; int full; u64 now; @@ -1079,7 +1110,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); - for (full = 0; full < 2; full++) { +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + only_full = res == PSI_IRQ; +#endif + + for (full = 0; full < 2 - only_full; full++) { unsigned long avg[3] = { 0, }; u64 total = 0; int w; @@ -1093,7 +1128,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) } seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", - full ? "full" : "some", + full || only_full ? 
"full" : "some", LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), @@ -1121,6 +1156,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, else return ERR_PTR(-EINVAL); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (res == PSI_IRQ && --state != PSI_IRQ_FULL) + return ERR_PTR(-EINVAL); +#endif + if (state >= PSI_NONIDLE) return ERR_PTR(-EINVAL); @@ -1405,6 +1445,33 @@ static const struct proc_ops psi_cpu_proc_ops = { .proc_release = psi_fop_release, }; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int psi_irq_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IRQ); +} + +static int psi_irq_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_irq_show); +} + +static ssize_t psi_irq_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IRQ); +} + +static const struct proc_ops psi_irq_proc_ops = { + .proc_open = psi_irq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_irq_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, +}; +#endif + static int __init psi_proc_init(void) { if (psi_enable) { @@ -1412,6 +1479,9 @@ static int __init psi_proc_init(void) proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops); proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops); proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops); +#endif } return 0; } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c39b467ece430..84a188913cc9d 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -110,6 +110,7 @@ __schedstats_from_se(struct sched_entity *se) void psi_task_change(struct task_struct *task, int clear, int set); void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep); +void 
psi_account_irqtime(struct task_struct *task, u32 delta); /* * PSI tracks state that persists across sleeps, such as iowaits and @@ -205,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, bool sleep) {} +static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {} #endif /* CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO -- GitLab From 57899a6610e67ba26fa3251ebbef4a5ed21efc5d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:09 +0800 Subject: [PATCH 0194/2223] sched/psi: Consolidate cgroup_psi() cgroup_psi() can't return psi_group for root cgroup, so we have many open code "psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi". This patch moves the cgroup_psi() definition to <linux/psi.h>, in which we can return psi_system for root cgroup, so can handle all cgroups. Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-9-zhouchengming@bytedance.com --- include/linux/cgroup.h | 5 ----- include/linux/psi.h | 6 ++++++ kernel/cgroup/cgroup.c | 10 +++++----- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b0914aa265062..80cb970257be9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -673,11 +673,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } -static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) -{ - return cgrp->psi; -} - bool cgroup_psi_enabled(void); static inline void cgroup_init_kthreadd(void) diff --git a/include/linux/psi.h b/include/linux/psi.h index fffd229fbf197..362a74ca1d3bf 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -7,6 +7,7 @@ #include #include #include +#include struct seq_file; struct css_set; @@ -30,6 +31,11 @@ __poll_t psi_trigger_poll(void **trigger_ptr, struct file 
*file, poll_table *wait); #ifdef CONFIG_CGROUPS +static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) +{ + return cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; +} + int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b46d39b662144..772b35d65d1f0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3689,21 +3689,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_CPU); } @@ -3729,7 +3729,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return -EBUSY; } - psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; + psi = cgroup_psi(cgrp); new = psi_trigger_create(psi, buf, res); if (IS_ERR(new)) { cgroup_put(cgrp); @@ -3767,7 +3767,7 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, static int cgroup_irq_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? 
&psi_system : cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IRQ); } -- GitLab From dc86aba751e2867244411adda1562f6664747019 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:10 +0800 Subject: [PATCH 0195/2223] sched/psi: Cache parent psi_group to speed up group iteration We use iterate_groups() to iterate each level psi_group to update PSI stats, which is a very hot path. In current code, iterate_groups() have to use multiple branches and cgroup_parent() to get parent psi_group for each level, which is not very efficient. This patch cache parent psi_group in struct psi_group, only need to get psi_group of task itself first, then just use group->parent to iterate. Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-10-zhouchengming@bytedance.com --- include/linux/psi_types.h | 2 ++ kernel/sched/psi.c | 49 +++++++++++++++------------------------ 2 files changed, 21 insertions(+), 30 deletions(-) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 40c28171cd91a..a0b746258c682 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -151,6 +151,8 @@ struct psi_trigger { }; struct psi_group { + struct psi_group *parent; + /* Protects data used by the aggregator */ struct mutex avgs_lock; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 2545a78f82d8d..9a8aee80a0874 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -772,27 +772,12 @@ static void psi_group_change(struct psi_group *group, int cpu, schedule_delayed_work(&group->avgs_work, PSI_FREQ); } -static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +static inline struct psi_group *task_psi_group(struct task_struct *task) { - if (*iter == &psi_system) - return NULL; - #ifdef CONFIG_CGROUPS - if (static_branch_likely(&psi_cgroups_enabled)) { - struct cgroup *cgroup = NULL; - - if 
(!*iter) - cgroup = task->cgroups->dfl_cgrp; - else - cgroup = cgroup_parent(*iter); - - if (cgroup && cgroup_parent(cgroup)) { - *iter = cgroup; - return cgroup_psi(cgroup); - } - } + if (static_branch_likely(&psi_cgroups_enabled)) + return cgroup_psi(task_dfl_cgroup(task)); #endif - *iter = &psi_system; return &psi_system; } @@ -815,7 +800,6 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; - void *iter = NULL; u64 now; if (!task->pid) @@ -825,8 +809,10 @@ void psi_task_change(struct task_struct *task, int clear, int set) now = cpu_clock(cpu); - while ((group = iterate_groups(task, &iter))) + group = task_psi_group(task); + do { psi_group_change(group, cpu, clear, set, now, true); + } while ((group = group->parent)); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, @@ -834,7 +820,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, { struct psi_group *group, *common = NULL; int cpu = task_cpu(prev); - void *iter; u64 now = cpu_clock(cpu); if (next->pid) { @@ -844,8 +829,8 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, * ancestors with @prev, those will already have @prev's * TSK_ONCPU bit set, and we can stop the iteration there. 
*/ - iter = NULL; - while ((group = iterate_groups(next, &iter))) { + group = task_psi_group(next); + do { if (per_cpu_ptr(group->pcpu, cpu)->state_mask & PSI_ONCPU) { common = group; @@ -853,7 +838,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); - } + } while ((group = group->parent)); } if (prev->pid) { @@ -886,9 +871,12 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, psi_flags_change(prev, clear, set); - iter = NULL; - while ((group = iterate_groups(prev, &iter)) && group != common) + group = task_psi_group(prev); + do { + if (group == common) + break; psi_group_change(group, cpu, clear, set, now, wake_clock); + } while ((group = group->parent)); /* * TSK_ONCPU is handled up to the common ancestor. If there are @@ -898,7 +886,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, */ if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; - for (; group; group = iterate_groups(prev, &iter)) + for (; group; group = group->parent) psi_group_change(group, cpu, clear, set, now, wake_clock); } } @@ -908,7 +896,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, void psi_account_irqtime(struct task_struct *task, u32 delta) { int cpu = task_cpu(task); - void *iter = NULL; struct psi_group *group; struct psi_group_cpu *groupc; u64 now; @@ -918,7 +905,8 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) now = cpu_clock(cpu); - while ((group = iterate_groups(task, &iter))) { + group = task_psi_group(task); + do { groupc = per_cpu_ptr(group->pcpu, cpu); write_seqcount_begin(&groupc->seq); @@ -930,7 +918,7 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) if (group->poll_states & (1 << PSI_IRQ_FULL)) psi_schedule_poll_work(group, 1); - } + } while ((group = group->parent)); } #endif @@ -1010,6 +998,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup) return -ENOMEM; } 
group_init(cgroup->psi); + cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup)); return 0; } -- GitLab From 34f26a15611afb03c33df6819359d36f5b382589 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Wed, 7 Sep 2022 17:03:32 +0800 Subject: [PATCH 0196/2223] sched/psi: Per-cgroup PSI accounting disable/re-enable interface PSI accounts stalls for each cgroup separately and aggregates it at each level of the hierarchy. This may cause non-negligible overhead for some workloads when under deep level of the hierarchy. commit 3958e2d0c34e ("cgroup: make per-cgroup pressure stall tracking configurable") make PSI to skip per-cgroup stall accounting, only account system-wide to avoid this each level overhead. But for our use case, we also want leaf cgroup PSI stats accounted for userspace adjustment on that cgroup, apart from only system-wide adjustment. So this patch introduce a per-cgroup PSI accounting disable/re-enable interface "cgroup.pressure", which is a read-write single value file that allowed values are "0" and "1", the defaults is "1" so per-cgroup PSI stats is enabled by default. Implementation details: It should be relatively straight-forward to disable and re-enable state aggregation, time tracking, averaging on a per-cgroup level, if we can live with losing history from while it was disabled. I.e. the avgs will restart from 0, total= will have gaps. But it's hard or complex to stop/restart groupc->tasks[] updates, which is not implemented in this patch. So we always update groupc->tasks[] and PSI_ONCPU bit in psi_group_change() even when the cgroup PSI stats is disabled. 
Suggested-by: Johannes Weiner Suggested-by: Tejun Heo Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20220907090332.2078-1-zhouchengming@bytedance.com --- Documentation/admin-guide/cgroup-v2.rst | 17 ++++++ include/linux/cgroup-defs.h | 3 ++ include/linux/psi.h | 2 + include/linux/psi_types.h | 3 ++ kernel/cgroup/cgroup.c | 70 ++++++++++++++++++++++--- kernel/sched/psi.c | 70 ++++++++++++++++++++++--- 6 files changed, 152 insertions(+), 13 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 971c418bc7784..4cad4e2b31ec8 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -976,6 +976,23 @@ All cgroup core files are prefixed with "cgroup." killing cgroups is a process directed operation, i.e. it affects the whole thread-group. + cgroup.pressure + A read-write single value file that allowed values are "0" and "1". + The default is "1". + + Writing "0" to the file will disable the cgroup PSI accounting. + Writing "1" to the file will re-enable the cgroup PSI accounting. + + This control attribute is not hierarchical, so disable or enable PSI + accounting in a cgroup does not affect PSI accounting in descendants + and doesn't need pass enablement via ancestors from root. + + The reason this control attribute exists is that PSI accounts stalls for + each cgroup separately and aggregates it at each level of the hierarchy. + This may cause non-negligible overhead for some workloads when under + deep level of the hierarchy, in which case this control attribute can + be used to disable PSI accounting in the non-leaf cgroups. + irq.pressure A read-write nested-keyed file. 
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4bcf56b3491ca..7df76b318245f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -428,6 +428,9 @@ struct cgroup { struct cgroup_file procs_file; /* handle for "cgroup.procs" */ struct cgroup_file events_file; /* handle for "cgroup.events" */ + /* handles for "{cpu,memory,io,irq}.pressure" */ + struct cgroup_file psi_files[NR_PSI_RESOURCES]; + /* * The bitmask of subsystems enabled on the child cgroups. * ->subtree_control is the one configured through diff --git a/include/linux/psi.h b/include/linux/psi.h index 362a74ca1d3bf..b029a847def1e 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -39,6 +39,7 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); +void psi_cgroup_restart(struct psi_group *group); #endif #else /* CONFIG_PSI */ @@ -60,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to) { rcu_assign_pointer(p->cgroups, to); } +static inline void psi_cgroup_restart(struct psi_group *group) {} #endif #endif /* CONFIG_PSI */ diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index a0b746258c682..6e43727350689 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -152,6 +152,7 @@ struct psi_trigger { struct psi_group { struct psi_group *parent; + bool enabled; /* Protects data used by the aggregator */ struct mutex avgs_lock; @@ -194,6 +195,8 @@ struct psi_group { #else /* CONFIG_PSI */ +#define NR_PSI_RESOURCES 0 + struct psi_group { }; #endif /* CONFIG_PSI */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 772b35d65d1f0..fa1cf836b66a3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3708,8 +3708,8 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) return 
psi_show(seq, psi, PSI_CPU); } -static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, enum psi_res res) +static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, enum psi_res res) { struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; @@ -3746,21 +3746,21 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_IO); + return pressure_write(of, buf, nbytes, PSI_IO); } static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); + return pressure_write(of, buf, nbytes, PSI_MEM); } static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); + return pressure_write(of, buf, nbytes, PSI_CPU); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -3776,10 +3776,58 @@ static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_IRQ); + return pressure_write(of, buf, nbytes, PSI_IRQ); } #endif +static int cgroup_pressure_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_psi(cgrp); + + seq_printf(seq, "%d\n", psi->enabled); + + return 0; +} + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + ssize_t ret; + int enable; + struct cgroup *cgrp; + struct psi_group *psi; + + ret = kstrtoint(strstrip(buf), 0, &enable); + if (ret) + return ret; + + if (enable < 0 || enable > 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + psi = cgroup_psi(cgrp); + if (psi->enabled != enable) { + int i; + + /* show or hide 
{cpu,memory,io,irq}.pressure files */ + for (i = 0; i < NR_PSI_RESOURCES; i++) + cgroup_file_show(&cgrp->psi_files[i], enable); + + psi->enabled = enable; + if (enable) + psi_cgroup_restart(psi); + } + + cgroup_kn_unlock(of->kn); + + return nbytes; +} + static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, poll_table *pt) { @@ -5175,6 +5223,7 @@ static struct cftype cgroup_base_files[] = { { .name = "io.pressure", .flags = CFTYPE_PRESSURE, + .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -5183,6 +5232,7 @@ static struct cftype cgroup_base_files[] = { { .name = "memory.pressure", .flags = CFTYPE_PRESSURE, + .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -5191,6 +5241,7 @@ static struct cftype cgroup_base_files[] = { { .name = "cpu.pressure", .flags = CFTYPE_PRESSURE, + .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, @@ -5200,12 +5251,19 @@ static struct cftype cgroup_base_files[] = { { .name = "irq.pressure", .flags = CFTYPE_PRESSURE, + .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), .seq_show = cgroup_irq_pressure_show, .write = cgroup_irq_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, #endif + { + .name = "cgroup.pressure", + .flags = CFTYPE_PRESSURE, + .seq_show = cgroup_pressure_show, + .write = cgroup_pressure_write, + }, #endif /* CONFIG_PSI */ { } /* terminate */ }; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 9a8aee80a0874..9711827e31e59 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -181,6 +181,7 @@ static void group_init(struct psi_group *group) { int cpu; + group->enabled = true; for_each_possible_cpu(cpu) 
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); group->avg_last_update = sched_clock(); @@ -696,17 +697,16 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc = per_cpu_ptr(group->pcpu, cpu); /* - * First we assess the aggregate resource states this CPU's - * tasks have been in since the last change, and account any - * SOME and FULL time these may have resulted in. - * - * Then we update the task counts according to the state + * First we update the task counts according to the state * change requested through the @clear and @set bits. + * + * Then if the cgroup PSI stats accounting enabled, we + * assess the aggregate resource states this CPU's tasks + * have been in since the last change, and account any + * SOME and FULL time these may have resulted in. */ write_seqcount_begin(&groupc->seq); - record_times(groupc, now); - /* * Start with TSK_ONCPU, which doesn't have a corresponding * task count - it's just a boolean flag directly encoded in @@ -745,6 +745,23 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; + if (!group->enabled) { + /* + * On the first group change after disabling PSI, conclude + * the current state and flush its time. This is unlikely + * to matter to the user, but aggregation (get_recent_times) + * may have already incorporated the live state into times_prev; + * avoid a delta sample underflow when PSI is later re-enabled. 
+ */ + if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE))) + record_times(groupc, now); + + groupc->state_mask = state_mask; + + write_seqcount_end(&groupc->seq); + return; + } + for (s = 0; s < NR_PSI_STATES; s++) { if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU)) state_mask |= (1 << s); @@ -761,6 +778,8 @@ static void psi_group_change(struct psi_group *group, int cpu, if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall)) state_mask |= (1 << PSI_MEM_FULL); + record_times(groupc, now); + groupc->state_mask = state_mask; write_seqcount_end(&groupc->seq); @@ -907,6 +926,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) group = task_psi_group(task); do { + if (!group->enabled) + continue; + groupc = per_cpu_ptr(group->pcpu, cpu); write_seqcount_begin(&groupc->seq); @@ -1080,6 +1102,40 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) task_rq_unlock(rq, task, &rf); } + +void psi_cgroup_restart(struct psi_group *group) +{ + int cpu; + + /* + * After we disable psi_group->enabled, we don't actually + * stop percpu tasks accounting in each psi_group_cpu, + * instead only stop test_state() loop, record_times() + * and averaging worker, see psi_group_change() for details. + * + * When disable cgroup PSI, this function has nothing to sync + * since cgroup pressure files are hidden and percpu psi_group_cpu + * would see !psi_group->enabled and only do task accounting. + * + * When re-enable cgroup PSI, this function use psi_group_change() + * to get correct state mask from test_state() loop on tasks[], + * and restart groupc->state_start from now, use .clear = .set = 0 + * here since no task status really changed. 
+ */ + if (!group->enabled) + return; + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + u64 now; + + rq_lock_irq(rq, &rf); + now = cpu_clock(cpu); + psi_group_change(group, cpu, 0, 0, now, true); + rq_unlock_irq(rq, &rf); + } +} #endif /* CONFIG_CGROUPS */ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) -- GitLab From 80d98a33008cbfd9ed0271b0e253ff88b51f96ac Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Fri, 9 Sep 2022 20:31:39 -0500 Subject: [PATCH 0197/2223] ipmi:ipmb: Don't call ipmi_unregister_smi() on a register failure The data structure won't be set up to be unregistered, and it can result in crashes if the register fails. Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_ipmb.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/char/ipmi/ipmi_ipmb.c b/drivers/char/ipmi/ipmi_ipmb.c index 1019946abe4e8..740dc0f824e07 100644 --- a/drivers/char/ipmi/ipmi_ipmb.c +++ b/drivers/char/ipmi/ipmi_ipmb.c @@ -424,10 +424,8 @@ static void ipmi_ipmb_request_events(void *send_info) /* We don't fetch events here. 
*/ } -static int ipmi_ipmb_remove(struct i2c_client *client) +static void ipmi_ipmb_cleanup(struct ipmi_ipmb_dev *iidev) { - struct ipmi_ipmb_dev *iidev = i2c_get_clientdata(client); - if (iidev->slave) { i2c_slave_unregister(iidev->slave); if (iidev->slave != iidev->client) @@ -436,7 +434,13 @@ static int ipmi_ipmb_remove(struct i2c_client *client) iidev->slave = NULL; iidev->client = NULL; ipmi_ipmb_stop_thread(iidev); +} + +static int ipmi_ipmb_remove(struct i2c_client *client) +{ + struct ipmi_ipmb_dev *iidev = i2c_get_clientdata(client); + ipmi_ipmb_cleanup(iidev); ipmi_unregister_smi(iidev->intf); return 0; @@ -544,7 +548,7 @@ static int ipmi_ipmb_probe(struct i2c_client *client) out_err: if (slave && slave != client) i2c_unregister_device(slave); - ipmi_ipmb_remove(client); + ipmi_ipmb_cleanup(iidev); return rv; } -- GitLab From f5b23d6704e478b5a97dbba5df9dea96a9cbf847 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 9 Aug 2022 22:31:02 +0200 Subject: [PATCH 0198/2223] hfsplus: unmap the page in the "fail_page" label MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "hfsplus: Replace kmap() with kmap_local_page()". kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap’s pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. 
Since its use in fs/hfsplus is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in fs/hfsplus. Where possible, use the suited standard helpers (memzero_page(), memcpy_page()) instead of open coding kmap_local_page() plus memset() or memcpy(). Fix a bug due to a page being not unmapped if the code jumps to the "fail_page" label (1/4). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. This patch (of 4): Several paths within hfs_btree_open() jump to the "fail_page" label where put_page() is called while the page is still mapped. Call kunmap() to unmap the page soon before put_page(). Link: https://lkml.kernel.org/r/20220809203105.26183-1-fmdefrancesco@gmail.com Link: https://lkml.kernel.org/r/20220809203105.26183-2-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Reviewed-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Matthew Wilcox Cc: Fabio M. De Francesco Cc: Jens Axboe Cc: Bart Van Assche Cc: Kees Cook Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hfsplus/btree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 66774f4cb4fd5..3a917a9a4edd5 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -245,6 +245,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) return tree; fail_page: + kunmap(page); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfsplus_aops; -- GitLab From 6c3014a67a44f11dc1020c8b47a1d1d626f007a9 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 9 Aug 2022 22:31:03 +0200 Subject: [PATCH 0199/2223] hfsplus: convert kmap() to kmap_local_page() in bnode.c kmap() is being deprecated in favor of kmap_local_page(). 
Two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Since its use in bnode.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in bnode.c. Where possible, use the suited standard helpers (memzero_page(), memcpy_page()) instead of open coding kmap_local_page() plus memset() or memcpy(). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220809203105.26183-3-fmdefrancesco@gmail.com Signed-off-by: Fabio M. 
De Francesco Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Bart Van Assche Cc: Jens Axboe Cc: Kees Cook Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hfsplus/bnode.c | 105 +++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 57 deletions(-) diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index a5ab00e542203..87974d5e67915 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -29,14 +29,12 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memcpy(buf, kmap(*pagep) + off, l); - kunmap(*pagep); + memcpy_from_page(buf, *pagep, off, l); while ((len -= l) != 0) { buf += l; l = min_t(int, len, PAGE_SIZE); - memcpy(buf, kmap(*++pagep), l); - kunmap(*pagep); + memcpy_from_page(buf, *++pagep, 0, l); } } @@ -82,16 +80,14 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memcpy(kmap(*pagep) + off, buf, l); + memcpy_to_page(*pagep, off, buf, l); set_page_dirty(*pagep); - kunmap(*pagep); while ((len -= l) != 0) { buf += l; l = min_t(int, len, PAGE_SIZE); - memcpy(kmap(*++pagep), buf, l); + memcpy_to_page(*++pagep, 0, buf, l); set_page_dirty(*pagep); - kunmap(*pagep); } } @@ -112,15 +108,13 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memset(kmap(*pagep) + off, 0, l); + memzero_page(*pagep, off, l); set_page_dirty(*pagep); - kunmap(*pagep); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memset(kmap(*++pagep), 0, l); + memzero_page(*++pagep, 0, l); set_page_dirty(*pagep); - kunmap(*pagep); } } @@ -142,24 +136,20 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, if (src == dst) { l = min_t(int, len, PAGE_SIZE - src); - memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l); - kunmap(*src_page); + 
memcpy_page(*dst_page, src, *src_page, src, l); set_page_dirty(*dst_page); - kunmap(*dst_page); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memcpy(kmap(*++dst_page), kmap(*++src_page), l); - kunmap(*src_page); + memcpy_page(*++dst_page, 0, *++src_page, 0, l); set_page_dirty(*dst_page); - kunmap(*dst_page); } } else { void *src_ptr, *dst_ptr; do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (PAGE_SIZE - src < PAGE_SIZE - dst) { l = PAGE_SIZE - src; src = 0; @@ -171,9 +161,9 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, } l = min(len, l); memcpy(dst_ptr, src_ptr, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (!dst) dst_page++; else @@ -185,6 +175,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) { struct page **src_page, **dst_page; + void *src_ptr, *dst_ptr; int l; hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); @@ -202,27 +193,28 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) if (src == dst) { while (src < len) { - memmove(kmap(*dst_page), kmap(*src_page), src); - kunmap(*src_page); + dst_ptr = kmap_local_page(*dst_page); + src_ptr = kmap_local_page(*src_page); + memmove(dst_ptr, src_ptr, src); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); len -= src; src = PAGE_SIZE; src_page--; dst_page--; } src -= len; - memmove(kmap(*dst_page) + src, - kmap(*src_page) + src, len); - kunmap(*src_page); + dst_ptr = kmap_local_page(*dst_page); + src_ptr = kmap_local_page(*src_page); + memmove(dst_ptr + src, src_ptr + src, len); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); } else { - void *src_ptr, *dst_ptr; - do { - src_ptr = 
kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (src < dst) { l = src; src = PAGE_SIZE; @@ -234,9 +226,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) } l = min(len, l); memmove(dst_ptr - l, src_ptr - l, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (dst == PAGE_SIZE) dst_page--; else @@ -251,26 +243,27 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) if (src == dst) { l = min_t(int, len, PAGE_SIZE - src); - memmove(kmap(*dst_page) + src, - kmap(*src_page) + src, l); - kunmap(*src_page); + + dst_ptr = kmap_local_page(*dst_page) + src; + src_ptr = kmap_local_page(*src_page) + src; + memmove(dst_ptr, src_ptr, l); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memmove(kmap(*++dst_page), - kmap(*++src_page), l); - kunmap(*src_page); + dst_ptr = kmap_local_page(*++dst_page); + src_ptr = kmap_local_page(*++src_page); + memmove(dst_ptr, src_ptr, l); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); } } else { - void *src_ptr, *dst_ptr; - do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (PAGE_SIZE - src < PAGE_SIZE - dst) { l = PAGE_SIZE - src; @@ -283,9 +276,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) } l = min(len, l); memmove(dst_ptr, src_ptr, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (!dst) dst_page++; else @@ -498,14 +491,14 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) if (!test_bit(HFS_BNODE_NEW, &node->flags)) return 
node; - desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + - node->page_offset); + desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) + + node->page_offset); node->prev = be32_to_cpu(desc->prev); node->next = be32_to_cpu(desc->next); node->num_recs = be16_to_cpu(desc->num_recs); node->type = desc->type; node->height = desc->height; - kunmap(node->page[0]); + kunmap_local(desc); switch (node->type) { case HFS_NODE_HEADER: @@ -589,14 +582,12 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) } pagep = node->page; - memset(kmap(*pagep) + node->page_offset, 0, - min_t(int, PAGE_SIZE, tree->node_size)); + memzero_page(*pagep, node->page_offset, + min_t(int, PAGE_SIZE, tree->node_size)); set_page_dirty(*pagep); - kunmap(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { - memset(kmap(*++pagep), 0, PAGE_SIZE); + memzero_page(*++pagep, 0, PAGE_SIZE); set_page_dirty(*pagep); - kunmap(*pagep); } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); -- GitLab From f9ef3b95a305874de0b36b7294f6cc8a0e07951e Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 9 Aug 2022 22:31:04 +0200 Subject: [PATCH 0200/2223] hfsplus: convert kmap() to kmap_local_page() in bitmap.c kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. 
Since its use in bitmap.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in bitmap.c. Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220809203105.26183-4-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Bart Van Assche Cc: Jens Axboe Cc: Kees Cook Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hfsplus/bitmap.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index cebce0cfe3405..bd8dcea855880 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -39,7 +39,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, start = size; goto out; } - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; i = offset % 32; offset &= ~(PAGE_CACHE_BITS - 1); @@ -74,7 +74,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, } curr++; } - kunmap(page); + kunmap_local(pptr); offset += PAGE_CACHE_BITS; if (offset >= size) break; @@ -84,7 +84,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, start = size; goto out; } - curr = pptr = kmap(page); + curr = pptr = kmap_local_page(page); if ((size ^ offset) / PAGE_CACHE_BITS) end = pptr + PAGE_CACHE_BITS / 32; else @@ -127,7 +127,7 @@ found: len -= 32; } set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); offset += PAGE_CACHE_BITS; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); @@ -135,7 +135,7 @@ found: start = size; goto out; } - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; } @@ -151,7 +151,7 @@ last: done: *curr = cpu_to_be32(n); set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); *max = offset + (curr - pptr) * 32 + i - start; sbi->free_blocks 
-= *max; hfsplus_mark_mdb_dirty(sb); @@ -185,7 +185,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) page = read_mapping_page(mapping, pnr, NULL); if (IS_ERR(page)) goto kaboom; - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; end = pptr + PAGE_CACHE_BITS / 32; len = count; @@ -215,11 +215,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) if (!count) break; set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); page = read_mapping_page(mapping, ++pnr, NULL); if (IS_ERR(page)) goto kaboom; - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; } @@ -231,7 +231,7 @@ done: } out: set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); sbi->free_blocks += len; hfsplus_mark_mdb_dirty(sb); mutex_unlock(&sbi->alloc_mutex); -- GitLab From 9f25f357c557bfea39a2d6a629a6317d2b3dfc64 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 9 Aug 2022 22:31:05 +0200 Subject: [PATCH 0201/2223] hfsplus: convert kmap() to kmap_local_page() in btree.c kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. Since its use in btree.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in btree.c. 
Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220809203105.26183-5-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Bart Van Assche Cc: Jens Axboe Cc: Kees Cook Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hfsplus/btree.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 3a917a9a4edd5..9e1732a2b92a8 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -163,7 +163,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) goto free_inode; /* Load the header */ - head = (struct hfs_btree_header_rec *)(kmap(page) + + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); @@ -240,12 +240,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - kunmap(page); + kunmap_local(head); put_page(page); return tree; fail_page: - kunmap(page); + kunmap_local(head); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfsplus_aops; @@ -292,7 +292,7 @@ int hfs_btree_write(struct hfs_btree *tree) return -EIO; /* Load the header */ page = node->page[0]; - head = (struct hfs_btree_header_rec *)(kmap(page) + + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); @@ -304,7 +304,7 @@ int hfs_btree_write(struct hfs_btree *tree) head->attributes = cpu_to_be32(tree->attributes); head->depth = cpu_to_be16(tree->depth); - kunmap(page); + kunmap_local(head); set_page_dirty(page); hfs_bnode_put(node); return 0; @@ -395,7 +395,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off += node->page_offset; pagep = 
node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; idx = 0; @@ -408,7 +408,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) idx += i; data[off] |= m; set_page_dirty(*pagep); - kunmap(*pagep); + kunmap_local(data); tree->free_nodes--; mark_inode_dirty(tree->inode); hfs_bnode_put(node); @@ -418,14 +418,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) } } if (++off >= PAGE_SIZE) { - kunmap(*pagep); - data = kmap(*++pagep); + kunmap_local(data); + data = kmap_local_page(*++pagep); off = 0; } idx += 8; len--; } - kunmap(*pagep); + kunmap_local(data); nidx = node->next; if (!nidx) { hfs_dbg(BNODE_MOD, "create new bmap node\n"); @@ -441,7 +441,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; } } @@ -491,7 +491,7 @@ void hfs_bmap_free(struct hfs_bnode *node) } off += node->page_offset + nidx / 8; page = node->page[off >> PAGE_SHIFT]; - data = kmap(page); + data = kmap_local_page(page); off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; @@ -499,13 +499,13 @@ void hfs_bmap_free(struct hfs_bnode *node) pr_crit("trying to free free bnode " "%u(%d)\n", node->this, node->type); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); return; } data[off] = byte & ~m; set_page_dirty(page); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); tree->free_nodes++; mark_inode_dirty(tree->inode); -- GitLab From 765f2bf04fdaced4e7d7e94cfc3f743048629f31 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 8 Aug 2022 10:59:28 +0200 Subject: [PATCH 0202/2223] scripts/decodecode: improve faulting line determination There are cases where the IP pointer in a Code: line in an oops doesn't point at the beginning of an instruction: Code: 0f bd c2 e9 a0 cd b5 e4 48 0f bd c2 e9 97 cd b5 e4 0f 1f 80 00 00 00 00 \ e9 8b cd b5 
e4 0f 1f 00 66 0f a3 d0 e9 7f cd b5 e4 0f 1f <80> 00 00 00 \ 00 0f a3 d0 e9 70 cd b5 e4 48 0f a3 d0 e9 67 cd b5 e9 7f cd b5 e4 jmp 0xffffffffe4b5cda8 0f 1f 80 00 00 00 00 nopl 0x0(%rax) ^^ and the current way of determining the faulting instruction line doesn't work because disassembled instructions are counted from the IP byte to the end and when that thing points in the middle, the trailing bytes can be interpreted as different insns: Code starting with the faulting instruction =========================================== 0: 80 00 00 addb $0x0,(%rax) 3: 00 00 add %al,(%rax) whereas, this is part of 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 5: 0f a3 d0 bt %edx,%eax ... leading to: 1d: 0f 1f 00 nopl (%rax) 20: 66 0f a3 d0 bt %dx,%ax 24:* e9 7f cd b5 e4 jmp 0xffffffffe4b5cda8 <-- trapping instruction 29: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 30: 0f a3 d0 bt %edx,%eax which is the wrong faulting instruction. Change the way the faulting line number is determined by matching the opcode bytes from the beginning, leading to correct output: 1d: 0f 1f 00 nopl (%rax) 20: 66 0f a3 d0 bt %dx,%ax 24: e9 7f cd b5 e4 jmp 0xffffffffe4b5cda8 29:* 0f 1f 80 00 00 00 00 nopl 0x0(%rax) <-- trapping instruction 30: 0f a3 d0 bt %edx,%eax While at it, make decodecode use bash as the interpreter - that thing should be present on everything by now. It simplifies the code a lot too. 
Link: https://lkml.kernel.org/r/20220808085928.29840-1-bp@alien8.de Signed-off-by: Borislav Petkov Cc: Marc Zyngier Cc: Will Deacon Signed-off-by: Andrew Morton --- scripts/decodecode | 120 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 105 insertions(+), 15 deletions(-) diff --git a/scripts/decodecode b/scripts/decodecode index c711a196511c6..b28fd26865617 100755 --- a/scripts/decodecode +++ b/scripts/decodecode @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # Disassemble the Code: line in Linux oopses # usage: decodecode < oops.file @@ -8,6 +8,8 @@ # AFLAGS=--32 decodecode < 386.oops # PC=hex - the PC (program counter) the oops points to +faultlinenum=1 + cleanup() { rm -f $T $T.s $T.o $T.oo $T.aa $T.dis exit 1 @@ -102,28 +104,125 @@ disas() { grep -v "/tmp\|Disassembly\|\.text\|^$" > $t.dis 2>&1 } +# Match the maximum number of opcode bytes from @op_bytes contained within +# @opline +# +# Params: +# @op_bytes: The string of bytes from the Code: line +# @opline: The disassembled line coming from objdump +# +# Returns: +# The max number of opcode bytes from the beginning of @op_bytes which match +# the opcode bytes in the objdump line. +get_substr_opcode_bytes_num() +{ + local op_bytes=$1 + local opline=$2 + + local retval=0 + substr="" + + for opc in $op_bytes; + do + substr+="$opc" + + # return if opcode bytes do not match @opline anymore + if ! 
echo $opline | grep -q "$substr"; + then + break + fi + + # add trailing space + substr+=" " + retval=$((retval+1)) + done + + return $retval +} + +# Return the line number in objdump output to where the IP marker in the Code: +# line points to +# +# Params: +# @all_code: code in bytes without the marker +# @dis_file: disassembled file +# @ip_byte: The byte to which the IP points to +get_faultlinenum() +{ + local all_code="$1" + local dis_file="$2" + + # num bytes including IP byte + local num_bytes_ip=$(( $3 + 1 * $width )) + + # Add the two header lines (we're counting from 1). + local retval=3 + + # remove marker + all_code=$(echo $all_code | sed -e 's/[<>()]//g') + + while read line + do + get_substr_opcode_bytes_num "$all_code" "$line" + ate_opcodes=$? + + if ! (( $ate_opcodes )); then + continue + fi + + num_bytes_ip=$((num_bytes_ip - ($ate_opcodes * $width) )) + if (( $num_bytes_ip <= 0 )); then + break + fi + + # Delete matched opcode bytes from all_code. For that, compute + # how many chars those opcodes are represented by and include + # trailing space. 
+ # + # a byte is 2 chars, ate_opcodes is also the number of trailing + # spaces + del_chars=$(( ($ate_opcodes * $width * 2) + $ate_opcodes )) + + all_code=$(echo $all_code | sed -e "s!^.\{$del_chars\}!!") + + let "retval+=1" + + done < $dis_file + + return $retval +} + marker=`expr index "$code" "\<"` if [ $marker -eq 0 ]; then marker=`expr index "$code" "\("` fi - touch $T.oo if [ $marker -ne 0 ]; then - # 2 opcode bytes and a single space - pc_sub=$(( $marker / 3 )) + # How many bytes to subtract from the program counter + # in order to get to the beginning virtual address of the + # Code: + pc_sub=$(( (($marker - 1) / (2 * $width + 1)) * $width )) echo All code >> $T.oo echo ======== >> $T.oo beforemark=`echo "$code"` echo -n " .$type 0x" > $T.s + echo $beforemark | sed -e 's/ /,0x/g; s/[<>()]//g' >> $T.s + disas $T $pc_sub + cat $T.dis >> $T.oo - rm -f $T.o $T.s $T.dis -# and fix code at-and-after marker + get_faultlinenum "$code" "$T.dis" $pc_sub + faultlinenum=$? + + # and fix code at-and-after marker code=`echo "$code" | cut -c$((${marker} + 1))-` + + rm -f $T.o $T.s $T.dis fi + echo Code starting with the faulting instruction > $T.aa echo =========================================== >> $T.aa code=`echo $code | sed -e 's/\r//;s/ [<(]/ /;s/[>)] / /;s/ /,0x/g; s/[>)]$//'` @@ -132,15 +231,6 @@ echo $code >> $T.s disas $T 0 cat $T.dis >> $T.aa -# (lines of whole $T.oo) - (lines of $T.aa, i.e. "Code starting") + 3, -# i.e. 
the title + the "===..=" line (sed is counting from 1, 0 address is -# special) -faultlinenum=$(( $(wc -l $T.oo | cut -d" " -f1) - \ - $(wc -l $T.aa | cut -d" " -f1) + 3)) - -faultline=`cat $T.dis | head -1 | cut -d":" -f2-` -faultline=`echo "$faultline" | sed -e 's/\[/\\\[/g; s/\]/\\\]/g'` - cat $T.oo | sed -e "${faultlinenum}s/^\([^:]*:\)\(.*\)/\1\*\2\t\t<-- trapping instruction/" echo cat $T.aa -- GitLab From 58b5c203360799e181325f3f8ce212de80ebf304 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Fri, 5 Aug 2022 13:57:33 +0200 Subject: [PATCH 0203/2223] ipc/util.c: cleanup and improve sysvipc_find_ipc() sysvipc_find_ipc() can be simplified further: - It uses a for() loop to locate the next entry in the idr. This can be replaced with idr_get_next(). - It receives two parameters (pos - which is actually an idr index and not a position, and new_pos, which is really a position). One parameter is sufficient. Link: https://lore.kernel.org/all/20210903052020.3265-3-manfred@colorfullife.com/ Link: https://lkml.kernel.org/r/20220805115733.104763-1-manfred@colorfullife.com Signed-off-by: Manfred Spraul Acked-by: Davidlohr Bueso Acked-by: Waiman Long Cc: "Eric W . Biederman" Cc: <1vier1@web.de> Signed-off-by: Andrew Morton --- ipc/util.c | 53 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index a2208d0f26b2d..05cb9de667350 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -782,28 +782,37 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s) return iter->pid_ns; } -/* - * This routine locks the ipc structure found at least at position pos. +/** + * sysvipc_find_ipc - Find and lock the ipc structure based on seq pos + * @ids: ipc identifier set + * @pos: expected position + * + * The function finds an ipc structure, based on the sequence file + * position @pos. If there is no ipc structure at position @pos, then + * the successor is selected. 
+ * If a structure is found, then it is locked (both rcu_read_lock() and + * ipc_lock_object()) and @pos is set to the position needed to locate + * the found ipc structure. + * If nothing is found (i.e. EOF), @pos is not modified. + * + * The function returns the found ipc structure, or NULL at EOF. */ -static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, - loff_t *new_pos) +static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t *pos) { - struct kern_ipc_perm *ipc = NULL; - int max_idx = ipc_get_maxidx(ids); + int tmpidx; + struct kern_ipc_perm *ipc; - if (max_idx == -1 || pos > max_idx) - goto out; + /* convert from position to idr index -> "-1" */ + tmpidx = *pos - 1; - for (; pos <= max_idx; pos++) { - ipc = idr_find(&ids->ipcs_idr, pos); - if (ipc != NULL) { - rcu_read_lock(); - ipc_lock_object(ipc); - break; - } + ipc = idr_get_next(&ids->ipcs_idr, &tmpidx); + if (ipc != NULL) { + rcu_read_lock(); + ipc_lock_object(ipc); + + /* convert from idr index to position -> "+1" */ + *pos = tmpidx + 1; } -out: - *new_pos = pos + 1; return ipc; } @@ -817,11 +826,13 @@ static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) if (ipc && ipc != SEQ_START_TOKEN) ipc_unlock(ipc); - return sysvipc_find_ipc(&iter->ns->ids[iface->ids], *pos, pos); + /* Next -> search for *pos+1 */ + (*pos)++; + return sysvipc_find_ipc(&iter->ns->ids[iface->ids], pos); } /* - * File positions: pos 0 -> header, pos n -> ipc id = n - 1. + * File positions: pos 0 -> header, pos n -> ipc idx = n - 1. * SeqFile iterator: iterator value locked ipc pointer or SEQ_TOKEN_START. 
*/ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) @@ -846,8 +857,8 @@ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) if (*pos == 0) return SEQ_START_TOKEN; - /* Find the (pos-1)th ipc */ - return sysvipc_find_ipc(ids, *pos - 1, pos); + /* Otherwise return the correct ipc structure */ + return sysvipc_find_ipc(ids, pos); } static void sysvipc_proc_stop(struct seq_file *s, void *it) -- GitLab From 64367f2e4f11cfdd983c637d219fb364ab85558c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 11 Aug 2022 13:44:34 +0200 Subject: [PATCH 0204/2223] treewide: defconfig: address renamed CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO is now implicitly selected if one picks one of the explicit options that could be DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT, DEBUG_INFO_DWARF4, DEBUG_INFO_DWARF5. This was actually not what I had in mind when I suggested making it a 'choice' statement, but it's too late to change again now, and the Kconfig logic is more sensible in the new form. Change any defconfig file that had CONFIG_DEBUG_INFO enabled but did not pick DWARF4 or DWARF5 explicitly to now pick the toolchain default. 
Link: https://lkml.kernel.org/r/20220811114609.2097335-1-arnd@kernel.org Fixes: f9b3cd245784 ("Kconfig.debug: make DEBUG_INFO selectable from a choice") Signed-off-by: Arnd Bergmann Reviewed-by: Kees Cook Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Vineet Gupta Cc: Michal Simek Cc: Thomas Bogendoerfer Cc: Dinh Nguyen Cc: Yoshinori Sato Cc: Rich Felker Cc: Richard Weinberger Cc: Anton Ivanov Cc: Johannes Berg Cc: Chris Zankel Cc: Max Filippov Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- arch/alpha/configs/defconfig | 2 +- arch/arc/configs/tb10x_defconfig | 2 +- arch/microblaze/configs/mmu_defconfig | 2 +- arch/mips/configs/bcm47xx_defconfig | 2 +- arch/mips/configs/cavium_octeon_defconfig | 2 +- arch/mips/configs/ci20_defconfig | 2 +- arch/mips/configs/cu1000-neo_defconfig | 2 +- arch/mips/configs/cu1830-neo_defconfig | 2 +- arch/mips/configs/generic_defconfig | 2 +- arch/mips/configs/omega2p_defconfig | 2 +- arch/mips/configs/qi_lb60_defconfig | 2 +- arch/mips/configs/vocore2_defconfig | 2 +- arch/nios2/configs/10m50_defconfig | 2 +- arch/nios2/configs/3c120_defconfig | 2 +- arch/sh/configs/apsh4a3a_defconfig | 2 +- arch/sh/configs/apsh4ad0a_defconfig | 2 +- arch/sh/configs/edosk7760_defconfig | 2 +- arch/sh/configs/magicpanelr2_defconfig | 2 +- arch/sh/configs/polaris_defconfig | 2 +- arch/sh/configs/r7780mp_defconfig | 2 +- arch/sh/configs/r7785rp_defconfig | 2 +- arch/sh/configs/rsk7203_defconfig | 2 +- arch/sh/configs/sdk7780_defconfig | 2 +- arch/sh/configs/se7712_defconfig | 2 +- arch/sh/configs/se7721_defconfig | 2 +- arch/sh/configs/sh2007_defconfig | 2 +- arch/sh/configs/sh7757lcr_defconfig | 2 +- arch/sh/configs/sh7785lcr_32bit_defconfig | 2 +- arch/sh/configs/urquell_defconfig | 2 +- arch/um/configs/i386_defconfig | 2 +- arch/um/configs/x86_64_defconfig | 2 +- arch/xtensa/configs/audio_kc705_defconfig | 2 +- arch/xtensa/configs/cadence_csp_defconfig | 2 +- arch/xtensa/configs/generic_kc705_defconfig | 2 +- 
arch/xtensa/configs/nommu_kc705_defconfig | 2 +- arch/xtensa/configs/smp_lx200_defconfig | 2 +- arch/xtensa/configs/virt_defconfig | 2 +- arch/xtensa/configs/xip_kc705_defconfig | 2 +- 38 files changed, 38 insertions(+), 38 deletions(-) diff --git a/arch/alpha/configs/defconfig b/arch/alpha/configs/defconfig index 7e93369308800..6a39fe8ce9e5f 100644 --- a/arch/alpha/configs/defconfig +++ b/arch/alpha/configs/defconfig @@ -65,7 +65,7 @@ CONFIG_NFSD=m CONFIG_NLS_CODEPAGE_437=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_ALPHA_LEGACY_START_ADDRESS=y CONFIG_MATHEMU=y CONFIG_CRYPTO_HMAC=y diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig index d93b65008d4af..4a94d1684ed60 100644 --- a/arch/arc/configs/tb10x_defconfig +++ b/arch/arc/configs/tb10x_defconfig @@ -90,7 +90,7 @@ CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y CONFIG_HEADERS_INSTALL=y diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig index 51337fffb9473..8150daf04a76c 100644 --- a/arch/microblaze/configs/mmu_defconfig +++ b/arch/microblaze/configs/mmu_defconfig @@ -83,7 +83,7 @@ CONFIG_CIFS=y CONFIG_CIFS_STATS2=y CONFIG_ENCRYPTED_KEYS=y CONFIG_DMA_CMA=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KGDB=y CONFIG_KGDB_TESTS=y CONFIG_KGDB_KDB=y diff --git a/arch/mips/configs/bcm47xx_defconfig b/arch/mips/configs/bcm47xx_defconfig index 91ce75edbfb46..22ffde722bb9e 100644 --- a/arch/mips/configs/bcm47xx_defconfig +++ b/arch/mips/configs/bcm47xx_defconfig @@ -72,7 +72,7 @@ CONFIG_LEDS_TRIGGER_TIMER=y CONFIG_LEDS_TRIGGER_DEFAULT_ON=y CONFIG_CRC32_SARWATE=y CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_INFO_REDUCED=y 
CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/configs/cavium_octeon_defconfig b/arch/mips/configs/cavium_octeon_defconfig index 97ceaf080c0c7..7f021a327566b 100644 --- a/arch/mips/configs/cavium_octeon_defconfig +++ b/arch/mips/configs/cavium_octeon_defconfig @@ -162,7 +162,7 @@ CONFIG_CRYPTO_SHA1_OCTEON=m CONFIG_CRYPTO_SHA256_OCTEON=m CONFIG_CRYPTO_SHA512_OCTEON=m CONFIG_CRYPTO_DES=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/mips/configs/ci20_defconfig b/arch/mips/configs/ci20_defconfig index cc69b215854ea..955b6ac581ab7 100644 --- a/arch/mips/configs/ci20_defconfig +++ b/arch/mips/configs/ci20_defconfig @@ -199,7 +199,7 @@ CONFIG_NLS_UTF8=y CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=32 CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/configs/cu1000-neo_defconfig b/arch/mips/configs/cu1000-neo_defconfig index 5bd55eb32fe55..1cbc9302e1d14 100644 --- a/arch/mips/configs/cu1000-neo_defconfig +++ b/arch/mips/configs/cu1000-neo_defconfig @@ -113,7 +113,7 @@ CONFIG_PRINTK_TIME=y CONFIG_CONSOLE_LOGLEVEL_DEFAULT=15 CONFIG_CONSOLE_LOGLEVEL_QUIET=15 CONFIG_MESSAGE_LOGLEVEL_DEFAULT=7 -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/configs/cu1830-neo_defconfig b/arch/mips/configs/cu1830-neo_defconfig index cc69688962e8c..a0f73f3cd6ce7 100644 --- a/arch/mips/configs/cu1830-neo_defconfig +++ b/arch/mips/configs/cu1830-neo_defconfig @@ -116,7 +116,7 @@ CONFIG_PRINTK_TIME=y CONFIG_CONSOLE_LOGLEVEL_DEFAULT=15 CONFIG_CONSOLE_LOGLEVEL_QUIET=15 CONFIG_MESSAGE_LOGLEVEL_DEFAULT=7 -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y diff --git 
a/arch/mips/configs/generic_defconfig b/arch/mips/configs/generic_defconfig index 714169e411cf0..bbc0d9b1c3983 100644 --- a/arch/mips/configs/generic_defconfig +++ b/arch/mips/configs/generic_defconfig @@ -83,7 +83,7 @@ CONFIG_ROOT_NFS=y # CONFIG_XZ_DEC_ARMTHUMB is not set # CONFIG_XZ_DEC_SPARC is not set CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_INFO_REDUCED=y CONFIG_DEBUG_FS=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/mips/configs/omega2p_defconfig b/arch/mips/configs/omega2p_defconfig index fc39ddf610a9b..6a5cb2d6de6b0 100644 --- a/arch/mips/configs/omega2p_defconfig +++ b/arch/mips/configs/omega2p_defconfig @@ -116,7 +116,7 @@ CONFIG_CRYPTO_LZO=y CONFIG_CRC16=y CONFIG_XZ_DEC=y CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y diff --git a/arch/mips/configs/qi_lb60_defconfig b/arch/mips/configs/qi_lb60_defconfig index b4448d0876d57..7e5d9741bd5dd 100644 --- a/arch/mips/configs/qi_lb60_defconfig +++ b/arch/mips/configs/qi_lb60_defconfig @@ -166,7 +166,7 @@ CONFIG_NLS_UTF8=y CONFIG_FONTS=y CONFIG_FONT_SUN8x16=y CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_READABLE_ASM=y CONFIG_KGDB=y diff --git a/arch/mips/configs/vocore2_defconfig b/arch/mips/configs/vocore2_defconfig index a14f8ea5c3867..302cab9bd7bd3 100644 --- a/arch/mips/configs/vocore2_defconfig +++ b/arch/mips/configs/vocore2_defconfig @@ -116,7 +116,7 @@ CONFIG_CRYPTO_LZO=y CONFIG_CRC16=y CONFIG_XZ_DEC=y CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y diff --git a/arch/nios2/configs/10m50_defconfig b/arch/nios2/configs/10m50_defconfig index a7967b4cfb6ed..91c3fce4dc7fe 100644 --- a/arch/nios2/configs/10m50_defconfig +++ b/arch/nios2/configs/10m50_defconfig @@ -74,4 +74,4 
@@ CONFIG_NFS_FS=y CONFIG_NFS_V3_ACL=y CONFIG_ROOT_NFS=y CONFIG_SUNRPC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y diff --git a/arch/nios2/configs/3c120_defconfig b/arch/nios2/configs/3c120_defconfig index 423a0c40a1627..c42ad7e162a36 100644 --- a/arch/nios2/configs/3c120_defconfig +++ b/arch/nios2/configs/3c120_defconfig @@ -71,4 +71,4 @@ CONFIG_NFS_FS=y CONFIG_NFS_V3_ACL=y CONFIG_ROOT_NFS=y CONFIG_SUNRPC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y diff --git a/arch/sh/configs/apsh4a3a_defconfig b/arch/sh/configs/apsh4a3a_defconfig index 530498f189904..99931a13a74da 100644 --- a/arch/sh/configs/apsh4a3a_defconfig +++ b/arch/sh/configs/apsh4a3a_defconfig @@ -85,7 +85,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_PREEMPT is not set # CONFIG_DEBUG_BUGVERBOSE is not set -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y # CONFIG_FTRACE is not set # CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set diff --git a/arch/sh/configs/apsh4ad0a_defconfig b/arch/sh/configs/apsh4ad0a_defconfig index 6abd9bd701061..d9fb124bf015a 100644 --- a/arch/sh/configs/apsh4ad0a_defconfig +++ b/arch/sh/configs/apsh4ad0a_defconfig @@ -116,7 +116,7 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_SHIRQ=y CONFIG_DETECT_HUNG_TASK=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_VM=y CONFIG_DWARF_UNWINDER=y # CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/sh/configs/edosk7760_defconfig b/arch/sh/configs/edosk7760_defconfig index d77f54e906fd0..f427a95bcd21e 100644 --- a/arch/sh/configs/edosk7760_defconfig +++ b/arch/sh/configs/edosk7760_defconfig @@ -107,7 +107,7 @@ CONFIG_DEBUG_SHIRQ=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_SCHED_DEBUG is not set CONFIG_TIMER_STATS=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_CRYPTO=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y diff --git a/arch/sh/configs/magicpanelr2_defconfig 
b/arch/sh/configs/magicpanelr2_defconfig index 0989ed9295408..ef1d98e35c91f 100644 --- a/arch/sh/configs/magicpanelr2_defconfig +++ b/arch/sh/configs/magicpanelr2_defconfig @@ -84,7 +84,7 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y # CONFIG_SCHED_DEBUG is not set CONFIG_DEBUG_KOBJECT=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_POINTER=y CONFIG_CRC_CCITT=m CONFIG_CRC16=m diff --git a/arch/sh/configs/polaris_defconfig b/arch/sh/configs/polaris_defconfig index 246408ec7462b..f42e4867ddc1a 100644 --- a/arch/sh/configs/polaris_defconfig +++ b/arch/sh/configs/polaris_defconfig @@ -79,5 +79,5 @@ CONFIG_DETECT_HUNG_TASK=y CONFIG_DEBUG_RT_MUTEXES=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_DEBUG_SPINLOCK_SLEEP=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_SG=y diff --git a/arch/sh/configs/r7780mp_defconfig b/arch/sh/configs/r7780mp_defconfig index f823cc6b18f91..e527cd60a1910 100644 --- a/arch/sh/configs/r7780mp_defconfig +++ b/arch/sh/configs/r7780mp_defconfig @@ -101,7 +101,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_DEBUG_PREEMPT is not set -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_HMAC=y diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig index f96bc20d4b1ac..a3f952a83d970 100644 --- a/arch/sh/configs/r7785rp_defconfig +++ b/arch/sh/configs/r7785rp_defconfig @@ -96,7 +96,7 @@ CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_PREEMPT is not set CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_DEBUG_LOCKING_API_SELFTESTS=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_SH_STANDARD_BIOS=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_4KSTACKS=y diff --git a/arch/sh/configs/rsk7203_defconfig b/arch/sh/configs/rsk7203_defconfig index 5a54e2b883f0a..d00fafc021e1a 100644 --- a/arch/sh/configs/rsk7203_defconfig +++ b/arch/sh/configs/rsk7203_defconfig @@ -112,7 +112,7 @@ 
CONFIG_DETECT_HUNG_TASK=y CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_MUTEXES=y CONFIG_DEBUG_SPINLOCK_SLEEP=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_VM=y CONFIG_DEBUG_LIST=y CONFIG_DEBUG_SG=y diff --git a/arch/sh/configs/sdk7780_defconfig b/arch/sh/configs/sdk7780_defconfig index 7d6d323598480..41cb588ca99cb 100644 --- a/arch/sh/configs/sdk7780_defconfig +++ b/arch/sh/configs/sdk7780_defconfig @@ -131,7 +131,7 @@ CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_SCHED_DEBUG is not set CONFIG_TIMER_STATS=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_SH_STANDARD_BIOS=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig index ee6d28ae08deb..36356223d51c8 100644 --- a/arch/sh/configs/se7712_defconfig +++ b/arch/sh/configs/se7712_defconfig @@ -93,7 +93,7 @@ CONFIG_CRAMFS=y CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_POINTER=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig index bad921bc10f8f..46c5a263a2392 100644 --- a/arch/sh/configs/se7721_defconfig +++ b/arch/sh/configs/se7721_defconfig @@ -121,7 +121,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_CODEPAGE_932=y CONFIG_NLS_ISO8859_1=y CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_POINTER=y # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRC_CCITT=y diff --git a/arch/sh/configs/sh2007_defconfig b/arch/sh/configs/sh2007_defconfig index 79f02f1c0dc83..259c69e3fa227 100644 --- a/arch/sh/configs/sh2007_defconfig +++ b/arch/sh/configs/sh2007_defconfig @@ -159,7 +159,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DETECT_SOFTLOCKUP is not set # CONFIG_SCHED_DEBUG is not set -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_POINTER=y 
CONFIG_SH_STANDARD_BIOS=y CONFIG_CRYPTO_NULL=y diff --git a/arch/sh/configs/sh7757lcr_defconfig b/arch/sh/configs/sh7757lcr_defconfig index a2700ab165afb..2579dc4bc0c8f 100644 --- a/arch/sh/configs/sh7757lcr_defconfig +++ b/arch/sh/configs/sh7757lcr_defconfig @@ -80,6 +80,6 @@ CONFIG_NLS_ISO8859_1=y CONFIG_DEBUG_KERNEL=y # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_BUGVERBOSE is not set -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y # CONFIG_FTRACE is not set # CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/sh/configs/sh7785lcr_32bit_defconfig b/arch/sh/configs/sh7785lcr_32bit_defconfig index 7eb3c10f28ad2..781ff13227fc9 100644 --- a/arch/sh/configs/sh7785lcr_32bit_defconfig +++ b/arch/sh/configs/sh7785lcr_32bit_defconfig @@ -141,7 +141,7 @@ CONFIG_DEBUG_KMEMLEAK=y CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_MUTEXES=y CONFIG_DEBUG_SPINLOCK_SLEEP=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_LATENCYTOP=y # CONFIG_FTRACE is not set CONFIG_CRYPTO_HMAC=y diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig index cb2f56468fe02..ea773c764c5a8 100644 --- a/arch/sh/configs/urquell_defconfig +++ b/arch/sh/configs/urquell_defconfig @@ -139,7 +139,7 @@ CONFIG_PRINTK_TIME=y CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_POINTER=y # CONFIG_FTRACE is not set # CONFIG_DUMP_CODE is not set diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig index fb51bd206dbed..c0162286d68b7 100644 --- a/arch/um/configs/i386_defconfig +++ b/arch/um/configs/i386_defconfig @@ -69,5 +69,5 @@ CONFIG_JOLIET=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_NLS=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_KERNEL=y diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig index 477b873174243..bec6e5d956873 100644 --- a/arch/um/configs/x86_64_defconfig 
+++ b/arch/um/configs/x86_64_defconfig @@ -67,6 +67,6 @@ CONFIG_JOLIET=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_NLS=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_FRAME_WARN=1024 CONFIG_DEBUG_KERNEL=y diff --git a/arch/xtensa/configs/audio_kc705_defconfig b/arch/xtensa/configs/audio_kc705_defconfig index 3be62da8089b8..ef0ebcfbccf91 100644 --- a/arch/xtensa/configs/audio_kc705_defconfig +++ b/arch/xtensa/configs/audio_kc705_defconfig @@ -120,7 +120,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y CONFIG_LOCKUP_DETECTOR=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/xtensa/configs/cadence_csp_defconfig b/arch/xtensa/configs/cadence_csp_defconfig index fc240737b14de..2665962d247a7 100644 --- a/arch/xtensa/configs/cadence_csp_defconfig +++ b/arch/xtensa/configs/cadence_csp_defconfig @@ -100,7 +100,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y CONFIG_LOCKUP_DETECTOR=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/xtensa/configs/generic_kc705_defconfig b/arch/xtensa/configs/generic_kc705_defconfig index e9d6b6f6eca11..236c7f23cc10a 100644 --- a/arch/xtensa/configs/generic_kc705_defconfig +++ b/arch/xtensa/configs/generic_kc705_defconfig @@ -107,7 +107,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y CONFIG_LOCKUP_DETECTOR=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/xtensa/configs/nommu_kc705_defconfig b/arch/xtensa/configs/nommu_kc705_defconfig index fcb620ef37997..8263da9e078d7 100644 --- a/arch/xtensa/configs/nommu_kc705_defconfig +++ b/arch/xtensa/configs/nommu_kc705_defconfig @@ -105,7 +105,7 @@ 
CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y # CONFIG_FRAME_POINTER is not set CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_VM=y diff --git a/arch/xtensa/configs/smp_lx200_defconfig b/arch/xtensa/configs/smp_lx200_defconfig index a47c85638ec11..7bdffa3a69c60 100644 --- a/arch/xtensa/configs/smp_lx200_defconfig +++ b/arch/xtensa/configs/smp_lx200_defconfig @@ -111,7 +111,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_VM=y CONFIG_LOCKUP_DETECTOR=y diff --git a/arch/xtensa/configs/virt_defconfig b/arch/xtensa/configs/virt_defconfig index 6d1387dfa96fc..98acb7191cb77 100644 --- a/arch/xtensa/configs/virt_defconfig +++ b/arch/xtensa/configs/virt_defconfig @@ -97,7 +97,7 @@ CONFIG_CRYPTO_DEV_VIRTIO=y CONFIG_FONTS=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y # CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y diff --git a/arch/xtensa/configs/xip_kc705_defconfig b/arch/xtensa/configs/xip_kc705_defconfig index 062148e171351..1c3cebaaa71ba 100644 --- a/arch/xtensa/configs/xip_kc705_defconfig +++ b/arch/xtensa/configs/xip_kc705_defconfig @@ -102,7 +102,7 @@ CONFIG_CRYPTO_LZO=y CONFIG_CRYPTO_ANSI_CPRNG=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y CONFIG_DETECT_HUNG_TASK=y # CONFIG_SCHED_DEBUG is not set -- GitLab From 5bb6ce3aeb02497668a4e8971268b041aa61de8d Mon Sep 17 00:00:00 2001 From: "Fabio M. 
De Francesco" Date: Mon, 1 Aug 2022 14:27:09 +0200 Subject: [PATCH 0205/2223] fs/isofs: replace kmap() with kmap_local_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The use of kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). Tasks can be preempted and, when scheduled to run again, the kernel virtual addresses are restored and still valid. It is faster than kmap() in kernels with HIGHMEM enabled. Since kmap_local_page() can be safely used in compress.c, it should be called everywhere instead of kmap(). Therefore, replace kmap() with kmap_local_page() in compress.c. Where it is needed, use memzero_page() instead of open coding kmap_local_page() plus memset() to fill the pages with zeros. Delete the redundant flush_dcache_page() in the two call sites of memzero_page(). Tested with mkisofs on a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220801122709.8164-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. 
De Francesco Suggested-by: Ira Weiny Reviewed-by: Jan Kara Cc: Matthew Wilcox (Oracle) Cc: Roman Gushchin Cc: Pali Rohár Cc: Muchun Song Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/isofs/compress.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index b466172eec25b..09285e82eb08c 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -67,8 +67,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, for ( i = 0 ; i < pcount ; i++ ) { if (!pages[i]) continue; - memset(page_address(pages[i]), 0, PAGE_SIZE); - flush_dcache_page(pages[i]); + memzero_page(pages[i], 0, PAGE_SIZE); SetPageUptodate(pages[i]); } return ((loff_t)pcount) << PAGE_SHIFT; @@ -120,7 +119,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, zerr != Z_STREAM_END) { if (!stream.avail_out) { if (pages[curpage]) { - stream.next_out = page_address(pages[curpage]) + stream.next_out = kmap_local_page(pages[curpage]) + poffset; stream.avail_out = PAGE_SIZE - poffset; poffset = 0; @@ -176,6 +175,10 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, flush_dcache_page(pages[curpage]); SetPageUptodate(pages[curpage]); } + if (stream.next_out != (unsigned char *)zisofs_sink_page) { + kunmap_local(stream.next_out); + stream.next_out = NULL; + } curpage++; } if (!stream.avail_in) @@ -183,6 +186,8 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, } inflate_out: zlib_inflateEnd(&stream); + if (stream.next_out && stream.next_out != (unsigned char *)zisofs_sink_page) + kunmap_local(stream.next_out); z_eio: mutex_unlock(&zisofs_zlib_lock); @@ -283,9 +288,7 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount, } if (poffset && *pages) { - memset(page_address(*pages) + poffset, 0, - PAGE_SIZE - poffset); - flush_dcache_page(*pages); + memzero_page(*pages, poffset, PAGE_SIZE - 
poffset); SetPageUptodate(*pages); } return 0; @@ -343,10 +346,8 @@ static int zisofs_read_folio(struct file *file, struct folio *folio) for (i = 0; i < pcount; i++, index++) { if (i != full_page) pages[i] = grab_cache_page_nowait(mapping, index); - if (pages[i]) { + if (pages[i]) ClearPageError(pages[i]); - kmap(pages[i]); - } } err = zisofs_fill_pages(inode, full_page, pcount, pages); @@ -357,7 +358,6 @@ static int zisofs_read_folio(struct file *file, struct folio *folio) flush_dcache_page(pages[i]); if (i == full_page && err) SetPageError(pages[i]); - kunmap(pages[i]); unlock_page(pages[i]); if (i != full_page) put_page(pages[i]); -- GitLab From defdaff15a84c68521c5f02b157fc8541e0356f3 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Sat, 13 Aug 2022 15:00:34 -0700 Subject: [PATCH 0206/2223] checkpatch: add kmap and kmap_atomic to the deprecated list kmap() and kmap_atomic() are being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. kmap_local_page() is safe from any context and is therefore redundant with kmap_atomic() with the exception of any pagefault or preemption disable requirements. However, using kmap_atomic() for these side effects makes the code less clear. So any requirement for pagefault or preemption disable should be made explicitly. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored. 
Link: https://lkml.kernel.org/r/20220813220034.806698-1-ira.weiny@intel.com Signed-off-by: Ira Weiny Suggested-by: Thomas Gleixner Suggested-by: Fabio M. De Francesco Reviewed-by: Chaitanya Kulkarni Cc: Joe Perches Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 79e759aac543b..9ff219e0a9d56 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -807,6 +807,8 @@ our %deprecated_apis = ( "rcu_barrier_sched" => "rcu_barrier", "get_state_synchronize_sched" => "get_state_synchronize_rcu", "cond_synchronize_sched" => "cond_synchronize_rcu", + "kmap" => "kmap_local_page", + "kmap_atomic" => "kmap_local_page", ); #Create a search pattern for all these strings to speed up a loop below -- GitLab From 9847f21225c4eb0b843cb2b72ed83b32edb1e6f2 Mon Sep 17 00:00:00 2001 From: Neel Natu Date: Thu, 28 Jul 2022 16:24:34 -0700 Subject: [PATCH 0207/2223] lib/cmdline: avoid page fault in next_arg An argument list like "arg=val arg2 \"" can trigger a page fault if the page pointed by 'args[0xffffffff]' is not mapped and potential memory corruption otherwise (unlikely but possible if the bogus address is mapped and contents happen to match the ascii value of the quote character). The fix is to ensure that we load 'args[i-1]' only when (i > 0). 
Prior to this commit the following command would trigger an unhandled page fault in the kernel: root@(none):/linus/fs/fat# insmod ./fat.ko "foo=bar \"" [ 33.870507] BUG: unable to handle page fault for address: ffff888204252608 [ 33.872180] #PF: supervisor read access in kernel mode [ 33.873414] #PF: error_code(0x0000) - not-present page [ 33.874650] PGD 4401067 P4D 4401067 PUD 0 [ 33.875321] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC PTI [ 33.876113] CPU: 16 PID: 399 Comm: insmod Not tainted 5.19.0-dbg-DEV #4 [ 33.877193] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-4 04/01/2014 [ 33.878739] RIP: 0010:next_arg+0xd1/0x110 [ 33.879399] Code: 22 75 1d 41 c6 04 01 00 41 80 f8 22 74 18 eb 35 4c 89 0e 45 31 d2 4c 89 cf 48 c7 02 00 00 00 00 41 80 f8 22 75 1f 41 8d 42 ff <41> 80 3c 01 22 75 14 41 c6 04 01 00 eb 0d 48 c7 02 00 00 00 00 41 [ 33.882338] RSP: 0018:ffffc90001253d08 EFLAGS: 00010246 [ 33.883174] RAX: 00000000ffffffff RBX: ffff888104252608 RCX: 0fc317bba1c1dd00 [ 33.884311] RDX: ffffc90001253d40 RSI: ffffc90001253d48 RDI: ffff888104252609 [ 33.885450] RBP: ffffc90001253d10 R08: 0000000000000022 R09: ffff888104252609 [ 33.886595] R10: 0000000000000000 R11: ffffffff82c7ff20 R12: 0000000000000282 [ 33.887748] R13: 00000000ffff8000 R14: 0000000000000000 R15: 0000000000007fff [ 33.888887] FS: 00007f04ec7432c0(0000) GS:ffff88813d300000(0000) knlGS:0000000000000000 [ 33.890183] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 33.891111] CR2: ffff888204252608 CR3: 0000000100f36005 CR4: 0000000000170ee0 [ 33.892241] Call Trace: [ 33.892641] [ 33.892989] parse_args+0x8f/0x220 [ 33.893538] load_module+0x138b/0x15a0 [ 33.894149] ? prepare_coming_module+0x50/0x50 [ 33.894879] ? 
kernel_read_file_from_fd+0x5f/0x90 [ 33.895639] __se_sys_finit_module+0xce/0x130 [ 33.896342] __x64_sys_finit_module+0x1d/0x20 [ 33.897042] do_syscall_64+0x44/0xa0 [ 33.897622] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 33.898434] RIP: 0033:0x7f04ec85ef79 [ 33.899009] Code: 48 8d 3d da db 0d 00 0f 05 eb a5 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c7 9e 0d 00 f7 d8 64 89 01 48 [ 33.901912] RSP: 002b:00007fffae81bfe8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 33.903081] RAX: ffffffffffffffda RBX: 0000559c5f1d2640 RCX: 00007f04ec85ef79 [ 33.904191] RDX: 0000000000000000 RSI: 0000559c5f1d12a0 RDI: 0000000000000003 [ 33.905304] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [ 33.906421] R10: 0000000000000003 R11: 0000000000000246 R12: 0000559c5f1d12a0 [ 33.907526] R13: 0000000000000000 R14: 0000559c5f1d25f0 R15: 0000559c5f1d12a0 [ 33.908631] [ 33.908986] Modules linked in: fat(+) [last unloaded: fat] [ 33.909843] CR2: ffff888204252608 [ 33.910375] ---[ end trace 0000000000000000 ]--- [ 33.911172] RIP: 0010:next_arg+0xd1/0x110 [ 33.911796] Code: 22 75 1d 41 c6 04 01 00 41 80 f8 22 74 18 eb 35 4c 89 0e 45 31 d2 4c 89 cf 48 c7 02 00 00 00 00 41 80 f8 22 75 1f 41 8d 42 ff <41> 80 3c 01 22 75 14 41 c6 04 01 00 eb 0d 48 c7 02 00 00 00 00 41 [ 33.914643] RSP: 0018:ffffc90001253d08 EFLAGS: 00010246 [ 33.915446] RAX: 00000000ffffffff RBX: ffff888104252608 RCX: 0fc317bba1c1dd00 [ 33.916544] RDX: ffffc90001253d40 RSI: ffffc90001253d48 RDI: ffff888104252609 [ 33.917636] RBP: ffffc90001253d10 R08: 0000000000000022 R09: ffff888104252609 [ 33.918727] R10: 0000000000000000 R11: ffffffff82c7ff20 R12: 0000000000000282 [ 33.919821] R13: 00000000ffff8000 R14: 0000000000000000 R15: 0000000000007fff [ 33.920908] FS: 00007f04ec7432c0(0000) GS:ffff88813d300000(0000) knlGS:0000000000000000 [ 33.922125] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 33.923017] CR2: ffff888204252608 
CR3: 0000000100f36005 CR4: 0000000000170ee0 [ 33.924098] Kernel panic - not syncing: Fatal exception [ 33.925776] Kernel Offset: disabled [ 33.926347] Rebooting in 10 seconds.. Link: https://lkml.kernel.org/r/20220728232434.1666488-1-neelnatu@google.com Signed-off-by: Neel Natu Reviewed-by: Eric Dumazet Signed-off-by: Andrew Morton --- lib/cmdline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/cmdline.c b/lib/cmdline.c index 5546bf5887806..90ed997d95701 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -260,7 +260,7 @@ char *next_arg(char *args, char **param, char **val) args[i-1] = '\0'; } } - if (quoted && args[i-1] == '"') + if (quoted && i > 0 && args[i-1] == '"') args[i-1] = '\0'; if (args[i]) { -- GitLab From 7bb5da0d490b2d836c5218f5186ee588d2145310 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Thu, 30 Jun 2022 23:32:57 +0100 Subject: [PATCH 0208/2223] kexec: turn all kexec_mutex acquisitions into trylocks Patch series "kexec, panic: Making crash_kexec() NMI safe", v4. This patch (of 2): Most acquisitions of kexec_mutex are done via mutex_trylock() - those were a direct "translation" from: 8c5a1cf0ad3a ("kexec: use a mutex for locking rather than xchg()") there have however been two additions since then that use mutex_lock(): crash_get_memory_size() and crash_shrink_memory(). A later commit will replace said mutex with an atomic variable, and locking operations will become atomic_cmpxchg(). Rather than having those mutex_lock() become while (atomic_cmpxchg(&lock, 0, 1)), turn them into trylocks that can return -EBUSY on acquisition failure. This does halve the printable size of the crash kernel, but that's still neighbouring 2G for 32bit kernels which should be ample enough. Link: https://lkml.kernel.org/r/20220630223258.4144112-1-vschneid@redhat.com Link: https://lkml.kernel.org/r/20220630223258.4144112-2-vschneid@redhat.com Signed-off-by: Valentin Schneider Cc: Arnd Bergmann Cc: "Eric W .
Biederman" Cc: Juri Lelli Cc: Luis Claudio R. Goncalves Cc: Miaohe Lin Cc: Petr Mladek Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: Baoquan He Signed-off-by: Andrew Morton --- include/linux/kexec.h | 2 +- kernel/kexec_core.c | 12 ++++++++---- kernel/ksysfs.c | 7 ++++++- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 13e6c4b58f07d..41a686996aaa3 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -427,7 +427,7 @@ extern int kexec_load_disabled; extern bool kexec_in_progress; int crash_shrink_memory(unsigned long new_size); -size_t crash_get_memory_size(void); +ssize_t crash_get_memory_size(void); #ifndef arch_kexec_protect_crashkres /* diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index acd029b307e42..8fa4eeb95b305 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1004,13 +1004,16 @@ void crash_kexec(struct pt_regs *regs) } } -size_t crash_get_memory_size(void) +ssize_t crash_get_memory_size(void) { - size_t size = 0; + ssize_t size = 0; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; - mutex_lock(&kexec_mutex); if (crashk_res.end != crashk_res.start) size = resource_size(&crashk_res); + mutex_unlock(&kexec_mutex); return size; } @@ -1022,7 +1025,8 @@ int crash_shrink_memory(unsigned long new_size) unsigned long old_size; struct resource *ram_res; - mutex_lock(&kexec_mutex); + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; if (kexec_crash_image) { ret = -ENOENT; diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index b1292a57c2a53..65dba9076f312 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -105,7 +105,12 @@ KERNEL_ATTR_RO(kexec_crash_loaded); static ssize_t kexec_crash_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%zu\n", crash_get_memory_size()); + ssize_t size = crash_get_memory_size(); + + if (size < 0) + return size; + + return sprintf(buf, "%zd\n", size); } static ssize_t 
kexec_crash_size_store(struct kobject *kobj, struct kobj_attribute *attr, -- GitLab From 05c6257433b7212f07a7e53479a8ab038fc1666a Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Thu, 30 Jun 2022 23:32:58 +0100 Subject: [PATCH 0209/2223] panic, kexec: make __crash_kexec() NMI safe Attempting to get a crash dump out of a debug PREEMPT_RT kernel via an NMI panic() doesn't work. The cause of that lies in the PREEMPT_RT definition of mutex_trylock(): if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) return 0; This prevents an nmi_panic() from executing the main body of __crash_kexec() which does the actual kexec into the kdump kernel. The warning and return are explained by: 6ce47fd961fa ("rtmutex: Warn if trylock is called from hard/softirq context") [...] The reasons for this are: 1) There is a potential deadlock in the slowpath 2) Another cpu which blocks on the rtmutex will boost the task which allegedly locked the rtmutex, but that cannot work because the hard/softirq context borrows the task context. Furthermore, grabbing the lock isn't NMI safe, so do away with kexec_mutex and replace it with an atomic variable. This is somewhat overzealous as *some* callsites could keep using a mutex (e.g. the sysfs-facing ones like crash_shrink_memory()), but this has the benefit of involving a single unified lock and preventing any future NMI-related surprises. Tested by triggering NMI panics via: $ echo 1 > /proc/sys/kernel/panic_on_unrecovered_nmi $ echo 1 > /proc/sys/kernel/unknown_nmi_panic $ echo 1 > /proc/sys/kernel/panic $ ipmitool power diag Link: https://lkml.kernel.org/r/20220630223258.4144112-3-vschneid@redhat.com Fixes: 6ce47fd961fa ("rtmutex: Warn if trylock is called from hard/softirq context") Signed-off-by: Valentin Schneider Cc: Arnd Bergmann Cc: Baoquan He Cc: "Eric W . Biederman" Cc: Juri Lelli Cc: Luis Claudio R. 
Goncalves Cc: Miaohe Lin Cc: Petr Mladek Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/kexec.c | 11 ++++------- kernel/kexec_core.c | 20 ++++++++++---------- kernel/kexec_file.c | 4 ++-- kernel/kexec_internal.h | 15 ++++++++++++++- 4 files changed, 30 insertions(+), 20 deletions(-) diff --git a/kernel/kexec.c b/kernel/kexec.c index b5e40f0697681..cb8e6e6f983c7 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -93,13 +93,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, /* * Because we write directly to the reserved memory region when loading - * crash kernels we need a mutex here to prevent multiple crash kernels - * from attempting to load simultaneously, and to prevent a crash kernel - * from loading over the top of a in use crash kernel. - * - * KISS: always take the mutex. + * crash kernels we need a serialization here to prevent multiple crash + * kernels from attempting to load simultaneously. */ - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (flags & KEXEC_ON_CRASH) { @@ -165,7 +162,7 @@ out: kimage_free(image); out_unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return ret; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 8fa4eeb95b305..5ca4d40c9ec13 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -46,7 +46,7 @@ #include #include "kexec_internal.h" -DEFINE_MUTEX(kexec_mutex); +atomic_t __kexec_lock = ATOMIC_INIT(0); /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; @@ -959,7 +959,7 @@ late_initcall(kexec_core_sysctl_init); */ void __noclone __crash_kexec(struct pt_regs *regs) { - /* Take the kexec_mutex here to prevent sys_kexec_load + /* Take the kexec_lock here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel * we are using after a panic on a different cpu. 
* @@ -967,7 +967,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) * of memory the xchg(&kexec_crash_image) would be * sufficient. But since I reuse the memory... */ - if (mutex_trylock(&kexec_mutex)) { + if (kexec_trylock()) { if (kexec_crash_image) { struct pt_regs fixed_regs; @@ -976,7 +976,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) machine_crash_shutdown(&fixed_regs); machine_kexec(kexec_crash_image); } - mutex_unlock(&kexec_mutex); + kexec_unlock(); } } STACK_FRAME_NON_STANDARD(__crash_kexec); @@ -1008,13 +1008,13 @@ ssize_t crash_get_memory_size(void) { ssize_t size = 0; - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (crashk_res.end != crashk_res.start) size = resource_size(&crashk_res); - mutex_unlock(&kexec_mutex); + kexec_unlock(); return size; } @@ -1025,7 +1025,7 @@ int crash_shrink_memory(unsigned long new_size) unsigned long old_size; struct resource *ram_res; - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (kexec_crash_image) { @@ -1064,7 +1064,7 @@ int crash_shrink_memory(unsigned long new_size) insert_resource(&iomem_resource, ram_res); unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return ret; } @@ -1136,7 +1136,7 @@ int kernel_kexec(void) { int error = 0; - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (!kexec_image) { error = -EINVAL; @@ -1212,6 +1212,6 @@ int kernel_kexec(void) #endif Unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return error; } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 1d546dc97c502..45637511e0de6 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -339,7 +339,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, image = NULL; - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; dest_image = &kexec_image; @@ -411,7 +411,7 @@ out: if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); - 
mutex_unlock(&kexec_mutex); + kexec_unlock(); kimage_free(image); return ret; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 48aaf2ac0d0d1..74da1409cd14b 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -13,7 +13,20 @@ void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); -extern struct mutex kexec_mutex; +/* + * Whatever is used to serialize accesses to the kexec_crash_image needs to be + * NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a + * "simple" atomic variable that is acquired with a cmpxchg(). + */ +extern atomic_t __kexec_lock; +static inline bool kexec_trylock(void) +{ + return atomic_cmpxchg_acquire(&__kexec_lock, 0, 1) == 0; +} +static inline void kexec_unlock(void) +{ + atomic_set_release(&__kexec_lock, 0); +} #ifdef CONFIG_KEXEC_FILE #include -- GitLab From 4f1d2a030db09af4d21695983674a18633cd0ffb Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 12 Jul 2022 16:49:17 +0200 Subject: [PATCH 0210/2223] llist: use try_cmpxchg in llist_add_batch and llist_del_first Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in llist_add_batch and llist_del_first. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg. Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. No functional change intended. 
Link: https://lkml.kernel.org/r/20220712144917.4497-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- lib/llist.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/lib/llist.c b/lib/llist.c index 611ce4881a875..7d78b736e8afd 100644 --- a/lib/llist.c +++ b/lib/llist.c @@ -30,7 +30,7 @@ bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, do { new_last->next = first = READ_ONCE(head->first); - } while (cmpxchg(&head->first, first, new_first) != first); + } while (!try_cmpxchg(&head->first, &first, new_first)); return !first; } @@ -52,18 +52,14 @@ EXPORT_SYMBOL_GPL(llist_add_batch); */ struct llist_node *llist_del_first(struct llist_head *head) { - struct llist_node *entry, *old_entry, *next; + struct llist_node *entry, *next; entry = smp_load_acquire(&head->first); - for (;;) { + do { if (entry == NULL) return NULL; - old_entry = entry; next = READ_ONCE(entry->next); - entry = cmpxchg(&head->first, old_entry, next); - if (entry == old_entry) - break; - } + } while (!try_cmpxchg(&head->first, &entry, next)); return entry; } -- GitLab From f4068af3a6383da3487c07de85db7732de851734 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 15 Aug 2022 12:50:04 +0300 Subject: [PATCH 0211/2223] proc: save LOC in vsyscall test Do one fork in vsyscall detection code and let SIGSEGV handler exit and carry information to the parent saving LOC. 
[adobriyan@gmail.com: redo original patch, delete unnecessary variables, minimise code changes] Link: https://lkml.kernel.org/r/YvoWzAn5dlhF75xa@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Brian Foster Reviewed-by: Brian Foster Tested-by: Brian Foster Signed-off-by: Andrew Morton --- tools/testing/selftests/proc/proc-pid-vm.c | 56 +++++++--------------- 1 file changed, 16 insertions(+), 40 deletions(-) diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c index e5962f4794f56..69551bfa215c4 100644 --- a/tools/testing/selftests/proc/proc-pid-vm.c +++ b/tools/testing/selftests/proc/proc-pid-vm.c @@ -213,22 +213,22 @@ static int make_exe(const uint8_t *payload, size_t len) /* * 0: vsyscall VMA doesn't exist vsyscall=none - * 1: vsyscall VMA is r-xp vsyscall=emulate - * 2: vsyscall VMA is --xp vsyscall=xonly + * 1: vsyscall VMA is --xp vsyscall=xonly + * 2: vsyscall VMA is r-xp vsyscall=emulate */ -static int g_vsyscall; +static volatile int g_vsyscall; static const char *str_vsyscall; static const char str_vsyscall_0[] = ""; static const char str_vsyscall_1[] = -"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; -static const char str_vsyscall_2[] = "ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n"; +static const char str_vsyscall_2[] = +"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; #ifdef __x86_64__ static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___) { - _exit(1); + _exit(g_vsyscall); } /* @@ -255,6 +255,7 @@ static void vsyscall(void) act.sa_sigaction = sigaction_SIGSEGV; (void)sigaction(SIGSEGV, &act, NULL); + g_vsyscall = 0; /* gettimeofday(NULL, NULL); */ asm volatile ( "call %P0" @@ -262,45 +263,20 @@ static void vsyscall(void) : "i" (0xffffffffff600000), "D" (NULL), "S" (NULL) : "rax", "rcx", "r11" ); - exit(0); - } - waitpid(pid, &wstatus, 0); - if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) { - /* vsyscall 
page exists and is executable. */ - } else { - /* vsyscall page doesn't exist. */ - g_vsyscall = 0; - return; - } - - pid = fork(); - if (pid < 0) { - fprintf(stderr, "fork, errno %d\n", errno); - exit(1); - } - if (pid == 0) { - struct rlimit rlim = {0, 0}; - (void)setrlimit(RLIMIT_CORE, &rlim); - - /* Hide "segfault at ffffffffff600000" messages. */ - struct sigaction act; - memset(&act, 0, sizeof(struct sigaction)); - act.sa_flags = SA_SIGINFO; - act.sa_sigaction = sigaction_SIGSEGV; - (void)sigaction(SIGSEGV, &act, NULL); + g_vsyscall = 1; *(volatile int *)0xffffffffff600000UL; - exit(0); + + g_vsyscall = 2; + exit(g_vsyscall); } waitpid(pid, &wstatus, 0); - if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) { - /* vsyscall page is readable and executable. */ - g_vsyscall = 1; - return; + if (WIFEXITED(wstatus)) { + g_vsyscall = WEXITSTATUS(wstatus); + } else { + fprintf(stderr, "error: wstatus %08x\n", wstatus); + exit(1); } - - /* vsyscall page is executable but unreadable. */ - g_vsyscall = 2; } int main(void) -- GitLab From 2be9880dc87342dc7ae459c9ea5c9ee2a45b33d8 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 19 Aug 2022 09:44:06 +0800 Subject: [PATCH 0212/2223] kernel: exit: cleanup release_thread() Only x86 has own release_thread(), introduce a new weak release_thread() function to clean empty definitions in other ARCHs. Link: https://lkml.kernel.org/r/20220819014406.32266-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Guo Ren [csky] Acked-by: Russell King (Oracle) Acked-by: Geert Uytterhoeven Acked-by: Brian Cain Acked-by: Michael Ellerman [powerpc] Acked-by: Stafford Horne [openrisc] Acked-by: Catalin Marinas [arm64] Acked-by: Huacai Chen [LoongArch] Cc: Alexander Gordeev Cc: Anton Ivanov Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: Dave Hansen Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Guo Ren [csky] Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Johannes Berg Cc: Jonas Bonn Cc: Matt Turner Cc: Max Filippov Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Xuerui Wang Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/alpha/include/asm/processor.h | 2 -- arch/alpha/kernel/process.c | 5 ----- arch/arc/include/asm/processor.h | 3 --- arch/arm/include/asm/processor.h | 3 --- arch/arm/kernel/process.c | 4 ---- arch/arm64/include/asm/processor.h | 3 --- arch/arm64/kernel/process.c | 4 ---- arch/csky/include/asm/processor.h | 5 ----- arch/hexagon/include/asm/processor.h | 4 ---- arch/hexagon/kernel/process.c | 7 ------- arch/ia64/include/asm/processor.h | 7 ------- arch/loongarch/include/asm/processor.h | 3 --- arch/m68k/include/asm/processor.h | 5 ----- arch/microblaze/include/asm/processor.h | 5 ----- arch/mips/include/asm/processor.h | 3 --- arch/nios2/include/asm/processor.h | 5 ----- arch/openrisc/include/asm/processor.h | 1 - arch/openrisc/kernel/process.c | 4 ---- arch/parisc/include/asm/processor.h | 3 --- arch/parisc/kernel/process.c | 4 ---- arch/powerpc/include/asm/processor.h | 1 - arch/powerpc/kernel/process.c | 5 ----- arch/riscv/include/asm/processor.h | 5 ----- arch/s390/include/asm/processor.h | 3 --- arch/sh/include/asm/processor_32.h | 3 --- arch/sh/kernel/process_32.c | 5 ----- arch/sparc/include/asm/processor_32.h | 3 --- arch/sparc/include/asm/processor_64.h | 3 --- arch/um/include/asm/processor-generic.h | 4 ---- arch/x86/include/asm/processor.h | 3 --- arch/xtensa/include/asm/processor.h | 3 --- include/linux/sched/task.h | 3 +++ kernel/exit.c | 4 ++++ 33 files changed, 7 insertions(+), 118 deletions(-) diff --git 
a/arch/alpha/include/asm/processor.h b/arch/alpha/include/asm/processor.h index 43e234c518b1c..714abe494e5fd 100644 --- a/arch/alpha/include/asm/processor.h +++ b/arch/alpha/include/asm/processor.h @@ -36,8 +36,6 @@ extern void start_thread(struct pt_regs *, unsigned long, unsigned long); /* Free all resources held by a thread. */ struct task_struct; -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index e2e25f8b5e76c..dbf1bc5e2ad25 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -225,11 +225,6 @@ flush_thread(void) current_thread_info()->pcb.unique = 0; } -void -release_thread(struct task_struct *dead_task) -{ -} - /* * Copy architecture-specific thread state */ diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h index 54db9d7bb562d..fb844fce1ab67 100644 --- a/arch/arc/include/asm/processor.h +++ b/arch/arc/include/asm/processor.h @@ -43,9 +43,6 @@ struct task_struct; #define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_SIZE + (void *)task_stack_page(p)) - 1) -/* Free all resources held by a thread */ -#define release_thread(thread) do { } while (0) - /* * A lot of busy-wait loops in SMP are based off of non-volatile data otherwise * get optimised away by gcc diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index bdc35c0e8dfb9..326864f79d18f 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -81,9 +81,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, /* Forward declaration, a strange C thing */ struct task_struct; -/* Free all resources held by a thread. 
*/ -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); #define task_pt_regs(p) \ diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 3d9cace638840..712d3e6d9be90 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -232,10 +232,6 @@ void flush_thread(void) thread_notify(THREAD_NOTIFY_FLUSH, thread); } -void release_thread(struct task_struct *dead_task) -{ -} - asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 86eb0bfe3b380..4cfb4cd1d4759 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -323,9 +323,6 @@ static inline bool is_ttbr1_addr(unsigned long addr) /* Forward declaration, a strange C thing */ struct task_struct; -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); void update_sctlr_el1(u64 sctlr); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 92bcc1768f0b9..9015f49c206ef 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -279,10 +279,6 @@ void flush_thread(void) flush_tagged_addr_state(); } -void release_thread(struct task_struct *dead_task) -{ -} - void arch_release_task_struct(struct task_struct *tsk) { fpsimd_release_task(tsk); diff --git a/arch/csky/include/asm/processor.h b/arch/csky/include/asm/processor.h index 9638206bc44f7..63ad71fab30d7 100644 --- a/arch/csky/include/asm/processor.h +++ b/arch/csky/include/asm/processor.h @@ -69,11 +69,6 @@ do { \ /* Forward declaration, a strange C thing */ struct task_struct; -/* Free all resources held by a thread. 
*/ -static inline void release_thread(struct task_struct *dead_task) -{ -} - /* Prepare to copy thread state - unlazy all lazy status */ #define prepare_to_copy(tsk) do { } while (0) diff --git a/arch/hexagon/include/asm/processor.h b/arch/hexagon/include/asm/processor.h index 615f7e49968e6..0cd39c2cdf8f7 100644 --- a/arch/hexagon/include/asm/processor.h +++ b/arch/hexagon/include/asm/processor.h @@ -60,10 +60,6 @@ struct thread_struct { #define KSTK_EIP(tsk) (pt_elr(task_pt_regs(tsk))) #define KSTK_ESP(tsk) (pt_psp(task_pt_regs(tsk))) -/* Free all resources held by a thread; defined in process.c */ -extern void release_thread(struct task_struct *dead_task); - -/* Get wait channel for task P. */ extern unsigned long __get_wchan(struct task_struct *p); /* The following stuff is pretty HEXAGON specific. */ diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index f0552f98a7bae..e15eeaebd7853 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -112,13 +112,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) return 0; } -/* - * Release any architecture-specific resources locked by thread - */ -void release_thread(struct task_struct *dead_task) -{ -} - /* * Some archs flush debug and FPU info here */ diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index 757c2f6d8d4b8..d1978e0040548 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -318,13 +318,6 @@ struct thread_struct { struct mm_struct; struct task_struct; -/* - * Free all resources held by a thread. This is called after the - * parent of DEAD_TASK has collected the exit status of the task via - * wait(). - */ -#define release_thread(dead_task) - /* Get wait channel for task P. 
*/ extern unsigned long __get_wchan (struct task_struct *p); diff --git a/arch/loongarch/include/asm/processor.h b/arch/loongarch/include/asm/processor.h index 1c4b4308378d4..6954dc5d24e9d 100644 --- a/arch/loongarch/include/asm/processor.h +++ b/arch/loongarch/include/asm/processor.h @@ -176,9 +176,6 @@ struct thread_struct { struct task_struct; -/* Free all resources held by a thread. */ -#define release_thread(thread) do { } while (0) - enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_HALT, IDLE_NOMWAIT, IDLE_POLL}; extern unsigned long boot_option_idle_override; diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index d86b4009880b4..7a2da780830b8 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -145,11 +145,6 @@ static inline void start_thread(struct pt_regs * regs, unsigned long pc, /* Forward declaration, a strange C thing */ struct task_struct; -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - unsigned long __get_wchan(struct task_struct *p); void show_registers(struct pt_regs *regs); diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h index 7e9e92670df33..4e193c7550dfa 100644 --- a/arch/microblaze/include/asm/processor.h +++ b/arch/microblaze/include/asm/processor.h @@ -63,11 +63,6 @@ struct thread_struct { .pgdir = swapper_pg_dir, \ } -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - unsigned long __get_wchan(struct task_struct *p); /* The size allocated for kernel stacks. This _must_ be a power of two! 
*/ diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index 4bb24579d12e4..3fde1ff72bd16 100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h @@ -344,9 +344,6 @@ struct thread_struct { struct task_struct; -/* Free all resources held by a thread. */ -#define release_thread(thread) do { } while(0) - /* * Do necessary setup to start up a newly executed thread. */ diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h index b8125dfbcad2d..8916d93d5c2d0 100644 --- a/arch/nios2/include/asm/processor.h +++ b/arch/nios2/include/asm/processor.h @@ -64,11 +64,6 @@ extern void start_thread(struct pt_regs *regs, unsigned long pc, struct task_struct; -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - extern unsigned long __get_wchan(struct task_struct *p); #define task_pt_regs(p) \ diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h index aa1699c18add8..ed9efb430afa1 100644 --- a/arch/openrisc/include/asm/processor.h +++ b/arch/openrisc/include/asm/processor.h @@ -72,7 +72,6 @@ struct thread_struct { void start_thread(struct pt_regs *regs, unsigned long nip, unsigned long sp); -void release_thread(struct task_struct *); unsigned long __get_wchan(struct task_struct *p); #define cpu_relax() barrier() diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 52dc983ddeba3..f94b5ec06786e 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -125,10 +125,6 @@ void show_regs(struct pt_regs *regs) show_registers(regs); } -void release_thread(struct task_struct *dead_task) -{ -} - /* * Copy the thread-specific (arch specific) info from the current * process to the new one p diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index 4621ceb513147..a608970b249af 100644 --- 
a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -266,9 +266,6 @@ on downward growing arches, it looks like this: struct mm_struct; -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - extern unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) ((tsk)->thread.regs.iaoq[0]) diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 7c37e09c92da6..3db0e97e6c066 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -146,10 +146,6 @@ void flush_thread(void) */ } -void release_thread(struct task_struct *dead_task) -{ -} - /* * Idle thread support * diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index fdfaae194ddd5..92e332415d02c 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -75,7 +75,6 @@ extern int _chrp_type; struct task_struct; void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp); -void release_thread(struct task_struct *); #define TS_FPR(i) fp_state.fpr[i][TS_FPROFFSET] #define TS_CKFPR(i) ckfp_state.fpr[i][TS_FPROFFSET] diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 0fbda89cd1bb5..991cda25b9a9e 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1655,11 +1655,6 @@ EXPORT_SYMBOL_GPL(set_thread_tidr); #endif /* CONFIG_PPC64 */ -void -release_thread(struct task_struct *t) -{ -} - /* * this gets called so that we can store coprocessor state into memory and * copy the current task into the new thread. 
diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 19eedd4af4cde..94a0590c69710 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -65,11 +65,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, extern void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp); -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - extern unsigned long __get_wchan(struct task_struct *p); diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index bd66f8e349492..c52fe651eebab 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -186,9 +186,6 @@ struct pt_regs; void show_registers(struct pt_regs *regs); void show_cacheinfo(struct seq_file *m); -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *tsk) { } - /* Free guarded storage control block */ void guarded_storage_release(struct task_struct *tsk); void gs_load_bc_cb(struct pt_regs *regs); diff --git a/arch/sh/include/asm/processor_32.h b/arch/sh/include/asm/processor_32.h index 45240ec6b85a4..27aebf1e75a20 100644 --- a/arch/sh/include/asm/processor_32.h +++ b/arch/sh/include/asm/processor_32.h @@ -127,9 +127,6 @@ struct task_struct; extern void start_thread(struct pt_regs *regs, unsigned long new_pc, unsigned long new_sp); -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - /* * FPU lazy state save handling. 
*/ diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index a808843375e71..92b6649d49295 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -84,11 +84,6 @@ void flush_thread(void) #endif } -void release_thread(struct task_struct *dead_task) -{ - /* do nothing */ -} - asmlinkage void ret_from_fork(void); asmlinkage void ret_from_kernel_thread(void); diff --git a/arch/sparc/include/asm/processor_32.h b/arch/sparc/include/asm/processor_32.h index b26c35336b51d..ba8b70ffec085 100644 --- a/arch/sparc/include/asm/processor_32.h +++ b/arch/sparc/include/asm/processor_32.h @@ -80,9 +80,6 @@ static inline void start_thread(struct pt_regs * regs, unsigned long pc, : "memory"); } -/* Free all resources held by a thread. */ -#define release_thread(tsk) do { } while(0) - unsigned long __get_wchan(struct task_struct *); #define task_pt_regs(tsk) ((tsk)->thread.kregs) diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h index 89850dff6b033..2667f35d5ea56 100644 --- a/arch/sparc/include/asm/processor_64.h +++ b/arch/sparc/include/asm/processor_64.h @@ -176,9 +176,6 @@ do { \ regs->tstate &= ~TSTATE_PEF; \ } while (0) -/* Free all resources held by a thread. */ -#define release_thread(tsk) do { } while (0) - unsigned long __get_wchan(struct task_struct *task); #define task_pt_regs(tsk) (task_thread_info(tsk)->kregs) diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index d0fc1862da957..bb5f06480da95 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -55,10 +55,6 @@ struct thread_struct { .request = { 0 } \ } -static inline void release_thread(struct task_struct *task) -{ -} - /* * User space process size: 3GB (default). 
*/ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 356308c739514..67c9d73b31faa 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -587,9 +587,6 @@ static inline void load_sp0(unsigned long sp0) #endif /* CONFIG_PARAVIRT_XXL */ -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); /* diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index 76bc63127c66e..5abde43c570cd 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -221,9 +221,6 @@ struct thread_struct { struct task_struct; struct mm_struct; -/* Free all resources held by a thread. */ -#define release_thread(thread) do { } while(0) - extern unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 81cab4b01edcb..d6c48163c6def 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -127,6 +127,9 @@ static inline void put_task_struct_many(struct task_struct *t, int nr) void put_task_struct_rcu_user(struct task_struct *task); +/* Free all architecture-specific resources held by a thread. 
*/ +void release_thread(struct task_struct *dead_task); + #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; #else diff --git a/kernel/exit.c b/kernel/exit.c index 84021b24f79e3..f4b7b058f4e69 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -183,6 +183,10 @@ void put_task_struct_rcu_user(struct task_struct *task) call_rcu(&task->rcu, delayed_put_task_struct); } +void __weak release_thread(struct task_struct *dead_task) +{ +} + void release_task(struct task_struct *p) { struct task_struct *leader; -- GitLab From cba7543e1515e79a85d37df85a0eb2cf0f07d115 Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Fri, 19 Aug 2022 08:18:19 +0000 Subject: [PATCH 0213/2223] fs/qnx6: delete unnecessary checks before brelse() brelse() tests whether its argument is NULL and then returns immediately. Thus remove the tests which are not needed around the shown calls. Link: https://lkml.kernel.org/r/20220819081819.96347-1-chi.minghao@zte.com.cn Signed-off-by: Minghao Chi Reported-by: Zeal Robot Cc: CGEL ZTE Cc: Matthew Wilcox Cc: Minghao Chi Cc: Muchun Song Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/qnx6/inode.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index b9895afca9d11..85b2fa3b211c9 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -470,10 +470,8 @@ out2: out1: iput(sbi->inodes); out: - if (bh1) - brelse(bh1); - if (bh2) - brelse(bh2); + brelse(bh1); + brelse(bh2); outnobh: kfree(qs); s->s_fs_info = NULL; -- GitLab From aa06a9bd853306c239f759018fb227d7e8f4e203 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Sat, 20 Aug 2022 19:18:13 +0100 Subject: [PATCH 0214/2223] ia64: fix clock_getres(CLOCK_MONOTONIC) to report ITC frequency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit clock_gettime(CLOCK_MONOTONIC, &tp) is very precise on ia64 as it uses ITC (similar to rdtsc on x86). 
It's not quite a hrtimer as it is a few times slower than 1ns. Usually 2-3ns. clock_getres(CLOCK_MONOTONIC, &res) never reflected that fact and reported 0.004s precision (1/HZ value). In https://bugs.gentoo.org/596382 gstreamer's test suite failed loudly when it noticed the precision discrepancy. Before the change: clock_getres(CLOCK_MONOTONIC, &res) reported 250Hz precision. After the change: clock_getres(CLOCK_MONOTONIC, &res) reports ITC (400MHz) precision. The patch is based on matoro's fix. I added a bit of explanation why we need to special-case arch-specific clock_getres(). [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20220820181813.2275195-1-slyich@gmail.com Signed-off-by: Sergei Trofimovich Cc: matoro Cc: Émeric Maschino Signed-off-by: Andrew Morton --- arch/ia64/kernel/sys_ia64.c | 26 ++++++++++++++++++++++++++ arch/ia64/kernel/syscalls/syscall.tbl | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index e14db25146c22..215bf3f8cb204 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -166,3 +166,29 @@ ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, u force_successful_syscall_return(); return addr; } + +asmlinkage long +ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *tp) +{ + /* + * ia64's clock_gettime() syscall is implemented as a vdso call + * fsys_clock_gettime(). Currently it handles only + * CLOCK_REALTIME and CLOCK_MONOTONIC. Both are based on + * 'ar.itc' counter which gets incremented at a constant + * frequency. It's usually 400MHz, ~2.5x times slower than CPU + * clock frequency. Which is almost a 1ns hrtimer, but not quite. + * + * Let's special-case these timers to report correct precision + * based on ITC frequency and not HZ frequency for supported + * clocks.
+ */ + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + s64 tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq); + struct timespec64 rtn_tp = ns_to_timespec64(tick_ns); + return put_timespec64(&rtn_tp, tp); + } + + return sys_clock_getres(which_clock, tp); +} diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 78b1d03e86e1d..72c929d9902b9 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -240,7 +240,7 @@ 228 common timer_delete sys_timer_delete 229 common clock_settime sys_clock_settime 230 common clock_gettime sys_clock_gettime -231 common clock_getres sys_clock_getres +231 common clock_getres ia64_clock_getres 232 common clock_nanosleep sys_clock_nanosleep 233 common fstatfs64 sys_fstatfs64 234 common statfs64 sys_statfs64 -- GitLab From 693fc06e98514c2d5951ead4aca40cf8b21100b1 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 14 Jul 2022 19:32:55 +0200 Subject: [PATCH 0215/2223] epoll: use try_cmpxchg in list_add_tail_lockless Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in list_add_tail_lockless. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). No functional change intended. Link: https://lkml.kernel.org/r/20220714173255.12987-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Cc: Alexander Viro Signed-off-by: Andrew Morton --- fs/eventpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8b56b94e2f56f..52954d4637b54 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1065,7 +1065,7 @@ static inline bool list_add_tail_lockless(struct list_head *new, * added to the list from another CPU: the winner observes * new->next == new. 
*/ - if (cmpxchg(&new->next, new, head) != new) + if (!try_cmpxchg(&new->next, &new, head)) return false; /* -- GitLab From b0192296b45232872720d969366449c001ab1f4a Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 14 Jul 2022 19:16:53 +0200 Subject: [PATCH 0216/2223] buffer: use try_cmpxchg in discard_buffer Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in discard_buffer. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. Note that the value from *ptr should be read using READ_ONCE to prevent the compiler from merging, refetching or reordering the read. No functional change intended. Link: https://lkml.kernel.org/r/20220714171653.12128-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Cc: Alexander Viro Signed-off-by: Andrew Morton --- fs/buffer.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 55e762a58eb65..2fd98bb7d74ee 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1464,19 +1464,15 @@ EXPORT_SYMBOL(set_bh_page); static void discard_buffer(struct buffer_head * bh) { - unsigned long b_state, b_state_old; + unsigned long b_state; lock_buffer(bh); clear_buffer_dirty(bh); bh->b_bdev = NULL; - b_state = bh->b_state; - for (;;) { - b_state_old = cmpxchg(&bh->b_state, b_state, - (b_state & ~BUFFER_FLAGS_DISCARD)); - if (b_state_old == b_state) - break; - b_state = b_state_old; - } + b_state = READ_ONCE(bh->b_state); + do { + } while (!try_cmpxchg(&bh->b_state, &b_state, + b_state & ~BUFFER_FLAGS_DISCARD)); unlock_buffer(bh); } -- GitLab From 38ace0d513d9d2556beca4d07102d25e9a73c53c Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 14 Jul 2022 18:48:51 +0200 Subject: [PATCH 0217/2223] aio: use atomic_try_cmpxchg in __get_reqs_available Use atomic_try_cmpxchg instead of 
atomic_cmpxchg (*ptr, old, new) == old in __get_reqs_available. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, atomic_try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. No functional change intended. Link: https://lkml.kernel.org/r/20220714164851.3055-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Cc: Benjamin LaHaise Cc: Alexander Viro Signed-off-by: Andrew Morton --- fs/aio.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 606613e9d1f4f..5b2ff20ad3229 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -951,16 +951,13 @@ static bool __get_reqs_available(struct kioctx *ctx) local_irq_save(flags); kcpu = this_cpu_ptr(ctx->cpu); if (!kcpu->reqs_available) { - int old, avail = atomic_read(&ctx->reqs_available); + int avail = atomic_read(&ctx->reqs_available); do { if (avail < ctx->req_batch) goto out; - - old = avail; - avail = atomic_cmpxchg(&ctx->reqs_available, - avail, avail - ctx->req_batch); - } while (avail != old); + } while (!atomic_try_cmpxchg(&ctx->reqs_available, + &avail, avail - ctx->req_batch)); kcpu->reqs_available += ctx->req_batch; } -- GitLab From da3f52ba359590d8eb465ae0d8b6e11d6fd9432f Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 21 Aug 2022 21:30:11 +0200 Subject: [PATCH 0218/2223] iversion: use atomic64_try_cmpxchg Use atomic64_try_cmpxchg instead of atomic64_cmpxchg (*ptr, old, new) == old in inode_set_max_iversion_raw, inode_maybe_inc_iversion and inode_query_iversion. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications.
The loop in inode_maybe_inc_iversion improves from: 5563: 48 89 ca mov %rcx,%rdx 5566: 48 89 c8 mov %rcx,%rax 5569: 48 83 e2 fe and $0xfffffffffffffffe,%rdx 556d: 48 83 c2 02 add $0x2,%rdx 5571: f0 48 0f b1 16 lock cmpxchg %rdx,(%rsi) 5576: 48 39 c1 cmp %rax,%rcx 5579: 0f 84 85 fc ff ff je 5204 <...> 557f: 48 89 c1 mov %rax,%rcx 5582: eb df jmp 5563 <...> to: 5563: 48 89 c2 mov %rax,%rdx 5566: 48 83 e2 fe and $0xfffffffffffffffe,%rdx 556a: 48 83 c2 02 add $0x2,%rdx 556e: f0 48 0f b1 11 lock cmpxchg %rdx,(%rcx) 5573: 0f 84 8b fc ff ff je 5204 <...> 5579: eb e8 jmp 5563 <...> Link: https://lkml.kernel.org/r/20220821193011.88208-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/iversion.h | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/include/linux/iversion.h b/include/linux/iversion.h index 3bfebde5a1a6d..eb5a158101693 100644 --- a/include/linux/iversion.h +++ b/include/linux/iversion.h @@ -123,17 +123,12 @@ inode_peek_iversion_raw(const struct inode *inode) static inline void inode_set_max_iversion_raw(struct inode *inode, u64 val) { - u64 cur, old; + u64 cur = inode_peek_iversion_raw(inode); - cur = inode_peek_iversion_raw(inode); - for (;;) { + do { if (cur > val) break; - old = atomic64_cmpxchg(&inode->i_version, cur, val); - if (likely(old == cur)) - break; - cur = old; - } + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, val)); } /** @@ -197,7 +192,7 @@ inode_set_iversion_queried(struct inode *inode, u64 val) static inline bool inode_maybe_inc_iversion(struct inode *inode, bool force) { - u64 cur, old, new; + u64 cur, new; /* * The i_version field is not strictly ordered with any other inode @@ -211,19 +206,14 @@ inode_maybe_inc_iversion(struct inode *inode, bool force) */ smp_mb(); cur = inode_peek_iversion_raw(inode); - for (;;) { + do { /* If flag is clear then we needn't do anything */ if (!force && !(cur & I_VERSION_QUERIED)) return false; /* Since 
lowest bit is flag, add 2 to avoid it */ new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; - - old = atomic64_cmpxchg(&inode->i_version, cur, new); - if (likely(old == cur)) - break; - cur = old; - } + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return true; } @@ -304,10 +294,10 @@ inode_peek_iversion(const struct inode *inode) static inline u64 inode_query_iversion(struct inode *inode) { - u64 cur, old, new; + u64 cur, new; cur = inode_peek_iversion_raw(inode); - for (;;) { + do { /* If flag is already set, then no need to swap */ if (cur & I_VERSION_QUERIED) { /* @@ -320,11 +310,7 @@ inode_query_iversion(struct inode *inode) } new = cur | I_VERSION_QUERIED; - old = atomic64_cmpxchg(&inode->i_version, cur, new); - if (likely(old == cur)) - break; - cur = old; - } + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return cur >> I_VERSION_QUERIED_SHIFT; } -- GitLab From 948084f0f6959f602f89f679522b706a72da0285 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Sun, 21 Aug 2022 20:25:19 +0200 Subject: [PATCH 0219/2223] kexec: replace kmap() with kmap_local_page() kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. Since its use in kexec_core.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in kexec_core.c. 
Tested on a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220821182519.9483-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Acked-by: Baoquan He Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 5ca4d40c9ec13..ca2743f9c634e 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -809,7 +809,7 @@ static int kimage_load_normal_segment(struct kimage *image, if (result < 0) goto out; - ptr = kmap(page); + ptr = kmap_local_page(page); /* Start with a clear page */ clear_page(ptr); ptr += maddr & ~PAGE_MASK; @@ -822,7 +822,7 @@ static int kimage_load_normal_segment(struct kimage *image, memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); - kunmap(page); + kunmap_local(ptr); if (result) { result = -EFAULT; goto out; @@ -873,7 +873,7 @@ static int kimage_load_crash_segment(struct kimage *image, goto out; } arch_kexec_post_alloc_pages(page_address(page), 1, 0); - ptr = kmap(page); + ptr = kmap_local_page(page); ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, PAGE_SIZE - (maddr & ~PAGE_MASK)); @@ -889,7 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image, else result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); - kunmap(page); + kunmap_local(ptr); arch_kexec_pre_free_pages(page_address(page), 1); if (result) { result = -EFAULT; -- GitLab From d75e9a4bccf4e19928534dec797935e650f85b09 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Sun, 21 Aug 2022 20:03:58 +0200 Subject: [PATCH 0220/2223] hfs: unmap the page in the "fail_page" label Patch series "hfs: Replace kmap() with kmap_local_page()". kmap() is being deprecated in favor of kmap_local_page(). 
There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Since its use in fs/hfs is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in fs/hfs. Where possible, use the suitable standard helpers (memzero_page(), memcpy_page()) instead of open coding kmap_local_page() plus memset() or memcpy(). Fix a bug due to a page not being unmapped if the code jumps to the "fail_page" label (1/3). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. This patch (of 3): Several paths within hfs_btree_open() jump to the "fail_page" label where put_page() is called while the page is still mapped. Call kunmap() to unmap the page right before put_page(). Link: https://lkml.kernel.org/r/20220821180400.8198-1-fmdefrancesco@gmail.com Link: https://lkml.kernel.org/r/20220821180400.8198-2-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Reviewed-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Arnd Bergmann Cc: Chaitanya Kulkarni Cc: Christian Brauner (Microsoft) Cc: Damien Le Moal Cc: Matthew Wilcox Cc: Jeff Layton Cc: Jens Axboe Cc: Kees Cook Cc: Martin K.
Petersen Cc: Muchun Song Cc: Roman Gushchin Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/hfs/btree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 19017d2961734..56c6782436e9c 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -124,6 +124,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke return tree; fail_page: + kunmap(page); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfs_aops; -- GitLab From ca0ac8dfd35b218b3c95d3b38c695fbff35d94ca Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Sun, 21 Aug 2022 20:03:59 +0200 Subject: [PATCH 0221/2223] hfs: replace kmap() with kmap_local_page() in bnode.c kmap() is being deprecated in favor of kmap_local_page(). Two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Since its use in bnode.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in bnode.c. Where possible, use the suited standard helpers (memzero_page(), memcpy_page()) instead of open coding kmap_local_page() plus memset() or memcpy(). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220821180400.8198-3-fmdefrancesco@gmail.com Signed-off-by: Fabio M. 
De Francesco Suggested-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Arnd Bergmann Cc: Chaitanya Kulkarni Cc: Christian Brauner (Microsoft) Cc: Damien Le Moal Cc: Jeff Layton Cc: Jens Axboe Cc: Kees Cook Cc: Martin K. Petersen Cc: Matthew Wilcox Cc: Muchun Song Cc: Roman Gushchin Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/hfs/bnode.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index c83fd0e8404d3..2015e42e752a6 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -21,7 +21,6 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) int pagenum; int bytes_read; int bytes_to_read; - void *vaddr; off += node->page_offset; pagenum = off >> PAGE_SHIFT; @@ -33,9 +32,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) page = node->page[pagenum]; bytes_to_read = min_t(int, len - bytes_read, PAGE_SIZE - off); - vaddr = kmap_atomic(page); - memcpy(buf + bytes_read, vaddr + off, bytes_to_read); - kunmap_atomic(vaddr); + memcpy_from_page(buf + bytes_read, page, off, bytes_to_read); pagenum++; off = 0; /* page offset only applies to the first page */ @@ -80,8 +77,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) off += node->page_offset; page = node->page[0]; - memcpy(kmap(page) + off, buf, len); - kunmap(page); + memcpy_to_page(page, off, buf, len); set_page_dirty(page); } @@ -105,8 +101,7 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) off += node->page_offset; page = node->page[0]; - memset(kmap(page) + off, 0, len); - kunmap(page); + memzero_page(page, off, len); set_page_dirty(page); } @@ -123,9 +118,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, src_page = src_node->page[0]; dst_page = dst_node->page[0]; - memcpy(kmap(dst_page) + dst, kmap(src_page) + src, len); - kunmap(src_page); - kunmap(dst_page); + memcpy_page(dst_page, dst, src_page, src, len); 
set_page_dirty(dst_page); } @@ -140,9 +133,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) src += node->page_offset; dst += node->page_offset; page = node->page[0]; - ptr = kmap(page); + ptr = kmap_local_page(page); memmove(ptr + dst, ptr + src, len); - kunmap(page); + kunmap_local(ptr); set_page_dirty(page); } @@ -346,13 +339,14 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) if (!test_bit(HFS_BNODE_NEW, &node->flags)) return node; - desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset); + desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) + + node->page_offset); node->prev = be32_to_cpu(desc->prev); node->next = be32_to_cpu(desc->next); node->num_recs = be16_to_cpu(desc->num_recs); node->type = desc->type; node->height = desc->height; - kunmap(node->page[0]); + kunmap_local(desc); switch (node->type) { case HFS_NODE_HEADER: @@ -436,14 +430,12 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) } pagep = node->page; - memset(kmap(*pagep) + node->page_offset, 0, - min((int)PAGE_SIZE, (int)tree->node_size)); + memzero_page(*pagep, node->page_offset, + min((int)PAGE_SIZE, (int)tree->node_size)); set_page_dirty(*pagep); - kunmap(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { - memset(kmap(*++pagep), 0, PAGE_SIZE); + memzero_page(*++pagep, 0, PAGE_SIZE); set_page_dirty(*pagep); - kunmap(*pagep); } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); -- GitLab From 21490eff121555b123f7088b935e40ee43d2c642 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Sun, 21 Aug 2022 20:04:00 +0200 Subject: [PATCH 0222/2223] hfs: replace kmap() with kmap_local_page() in btree.c kmap() is being deprecated in favor of kmap_local_page(). 
There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Since its use in btree.c is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in btree.c. Where possible, use the suitable standard helpers (memzero_page(), memcpy_page()) instead of open coding kmap_local_page() plus memset() or memcpy(). Tested in a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. Link: https://lkml.kernel.org/r/20220821180400.8198-4-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Viacheslav Dubeyko Cc: Arnd Bergmann Cc: Chaitanya Kulkarni Cc: Christian Brauner (Microsoft) Cc: Damien Le Moal Cc: Jeff Layton Cc: Jens Axboe Cc: Kees Cook Cc: Martin K.
Petersen Cc: Matthew Wilcox Cc: Muchun Song Cc: Roman Gushchin Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/hfs/btree.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 56c6782436e9c..2fa4b1f8cc7fb 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -80,7 +80,8 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke goto free_inode; /* Load the header */ - head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); tree->leaf_head = be32_to_cpu(head->leaf_head); @@ -119,12 +120,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke tree->node_size_shift = ffs(size) - 1; tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - kunmap(page); + kunmap_local(head); put_page(page); return tree; fail_page: - kunmap(page); + kunmap_local(head); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfs_aops; @@ -170,7 +171,8 @@ void hfs_btree_write(struct hfs_btree *tree) return; /* Load the header */ page = node->page[0]; - head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); head->leaf_count = cpu_to_be32(tree->leaf_count); @@ -181,7 +183,7 @@ void hfs_btree_write(struct hfs_btree *tree) head->attributes = cpu_to_be32(tree->attributes); head->depth = cpu_to_be16(tree->depth); - kunmap(page); + kunmap_local(head); set_page_dirty(page); hfs_bnode_put(node); } @@ -269,7 +271,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = 
kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; idx = 0; @@ -282,7 +284,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) idx += i; data[off] |= m; set_page_dirty(*pagep); - kunmap(*pagep); + kunmap_local(data); tree->free_nodes--; mark_inode_dirty(tree->inode); hfs_bnode_put(node); @@ -291,14 +293,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) } } if (++off >= PAGE_SIZE) { - kunmap(*pagep); - data = kmap(*++pagep); + kunmap_local(data); + data = kmap_local_page(*++pagep); off = 0; } idx += 8; len--; } - kunmap(*pagep); + kunmap_local(data); nidx = node->next; if (!nidx) { printk(KERN_DEBUG "create new bmap node...\n"); @@ -314,7 +316,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; } } @@ -361,20 +363,20 @@ void hfs_bmap_free(struct hfs_bnode *node) } off += node->page_offset + nidx / 8; page = node->page[off >> PAGE_SHIFT]; - data = kmap(page); + data = kmap_local_page(page); off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { pr_crit("trying to free free bnode %u(%d)\n", node->this, node->type); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); return; } data[off] = byte & ~m; set_page_dirty(page); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); tree->free_nodes++; mark_inode_dirty(tree->inode); -- GitLab From e1d7c7609ae0933b59840390c1b207ac0a925c8b Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 22 Aug 2022 16:38:51 +0200 Subject: [PATCH 0223/2223] bitops: use try_cmpxchg in set_mask_bits and bit_clear_unless Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in set_mask_bits and bit_clear_unless. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). 
Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. Link: https://lkml.kernel.org/r/20220822143851.3290-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/bitops.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 3b89c64bcfd8f..fb24a513d9172 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -328,10 +328,10 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, const typeof(*(ptr)) mask__ = (mask), bits__ = (bits); \ typeof(*(ptr)) old__, new__; \ \ + old__ = READ_ONCE(*(ptr)); \ do { \ - old__ = READ_ONCE(*(ptr)); \ new__ = (old__ & ~mask__) | bits__; \ - } while (cmpxchg(ptr, old__, new__) != old__); \ + } while (!try_cmpxchg(ptr, &old__, new__)); \ \ old__; \ }) @@ -343,11 +343,12 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, const typeof(*(ptr)) clear__ = (clear), test__ = (test);\ typeof(*(ptr)) old__, new__; \ \ + old__ = READ_ONCE(*(ptr)); \ do { \ - old__ = READ_ONCE(*(ptr)); \ + if (old__ & test__) \ + break; \ new__ = old__ & ~clear__; \ - } while (!(old__ & test__) && \ - cmpxchg(ptr, old__, new__) != old__); \ + } while (!try_cmpxchg(ptr, &old__, new__)); \ \ !(old__ & test__); \ }) -- GitLab From 88040e67b9533003cfd1a2d61ebd17593435322c Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 22:59:36 +0200 Subject: [PATCH 0224/2223] alpha: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. 
Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818205936.6144-1-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Richard Henderson Signed-off-by: Andrew Morton --- arch/alpha/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index b4fbbba30aa2b..33bf3a6270027 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -491,9 +491,9 @@ setup_arch(char **cmdline_p) boot flags depending on the boot mode, we need some shorthand. This should do for installation. */ if (strcmp(COMMAND_LINE, "INSTALL") == 0) { - strlcpy(command_line, "root=/dev/fd0 load_ramdisk=1", sizeof command_line); + strscpy(command_line, "root=/dev/fd0 load_ramdisk=1", sizeof(command_line)); } else { - strlcpy(command_line, COMMAND_LINE, sizeof command_line); + strscpy(command_line, COMMAND_LINE, sizeof(command_line)); } strcpy(boot_command_line, command_line); *cmdline_p = command_line; -- GitLab From 216e71f13c13a7b3df352742554445907011a3a5 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 22:59:39 +0200 Subject: [PATCH 0225/2223] ia64: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. 
Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818205940.6216-1-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Signed-off-by: Andrew Morton --- arch/ia64/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index fd6301eafa9d5..c057280442727 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -552,7 +552,7 @@ setup_arch (char **cmdline_p) ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist); *cmdline_p = __va(ia64_boot_param->command_line); - strlcpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE); + strscpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE); efi_init(); io_port_init(); -- GitLab From c97e21fe91ed1d59eb36cac1728bcb9a82167c7a Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:01:13 +0200 Subject: [PATCH 0226/2223] ocfs2: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. 
Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818210123.7637-4-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/stackglue.c | 4 ++-- fs/ocfs2/super.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index dd77b7aaabf5c..317126261523b 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -334,10 +334,10 @@ int ocfs2_cluster_connect(const char *stack_name, goto out; } - strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); + strscpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); new_conn->cc_namelen = grouplen; if (cluster_name_len) - strlcpy(new_conn->cc_cluster_name, cluster_name, + strscpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1); new_conn->cc_cluster_name_len = cluster_name_len; new_conn->cc_recovery_handler = recovery_handler; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index e2cc9eec287c9..660bc1795848a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2221,7 +2221,7 @@ static int ocfs2_initialize_super(struct super_block *sb, goto out_journal; } - strlcpy(osb->vol_label, di->id2.i_super.s_label, + strscpy(osb->vol_label, di->id2.i_super.s_label, OCFS2_MAX_VOL_LABEL_LEN); osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); -- GitLab From 512cb7e4c110133e49da9f69885df3ed41aa284f Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:01:53 +0200 Subject: [PATCH 0227/2223] reiserfs: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. 
Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818210153.8095-1-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Cc: Jan Kara Signed-off-by: Andrew Morton --- fs/reiserfs/procfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 4a7cb16e9345c..3dba8acf4e832 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -411,7 +411,7 @@ int reiserfs_proc_info_init(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, sb->s_id, BDEVNAME_SIZE); + strscpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; @@ -441,7 +441,7 @@ int reiserfs_proc_info_done(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, sb->s_id, BDEVNAME_SIZE); + strscpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; -- GitLab From a1d3a6d9f243797d1bcaa0ca14c03396bc302ca6 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:01:59 +0200 Subject: [PATCH 0228/2223] init: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. 
Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818210200.8203-1-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Signed-off-by: Andrew Morton --- init/do_mounts.c | 4 ++-- init/main.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index 7058e14ad5f70..811e94daf0a84 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -296,7 +296,7 @@ EXPORT_SYMBOL_GPL(name_to_dev_t); static int __init root_dev_setup(char *line) { - strlcpy(saved_root_name, line, sizeof(saved_root_name)); + strscpy(saved_root_name, line, sizeof(saved_root_name)); return 1; } @@ -343,7 +343,7 @@ static int __init split_fs_names(char *page, size_t size, char *names) int count = 1; char *p = page; - strlcpy(p, root_fs_names, size); + strscpy(p, root_fs_names, size); while (*p++) { if (p[-1] == ',') { p[-1] = '\0'; diff --git a/init/main.c b/init/main.c index 1fe7942f5d4a8..a45f9eca40af0 100644 --- a/init/main.c +++ b/init/main.c @@ -422,7 +422,7 @@ static void __init setup_boot_config(void) if (!data) data = xbc_get_embedded_bootconfig(&size); - strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL, bootconfig_params); @@ -762,7 +762,7 @@ void __init parse_early_param(void) return; /* All fall through to do_early_param. */ - strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); parse_early_options(tmp_cmdline); done = 1; } -- GitLab From 977bbf4385fc64986f22b024858071a35c481a8a Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:02:03 +0200 Subject: [PATCH 0229/2223] lib: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. 
Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220818210203.8251-1-wsa+renesas@sang-engineering.com Signed-off-by: Wolfram Sang Signed-off-by: Andrew Morton --- lib/earlycpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/earlycpio.c b/lib/earlycpio.c index 7921193f04243..d2c37d64fd0c3 100644 --- a/lib/earlycpio.c +++ b/lib/earlycpio.c @@ -126,7 +126,7 @@ struct cpio_data find_cpio_data(const char *path, void *data, "File %s exceeding MAX_CPIO_FILE_NAME [%d]\n", p, MAX_CPIO_FILE_NAME); } - strlcpy(cd.name, p + mypathsize, MAX_CPIO_FILE_NAME); + strscpy(cd.name, p + mypathsize, MAX_CPIO_FILE_NAME); cd.data = (void *)dptr; cd.size = ch[C_FILESIZE]; -- GitLab From 5fdfa161b2043001f82cbce49e87e8e9f581d510 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 23 Aug 2022 17:26:32 +0200 Subject: [PATCH 0230/2223] task_work: use try_cmpxchg in task_work_add, task_work_cancel_match and task_work_run Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in task_work_add, task_work_cancel_match and task_work_run. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. The patch avoids an extra memory read in case cmpxchg fails.
Link: https://lkml.kernel.org/r/20220823152632.4517-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- kernel/task_work.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/kernel/task_work.c b/kernel/task_work.c index dff75bcde1514..065e1ef8fc8d7 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -47,12 +47,12 @@ int task_work_add(struct task_struct *task, struct callback_head *work, /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack(work); + head = READ_ONCE(task->task_works); do { - head = READ_ONCE(task->task_works); if (unlikely(head == &work_exited)) return -ESRCH; work->next = head; - } while (cmpxchg(&task->task_works, head, work) != head); + } while (!try_cmpxchg(&task->task_works, &head, work)); switch (notify) { case TWA_NONE: @@ -100,10 +100,12 @@ task_work_cancel_match(struct task_struct *task, * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = READ_ONCE(*pprev))) { - if (!match(work, data)) + work = READ_ONCE(*pprev); + while (work) { + if (!match(work, data)) { pprev = &work->next; - else if (cmpxchg(pprev, work, work->next) == work) + work = READ_ONCE(*pprev); + } else if (try_cmpxchg(pprev, &work, work->next)) break; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); @@ -151,16 +153,16 @@ void task_work_run(void) * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. 
*/ + work = READ_ONCE(task->task_works); do { head = NULL; - work = READ_ONCE(task->task_works); if (!work) { if (task->flags & PF_EXITING) head = &work_exited; else break; } - } while (cmpxchg(&task->task_works, work, head) != work); + } while (!try_cmpxchg(&task->task_works, &work, head)); if (!work) break; -- GitLab From 9a15193e23b780d1da77e3db18698beb0637897d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 25 Aug 2022 16:56:03 +0200 Subject: [PATCH 0231/2223] smpboot: use atomic_try_cmpxchg in cpu_wait_death and cpu_report_death Use atomic_try_cmpxchg instead of atomic_cmpxchg (*ptr, old, new) == old in cpu_wait_death and cpu_report_death. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, atomic_try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. No functional change intended. Link: https://lkml.kernel.org/r/20220825145603.5811-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- kernel/smpboot.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/smpboot.c b/kernel/smpboot.c index b9f54544e7499..2c7396da470c5 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -433,7 +433,7 @@ bool cpu_wait_death(unsigned int cpu, int seconds) /* The outgoing CPU will normally get done quite quickly. */ if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD) - goto update_state; + goto update_state_early; udelay(5); /* But if the outgoing CPU dawdles, wait increasingly long times. */ @@ -444,16 +444,17 @@ bool cpu_wait_death(unsigned int cpu, int seconds) break; sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10); } -update_state: +update_state_early: oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); +update_state: if (oldstate == CPU_DEAD) { /* Outgoing CPU died normally, update state. */ smp_mb(); /* atomic_read() before update. 
*/ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD); } else { /* Outgoing CPU still hasn't died, set state accordingly. */ - if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), - oldstate, CPU_BROKEN) != oldstate) + if (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), + &oldstate, CPU_BROKEN)) goto update_state; ret = false; } @@ -475,14 +476,14 @@ bool cpu_report_death(void) int newstate; int cpu = smp_processor_id(); + oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); do { - oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); if (oldstate != CPU_BROKEN) newstate = CPU_DEAD; else newstate = CPU_DEAD_FROZEN; - } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), - oldstate, newstate) != oldstate); + } while (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), + &oldstate, newstate)); return newstate == CPU_DEAD; } -- GitLab From f81259c6dbcefb255fa473090cd975f3827bca89 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 26 Aug 2022 15:33:35 +0800 Subject: [PATCH 0232/2223] fail_function: switch to memdup_user_nul() helper Use memdup_user_nul() helper instead of open-coding to simplify the code. 
Link: https://lkml.kernel.org/r/20220826073337.2085798-1-yangyingliang@huawei.com Signed-off-by: Yang Yingliang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- kernel/fail_function.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 60dc825ecc2b3..03643e33e4c33 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -247,15 +247,11 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, /* cut off if it is too long */ if (count > KSYM_NAME_LEN) count = KSYM_NAME_LEN; - buf = kmalloc(count + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - if (copy_from_user(buf, buffer, count)) { - ret = -EFAULT; - goto out_free; - } - buf[count] = '\0'; + buf = memdup_user_nul(buffer, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); + sym = strstrip(buf); mutex_lock(&fei_lock); @@ -308,7 +304,6 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, } out: mutex_unlock(&fei_lock); -out_free: kfree(buf); return ret; } -- GitLab From cef9f5f866ad45a2dd64fed6e6b657043c2c6f17 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 26 Aug 2022 15:33:36 +0800 Subject: [PATCH 0233/2223] fail_function: refactor code of checking return value of register_kprobe() Refactor the error handling of register_kprobe() to improve readability. No functional change. 
Link: https://lkml.kernel.org/r/20220826073337.2085798-2-yangyingliang@huawei.com Signed-off-by: Yang Yingliang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- kernel/fail_function.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 03643e33e4c33..893e8f9a91189 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -294,14 +294,13 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, } ret = register_kprobe(&attr->kp); - if (!ret) - fei_debugfs_add_attr(attr); - if (ret < 0) + if (ret) { fei_attr_remove(attr); - else { - list_add_tail(&attr->list, &fei_attr_list); - ret = count; + goto out; } + fei_debugfs_add_attr(attr); + list_add_tail(&attr->list, &fei_attr_list); + ret = count; out: mutex_unlock(&fei_lock); kfree(buf); -- GitLab From d2e85432a2e0a6f31bd9489800f443228f020ed6 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 26 Aug 2022 15:33:37 +0800 Subject: [PATCH 0234/2223] fail_function: fix wrong use of fei_attr_remove() If register_kprobe() fails, the new attr is not added to the list yet, so it should call fei_attr_free() instead.
Link: https://lkml.kernel.org/r/20220826073337.2085798-3-yangyingliang@huawei.com Fixes: 4b1a29a7f542 ("error-injection: Support fault injection framework") Signed-off-by: Yang Yingliang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- kernel/fail_function.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 893e8f9a91189..a7ccd2930c5f4 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -295,7 +295,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, ret = register_kprobe(&attr->kp); if (ret) { - fei_attr_remove(attr); + fei_attr_free(attr); goto out; } fei_debugfs_add_attr(attr); -- GitLab From 199cda13534f4c676d7e4601665e971f4f0582c4 Mon Sep 17 00:00:00 2001 From: wuchi Date: Sat, 27 Aug 2022 15:11:16 +0800 Subject: [PATCH 0235/2223] initramfs: mark my_inptr as __initdata As my_inptr is only used in __init function unpack_to_rootfs(), mark it as __initdata to allow it be freed after boot. Link: https://lkml.kernel.org/r/20220827071116.83078-1-wuchi.zero@gmail.com Signed-off-by: wuchi Reviewed-by: David Disseldorp Cc: Alexander Viro Cc: Martin Wilck Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- init/initramfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/initramfs.c b/init/initramfs.c index 18229cfe8906b..2f5bfb7d76521 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -482,7 +482,7 @@ static long __init flush_buffer(void *bufv, unsigned long len) return origLen; } -static unsigned long my_inptr; /* index of next byte to be processed in inbuf */ +static unsigned long my_inptr __initdata; /* index of next byte to be processed in inbuf */ #include -- GitLab From d85a1bec8e8d552ab13163ca1874dcd82f3d1550 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Thu, 1 Sep 2022 00:09:34 +0800 Subject: [PATCH 0236/2223] ntfs: fix use-after-free in ntfs_attr_find() Patch series "ntfs: fix bugs about Attribute", v2. 
This patchset fixes three bugs related to Attribute in record: Patch 1 adds a sanity check to ensure that the attrs_offset field in the first mft record loaded from disk is within bounds. Patch 2 moves the ATTR_RECORD's bounds checking earlier, to avoid dereferencing ATTR_RECORD before checking this ATTR_RECORD is within bounds. Patch 3 adds an overflow check to avoid a possible forever loop in ntfs_attr_find(). Without patch 1 and patch 2, the kernel triggers a KASAN use-after-free detection as reported by Syzkaller. Although one of patch 1 or patch 2 can fix this, we still need both of them. Because patch 1 fixes the root cause, and patch 2 not only fixes the direct cause, but also fixes the potential out-of-bounds bug. This patch (of 3): Syzkaller reported use-after-free read as follows: ================================================================== BUG: KASAN: use-after-free in ntfs_attr_find+0xc02/0xce0 fs/ntfs/attrib.c:597 Read of size 2 at addr ffff88807e352009 by task syz-executor153/3607 [...]
Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:317 [inline] print_report.cold+0x2ba/0x719 mm/kasan/report.c:433 kasan_report+0xb1/0x1e0 mm/kasan/report.c:495 ntfs_attr_find+0xc02/0xce0 fs/ntfs/attrib.c:597 ntfs_attr_lookup+0x1056/0x2070 fs/ntfs/attrib.c:1193 ntfs_read_inode_mount+0x89a/0x2580 fs/ntfs/inode.c:1845 ntfs_fill_super+0x1799/0x9320 fs/ntfs/super.c:2854 mount_bdev+0x34d/0x410 fs/super.c:1400 legacy_get_tree+0x105/0x220 fs/fs_context.c:610 vfs_get_tree+0x89/0x2f0 fs/super.c:1530 do_new_mount fs/namespace.c:3040 [inline] path_mount+0x1326/0x1e20 fs/namespace.c:3370 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount fs/namespace.c:3568 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3568 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] The buggy address belongs to the physical page: page:ffffea0001f8d400 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x7e350 head:ffffea0001f8d400 order:3 compound_mapcount:0 compound_pincount:0 flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000010200 0000000000000000 dead000000000122 ffff888011842140 raw: 0000000000000000 0000000000040004 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88807e351f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88807e351f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88807e352000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88807e352080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88807e352100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Kernel will loads $MFT/$DATA's first mft record in ntfs_read_inode_mount(). 
Yet the problem is that after loading, kernel doesn't check whether attrs_offset field is a valid value. To be more specific, if attrs_offset field is larger than bytes_allocated field, then it may trigger the out-of-bounds read bug(reported as use-after-free bug) in ntfs_attr_find(), when kernel tries to access the corresponding mft record's attribute. This patch solves it by adding the sanity check between attrs_offset field and bytes_allocated field, after loading the first mft record. Link: https://lkml.kernel.org/r/20220831160935.3409-1-yin31149@gmail.com Link: https://lkml.kernel.org/r/20220831160935.3409-2-yin31149@gmail.com Signed-off-by: Hawkins Jiawei Cc: Anton Altaparmakov Cc: ChenXiaoSong Cc: syzkaller-bugs Cc: Dan Carpenter Signed-off-by: Andrew Morton --- fs/ntfs/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index db0f1995aedd1..08c659332e26b 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1829,6 +1829,13 @@ int ntfs_read_inode_mount(struct inode *vi) goto err_out; } + /* Sanity check offset to the first attribute */ + if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) { + ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.", + le16_to_cpu(m->attrs_offset)); + goto err_out; + } + /* Need this to sanity check attribute list references to $MFT. */ vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); -- GitLab From 36a4d82dddbbd421d2b8e79e1cab68c8126d5075 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Thu, 1 Sep 2022 00:09:36 +0800 Subject: [PATCH 0237/2223] ntfs: fix out-of-bounds read in ntfs_attr_find() Kernel iterates over ATTR_RECORDs in mft record in ntfs_attr_find(). To ensure access on these ATTR_RECORDs are within bounds, kernel will do some checking during iteration. 
The problem is that while checking whether ATTR_RECORD's name is within bounds, the kernel dereferences the ATTR_RECORD name_offset field before checking that this ATTR_RECORD structure is within bounds. This problem may result in an out-of-bounds read in ntfs_attr_find(), reported by Syzkaller: ================================================================== BUG: KASAN: use-after-free in ntfs_attr_find+0xc02/0xce0 fs/ntfs/attrib.c:597 Read of size 2 at addr ffff88807e352009 by task syz-executor153/3607 [...] Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:317 [inline] print_report.cold+0x2ba/0x719 mm/kasan/report.c:433 kasan_report+0xb1/0x1e0 mm/kasan/report.c:495 ntfs_attr_find+0xc02/0xce0 fs/ntfs/attrib.c:597 ntfs_attr_lookup+0x1056/0x2070 fs/ntfs/attrib.c:1193 ntfs_read_inode_mount+0x89a/0x2580 fs/ntfs/inode.c:1845 ntfs_fill_super+0x1799/0x9320 fs/ntfs/super.c:2854 mount_bdev+0x34d/0x410 fs/super.c:1400 legacy_get_tree+0x105/0x220 fs/fs_context.c:610 vfs_get_tree+0x89/0x2f0 fs/super.c:1530 do_new_mount fs/namespace.c:3040 [inline] path_mount+0x1326/0x1e20 fs/namespace.c:3370 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount fs/namespace.c:3568 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3568 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...]
The buggy address belongs to the physical page: page:ffffea0001f8d400 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x7e350 head:ffffea0001f8d400 order:3 compound_mapcount:0 compound_pincount:0 flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000010200 0000000000000000 dead000000000122 ffff888011842140 raw: 0000000000000000 0000000000040004 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88807e351f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88807e351f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88807e352000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88807e352080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88807e352100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== This patch solves it by moving the ATTR_RECORD strcture's bounds checking earlier, then checking whether ATTR_RECORD's name is within bounds. What's more, this patch also add some comments to improve its maintainability. 
Link: https://lkml.kernel.org/r/20220831160935.3409-3-yin31149@gmail.com Link: https://lore.kernel.org/all/1636796c-c85e-7f47-e96f-e074fee3c7d3@huawei.com/ Link: https://groups.google.com/g/syzkaller-bugs/c/t_XdeKPGTR4/m/LECAuIGcBgAJ Signed-off-by: chenxiaosong (A) Signed-off-by: Dan Carpenter Signed-off-by: Hawkins Jiawei Reported-by: syzbot+5f8dcabe4a3b2c51c607@syzkaller.appspotmail.com Tested-by: syzbot+5f8dcabe4a3b2c51c607@syzkaller.appspotmail.com Cc: Anton Altaparmakov Cc: syzkaller-bugs Signed-off-by: Andrew Morton --- fs/ntfs/attrib.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 52615e6090e1c..cec4be2a2d239 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -594,11 +594,23 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { u8 *mrec_end = (u8 *)ctx->mrec + le32_to_cpu(ctx->mrec->bytes_allocated); - u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) + - a->name_length * sizeof(ntfschar); - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end || - name_end > mrec_end) + u8 *name_end; + + /* check whether ATTR_RECORD wrap */ + if ((u8 *)a < (u8 *)ctx->mrec) + break; + + /* check whether Attribute Record Header is within bounds */ + if ((u8 *)a > mrec_end || + (u8 *)a + sizeof(ATTR_RECORD) > mrec_end) break; + + /* check whether ATTR_RECORD's name is within bounds */ + name_end = (u8 *)a + le16_to_cpu(a->name_offset) + + a->name_length * sizeof(ntfschar); + if (name_end > mrec_end) + break; + ctx->attr = a; if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || a->type == AT_END)) -- GitLab From 63095f4f3af59322bea984a6ae44337439348fe0 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Thu, 1 Sep 2022 00:09:38 +0800 Subject: [PATCH 0238/2223] ntfs: check overflow when iterating ATTR_RECORDs Kernel iterates over ATTR_RECORDs in mft record in ntfs_attr_find(). 
Because the ATTR_RECORDs are next to each other, kernel can get the next ATTR_RECORD from end address of current ATTR_RECORD, through current ATTR_RECORD length field. The problem is that during iteration, when kernel calculates the end address of current ATTR_RECORD, kernel may trigger an integer overflow bug in executing `a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))`. This may wrap, leading to a forever iteration on 32bit systems. This patch solves it by adding some checks on calculating end address of current ATTR_RECORD during iteration. Link: https://lkml.kernel.org/r/20220831160935.3409-4-yin31149@gmail.com Link: https://lore.kernel.org/all/20220827105842.GM2030@kadam/ Signed-off-by: Hawkins Jiawei Suggested-by: Dan Carpenter Cc: Anton Altaparmakov Cc: chenxiaosong (A) Cc: syzkaller-bugs Signed-off-by: Andrew Morton --- fs/ntfs/attrib.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index cec4be2a2d239..a3865bc4a0c65 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -617,6 +617,14 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, return -ENOENT; if (unlikely(!a->length)) break; + + /* check whether ATTR_RECORD's length wrap */ + if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a) + break; + /* check whether ATTR_RECORD's length is within bounds */ + if ((u8 *)a + le32_to_cpu(a->length) > mrec_end) + break; + if (a->type != type) continue; /* -- GitLab From 35783ccbe519b33f6652b2d7aafcfc82f10b1a1b Mon Sep 17 00:00:00 2001 From: wuchi Date: Thu, 1 Sep 2022 08:31:21 +0800 Subject: [PATCH 0239/2223] kernel/profile.c: simplify duplicated code in profile_setup() The code to parse option string "schedule/sleep/kvm" of cmdline in function profile_setup is redundant, so simplify that. 
Link: https://lkml.kernel.org/r/20220901003121.53597-1-wuchi.zero@gmail.com Signed-off-by: wuchi Reviewed-by: Andrew Morton Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- kernel/profile.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/kernel/profile.c b/kernel/profile.c index 7ea01ba30e757..8a77769bc4b4c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -59,43 +59,39 @@ int profile_setup(char *str) static const char schedstr[] = "schedule"; static const char sleepstr[] = "sleep"; static const char kvmstr[] = "kvm"; + const char *select = NULL; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { #ifdef CONFIG_SCHEDSTATS force_schedstat_enabled(); prof_on = SLEEP_PROFILING; - if (str[strlen(sleepstr)] == ',') - str += strlen(sleepstr) + 1; - if (get_option(&str, &par)) - prof_shift = clamp(par, 0, BITS_PER_LONG - 1); - pr_info("kernel sleep profiling enabled (shift: %u)\n", - prof_shift); + select = sleepstr; #else pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); #endif /* CONFIG_SCHEDSTATS */ } else if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; - if (str[strlen(schedstr)] == ',') - str += strlen(schedstr) + 1; - if (get_option(&str, &par)) - prof_shift = clamp(par, 0, BITS_PER_LONG - 1); - pr_info("kernel schedule profiling enabled (shift: %u)\n", - prof_shift); + select = schedstr; } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; - if (str[strlen(kvmstr)] == ',') - str += strlen(kvmstr) + 1; - if (get_option(&str, &par)) - prof_shift = clamp(par, 0, BITS_PER_LONG - 1); - pr_info("kernel KVM profiling enabled (shift: %u)\n", - prof_shift); + select = kvmstr; } else if (get_option(&str, &par)) { prof_shift = clamp(par, 0, BITS_PER_LONG - 1); prof_on = CPU_PROFILING; pr_info("kernel profiling enabled (shift: %u)\n", prof_shift); } + + if (select) { + if (str[strlen(select)] == ',') + str += strlen(select) + 1; + if 
(get_option(&str, &par)) + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel %s profiling enabled (shift: %u)\n", + select, prof_shift); + } + return 1; } __setup("profile=", profile_setup); -- GitLab From 7b9e664beb237d90bc600f117668227af5ce53ae Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Aug 2022 20:27:13 +0300 Subject: [PATCH 0240/2223] asm-generic: make parameter types consistent in _unaligned_be48() There is a convention to use internal kernel types, so replace __u8 by u8. Link: https://lkml.kernel.org/r/20220830172713.43686-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Keith Busch Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- include/asm-generic/unaligned.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/unaligned.h b/include/asm-generic/unaligned.h index df30f11b4a460..699650f819706 100644 --- a/include/asm-generic/unaligned.h +++ b/include/asm-generic/unaligned.h @@ -126,7 +126,7 @@ static inline void put_unaligned_le24(const u32 val, void *p) __put_unaligned_le24(val, p); } -static inline void __put_unaligned_be48(const u64 val, __u8 *p) +static inline void __put_unaligned_be48(const u64 val, u8 *p) { *p++ = val >> 40; *p++ = val >> 32; -- GitLab From 8ea0114eda0c1c85f8f01922ac8fc1e489a61129 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Fri, 2 Sep 2022 13:19:23 +0200 Subject: [PATCH 0241/2223] checkpatch: handle FILE pointer type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using a "FILE *" type, checkpatch considers this an error: ERROR: need consistent spacing around '*' (ctx:WxV) #32: FILE: f.c:8: +static void a(FILE *const b) ^ Fix this by explicitly defining "FILE" as a common type. This is useful for user space patches. 
With this patch, we now get: <_>WS( ) <_>IDENT(static) <_>WS( ) <_>DECLARE(void ) <_>FUNC(a) PAREN('(') <_>DECLARE(FILE *const ) <_>IDENT(b) <_>PAREN(')') -> V <_>WS( ) 32 > . static void a(FILE *const b) 32 > EEVVVVVVVTTTTTVNTTTTTTTTTTTTVVV 32 > ______________________________ Link: https://lkml.kernel.org/r/20220902111923.1488671-1-mic@digikod.net Link: https://lore.kernel.org/r/20220902111923.1488671-1-mic@digikod.net Signed-off-by: Mickaël Salaün Acked-by: Joe Perches Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Lukas Bulwahn Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9ff219e0a9d56..18effbe1fe908 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -576,10 +576,14 @@ our $typeKernelTypedefs = qr{(?x: (?:__)?(?:u|s|be|le)(?:8|16|32|64)| atomic_t )}; +our $typeStdioTypedefs = qr{(?x: + FILE +)}; our $typeTypedefs = qr{(?x: $typeC99Typedefs\b| $typeOtherOSTypedefs\b| - $typeKernelTypedefs\b + $typeKernelTypedefs\b| + $typeStdioTypedefs\b )}; our $zero_initializer = qr{(?:(?:0[xX])?0+$Int_type?|NULL|false)\b}; -- GitLab From bfca3dd3d0680fc2fc7f659a152234afbac26e4d Mon Sep 17 00:00:00 2001 From: Petr Vorel Date: Thu, 1 Sep 2022 21:44:03 +0200 Subject: [PATCH 0242/2223] kernel/utsname_sysctl.c: print kernel arch Print the machine hardware name (UTS_MACHINE) in /proc/sys/kernel/arch. This helps people who debug kernel with initramfs with minimal environment (i.e. without coreutils or even busybox) or allow to open sysfs file instead of run 'uname -m' in high level languages. Link: https://lkml.kernel.org/r/20220901194403.3819-1-pvorel@suse.cz Signed-off-by: Petr Vorel Acked-by: Greg Kroah-Hartman Cc: David Sterba Cc: "Eric W . Biederman" Cc: Rafael J. 
Wysocki Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/kernel.rst | 5 +++++ kernel/utsname_sysctl.c | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index ee6572b1edada..bbaa851946956 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -65,6 +65,11 @@ combining the following values: 4 s3_beep = ======= +arch +==== + +The machine hardware name, the same output as ``uname -m`` +(e.g. ``x86_64`` or ``aarch64``). auto_msgmni =========== diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4ca61d49885b6..7ffdd2cd5ff9b 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -73,6 +73,13 @@ static DEFINE_CTL_TABLE_POLL(hostname_poll); static DEFINE_CTL_TABLE_POLL(domainname_poll); static struct ctl_table uts_kern_table[] = { + { + .procname = "arch", + .data = init_uts_ns.name.machine, + .maxlen = sizeof(init_uts_ns.name.machine), + .mode = 0444, + .proc_handler = proc_do_uts_string, + }, { .procname = "ostype", .data = init_uts_ns.name.sysname, -- GitLab From b814751175470b00969a317bf3192260750f9455 Mon Sep 17 00:00:00 2001 From: wuchi Date: Sat, 3 Sep 2022 21:52:33 +0800 Subject: [PATCH 0243/2223] latencytop: use the last element of latency_record of system In account_global_scheduler_latency(), when we don't find the matching latency_record we try to select one which is unused in latency_record[MAXLR], but the condition will skip the last one. if (i >= MAXLR-1) Fix that. 
Link: https://lkml.kernel.org/r/20220903135233.5225-1-wuchi.zero@gmail.com Signed-off-by: wuchi Reviewed-by: Andrew Morton Cc: Alexander Viro Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- kernel/latencytop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 76166df011a4d..781249098cb6e 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -112,7 +112,7 @@ static void __sched account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) { - int firstnonnull = MAXLR + 1; + int firstnonnull = MAXLR; int i; /* skip kernel threads for now */ @@ -150,7 +150,7 @@ account_global_scheduler_latency(struct task_struct *tsk, } i = firstnonnull; - if (i >= MAXLR - 1) + if (i >= MAXLR) return; /* Allocted a new one: */ -- GitLab From a47126ec29f538e1197862919f94d3b6668144a4 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 9 Sep 2022 15:24:57 -0500 Subject: [PATCH 0244/2223] PCI/PTM: Cache PTM Capability offset Cache the PTM Capability offset instead of searching for it every time we enable/disable PTM or save/restore PTM state. No functional change intended. 
Link: https://lore.kernel.org/r/20220909202505.314195-2-helgaas@kernel.org Tested-by: Rajvi Jingar Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Mika Westerberg --- drivers/pci/pcie/ptm.c | 41 +++++++++++++++++------------------------ include/linux/pci.h | 1 + 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index 368a254e31242..85382c1358852 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -31,13 +31,9 @@ static void pci_ptm_info(struct pci_dev *dev) void pci_disable_ptm(struct pci_dev *dev) { - int ptm; + u16 ptm = dev->ptm_cap; u16 ctrl; - if (!pci_is_pcie(dev)) - return; - - ptm = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_PTM); if (!ptm) return; @@ -48,14 +44,10 @@ void pci_disable_ptm(struct pci_dev *dev) void pci_save_ptm_state(struct pci_dev *dev) { - int ptm; + u16 ptm = dev->ptm_cap; struct pci_cap_saved_state *save_state; u16 *cap; - if (!pci_is_pcie(dev)) - return; - - ptm = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_PTM); if (!ptm) return; @@ -69,16 +61,15 @@ void pci_save_ptm_state(struct pci_dev *dev) void pci_restore_ptm_state(struct pci_dev *dev) { + u16 ptm = dev->ptm_cap; struct pci_cap_saved_state *save_state; - int ptm; u16 *cap; - if (!pci_is_pcie(dev)) + if (!ptm) return; save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_PTM); - ptm = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_PTM); - if (!save_state || !ptm) + if (!save_state) return; cap = (u16 *)&save_state->cap.data[0]; @@ -87,7 +78,7 @@ void pci_restore_ptm_state(struct pci_dev *dev) void pci_ptm_init(struct pci_dev *dev) { - int pos; + u16 ptm; u32 cap, ctrl; u8 local_clock; struct pci_dev *ups; @@ -117,13 +108,14 @@ void pci_ptm_init(struct pci_dev *dev) return; } - pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_PTM); - if (!pos) + ptm = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_PTM); + if (!ptm) return; + dev->ptm_cap = ptm; 
pci_add_ext_cap_save_buffer(dev, PCI_EXT_CAP_ID_PTM, sizeof(u16)); - pci_read_config_dword(dev, pos + PCI_PTM_CAP, &cap); + pci_read_config_dword(dev, ptm + PCI_PTM_CAP, &cap); local_clock = (cap & PCI_PTM_GRANULARITY_MASK) >> 8; /* @@ -148,7 +140,7 @@ void pci_ptm_init(struct pci_dev *dev) } ctrl |= dev->ptm_granularity << 8; - pci_write_config_dword(dev, pos + PCI_PTM_CTRL, ctrl); + pci_write_config_dword(dev, ptm + PCI_PTM_CTRL, ctrl); dev->ptm_enabled = 1; pci_ptm_info(dev); @@ -156,18 +148,19 @@ void pci_ptm_init(struct pci_dev *dev) int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) { - int pos; + u16 ptm; u32 cap, ctrl; struct pci_dev *ups; if (!pci_is_pcie(dev)) return -EINVAL; - pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_PTM); - if (!pos) + ptm = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_PTM); + if (!ptm) return -EINVAL; - pci_read_config_dword(dev, pos + PCI_PTM_CAP, &cap); + dev->ptm_cap = ptm; + pci_read_config_dword(dev, ptm + PCI_PTM_CAP, &cap); if (!(cap & PCI_PTM_CAP_REQ)) return -EINVAL; @@ -192,7 +185,7 @@ int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) ctrl = PCI_PTM_CTRL_ENABLE; ctrl |= dev->ptm_granularity << 8; - pci_write_config_dword(dev, pos + PCI_PTM_CTRL, ctrl); + pci_write_config_dword(dev, ptm + PCI_PTM_CTRL, ctrl); dev->ptm_enabled = 1; pci_ptm_info(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 060af91bafcd4..54be939023a3c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -475,6 +475,7 @@ struct pci_dev { unsigned int broken_cmd_compl:1; /* No compl for some cmds */ #endif #ifdef CONFIG_PCIE_PTM + u16 ptm_cap; /* PTM Capability */ unsigned int ptm_root:1; unsigned int ptm_enabled:1; u8 ptm_granularity; -- GitLab From e243c173c015d62b2bca9b030777ceba13311033 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 9 Sep 2022 15:24:58 -0500 Subject: [PATCH 0245/2223] PCI/PTM: Add pci_upstream_ptm() helper PTM requires an unbroken path of PTM-supporting devices between the PTM Root 
and the ultimate PTM Requester, but if a Switch supports PTM, only the Upstream Port can have a PTM Capability; the Downstream Ports do not. Previously we copied the PTM configuration from the Switch Upstream Port to the Downstream Ports so dev->ptm_enabled for any device implied that all the upstream devices support PTM. Instead of making it look like Downstream Ports have their own PTM config, add pci_upstream_ptm(), which returns the upstream device that has a PTM Capability (either a Root Port or a Switch Upstream Port). Link: https://lore.kernel.org/r/20220909202505.314195-3-helgaas@kernel.org Tested-by: Rajvi Jingar Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Mika Westerberg --- drivers/pci/pcie/ptm.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index 85382c1358852..0df6cdfe38b40 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -76,6 +76,29 @@ void pci_restore_ptm_state(struct pci_dev *dev) pci_write_config_word(dev, ptm + PCI_PTM_CTRL, *cap); } +/* + * If the next upstream device supports PTM, return it; otherwise return + * NULL. PTM Messages are local, so both link partners must support it. + */ +static struct pci_dev *pci_upstream_ptm(struct pci_dev *dev) +{ + struct pci_dev *ups = pci_upstream_bridge(dev); + + /* + * Switch Downstream Ports are not permitted to have a PTM + * capability; their PTM behavior is controlled by the Upstream + * Port (PCIe r5.0, sec 7.9.16), so if the upstream bridge is a + * Switch Downstream Port, look up one more level. 
+ */ + if (ups && pci_pcie_type(ups) == PCI_EXP_TYPE_DOWNSTREAM) + ups = pci_upstream_bridge(ups); + + if (ups && ups->ptm_cap) + return ups; + + return NULL; +} + void pci_ptm_init(struct pci_dev *dev) { u16 ptm; @@ -95,19 +118,6 @@ void pci_ptm_init(struct pci_dev *dev) pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END)) return; - /* - * Switch Downstream Ports are not permitted to have a PTM - * capability; their PTM behavior is controlled by the Upstream - * Port (PCIe r5.0, sec 7.9.16). - */ - ups = pci_upstream_bridge(dev); - if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM && - ups && ups->ptm_enabled) { - dev->ptm_granularity = ups->ptm_granularity; - dev->ptm_enabled = 1; - return; - } - ptm = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_PTM); if (!ptm) return; @@ -124,6 +134,7 @@ void pci_ptm_init(struct pci_dev *dev) * the spec recommendation (PCIe r3.1, sec 7.32.3), select the * furthest upstream Time Source as the PTM Root. */ + ups = pci_upstream_ptm(dev); if (ups && ups->ptm_enabled) { ctrl = PCI_PTM_CTRL_ENABLE; if (ups->ptm_granularity == 0) @@ -173,7 +184,7 @@ int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) * associate the endpoint with a time source. */ if (pci_pcie_type(dev) == PCI_EXP_TYPE_ENDPOINT) { - ups = pci_upstream_bridge(dev); + ups = pci_upstream_ptm(dev); if (!ups || !ups->ptm_enabled) return -EINVAL; -- GitLab From 118b9dfdc18b68abf736a71330e3ad1f5af7e47e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 9 Sep 2022 15:24:59 -0500 Subject: [PATCH 0246/2223] PCI/PTM: Separate configuration and enable PTM configuration and enabling were previously mixed together: pci_ptm_init() collected granularity info and enabled PTM for Root Ports and Switch Upstream Ports; pci_enable_ptm() did the same for Endpoints. Move everything related to the PTM Capability register to pci_ptm_init() for all devices, and everything related to the PTM Control register to pci_enable_ptm(). 
Link: https://lore.kernel.org/r/20220909202505.314195-4-helgaas@kernel.org Tested-by: Rajvi Jingar Signed-off-by: Bjorn Helgaas Reviewed-by: Mika Westerberg --- drivers/pci/pcie/ptm.c | 104 +++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 55 deletions(-) diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index 0df6cdfe38b40..ba1d50c965fa1 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -99,25 +99,19 @@ static struct pci_dev *pci_upstream_ptm(struct pci_dev *dev) return NULL; } +/* + * Find the PTM Capability (if present) and extract the information we need + * to use it. + */ void pci_ptm_init(struct pci_dev *dev) { u16 ptm; - u32 cap, ctrl; - u8 local_clock; + u32 cap; struct pci_dev *ups; if (!pci_is_pcie(dev)) return; - /* - * Enable PTM only on interior devices (root ports, switch ports, - * etc.) on the assumption that it causes no link traffic until an - * endpoint enables it. - */ - if ((pci_pcie_type(dev) == PCI_EXP_TYPE_ENDPOINT || - pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END)) - return; - ptm = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_PTM); if (!ptm) return; @@ -126,76 +120,76 @@ void pci_ptm_init(struct pci_dev *dev) pci_add_ext_cap_save_buffer(dev, PCI_EXT_CAP_ID_PTM, sizeof(u16)); pci_read_config_dword(dev, ptm + PCI_PTM_CAP, &cap); - local_clock = (cap & PCI_PTM_GRANULARITY_MASK) >> 8; + dev->ptm_granularity = (cap & PCI_PTM_GRANULARITY_MASK) >> 8; /* - * There's no point in enabling PTM unless it's enabled in the - * upstream device or this device can be a PTM Root itself. Per - * the spec recommendation (PCIe r3.1, sec 7.32.3), select the - * furthest upstream Time Source as the PTM Root. + * Per the spec recommendation (PCIe r6.0, sec 7.9.15.3), select the + * furthest upstream Time Source as the PTM Root. For Endpoints, + * "the Effective Granularity is the maximum Local Clock Granularity + * reported by the PTM Root and all intervening PTM Time Sources." 
*/ ups = pci_upstream_ptm(dev); - if (ups && ups->ptm_enabled) { - ctrl = PCI_PTM_CTRL_ENABLE; + if (ups) { if (ups->ptm_granularity == 0) dev->ptm_granularity = 0; - else if (ups->ptm_granularity > local_clock) + else if (ups->ptm_granularity > dev->ptm_granularity) dev->ptm_granularity = ups->ptm_granularity; - } else { - if (cap & PCI_PTM_CAP_ROOT) { - ctrl = PCI_PTM_CTRL_ENABLE | PCI_PTM_CTRL_ROOT; - dev->ptm_root = 1; - dev->ptm_granularity = local_clock; - } else - return; - } + } else if (cap & PCI_PTM_CAP_ROOT) { + dev->ptm_root = 1; + } else if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) { - ctrl |= dev->ptm_granularity << 8; - pci_write_config_dword(dev, ptm + PCI_PTM_CTRL, ctrl); - dev->ptm_enabled = 1; + /* + * Per sec 7.9.15.3, this should be the Local Clock + * Granularity of the associated Time Source. But it + * doesn't say how to find that Time Source. + */ + dev->ptm_granularity = 0; + } - pci_ptm_info(dev); + if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || + pci_pcie_type(dev) == PCI_EXP_TYPE_UPSTREAM) + pci_enable_ptm(dev, NULL); } +/** + * pci_enable_ptm() - Enable Precision Time Measurement + * @dev: PCI device + * @granularity: pointer to return granularity + * + * Enable Precision Time Measurement for @dev. If successful and + * @granularity is non-NULL, return the Effective Granularity. + * + * Return: zero if successful, or -EINVAL if @dev lacks a PTM Capability or + * is not a PTM Root and lacks an upstream path of PTM-enabled devices. 
+ */ int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) { - u16 ptm; - u32 cap, ctrl; + u16 ptm = dev->ptm_cap; struct pci_dev *ups; + u32 ctrl; - if (!pci_is_pcie(dev)) - return -EINVAL; - - ptm = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_PTM); if (!ptm) return -EINVAL; - dev->ptm_cap = ptm; - pci_read_config_dword(dev, ptm + PCI_PTM_CAP, &cap); - if (!(cap & PCI_PTM_CAP_REQ)) - return -EINVAL; - /* - * For a PCIe Endpoint, PTM is only useful if the endpoint can - * issue PTM requests to upstream devices that have PTM enabled. - * - * For Root Complex Integrated Endpoints, there is no upstream - * device, so there must be some implementation-specific way to - * associate the endpoint with a time source. + * A device uses local PTM Messages to request time information + * from a PTM Root that's farther upstream. Every device along the + * path must support PTM and have it enabled so it can handle the + * messages. Therefore, if this device is not a PTM Root, the + * upstream link partner must have PTM enabled before we can enable + * PTM. */ - if (pci_pcie_type(dev) == PCI_EXP_TYPE_ENDPOINT) { + if (!dev->ptm_root) { ups = pci_upstream_ptm(dev); if (!ups || !ups->ptm_enabled) return -EINVAL; - - dev->ptm_granularity = ups->ptm_granularity; - } else if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) { - dev->ptm_granularity = 0; - } else - return -EINVAL; + } ctrl = PCI_PTM_CTRL_ENABLE; ctrl |= dev->ptm_granularity << 8; + if (dev->ptm_root) + ctrl |= PCI_PTM_CTRL_ROOT; + pci_write_config_dword(dev, ptm + PCI_PTM_CTRL, ctrl); dev->ptm_enabled = 1; -- GitLab From e8bdc5ea481638e0a4fd5639050d2b170417f493 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 9 Sep 2022 15:25:00 -0500 Subject: [PATCH 0247/2223] PCI/PTM: Add pci_suspend_ptm() and pci_resume_ptm() We disable PTM during suspend because that allows some Root Ports to enter lower-power PM states, which means we also need to disable PTM for all downstream devices. 
Add pci_suspend_ptm() and pci_resume_ptm() for this purpose. pci_enable_ptm() and pci_disable_ptm() are for drivers to use to enable or disable PTM. They use dev->ptm_enabled to keep track of whether PTM should be enabled. pci_suspend_ptm() and pci_resume_ptm() are PCI core-internal functions to temporarily disable PTM during suspend and (depending on dev->ptm_enabled) re-enable PTM during resume. Enable/disable/suspend/resume all use internal __pci_enable_ptm() and __pci_disable_ptm() functions that only update the PTM Control register. Outline: pci_enable_ptm(struct pci_dev *dev) { __pci_enable_ptm(dev); dev->ptm_enabled = 1; pci_ptm_info(dev); } pci_disable_ptm(struct pci_dev *dev) { if (dev->ptm_enabled) { __pci_disable_ptm(dev); dev->ptm_enabled = 0; } } pci_suspend_ptm(struct pci_dev *dev) { if (dev->ptm_enabled) __pci_disable_ptm(dev); } pci_resume_ptm(struct pci_dev *dev) { if (dev->ptm_enabled) __pci_enable_ptm(dev); } Nothing currently calls pci_resume_ptm(); the suspend path saves the PTM state before disabling PTM, so the PTM state restore in the resume path implicitly re-enables it. A future change will use pci_resume_ptm() to fix some problems with this approach. Link: https://lore.kernel.org/r/20220909202505.314195-5-helgaas@kernel.org Tested-by: Rajvi Jingar Signed-off-by: Bjorn Helgaas Reviewed-by: Mika Westerberg --- drivers/pci/pci.c | 4 +-- drivers/pci/pci.h | 6 ++-- drivers/pci/pcie/ptm.c | 71 +++++++++++++++++++++++++++++++++--------- include/linux/pci.h | 2 ++ 4 files changed, 65 insertions(+), 18 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 95bc329e74c0e..83818f81577d6 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2714,7 +2714,7 @@ int pci_prepare_to_sleep(struct pci_dev *dev) * lower-power idle state as a whole. 
*/ if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) - pci_disable_ptm(dev); + pci_suspend_ptm(dev); pci_enable_wake(dev, target_state, wakeup); @@ -2772,7 +2772,7 @@ int pci_finish_runtime_suspend(struct pci_dev *dev) * lower-power idle state as a whole. */ if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) - pci_disable_ptm(dev); + pci_suspend_ptm(dev); __pci_enable_wake(dev, target_state, pci_dev_run_wake(dev)); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 785f31086313a..ce4a277e3f41f 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -507,11 +507,13 @@ static inline int pci_iov_bus_range(struct pci_bus *bus) #ifdef CONFIG_PCIE_PTM void pci_save_ptm_state(struct pci_dev *dev); void pci_restore_ptm_state(struct pci_dev *dev); -void pci_disable_ptm(struct pci_dev *dev); +void pci_suspend_ptm(struct pci_dev *dev); +void pci_resume_ptm(struct pci_dev *dev); #else static inline void pci_save_ptm_state(struct pci_dev *dev) { } static inline void pci_restore_ptm_state(struct pci_dev *dev) { } -static inline void pci_disable_ptm(struct pci_dev *dev) { } +static inline void pci_suspend_ptm(struct pci_dev *dev) { } +static inline void pci_resume_ptm(struct pci_dev *dev) { } #endif unsigned long pci_cardbus_resource_alignment(struct resource *); diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index ba1d50c965fa1..70a28b74e721d 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -29,7 +29,7 @@ static void pci_ptm_info(struct pci_dev *dev) dev->ptm_root ? " (root)" : "", clock_desc); } -void pci_disable_ptm(struct pci_dev *dev) +static void __pci_disable_ptm(struct pci_dev *dev) { u16 ptm = dev->ptm_cap; u16 ctrl; @@ -42,6 +42,21 @@ void pci_disable_ptm(struct pci_dev *dev) pci_write_config_word(dev, ptm + PCI_PTM_CTRL, ctrl); } +/** + * pci_disable_ptm() - Disable Precision Time Measurement + * @dev: PCI device + * + * Disable Precision Time Measurement for @dev. 
+ */ +void pci_disable_ptm(struct pci_dev *dev) +{ + if (dev->ptm_enabled) { + __pci_disable_ptm(dev); + dev->ptm_enabled = 0; + } +} +EXPORT_SYMBOL(pci_disable_ptm); + void pci_save_ptm_state(struct pci_dev *dev) { u16 ptm = dev->ptm_cap; @@ -151,18 +166,8 @@ void pci_ptm_init(struct pci_dev *dev) pci_enable_ptm(dev, NULL); } -/** - * pci_enable_ptm() - Enable Precision Time Measurement - * @dev: PCI device - * @granularity: pointer to return granularity - * - * Enable Precision Time Measurement for @dev. If successful and - * @granularity is non-NULL, return the Effective Granularity. - * - * Return: zero if successful, or -EINVAL if @dev lacks a PTM Capability or - * is not a PTM Root and lacks an upstream path of PTM-enabled devices. - */ -int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) +/* Enable PTM in the Control register if possible */ +static int __pci_enable_ptm(struct pci_dev *dev) { u16 ptm = dev->ptm_cap; struct pci_dev *ups; @@ -191,8 +196,29 @@ int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) ctrl |= PCI_PTM_CTRL_ROOT; pci_write_config_dword(dev, ptm + PCI_PTM_CTRL, ctrl); - dev->ptm_enabled = 1; + return 0; +} +/** + * pci_enable_ptm() - Enable Precision Time Measurement + * @dev: PCI device + * @granularity: pointer to return granularity + * + * Enable Precision Time Measurement for @dev. If successful and + * @granularity is non-NULL, return the Effective Granularity. + * + * Return: zero if successful, or -EINVAL if @dev lacks a PTM Capability or + * is not a PTM Root and lacks an upstream path of PTM-enabled devices. + */ +int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) +{ + int rc; + + rc = __pci_enable_ptm(dev); + if (rc) + return rc; + + dev->ptm_enabled = 1; pci_ptm_info(dev); if (granularity) @@ -201,6 +227,23 @@ int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) } EXPORT_SYMBOL(pci_enable_ptm); +/* + * Disable PTM, but preserve dev->ptm_enabled so we silently re-enable it on + * resume if necessary. 
+ */ +void pci_suspend_ptm(struct pci_dev *dev) +{ + if (dev->ptm_enabled) + __pci_disable_ptm(dev); +} + +/* If PTM was enabled before suspend, re-enable it when resuming */ +void pci_resume_ptm(struct pci_dev *dev) +{ + if (dev->ptm_enabled) + __pci_enable_ptm(dev); +} + bool pcie_ptm_enabled(struct pci_dev *dev) { if (!dev) diff --git a/include/linux/pci.h b/include/linux/pci.h index 54be939023a3c..cb5f796e3319d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1678,10 +1678,12 @@ bool pci_ats_disabled(void); #ifdef CONFIG_PCIE_PTM int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); +void pci_disable_ptm(struct pci_dev *dev); bool pcie_ptm_enabled(struct pci_dev *dev); #else static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) { return -EINVAL; } +static inline void pci_disable_ptm(struct pci_dev *dev) { } static inline bool pcie_ptm_enabled(struct pci_dev *dev) { return false; } #endif -- GitLab From 91b12b2a100e977274d3c277a4ff2df0b7439e7d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 9 Sep 2022 15:25:01 -0500 Subject: [PATCH 0248/2223] PCI/PTM: Move pci_ptm_info() body into its only caller pci_ptm_info() is simple and is only called by pci_enable_ptm(). Move the entire body there. No functional change intended. 
Link: https://lore.kernel.org/r/20220909202505.314195-6-helgaas@kernel.org Tested-by: Rajvi Jingar Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Mika Westerberg --- drivers/pci/pcie/ptm.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index 70a28b74e721d..fc296b352fe23 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -9,26 +9,6 @@ #include #include "../pci.h" -static void pci_ptm_info(struct pci_dev *dev) -{ - char clock_desc[8]; - - switch (dev->ptm_granularity) { - case 0: - snprintf(clock_desc, sizeof(clock_desc), "unknown"); - break; - case 255: - snprintf(clock_desc, sizeof(clock_desc), ">254ns"); - break; - default: - snprintf(clock_desc, sizeof(clock_desc), "%uns", - dev->ptm_granularity); - break; - } - pci_info(dev, "PTM enabled%s, %s granularity\n", - dev->ptm_root ? " (root)" : "", clock_desc); -} - static void __pci_disable_ptm(struct pci_dev *dev) { u16 ptm = dev->ptm_cap; @@ -213,16 +193,32 @@ static int __pci_enable_ptm(struct pci_dev *dev) int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) { int rc; + char clock_desc[8]; rc = __pci_enable_ptm(dev); if (rc) return rc; dev->ptm_enabled = 1; - pci_ptm_info(dev); if (granularity) *granularity = dev->ptm_granularity; + + switch (dev->ptm_granularity) { + case 0: + snprintf(clock_desc, sizeof(clock_desc), "unknown"); + break; + case 255: + snprintf(clock_desc, sizeof(clock_desc), ">254ns"); + break; + default: + snprintf(clock_desc, sizeof(clock_desc), "%uns", + dev->ptm_granularity); + break; + } + pci_info(dev, "PTM enabled%s, %s granularity\n", + dev->ptm_root ? 
" (root)" : "", clock_desc); + return 0; } EXPORT_SYMBOL(pci_enable_ptm); -- GitLab From 2b89c22f2434b931b3cf22298ac5f5ec089e9ad1 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 9 Sep 2022 15:25:02 -0500 Subject: [PATCH 0249/2223] PCI/PTM: Preserve RsvdP bits in PTM Control register Even though only the low 16 bits of PTM Control are currently defined, the register is 32 bits wide and the unused bits are RsvdP ("Reserved and Preserved"), so software must preserve the values of those bits when writing the register. Update PTM Control reads and writes to use 32-bit accesses and preserve the reserved bits on writes. Link: https://lore.kernel.org/r/20220909202505.314195-7-helgaas@kernel.org Tested-by: Rajvi Jingar Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Mika Westerberg --- drivers/pci/pcie/ptm.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index fc296b352fe23..5b8598b222b01 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -12,14 +12,14 @@ static void __pci_disable_ptm(struct pci_dev *dev) { u16 ptm = dev->ptm_cap; - u16 ctrl; + u32 ctrl; if (!ptm) return; - pci_read_config_word(dev, ptm + PCI_PTM_CTRL, &ctrl); + pci_read_config_dword(dev, ptm + PCI_PTM_CTRL, &ctrl); ctrl &= ~(PCI_PTM_CTRL_ENABLE | PCI_PTM_CTRL_ROOT); - pci_write_config_word(dev, ptm + PCI_PTM_CTRL, ctrl); + pci_write_config_dword(dev, ptm + PCI_PTM_CTRL, ctrl); } /** @@ -41,7 +41,7 @@ void pci_save_ptm_state(struct pci_dev *dev) { u16 ptm = dev->ptm_cap; struct pci_cap_saved_state *save_state; - u16 *cap; + u32 *cap; if (!ptm) return; @@ -50,15 +50,15 @@ void pci_save_ptm_state(struct pci_dev *dev) if (!save_state) return; - cap = (u16 *)&save_state->cap.data[0]; - pci_read_config_word(dev, ptm + PCI_PTM_CTRL, cap); + cap = (u32 *)&save_state->cap.data[0]; + pci_read_config_dword(dev, ptm + PCI_PTM_CTRL, cap); } void 
pci_restore_ptm_state(struct pci_dev *dev) { u16 ptm = dev->ptm_cap; struct pci_cap_saved_state *save_state; - u16 *cap; + u32 *cap; if (!ptm) return; @@ -67,8 +67,8 @@ void pci_restore_ptm_state(struct pci_dev *dev) if (!save_state) return; - cap = (u16 *)&save_state->cap.data[0]; - pci_write_config_word(dev, ptm + PCI_PTM_CTRL, *cap); + cap = (u32 *)&save_state->cap.data[0]; + pci_write_config_dword(dev, ptm + PCI_PTM_CTRL, *cap); } /* @@ -112,7 +112,7 @@ void pci_ptm_init(struct pci_dev *dev) return; dev->ptm_cap = ptm; - pci_add_ext_cap_save_buffer(dev, PCI_EXT_CAP_ID_PTM, sizeof(u16)); + pci_add_ext_cap_save_buffer(dev, PCI_EXT_CAP_ID_PTM, sizeof(u32)); pci_read_config_dword(dev, ptm + PCI_PTM_CAP, &cap); dev->ptm_granularity = (cap & PCI_PTM_GRANULARITY_MASK) >> 8; @@ -170,7 +170,10 @@ static int __pci_enable_ptm(struct pci_dev *dev) return -EINVAL; } - ctrl = PCI_PTM_CTRL_ENABLE; + pci_read_config_dword(dev, ptm + PCI_PTM_CTRL, &ctrl); + + ctrl |= PCI_PTM_CTRL_ENABLE; + ctrl &= ~PCI_PTM_GRANULARITY_MASK; ctrl |= dev->ptm_granularity << 8; if (dev->ptm_root) ctrl |= PCI_PTM_CTRL_ROOT; -- GitLab From 8b367e75ac482486bbfd1ca832734bec64498f73 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 9 Sep 2022 15:25:03 -0500 Subject: [PATCH 0250/2223] PCI/PTM: Reorder functions in logical order pci_enable_ptm() and pci_disable_ptm() were separated. pci_save_ptm_state() and pci_restore_ptm_state() dangled at the top. Move them to logical places. No functional change intended. 
Link: https://lore.kernel.org/r/20220909202505.314195-8-helgaas@kernel.org Tested-by: Rajvi Jingar Signed-off-by: Bjorn Helgaas Reviewed-by: Mika Westerberg --- drivers/pci/pcie/ptm.c | 124 ++++++++++++++++++++--------------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index 5b8598b222b01..b4e5f553467c3 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -9,68 +9,6 @@ #include #include "../pci.h" -static void __pci_disable_ptm(struct pci_dev *dev) -{ - u16 ptm = dev->ptm_cap; - u32 ctrl; - - if (!ptm) - return; - - pci_read_config_dword(dev, ptm + PCI_PTM_CTRL, &ctrl); - ctrl &= ~(PCI_PTM_CTRL_ENABLE | PCI_PTM_CTRL_ROOT); - pci_write_config_dword(dev, ptm + PCI_PTM_CTRL, ctrl); -} - -/** - * pci_disable_ptm() - Disable Precision Time Measurement - * @dev: PCI device - * - * Disable Precision Time Measurement for @dev. - */ -void pci_disable_ptm(struct pci_dev *dev) -{ - if (dev->ptm_enabled) { - __pci_disable_ptm(dev); - dev->ptm_enabled = 0; - } -} -EXPORT_SYMBOL(pci_disable_ptm); - -void pci_save_ptm_state(struct pci_dev *dev) -{ - u16 ptm = dev->ptm_cap; - struct pci_cap_saved_state *save_state; - u32 *cap; - - if (!ptm) - return; - - save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_PTM); - if (!save_state) - return; - - cap = (u32 *)&save_state->cap.data[0]; - pci_read_config_dword(dev, ptm + PCI_PTM_CTRL, cap); -} - -void pci_restore_ptm_state(struct pci_dev *dev) -{ - u16 ptm = dev->ptm_cap; - struct pci_cap_saved_state *save_state; - u32 *cap; - - if (!ptm) - return; - - save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_PTM); - if (!save_state) - return; - - cap = (u32 *)&save_state->cap.data[0]; - pci_write_config_dword(dev, ptm + PCI_PTM_CTRL, *cap); -} - /* * If the next upstream device supports PTM, return it; otherwise return * NULL. PTM Messages are local, so both link partners must support it. 
@@ -146,6 +84,40 @@ void pci_ptm_init(struct pci_dev *dev) pci_enable_ptm(dev, NULL); } +void pci_save_ptm_state(struct pci_dev *dev) +{ + u16 ptm = dev->ptm_cap; + struct pci_cap_saved_state *save_state; + u32 *cap; + + if (!ptm) + return; + + save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_PTM); + if (!save_state) + return; + + cap = (u32 *)&save_state->cap.data[0]; + pci_read_config_dword(dev, ptm + PCI_PTM_CTRL, cap); +} + +void pci_restore_ptm_state(struct pci_dev *dev) +{ + u16 ptm = dev->ptm_cap; + struct pci_cap_saved_state *save_state; + u32 *cap; + + if (!ptm) + return; + + save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_PTM); + if (!save_state) + return; + + cap = (u32 *)&save_state->cap.data[0]; + pci_write_config_dword(dev, ptm + PCI_PTM_CTRL, *cap); +} + /* Enable PTM in the Control register if possible */ static int __pci_enable_ptm(struct pci_dev *dev) { @@ -226,6 +198,34 @@ int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) } EXPORT_SYMBOL(pci_enable_ptm); +static void __pci_disable_ptm(struct pci_dev *dev) +{ + u16 ptm = dev->ptm_cap; + u32 ctrl; + + if (!ptm) + return; + + pci_read_config_dword(dev, ptm + PCI_PTM_CTRL, &ctrl); + ctrl &= ~(PCI_PTM_CTRL_ENABLE | PCI_PTM_CTRL_ROOT); + pci_write_config_dword(dev, ptm + PCI_PTM_CTRL, ctrl); +} + +/** + * pci_disable_ptm() - Disable Precision Time Measurement + * @dev: PCI device + * + * Disable Precision Time Measurement for @dev. + */ +void pci_disable_ptm(struct pci_dev *dev) +{ + if (dev->ptm_enabled) { + __pci_disable_ptm(dev); + dev->ptm_enabled = 0; + } +} +EXPORT_SYMBOL(pci_disable_ptm); + /* * Disable PTM, but preserve dev->ptm_enabled so we silently re-enable it on * resume if necessary. -- GitLab From d736d292bba2c5225cb76cd4e04d0e9d00f22498 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 9 Sep 2022 15:25:04 -0500 Subject: [PATCH 0251/2223] PCI/PTM: Consolidate PTM interface declarations Consolidate all the PTM-related declarations in drivers/pci/pci.h. 
No functional change intended. Link: https://lore.kernel.org/r/20220909202505.314195-9-helgaas@kernel.org Tested-by: Rajvi Jingar Signed-off-by: Bjorn Helgaas Reviewed-by: Mika Westerberg --- drivers/pci/pci.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index ce4a277e3f41f..5cca2e58cce8a 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -505,11 +505,13 @@ static inline int pci_iov_bus_range(struct pci_bus *bus) #endif /* CONFIG_PCI_IOV */ #ifdef CONFIG_PCIE_PTM +void pci_ptm_init(struct pci_dev *dev); void pci_save_ptm_state(struct pci_dev *dev); void pci_restore_ptm_state(struct pci_dev *dev); void pci_suspend_ptm(struct pci_dev *dev); void pci_resume_ptm(struct pci_dev *dev); #else +static inline void pci_ptm_init(struct pci_dev *dev) { } static inline void pci_save_ptm_state(struct pci_dev *dev) { } static inline void pci_restore_ptm_state(struct pci_dev *dev) { } static inline void pci_suspend_ptm(struct pci_dev *dev) { } @@ -577,12 +579,6 @@ static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { } static inline void pcie_ecrc_get_policy(char *str) { } #endif -#ifdef CONFIG_PCIE_PTM -void pci_ptm_init(struct pci_dev *dev); -#else -static inline void pci_ptm_init(struct pci_dev *dev) { } -#endif - struct pci_dev_reset_methods { u16 vendor; u16 device; -- GitLab From c01163dbd1b8aa016c163ff4bf3a2e90311504f1 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 9 Sep 2022 15:25:05 -0500 Subject: [PATCH 0252/2223] PCI/PM: Always disable PTM for all devices during suspend We want to disable PTM on Root Ports because that allows some chips, e.g., Intel mobile chips since Coffee Lake, to enter a lower-power PM state. That means we also have to disable PTM on downstream devices. PCIe r6.0, sec 2.2.8, recommends that functions support generation of messages in non-D0 states, so we have to assume Switch Upstream Ports or Endpoints may send PTM Requests while in D1, D2, and D3hot. 
A PTM message received by a Downstream Port (including a Root Port) with PTM disabled must be treated as an Unsupported Request (sec 6.21.3). PTM was previously disabled only for Root Ports, and it was disabled in pci_prepare_to_sleep(), which is not called at all if a driver supports legacy PM or does its own state saving. Instead, disable PTM early in pci_pm_suspend() and pci_pm_runtime_suspend() so we do it in all cases. Previously PTM was disabled *after* saving device state, so the state restore on resume automatically re-enabled it. Since we now disable PTM *before* saving state, we must explicitly re-enable it in pci_pm_resume() and pci_pm_runtime_resume(). Here's a sample of errors that occur when PTM is disabled only on the Root Port. With this topology: 0000:00:1d.0 Root Port to [bus 08-71] 0000:08:00.0 Switch Upstream Port to [bus 09-71] Kai-Heng reported errors like this: pcieport 0000:00:1d.0: [20] UnsupReq (First) pcieport 0000:00:1d.0: AER: TLP Header: 34000000 08000052 00000000 00000000 Decoding TLP header 0x34...... (0011 0100b) and 0x08000052: Fmt 001b 4 DW header, no data Type 1 0100b Msg (Local - Terminate at Receiver) Requester ID 0x0800 Bus 08 Devfn 00.0 Message Code 0x52 0101 0010b PTM Request The 00:1d.0 Root Port logged an Unsupported Request error when it received a PTM Request with Requester ID 08:00.0. 
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215453 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216210 Fixes: a697f072f5da ("PCI: Disable PTM during suspend to save power") Link: https://lore.kernel.org/r/20220909202505.314195-10-helgaas@kernel.org Reported-by: Kai-Heng Feng Tested-by: Rajvi Jingar Signed-off-by: Bjorn Helgaas Reviewed-by: Mika Westerberg --- drivers/pci/pci-driver.c | 11 +++++++++++ drivers/pci/pci.c | 28 ++-------------------------- 2 files changed, 13 insertions(+), 26 deletions(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 49238ddd39eec..5d8c37c3e15a2 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -774,6 +774,12 @@ static int pci_pm_suspend(struct device *dev) pci_dev->skip_bus_pm = false; + /* + * Disabling PTM allows some systems, e.g., Intel mobile chips + * since Coffee Lake, to enter a lower-power PM state. + */ + pci_suspend_ptm(pci_dev); + if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend(dev, PMSG_SUSPEND); @@ -987,6 +993,8 @@ static int pci_pm_resume(struct device *dev) if (pci_dev->state_saved) pci_restore_standard_config(pci_dev); + pci_resume_ptm(pci_dev); + if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_resume(dev); @@ -1274,6 +1282,8 @@ static int pci_pm_runtime_suspend(struct device *dev) pci_power_t prev = pci_dev->current_state; int error; + pci_suspend_ptm(pci_dev); + /* * If pci_dev->driver is not set (unbound), we leave the device in D0, * but it may go to D3cold when the bridge above it runtime suspends. @@ -1335,6 +1345,7 @@ static int pci_pm_runtime_resume(struct device *dev) * D3cold when the bridge above it runtime suspended. 
*/ pci_pm_default_resume_early(pci_dev); + pci_resume_ptm(pci_dev); if (!pci_dev->driver) return 0; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 83818f81577d6..107afa0a5b03a 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2706,24 +2706,12 @@ int pci_prepare_to_sleep(struct pci_dev *dev) if (target_state == PCI_POWER_ERROR) return -EIO; - /* - * There are systems (for example, Intel mobile chips since Coffee - * Lake) where the power drawn while suspended can be significantly - * reduced by disabling PTM on PCIe root ports as this allows the - * port to enter a lower-power PM state and the SoC to reach a - * lower-power idle state as a whole. - */ - if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) - pci_suspend_ptm(dev); - pci_enable_wake(dev, target_state, wakeup); error = pci_set_power_state(dev, target_state); - if (error) { + if (error) pci_enable_wake(dev, target_state, false); - pci_restore_ptm_state(dev); - } return error; } @@ -2764,24 +2752,12 @@ int pci_finish_runtime_suspend(struct pci_dev *dev) if (target_state == PCI_POWER_ERROR) return -EIO; - /* - * There are systems (for example, Intel mobile chips since Coffee - * Lake) where the power drawn while suspended can be significantly - * reduced by disabling PTM on PCIe root ports as this allows the - * port to enter a lower-power PM state and the SoC to reach a - * lower-power idle state as a whole. 
- */ - if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) - pci_suspend_ptm(dev); - __pci_enable_wake(dev, target_state, pci_dev_run_wake(dev)); error = pci_set_power_state(dev, target_state); - if (error) { + if (error) pci_enable_wake(dev, target_state, false); - pci_restore_ptm_state(dev); - } return error; } -- GitLab From 4c00cba122f3f3ae54aa5a3a1aec3afc7a2e6f94 Mon Sep 17 00:00:00 2001 From: Rajvi Jingar Date: Tue, 30 Aug 2022 03:49:12 -0700 Subject: [PATCH 0253/2223] PCI/PM: Simplify pci_pm_suspend_noirq() We always want to save the device state unless the driver has already done it. Rearrange the checking in pci_pm_suspend_noirq() to make this more clear. No functional change intended. [bhelgaas: commit log, rewrap comment] Link: https://lore.kernel.org/r/20220830104913.1620539-1-rajvi.jingar@linux.intel.com Signed-off-by: Rajvi Jingar Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. Wysocki --- drivers/pci/pci-driver.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 5d8c37c3e15a2..107d77f3c8467 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -873,20 +873,15 @@ static int pci_pm_suspend_noirq(struct device *dev) } } - if (pci_dev->skip_bus_pm) { + if (!pci_dev->state_saved) { + pci_save_state(pci_dev); + /* - * Either the device is a bridge with a child in D0 below it, or - * the function is running for the second time in a row without - * going through full resume, which is possible only during - * suspend-to-idle in a spurious wakeup case. The device should - * be in D0 at this point, but if it is a bridge, it may be - * necessary to save its state. + * If the device is a bridge with a child in D0 below it, + * it needs to stay in D0, so check skip_bus_pm to avoid + * putting it into a low-power state in that case. 
*/ - if (!pci_dev->state_saved) - pci_save_state(pci_dev); - } else if (!pci_dev->state_saved) { - pci_save_state(pci_dev); - if (pci_power_manageable(pci_dev)) + if (!pci_dev->skip_bus_pm && pci_power_manageable(pci_dev)) pci_prepare_to_sleep(pci_dev); } -- GitLab From c7b58576370147833999fd4cc874d0f918bdf9ca Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 19 Aug 2022 15:52:02 -0700 Subject: [PATCH 0254/2223] f2fs: flush pending checkpoints when freezing super This avoids -EINVAL when trying to freeze f2fs. Cc: stable@vger.kernel.org Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 24 ++++++++++++++++++------ fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 5 ++--- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7de48e791920c..7bf1feb5ac783 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1893,15 +1893,27 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi) { struct ckpt_req_control *cprc = &sbi->cprc_info; + struct task_struct *ckpt_task; - if (cprc->f2fs_issue_ckpt) { - struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt; + if (!cprc->f2fs_issue_ckpt) + return; - cprc->f2fs_issue_ckpt = NULL; - kthread_stop(ckpt_task); + ckpt_task = cprc->f2fs_issue_ckpt; + cprc->f2fs_issue_ckpt = NULL; + kthread_stop(ckpt_task); - flush_remained_ckpt_reqs(sbi, NULL); - } + f2fs_flush_ckpt_thread(sbi); +} + +void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + flush_remained_ckpt_reqs(sbi, NULL); + + /* Let's wait for the previous dispatched checkpoint. 
*/ + while (atomic_read(&cprc->queued_ckpt)) + io_schedule_timeout(DEFAULT_IO_TIMEOUT); } void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6770210aae704..088c3d1574b8c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3711,6 +3711,7 @@ static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) * checkpoint.c */ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e910f0e39d764..4f2ff50b247c4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1671,9 +1671,8 @@ static int f2fs_freeze(struct super_block *sb) if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) return -EINVAL; - /* ensure no checkpoint required */ - if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list)) - return -EINVAL; + /* Let's flush checkpoints and stop the thread. */ + f2fs_flush_ckpt_thread(F2FS_SB(sb)); /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */ set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); -- GitLab From 4f99484d27961cb194cebcd917176fa038a5025f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 18 Aug 2022 22:40:09 -0700 Subject: [PATCH 0255/2223] f2fs: complete checkpoints during remount Otherwise, pending checkpoints can contribute a race condition to give a quota warning. 
- Thread - checkpoint thread add checkpoints to the list do_remount() down_write(&sb->s_umount); f2fs_remount() block_operations() down_read_trylock(&sb->s_umount) = 0 up_write(&sb->s_umount); f2fs_quota_sync() dquot_writeback_dquots() WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); Or, do_remount() down_write(&sb->s_umount); f2fs_remount() create a ckpt thread f2fs_enable_checkpoint() adds checkpoints wait for f2fs_sync_fs() trigger another pending checkpoint block_operations() down_read_trylock(&sb->s_umount) = 0 up_write(&sb->s_umount); f2fs_quota_sync() dquot_writeback_dquots() WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); Cc: stable@vger.kernel.org Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4f2ff50b247c4..0f29c759a8980 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2185,6 +2185,9 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) f2fs_up_write(&sbi->gc_lock); f2fs_sync_fs(sbi->sb, 1); + + /* Let's ensure there's no pending checkpoint anymore */ + f2fs_flush_ckpt_thread(sbi); } static int f2fs_remount(struct super_block *sb, int *flags, char *data) @@ -2350,6 +2353,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) f2fs_stop_ckpt_thread(sbi); need_restart_ckpt = true; } else { + /* Flush the previous checkpoint, if it exists. */ + f2fs_flush_ckpt_thread(sbi); + err = f2fs_start_ckpt_thread(sbi); if (err) { f2fs_err(sbi, -- GitLab From da35fe96d12d15779f3cb74929b7ed03941cf983 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 23 Aug 2022 10:18:42 -0700 Subject: [PATCH 0256/2223] f2fs: increase the limit for reserve_root This patch increases the threshold that limits the reserved root space from 0.2% to 12.5% by using simple shift operation. Typically Android sets 128MB, but if the storage capacity is 32GB, 0.2% which is around 64MB becomes too small. Let's relax it.
Cc: stable@vger.kernel.org Reported-by: Aran Dalton Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0f29c759a8980..b8e5fe2445968 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -301,10 +301,10 @@ static void f2fs_destroy_casefold_cache(void) { } static inline void limit_reserve_root(struct f2fs_sb_info *sbi) { - block_t limit = min((sbi->user_block_count << 1) / 1000, + block_t limit = min((sbi->user_block_count >> 3), sbi->user_block_count - sbi->reserved_blocks); - /* limit is 0.2% */ + /* limit is 12.5% */ if (test_opt(sbi, RESERVE_ROOT) && F2FS_OPTION(sbi).root_reserved_blocks > limit) { F2FS_OPTION(sbi).root_reserved_blocks = limit; -- GitLab From ddd3b16c8cc54ce776bf117bde0c4d588706ea49 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Mon, 29 Aug 2022 21:31:20 +0800 Subject: [PATCH 0257/2223] f2fs: replace logical value "true" with a int number The logical value "true" does not match the parameter type "int", so replace it with the integer 1. Signed-off-by: Zhang Qilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a5054725d0b63..460048f3c850d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -481,7 +481,7 @@ do_sync: mutex_unlock(&sbi->flush_lock); } - f2fs_sync_fs(sbi->sb, true); + f2fs_sync_fs(sbi->sb, 1); stat_inc_bg_cp_count(sbi->stat_info); } -- GitLab From 8140654e781de334601b260b493ff13e14379ff8 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Tue, 23 Aug 2022 19:20:22 +0800 Subject: [PATCH 0258/2223] f2fs: simplify code in f2fs_prepare_decomp_mem It could return directly after init_decompress_ctx.
Signed-off-by: Zhang Qilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 70e97075e535e..730256732a9e9 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1568,12 +1568,8 @@ static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, if (!dic->cbuf) return -ENOMEM; - if (cops->init_decompress_ctx) { - int ret = cops->init_decompress_ctx(dic); - - if (ret) - return ret; - } + if (cops->init_decompress_ctx) + return cops->init_decompress_ctx(dic); return 0; } -- GitLab From cd01569b040e3f496b74e4b78c2e79fc10979b28 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 13 Sep 2022 09:48:11 -0700 Subject: [PATCH 0259/2223] Input: mtk-pmic-keys - add support for MT6331 PMIC keys Add support for PMIC Keys of the MT6331 PMIC. Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: Mattijs Korpershoek Link: https://lore.kernel.org/r/20220913123941.385349-1-angelogioacchino.delregno@collabora.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/mtk-pmic-keys.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/drivers/input/keyboard/mtk-pmic-keys.c b/drivers/input/keyboard/mtk-pmic-keys.c index 6404081253ea1..9b34da0ec2605 100644 --- a/drivers/input/keyboard/mtk-pmic-keys.c +++ b/drivers/input/keyboard/mtk-pmic-keys.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -22,6 +23,10 @@ #define MTK_PMIC_PWRKEY_RST BIT(6) #define MTK_PMIC_HOMEKEY_RST BIT(5) +#define MTK_PMIC_MT6331_RST_DU_MASK GENMASK(13, 12) +#define MTK_PMIC_MT6331_PWRKEY_RST BIT(9) +#define MTK_PMIC_MT6331_HOMEKEY_RST BIT(8) + #define MTK_PMIC_PWRKEY_INDEX 0 #define MTK_PMIC_HOMEKEY_INDEX 1 #define MTK_PMIC_MAX_KEY_COUNT 2 @@ -72,6 +77,19 @@ static const struct mtk_pmic_regs mt6323_regs = { .rst_lprst_mask = MTK_PMIC_RST_DU_MASK, }; +static const struct mtk_pmic_regs 
mt6331_regs = { + .keys_regs[MTK_PMIC_PWRKEY_INDEX] = + MTK_PMIC_KEYS_REGS(MT6331_TOPSTATUS, 0x2, + MT6331_INT_MISC_CON, 0x4, + MTK_PMIC_MT6331_PWRKEY_RST), + .keys_regs[MTK_PMIC_HOMEKEY_INDEX] = + MTK_PMIC_KEYS_REGS(MT6331_TOPSTATUS, 0x4, + MT6331_INT_MISC_CON, 0x2, + MTK_PMIC_MT6331_HOMEKEY_RST), + .pmic_rst_reg = MT6331_TOP_RST_MISC, + .rst_lprst_mask = MTK_PMIC_MT6331_RST_DU_MASK, +}; + static const struct mtk_pmic_regs mt6358_regs = { .keys_regs[MTK_PMIC_PWRKEY_INDEX] = MTK_PMIC_KEYS_REGS(MT6358_TOPSTATUS, @@ -255,6 +273,9 @@ static const struct of_device_id of_mtk_pmic_keys_match_tbl[] = { }, { .compatible = "mediatek,mt6323-keys", .data = &mt6323_regs, + }, { + .compatible = "mediatek,mt6331-keys", + .data = &mt6331_regs, }, { .compatible = "mediatek,mt6358-keys", .data = &mt6358_regs, -- GitLab From 10e629d31aacb2348a1e9110c31a29e98b31ce38 Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Thu, 8 Sep 2022 14:22:46 -0700 Subject: [PATCH 0260/2223] Input: iqs7222 - trim force communication command According to the datasheets, writing only 0xFF is sufficient to elicit a communication window. Remove the superfluous 0x00 from the force communication command. 
Fixes: e505edaedcb9 ("Input: add support for Azoteq IQS7222A/B/C") Signed-off-by: Jeff LaBundy Link: https://lore.kernel.org/r/20220908131548.48120-6-jeff@labundy.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/iqs7222.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/misc/iqs7222.c b/drivers/input/misc/iqs7222.c index b2e8097a2e6d9..376ba3e29eb64 100644 --- a/drivers/input/misc/iqs7222.c +++ b/drivers/input/misc/iqs7222.c @@ -1077,7 +1077,7 @@ static int iqs7222_hard_reset(struct iqs7222_private *iqs7222) static int iqs7222_force_comms(struct iqs7222_private *iqs7222) { - u8 msg_buf[] = { 0xFF, 0x00, }; + u8 msg_buf[] = { 0xFF, }; int ret; /* -- GitLab From 514c13b1faed74e9bc19061b6d7c78d53a3402ba Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Thu, 8 Sep 2022 14:24:24 -0700 Subject: [PATCH 0261/2223] Input: iqs7222 - avoid sending empty SYN_REPORT events Add a check to prevent sending undefined events, which ultimately map to SYN_REPORT. Fixes: e505edaedcb9 ("Input: add support for Azoteq IQS7222A/B/C") Signed-off-by: Jeff LaBundy Link: https://lore.kernel.org/r/20220908131548.48120-7-jeff@labundy.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/iqs7222.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/misc/iqs7222.c b/drivers/input/misc/iqs7222.c index 376ba3e29eb64..b8749c3f94b45 100644 --- a/drivers/input/misc/iqs7222.c +++ b/drivers/input/misc/iqs7222.c @@ -2326,6 +2326,9 @@ static int iqs7222_report(struct iqs7222_private *iqs7222) int k = 2 + j * (num_chan > 16 ? 
2 : 1); u16 state = le16_to_cpu(status[k + i / 16]); + if (!iqs7222->kp_type[i][j]) + continue; + input_event(iqs7222->keypad, iqs7222->kp_type[i][j], iqs7222->kp_code[i][j], -- GitLab From d56111ed58482de0045e1e1201122e6e71516945 Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Thu, 8 Sep 2022 14:24:35 -0700 Subject: [PATCH 0262/2223] Input: iqs7222 - set all ULP entry masks by default Some devices expose an ultra-low-power (ULP) mode entry mask for each channel. If the mask is set, the device cannot enter ULP so long as the corresponding channel remains in an active state. The vendor has advised setting the mask for any disabled channel. To accommodate this suggestion, initially set all masks and then clear them only if specified in the device tree. Fixes: e505edaedcb9 ("Input: add support for Azoteq IQS7222A/B/C") Signed-off-by: Jeff LaBundy Link: https://lore.kernel.org/r/20220908131548.48120-8-jeff@labundy.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/iqs7222.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/input/misc/iqs7222.c b/drivers/input/misc/iqs7222.c index b8749c3f94b45..ddb863bf63eec 100644 --- a/drivers/input/misc/iqs7222.c +++ b/drivers/input/misc/iqs7222.c @@ -1771,11 +1771,9 @@ static int iqs7222_parse_chan(struct iqs7222_private *iqs7222, int chan_index) if (!chan_node) return 0; - if (dev_desc->allow_offset) { - sys_setup[dev_desc->allow_offset] |= BIT(chan_index); - if (fwnode_property_present(chan_node, "azoteq,ulp-allow")) - sys_setup[dev_desc->allow_offset] &= ~BIT(chan_index); - } + if (dev_desc->allow_offset && + fwnode_property_present(chan_node, "azoteq,ulp-allow")) + sys_setup[dev_desc->allow_offset] &= ~BIT(chan_index); chan_setup[0] |= IQS7222_CHAN_SETUP_0_CHAN_EN; @@ -2206,6 +2204,9 @@ static int iqs7222_parse_all(struct iqs7222_private *iqs7222) u16 *sys_setup = iqs7222->sys_setup; int error, i; + if (dev_desc->allow_offset) + sys_setup[dev_desc->allow_offset] = U16_MAX; + if 
(dev_desc->event_offset) sys_setup[dev_desc->event_offset] = IQS7222_EVENT_MASK_ATI; -- GitLab From a21599cf1213ca0bdb002adeb4fa5eade71d106e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:07 +0200 Subject: [PATCH 0263/2223] dt-bindings: pinctrl: qcom,sm6115-pinctrl: fix matching pin config Matching PMIC GPIOs config nodes within a '-state' node by '.*' pattern does not work as expected because of linux,phandle in the DTB: 'pins' is a required property 'function' is a required property 'rx', 'tx' do not match any of the regexes: 'pinctrl-[0-9]+' [[59]] is not of type 'object' Make the schema stricter and expect such nodes to be followed with a '-pins' suffix. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Iskren Chernev Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-2-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sm6115-pinctrl.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml index d8443811767db..8a2b4767c7b6c 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml @@ -59,8 +59,9 @@ patternProperties: oneOf: - $ref: "#/$defs/qcom-sm6115-tlmm-state" - patternProperties: - ".*": + "-pins$": $ref: "#/$defs/qcom-sm6115-tlmm-state" + additionalProperties: false '$defs': qcom-sm6115-tlmm-state: @@ -155,25 +156,25 @@ examples: gpio-ranges = <&tlmm 0 0 114>; sdc2_on_state: sdc2-on-state { - clk { + clk-pins { pins = "sdc2_clk"; bias-disable; drive-strength = <16>; }; - cmd { + cmd-pins { pins = "sdc2_cmd"; bias-pull-up; drive-strength = <10>; }; - data { + data-pins { pins = "sdc2_data"; bias-pull-up; drive-strength = <10>; }; - sd-cd { + sd-cd-pins { pins = "gpio88"; function = "gpio"; bias-pull-up; 
-- GitLab From b17cf20dfc188f48f2746e1178cfde910cfa3be2 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:08 +0200 Subject: [PATCH 0264/2223] dt-bindings: pinctrl: qcom,sm6115-pinctrl: require function on GPIOs Require function on GPIOs (so not on SD card pins). Signed-off-by: Krzysztof Kozlowski Reviewed-by: Iskren Chernev Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-3-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sm6115-pinctrl.yaml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml index 8a2b4767c7b6c..28b29bf714b42 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml @@ -69,7 +69,6 @@ patternProperties: description: Pinctrl node's client devices use subnodes for desired pin configuration. Client device subnodes use below standard properties. - $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" properties: pins: @@ -121,6 +120,16 @@ patternProperties: required: - pins + allOf: + - $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" + - if: + properties: + pins: + pattern: "^gpio([0-9]|[1-9][0-9]|10[0-9]|11[0-2])$" + then: + required: + - function + additionalProperties: false allOf: -- GitLab From 495ffc067c6719f3a1722632455eb6fea9914f70 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:09 +0200 Subject: [PATCH 0265/2223] dt-bindings: pinctrl: qcom,sm6115-pinctrl: fix indentation in example Bindings example should be indented with 4-spaces. 
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Iskren Chernev Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-4-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sm6115-pinctrl.yaml | 80 +++++++++---------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml index 28b29bf714b42..e39fbb36d8c1c 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml @@ -150,44 +150,44 @@ additionalProperties: false examples: - | - #include - tlmm: pinctrl@500000 { - compatible = "qcom,sm6115-tlmm"; - reg = <0x500000 0x400000>, - <0x900000 0x400000>, - <0xd00000 0x400000>; - reg-names = "west", "south", "east"; - interrupts = ; - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - gpio-ranges = <&tlmm 0 0 114>; - - sdc2_on_state: sdc2-on-state { - clk-pins { - pins = "sdc2_clk"; - bias-disable; - drive-strength = <16>; - }; - - cmd-pins { - pins = "sdc2_cmd"; - bias-pull-up; - drive-strength = <10>; - }; - - data-pins { - pins = "sdc2_data"; - bias-pull-up; - drive-strength = <10>; - }; - - sd-cd-pins { - pins = "gpio88"; - function = "gpio"; - bias-pull-up; - drive-strength = <2>; - }; - }; + #include + tlmm: pinctrl@500000 { + compatible = "qcom,sm6115-tlmm"; + reg = <0x500000 0x400000>, + <0x900000 0x400000>, + <0xd00000 0x400000>; + reg-names = "west", "south", "east"; + interrupts = ; + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + gpio-ranges = <&tlmm 0 0 114>; + + sdc2_on_state: sdc2-on-state { + clk-pins { + pins = "sdc2_clk"; + bias-disable; + drive-strength = <16>; + }; + + cmd-pins { + pins = "sdc2_cmd"; + bias-pull-up; + drive-strength = <10>; + }; + + data-pins { + pins = 
"sdc2_data"; + bias-pull-up; + drive-strength = <10>; + }; + + sd-cd-pins { + pins = "gpio88"; + function = "gpio"; + bias-pull-up; + drive-strength = <2>; + }; }; + }; -- GitLab From 5d66124f619dafc1ca2c5b7b90b3fa355d995fa0 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:10 +0200 Subject: [PATCH 0266/2223] dt-bindings: pinctrl: qcom,sm6125-pinctrl: fix matching pin config Matching PMIC GPIOs config nodes within a '-state' node by '.*' pattern does not work as expected because of linux,phandle in the DTB: 'pins' is a required property 'function' is a required property 'rx', 'tx' do not match any of the regexes: 'pinctrl-[0-9]+' [[59]] is not of type 'object' Make the schema stricter and expect such nodes to be followed with a '-pins' suffix. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-5-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/qcom,sm6125-pinctrl.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm6125-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6125-pinctrl.yaml index c8eec845ade92..84ed16f9915da 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6125-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6125-pinctrl.yaml @@ -51,8 +51,9 @@ patternProperties: oneOf: - $ref: "#/$defs/qcom-sm6125-tlmm-state" - patternProperties: - ".*": + "-pins$": $ref: "#/$defs/qcom-sm6125-tlmm-state" + additionalProperties: false $defs: qcom-sm6125-tlmm-state: -- GitLab From d1fc02d47bc4ba291ea85b85031cfa548da65724 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:11 +0200 Subject: [PATCH 0267/2223] dt-bindings: pinctrl: qcom,sm6125-pinctrl: do not require function on non-GPIOs Certain pins, like SDcard related, do not have functions and such should not be required: sdc1-clk-pins: 
'function' is a required property Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-6-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sm6125-pinctrl.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm6125-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6125-pinctrl.yaml index 84ed16f9915da..735eb5d6834d2 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6125-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6125-pinctrl.yaml @@ -61,7 +61,6 @@ $defs: description: Pinctrl node's client devices use subnodes for desired pin configuration. Client device subnodes use below standard properties. - $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" properties: pins: @@ -112,7 +111,16 @@ $defs: required: - pins - - function + + allOf: + - $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" + - if: + properties: + pins: + pattern: "^gpio([0-9]|[1-9][0-9]|1[0-2][0-9]|13[0-2])$" + then: + required: + - function additionalProperties: false -- GitLab From 15239930127566a21294df41f9b73ddb5e4011f6 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:12 +0200 Subject: [PATCH 0268/2223] dt-bindings: pinctrl: qcom,sm6125-pinctrl: extend example Extend example with children for pin configuration and indent it with 4-spaces.
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-7-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sm6125-pinctrl.yaml | 46 +++++++++++++------ 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm6125-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6125-pinctrl.yaml index 735eb5d6834d2..5cb8b272cb7d8 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6125-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6125-pinctrl.yaml @@ -126,17 +126,37 @@ $defs: examples: - | - #include - pinctrl@500000 { - compatible = "qcom,sm6125-tlmm"; - reg = <0x00500000 0x400000>, - <0x00900000 0x400000>, - <0x00d00000 0x400000>; - reg-names = "west", "south", "east"; - interrupts = ; - gpio-controller; - gpio-ranges = <&tlmm 0 0 134>; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; + #include + pinctrl@500000 { + compatible = "qcom,sm6125-tlmm"; + reg = <0x00500000 0x400000>, + <0x00900000 0x400000>, + <0x00d00000 0x400000>; + reg-names = "west", "south", "east"; + interrupts = ; + gpio-controller; + gpio-ranges = <&tlmm 0 0 134>; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + + sdc2-off-state { + clk-pins { + pins = "sdc2_clk"; + drive-strength = <2>; + bias-disable; + }; + + cmd-pins { + pins = "sdc2_cmd"; + drive-strength = <2>; + bias-pull-up; + }; + + data-pins { + pins = "sdc2_data"; + drive-strength = <2>; + bias-pull-up; + }; }; + }; -- GitLab From 7c291167877809723f990b989a05367565672586 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:13 +0200 Subject: [PATCH 0269/2223] dt-bindings: pinctrl: qcom,sm6350-pinctrl: fix matching pin config Matching PMIC GPIOs config nodes within a '-state' node by '.*' pattern does not work as expected because of linux,phandle in the DTB: 'pins' is a required 
property 'function' is a required property 'rx', 'tx' do not match any of the regexes: 'pinctrl-[0-9]+' [[59]] is not of type 'object' Make the schema stricter and expect such nodes to be followed with a '-pins' suffix. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-8-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/qcom,sm6350-pinctrl.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-pinctrl.yaml index 898608671c4be..85a4ff5a56250 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-pinctrl.yaml @@ -44,8 +44,9 @@ patternProperties: oneOf: - $ref: "#/$defs/qcom-sm6350-tlmm-state" - patternProperties: - ".*": + "-pins$": $ref: "#/$defs/qcom-sm6350-tlmm-state" + additionalProperties: false $defs: qcom-sm6350-tlmm-state: @@ -133,13 +134,13 @@ examples: }; uart-w-subnodes-state { - rx { + rx-pins { pins = "gpio25"; function = "qup13_f2"; bias-disable; }; - tx { + tx-pins { pins = "gpio26"; function = "qup13_f2"; bias-disable; -- GitLab From 5f3332e9450d2c48c2cfc9d70e96693a890af373 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:14 +0200 Subject: [PATCH 0270/2223] dt-bindings: pinctrl: qcom,sm6350-pinctrl: do not require function on non-GPIOs Certain pins, like SDcard related, do not have functions and such should not be required: sdc1-clk-pins: 'function' is a required property Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-9-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sm6350-pinctrl.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git 
a/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-pinctrl.yaml index 85a4ff5a56250..0c4bf6e90ba08 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-pinctrl.yaml @@ -54,7 +54,6 @@ $defs: description: Pinctrl node's client devices use subnodes for desired pin configuration. Client device subnodes use below standard properties. - $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" properties: pins: @@ -111,7 +110,16 @@ $defs: required: - pins - - function + + allOf: + - $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" + - if: + properties: + pins: + pattern: "^gpio([0-9]|[1-9][0-9]|1[0-4][0-9]|15[0-7])$" + then: + required: + - function additionalProperties: false -- GitLab From dc246ef73f5990b839d30b00d01066d95293aa85 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:15 +0200 Subject: [PATCH 0271/2223] dt-bindings: pinctrl: qcom,sm6350-pinctrl: fix indentation in example Bindings example should be indented with 4-spaces. 
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-10-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sm6350-pinctrl.yaml | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-pinctrl.yaml index 0c4bf6e90ba08..856b9c567ecb9 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-pinctrl.yaml @@ -125,34 +125,34 @@ $defs: examples: - | - #include - pinctrl@f100000 { - compatible = "qcom,sm6350-tlmm"; - reg = <0x0f100000 0x300000>; - interrupts = ; - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - gpio-ranges = <&tlmm 0 0 157>; - - gpio-wo-subnode-state { - pins = "gpio1"; - function = "gpio"; - }; - - uart-w-subnodes-state { - rx-pins { - pins = "gpio25"; - function = "qup13_f2"; - bias-disable; - }; - - tx-pins { - pins = "gpio26"; - function = "qup13_f2"; - bias-disable; - }; - }; + #include + pinctrl@f100000 { + compatible = "qcom,sm6350-tlmm"; + reg = <0x0f100000 0x300000>; + interrupts = ; + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + gpio-ranges = <&tlmm 0 0 157>; + + gpio-wo-subnode-state { + pins = "gpio1"; + function = "gpio"; }; + + uart-w-subnodes-state { + rx-pins { + pins = "gpio25"; + function = "qup13_f2"; + bias-disable; + }; + + tx-pins { + pins = "gpio26"; + function = "qup13_f2"; + bias-disable; + }; + }; + }; ... 
-- GitLab From 51af3784f15facb4a011d59721a54a8b4ae2a3ed Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:16 +0200 Subject: [PATCH 0272/2223] dt-bindings: pinctrl: qcom,sm6375-pinctrl: fix matching pin config Matching PMIC GPIOs config nodes within a '-state' node by '.*' pattern does not work as expected because of linux,phandle in the DTB: 'pins' is a required property 'function' is a required property 'rx', 'tx' do not match any of the regexes: 'pinctrl-[0-9]+' [[59]] is not of type 'object' Make the schema stricter and expect such nodes to be followed with a '-pins' suffix. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-11-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml index 3908807a8339e..50f0ca5ab7e77 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml @@ -44,8 +44,9 @@ patternProperties: oneOf: - $ref: "#/$defs/qcom-sm6375-tlmm-state" - patternProperties: - ".*": + "-pins$": $ref: "#/$defs/qcom-sm6375-tlmm-state" + additionalProperties: false $defs: qcom-sm6375-tlmm-state: @@ -142,13 +143,13 @@ examples: }; uart-w-subnodes-state { - rx { + rx-pins { pins = "gpio18"; function = "qup13_f2"; bias-pull-up; }; - tx { + tx-pins { pins = "gpio19"; function = "qup13_f2"; bias-disable; -- GitLab From c8441085e2c0e17ef0610ba4ba2da100677ddf41 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:17 +0200 Subject: [PATCH 0273/2223] dt-bindings: pinctrl: qcom,sm6375-pinctrl: do not require function on non-GPIOs Certain pins, like SDcard related, do not have functions and such should not 
be required: sdc1-clk-pins: 'function' is a required property Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-12-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sm6375-tlmm.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml index 50f0ca5ab7e77..dbd91d6b63b3a 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml @@ -54,7 +54,6 @@ $defs: description: Pinctrl node's client devices use subnodes for desired pin configuration. Client device subnodes use below standard properties. - $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" properties: pins: @@ -120,7 +119,16 @@ $defs: required: - pins - - function + + allOf: + - $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" + - if: + properties: + pins: + pattern: "^gpio([0-9]|[1-9][0-9]|1[0-4][0-9]|15[0-6])$" + then: + required: + - function additionalProperties: false -- GitLab From e3c2e3840742800131a9530d07e30a809d36dd65 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:18 +0200 Subject: [PATCH 0274/2223] dt-bindings: pinctrl: qcom,sm6375-pinctrl: fix indentation in example Bindings example should be indented with 4-spaces. 
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-13-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sm6375-tlmm.yaml | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml index dbd91d6b63b3a..025faf87d147a 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml @@ -134,34 +134,34 @@ $defs: examples: - | - #include - pinctrl@500000 { - compatible = "qcom,sm6375-tlmm"; - reg = <0x00500000 0x800000>; - interrupts = ; - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - gpio-ranges = <&tlmm 0 0 157>; - - gpio-wo-subnode-state { - pins = "gpio1"; - function = "gpio"; - }; - - uart-w-subnodes-state { - rx-pins { - pins = "gpio18"; - function = "qup13_f2"; - bias-pull-up; - }; - - tx-pins { - pins = "gpio19"; - function = "qup13_f2"; - bias-disable; - }; - }; + #include + pinctrl@500000 { + compatible = "qcom,sm6375-tlmm"; + reg = <0x00500000 0x800000>; + interrupts = ; + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + gpio-ranges = <&tlmm 0 0 157>; + + gpio-wo-subnode-state { + pins = "gpio1"; + function = "gpio"; }; + + uart-w-subnodes-state { + rx-pins { + pins = "gpio18"; + function = "qup13_f2"; + bias-pull-up; + }; + + tx-pins { + pins = "gpio19"; + function = "qup13_f2"; + bias-disable; + }; + }; + }; ... 
-- GitLab From 6e6e1ef6b59d70c289f899d46049ab54bcf3f9c4 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:19 +0200 Subject: [PATCH 0275/2223] dt-bindings: pinctrl: qcom,sm8250-pinctrl: do not require function on non-GPIOs Certain pins, like SDcard related, do not have functions and such should not be required: sdc1-clk-pins: 'function' is a required property Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-14-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sm8250-pinctrl.yaml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm8250-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm8250-pinctrl.yaml index 15bb1018cf21a..12bdc2e67c4da 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm8250-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm8250-pinctrl.yaml @@ -110,7 +110,15 @@ patternProperties: required: - pins - - function + + allOf: + - if: + properties: + pins: + pattern: "^gpio([0-9]|[1-9][0-9]|1[0-7][0-9])$" + then: + required: + - function additionalProperties: false -- GitLab From 2723c2530c20406425e6e44a29b9e36443e07e42 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:20 +0200 Subject: [PATCH 0276/2223] dt-bindings: pinctrl: qcom,sm8250-pinctrl: reference tlmm common pins Each subnode configuring pins (so the final -pins or pinconf) should reference common TLMM pin definition. 
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-15-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/qcom,sm8250-pinctrl.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm8250-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm8250-pinctrl.yaml index 12bdc2e67c4da..bccc83f22aae8 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm8250-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm8250-pinctrl.yaml @@ -112,6 +112,7 @@ patternProperties: - pins allOf: + - $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" - if: properties: pins: -- GitLab From d70f858f82374021f1d5379a49cb74022a216120 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:21 +0200 Subject: [PATCH 0277/2223] dt-bindings: pinctrl: qcom,sm8250-pinctrl: fix indentation in example Bindings example should be indented with 4-spaces. 
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-16-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sm8250-pinctrl.yaml | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm8250-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm8250-pinctrl.yaml index bccc83f22aae8..c44d02d28bc9f 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm8250-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm8250-pinctrl.yaml @@ -141,18 +141,18 @@ additionalProperties: false examples: - | - #include - pinctrl@1f00000 { - compatible = "qcom,sm8250-pinctrl"; - reg = <0x0f100000 0x300000>, - <0x0f500000 0x300000>, - <0x0f900000 0x300000>; - reg-names = "west", "south", "north"; - interrupts = ; - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - gpio-ranges = <&tlmm 0 0 180>; - wakeup-parent = <&pdc>; - }; + #include + pinctrl@1f00000 { + compatible = "qcom,sm8250-pinctrl"; + reg = <0x0f100000 0x300000>, + <0x0f500000 0x300000>, + <0x0f900000 0x300000>; + reg-names = "west", "south", "north"; + interrupts = ; + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + gpio-ranges = <&tlmm 0 0 180>; + wakeup-parent = <&pdc>; + }; -- GitLab From e9668427de337c67b1e18e9e1979180514631440 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:22 +0200 Subject: [PATCH 0278/2223] dt-bindings: pinctrl: qcom,sm8350-pinctrl: fix matching pin config Matching PMIC GPIOs config nodes within a '-state' node by '.*' pattern does not work as expected because of linux,phandle in the DTB: sm8350-hdk.dtb: pinctrl@f100000: qup-uart3-default-state: 'oneOf' conditional failed, one must be fixed: 'pins' is a required property 'function' is a required property 'rx', 'tx' do not match any 
of the regexes: 'pinctrl-[0-9]+' [[59]] is not of type 'object' Make the schema stricter and expect such nodes to be followed with a '-pins' suffix. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-17-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/qcom,sm8350-pinctrl.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm8350-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm8350-pinctrl.yaml index 6b7789db2f756..211cca11f94f6 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm8350-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm8350-pinctrl.yaml @@ -44,8 +44,9 @@ patternProperties: oneOf: - $ref: "#/$defs/qcom-sm8350-tlmm-state" - patternProperties: - ".*": + "-pins$": $ref: "#/$defs/qcom-sm8350-tlmm-state" + additionalProperties: false $defs: qcom-sm8350-tlmm-state: @@ -130,13 +131,13 @@ examples: }; uart-w-subnodes-state { - rx { + rx-pins { pins = "gpio18"; function = "qup3"; bias-pull-up; }; - tx { + tx-pins { pins = "gpio19"; function = "qup3"; bias-disable; -- GitLab From 2d4e77a71f031928b95c8ab97b8ace0e46c6cc5f Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:23 +0200 Subject: [PATCH 0279/2223] dt-bindings: pinctrl: qcom,sm8350-pinctrl: fix indentation in example Bindings example should be indented with 4-spaces. 
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-18-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sm8350-pinctrl.yaml | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm8350-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm8350-pinctrl.yaml index 211cca11f94f6..f3106d25adcfc 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm8350-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm8350-pinctrl.yaml @@ -114,34 +114,34 @@ $defs: examples: - | - #include - pinctrl@f100000 { - compatible = "qcom,sm8350-tlmm"; - reg = <0x0f100000 0x300000>; - interrupts = ; - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - gpio-ranges = <&tlmm 0 0 203>; - - gpio-wo-subnode-state { - pins = "gpio1"; - function = "gpio"; - }; - - uart-w-subnodes-state { - rx-pins { - pins = "gpio18"; - function = "qup3"; - bias-pull-up; - }; - - tx-pins { - pins = "gpio19"; - function = "qup3"; - bias-disable; - }; - }; + #include + pinctrl@f100000 { + compatible = "qcom,sm8350-tlmm"; + reg = <0x0f100000 0x300000>; + interrupts = ; + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + gpio-ranges = <&tlmm 0 0 203>; + + gpio-wo-subnode-state { + pins = "gpio1"; + function = "gpio"; }; + + uart-w-subnodes-state { + rx-pins { + pins = "gpio18"; + function = "qup3"; + bias-pull-up; + }; + + tx-pins { + pins = "gpio19"; + function = "qup3"; + bias-disable; + }; + }; + }; ... 
-- GitLab From 34b88934e60e182d78b4e5f22ea8f702dff49f55 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:24 +0200 Subject: [PATCH 0280/2223] dt-bindings: pinctrl: qcom,sm8350-pinctrl: do not require function on non-GPIOs Certain pins, like SDcard related, do not have functions and such should not be required: sdc1-clk-pins: 'function' is a required property Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-19-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sm8350-pinctrl.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm8350-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm8350-pinctrl.yaml index f3106d25adcfc..6ae5571f60da0 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm8350-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm8350-pinctrl.yaml @@ -54,7 +54,6 @@ $defs: description: Pinctrl node's client devices use subnodes for desired pin configuration. Client device subnodes use below standard properties. 
- $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" properties: pins: @@ -108,7 +107,16 @@ $defs: required: - pins - - function + + allOf: + - $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" + - if: + properties: + pins: + pattern: "^gpio([0-9]|[1-9][0-9]|1[0-9][0-9]|20[0-3])$" + then: + required: + - function additionalProperties: false -- GitLab From d4ac2a2b7c6265156b2df6b4841e7f1117638d2b Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:25 +0200 Subject: [PATCH 0281/2223] dt-bindings: pinctrl: qcom,sm8450-pinctrl: fix matching pin config Matching PMIC GPIOs config nodes within a '-state' node by '.*' pattern does not work as expected because of linux,phandle in the DTB: qcom/sm4250-oneplus-billie2.dtb: pinctrl@500000: sdc1-on-state: 'oneOf' conditional failed, one must be fixed: 'pins' is a required property 'clk', 'cmd', 'data', 'rclk' do not match any of the regexes: 'pinctrl-[0-9]+' [[26]] is not of type 'object' Make the schema stricter and expect such nodes to be followed with a '-pins' suffix. 
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-20-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml index 9c891246245b7..d1d1c1455b3c6 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml @@ -43,8 +43,9 @@ patternProperties: oneOf: - $ref: "#/$defs/qcom-sm8450-tlmm-state" - patternProperties: - ".*": + "-pins$": $ref: "#/$defs/qcom-sm8450-tlmm-state" + additionalProperties: false $defs: qcom-sm8450-tlmm-state: @@ -127,13 +128,13 @@ examples: }; uart-w-subnodes-state { - rx { + rx-pins { pins = "gpio26"; function = "qup7"; bias-pull-up; }; - tx { + tx-pins { pins = "gpio27"; function = "qup7"; bias-disable; -- GitLab From fde270ebb7eddf5aeae3e6235afdc92390d4d8f0 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:26 +0200 Subject: [PATCH 0282/2223] dt-bindings: pinctrl: qcom,sm8450-pinctrl: fix indentation in example Bindings example should be indented with 4-spaces. 
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-21-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sm8450-pinctrl.yaml | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml index d1d1c1455b3c6..87347e9c5f1ce 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml @@ -111,34 +111,34 @@ $defs: examples: - | - #include - pinctrl@f100000 { - compatible = "qcom,sm8450-tlmm"; - reg = <0x0f100000 0x300000>; - gpio-controller; - #gpio-cells = <2>; - gpio-ranges = <&tlmm 0 0 211>; - interrupt-controller; - #interrupt-cells = <2>; - interrupts = ; - - gpio-wo-subnode-state { - pins = "gpio1"; - function = "gpio"; - }; - - uart-w-subnodes-state { - rx-pins { - pins = "gpio26"; - function = "qup7"; - bias-pull-up; - }; - - tx-pins { - pins = "gpio27"; - function = "qup7"; - bias-disable; - }; - }; + #include + pinctrl@f100000 { + compatible = "qcom,sm8450-tlmm"; + reg = <0x0f100000 0x300000>; + gpio-controller; + #gpio-cells = <2>; + gpio-ranges = <&tlmm 0 0 211>; + interrupt-controller; + #interrupt-cells = <2>; + interrupts = ; + + gpio-wo-state { + pins = "gpio1"; + function = "gpio"; }; + + uart-w-state { + rx-pins { + pins = "gpio26"; + function = "qup7"; + bias-pull-up; + }; + + tx-pins { + pins = "gpio27"; + function = "qup7"; + bias-disable; + }; + }; + }; ... 
-- GitLab From 3cf5e17b26593db8a0293704614dd30d938b9a04 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:27 +0200 Subject: [PATCH 0283/2223] dt-bindings: pinctrl: qcom,sm8450-pinctrl: do not require function on non-GPIOs Certain pins, like SDcard related, do not have functions and such should not be required: sdc1-clk-pins: 'function' is a required property Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-22-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sm8450-pinctrl.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml index 87347e9c5f1ce..296f503c1d97c 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml @@ -53,7 +53,6 @@ $defs: description: Pinctrl node's client devices use subnodes for desired pin configuration. Client device subnodes use below standard properties. - $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" properties: pins: @@ -105,7 +104,16 @@ $defs: required: - pins - - function + + allOf: + - $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" + - if: + properties: + pins: + pattern: "^gpio([0-9]|[1-9][0-9]|1[0-9][0-9]|20[0-9])$" + then: + required: + - function additionalProperties: false -- GitLab From 9779ed30f92c47604e40dcd8f20615712f63cbca Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:28 +0200 Subject: [PATCH 0284/2223] dt-bindings: pinctrl: qcom,sm8450-pinctrl: add gpio-line-names Add common gpio-line-names property and restrict gpio-reserved-ranges to fixed size. 
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-23-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml index 296f503c1d97c..9cd97a467648a 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm8450-pinctrl.yaml @@ -27,7 +27,14 @@ properties: interrupt-controller: true '#interrupt-cells': true gpio-controller: true - gpio-reserved-ranges: true + + gpio-reserved-ranges: + minItems: 1 + maxItems: 105 + + gpio-line-names: + maxItems: 209 + '#gpio-cells': true gpio-ranges: true wakeup-parent: true -- GitLab From b76881c1288eca49c1579ed5f2bf8e6bedf25a2b Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:29 +0200 Subject: [PATCH 0285/2223] dt-bindings: pinctrl: qcom,sc7280-pinctrl: correct number of GPIOs There are 182 GPIOs on SC7280. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-24-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml index 2d228164357c2..f948a7f30f6ab 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml @@ -60,7 +60,7 @@ patternProperties: subnode. 
items: oneOf: - - pattern: "^gpio([0-9]|[1-9][0-9]|1[0-7][0-4])$" + - pattern: "^gpio([0-9]|[1-9][0-9]|1[0-7][0-9]|18[0-2])$" - enum: [ sdc1_rclk, sdc1_clk, sdc1_cmd, sdc1_data, sdc2_clk, sdc2_cmd, sdc2_data, ufs_reset ] minItems: 1 -- GitLab From c35edcef53f8ca7a07bc4bbe95f756e55a74feb0 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:30 +0200 Subject: [PATCH 0286/2223] dt-bindings: pinctrl: qcom,sc7280-pinctrl: do not require function on non-GPIOs Certain pins, like SDcard related, do not have functions and such should not be required: sdc1-clk-pins: 'function' is a required property Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-25-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sc7280-pinctrl.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml index f948a7f30f6ab..9bd5fbdde9a24 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml @@ -51,7 +51,6 @@ patternProperties: description: Pinctrl node's client devices use subnodes for desired pin configuration. Client device subnodes use below standard properties. 
- $ref: "/schemas/pinctrl/pincfg-node.yaml" properties: pins: @@ -118,7 +117,16 @@ patternProperties: required: - pins - - function + + allOf: + - $ref: /schemas/pinctrl/pincfg-node.yaml + - if: + properties: + pins: + pattern: "^gpio([0-9]|[1-9][0-9]|1[0-7][0-9]|18[0-2])$" + then: + required: + - function additionalProperties: false -- GitLab From 2f23ae0f24f7ced01195d263a1db731a754b6f00 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:31 +0200 Subject: [PATCH 0287/2223] dt-bindings: pinctrl: qcom,sc7280-pinctrl: add gpio-line-names Add common gpio-line-names property (used on SC7280 Herobrine boards). Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-26-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml index 9bd5fbdde9a24..35d3962dac584 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml @@ -42,6 +42,9 @@ properties: gpio-ranges: maxItems: 1 + gpio-line-names: + maxItems: 174 + wakeup-parent: true #PIN CONFIGURATION NODES -- GitLab From 94a0cf14d7d52cb5889a6058bb98d541209effd1 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:32 +0200 Subject: [PATCH 0288/2223] dt-bindings: pinctrl: qcom,sc7280-pinctrl: reference tlmm schema Qualcomm TLMM pin controller bindings should reference generic TLMM schema (which also pulls generic pinctrl schema). 
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-27-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml index 35d3962dac584..b29fac302e6e1 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml @@ -134,7 +134,7 @@ patternProperties: additionalProperties: false allOf: - - $ref: "pinctrl.yaml#" + - $ref: /schemas/pinctrl/qcom,tlmm-common.yaml# required: - compatible -- GitLab From 44208c8238ea49c1ff827780a08c142a82517190 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:33 +0200 Subject: [PATCH 0289/2223] dt-bindings: pinctrl: qcom,sc7280-pinctrl: fix indentation in example Bindings example should be indented with 4-spaces. 
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-28-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sc7280-pinctrl.yaml | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml index b29fac302e6e1..30e6825793917 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml @@ -150,22 +150,22 @@ additionalProperties: false examples: - | - #include - tlmm: pinctrl@f000000 { - compatible = "qcom,sc7280-pinctrl"; - reg = <0xf000000 0x1000000>; - interrupts = ; - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - gpio-ranges = <&tlmm 0 0 175>; - wakeup-parent = <&pdc>; - - qup_uart5_default: qup-uart5-pins { - pins = "gpio46", "gpio47"; - function = "qup13"; - drive-strength = <2>; - bias-disable; - }; + #include + tlmm: pinctrl@f000000 { + compatible = "qcom,sc7280-pinctrl"; + reg = <0xf000000 0x1000000>; + interrupts = ; + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + gpio-ranges = <&tlmm 0 0 175>; + wakeup-parent = <&pdc>; + + qup_uart5_default: qup-uart5-pins { + pins = "gpio46", "gpio47"; + function = "qup13"; + drive-strength = <2>; + bias-disable; }; + }; -- GitLab From 985ea2c8d8bc33eca2ba8455f64e83148c3693e8 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:34 +0200 Subject: [PATCH 0290/2223] dt-bindings: pinctrl: qcom,sc8180x-pinctrl: fix matching pin config Matching PMIC GPIOs config nodes within a '-state' node by '.*' pattern does not work as expected because of linux,phandle in the DTB: 'pins' is a required property 'function' is a required property 'rx', 'tx' do not match any of the 
regexes: 'pinctrl-[0-9]+' [[59]] is not of type 'object' Make the schema stricter and expect such nodes to be followed with a '-pins' suffix. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-29-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/qcom,sc8180x-pinctrl.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sc8180x-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sc8180x-pinctrl.yaml index 86509172603d3..646fabdf81f7a 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sc8180x-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sc8180x-pinctrl.yaml @@ -51,8 +51,9 @@ patternProperties: oneOf: - $ref: "#/$defs/qcom-sc8180x-tlmm-state" - patternProperties: - ".*": + "-pins$": $ref: "#/$defs/qcom-sc8180x-tlmm-state" + additionalProperties: false '$defs': qcom-sc8180x-tlmm-state: @@ -137,13 +138,13 @@ examples: }; uart-w-subnodes-state { - rx { + rx-pins { pins = "gpio4"; function = "qup6"; bias-pull-up; }; - tx { + tx-pins { pins = "gpio5"; function = "qup6"; bias-disable; -- GitLab From c21692d5f81dd7153244f82c1bd127603e59c24d Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:35 +0200 Subject: [PATCH 0291/2223] dt-bindings: pinctrl: qcom,sc8180x-pinctrl: do not require function on non-GPIOs Certain pins, like SDcard related, do not have functions and such should not be required: sdc1-clk-pins: 'function' is a required property Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-30-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sc8180x-pinctrl.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sc8180x-pinctrl.yaml 
b/Documentation/devicetree/bindings/pinctrl/qcom,sc8180x-pinctrl.yaml index 646fabdf81f7a..4afe20bac87c3 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sc8180x-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sc8180x-pinctrl.yaml @@ -61,7 +61,6 @@ patternProperties: description: Pinctrl node's client devices use subnodes for desired pin configuration. Client device subnodes use below standard properties. - $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" properties: pins: @@ -112,7 +111,16 @@ patternProperties: required: - pins - - function + + allOf: + - $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" + - if: + properties: + pins: + pattern: "^gpio([0-9]|[1-9][0-9]|1[0-8][0-9])$" + then: + required: + - function additionalProperties: false -- GitLab From 31fb6fc82f6a63df9543f247743e894ac453ac0c Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:36 +0200 Subject: [PATCH 0292/2223] dt-bindings: pinctrl: qcom,sc8180x-pinctrl: fix indentation in example Bindings example should be indented with 4-spaces. 
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-31-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../pinctrl/qcom,sc8180x-pinctrl.yaml | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sc8180x-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sc8180x-pinctrl.yaml index 4afe20bac87c3..b98eeba2c530b 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sc8180x-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sc8180x-pinctrl.yaml @@ -126,37 +126,37 @@ patternProperties: examples: - | - #include - pinctrl@3100000 { - compatible = "qcom,sc8180x-tlmm"; - reg = <0x03100000 0x300000>, - <0x03500000 0x700000>, - <0x03d00000 0x300000>; - reg-names = "west", "east", "south"; - interrupts = ; - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - gpio-ranges = <&tlmm 0 0 190>; - - gpio-wo-subnode-state { - pins = "gpio1"; - function = "gpio"; - }; - - uart-w-subnodes-state { - rx-pins { - pins = "gpio4"; - function = "qup6"; - bias-pull-up; - }; - - tx-pins { - pins = "gpio5"; - function = "qup6"; - bias-disable; - }; - }; + #include + pinctrl@3100000 { + compatible = "qcom,sc8180x-tlmm"; + reg = <0x03100000 0x300000>, + <0x03500000 0x700000>, + <0x03d00000 0x300000>; + reg-names = "west", "east", "south"; + interrupts = ; + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + gpio-ranges = <&tlmm 0 0 190>; + + gpio-wo-subnode-state { + pins = "gpio1"; + function = "gpio"; }; + + uart-w-subnodes-state { + rx-pins { + pins = "gpio4"; + function = "qup6"; + bias-pull-up; + }; + + tx-pins { + pins = "gpio5"; + function = "qup6"; + bias-disable; + }; + }; + }; ... 
-- GitLab From 22b4fb602283e6f8807225d84a7918fd2961bff5 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:37 +0200 Subject: [PATCH 0293/2223] dt-bindings: pinctrl: qcom,sc8280xp-pinctrl: fix matching pin config Matching PMIC GPIOs config nodes within a '-state' node by '.*' pattern does not work as expected because of linux,phandle in the DTB: 'pins' is a required property 'function' is a required property 'rx', 'tx' do not match any of the regexes: 'pinctrl-[0-9]+' [[59]] is not of type 'object' Make the schema stricter and expect such nodes to be followed with a '-pins' suffix. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-32-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/qcom,sc8280xp-pinctrl.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-pinctrl.yaml index 87a381c9a19dc..5147afc28721e 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-pinctrl.yaml @@ -43,8 +43,9 @@ patternProperties: oneOf: - $ref: "#/$defs/qcom-sc8280xp-tlmm-state" - patternProperties: - ".*": + "-pins$": $ref: "#/$defs/qcom-sc8280xp-tlmm-state" + additionalProperties: false '$defs': qcom-sc8280xp-tlmm-state: @@ -135,13 +136,13 @@ examples: }; uart-w-subnodes-state { - rx { + rx-pins { pins = "gpio4"; function = "qup14"; bias-pull-up; }; - tx { + tx-pins { pins = "gpio5"; function = "qup14"; bias-disable; -- GitLab From 3fb7fe5d3a3ee76416f862ea25c275357820b294 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:38 +0200 Subject: [PATCH 0294/2223] dt-bindings: pinctrl: qcom,sc8280xp-pinctrl: do not require function on non-GPIOs Certain pins, like SDcard related, do not have 
functions and such should not be required: sdc1-clk-pins: 'function' is a required property Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-33-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../bindings/pinctrl/qcom,sc8280xp-pinctrl.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-pinctrl.yaml index 5147afc28721e..8610f27013881 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-pinctrl.yaml @@ -53,7 +53,6 @@ patternProperties: description: Pinctrl node's client devices use subnodes for desired pin configuration. Client device subnodes use below standard properties. - $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" properties: pins: @@ -113,7 +112,16 @@ patternProperties: required: - pins - - function + + allOf: + - $ref: "qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state" + - if: + properties: + pins: + pattern: "^gpio([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-1][0-9]|22[0-7])$" + then: + required: + - function additionalProperties: false -- GitLab From ee83ef13dc405f6b55ad8d931cd0df9dee3a8ae8 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 12 Sep 2022 08:17:39 +0200 Subject: [PATCH 0295/2223] dt-bindings: pinctrl: qcom,sc8280xp-pinctrl: fix indentation in example Bindings example should be indented with 4-spaces. 
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220912061746.6311-34-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij --- .../pinctrl/qcom,sc8280xp-pinctrl.yaml | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-pinctrl.yaml index 8610f27013881..b9ab130cd558d 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-pinctrl.yaml @@ -127,34 +127,34 @@ patternProperties: examples: - | - #include - pinctrl@f100000 { - compatible = "qcom,sc8280xp-tlmm"; - reg = <0x0f100000 0x300000>; - interrupts = ; - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - gpio-ranges = <&tlmm 0 0 230>; - - gpio-wo-subnode-state { - pins = "gpio1"; - function = "gpio"; - }; - - uart-w-subnodes-state { - rx-pins { - pins = "gpio4"; - function = "qup14"; - bias-pull-up; - }; - - tx-pins { - pins = "gpio5"; - function = "qup14"; - bias-disable; - }; - }; + #include + pinctrl@f100000 { + compatible = "qcom,sc8280xp-tlmm"; + reg = <0x0f100000 0x300000>; + interrupts = ; + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + gpio-ranges = <&tlmm 0 0 230>; + + gpio-wo-subnode-state { + pins = "gpio1"; + function = "gpio"; }; + + uart-w-subnodes-state { + rx-pins { + pins = "gpio4"; + function = "qup14"; + bias-pull-up; + }; + + tx-pins { + pins = "gpio5"; + function = "qup14"; + bias-disable; + }; + }; + }; ... -- GitLab From 34b4d20399e6fad2e3379b11e68dff1d1549274e Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 13 Sep 2022 09:44:34 +0000 Subject: [PATCH 0296/2223] KVM: arm64: Use visibility hook to treat ID regs as RAZ The generic id reg accessors already handle RAZ registers by way of the visibility hook. 
Add a visibility hook that returns REG_RAZ unconditionally and throw out the RAZ specific accessors. Reviewed-by: Reiji Watanabe Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220913094441.3957645-2-oliver.upton@linux.dev --- arch/arm64/kvm/sys_regs.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 3234f50b8c4b2..e18efb9211f02 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1145,6 +1145,12 @@ static unsigned int id_visibility(const struct kvm_vcpu *vcpu, return 0; } +static unsigned int raz_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + return REG_RAZ; +} + /* cpufeature ID register access trap handlers */ static bool __access_id_reg(struct kvm_vcpu *vcpu, @@ -1168,13 +1174,6 @@ static bool access_id_reg(struct kvm_vcpu *vcpu, return __access_id_reg(vcpu, p, r, raz); } -static bool access_raz_id_reg(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - return __access_id_reg(vcpu, p, r, true); -} - /* Visibility overrides for SVE-specific control registers */ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) @@ -1262,12 +1261,6 @@ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return __set_id_reg(vcpu, rd, val, raz); } -static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) -{ - return __set_id_reg(vcpu, rd, val, true); -} - static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 *val) { @@ -1374,9 +1367,10 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, */ #define ID_UNALLOCATED(crm, op2) { \ Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \ - .access = access_raz_id_reg, \ - .get_user = get_raz_reg, \ - .set_user = set_raz_id_reg, \ + .access = access_id_reg, \ + .get_user 
= get_id_reg, \ + .set_user = set_id_reg, \ + .visibility = raz_visibility \ } /* @@ -1386,9 +1380,10 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, */ #define ID_HIDDEN(name) { \ SYS_DESC(SYS_##name), \ - .access = access_raz_id_reg, \ - .get_user = get_raz_reg, \ - .set_user = set_raz_id_reg, \ + .access = access_id_reg, \ + .get_user = get_id_reg, \ + .set_user = set_id_reg, \ + .visibility = raz_visibility, \ } /* -- GitLab From 4782ccc8ef50fabb70bab9fa73186285dba6d91d Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 13 Sep 2022 09:44:35 +0000 Subject: [PATCH 0297/2223] KVM: arm64: Remove internal accessor helpers for id regs The internal accessors are only ever called once. Dump out their contents in the caller. No functional change intended. Signed-off-by: Oliver Upton Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220913094441.3957645-3-oliver.upton@linux.dev --- arch/arm64/kvm/sys_regs.c | 46 ++++++++++----------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e18efb9211f02..26210f3a0b271 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1153,25 +1153,17 @@ static unsigned int raz_visibility(const struct kvm_vcpu *vcpu, /* cpufeature ID register access trap handlers */ -static bool __access_id_reg(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r, - bool raz) -{ - if (p->is_write) - return write_to_read_only(vcpu, p, r); - - p->regval = read_id_reg(vcpu, r, raz); - return true; -} - static bool access_id_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { bool raz = sysreg_visible_as_raz(vcpu, r); - return __access_id_reg(vcpu, p, r, raz); + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = read_id_reg(vcpu, r, raz); + return true; } /* Visibility overrides for SVE-specific control 
registers */ @@ -1226,31 +1218,13 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, * are stored, and for set_id_reg() we don't allow the effective value * to be changed. */ -static int __get_id_reg(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, u64 *val, - bool raz) -{ - *val = read_id_reg(vcpu, rd, raz); - return 0; -} - -static int __set_id_reg(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, u64 val, - bool raz) -{ - /* This is what we mean by invariant: you can't change it. */ - if (val != read_id_reg(vcpu, rd, raz)) - return -EINVAL; - - return 0; -} - static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 *val) { bool raz = sysreg_visible_as_raz(vcpu, rd); - return __get_id_reg(vcpu, rd, val, raz); + *val = read_id_reg(vcpu, rd, raz); + return 0; } static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, @@ -1258,7 +1232,11 @@ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, { bool raz = sysreg_visible_as_raz(vcpu, rd); - return __set_id_reg(vcpu, rd, val, raz); + /* This is what we mean by invariant: you can't change it. */ + if (val != read_id_reg(vcpu, rd, raz)) + return -EINVAL; + + return 0; } static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, -- GitLab From cdd5036d048ca96ef5212fb37f4f56db40cb1bc2 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 13 Sep 2022 09:44:36 +0000 Subject: [PATCH 0298/2223] KVM: arm64: Drop raz parameter from read_id_reg() There is no longer a need for caller-specified RAZ visibility. Hoist the call to sysreg_visible_as_raz() into read_id_reg() and drop the parameter. No functional change intended. 
Suggested-by: Reiji Watanabe Signed-off-by: Oliver Upton Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220913094441.3957645-4-oliver.upton@linux.dev --- arch/arm64/kvm/sys_regs.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 26210f3a0b271..0e20a311ea209 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1063,13 +1063,12 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, } /* Read a sanitised cpufeature ID register by sys_reg_desc */ -static u64 read_id_reg(const struct kvm_vcpu *vcpu, - struct sys_reg_desc const *r, bool raz) +static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) { u32 id = reg_to_encoding(r); u64 val; - if (raz) + if (sysreg_visible_as_raz(vcpu, r)) return 0; val = read_sanitised_ftr_reg(id); @@ -1157,12 +1156,10 @@ static bool access_id_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - bool raz = sysreg_visible_as_raz(vcpu, r); - if (p->is_write) return write_to_read_only(vcpu, p, r); - p->regval = read_id_reg(vcpu, r, raz); + p->regval = read_id_reg(vcpu, r); return true; } @@ -1199,7 +1196,7 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, return -EINVAL; /* We can only differ with CSV[23], and anything else is an error */ - val ^= read_id_reg(vcpu, rd, false); + val ^= read_id_reg(vcpu, rd); val &= ~((0xFUL << ID_AA64PFR0_CSV2_SHIFT) | (0xFUL << ID_AA64PFR0_CSV3_SHIFT)); if (val) @@ -1221,19 +1218,15 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 *val) { - bool raz = sysreg_visible_as_raz(vcpu, rd); - - *val = read_id_reg(vcpu, rd, raz); + *val = read_id_reg(vcpu, rd); return 0; } static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { - bool raz = sysreg_visible_as_raz(vcpu, rd); - 
/* This is what we mean by invariant: you can't change it. */ - if (val != read_id_reg(vcpu, rd, raz)) + if (val != read_id_reg(vcpu, rd)) return -EINVAL; return 0; -- GitLab From 5d9a718b64e428a40939806873ecf16f072008b3 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 13 Sep 2022 09:44:37 +0000 Subject: [PATCH 0299/2223] KVM: arm64: Spin off helper for calling visibility hook No functional change intended. Reviewed-by: Reiji Watanabe Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220913094441.3957645-5-oliver.upton@linux.dev --- arch/arm64/kvm/sys_regs.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index a8c4cc32eb9af..e78b510596223 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -136,22 +136,25 @@ static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r __vcpu_sys_reg(vcpu, r->reg) = r->val; } -static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *r) +static inline unsigned int sysreg_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) { if (likely(!r->visibility)) - return false; + return 0; - return r->visibility(vcpu, r) & REG_HIDDEN; + return r->visibility(vcpu, r); +} + +static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + return sysreg_visibility(vcpu, r) & REG_HIDDEN; } static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - if (likely(!r->visibility)) - return false; - - return r->visibility(vcpu, r) & REG_RAZ; + return sysreg_visibility(vcpu, r) & REG_RAZ; } static inline int cmp_sys_reg(const struct sys_reg_desc *i1, -- GitLab From 4de06e4c1dc949c35c16e4423b4ccd735264b0a9 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 13 Sep 2022 09:44:38 +0000 Subject: [PATCH 0300/2223] KVM: arm64: Add a visibility bit to 
ignore user writes We're about to ignore writes to AArch32 ID registers on AArch64-only systems. Add a bit to indicate a register is handled as write ignore when accessed from userspace. Signed-off-by: Oliver Upton Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220913094441.3957645-6-oliver.upton@linux.dev --- arch/arm64/kvm/sys_regs.c | 3 +++ arch/arm64/kvm/sys_regs.h | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 0e20a311ea209..6d0511247df4e 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2775,6 +2775,9 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, if (!r) return -ENOENT; + if (sysreg_user_write_ignore(vcpu, r)) + return 0; + if (r->set_user) { ret = (r->set_user)(vcpu, r, val); } else { diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index e78b510596223..e4ebb3a379fdb 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -86,6 +86,7 @@ struct sys_reg_desc { #define REG_HIDDEN (1 << 0) /* hidden from userspace and guest */ #define REG_RAZ (1 << 1) /* RAZ from userspace and guest */ +#define REG_USER_WI (1 << 2) /* WI from userspace only */ static __printf(2, 3) inline void print_sys_reg_msg(const struct sys_reg_params *p, @@ -157,6 +158,12 @@ static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu, return sysreg_visibility(vcpu, r) & REG_RAZ; } +static inline bool sysreg_user_write_ignore(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + return sysreg_visibility(vcpu, r) & REG_USER_WI; +} + static inline int cmp_sys_reg(const struct sys_reg_desc *i1, const struct sys_reg_desc *i2) { -- GitLab From d5efec7ed826b3b29c6847bf59383d8d07347a4e Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 13 Sep 2022 09:44:39 +0000 Subject: [PATCH 0301/2223] KVM: arm64: Treat 32bit ID registers as RAZ/WI on 64bit-only system One of the 
oddities of the architecture is that the AArch64 views of the AArch32 ID registers are UNKNOWN if AArch32 isn't implemented at any EL. Nonetheless, KVM exposes these registers to userspace for the sake of save/restore. It is possible that the UNKNOWN value could differ between systems, leading to a rejected write from userspace. Avoid the issue altogether by handling the AArch32 ID registers as RAZ/WI when on an AArch64-only system. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220913094441.3957645-7-oliver.upton@linux.dev --- arch/arm64/kvm/sys_regs.c | 63 ++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 6d0511247df4e..9569772cf09a5 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1144,6 +1144,20 @@ static unsigned int id_visibility(const struct kvm_vcpu *vcpu, return 0; } +static unsigned int aa32_id_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + /* + * AArch32 ID registers are UNKNOWN if AArch32 isn't implemented at any + * EL. Promote to RAZ/WI in order to guarantee consistency between + * systems. 
+ */ + if (!kvm_supports_32bit_el0()) + return REG_RAZ | REG_USER_WI; + + return id_visibility(vcpu, r); +} + static unsigned int raz_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { @@ -1331,6 +1345,15 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, .visibility = id_visibility, \ } +/* sys_reg_desc initialiser for known cpufeature ID registers */ +#define AA32_ID_SANITISED(name) { \ + SYS_DESC(SYS_##name), \ + .access = access_id_reg, \ + .get_user = get_id_reg, \ + .set_user = set_id_reg, \ + .visibility = aa32_id_visibility, \ +} + /* * sys_reg_desc initialiser for architecturally unallocated cpufeature ID * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2 @@ -1418,33 +1441,33 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* AArch64 mappings of the AArch32 ID registers */ /* CRm=1 */ - ID_SANITISED(ID_PFR0_EL1), - ID_SANITISED(ID_PFR1_EL1), - ID_SANITISED(ID_DFR0_EL1), + AA32_ID_SANITISED(ID_PFR0_EL1), + AA32_ID_SANITISED(ID_PFR1_EL1), + AA32_ID_SANITISED(ID_DFR0_EL1), ID_HIDDEN(ID_AFR0_EL1), - ID_SANITISED(ID_MMFR0_EL1), - ID_SANITISED(ID_MMFR1_EL1), - ID_SANITISED(ID_MMFR2_EL1), - ID_SANITISED(ID_MMFR3_EL1), + AA32_ID_SANITISED(ID_MMFR0_EL1), + AA32_ID_SANITISED(ID_MMFR1_EL1), + AA32_ID_SANITISED(ID_MMFR2_EL1), + AA32_ID_SANITISED(ID_MMFR3_EL1), /* CRm=2 */ - ID_SANITISED(ID_ISAR0_EL1), - ID_SANITISED(ID_ISAR1_EL1), - ID_SANITISED(ID_ISAR2_EL1), - ID_SANITISED(ID_ISAR3_EL1), - ID_SANITISED(ID_ISAR4_EL1), - ID_SANITISED(ID_ISAR5_EL1), - ID_SANITISED(ID_MMFR4_EL1), - ID_SANITISED(ID_ISAR6_EL1), + AA32_ID_SANITISED(ID_ISAR0_EL1), + AA32_ID_SANITISED(ID_ISAR1_EL1), + AA32_ID_SANITISED(ID_ISAR2_EL1), + AA32_ID_SANITISED(ID_ISAR3_EL1), + AA32_ID_SANITISED(ID_ISAR4_EL1), + AA32_ID_SANITISED(ID_ISAR5_EL1), + AA32_ID_SANITISED(ID_MMFR4_EL1), + AA32_ID_SANITISED(ID_ISAR6_EL1), /* CRm=3 */ - ID_SANITISED(MVFR0_EL1), - ID_SANITISED(MVFR1_EL1), - ID_SANITISED(MVFR2_EL1), + AA32_ID_SANITISED(MVFR0_EL1), + 
AA32_ID_SANITISED(MVFR1_EL1), + AA32_ID_SANITISED(MVFR2_EL1), ID_UNALLOCATED(3,3), - ID_SANITISED(ID_PFR2_EL1), + AA32_ID_SANITISED(ID_PFR2_EL1), ID_HIDDEN(ID_DFR1_EL1), - ID_SANITISED(ID_MMFR5_EL1), + AA32_ID_SANITISED(ID_MMFR5_EL1), ID_UNALLOCATED(3,7), /* AArch64 ID registers */ -- GitLab From 797b84517c190053597e3f7e03ead15da872e04d Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 13 Sep 2022 09:44:40 +0000 Subject: [PATCH 0302/2223] KVM: selftests: Add test for AArch32 ID registers Add a test to assert that KVM handles the AArch64 views of the AArch32 ID registers as RAZ/WI (writable only from userspace). For registers that were already hidden or unallocated, expect RAZ + invariant behavior. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220913094441.3957645-8-oliver.upton@linux.dev --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/aarch64/aarch32_id_regs.c | 169 ++++++++++++++++++ 3 files changed, 171 insertions(+) create mode 100644 tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index d625a3f837806..87d1a0b1bae04 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +/aarch64/aarch32_id_regs /aarch64/arch_timer /aarch64/debug-exceptions /aarch64/get-reg-list diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 4c122f1b17378..784abe7f09625 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -144,6 +144,7 @@ TEST_GEN_PROGS_x86_64 += system_counter_offset_test # Compiled outputs used by test targets TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test +TEST_GEN_PROGS_aarch64 += aarch64/aarch32_id_regs TEST_GEN_PROGS_aarch64 += aarch64/arch_timer TEST_GEN_PROGS_aarch64 
+= aarch64/debug-exceptions TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list diff --git a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c new file mode 100644 index 0000000000000..6f9c1f19c7f64 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * aarch32_id_regs - Test for ID register behavior on AArch64-only systems + * + * Copyright (c) 2022 Google LLC. + * + * Test that KVM handles the AArch64 views of the AArch32 ID registers as RAZ + * and WI from userspace. + */ + +#include + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +#define BAD_ID_REG_VAL 0x1badc0deul + +#define GUEST_ASSERT_REG_RAZ(reg) GUEST_ASSERT_EQ(read_sysreg_s(reg), 0) + +static void guest_main(void) +{ + GUEST_ASSERT_REG_RAZ(SYS_ID_PFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_PFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_DFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_AFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR2_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR3_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR2_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR3_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR4_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR5_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR4_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR6_EL1); + GUEST_ASSERT_REG_RAZ(SYS_MVFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_MVFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_MVFR2_EL1); + GUEST_ASSERT_REG_RAZ(sys_reg(3, 0, 0, 3, 3)); + GUEST_ASSERT_REG_RAZ(SYS_ID_PFR2_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_DFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR5_EL1); + GUEST_ASSERT_REG_RAZ(sys_reg(3, 0, 0, 3, 7)); + + GUEST_DONE(); +} + +static void test_guest_raz(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + + switch 
(get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } +} + +static uint64_t raz_wi_reg_ids[] = { + KVM_ARM64_SYS_REG(SYS_ID_PFR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_PFR1_EL1), + KVM_ARM64_SYS_REG(SYS_ID_DFR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR1_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR3_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR1_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR3_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR4_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR5_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR4_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR6_EL1), + KVM_ARM64_SYS_REG(SYS_MVFR0_EL1), + KVM_ARM64_SYS_REG(SYS_MVFR1_EL1), + KVM_ARM64_SYS_REG(SYS_MVFR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_PFR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR5_EL1), +}; + +static void test_user_raz_wi(struct kvm_vcpu *vcpu) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(raz_wi_reg_ids); i++) { + uint64_t reg_id = raz_wi_reg_ids[i]; + uint64_t val; + + vcpu_get_reg(vcpu, reg_id, &val); + ASSERT_EQ(val, 0); + + /* + * Expect the ioctl to succeed with no effect on the register + * value. 
+ */ + vcpu_set_reg(vcpu, reg_id, BAD_ID_REG_VAL); + + vcpu_get_reg(vcpu, reg_id, &val); + ASSERT_EQ(val, 0); + } +} + +static uint64_t raz_invariant_reg_ids[] = { + KVM_ARM64_SYS_REG(SYS_ID_AFR0_EL1), + KVM_ARM64_SYS_REG(sys_reg(3, 0, 0, 3, 3)), + KVM_ARM64_SYS_REG(SYS_ID_DFR1_EL1), + KVM_ARM64_SYS_REG(sys_reg(3, 0, 0, 3, 7)), +}; + +static void test_user_raz_invariant(struct kvm_vcpu *vcpu) +{ + int i, r; + + for (i = 0; i < ARRAY_SIZE(raz_invariant_reg_ids); i++) { + uint64_t reg_id = raz_invariant_reg_ids[i]; + uint64_t val; + + vcpu_get_reg(vcpu, reg_id, &val); + ASSERT_EQ(val, 0); + + r = __vcpu_set_reg(vcpu, reg_id, BAD_ID_REG_VAL); + TEST_ASSERT(r < 0 && errno == EINVAL, + "unexpected KVM_SET_ONE_REG error: r=%d, errno=%d", r, errno); + + vcpu_get_reg(vcpu, reg_id, &val); + ASSERT_EQ(val, 0); + } +} + + + +static bool vcpu_aarch64_only(struct kvm_vcpu *vcpu) +{ + uint64_t val, el0; + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &val); + + el0 = (val & ARM64_FEATURE_MASK(ID_AA64PFR0_EL0)) >> ID_AA64PFR0_EL0_SHIFT; + return el0 == ID_AA64PFR0_ELx_64BIT_ONLY; +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + + TEST_REQUIRE(vcpu_aarch64_only(vcpu)); + + ucall_init(vm, NULL); + + test_user_raz_wi(vcpu); + test_user_raz_invariant(vcpu); + test_guest_raz(vcpu); + + ucall_uninit(vm); + kvm_vm_free(vm); +} -- GitLab From 3f668365bcd8d17b4bcd0fdb62e5c748753196ec Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Fri, 9 Sep 2022 08:38:02 -0700 Subject: [PATCH 0303/2223] pinctrl: ocelot: add help and description information to ocelot pinctrl kconfig Add missed help information and module export name to the Microsemi Ocelot and Jaguar2 SoC. 
Signed-off-by: Colin Foster Link: https://lore.kernel.org/r/20220909153802.3370088-1-colin.foster@in-advantage.com Signed-off-by: Linus Walleij --- drivers/pinctrl/Kconfig | 5 +++++ drivers/pinctrl/pinctrl-ocelot.c | 2 ++ 2 files changed, 7 insertions(+) diff --git a/drivers/pinctrl/Kconfig b/drivers/pinctrl/Kconfig index c09562fbb1b75..da87f2dc358bc 100644 --- a/drivers/pinctrl/Kconfig +++ b/drivers/pinctrl/Kconfig @@ -335,6 +335,11 @@ config PINCTRL_OCELOT select GENERIC_PINMUX_FUNCTIONS select OF_GPIO select REGMAP_MMIO + help + Support for the internal GPIO interfaces on Microsemi Ocelot and + Jaguar2 SoCs. + + If conpiled as a module, the module name will be pinctrl-ocelot. config PINCTRL_OXNAS bool diff --git a/drivers/pinctrl/pinctrl-ocelot.c b/drivers/pinctrl/pinctrl-ocelot.c index c5fd154990c8b..647e91490bacd 100644 --- a/drivers/pinctrl/pinctrl-ocelot.c +++ b/drivers/pinctrl/pinctrl-ocelot.c @@ -2052,4 +2052,6 @@ static struct platform_driver ocelot_pinctrl_driver = { .probe = ocelot_pinctrl_probe, }; module_platform_driver(ocelot_pinctrl_driver); + +MODULE_DESCRIPTION("Ocelot Chip Pinctrl Driver"); MODULE_LICENSE("Dual MIT/GPL"); -- GitLab From 7984b43542070f5888546d95b48003c4a8af7c0f Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Wed, 14 Sep 2022 05:34:49 -0700 Subject: [PATCH 0304/2223] Input: synaptics - enable InterTouch for the ThinkPad P1 G3 Noticed this while trying to debug some unrelated issues: this laptop has the ability to use rmi4 but doesn't by default. So let's fix that. Tested locally, including mouse buttons, on my ThinkPad P1 G3. This might also enable the X1 Extreme G3, but I don't have such a system to test locally (presumably Mark can chime in if that's the case). 
Signed-off-by: Lyude Paul Link: https://lore.kernel.org/r/20220909202127.141761-1-lyude@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/synaptics.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index e3f657713b557..4971ec9f9748f 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -182,6 +182,7 @@ static const char * const smbus_pnp_ids[] = { "LEN0099", /* X1 Extreme Gen 1 / P1 Gen 1 */ "LEN009b", /* T580 */ "LEN0402", /* X1 Extreme Gen 2 / P1 Gen 2 */ + "LEN040f", /* P1 Gen 3 */ "LEN200f", /* T450s */ "LEN2044", /* L470 */ "LEN2054", /* E480 */ -- GitLab From 92858eb6cb64cfafdc2b35c942d1812275f4205a Mon Sep 17 00:00:00 2001 From: Peter Chiu Date: Mon, 12 Sep 2022 17:24:40 +0800 Subject: [PATCH 0305/2223] dt-bindings: pinctrl: update bindings for MT7986 SoC Add wifi pins in the description and set 'maxItems' for groups and pins. Reviewed-by: Sam Shih Signed-off-by: Peter Chiu Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20220912092440.21011-1-chui-hao.chiu@mediatek.com Signed-off-by: Linus Walleij --- .../pinctrl/mediatek,mt7986-pinctrl.yaml | 48 +++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/mediatek,mt7986-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/mediatek,mt7986-pinctrl.yaml index 4eadea55df10f..06c819ae7d509 100644 --- a/Documentation/devicetree/bindings/pinctrl/mediatek,mt7986-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/mediatek,mt7986-pinctrl.yaml @@ -117,6 +117,10 @@ patternProperties: "i2s" "audio" 62, 63, 64, 65 "switch_int" "eth" 66 "mdc_mdio" "eth" 67 + "wf_2g" "wifi" 74, 75, 76, 77, 78, 79, 80, 81, 82, 83 + "wf_5g" "wifi" 91, 92, 93, 94, 95, 96, 97, 98, 99, 100 + "wf_dbdc" "wifi" 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, + 84, 85 $ref: "/schemas/pinctrl/pinmux-node.yaml" properties: @@ -234,7 +238,9 @@ 
patternProperties: then: properties: groups: - enum: [wf_2g, wf_5g, wf_dbdc] + items: + enum: [wf_2g, wf_5g, wf_dbdc] + maxItems: 3 '.*conf.*': type: object additionalProperties: false @@ -248,25 +254,27 @@ patternProperties: An array of strings. Each string contains the name of a pin. There is no PIN 41 to PIN 65 above on mt7686b, you can only use those pins on mt7986a. - enum: [SYS_WATCHDOG, WF2G_LED, WF5G_LED, I2C_SCL, I2C_SDA, GPIO_0, - GPIO_1, GPIO_2, GPIO_3, GPIO_4, GPIO_5, GPIO_6, GPIO_7, - GPIO_8, GPIO_9, GPIO_10, GPIO_11, GPIO_12, GPIO_13, GPIO_14, - GPIO_15, PWM0, PWM1, SPI0_CLK, SPI0_MOSI, SPI0_MISO, SPI0_CS, - SPI0_HOLD, SPI0_WP, SPI1_CLK, SPI1_MOSI, SPI1_MISO, SPI1_CS, - SPI2_CLK, SPI2_MOSI, SPI2_MISO, SPI2_CS, SPI2_HOLD, SPI2_WP, - UART0_RXD, UART0_TXD, PCIE_PERESET_N, UART1_RXD, UART1_TXD, - UART1_CTS, UART1_RTS, UART2_RXD, UART2_TXD, UART2_CTS, - UART2_RTS, EMMC_DATA_0, EMMC_DATA_1, EMMC_DATA_2, - EMMC_DATA_3, EMMC_DATA_4, EMMC_DATA_5, EMMC_DATA_6, - EMMC_DATA_7, EMMC_CMD, EMMC_CK, EMMC_DSL, EMMC_RSTB, PCM_DTX, - PCM_DRX, PCM_CLK, PCM_FS, MT7531_INT, SMI_MDC, SMI_MDIO, - WF0_DIG_RESETB, WF0_CBA_RESETB, WF0_XO_REQ, WF0_TOP_CLK, - WF0_TOP_DATA, WF0_HB1, WF0_HB2, WF0_HB3, WF0_HB4, WF0_HB0, - WF0_HB0_B, WF0_HB5, WF0_HB6, WF0_HB7, WF0_HB8, WF0_HB9, - WF0_HB10, WF1_DIG_RESETB, WF1_CBA_RESETB, WF1_XO_REQ, - WF1_TOP_CLK, WF1_TOP_DATA, WF1_HB1, WF1_HB2, WF1_HB3, - WF1_HB4, WF1_HB0, WF1_HB0_B, WF1_HB5, WF1_HB6, WF1_HB7, - WF1_HB8] + items: + enum: [SYS_WATCHDOG, WF2G_LED, WF5G_LED, I2C_SCL, I2C_SDA, GPIO_0, + GPIO_1, GPIO_2, GPIO_3, GPIO_4, GPIO_5, GPIO_6, GPIO_7, + GPIO_8, GPIO_9, GPIO_10, GPIO_11, GPIO_12, GPIO_13, GPIO_14, + GPIO_15, PWM0, PWM1, SPI0_CLK, SPI0_MOSI, SPI0_MISO, SPI0_CS, + SPI0_HOLD, SPI0_WP, SPI1_CLK, SPI1_MOSI, SPI1_MISO, SPI1_CS, + SPI2_CLK, SPI2_MOSI, SPI2_MISO, SPI2_CS, SPI2_HOLD, SPI2_WP, + UART0_RXD, UART0_TXD, PCIE_PERESET_N, UART1_RXD, UART1_TXD, + UART1_CTS, UART1_RTS, UART2_RXD, UART2_TXD, UART2_CTS, + UART2_RTS, EMMC_DATA_0, 
EMMC_DATA_1, EMMC_DATA_2, + EMMC_DATA_3, EMMC_DATA_4, EMMC_DATA_5, EMMC_DATA_6, + EMMC_DATA_7, EMMC_CMD, EMMC_CK, EMMC_DSL, EMMC_RSTB, PCM_DTX, + PCM_DRX, PCM_CLK, PCM_FS, MT7531_INT, SMI_MDC, SMI_MDIO, + WF0_DIG_RESETB, WF0_CBA_RESETB, WF0_XO_REQ, WF0_TOP_CLK, + WF0_TOP_DATA, WF0_HB1, WF0_HB2, WF0_HB3, WF0_HB4, WF0_HB0, + WF0_HB0_B, WF0_HB5, WF0_HB6, WF0_HB7, WF0_HB8, WF0_HB9, + WF0_HB10, WF1_DIG_RESETB, WF1_CBA_RESETB, WF1_XO_REQ, + WF1_TOP_CLK, WF1_TOP_DATA, WF1_HB1, WF1_HB2, WF1_HB3, + WF1_HB4, WF1_HB0, WF1_HB0_B, WF1_HB5, WF1_HB6, WF1_HB7, + WF1_HB8] + maxItems: 101 bias-disable: true -- GitLab From a6b9ede1f3dfa5477791ad92d11f60f50998b689 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 5 Sep 2022 19:15:23 -0700 Subject: [PATCH 0306/2223] PCI: apple: Do not leak reset GPIO on unbind/unload/error The driver allocates reset GPIO in apple_pcie_setup_port() but neither releases the resource, nor uses devm API to have it released automatically. Let's fix this by switching to devm API. While at it let's use generic devm_fwnode_gpiod_get() instead of OF-specific gpiod_get_from_of_node() - this will allow us to stop exporting the latter down the road.
Link: https://lore.kernel.org/r/YxatO5OaI2RpxQ2M@google.com Fixes: 1e33888fbe44 ("PCI: apple: Add initial hardware bring-up") Signed-off-by: Dmitry Torokhov Signed-off-by: Lorenzo Pieralisi Reviewed-by: Hector Martin Acked-by: Marc Zyngier --- drivers/pci/controller/pcie-apple.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/pcie-apple.c b/drivers/pci/controller/pcie-apple.c index a2c3c207a04b7..66f37e403a09c 100644 --- a/drivers/pci/controller/pcie-apple.c +++ b/drivers/pci/controller/pcie-apple.c @@ -516,8 +516,8 @@ static int apple_pcie_setup_port(struct apple_pcie *pcie, u32 stat, idx; int ret, i; - reset = gpiod_get_from_of_node(np, "reset-gpios", 0, - GPIOD_OUT_LOW, "PERST#"); + reset = devm_fwnode_gpiod_get(pcie->dev, of_fwnode_handle(np), "reset", + GPIOD_OUT_LOW, "PERST#"); if (IS_ERR(reset)) return PTR_ERR(reset); -- GitLab From 612a0f0b93c8c8d3f0ab610c69e0b6678362643b Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Wed, 14 Sep 2022 12:30:20 +0200 Subject: [PATCH 0307/2223] dt-bindings: input: Convert mtk-pmic-keys to DT schema Convert the mtk-pmic-keys to DT schema format. The old binding was missing documentation for key press/release interrupts, even though it was supported in hardware and driver, so support for the same was added during the conversion. 
Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: Mattijs Korpershoek Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20220914103021.43593-2-angelogioacchino.delregno@collabora.com Signed-off-by: Dmitry Torokhov --- .../bindings/input/mediatek,pmic-keys.yaml | 113 ++++++++++++++++++ .../bindings/input/mtk-pmic-keys.txt | 46 ------- 2 files changed, 113 insertions(+), 46 deletions(-) create mode 100644 Documentation/devicetree/bindings/input/mediatek,pmic-keys.yaml delete mode 100644 Documentation/devicetree/bindings/input/mtk-pmic-keys.txt diff --git a/Documentation/devicetree/bindings/input/mediatek,pmic-keys.yaml b/Documentation/devicetree/bindings/input/mediatek,pmic-keys.yaml new file mode 100644 index 0000000000000..9d8a0c3aebcaf --- /dev/null +++ b/Documentation/devicetree/bindings/input/mediatek,pmic-keys.yaml @@ -0,0 +1,113 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/mediatek,pmic-keys.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek PMIC Keys + +maintainers: + - Chen Zhong + +allOf: + - $ref: input.yaml# + +description: | + There are two key functions provided by MT6397, MT6323 and other MediaTek + PMICs: pwrkey and homekey. + The key functions are defined as the subnode of the function node provided + by the PMIC that is defined as a Multi-Function Device (MFD). 
+ + For MediaTek MT6323/MT6397 PMIC bindings see + Documentation/devicetree/bindings/mfd/mt6397.txt + +properties: + compatible: + enum: + - mediatek,mt6323-keys + - mediatek,mt6358-keys + - mediatek,mt6397-keys + + power-off-time-sec: true + + mediatek,long-press-mode: + description: | + Key long-press force shutdown setting + 0 - disabled + 1 - pwrkey + 2 - pwrkey+homekey + $ref: /schemas/types.yaml#/definitions/uint32 + default: 0 + maximum: 2 + +patternProperties: + "^((power|home)|(key-[a-z0-9-]+|[a-z0-9-]+-key))$": + $ref: input.yaml# + + properties: + interrupts: + minItems: 1 + items: + - description: Key press interrupt + - description: Key release interrupt + + interrupt-names: true + + linux-keycodes: + maxItems: 1 + + wakeup-source: true + + required: + - linux,keycodes + + if: + properties: + interrupt-names: + contains: + const: powerkey + then: + properties: + interrupt-names: + minItems: 1 + items: + - const: powerkey + - const: powerkey_r + else: + properties: + interrupt-names: + minItems: 1 + items: + - const: homekey + - const: homekey_r + + unevaluatedProperties: false + +required: + - compatible + +unevaluatedProperties: false + +examples: + - | + #include + #include + + pmic { + compatible = "mediatek,mt6397"; + + keys { + compatible = "mediatek,mt6397-keys"; + mediatek,long-press-mode = <1>; + power-off-time-sec = <0>; + + key-power { + linux,keycodes = ; + wakeup-source; + }; + + key-home { + linux,keycodes = ; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/input/mtk-pmic-keys.txt b/Documentation/devicetree/bindings/input/mtk-pmic-keys.txt deleted file mode 100644 index 9d00f2a8e13a4..0000000000000 --- a/Documentation/devicetree/bindings/input/mtk-pmic-keys.txt +++ /dev/null @@ -1,46 +0,0 @@ -MediaTek MT6397/MT6323 PMIC Keys Device Driver - -There are two key functions provided by MT6397/MT6323 PMIC, pwrkey -and homekey. 
The key functions are defined as the subnode of the function -node provided by MT6397/MT6323 PMIC that is being defined as one kind -of Muti-Function Device (MFD) - -For MT6397/MT6323 MFD bindings see: -Documentation/devicetree/bindings/mfd/mt6397.txt - -Required properties: -- compatible: Should be one of: - - "mediatek,mt6397-keys" - - "mediatek,mt6323-keys" - - "mediatek,mt6358-keys" -- linux,keycodes: See Documentation/devicetree/bindings/input/input.yaml - -Optional Properties: -- wakeup-source: See Documentation/devicetree/bindings/power/wakeup-source.txt -- mediatek,long-press-mode: Long press key shutdown setting, 1 for - pwrkey only, 2 for pwrkey/homekey together, others for disabled. -- power-off-time-sec: See Documentation/devicetree/bindings/input/input.yaml - -Example: - - pmic: mt6397 { - compatible = "mediatek,mt6397"; - - ... - - mt6397keys: mt6397keys { - compatible = "mediatek,mt6397-keys"; - mediatek,long-press-mode = <1>; - power-off-time-sec = <0>; - - power { - linux,keycodes = <116>; - wakeup-source; - }; - - home { - linux,keycodes = <114>; - }; - }; - - }; -- GitLab From 60a884da670119cd5492fe2774a9a3b9d119e045 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Wed, 14 Sep 2022 12:30:21 +0200 Subject: [PATCH 0308/2223] dt-bindings: input: mediatek,pmic-keys: Add compatible for MT6331 keys Add a compatible for the keys found on MT6331 PMIC. 
Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: Rob Herring Reviewed-by: Mattijs Korpershoek Link: https://lore.kernel.org/r/20220914103021.43593-3-angelogioacchino.delregno@collabora.com Signed-off-by: Dmitry Torokhov --- Documentation/devicetree/bindings/input/mediatek,pmic-keys.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/input/mediatek,pmic-keys.yaml b/Documentation/devicetree/bindings/input/mediatek,pmic-keys.yaml index 9d8a0c3aebcaf..2f72ec4184157 100644 --- a/Documentation/devicetree/bindings/input/mediatek,pmic-keys.yaml +++ b/Documentation/devicetree/bindings/input/mediatek,pmic-keys.yaml @@ -25,6 +25,7 @@ properties: compatible: enum: - mediatek,mt6323-keys + - mediatek,mt6331-keys - mediatek,mt6358-keys - mediatek,mt6397-keys -- GitLab From aac00c7fa1149fd5b5a5110096ffa78dcb120b79 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:06 +0200 Subject: [PATCH 0309/2223] clk: test: Switch to clk_hw_get_clk Following the clk_hw->clk pointer is equivalent to calling clk_hw_get_clk(), but will make the job harder if we need to rework that part in the future. 
Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-2-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk_test.c | 74 +++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 19 deletions(-) diff --git a/drivers/clk/clk_test.c b/drivers/clk/clk_test.c index 6731a822f4e38..7646356f30cba 100644 --- a/drivers/clk/clk_test.c +++ b/drivers/clk/clk_test.c @@ -160,12 +160,14 @@ static void clk_test_get_rate(struct kunit *test) { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); unsigned long rate; rate = clk_get_rate(clk); KUNIT_ASSERT_GT(test, rate, 0); KUNIT_EXPECT_EQ(test, rate, ctx->rate); + + clk_put(clk); } /* @@ -179,7 +181,7 @@ static void clk_test_set_get_rate(struct kunit *test) { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); unsigned long rate; KUNIT_ASSERT_EQ(test, @@ -189,6 +191,8 @@ static void clk_test_set_get_rate(struct kunit *test) rate = clk_get_rate(clk); KUNIT_ASSERT_GT(test, rate, 0); KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_RATE_1); + + clk_put(clk); } /* @@ -202,7 +206,7 @@ static void clk_test_set_set_get_rate(struct kunit *test) { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); unsigned long rate; KUNIT_ASSERT_EQ(test, @@ -216,6 +220,8 @@ static void clk_test_set_set_get_rate(struct kunit *test) rate = clk_get_rate(clk); KUNIT_ASSERT_GT(test, rate, 0); KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_RATE_2); + + clk_put(clk); } /* @@ -226,7 +232,7 @@ static void clk_test_round_set_get_rate(struct kunit *test) { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk 
*clk = clk_hw_get_clk(hw, NULL); unsigned long rounded_rate, set_rate; rounded_rate = clk_round_rate(clk, DUMMY_CLOCK_RATE_1); @@ -240,6 +246,8 @@ static void clk_test_round_set_get_rate(struct kunit *test) set_rate = clk_get_rate(clk); KUNIT_ASSERT_GT(test, set_rate, 0); KUNIT_EXPECT_EQ(test, rounded_rate, set_rate); + + clk_put(clk); } static struct kunit_case clk_test_cases[] = { @@ -314,7 +322,7 @@ static void clk_test_orphan_transparent_parent_mux_set_range(struct kunit *test) { struct clk_single_parent_ctx *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); unsigned long rate, new_rate; rate = clk_get_rate(clk); @@ -329,6 +337,8 @@ static void clk_test_orphan_transparent_parent_mux_set_range(struct kunit *test) new_rate = clk_get_rate(clk); KUNIT_ASSERT_GT(test, new_rate, 0); KUNIT_EXPECT_EQ(test, rate, new_rate); + + clk_put(clk); } static struct kunit_case clk_orphan_transparent_single_parent_mux_test_cases[] = { @@ -352,7 +362,7 @@ static void clk_range_test_set_range(struct kunit *test) { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); unsigned long rate; KUNIT_ASSERT_EQ(test, @@ -365,6 +375,8 @@ static void clk_range_test_set_range(struct kunit *test) KUNIT_ASSERT_GT(test, rate, 0); KUNIT_EXPECT_GE(test, rate, DUMMY_CLOCK_RATE_1); KUNIT_EXPECT_LE(test, rate, DUMMY_CLOCK_RATE_2); + + clk_put(clk); } /* @@ -375,13 +387,15 @@ static void clk_range_test_set_range_invalid(struct kunit *test) { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); KUNIT_EXPECT_LT(test, clk_set_rate_range(clk, DUMMY_CLOCK_RATE_1 + 1000, DUMMY_CLOCK_RATE_1), 0); + + clk_put(clk); } /* @@ -420,7 +434,7 @@ static void clk_range_test_set_range_round_rate_lower(struct kunit *test) { struct clk_dummy_context *ctx = 
test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); long rate; KUNIT_ASSERT_EQ(test, @@ -433,6 +447,8 @@ static void clk_range_test_set_range_round_rate_lower(struct kunit *test) KUNIT_ASSERT_GT(test, rate, 0); KUNIT_EXPECT_GE(test, rate, DUMMY_CLOCK_RATE_1); KUNIT_EXPECT_LE(test, rate, DUMMY_CLOCK_RATE_2); + + clk_put(clk); } /* @@ -443,7 +459,7 @@ static void clk_range_test_set_range_set_rate_lower(struct kunit *test) { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); unsigned long rate; KUNIT_ASSERT_EQ(test, @@ -460,6 +476,8 @@ static void clk_range_test_set_range_set_rate_lower(struct kunit *test) KUNIT_ASSERT_GT(test, rate, 0); KUNIT_EXPECT_GE(test, rate, DUMMY_CLOCK_RATE_1); KUNIT_EXPECT_LE(test, rate, DUMMY_CLOCK_RATE_2); + + clk_put(clk); } /* @@ -472,7 +490,7 @@ static void clk_range_test_set_range_set_round_rate_consistent_lower(struct kuni { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); long rounded; KUNIT_ASSERT_EQ(test, @@ -489,6 +507,8 @@ static void clk_range_test_set_range_set_round_rate_consistent_lower(struct kuni 0); KUNIT_EXPECT_EQ(test, rounded, clk_get_rate(clk)); + + clk_put(clk); } /* @@ -499,7 +519,7 @@ static void clk_range_test_set_range_round_rate_higher(struct kunit *test) { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); long rate; KUNIT_ASSERT_EQ(test, @@ -512,6 +532,8 @@ static void clk_range_test_set_range_round_rate_higher(struct kunit *test) KUNIT_ASSERT_GT(test, rate, 0); KUNIT_EXPECT_GE(test, rate, DUMMY_CLOCK_RATE_1); KUNIT_EXPECT_LE(test, rate, DUMMY_CLOCK_RATE_2); + + clk_put(clk); } /* @@ -522,7 +544,7 @@ static void 
clk_range_test_set_range_set_rate_higher(struct kunit *test) { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); unsigned long rate; KUNIT_ASSERT_EQ(test, @@ -539,6 +561,8 @@ static void clk_range_test_set_range_set_rate_higher(struct kunit *test) KUNIT_ASSERT_GT(test, rate, 0); KUNIT_EXPECT_GE(test, rate, DUMMY_CLOCK_RATE_1); KUNIT_EXPECT_LE(test, rate, DUMMY_CLOCK_RATE_2); + + clk_put(clk); } /* @@ -551,7 +575,7 @@ static void clk_range_test_set_range_set_round_rate_consistent_higher(struct kun { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); long rounded; KUNIT_ASSERT_EQ(test, @@ -568,6 +592,8 @@ static void clk_range_test_set_range_set_round_rate_consistent_higher(struct kun 0); KUNIT_EXPECT_EQ(test, rounded, clk_get_rate(clk)); + + clk_put(clk); } /* @@ -582,7 +608,7 @@ static void clk_range_test_set_range_get_rate_raised(struct kunit *test) { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); unsigned long rate; KUNIT_ASSERT_EQ(test, @@ -598,6 +624,8 @@ static void clk_range_test_set_range_get_rate_raised(struct kunit *test) rate = clk_get_rate(clk); KUNIT_ASSERT_GT(test, rate, 0); KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_RATE_1); + + clk_put(clk); } /* @@ -612,7 +640,7 @@ static void clk_range_test_set_range_get_rate_lowered(struct kunit *test) { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); unsigned long rate; KUNIT_ASSERT_EQ(test, @@ -628,6 +656,8 @@ static void clk_range_test_set_range_get_rate_lowered(struct kunit *test) rate = clk_get_rate(clk); KUNIT_ASSERT_GT(test, rate, 0); KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_RATE_2); + + clk_put(clk); } static struct 
kunit_case clk_range_test_cases[] = { @@ -664,7 +694,7 @@ static void clk_range_test_set_range_rate_maximized(struct kunit *test) { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); unsigned long rate; KUNIT_ASSERT_EQ(test, @@ -700,6 +730,8 @@ static void clk_range_test_set_range_rate_maximized(struct kunit *test) rate = clk_get_rate(clk); KUNIT_ASSERT_GT(test, rate, 0); KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_RATE_2); + + clk_put(clk); } /* @@ -714,7 +746,7 @@ static void clk_range_test_multiple_set_range_rate_maximized(struct kunit *test) { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); struct clk *user1, *user2; unsigned long rate; @@ -758,6 +790,7 @@ static void clk_range_test_multiple_set_range_rate_maximized(struct kunit *test) clk_put(user2); clk_put(user1); + clk_put(clk); } static struct kunit_case clk_range_maximize_test_cases[] = { @@ -785,7 +818,7 @@ static void clk_range_test_set_range_rate_minimized(struct kunit *test) { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); unsigned long rate; KUNIT_ASSERT_EQ(test, @@ -821,6 +854,8 @@ static void clk_range_test_set_range_rate_minimized(struct kunit *test) rate = clk_get_rate(clk); KUNIT_ASSERT_GT(test, rate, 0); KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_RATE_1); + + clk_put(clk); } /* @@ -835,7 +870,7 @@ static void clk_range_test_multiple_set_range_rate_minimized(struct kunit *test) { struct clk_dummy_context *ctx = test->priv; struct clk_hw *hw = &ctx->hw; - struct clk *clk = hw->clk; + struct clk *clk = clk_hw_get_clk(hw, NULL); struct clk *user1, *user2; unsigned long rate; @@ -875,6 +910,7 @@ static void clk_range_test_multiple_set_range_rate_minimized(struct kunit *test) clk_put(user2); clk_put(user1); + 
clk_put(clk); } static struct kunit_case clk_range_minimize_test_cases[] = { -- GitLab From d77388223240884b918b8d85f88f132916afbf06 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:07 +0200 Subject: [PATCH 0310/2223] clk: Drop the rate range on clk_put() When clk_put() is called we don't make another clk_set_rate() call to re-evaluate the rate boundaries. This is unlike clk_set_rate_range() that evaluates the rate again each time it is called. However, clk_put() is essentially equivalent to clk_set_rate_range() since after clk_put() completes the consumer's boundaries shouldn't be enforced anymore. Let's add a call to clk_set_rate_range() in clk_put() to make sure those rate boundaries are dropped and the clock provider drivers can react. In order to be as non-intrusive as possible, we'll just make that call if the clock had non-default boundaries. Also add a few tests to make sure this case is covered. Fixes: c80ac50cbb37 ("clk: Always set the rate on clk_set_range_rate") Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-3-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 45 +++++++++++------ drivers/clk/clk_test.c | 110 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+), 14 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 7fc191c155073..a5e0ab8bd6be1 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -2325,19 +2325,15 @@ int clk_set_rate_exclusive(struct clk *clk, unsigned long rate) } EXPORT_SYMBOL_GPL(clk_set_rate_exclusive); -/** - * clk_set_rate_range - set a rate range for a clock source - * @clk: clock source - * @min: desired minimum clock rate in Hz, inclusive - * @max: desired maximum clock rate in Hz, inclusive - * - * Returns success (0) or negative errno. 
- */ -int clk_set_rate_range(struct clk *clk, unsigned long min, unsigned long max) +static int clk_set_rate_range_nolock(struct clk *clk, + unsigned long min, + unsigned long max) { int ret = 0; unsigned long old_min, old_max, rate; + lockdep_assert_held(&prepare_lock); + if (!clk) return 0; @@ -2350,8 +2346,6 @@ int clk_set_rate_range(struct clk *clk, unsigned long min, unsigned long max) return -EINVAL; } - clk_prepare_lock(); - if (clk->exclusive_count) clk_core_rate_unprotect(clk->core); @@ -2395,6 +2389,28 @@ out: if (clk->exclusive_count) clk_core_rate_protect(clk->core); + return ret; +} + +/** + * clk_set_rate_range - set a rate range for a clock source + * @clk: clock source + * @min: desired minimum clock rate in Hz, inclusive + * @max: desired maximum clock rate in Hz, inclusive + * + * Return: 0 for success or negative errno on failure. + */ +int clk_set_rate_range(struct clk *clk, unsigned long min, unsigned long max) +{ + int ret; + + if (!clk) + return 0; + + clk_prepare_lock(); + + ret = clk_set_rate_range_nolock(clk, min, max); + clk_prepare_unlock(); return ret; @@ -4348,9 +4364,10 @@ void __clk_put(struct clk *clk) } hlist_del(&clk->clks_node); - if (clk->min_rate > clk->core->req_rate || - clk->max_rate < clk->core->req_rate) - clk_core_set_rate_nolock(clk->core, clk->core->req_rate); + + /* If we had any boundaries on that clock, let's drop them. 
*/ + if (clk->min_rate > 0 || clk->max_rate < ULONG_MAX) + clk_set_rate_range_nolock(clk, 0, ULONG_MAX); owner = clk->core->owner; kref_put(&clk->core->ref, __clk_release); diff --git a/drivers/clk/clk_test.c b/drivers/clk/clk_test.c index 7646356f30cba..7d9da88c39ee6 100644 --- a/drivers/clk/clk_test.c +++ b/drivers/clk/clk_test.c @@ -793,9 +793,66 @@ static void clk_range_test_multiple_set_range_rate_maximized(struct kunit *test) clk_put(clk); } +/* + * Test that if we have several subsequent calls to + * clk_set_rate_range(), across multiple users, the core will reevaluate + * whether a new rate is needed, including when a user drops its clock. + * + * With clk_dummy_maximize_rate_ops, this means that the rate will + * trail along the maximum as it evolves. + */ +static void clk_range_test_multiple_set_range_rate_put_maximized(struct kunit *test) +{ + struct clk_dummy_context *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *user1, *user2; + unsigned long rate; + + user1 = clk_hw_get_clk(hw, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, user1); + + user2 = clk_hw_get_clk(hw, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, user2); + + KUNIT_ASSERT_EQ(test, + clk_set_rate(clk, DUMMY_CLOCK_RATE_2 + 1000), + 0); + + KUNIT_ASSERT_EQ(test, + clk_set_rate_range(user1, + 0, + DUMMY_CLOCK_RATE_2), + 0); + + rate = clk_get_rate(clk); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_RATE_2); + + KUNIT_ASSERT_EQ(test, + clk_set_rate_range(user2, + 0, + DUMMY_CLOCK_RATE_1), + 0); + + rate = clk_get_rate(clk); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_RATE_1); + + clk_put(user2); + + rate = clk_get_rate(clk); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_RATE_2); + + clk_put(user1); + clk_put(clk); +} + static struct kunit_case clk_range_maximize_test_cases[] = { KUNIT_CASE(clk_range_test_set_range_rate_maximized), 
KUNIT_CASE(clk_range_test_multiple_set_range_rate_maximized), + KUNIT_CASE(clk_range_test_multiple_set_range_rate_put_maximized), {} }; @@ -913,9 +970,62 @@ static void clk_range_test_multiple_set_range_rate_minimized(struct kunit *test) clk_put(clk); } +/* + * Test that if we have several subsequent calls to + * clk_set_rate_range(), across multiple users, the core will reevaluate + * whether a new rate is needed, including when a user drops its clock. + * + * With clk_dummy_minimize_rate_ops, this means that the rate will + * trail along the minimum as it evolves. + */ +static void clk_range_test_multiple_set_range_rate_put_minimized(struct kunit *test) +{ + struct clk_dummy_context *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *user1, *user2; + unsigned long rate; + + user1 = clk_hw_get_clk(hw, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, user1); + + user2 = clk_hw_get_clk(hw, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, user2); + + KUNIT_ASSERT_EQ(test, + clk_set_rate_range(user1, + DUMMY_CLOCK_RATE_1, + ULONG_MAX), + 0); + + rate = clk_get_rate(clk); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_RATE_1); + + KUNIT_ASSERT_EQ(test, + clk_set_rate_range(user2, + DUMMY_CLOCK_RATE_2, + ULONG_MAX), + 0); + + rate = clk_get_rate(clk); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_RATE_2); + + clk_put(user2); + + rate = clk_get_rate(clk); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_RATE_1); + + clk_put(user1); + clk_put(clk); +} + static struct kunit_case clk_range_minimize_test_cases[] = { KUNIT_CASE(clk_range_test_set_range_rate_minimized), KUNIT_CASE(clk_range_test_multiple_set_range_rate_minimized), + KUNIT_CASE(clk_range_test_multiple_set_range_rate_put_minimized), {} }; -- GitLab From facf949b2e6934d381050417bf4e34b20c93be09 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:08 +0200 
Subject: [PATCH 0311/2223] clk: Skip clamping when rounding if there's no boundaries Commit 948fb0969eae ("clk: Always clamp the rounded rate") recently started to clamp the request rate in the clk_rate_request passed as an argument of clk_core_determine_round_nolock() with the min_rate and max_rate fields of that same request. While the clk_rate_requests created by the framework itself always have those fields set, some drivers will create it themselves and don't always fill min_rate and max_rate. In such a case, we end up clamping the rate with a minimum and maximum of 0, thus always rounding the rate to 0. Let's skip the clamping if both min_rate and max_rate are set to 0 and complain so that it gets fixed. Fixes: 948fb0969eae ("clk: Always clamp the rounded rate") Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-4-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index a5e0ab8bd6be1..9d63163244d46 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -1341,7 +1341,19 @@ static int clk_core_determine_round_nolock(struct clk_core *core, if (!core) return 0; - req->rate = clamp(req->rate, req->min_rate, req->max_rate); + /* + * Some clock providers hand-craft their clk_rate_requests and + * might not fill min_rate and max_rate. + * + * If it's the case, clamping the rate is equivalent to setting + * the rate to 0 which is bad. Skip the clamping but complain so + * that it gets fixed, hopefully. 
+ */ + if (!req->min_rate && !req->max_rate) + pr_warn("%s: %s: clk_rate_request has initialized min or max rate.\n", + __func__, core->name); + else + req->rate = clamp(req->rate, req->min_rate, req->max_rate); /* * At this point, core protection will be disabled -- GitLab From f24a0b1c22c2e90abb4ee1f7a3b0f0d8fc2ede5f Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:09 +0200 Subject: [PATCH 0312/2223] clk: Mention that .recalc_rate can return 0 on error Multiple platforms (amlogic, imx8) return 0 when the clock rate cannot be determined properly by the recalc_rate hook. Mention in the documentation that the framework is ok with that. Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-5-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 1615010aa0ecd..9a14cfa0d2011 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -118,8 +118,9 @@ struct clk_duty { * * @recalc_rate Recalculate the rate of this clock, by querying hardware. The * parent rate is an input parameter. It is up to the caller to - * ensure that the prepare_mutex is held across this call. - * Returns the calculated rate. Optional, but recommended - if + * ensure that the prepare_mutex is held across this call. If the + * driver cannot figure out a rate for this clock, it must return + * 0. Returns the calculated rate. Optional, but recommended - if * this op is not set then clock rate will be initialized to 0. 
* * @round_rate: Given a target rate as input, returns the closest rate actually -- GitLab From bde8870cd8c3a3913ddbc19f8422a21828e14d99 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:10 +0200 Subject: [PATCH 0313/2223] clk: Clarify clk_get_rate() expectations As shown by a number of clock users already, clk_get_rate() can be called whether or not the clock is enabled. Similarly, a number of clock drivers will return a rate of 0 whenever the rate cannot be figured out. Since it was a bit ambiguous before, let's make it clear in the clk_get_rate() documentation. Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-6-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 9d63163244d46..caa2eb6404416 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -1672,8 +1672,9 @@ static unsigned long clk_core_get_rate_recalc(struct clk_core *core) * @clk: the clk whose rate is being returned * * Simply returns the cached rate of the clk, unless CLK_GET_RATE_NOCACHE flag - * is set, which means a recalc_rate will be issued. - * If clk is NULL then returns 0. + * is set, which means a recalc_rate will be issued. Can be called regardless of + * the clock enabledness. If clk is NULL, or if an error occurred, then returns + * 0. */ unsigned long clk_get_rate(struct clk *clk) { -- GitLab From 090962b6a90a2bf81142f6d5da9492380d5fba08 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:11 +0200 Subject: [PATCH 0314/2223] clk: tests: Add test suites description We start to have a few test suites, and we'll add more, so it will get pretty confusing to figure out what is supposed to be tested in what suite. Let's add some comments to explain what setup they create, and what we should be testing in every suite. 
Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-7-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk_test.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/clk/clk_test.c b/drivers/clk/clk_test.c index 7d9da88c39ee6..1a7cb482ec58c 100644 --- a/drivers/clk/clk_test.c +++ b/drivers/clk/clk_test.c @@ -258,6 +258,11 @@ static struct kunit_case clk_test_cases[] = { {} }; +/* + * Test suite for a basic rate clock, without any parent. + * + * These tests exercise the rate API with simple scenarios + */ static struct kunit_suite clk_test_suite = { .name = "clk-test", .init = clk_test_init, @@ -346,6 +351,14 @@ static struct kunit_case clk_orphan_transparent_single_parent_mux_test_cases[] = {} }; +/* + * Test suite for a basic mux clock with one parent. The parent is + * registered after its child. The clock will thus be an orphan when + * registered, but will no longer be when the tests run. + * + * These tests make sure a clock that used to be orphan has a sane, + * consistent, behaviour. + */ static struct kunit_suite clk_orphan_transparent_single_parent_test_suite = { .name = "clk-orphan-transparent-single-parent-test", .init = clk_orphan_transparent_single_parent_mux_test_init, @@ -675,6 +688,12 @@ static struct kunit_case clk_range_test_cases[] = { {} }; +/* + * Test suite for a basic rate clock, without any parent. + * + * These tests exercise the rate range API: clk_set_rate_range(), + * clk_set_min_rate(), clk_set_max_rate(), clk_drop_range(). + */ static struct kunit_suite clk_range_test_suite = { .name = "clk-range-test", .init = clk_test_init, @@ -856,6 +875,13 @@ static struct kunit_case clk_range_maximize_test_cases[] = { {} }; +/* + * Test suite for a basic rate clock, without any parent. 
+ * + * These tests exercise the rate range API: clk_set_rate_range(), + * clk_set_min_rate(), clk_set_max_rate(), clk_drop_range(), with a + * driver that will always try to run at the highest possible rate. + */ static struct kunit_suite clk_range_maximize_test_suite = { .name = "clk-range-maximize-test", .init = clk_maximize_test_init, @@ -1029,6 +1055,13 @@ static struct kunit_case clk_range_minimize_test_cases[] = { {} }; +/* + * Test suite for a basic rate clock, without any parent. + * + * These tests exercise the rate range API: clk_set_rate_range(), + * clk_set_min_rate(), clk_set_max_rate(), clk_drop_range(), with a + * driver that will always try to run at the lowest possible rate. + */ static struct kunit_suite clk_range_minimize_test_suite = { .name = "clk-range-minimize-test", .init = clk_minimize_test_init, -- GitLab From 7d79c26b60e623a9a089d771f81c5997bda577cd Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:12 +0200 Subject: [PATCH 0315/2223] clk: tests: Add reference to the orphan mux bug report Some more context might be useful for unit-tests covering a previously reported bug, so let's add a link to the discussion for that bug. Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-8-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk_test.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/clk/clk_test.c b/drivers/clk/clk_test.c index 1a7cb482ec58c..b8e32406a6e4f 100644 --- a/drivers/clk/clk_test.c +++ b/drivers/clk/clk_test.c @@ -322,6 +322,9 @@ static void clk_orphan_transparent_single_parent_mux_test_exit(struct kunit *tes /* * Test that a mux-only clock, with an initial rate within a range, * will still have the same rate after the range has been enforced. 
+ * + * See: + * https://lore.kernel.org/linux-clk/7720158d-10a7-a17b-73a4-a8615c9c6d5c@collabora.com/ */ static void clk_test_orphan_transparent_parent_mux_set_range(struct kunit *test) { -- GitLab From 350575abec48ef5363abd832386cb5a33861cb10 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:13 +0200 Subject: [PATCH 0316/2223] clk: tests: Add tests for uncached clock The clock framework supports clocks that can have their rate changed without the kernel knowing about it using the CLK_GET_RATE_NOCACHE flag. As its name suggests, this flag turns off the rate caching in the clock framework, reading out the rate from the hardware any time we need to read it. Let's add a couple of tests to make sure it works as intended. Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-9-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk_test.c | 93 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/drivers/clk/clk_test.c b/drivers/clk/clk_test.c index b8e32406a6e4f..b269420dafcc0 100644 --- a/drivers/clk/clk_test.c +++ b/drivers/clk/clk_test.c @@ -270,6 +270,96 @@ static struct kunit_suite clk_test_suite = { .test_cases = clk_test_cases, }; +static int clk_uncached_test_init(struct kunit *test) +{ + struct clk_dummy_context *ctx; + int ret; + + ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + test->priv = ctx; + + ctx->rate = DUMMY_CLOCK_INIT_RATE; + ctx->hw.init = CLK_HW_INIT_NO_PARENT("test-clk", + &clk_dummy_rate_ops, + CLK_GET_RATE_NOCACHE); + + ret = clk_hw_register(NULL, &ctx->hw); + if (ret) + return ret; + + return 0; +} + +/* + * Test that for an uncached clock, the clock framework doesn't cache + * the rate and clk_get_rate() will return the underlying clock 
rate + * even if it changed. + */ +static void clk_test_uncached_get_rate(struct kunit *test) +{ + struct clk_dummy_context *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + unsigned long rate; + + rate = clk_get_rate(clk); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_INIT_RATE); + + /* We change the rate behind the clock framework's back */ + ctx->rate = DUMMY_CLOCK_RATE_1; + rate = clk_get_rate(clk); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_RATE_1); + + clk_put(clk); +} + +/* + * Test that for an uncached clock, clk_set_rate_range() will work + * properly if the rate hasn't changed. + */ +static void clk_test_uncached_set_range(struct kunit *test) +{ + struct clk_dummy_context *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + unsigned long rate; + + KUNIT_ASSERT_EQ(test, + clk_set_rate_range(clk, + DUMMY_CLOCK_RATE_1, + DUMMY_CLOCK_RATE_2), + 0); + + rate = clk_get_rate(clk); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_GE(test, rate, DUMMY_CLOCK_RATE_1); + KUNIT_EXPECT_LE(test, rate, DUMMY_CLOCK_RATE_2); + + clk_put(clk); +} + +static struct kunit_case clk_uncached_test_cases[] = { + KUNIT_CASE(clk_test_uncached_get_rate), + KUNIT_CASE(clk_test_uncached_set_range), + {} +}; + +/* + * Test suite for a basic, uncached, rate clock, without any parent. 
+ * + * These tests exercise the rate API with simple scenarios + */ +static struct kunit_suite clk_uncached_test_suite = { + .name = "clk-uncached-test", + .init = clk_uncached_test_init, + .exit = clk_test_exit, + .test_cases = clk_uncached_test_cases, +}; + struct clk_single_parent_ctx { struct clk_dummy_context parent_ctx; struct clk_hw hw; @@ -1077,6 +1167,7 @@ kunit_test_suites( &clk_orphan_transparent_single_parent_test_suite, &clk_range_test_suite, &clk_range_maximize_test_suite, - &clk_range_minimize_test_suite + &clk_range_minimize_test_suite, + &clk_uncached_test_suite ); MODULE_LICENSE("GPL v2"); -- GitLab From 02cdeace1e1ed210eb9fc2ce562d93580b1dcfe5 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:14 +0200 Subject: [PATCH 0317/2223] clk: tests: Add tests for single parent mux We have a few tests for a mux with a single parent, testing the case where it used to be orphan. Let's leverage most of the code but register the clock properly to test a few trivial things. 
Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-10-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk_test.c | 194 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 185 insertions(+), 9 deletions(-) diff --git a/drivers/clk/clk_test.c b/drivers/clk/clk_test.c index b269420dafcc0..06c9220873bb6 100644 --- a/drivers/clk/clk_test.c +++ b/drivers/clk/clk_test.c @@ -365,6 +365,189 @@ struct clk_single_parent_ctx { struct clk_hw hw; }; +static int clk_single_parent_mux_test_init(struct kunit *test) +{ + struct clk_single_parent_ctx *ctx; + int ret; + + ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + test->priv = ctx; + + ctx->parent_ctx.rate = DUMMY_CLOCK_INIT_RATE; + ctx->parent_ctx.hw.init = + CLK_HW_INIT_NO_PARENT("parent-clk", + &clk_dummy_rate_ops, + 0); + + ret = clk_hw_register(NULL, &ctx->parent_ctx.hw); + if (ret) + return ret; + + ctx->hw.init = CLK_HW_INIT("test-clk", "parent-clk", + &clk_dummy_single_parent_ops, + CLK_SET_RATE_PARENT); + + ret = clk_hw_register(NULL, &ctx->hw); + if (ret) + return ret; + + return 0; +} + +static void +clk_single_parent_mux_test_exit(struct kunit *test) +{ + struct clk_single_parent_ctx *ctx = test->priv; + + clk_hw_unregister(&ctx->hw); + clk_hw_unregister(&ctx->parent_ctx.hw); +} + +/* + * Test that for a clock with a single parent, clk_get_parent() actually + * returns the parent. 
+ */ +static void +clk_test_single_parent_mux_get_parent(struct kunit *test) +{ + struct clk_single_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *parent = clk_hw_get_clk(&ctx->parent_ctx.hw, NULL); + + KUNIT_EXPECT_TRUE(test, clk_is_match(clk_get_parent(clk), parent)); + + clk_put(parent); + clk_put(clk); +} + +/* + * Test that for a clock that can't modify its rate and with a single + * parent, if we set disjoint ranges on the parent and then the child, + * the second will return an error. + * + * FIXME: clk_set_rate_range() only considers the current clock when + * evaluating whether ranges are disjoint and not the upstream clocks + * ranges. + */ +static void +clk_test_single_parent_mux_set_range_disjoint_child_last(struct kunit *test) +{ + struct clk_single_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *parent; + int ret; + + kunit_skip(test, "This needs to be fixed in the core."); + + parent = clk_get_parent(clk); + KUNIT_ASSERT_PTR_NE(test, parent, NULL); + + ret = clk_set_rate_range(parent, 1000, 2000); + KUNIT_ASSERT_EQ(test, ret, 0); + + ret = clk_set_rate_range(clk, 3000, 4000); + KUNIT_EXPECT_LT(test, ret, 0); + + clk_put(clk); +} + +/* + * Test that for a clock that can't modify its rate and with a single + * parent, if we set disjoint ranges on the child and then the parent, + * the second will return an error. + * + * FIXME: clk_set_rate_range() only considers the current clock when + * evaluating whether ranges are disjoint and not the downstream clocks + * ranges.
+ */ +static void +clk_test_single_parent_mux_set_range_disjoint_parent_last(struct kunit *test) +{ + struct clk_single_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *parent; + int ret; + + kunit_skip(test, "This needs to be fixed in the core."); + + parent = clk_get_parent(clk); + KUNIT_ASSERT_PTR_NE(test, parent, NULL); + + ret = clk_set_rate_range(clk, 1000, 2000); + KUNIT_ASSERT_EQ(test, ret, 0); + + ret = clk_set_rate_range(parent, 3000, 4000); + KUNIT_EXPECT_LT(test, ret, 0); + + clk_put(clk); +} + +/* + * Test that for a clock that can't modify its rate and with a single + * parent, if we set a range on the parent and a more restrictive one on + * the child, and then call clk_round_rate(), the boundaries of the + * two clocks are taken into account. + */ +static void +clk_test_single_parent_mux_set_range_round_rate_child_smaller(struct kunit *test) +{ + struct clk_single_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *parent; + unsigned long rate; + int ret; + + parent = clk_get_parent(clk); + KUNIT_ASSERT_PTR_NE(test, parent, NULL); + + ret = clk_set_rate_range(parent, DUMMY_CLOCK_RATE_1, DUMMY_CLOCK_RATE_2); + KUNIT_ASSERT_EQ(test, ret, 0); + + ret = clk_set_rate_range(clk, DUMMY_CLOCK_RATE_1 + 1000, DUMMY_CLOCK_RATE_2 - 1000); + KUNIT_ASSERT_EQ(test, ret, 0); + + rate = clk_round_rate(clk, DUMMY_CLOCK_RATE_1 - 1000); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_GE(test, rate, DUMMY_CLOCK_RATE_1 + 1000); + KUNIT_EXPECT_LE(test, rate, DUMMY_CLOCK_RATE_2 - 1000); + + rate = clk_round_rate(clk, DUMMY_CLOCK_RATE_2 + 1000); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_GE(test, rate, DUMMY_CLOCK_RATE_1 + 1000); + KUNIT_EXPECT_LE(test, rate, DUMMY_CLOCK_RATE_2 - 1000); + + clk_put(clk); +} + +static struct kunit_case clk_single_parent_mux_test_cases[] = { + KUNIT_CASE(clk_test_single_parent_mux_get_parent), + 
KUNIT_CASE(clk_test_single_parent_mux_set_range_disjoint_child_last), + KUNIT_CASE(clk_test_single_parent_mux_set_range_disjoint_parent_last), + KUNIT_CASE(clk_test_single_parent_mux_set_range_round_rate_child_smaller), + {} +}; + +/* + * Test suite for a basic mux clock with one parent, with + * CLK_SET_RATE_PARENT on the child. + * + * These tests exercise the consumer API and check that the state of the + * child and parent are sane and consistent. + */ +static struct kunit_suite +clk_single_parent_mux_test_suite = { + .name = "clk-single-parent-mux-test", + .init = clk_single_parent_mux_test_init, + .exit = clk_single_parent_mux_test_exit, + .test_cases = clk_single_parent_mux_test_cases, +}; + static int clk_orphan_transparent_single_parent_mux_test_init(struct kunit *test) { struct clk_single_parent_ctx *ctx; @@ -401,14 +584,6 @@ static int clk_orphan_transparent_single_parent_mux_test_init(struct kunit *test return 0; } -static void clk_orphan_transparent_single_parent_mux_test_exit(struct kunit *test) -{ - struct clk_single_parent_ctx *ctx = test->priv; - - clk_hw_unregister(&ctx->hw); - clk_hw_unregister(&ctx->parent_ctx.hw); -} - /* * Test that a mux-only clock, with an initial rate within a range, * will still have the same rate after the range has been enforced. 
@@ -455,7 +630,7 @@ static struct kunit_case clk_orphan_transparent_single_parent_mux_test_cases[] = static struct kunit_suite clk_orphan_transparent_single_parent_test_suite = { .name = "clk-orphan-transparent-single-parent-test", .init = clk_orphan_transparent_single_parent_mux_test_init, - .exit = clk_orphan_transparent_single_parent_mux_test_exit, + .exit = clk_single_parent_mux_test_exit, .test_cases = clk_orphan_transparent_single_parent_mux_test_cases, }; @@ -1168,6 +1343,7 @@ kunit_test_suites( &clk_range_test_suite, &clk_range_maximize_test_suite, &clk_range_minimize_test_suite, + &clk_single_parent_mux_test_suite, &clk_uncached_test_suite ); MODULE_LICENSE("GPL v2"); -- GitLab From 74933ef22c1c3d3d1456c2f949f1910ce2aab1f1 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:15 +0200 Subject: [PATCH 0318/2223] clk: tests: Add tests for mux with multiple parents We'll need to test a few corner cases that occur when we have a mux clock whose default parent is missing. For now, let's create the context structure and the trivial ops, along with a test suite that just tests trivial things for now, without considering the orphan case. 
Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-11-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk_test.c | 121 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/drivers/clk/clk_test.c b/drivers/clk/clk_test.c index 06c9220873bb6..1ccafd4fabffa 100644 --- a/drivers/clk/clk_test.c +++ b/drivers/clk/clk_test.c @@ -108,6 +108,39 @@ static const struct clk_ops clk_dummy_single_parent_ops = { .get_parent = clk_dummy_single_get_parent, }; +struct clk_multiple_parent_ctx { + struct clk_dummy_context parents_ctx[2]; + struct clk_hw hw; + u8 current_parent; +}; + +static int clk_multiple_parents_mux_set_parent(struct clk_hw *hw, u8 index) +{ + struct clk_multiple_parent_ctx *ctx = + container_of(hw, struct clk_multiple_parent_ctx, hw); + + if (index >= clk_hw_get_num_parents(hw)) + return -EINVAL; + + ctx->current_parent = index; + + return 0; +} + +static u8 clk_multiple_parents_mux_get_parent(struct clk_hw *hw) +{ + struct clk_multiple_parent_ctx *ctx = + container_of(hw, struct clk_multiple_parent_ctx, hw); + + return ctx->current_parent; +} + +static const struct clk_ops clk_multiple_parents_mux_ops = { + .get_parent = clk_multiple_parents_mux_get_parent, + .set_parent = clk_multiple_parents_mux_set_parent, + .determine_rate = __clk_mux_determine_rate_closest, +}; + static int clk_test_init_with_ops(struct kunit *test, const struct clk_ops *ops) { struct clk_dummy_context *ctx; @@ -360,6 +393,93 @@ static struct kunit_suite clk_uncached_test_suite = { .test_cases = clk_uncached_test_cases, }; +static int +clk_multiple_parents_mux_test_init(struct kunit *test) +{ + struct clk_multiple_parent_ctx *ctx; + const char *parents[2] = { "parent-0", "parent-1"}; + int ret; + + ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL); + 
if (!ctx) + return -ENOMEM; + test->priv = ctx; + + ctx->parents_ctx[0].hw.init = CLK_HW_INIT_NO_PARENT("parent-0", + &clk_dummy_rate_ops, + 0); + ctx->parents_ctx[0].rate = DUMMY_CLOCK_RATE_1; + ret = clk_hw_register(NULL, &ctx->parents_ctx[0].hw); + if (ret) + return ret; + + ctx->parents_ctx[1].hw.init = CLK_HW_INIT_NO_PARENT("parent-1", + &clk_dummy_rate_ops, + 0); + ctx->parents_ctx[1].rate = DUMMY_CLOCK_RATE_2; + ret = clk_hw_register(NULL, &ctx->parents_ctx[1].hw); + if (ret) + return ret; + + ctx->current_parent = 0; + ctx->hw.init = CLK_HW_INIT_PARENTS("test-mux", parents, + &clk_multiple_parents_mux_ops, + CLK_SET_RATE_PARENT); + ret = clk_hw_register(NULL, &ctx->hw); + if (ret) + return ret; + + return 0; +} + +static void +clk_multiple_parents_mux_test_exit(struct kunit *test) +{ + struct clk_multiple_parent_ctx *ctx = test->priv; + + clk_hw_unregister(&ctx->hw); + clk_hw_unregister(&ctx->parents_ctx[0].hw); + clk_hw_unregister(&ctx->parents_ctx[1].hw); +} + +/* + * Test that for a clock with multiple parents, clk_get_parent() + * actually returns the current one. + */ +static void +clk_test_multiple_parents_mux_get_parent(struct kunit *test) +{ + struct clk_multiple_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *parent = clk_hw_get_clk(&ctx->parents_ctx[0].hw, NULL); + + KUNIT_EXPECT_TRUE(test, clk_is_match(clk_get_parent(clk), parent)); + + clk_put(parent); + clk_put(clk); +} + +static struct kunit_case clk_multiple_parents_mux_test_cases[] = { + KUNIT_CASE(clk_test_multiple_parents_mux_get_parent), + {} +}; + +/* + * Test suite for a basic mux clock with two parents, with + * CLK_SET_RATE_PARENT on the child. + * + * These tests exercise the consumer API and check that the state of the + * child and parents are sane and consistent. 
+ */ +static struct kunit_suite +clk_multiple_parents_mux_test_suite = { + .name = "clk-multiple-parents-mux-test", + .init = clk_multiple_parents_mux_test_init, + .exit = clk_multiple_parents_mux_test_exit, + .test_cases = clk_multiple_parents_mux_test_cases, +}; + struct clk_single_parent_ctx { struct clk_dummy_context parent_ctx; struct clk_hw hw; @@ -1339,6 +1459,7 @@ static struct kunit_suite clk_range_minimize_test_suite = { kunit_test_suites( &clk_test_suite, + &clk_multiple_parents_mux_test_suite, &clk_orphan_transparent_single_parent_test_suite, &clk_range_test_suite, &clk_range_maximize_test_suite, -- GitLab From 2e9cad1abc7149c5e6aeee7e76a6c363d392da8b Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:16 +0200 Subject: [PATCH 0319/2223] clk: tests: Add some tests for orphan with multiple parents Let's leverage the dummy mux with multiple parents we have to create a mux whose default parent will never be registered, and thus will always be orphan by default. We can then create some tests to make sure that the clock API behaves properly in such a case, and that the transition to a non-orphan clock when we change the parent is done properly. 
Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-12-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk_test.c | 237 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 237 insertions(+) diff --git a/drivers/clk/clk_test.c b/drivers/clk/clk_test.c index 1ccafd4fabffa..ceed49c5a88bf 100644 --- a/drivers/clk/clk_test.c +++ b/drivers/clk/clk_test.c @@ -480,6 +480,242 @@ clk_multiple_parents_mux_test_suite = { .test_cases = clk_multiple_parents_mux_test_cases, }; +static int +clk_orphan_transparent_multiple_parent_mux_test_init(struct kunit *test) +{ + struct clk_multiple_parent_ctx *ctx; + const char *parents[2] = { "missing-parent", "proper-parent"}; + int ret; + + ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + test->priv = ctx; + + ctx->parents_ctx[1].hw.init = CLK_HW_INIT_NO_PARENT("proper-parent", + &clk_dummy_rate_ops, + 0); + ctx->parents_ctx[1].rate = DUMMY_CLOCK_INIT_RATE; + ret = clk_hw_register(NULL, &ctx->parents_ctx[1].hw); + if (ret) + return ret; + + ctx->hw.init = CLK_HW_INIT_PARENTS("test-orphan-mux", parents, + &clk_multiple_parents_mux_ops, + CLK_SET_RATE_PARENT); + ret = clk_hw_register(NULL, &ctx->hw); + if (ret) + return ret; + + return 0; +} + +static void +clk_orphan_transparent_multiple_parent_mux_test_exit(struct kunit *test) +{ + struct clk_multiple_parent_ctx *ctx = test->priv; + + clk_hw_unregister(&ctx->hw); + clk_hw_unregister(&ctx->parents_ctx[1].hw); +} + +/* + * Test that, for a mux whose current parent hasn't been registered yet and is + * thus orphan, clk_get_parent() will return NULL. 
+ */ +static void +clk_test_orphan_transparent_multiple_parent_mux_get_parent(struct kunit *test) +{ + struct clk_multiple_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + + KUNIT_EXPECT_PTR_EQ(test, clk_get_parent(clk), NULL); + + clk_put(clk); +} + +/* + * Test that, for a mux whose current parent hasn't been registered yet, + * calling clk_set_parent() to a valid parent will properly update the + * mux parent and its orphan status. + */ +static void +clk_test_orphan_transparent_multiple_parent_mux_set_parent(struct kunit *test) +{ + struct clk_multiple_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *parent, *new_parent; + int ret; + + parent = clk_hw_get_clk(&ctx->parents_ctx[1].hw, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent); + + ret = clk_set_parent(clk, parent); + KUNIT_ASSERT_EQ(test, ret, 0); + + new_parent = clk_get_parent(clk); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, new_parent); + KUNIT_EXPECT_TRUE(test, clk_is_match(parent, new_parent)); + + clk_put(parent); + clk_put(clk); +} + +/* + * Test that, for a mux that started orphan but got switched to a valid + * parent, the rate of the mux and its new parent are consistent.
+ */ +static void +clk_test_orphan_transparent_multiple_parent_mux_set_parent_get_rate(struct kunit *test) +{ + struct clk_multiple_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *parent; + unsigned long parent_rate, rate; + int ret; + + parent = clk_hw_get_clk(&ctx->parents_ctx[1].hw, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent); + + parent_rate = clk_get_rate(parent); + KUNIT_ASSERT_GT(test, parent_rate, 0); + + ret = clk_set_parent(clk, parent); + KUNIT_ASSERT_EQ(test, ret, 0); + + rate = clk_get_rate(clk); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_EQ(test, parent_rate, rate); + + clk_put(parent); + clk_put(clk); +} + +/* + * Test that, for a mux that started orphan but got switched to a valid + * parent, calling clk_set_rate_range() will affect the parent state if + * its rate is out of range. + */ +static void +clk_test_orphan_transparent_multiple_parent_mux_set_parent_set_range_modified(struct kunit *test) +{ + struct clk_multiple_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *parent; + unsigned long rate; + int ret; + + parent = clk_hw_get_clk(&ctx->parents_ctx[1].hw, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent); + + ret = clk_set_parent(clk, parent); + KUNIT_ASSERT_EQ(test, ret, 0); + + ret = clk_set_rate_range(clk, DUMMY_CLOCK_RATE_1, DUMMY_CLOCK_RATE_2); + KUNIT_ASSERT_EQ(test, ret, 0); + + rate = clk_get_rate(clk); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_GE(test, rate, DUMMY_CLOCK_RATE_1); + KUNIT_EXPECT_LE(test, rate, DUMMY_CLOCK_RATE_2); + + clk_put(parent); + clk_put(clk); +} + +/* + * Test that, for a mux whose current parent hasn't been registered yet, + * calling clk_set_rate_range() will succeed, and will be taken into + * account when rounding a rate. 
+ */ +static void +clk_test_orphan_transparent_multiple_parent_mux_set_range_round_rate(struct kunit *test) +{ + struct clk_multiple_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + unsigned long rate; + int ret; + + ret = clk_set_rate_range(clk, DUMMY_CLOCK_RATE_1, DUMMY_CLOCK_RATE_2); + KUNIT_ASSERT_EQ(test, ret, 0); + + rate = clk_round_rate(clk, DUMMY_CLOCK_RATE_1 - 1000); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_GE(test, rate, DUMMY_CLOCK_RATE_1); + KUNIT_EXPECT_LE(test, rate, DUMMY_CLOCK_RATE_2); + + clk_put(clk); +} + +/* + * Test that, for a mux that started orphan, was assigned a rate and + * then got switched to a valid parent, its rate is eventually within + * range. + * + * FIXME: Even though we update the rate as part of clk_set_parent(), we + * don't evaluate whether that new rate is within range and needs to be + * adjusted. + */ +static void +clk_test_orphan_transparent_multiple_parent_mux_set_range_set_parent_get_rate(struct kunit *test) +{ + struct clk_multiple_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *parent; + unsigned long rate; + int ret; + + kunit_skip(test, "This needs to be fixed in the core."); + + clk_hw_set_rate_range(hw, DUMMY_CLOCK_RATE_1, DUMMY_CLOCK_RATE_2); + + parent = clk_hw_get_clk(&ctx->parents_ctx[1].hw, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent); + + ret = clk_set_parent(clk, parent); + KUNIT_ASSERT_EQ(test, ret, 0); + + rate = clk_get_rate(clk); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_GE(test, rate, DUMMY_CLOCK_RATE_1); + KUNIT_EXPECT_LE(test, rate, DUMMY_CLOCK_RATE_2); + + clk_put(parent); + clk_put(clk); +} + +static struct kunit_case clk_orphan_transparent_multiple_parent_mux_test_cases[] = { + KUNIT_CASE(clk_test_orphan_transparent_multiple_parent_mux_get_parent), + KUNIT_CASE(clk_test_orphan_transparent_multiple_parent_mux_set_parent), +
KUNIT_CASE(clk_test_orphan_transparent_multiple_parent_mux_set_parent_get_rate), + KUNIT_CASE(clk_test_orphan_transparent_multiple_parent_mux_set_parent_set_range_modified), + KUNIT_CASE(clk_test_orphan_transparent_multiple_parent_mux_set_range_round_rate), + KUNIT_CASE(clk_test_orphan_transparent_multiple_parent_mux_set_range_set_parent_get_rate), + {} +}; + +/* + * Test suite for a basic mux clock with two parents. The default parent + * isn't registered, only the second parent is. By default, the clock + * will thus be orphan. + * + * These tests exercise the behaviour of the consumer API when dealing + * with an orphan clock, and how we deal with the transition to a valid + * parent. + */ +static struct kunit_suite clk_orphan_transparent_multiple_parent_mux_test_suite = { + .name = "clk-orphan-transparent-multiple-parent-mux-test", + .init = clk_orphan_transparent_multiple_parent_mux_test_init, + .exit = clk_orphan_transparent_multiple_parent_mux_test_exit, + .test_cases = clk_orphan_transparent_multiple_parent_mux_test_cases, +}; + struct clk_single_parent_ctx { struct clk_dummy_context parent_ctx; struct clk_hw hw; @@ -1460,6 +1696,7 @@ static struct kunit_suite clk_range_minimize_test_suite = { kunit_test_suites( &clk_test_suite, &clk_multiple_parents_mux_test_suite, + &clk_orphan_transparent_multiple_parent_mux_test_suite, &clk_orphan_transparent_single_parent_test_suite, &clk_range_test_suite, &clk_range_maximize_test_suite, -- GitLab From 3afb07231d603d51dca6a5d5e16d9d8f422f9b5f Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:17 +0200 Subject: [PATCH 0320/2223] clk: Take into account uncached clocks in clk_set_rate_range() clk_set_rate_range() will use the last requested rate for the clock when it calls into the driver set_rate hook. However, if CLK_GET_RATE_NOCACHE is set on that clock, the last requested rate might not be matching the current rate of the clock. 
In such a case, let's read out the rate from the hardware and use that in our set_rate instead. Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-13-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 6 +++++- drivers/clk/clk_test.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index caa2eb6404416..53b28e63deae3 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -2373,6 +2373,10 @@ static int clk_set_rate_range_nolock(struct clk *clk, goto out; } + rate = clk->core->req_rate; + if (clk->core->flags & CLK_GET_RATE_NOCACHE) + rate = clk_core_get_rate_recalc(clk->core); + /* * Since the boundaries have been changed, let's give the * opportunity to the provider to adjust the clock rate based on @@ -2390,7 +2394,7 @@ static int clk_set_rate_range_nolock(struct clk *clk, * - the determine_rate() callback does not really check for * this corner case when determining the rate */ - rate = clamp(clk->core->req_rate, min, max); + rate = clamp(rate, min, max); ret = clk_core_set_rate_nolock(clk->core, rate); if (ret) { /* rollback the changes */ diff --git a/drivers/clk/clk_test.c b/drivers/clk/clk_test.c index ceed49c5a88bf..d3e121f21ae21 100644 --- a/drivers/clk/clk_test.c +++ b/drivers/clk/clk_test.c @@ -375,9 +375,40 @@ static void clk_test_uncached_set_range(struct kunit *test) clk_put(clk); } +/* + * Test that for an uncached clock, clk_set_rate_range() will work + * properly if the rate has changed in hardware. + * + * In this case, it means that if the rate wasn't initially in the range + * we're trying to set, but got changed at some point into the range + * without the kernel knowing about it, its rate shouldn't be affected. 
+ */ +static void clk_test_uncached_updated_rate_set_range(struct kunit *test) +{ + struct clk_dummy_context *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + unsigned long rate; + + /* We change the rate behind the clock framework's back */ + ctx->rate = DUMMY_CLOCK_RATE_1 + 1000; + KUNIT_ASSERT_EQ(test, + clk_set_rate_range(clk, + DUMMY_CLOCK_RATE_1, + DUMMY_CLOCK_RATE_2), + 0); + + rate = clk_get_rate(clk); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_RATE_1 + 1000); + + clk_put(clk); +} + static struct kunit_case clk_uncached_test_cases[] = { KUNIT_CASE(clk_test_uncached_get_rate), KUNIT_CASE(clk_test_uncached_set_range), + KUNIT_CASE(clk_test_uncached_updated_rate_set_range), {} }; -- GitLab From cb1b1dd96241f37ea41d241946d5153c48141cd5 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:18 +0200 Subject: [PATCH 0321/2223] clk: Set req_rate on reparenting If a non-rate clock started by default with a parent that never registered, core->req_rate will be 0. The expectation is that whenever the parent will be registered, req_rate will be updated with the new value that has just been computed. However, if that clock is a mux, clk_set_parent() can also make that clock no longer orphan. In this case however, we never update req_rate. The natural solution to this would be to update core->rate and core->req_rate in clk_reparent() by calling clk_recalc(). However, this doesn't work in all cases. Indeed, clk_recalc() is called by __clk_set_parent_before(), __clk_set_parent() and clk_core_reparent(). Both __clk_set_parent_before() and __clk_set_parent will call clk_recalc() with the enable_lock taken through a call to clk_enable_lock(), the underlying locking primitive being a spinlock. clk_recalc() calls the backing driver .recalc_rate hook, and that implementation might sleep if the underlying device uses a bus with accesses that might sleep, such as i2c. 
In such a situation, we would end up sleeping while holding a spinlock, and thus in an atomic section. In order to work around this, we can move the core->rate and core->req_rate update to the clk_recalc() calling sites, after the enable_lock has been released if it was taken. The only situation that could still be problematic is the clk_core_reparent() -> clk_reparent() case that doesn't have any locking. clk_core_reparent() is itself called by clk_hw_reparent(), which is then called by 4 drivers: * clk-stm32mp1.c, stm32/clk-stm32-core.c and tegra/clk-tegra210-emc.c use it in their set_parent implementation. The set_parent hook is only called by __clk_set_parent() and clk_change_rate(), both of them calling it without the enable_lock taken. * clk/tegra/clk-tegra124-emc.c calls it as part of its set_rate implementation. set_rate is only called by clk_change_rate(), again without the enable_lock taken. In both cases we can't end up in a situation where the clk_hw_reparent() caller would hold a spinlock, so it seems like this is a good workaround. Let's also add some unit tests to make sure we cover the original bug. Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-14-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 22 ++++ drivers/clk/clk_test.c | 239 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 261 insertions(+) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 53b28e63deae3..91bb1ea0e147b 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -1765,6 +1765,23 @@ static void clk_core_update_orphan_status(struct clk_core *core, bool is_orphan) clk_core_update_orphan_status(child, is_orphan); } +/* + * Update the orphan rate and req_rate of @core and all its children. 
+ */ +static void clk_core_update_orphan_child_rates(struct clk_core *core) +{ + struct clk_core *child; + unsigned long parent_rate = 0; + + if (core->parent) + parent_rate = core->parent->rate; + + core->rate = core->req_rate = clk_recalc(core, parent_rate); + + hlist_for_each_entry(child, &core->children, child_node) + clk_core_update_orphan_child_rates(child); +} + static void clk_reparent(struct clk_core *core, struct clk_core *new_parent) { bool was_orphan = core->orphan; @@ -1834,6 +1851,8 @@ static struct clk_core *__clk_set_parent_before(struct clk_core *core, clk_reparent(core, parent); clk_enable_unlock(flags); + clk_core_update_orphan_child_rates(core); + return old_parent; } @@ -1878,6 +1897,8 @@ static int __clk_set_parent(struct clk_core *core, struct clk_core *parent, flags = clk_enable_lock(); clk_reparent(core, old_parent); clk_enable_unlock(flags); + + clk_core_update_orphan_child_rates(core); __clk_set_parent_after(core, old_parent, parent); return ret; @@ -2506,6 +2527,7 @@ static void clk_core_reparent(struct clk_core *core, struct clk_core *new_parent) { clk_reparent(core, new_parent); + clk_core_update_orphan_child_rates(core); __clk_recalc_accuracies(core); __clk_recalc_rates(core, POST_RATE_CHANGE); } diff --git a/drivers/clk/clk_test.c b/drivers/clk/clk_test.c index d3e121f21ae21..d1b1372f7aaa9 100644 --- a/drivers/clk/clk_test.c +++ b/drivers/clk/clk_test.c @@ -594,6 +594,41 @@ clk_test_orphan_transparent_multiple_parent_mux_set_parent(struct kunit *test) clk_put(clk); } +/* + * Test that, for a mux that started orphan but got switched to a valid + * parent, calling clk_drop_range() on the mux won't affect the parent + * rate. 
+ */ +static void +clk_test_orphan_transparent_multiple_parent_mux_set_parent_drop_range(struct kunit *test) +{ + struct clk_multiple_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *parent; + unsigned long parent_rate, new_parent_rate; + int ret; + + parent = clk_hw_get_clk(&ctx->parents_ctx[1].hw, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent); + + parent_rate = clk_get_rate(parent); + KUNIT_ASSERT_GT(test, parent_rate, 0); + + ret = clk_set_parent(clk, parent); + KUNIT_ASSERT_EQ(test, ret, 0); + + ret = clk_drop_range(clk); + KUNIT_ASSERT_EQ(test, ret, 0); + + new_parent_rate = clk_get_rate(parent); + KUNIT_ASSERT_GT(test, new_parent_rate, 0); + KUNIT_EXPECT_EQ(test, parent_rate, new_parent_rate); + + clk_put(parent); + clk_put(clk); +} + /* * Test that, for a mux that started orphan but got switched to a valid * parent, the rate of the mux and its new parent are consistent. @@ -625,6 +660,39 @@ clk_test_orphan_transparent_multiple_parent_mux_set_parent_get_rate(struct kunit clk_put(clk); } +/* + * Test that, for a mux that started orphan but got switched to a valid + * parent, calling clk_put() on the mux won't affect the parent rate.
+ */ +static void +clk_test_orphan_transparent_multiple_parent_mux_set_parent_put(struct kunit *test) +{ + struct clk_multiple_parent_ctx *ctx = test->priv; + struct clk *clk, *parent; + unsigned long parent_rate, new_parent_rate; + int ret; + + parent = clk_hw_get_clk(&ctx->parents_ctx[1].hw, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent); + + clk = clk_hw_get_clk(&ctx->hw, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, clk); + + parent_rate = clk_get_rate(parent); + KUNIT_ASSERT_GT(test, parent_rate, 0); + + ret = clk_set_parent(clk, parent); + KUNIT_ASSERT_EQ(test, ret, 0); + + clk_put(clk); + + new_parent_rate = clk_get_rate(parent); + KUNIT_ASSERT_GT(test, new_parent_rate, 0); + KUNIT_EXPECT_EQ(test, parent_rate, new_parent_rate); + + clk_put(parent); +} + /* * Test that, for a mux that started orphan but got switched to a valid * parent, calling clk_set_rate_range() will affect the parent state if @@ -658,6 +726,43 @@ clk_test_orphan_transparent_multiple_parent_mux_set_parent_set_range_modified(st clk_put(clk); } +/* + * Test that, for a mux that started orphan but got switched to a valid + * parent, calling clk_set_rate_range() won't affect the parent state if + * its rate is within range. 
+ */ +static void +clk_test_orphan_transparent_multiple_parent_mux_set_parent_set_range_untouched(struct kunit *test) +{ + struct clk_multiple_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *parent; + unsigned long parent_rate, new_parent_rate; + int ret; + + parent = clk_hw_get_clk(&ctx->parents_ctx[1].hw, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent); + + parent_rate = clk_get_rate(parent); + KUNIT_ASSERT_GT(test, parent_rate, 0); + + ret = clk_set_parent(clk, parent); + KUNIT_ASSERT_EQ(test, ret, 0); + + ret = clk_set_rate_range(clk, + DUMMY_CLOCK_INIT_RATE - 1000, + DUMMY_CLOCK_INIT_RATE + 1000); + KUNIT_ASSERT_EQ(test, ret, 0); + + new_parent_rate = clk_get_rate(parent); + KUNIT_ASSERT_GT(test, new_parent_rate, 0); + KUNIT_EXPECT_EQ(test, parent_rate, new_parent_rate); + + clk_put(parent); + clk_put(clk); +} + /* * Test that, for a mux whose current parent hasn't been registered yet, * calling clk_set_rate_range() will succeed, and will be taken into @@ -724,8 +829,11 @@ clk_test_orphan_transparent_multiple_parent_mux_set_range_set_parent_get_rate(st static struct kunit_case clk_orphan_transparent_multiple_parent_mux_test_cases[] = { KUNIT_CASE(clk_test_orphan_transparent_multiple_parent_mux_get_parent), KUNIT_CASE(clk_test_orphan_transparent_multiple_parent_mux_set_parent), + KUNIT_CASE(clk_test_orphan_transparent_multiple_parent_mux_set_parent_drop_range), KUNIT_CASE(clk_test_orphan_transparent_multiple_parent_mux_set_parent_get_rate), + KUNIT_CASE(clk_test_orphan_transparent_multiple_parent_mux_set_parent_put), KUNIT_CASE(clk_test_orphan_transparent_multiple_parent_mux_set_parent_set_range_modified), + KUNIT_CASE(clk_test_orphan_transparent_multiple_parent_mux_set_parent_set_range_untouched), KUNIT_CASE(clk_test_orphan_transparent_multiple_parent_mux_set_range_round_rate), KUNIT_CASE(clk_test_orphan_transparent_multiple_parent_mux_set_range_set_parent_get_rate), {} @@ -1021,6 
+1129,136 @@ static struct kunit_suite clk_orphan_transparent_single_parent_test_suite = { .test_cases = clk_orphan_transparent_single_parent_mux_test_cases, }; +struct clk_single_parent_two_lvl_ctx { + struct clk_dummy_context parent_parent_ctx; + struct clk_dummy_context parent_ctx; + struct clk_hw hw; +}; + +static int +clk_orphan_two_level_root_last_test_init(struct kunit *test) +{ + struct clk_single_parent_two_lvl_ctx *ctx; + int ret; + + ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + test->priv = ctx; + + ctx->parent_ctx.hw.init = + CLK_HW_INIT("intermediate-parent", + "root-parent", + &clk_dummy_single_parent_ops, + CLK_SET_RATE_PARENT); + ret = clk_hw_register(NULL, &ctx->parent_ctx.hw); + if (ret) + return ret; + + ctx->hw.init = + CLK_HW_INIT("test-clk", "intermediate-parent", + &clk_dummy_single_parent_ops, + CLK_SET_RATE_PARENT); + ret = clk_hw_register(NULL, &ctx->hw); + if (ret) + return ret; + + ctx->parent_parent_ctx.rate = DUMMY_CLOCK_INIT_RATE; + ctx->parent_parent_ctx.hw.init = + CLK_HW_INIT_NO_PARENT("root-parent", + &clk_dummy_rate_ops, + 0); + ret = clk_hw_register(NULL, &ctx->parent_parent_ctx.hw); + if (ret) + return ret; + + return 0; +} + +static void +clk_orphan_two_level_root_last_test_exit(struct kunit *test) +{ + struct clk_single_parent_two_lvl_ctx *ctx = test->priv; + + clk_hw_unregister(&ctx->hw); + clk_hw_unregister(&ctx->parent_ctx.hw); + clk_hw_unregister(&ctx->parent_parent_ctx.hw); +} + +/* + * Test that, for a clock whose parent used to be orphan, clk_get_rate() + * will return the proper rate. 
+ */ +static void +clk_orphan_two_level_root_last_test_get_rate(struct kunit *test) +{ + struct clk_single_parent_two_lvl_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + unsigned long rate; + + rate = clk_get_rate(clk); + KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_INIT_RATE); + + clk_put(clk); +} + +/* + * Test that, for a clock whose parent used to be orphan, + * clk_set_rate_range() won't affect its rate if it is already within + * range. + * + * See (for Exynos 4210): + * https://lore.kernel.org/linux-clk/366a0232-bb4a-c357-6aa8-636e398e05eb@samsung.com/ + */ +static void +clk_orphan_two_level_root_last_test_set_range(struct kunit *test) +{ + struct clk_single_parent_two_lvl_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + unsigned long rate; + int ret; + + ret = clk_set_rate_range(clk, + DUMMY_CLOCK_INIT_RATE - 1000, + DUMMY_CLOCK_INIT_RATE + 1000); + KUNIT_ASSERT_EQ(test, ret, 0); + + rate = clk_get_rate(clk); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_EQ(test, rate, DUMMY_CLOCK_INIT_RATE); + + clk_put(clk); +} + +static struct kunit_case +clk_orphan_two_level_root_last_test_cases[] = { + KUNIT_CASE(clk_orphan_two_level_root_last_test_get_rate), + KUNIT_CASE(clk_orphan_two_level_root_last_test_set_range), + {} +}; + +/* + * Test suite for a basic, transparent, clock with a parent that is also + * such a clock. The parent's parent is registered last, while the + * parent and its child are registered in that order. The intermediate + * and leaf clocks will thus be orphan when registered, but the leaf + * clock itself will always have its parent and will never be + * reparented. Indeed, it's only orphan because its parent is. + * + * These tests exercise the behaviour of the consumer API when dealing + * with an orphan clock, and how we deal with the transition to a valid + * parent. 
+ */ +static struct kunit_suite +clk_orphan_two_level_root_last_test_suite = { + .name = "clk-orphan-two-level-root-last-test", + .init = clk_orphan_two_level_root_last_test_init, + .exit = clk_orphan_two_level_root_last_test_exit, + .test_cases = clk_orphan_two_level_root_last_test_cases, +}; + /* * Test that clk_set_rate_range won't return an error for a valid range * and that it will make sure the rate of the clock is within the @@ -1729,6 +1967,7 @@ kunit_test_suites( &clk_multiple_parents_mux_test_suite, &clk_orphan_transparent_multiple_parent_mux_test_suite, &clk_orphan_transparent_single_parent_test_suite, + &clk_orphan_two_level_root_last_test_suite, &clk_range_test_suite, &clk_range_maximize_test_suite, &clk_range_minimize_test_suite, -- GitLab From 718af795d3fd786928506cd5251597fbe29c7fda Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:19 +0200 Subject: [PATCH 0322/2223] clk: Change clk_core_init_rate_req prototype The expectation is that a clk_rate_request structure is supposed to be initialized using clk_core_init_rate_req(), yet the rate we want to request still needs to be set by hand. Let's just pass the rate as a function argument so that callers don't have any extra work to do. 
Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-15-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 91bb1ea0e147b..75cfde9f917f9 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -1380,13 +1380,16 @@ static int clk_core_determine_round_nolock(struct clk_core *core, } static void clk_core_init_rate_req(struct clk_core * const core, - struct clk_rate_request *req) + struct clk_rate_request *req, + unsigned long rate) { struct clk_core *parent; if (WARN_ON(!core || !req)) return; + req->rate = rate; + parent = core->parent; if (parent) { req->best_parent_hw = parent->hw; @@ -1412,7 +1415,7 @@ static int clk_core_round_rate_nolock(struct clk_core *core, return 0; } - clk_core_init_rate_req(core, req); + clk_core_init_rate_req(core, req, req->rate); if (clk_core_can_round(core)) return clk_core_determine_round_nolock(core, req); @@ -2004,11 +2007,10 @@ static struct clk_core *clk_calc_new_rates(struct clk_core *core, if (clk_core_can_round(core)) { struct clk_rate_request req; - req.rate = rate; req.min_rate = min_rate; req.max_rate = max_rate; - clk_core_init_rate_req(core, &req); + clk_core_init_rate_req(core, &req, rate); ret = clk_core_determine_round_nolock(core, &req); if (ret < 0) -- GitLab From 8cd9c39dce5b54494f967235f7eeac7c30c08b97 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:20 +0200 Subject: [PATCH 0323/2223] clk: Move clk_core_init_rate_req() from clk_core_round_rate_nolock() to its caller The clk_rate_request structure is used internally as an argument for the clk_core_determine_round_nolock() and clk_core_round_rate_nolock(). 
In both cases, the clk_core_init_rate_req() function is used to initialize the clk_rate_request structure. However, the expectation on who gets to call that function is inconsistent between those two functions. Indeed, clk_core_determine_round_nolock() will assume the structure is properly initialized and will just use it. On the other hand, clk_core_round_rate_nolock() will call clk_core_init_rate_req() itself, expecting the caller to have filled only a minimal set of parameters (rate, min_rate and max_rate). If we ignore the calling convention inconsistency, this leads to a second inconsistency for drivers: * If they get called by the framework through clk_core_round_rate_nolock(), the rate, min_rate and max_rate fields will be filled by the caller, and the best_parent_rate and best_parent_hw fields will get filled by clk_core_init_rate_req(). * If they get called by a driver through __clk_determine_rate (and thus clk_core_round_rate_nolock), only best_parent_rate and best_parent_hw are being explicitly set by the framework. Even though we can reasonably expect rate to be set, only one of the 6 in-tree users explicitly sets min_rate and max_rate. * If they get called by the framework through clk_core_determine_round_nolock(), then we have two callpaths. Either it will be called by clk_core_round_rate_nolock() itself, or it will be called by clk_calc_new_rates(), which will properly initialize rate, min_rate, max_rate itself, and best_parent_rate and best_parent_hw through clk_core_init_rate_req(). Even though the first and third cases seem equivalent, they aren't when the clock has CLK_SET_RATE_PARENT. Indeed, in such a case clk_core_round_rate_nolock() will call itself on the current parent clock with the same clk_rate_request structure. The clk_core_init_rate_req() function will then be called on the parent clock, with the child clk_rate_request pointer and will fill the best_parent_rate and best_parent_hw fields with the parent context. 
When the whole recursion stops and the call returns, the initial caller will end up with a clk_rate_request structure with some information of the child clock (rate, min_rate, max_rate) and some others of the last clock up the tree whose child had CLK_SET_RATE_PARENT (best_parent_hw, best_parent_rate). In the most common case, best_parent_rate is going to be equal on all the parent clocks so it's not a big deal. However, best_parent_hw is going to point to a clock that never has been a valid parent for that clock which is definitely confusing. In order to fix the calling inconsistency, let's move the clk_core_init_rate_req() calls to the callers, which will also help a bit with the clk_core_round_rate_nolock() recursion. Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-16-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 75cfde9f917f9..553e1e9eb3001 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -1415,8 +1415,6 @@ static int clk_core_round_rate_nolock(struct clk_core *core, return 0; } - clk_core_init_rate_req(core, req, req->rate); - if (clk_core_can_round(core)) return clk_core_determine_round_nolock(core, req); else if (core->flags & CLK_SET_RATE_PARENT) @@ -1464,8 +1462,8 @@ unsigned long clk_hw_round_rate(struct clk_hw *hw, unsigned long rate) int ret; struct clk_rate_request req; + clk_core_init_rate_req(hw->core, &req, rate); clk_core_get_boundaries(hw->core, &req.min_rate, &req.max_rate); - req.rate = rate; ret = clk_core_round_rate_nolock(hw->core, &req); if (ret) @@ -1497,8 +1495,8 @@ long clk_round_rate(struct clk *clk, unsigned long rate) if (clk->exclusive_count) clk_core_rate_unprotect(clk->core); + 
clk_core_init_rate_req(clk->core, &req, rate); clk_core_get_boundaries(clk->core, &req.min_rate, &req.max_rate); - req.rate = rate; ret = clk_core_round_rate_nolock(clk->core, &req); @@ -2209,8 +2207,8 @@ static unsigned long clk_core_req_round_rate_nolock(struct clk_core *core, if (cnt < 0) return cnt; + clk_core_init_rate_req(core, &req, req_rate); clk_core_get_boundaries(core, &req.min_rate, &req.max_rate); - req.rate = req_rate; ret = clk_core_round_rate_nolock(core, &req); -- GitLab From c35e84b0977617f96fb1dbef3fb8d71a861325c0 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:21 +0200 Subject: [PATCH 0324/2223] clk: Introduce clk_hw_init_rate_request() clk-divider instantiates clk_rate_request internally for its round_rate implementations to share the code with its determine_rate implementations. However, it's missing a few fields (min_rate, max_rate) that would be initialized properly if it was using clk_core_init_rate_req(). Let's create the clk_hw_init_rate_request() function for clock providers to be able to share the code to instantiate clk_rate_requests with the framework. This will also be useful for some tests introduced in later patches. 
Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-17-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk-divider.c | 20 ++++++++++---------- drivers/clk/clk.c | 20 ++++++++++++++++++++ include/linux/clk-provider.h | 6 ++++++ 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c index f6b2bf5584867..a2c2b5203b0a9 100644 --- a/drivers/clk/clk-divider.c +++ b/drivers/clk/clk-divider.c @@ -386,13 +386,13 @@ long divider_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, const struct clk_div_table *table, u8 width, unsigned long flags) { - struct clk_rate_request req = { - .rate = rate, - .best_parent_rate = *prate, - .best_parent_hw = parent, - }; + struct clk_rate_request req; int ret; + clk_hw_init_rate_request(hw, &req, rate); + req.best_parent_rate = *prate; + req.best_parent_hw = parent; + ret = divider_determine_rate(hw, &req, table, width, flags); if (ret) return ret; @@ -408,13 +408,13 @@ long divider_ro_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, const struct clk_div_table *table, u8 width, unsigned long flags, unsigned int val) { - struct clk_rate_request req = { - .rate = rate, - .best_parent_rate = *prate, - .best_parent_hw = parent, - }; + struct clk_rate_request req; int ret; + clk_hw_init_rate_request(hw, &req, rate); + req.best_parent_rate = *prate; + req.best_parent_hw = parent; + ret = divider_ro_determine_rate(hw, &req, table, width, flags, val); if (ret) return ret; diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 553e1e9eb3001..96b372ff23c21 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -1400,6 +1400,26 @@ static void clk_core_init_rate_req(struct clk_core * const core, } } +/** + * clk_hw_init_rate_request - Initializes a clk_rate_request 
+ * @hw: the clk for which we want to submit a rate request + * @req: the clk_rate_request structure we want to initialise + * @rate: the rate which is to be requested + * + * Initializes a clk_rate_request structure to submit to + * __clk_determine_rate() or similar functions. + */ +void clk_hw_init_rate_request(const struct clk_hw *hw, + struct clk_rate_request *req, + unsigned long rate) +{ + if (WARN_ON(!hw || !req)) + return; + + clk_core_init_rate_req(hw->core, req, rate); +} +EXPORT_SYMBOL_GPL(clk_hw_init_rate_request); + static bool clk_core_can_round(struct clk_core * const core) { return core->ops->determine_rate || core->ops->round_rate; diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 9a14cfa0d2011..d857717aa3860 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -42,6 +42,8 @@ struct dentry; * struct clk_rate_request - Structure encoding the clk constraints that * a clock user might require. * + * Should be initialized by calling clk_hw_init_rate_request(). + * * @rate: Requested clock rate. This field will be adjusted by * clock drivers according to hardware capabilities. * @min_rate: Minimum rate imposed by clk users. @@ -60,6 +62,10 @@ struct clk_rate_request { struct clk_hw *best_parent_hw; }; +void clk_hw_init_rate_request(const struct clk_hw *hw, + struct clk_rate_request *req, + unsigned long rate); + /** * struct clk_duty - Struture encoding the duty cycle ratio of a clock * -- GitLab From 11c84a38fcff30197f6e8af29e65531a5734ee05 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:22 +0200 Subject: [PATCH 0325/2223] clk: Add our request boundaries in clk_core_init_rate_req The expectation is that a new clk_rate_request is initialized through a call to clk_core_init_rate_req(). However, at the moment it only fills the parent rate and clk_hw pointer, but omits the other fields such as the clock rate boundaries. 
Some users of that function will update them after calling it, but most don't. As we are passed the clk_core pointer, we have access to those boundaries in clk_core_init_rate_req() however, so let's just fill it there and remove it from the few callers that do it right. Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-18-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 96b372ff23c21..2794bd3bef4b8 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -1389,6 +1389,7 @@ static void clk_core_init_rate_req(struct clk_core * const core, return; req->rate = rate; + clk_core_get_boundaries(core, &req->min_rate, &req->max_rate); parent = core->parent; if (parent) { @@ -1483,7 +1484,6 @@ unsigned long clk_hw_round_rate(struct clk_hw *hw, unsigned long rate) struct clk_rate_request req; clk_core_init_rate_req(hw->core, &req, rate); - clk_core_get_boundaries(hw->core, &req.min_rate, &req.max_rate); ret = clk_core_round_rate_nolock(hw->core, &req); if (ret) @@ -1516,7 +1516,6 @@ long clk_round_rate(struct clk *clk, unsigned long rate) clk_core_rate_unprotect(clk->core); clk_core_init_rate_req(clk->core, &req, rate); - clk_core_get_boundaries(clk->core, &req.min_rate, &req.max_rate); ret = clk_core_round_rate_nolock(clk->core, &req); @@ -2025,9 +2024,6 @@ static struct clk_core *clk_calc_new_rates(struct clk_core *core, if (clk_core_can_round(core)) { struct clk_rate_request req; - req.min_rate = min_rate; - req.max_rate = max_rate; - clk_core_init_rate_req(core, &req, rate); ret = clk_core_determine_round_nolock(core, &req); @@ -2228,7 +2224,6 @@ static unsigned long clk_core_req_round_rate_nolock(struct clk_core *core, return cnt; 
clk_core_init_rate_req(core, &req, req_rate); - clk_core_get_boundaries(core, &req.min_rate, &req.max_rate); ret = clk_core_round_rate_nolock(core, &req); -- GitLab From 666650b25ac43700a1cce96e51a32fc89c973d28 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:23 +0200 Subject: [PATCH 0326/2223] clk: Switch from __clk_determine_rate to clk_core_round_rate_nolock clk_mux_determine_rate_flags() will call into __clk_determine_rate() with a clk_hw pointer, while it has access to the clk_core pointer already. This leads to back and forth between clk_hw and clk_core, while __clk_determine_rate will only call clk_core_round_rate_nolock() with the clk_core pointer it retrieved from the clk_hw. Let's simplify things a bit by calling into clk_core_round_rate_nolock directly. Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-19-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 2794bd3bef4b8..f8a8bdd552d66 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -536,6 +536,9 @@ static bool mux_is_better_rate(unsigned long rate, unsigned long now, return now <= rate && now > best; } +static int clk_core_round_rate_nolock(struct clk_core *core, + struct clk_rate_request *req); + int clk_mux_determine_rate_flags(struct clk_hw *hw, struct clk_rate_request *req, unsigned long flags) @@ -549,8 +552,12 @@ int clk_mux_determine_rate_flags(struct clk_hw *hw, if (core->flags & CLK_SET_RATE_NO_REPARENT) { parent = core->parent; if (core->flags & CLK_SET_RATE_PARENT) { - ret = __clk_determine_rate(parent ? 
parent->hw : NULL, - &parent_req); + if (!parent) { + req->rate = 0; + return 0; + } + + ret = clk_core_round_rate_nolock(parent, &parent_req); if (ret) return ret; @@ -573,7 +580,7 @@ int clk_mux_determine_rate_flags(struct clk_hw *hw, if (core->flags & CLK_SET_RATE_PARENT) { parent_req = *req; - ret = __clk_determine_rate(parent->hw, &parent_req); + ret = clk_core_round_rate_nolock(parent, &parent_req); if (ret) continue; } else { -- GitLab From 1234a2c40b8cf16041fb9acd730160e6c5b4ba13 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:24 +0200 Subject: [PATCH 0327/2223] clk: Introduce clk_core_has_parent() We will need to know if a clk_core pointer has a given parent in other functions, so let's create a clk_core_has_parent() function that clk_has_parent() will call into. For good measure, let's add some unit tests as well to make sure it works properly. Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-20-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju [sboyd@kernel.org: Move tmp declaration, fix conditional to check for current parent] Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 37 +++++++++++++++++++++--------------- drivers/clk/clk_test.c | 43 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 15 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index f8a8bdd552d66..8e4a8b9aa3203 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -539,6 +539,27 @@ static bool mux_is_better_rate(unsigned long rate, unsigned long now, static int clk_core_round_rate_nolock(struct clk_core *core, struct clk_rate_request *req); +static bool clk_core_has_parent(struct clk_core *core, const struct clk_core *parent) +{ + struct clk_core *tmp; + unsigned int i; + + /* Optimize for the case where the parent is already the parent. 
*/ + if (core->parent == parent) + return true; + + for (i = 0; i < core->num_parents; i++) { + tmp = clk_core_get_parent_by_index(core, i); + if (!tmp) + continue; + + if (tmp == parent) + return true; + } + + return false; +} + int clk_mux_determine_rate_flags(struct clk_hw *hw, struct clk_rate_request *req, unsigned long flags) @@ -2574,25 +2595,11 @@ void clk_hw_reparent(struct clk_hw *hw, struct clk_hw *new_parent) */ bool clk_has_parent(struct clk *clk, struct clk *parent) { - struct clk_core *core, *parent_core; - int i; - /* NULL clocks should be nops, so return success if either is NULL. */ if (!clk || !parent) return true; - core = clk->core; - parent_core = parent->core; - - /* Optimize for the case where the parent is already the parent. */ - if (core->parent == parent_core) - return true; - - for (i = 0; i < core->num_parents; i++) - if (!strcmp(core->parents[i].name, parent_core->name)) - return true; - - return false; + return clk_core_has_parent(clk->core, parent->core); } EXPORT_SYMBOL_GPL(clk_has_parent); diff --git a/drivers/clk/clk_test.c b/drivers/clk/clk_test.c index d1b1372f7aaa9..7068517428e29 100644 --- a/drivers/clk/clk_test.c +++ b/drivers/clk/clk_test.c @@ -491,8 +491,32 @@ clk_test_multiple_parents_mux_get_parent(struct kunit *test) clk_put(clk); } +/* + * Test that for a clock with a multiple parents, clk_has_parent() + * actually reports all of them as parents. 
+ */ +static void +clk_test_multiple_parents_mux_has_parent(struct kunit *test) +{ + struct clk_multiple_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *parent; + + parent = clk_hw_get_clk(&ctx->parents_ctx[0].hw, NULL); + KUNIT_EXPECT_TRUE(test, clk_has_parent(clk, parent)); + clk_put(parent); + + parent = clk_hw_get_clk(&ctx->parents_ctx[1].hw, NULL); + KUNIT_EXPECT_TRUE(test, clk_has_parent(clk, parent)); + clk_put(parent); + + clk_put(clk); +} + static struct kunit_case clk_multiple_parents_mux_test_cases[] = { KUNIT_CASE(clk_test_multiple_parents_mux_get_parent), + KUNIT_CASE(clk_test_multiple_parents_mux_has_parent), {} }; @@ -918,6 +942,24 @@ clk_test_single_parent_mux_get_parent(struct kunit *test) clk_put(clk); } +/* + * Test that for a clock with a single parent, clk_has_parent() actually + * reports it as a parent. + */ +static void +clk_test_single_parent_mux_has_parent(struct kunit *test) +{ + struct clk_single_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *parent = clk_hw_get_clk(&ctx->parent_ctx.hw, NULL); + + KUNIT_EXPECT_TRUE(test, clk_has_parent(clk, parent)); + + clk_put(parent); + clk_put(clk); +} + /* * Test that for a clock that can't modify its rate and with a single * parent, if we set disjoints range on the parent and then the child, @@ -1022,6 +1064,7 @@ clk_test_single_parent_mux_set_range_round_rate_child_smaller(struct kunit *test static struct kunit_case clk_single_parent_mux_test_cases[] = { KUNIT_CASE(clk_test_single_parent_mux_get_parent), + KUNIT_CASE(clk_test_single_parent_mux_has_parent), KUNIT_CASE(clk_test_single_parent_mux_set_range_disjoint_child_last), KUNIT_CASE(clk_test_single_parent_mux_set_range_disjoint_parent_last), KUNIT_CASE(clk_test_single_parent_mux_set_range_round_rate_child_smaller), -- GitLab From 22fb0e284fbc3c1b85d24c5a1df8ea3ac82ab1d1 Mon Sep 17 00:00:00 2001 
From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:25 +0200 Subject: [PATCH 0328/2223] clk: Constify clk_has_parent() clk_has_parent() doesn't modify the clocks being passed, so let's make it const. Suggested-by: Stephen Boyd Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-21-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 2 +- include/linux/clk.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 8e4a8b9aa3203..3b68a9b8234a0 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -2593,7 +2593,7 @@ void clk_hw_reparent(struct clk_hw *hw, struct clk_hw *new_parent) * * Returns true if @parent is a possible parent for @clk, false otherwise. */ -bool clk_has_parent(struct clk *clk, struct clk *parent) +bool clk_has_parent(const struct clk *clk, const struct clk *parent) { /* NULL clocks should be nops, so return success if either is NULL. */ if (!clk || !parent) diff --git a/include/linux/clk.h b/include/linux/clk.h index c13061cabdfc9..1ef0133242374 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -799,7 +799,7 @@ int clk_set_rate_exclusive(struct clk *clk, unsigned long rate); * * Returns true if @parent is a possible parent for @clk, false otherwise. */ -bool clk_has_parent(struct clk *clk, struct clk *parent); +bool clk_has_parent(const struct clk *clk, const struct clk *parent); /** * clk_set_rate_range - set a rate range for a clock source -- GitLab From 262ca38f4b6eb418b20b8e1d6d8495c6a98727c1 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:26 +0200 Subject: [PATCH 0329/2223] clk: Stop forwarding clk_rate_requests to the parent If the clock cannot modify its rate and has CLK_SET_RATE_PARENT, clk_mux_determine_rate_flags(), clk_core_round_rate_nolock() and a number of drivers will forward the clk_rate_request to the parent clock. 
clk_core_round_rate_nolock() will pass the pointer directly, which means that we pass a clk_rate_request to the parent that has the rate, min_rate and max_rate of the child, and the best_parent_rate and best_parent_hw fields will be relative to the child as well, so will point to our current clock and its rate. The most common case for CLK_SET_RATE_PARENT is that the child and parent clock rates will be equal, so the rate field isn't a worry, but the other fields are. Similarly, if the parent clock driver ever modifies the best_parent_rate or best_parent_hw, this will be applied to the child once the call to clk_core_round_rate_nolock() is done. best_parent_hw is probably not going to be a valid parent, and best_parent_rate might lead to a parent rate change different to the one that was initially computed. clk_mux_determine_rate_flags() and the affected drivers will copy the request before forwarding it to the parents, so they won't be affected by the latter issue, but the former is still going to be there and will lead to erroneous data and context being passed to the various clock drivers in the same sub-tree. Let's create two new functions, clk_core_forward_rate_req() and clk_hw_forward_rate_request() for the framework and the clock providers that will copy a request from a child clock and update the context to match the parent's. We also update the relevant call sites in the framework and drivers to use that new function. Let's also add a test to make sure we avoid regressions there. 
Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-22-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/at91/clk-generated.c | 5 +- drivers/clk/at91/clk-master.c | 9 +- drivers/clk/at91/clk-peripheral.c | 4 +- drivers/clk/clk-composite.c | 6 +- drivers/clk/clk.c | 84 ++++++++++++-- drivers/clk/clk_test.c | 182 ++++++++++++++++++++++++++++++ include/linux/clk-provider.h | 5 + 7 files changed, 279 insertions(+), 16 deletions(-) diff --git a/drivers/clk/at91/clk-generated.c b/drivers/clk/at91/clk-generated.c index d429ba52a7190..943ea67bf135f 100644 --- a/drivers/clk/at91/clk-generated.c +++ b/drivers/clk/at91/clk-generated.c @@ -136,7 +136,6 @@ static int clk_generated_determine_rate(struct clk_hw *hw, { struct clk_generated *gck = to_clk_generated(hw); struct clk_hw *parent = NULL; - struct clk_rate_request req_parent = *req; long best_rate = -EINVAL; unsigned long min_rate, parent_rate; int best_diff = -1; @@ -192,7 +191,9 @@ static int clk_generated_determine_rate(struct clk_hw *hw, goto end; for (div = 1; div < GENERATED_MAX_DIV + 2; div++) { - req_parent.rate = req->rate * div; + struct clk_rate_request req_parent; + + clk_hw_forward_rate_request(hw, req, parent, &req_parent, req->rate * div); if (__clk_determine_rate(parent, &req_parent)) continue; clk_generated_best_diff(req, parent, req_parent.rate, div, diff --git a/drivers/clk/at91/clk-master.c b/drivers/clk/at91/clk-master.c index 164e2959c7cfb..b7cd1924de52a 100644 --- a/drivers/clk/at91/clk-master.c +++ b/drivers/clk/at91/clk-master.c @@ -581,7 +581,6 @@ static int clk_sama7g5_master_determine_rate(struct clk_hw *hw, struct clk_rate_request *req) { struct clk_master *master = to_clk_master(hw); - struct clk_rate_request req_parent = *req; struct clk_hw *parent; long best_rate = LONG_MIN, best_diff = 
LONG_MIN; unsigned long parent_rate; @@ -618,11 +617,15 @@ static int clk_sama7g5_master_determine_rate(struct clk_hw *hw, goto end; for (div = 0; div < MASTER_PRES_MAX + 1; div++) { + struct clk_rate_request req_parent; + unsigned long req_rate; + if (div == MASTER_PRES_MAX) - req_parent.rate = req->rate * 3; + req_rate = req->rate * 3; else - req_parent.rate = req->rate << div; + req_rate = req->rate << div; + clk_hw_forward_rate_request(hw, req, parent, &req_parent, req_rate); if (__clk_determine_rate(parent, &req_parent)) continue; diff --git a/drivers/clk/at91/clk-peripheral.c b/drivers/clk/at91/clk-peripheral.c index e14fa5ac734ce..5104d4025484c 100644 --- a/drivers/clk/at91/clk-peripheral.c +++ b/drivers/clk/at91/clk-peripheral.c @@ -269,7 +269,6 @@ static int clk_sam9x5_peripheral_determine_rate(struct clk_hw *hw, { struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw); struct clk_hw *parent = clk_hw_get_parent(hw); - struct clk_rate_request req_parent = *req; unsigned long parent_rate = clk_hw_get_rate(parent); unsigned long tmp_rate; long best_rate = LONG_MIN; @@ -302,8 +301,9 @@ static int clk_sam9x5_peripheral_determine_rate(struct clk_hw *hw, goto end; for (shift = 0; shift <= PERIPHERAL_MAX_SHIFT; shift++) { - req_parent.rate = req->rate << shift; + struct clk_rate_request req_parent; + clk_hw_forward_rate_request(hw, req, parent, &req_parent, req->rate << shift); if (__clk_determine_rate(parent, &req_parent)) continue; diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c index b9c5f904f5356..edfa94641bbfe 100644 --- a/drivers/clk/clk-composite.c +++ b/drivers/clk/clk-composite.c @@ -85,10 +85,11 @@ static int clk_composite_determine_rate(struct clk_hw *hw, req->best_parent_hw = NULL; if (clk_hw_get_flags(hw) & CLK_SET_RATE_NO_REPARENT) { - struct clk_rate_request tmp_req = *req; + struct clk_rate_request tmp_req; parent = clk_hw_get_parent(mux_hw); + clk_hw_forward_rate_request(hw, req, parent, &tmp_req, req->rate); 
ret = clk_composite_determine_rate_for_parent(rate_hw, &tmp_req, parent, @@ -104,12 +105,13 @@ static int clk_composite_determine_rate(struct clk_hw *hw, } for (i = 0; i < clk_hw_get_num_parents(mux_hw); i++) { - struct clk_rate_request tmp_req = *req; + struct clk_rate_request tmp_req; parent = clk_hw_get_parent_by_index(mux_hw, i); if (!parent) continue; + clk_hw_forward_rate_request(hw, req, parent, &tmp_req, req->rate); ret = clk_composite_determine_rate_for_parent(rate_hw, &tmp_req, parent, diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 3b68a9b8234a0..3f60eb836980d 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -536,6 +536,10 @@ static bool mux_is_better_rate(unsigned long rate, unsigned long now, return now <= rate && now > best; } +static void clk_core_init_rate_req(struct clk_core * const core, + struct clk_rate_request *req, + unsigned long rate); + static int clk_core_round_rate_nolock(struct clk_core *core, struct clk_rate_request *req); @@ -560,6 +564,25 @@ static bool clk_core_has_parent(struct clk_core *core, const struct clk_core *pa return false; } +static void +clk_core_forward_rate_req(struct clk_core *core, + const struct clk_rate_request *old_req, + struct clk_core *parent, + struct clk_rate_request *req, + unsigned long parent_rate) +{ + if (WARN_ON(!clk_core_has_parent(core, parent))) + return; + + clk_core_init_rate_req(parent, req, parent_rate); + + if (req->min_rate < old_req->min_rate) + req->min_rate = old_req->min_rate; + + if (req->max_rate > old_req->max_rate) + req->max_rate = old_req->max_rate; +} + int clk_mux_determine_rate_flags(struct clk_hw *hw, struct clk_rate_request *req, unsigned long flags) @@ -567,17 +590,19 @@ int clk_mux_determine_rate_flags(struct clk_hw *hw, struct clk_core *core = hw->core, *parent, *best_parent = NULL; int i, num_parents, ret; unsigned long best = 0; - struct clk_rate_request parent_req = *req; /* if NO_REPARENT flag set, pass through to current parent */ if (core->flags & 
CLK_SET_RATE_NO_REPARENT) { parent = core->parent; if (core->flags & CLK_SET_RATE_PARENT) { + struct clk_rate_request parent_req; + if (!parent) { req->rate = 0; return 0; } + clk_core_forward_rate_req(core, req, parent, &parent_req, req->rate); ret = clk_core_round_rate_nolock(parent, &parent_req); if (ret) return ret; @@ -595,23 +620,29 @@ int clk_mux_determine_rate_flags(struct clk_hw *hw, /* find the parent that can provide the fastest rate <= rate */ num_parents = core->num_parents; for (i = 0; i < num_parents; i++) { + unsigned long parent_rate; + parent = clk_core_get_parent_by_index(core, i); if (!parent) continue; if (core->flags & CLK_SET_RATE_PARENT) { - parent_req = *req; + struct clk_rate_request parent_req; + + clk_core_forward_rate_req(core, req, parent, &parent_req, req->rate); ret = clk_core_round_rate_nolock(parent, &parent_req); if (ret) continue; + + parent_rate = parent_req.rate; } else { - parent_req.rate = clk_core_get_rate_nolock(parent); + parent_rate = clk_core_get_rate_nolock(parent); } - if (mux_is_better_rate(req->rate, parent_req.rate, + if (mux_is_better_rate(req->rate, parent_rate, best, flags)) { best_parent = parent; - best = parent_req.rate; + best = parent_rate; } } @@ -1449,6 +1480,31 @@ void clk_hw_init_rate_request(const struct clk_hw *hw, } EXPORT_SYMBOL_GPL(clk_hw_init_rate_request); +/** + * clk_hw_forward_rate_request - Forwards a clk_rate_request to a clock's parent + * @hw: the original clock that got the rate request + * @old_req: the original clk_rate_request structure we want to forward + * @parent: the clk we want to forward @old_req to + * @req: the clk_rate_request structure we want to initialise + * @parent_rate: The rate which is to be requested to @parent + * + * Initializes a clk_rate_request structure to submit to a clock parent + * in __clk_determine_rate() or similar functions. 
+ */ +void clk_hw_forward_rate_request(const struct clk_hw *hw, + const struct clk_rate_request *old_req, + const struct clk_hw *parent, + struct clk_rate_request *req, + unsigned long parent_rate) +{ + if (WARN_ON(!hw || !old_req || !parent || !req)) + return; + + clk_core_forward_rate_req(hw->core, old_req, + parent->core, req, + parent_rate); +} + static bool clk_core_can_round(struct clk_core * const core) { return core->ops->determine_rate || core->ops->round_rate; @@ -1457,6 +1513,8 @@ static bool clk_core_can_round(struct clk_core * const core) static int clk_core_round_rate_nolock(struct clk_core *core, struct clk_rate_request *req) { + int ret; + lockdep_assert_held(&prepare_lock); if (!core) { @@ -1466,8 +1524,20 @@ static int clk_core_round_rate_nolock(struct clk_core *core, if (clk_core_can_round(core)) return clk_core_determine_round_nolock(core, req); - else if (core->flags & CLK_SET_RATE_PARENT) - return clk_core_round_rate_nolock(core->parent, req); + + if (core->flags & CLK_SET_RATE_PARENT) { + struct clk_rate_request parent_req; + + clk_core_forward_rate_req(core, req, core->parent, &parent_req, req->rate); + ret = clk_core_round_rate_nolock(core->parent, &parent_req); + if (ret) + return ret; + + req->best_parent_rate = parent_req.rate; + req->rate = parent_req.rate; + + return 0; + } req->rate = core->rate; return 0; diff --git a/drivers/clk/clk_test.c b/drivers/clk/clk_test.c index 7068517428e29..3004ef368bd79 100644 --- a/drivers/clk/clk_test.c +++ b/drivers/clk/clk_test.c @@ -1024,6 +1024,36 @@ clk_test_single_parent_mux_set_range_disjoint_parent_last(struct kunit *test) clk_put(clk); } +/* + * Test that for a clock that can't modify its rate and with a single + * parent, if we set a range on the parent and then call + * clk_round_rate(), the boundaries of the parent are taken into + * account. 
+ */ +static void +clk_test_single_parent_mux_set_range_round_rate_parent_only(struct kunit *test) +{ + struct clk_single_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *parent; + unsigned long rate; + int ret; + + parent = clk_get_parent(clk); + KUNIT_ASSERT_PTR_NE(test, parent, NULL); + + ret = clk_set_rate_range(parent, DUMMY_CLOCK_RATE_1, DUMMY_CLOCK_RATE_2); + KUNIT_ASSERT_EQ(test, ret, 0); + + rate = clk_round_rate(clk, DUMMY_CLOCK_RATE_1 - 1000); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_GE(test, rate, DUMMY_CLOCK_RATE_1); + KUNIT_EXPECT_LE(test, rate, DUMMY_CLOCK_RATE_2); + + clk_put(clk); +} + /* * Test that for a clock that can't modify its rate and with a single * parent, if we set a range on the parent and a more restrictive one on @@ -1062,12 +1092,52 @@ clk_test_single_parent_mux_set_range_round_rate_child_smaller(struct kunit *test clk_put(clk); } +/* + * Test that for a clock that can't modify its rate and with a single + * parent, if we set a range on the child and a more restrictive one on + * the parent, and then call clk_round_rate(), the boundaries of the + * two clocks are taken into account. 
+ */ +static void +clk_test_single_parent_mux_set_range_round_rate_parent_smaller(struct kunit *test) +{ + struct clk_single_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *parent; + unsigned long rate; + int ret; + + parent = clk_get_parent(clk); + KUNIT_ASSERT_PTR_NE(test, parent, NULL); + + ret = clk_set_rate_range(parent, DUMMY_CLOCK_RATE_1 + 1000, DUMMY_CLOCK_RATE_2 - 1000); + KUNIT_ASSERT_EQ(test, ret, 0); + + ret = clk_set_rate_range(clk, DUMMY_CLOCK_RATE_1, DUMMY_CLOCK_RATE_2); + KUNIT_ASSERT_EQ(test, ret, 0); + + rate = clk_round_rate(clk, DUMMY_CLOCK_RATE_1 - 1000); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_GE(test, rate, DUMMY_CLOCK_RATE_1 + 1000); + KUNIT_EXPECT_LE(test, rate, DUMMY_CLOCK_RATE_2 - 1000); + + rate = clk_round_rate(clk, DUMMY_CLOCK_RATE_2 + 1000); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_GE(test, rate, DUMMY_CLOCK_RATE_1 + 1000); + KUNIT_EXPECT_LE(test, rate, DUMMY_CLOCK_RATE_2 - 1000); + + clk_put(clk); +} + static struct kunit_case clk_single_parent_mux_test_cases[] = { KUNIT_CASE(clk_test_single_parent_mux_get_parent), KUNIT_CASE(clk_test_single_parent_mux_has_parent), KUNIT_CASE(clk_test_single_parent_mux_set_range_disjoint_child_last), KUNIT_CASE(clk_test_single_parent_mux_set_range_disjoint_parent_last), KUNIT_CASE(clk_test_single_parent_mux_set_range_round_rate_child_smaller), + KUNIT_CASE(clk_test_single_parent_mux_set_range_round_rate_parent_only), + KUNIT_CASE(clk_test_single_parent_mux_set_range_round_rate_parent_smaller), {} }; @@ -2005,7 +2075,119 @@ static struct kunit_suite clk_range_minimize_test_suite = { .test_cases = clk_range_minimize_test_cases, }; +struct clk_leaf_mux_ctx { + struct clk_multiple_parent_ctx mux_ctx; + struct clk_hw hw; +}; + +static int +clk_leaf_mux_set_rate_parent_test_init(struct kunit *test) +{ + struct clk_leaf_mux_ctx *ctx; + const char *top_parents[2] = { "parent-0", "parent-1" }; + int ret; + + ctx = 
kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + test->priv = ctx; + + ctx->mux_ctx.parents_ctx[0].hw.init = CLK_HW_INIT_NO_PARENT("parent-0", + &clk_dummy_rate_ops, + 0); + ctx->mux_ctx.parents_ctx[0].rate = DUMMY_CLOCK_RATE_1; + ret = clk_hw_register(NULL, &ctx->mux_ctx.parents_ctx[0].hw); + if (ret) + return ret; + + ctx->mux_ctx.parents_ctx[1].hw.init = CLK_HW_INIT_NO_PARENT("parent-1", + &clk_dummy_rate_ops, + 0); + ctx->mux_ctx.parents_ctx[1].rate = DUMMY_CLOCK_RATE_2; + ret = clk_hw_register(NULL, &ctx->mux_ctx.parents_ctx[1].hw); + if (ret) + return ret; + + ctx->mux_ctx.current_parent = 0; + ctx->mux_ctx.hw.init = CLK_HW_INIT_PARENTS("test-mux", top_parents, + &clk_multiple_parents_mux_ops, + 0); + ret = clk_hw_register(NULL, &ctx->mux_ctx.hw); + if (ret) + return ret; + + ctx->hw.init = CLK_HW_INIT_HW("test-clock", &ctx->mux_ctx.hw, + &clk_dummy_single_parent_ops, + CLK_SET_RATE_PARENT); + ret = clk_hw_register(NULL, &ctx->hw); + if (ret) + return ret; + + return 0; +} + +static void clk_leaf_mux_set_rate_parent_test_exit(struct kunit *test) +{ + struct clk_leaf_mux_ctx *ctx = test->priv; + + clk_hw_unregister(&ctx->hw); + clk_hw_unregister(&ctx->mux_ctx.hw); + clk_hw_unregister(&ctx->mux_ctx.parents_ctx[0].hw); + clk_hw_unregister(&ctx->mux_ctx.parents_ctx[1].hw); +} + +/* + * Test that, for a clock that will forward any rate request to its + * parent, the rate request structure returned by __clk_determine_rate + * is sane and will be what we expect. 
+ */ +static void clk_leaf_mux_set_rate_parent_determine_rate(struct kunit *test) +{ + struct clk_leaf_mux_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk_rate_request req; + unsigned long rate; + int ret; + + rate = clk_get_rate(clk); + KUNIT_ASSERT_EQ(test, rate, DUMMY_CLOCK_RATE_1); + + clk_hw_init_rate_request(hw, &req, DUMMY_CLOCK_RATE_2); + + ret = __clk_determine_rate(hw, &req); + KUNIT_ASSERT_EQ(test, ret, 0); + + KUNIT_EXPECT_EQ(test, req.rate, DUMMY_CLOCK_RATE_2); + KUNIT_EXPECT_EQ(test, req.best_parent_rate, DUMMY_CLOCK_RATE_2); + KUNIT_EXPECT_PTR_EQ(test, req.best_parent_hw, &ctx->mux_ctx.hw); + + clk_put(clk); +} + +static struct kunit_case clk_leaf_mux_set_rate_parent_test_cases[] = { + KUNIT_CASE(clk_leaf_mux_set_rate_parent_determine_rate), + {} +}; + +/* + * Test suite for a clock whose parent is a mux with multiple parents. + * The leaf clock has CLK_SET_RATE_PARENT, and will forward rate + * requests to the mux, which will then select which parent is the best + * fit for a given rate. + * + * These tests exercise the behaviour of muxes, and the proper selection + * of parents. 
+ */ +static struct kunit_suite clk_leaf_mux_set_rate_parent_test_suite = { + .name = "clk-leaf-mux-set-rate-parent", + .init = clk_leaf_mux_set_rate_parent_test_init, + .exit = clk_leaf_mux_set_rate_parent_test_exit, + .test_cases = clk_leaf_mux_set_rate_parent_test_cases, +}; + kunit_test_suites( + &clk_leaf_mux_set_rate_parent_test_suite, &clk_test_suite, &clk_multiple_parents_mux_test_suite, &clk_orphan_transparent_multiple_parent_mux_test_suite, diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index d857717aa3860..8bce6c524f297 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -65,6 +65,11 @@ struct clk_rate_request { void clk_hw_init_rate_request(const struct clk_hw *hw, struct clk_rate_request *req, unsigned long rate); +void clk_hw_forward_rate_request(const struct clk_hw *core, + const struct clk_rate_request *old_req, + const struct clk_hw *parent, + struct clk_rate_request *req, + unsigned long parent_rate); /** * struct clk_duty - Struture encoding the duty cycle ratio of a clock -- GitLab From b46fd8dbe8ad3fe6dcd44dcdf01a736c50d90a68 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:27 +0200 Subject: [PATCH 0330/2223] clk: Zero the clk_rate_request structure In order to make sure we don't carry anything over from an already existing clk_rate_request pointer we would pass to clk_core_init_rate_req(), let's zero the entire structure before initializing it. 
Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-23-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 3f60eb836980d..6b358448885b6 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -1447,6 +1447,8 @@ static void clk_core_init_rate_req(struct clk_core * const core, if (WARN_ON(!core || !req)) return; + memset(req, 0, sizeof(*req)); + req->rate = rate; clk_core_get_boundaries(core, &req->min_rate, &req->max_rate); -- GitLab From 253993253466ba7187730b196174146d5247e97b Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:28 +0200 Subject: [PATCH 0331/2223] clk: Introduce the clk_hw_get_rate_range function Some clock providers are hand-crafting their clk_rate_request, and need to figure out the current boundaries of their clk_hw to fill it properly. Let's create such a function for clock providers. 
Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-24-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 16 ++++++++++++++++ include/linux/clk-provider.h | 2 ++ 2 files changed, 18 insertions(+) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 6b358448885b6..ec518dc5d4629 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -684,6 +684,22 @@ static void clk_core_get_boundaries(struct clk_core *core, *max_rate = min(*max_rate, clk_user->max_rate); } +/* + * clk_hw_get_rate_range() - returns the clock rate range for a hw clk + * @hw: the hw clk we want to get the range from + * @min_rate: pointer to the variable that will hold the minimum + * @max_rate: pointer to the variable that will hold the maximum + * + * Fills the @min_rate and @max_rate variables with the minimum and + * maximum that clock can reach. + */ +void clk_hw_get_rate_range(struct clk_hw *hw, unsigned long *min_rate, + unsigned long *max_rate) +{ + clk_core_get_boundaries(hw->core, min_rate, max_rate); +} +EXPORT_SYMBOL_GPL(clk_hw_get_rate_range); + static bool clk_core_check_boundaries(struct clk_core *core, unsigned long min_rate, unsigned long max_rate) diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 8bce6c524f297..8724a3547a79f 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1267,6 +1267,8 @@ int clk_mux_determine_rate_flags(struct clk_hw *hw, struct clk_rate_request *req, unsigned long flags); void clk_hw_reparent(struct clk_hw *hw, struct clk_hw *new_parent); +void clk_hw_get_rate_range(struct clk_hw *hw, unsigned long *min_rate, + unsigned long *max_rate); void clk_hw_set_rate_range(struct clk_hw *hw, unsigned long min_rate, unsigned long max_rate); -- GitLab From af1e62f2ffe2b7fa90653f273efced0e0eabf7cc Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:29 +0200 Subject: 
[PATCH 0332/2223] clk: qcom: clk-rcg2: Take clock boundaries into consideration for gfx3d The gfx3d clock is hand-crafting its own clk_rate_request in clk_gfx3d_determine_rate to pass to the parent of that clock. However, since the clk_rate_request is zero'd at creation, it will have a max_rate of 0 which will break any code depending on the clock boundaries. That includes the recent commit 948fb0969eae ("clk: Always clamp the rounded rate") which will clamp the rate given to clk_round_rate() to the current clock boundaries. For the gfx3d clock, it means that since both the min_rate and max_rate fields are set at zero, clk_round_rate() now always returns 0. Let's initialize the min_rate and max_rate fields properly for that clock. Fixes: 948fb0969eae ("clk: Always clamp the rounded rate") Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-25-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- drivers/clk/qcom/clk-rcg2.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/clk/qcom/clk-rcg2.c b/drivers/clk/qcom/clk-rcg2.c index 28019edd2a508..ee536b4579523 100644 --- a/drivers/clk/qcom/clk-rcg2.c +++ b/drivers/clk/qcom/clk-rcg2.c @@ -908,6 +908,15 @@ static int clk_gfx3d_determine_rate(struct clk_hw *hw, req->best_parent_hw = p2; } + clk_hw_get_rate_range(req->best_parent_hw, + &parent_req.min_rate, &parent_req.max_rate); + + if (req->min_rate > parent_req.min_rate) + parent_req.min_rate = req->min_rate; + + if (req->max_rate < parent_req.max_rate) + parent_req.max_rate = req->max_rate; + ret = __clk_determine_rate(req->best_parent_hw, &parent_req); if (ret) return ret; -- GitLab From 433fb8a611ca2a32112668225beabda2302c9634 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:30 +0200 Subject: [PATCH 0333/2223] clk: tests: Add missing test case for ranges Let's add a test on the rate range after a reparenting.
This fails for now, but it's worth having it to document the corner cases we don't support yet. Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-26-maxime@cerno.tech Tested-by: Naresh Kamboju Tested-by: Linux Kernel Functional Testing Signed-off-by: Stephen Boyd --- drivers/clk/clk_test.c | 53 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/drivers/clk/clk_test.c b/drivers/clk/clk_test.c index 3004ef368bd79..509256c5567aa 100644 --- a/drivers/clk/clk_test.c +++ b/drivers/clk/clk_test.c @@ -514,9 +514,62 @@ clk_test_multiple_parents_mux_has_parent(struct kunit *test) clk_put(clk); } +/* + * Test that for a clock with a multiple parents, if we set a range on + * that clock and the parent is changed, its rate after the reparenting + * is still within the range we asked for. + * + * FIXME: clk_set_parent() only does the reparenting but doesn't + * reevaluate whether the new clock rate is within its boundaries or + * not. 
+ */ +static void +clk_test_multiple_parents_mux_set_range_set_parent_get_rate(struct kunit *test) +{ + struct clk_multiple_parent_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *parent1, *parent2; + unsigned long rate; + int ret; + + kunit_skip(test, "This needs to be fixed in the core."); + + parent1 = clk_hw_get_clk(&ctx->parents_ctx[0].hw, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent1); + KUNIT_ASSERT_TRUE(test, clk_is_match(clk_get_parent(clk), parent1)); + + parent2 = clk_hw_get_clk(&ctx->parents_ctx[1].hw, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent2); + + ret = clk_set_rate(parent1, DUMMY_CLOCK_RATE_1); + KUNIT_ASSERT_EQ(test, ret, 0); + + ret = clk_set_rate(parent2, DUMMY_CLOCK_RATE_2); + KUNIT_ASSERT_EQ(test, ret, 0); + + ret = clk_set_rate_range(clk, + DUMMY_CLOCK_RATE_1 - 1000, + DUMMY_CLOCK_RATE_1 + 1000); + KUNIT_ASSERT_EQ(test, ret, 0); + + ret = clk_set_parent(clk, parent2); + KUNIT_ASSERT_EQ(test, ret, 0); + + rate = clk_get_rate(clk); + KUNIT_ASSERT_GT(test, rate, 0); + KUNIT_EXPECT_GE(test, rate, DUMMY_CLOCK_RATE_1 - 1000); + KUNIT_EXPECT_LE(test, rate, DUMMY_CLOCK_RATE_1 + 1000); + + clk_put(parent2); + clk_put(parent1); + clk_put(clk); +} + static struct kunit_case clk_multiple_parents_mux_test_cases[] = { KUNIT_CASE(clk_test_multiple_parents_mux_get_parent), KUNIT_CASE(clk_test_multiple_parents_mux_has_parent), + KUNIT_CASE(clk_test_multiple_parents_mux_set_range_set_parent_get_rate), {} }; -- GitLab From a080f9ad604598a4d32ea36fbf96437c92ccacb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 12 Jul 2022 00:59:15 +0200 Subject: [PATCH 0334/2223] PCI: aardvark: Add support for PCI Bridge Subsystem Vendor ID on emulated bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Register with Subsystem Device/Vendor ID is at offset 0x2c. 
Export it via the emulated bridge to enable support for the Subsystem Device/Vendor ID - by reading it in the PCI controller config space and storing it in the emulated bridge control structures, so that it is exposed in the respective PCI capability. After this change Subsystem ID is visible in lspci output at line: Capabilities: [40] Subsystem Link: https://lore.kernel.org/r/20220711225915.13896-1-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/pci-aardvark.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 966c8b48bd969..7cc51cfb8a13a 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -33,6 +33,7 @@ #define PCIE_CORE_DEV_ID_REG 0x0 #define PCIE_CORE_CMD_STATUS_REG 0x4 #define PCIE_CORE_DEV_REV_REG 0x8 +#define PCIE_CORE_SSDEV_ID_REG 0x2c #define PCIE_CORE_PCIEXP_CAP 0xc0 #define PCIE_CORE_PCIERR_CAP 0x100 #define PCIE_CORE_ERR_CAPCTL_REG 0x118 @@ -1077,6 +1078,8 @@ static int advk_sw_pci_bridge_init(struct advk_pcie *pcie) /* Indicates supports for Completion Retry Status */ bridge->pcie_conf.rootcap = cpu_to_le16(PCI_EXP_RTCAP_CRSVIS); + bridge->subsystem_vendor_id = advk_readl(pcie, PCIE_CORE_SSDEV_ID_REG) & 0xffff; + bridge->subsystem_id = advk_readl(pcie, PCIE_CORE_SSDEV_ID_REG) >> 16; bridge->has_pcie = true; bridge->data = pcie; bridge->ops = &advk_pci_bridge_emul_ops; -- GitLab From 882597aff2d442a52ca173c293939232c2e1ebea Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 14 Sep 2022 07:14:24 -0700 Subject: [PATCH 0335/2223] Input: auo-pixcir-ts - drop support for platform data Currently there are no users of auo_pixcir_ts_platdata in the mainline, and having it (with legacy gpio numbers) prevents us from converting the driver to gpiod API, so let's drop it. 
If, in the future, someone wants to use this driver on non-device tree, non-ACPI system, they should use static device properties instead of platform data. Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20220914141428.2201784-1-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/auo-pixcir-ts.c | 118 ++++++++++------------ include/linux/input/auo-pixcir-ts.h | 44 -------- 2 files changed, 56 insertions(+), 106 deletions(-) delete mode 100644 include/linux/input/auo-pixcir-ts.h diff --git a/drivers/input/touchscreen/auo-pixcir-ts.c b/drivers/input/touchscreen/auo-pixcir-ts.c index c33e63ca61425..a51d66ebff2bd 100644 --- a/drivers/input/touchscreen/auo-pixcir-ts.c +++ b/drivers/input/touchscreen/auo-pixcir-ts.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -69,6 +68,16 @@ #define AUO_PIXCIR_INT_RELEASE (1 << 4) #define AUO_PIXCIR_INT_ENABLE (1 << 3) #define AUO_PIXCIR_INT_POL_HIGH (1 << 2) + +/* + * Interrupt modes: + * periodical: interrupt is asserted periodicaly + * compare coordinates: interrupt is asserted when coordinates change + * indicate touch: interrupt is asserted during touch + */ +#define AUO_PIXCIR_INT_PERIODICAL 0x00 +#define AUO_PIXCIR_INT_COMP_COORD 0x01 +#define AUO_PIXCIR_INT_TOUCH_IND 0x02 #define AUO_PIXCIR_INT_MODE_MASK 0x03 /* @@ -103,10 +112,14 @@ struct auo_pixcir_ts { struct i2c_client *client; struct input_dev *input; - const struct auo_pixcir_ts_platdata *pdata; + int gpio_int; + int gpio_rst; char phys[32]; - /* special handling for touch_indicate interupt mode */ + unsigned int x_max; + unsigned int y_max; + + /* special handling for touch_indicate interrupt mode */ bool touch_ind_mode; wait_queue_head_t wait; @@ -125,7 +138,6 @@ static int auo_pixcir_collect_data(struct auo_pixcir_ts *ts, struct auo_point_t *point) { struct i2c_client *client = ts->client; - const struct auo_pixcir_ts_platdata *pdata = ts->pdata; uint8_t raw_coord[8]; uint8_t raw_area[4]; int 
i, ret; @@ -152,8 +164,8 @@ static int auo_pixcir_collect_data(struct auo_pixcir_ts *ts, point[i].coord_y = raw_coord[4 * i + 3] << 8 | raw_coord[4 * i + 2]; - if (point[i].coord_x > pdata->x_max || - point[i].coord_y > pdata->y_max) { + if (point[i].coord_x > ts->x_max || + point[i].coord_y > ts->y_max) { dev_warn(&client->dev, "coordinates (%d,%d) invalid\n", point[i].coord_x, point[i].coord_y); point[i].coord_x = point[i].coord_y = 0; @@ -171,7 +183,6 @@ static int auo_pixcir_collect_data(struct auo_pixcir_ts *ts, static irqreturn_t auo_pixcir_interrupt(int irq, void *dev_id) { struct auo_pixcir_ts *ts = dev_id; - const struct auo_pixcir_ts_platdata *pdata = ts->pdata; struct auo_point_t point[AUO_PIXCIR_REPORT_POINTS]; int i; int ret; @@ -182,7 +193,7 @@ static irqreturn_t auo_pixcir_interrupt(int irq, void *dev_id) /* check for up event in touch touch_ind_mode */ if (ts->touch_ind_mode) { - if (gpio_get_value(pdata->gpio_int) == 0) { + if (gpio_get_value(ts->gpio_int) == 0) { input_mt_sync(ts->input); input_report_key(ts->input, BTN_TOUCH, 0); input_sync(ts->input); @@ -278,11 +289,9 @@ static int auo_pixcir_power_mode(struct auo_pixcir_ts *ts, int mode) return 0; } -static int auo_pixcir_int_config(struct auo_pixcir_ts *ts, - int int_setting) +static int auo_pixcir_int_config(struct auo_pixcir_ts *ts, int int_setting) { struct i2c_client *client = ts->client; - const struct auo_pixcir_ts_platdata *pdata = ts->pdata; int ret; ret = i2c_smbus_read_byte_data(client, AUO_PIXCIR_REG_INT_SETTING); @@ -304,7 +313,7 @@ static int auo_pixcir_int_config(struct auo_pixcir_ts *ts, return ret; } - ts->touch_ind_mode = pdata->int_setting == AUO_PIXCIR_INT_TOUCH_IND; + ts->touch_ind_mode = int_setting == AUO_PIXCIR_INT_TOUCH_IND; return 0; } @@ -466,49 +475,41 @@ static SIMPLE_DEV_PM_OPS(auo_pixcir_pm_ops, auo_pixcir_suspend, auo_pixcir_resume); #ifdef CONFIG_OF -static struct auo_pixcir_ts_platdata *auo_pixcir_parse_dt(struct device *dev) +static int 
auo_pixcir_parse_dt(struct device *dev, struct auo_pixcir_ts *ts) { - struct auo_pixcir_ts_platdata *pdata; struct device_node *np = dev->of_node; if (!np) - return ERR_PTR(-ENOENT); + return -ENOENT; - pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) - return ERR_PTR(-ENOMEM); - - pdata->gpio_int = of_get_gpio(np, 0); - if (!gpio_is_valid(pdata->gpio_int)) { + ts->gpio_int = of_get_gpio(np, 0); + if (!gpio_is_valid(ts->gpio_int)) { dev_err(dev, "failed to get interrupt gpio\n"); - return ERR_PTR(-EINVAL); + return -EINVAL; } - pdata->gpio_rst = of_get_gpio(np, 1); - if (!gpio_is_valid(pdata->gpio_rst)) { + ts->gpio_rst = of_get_gpio(np, 1); + if (!gpio_is_valid(ts->gpio_rst)) { dev_err(dev, "failed to get reset gpio\n"); - return ERR_PTR(-EINVAL); + return -EINVAL; } - if (of_property_read_u32(np, "x-size", &pdata->x_max)) { + if (of_property_read_u32(np, "x-size", &ts->x_max)) { dev_err(dev, "failed to get x-size property\n"); - return ERR_PTR(-EINVAL); + return -EINVAL; } - if (of_property_read_u32(np, "y-size", &pdata->y_max)) { + if (of_property_read_u32(np, "y-size", &ts->y_max)) { dev_err(dev, "failed to get y-size property\n"); - return ERR_PTR(-EINVAL); + return -EINVAL; } - /* default to asserting the interrupt when the screen is touched */ - pdata->int_setting = AUO_PIXCIR_INT_TOUCH_IND; - - return pdata; + return 0; } #else -static struct auo_pixcir_ts_platdata *auo_pixcir_parse_dt(struct device *dev) +static int auo_pixcir_parse_dt(struct device *dev, struct auo_pixcir_ts *ts) { - return ERR_PTR(-EINVAL); + return -EINVAL; } #endif @@ -516,27 +517,18 @@ static void auo_pixcir_reset(void *data) { struct auo_pixcir_ts *ts = data; - gpio_set_value(ts->pdata->gpio_rst, 0); + gpio_set_value(ts->gpio_rst, 0); } static int auo_pixcir_probe(struct i2c_client *client, const struct i2c_device_id *id) { - const struct auo_pixcir_ts_platdata *pdata; struct auo_pixcir_ts *ts; struct input_dev *input_dev; int version; int error; - pdata = 
dev_get_platdata(&client->dev); - if (!pdata) { - pdata = auo_pixcir_parse_dt(&client->dev); - if (IS_ERR(pdata)) - return PTR_ERR(pdata); - } - - ts = devm_kzalloc(&client->dev, - sizeof(struct auo_pixcir_ts), GFP_KERNEL); + ts = devm_kzalloc(&client->dev, sizeof(*ts), GFP_KERNEL); if (!ts) return -ENOMEM; @@ -546,7 +538,6 @@ static int auo_pixcir_probe(struct i2c_client *client, return -ENOMEM; } - ts->pdata = pdata; ts->client = client; ts->input = input_dev; ts->touch_ind_mode = 0; @@ -556,6 +547,10 @@ static int auo_pixcir_probe(struct i2c_client *client, snprintf(ts->phys, sizeof(ts->phys), "%s/input0", dev_name(&client->dev)); + error = auo_pixcir_parse_dt(&client->dev, ts); + if (error) + return error; + input_dev->name = "AUO-Pixcir touchscreen"; input_dev->phys = ts->phys; input_dev->id.bustype = BUS_I2C; @@ -569,36 +564,34 @@ static int auo_pixcir_probe(struct i2c_client *client, __set_bit(BTN_TOUCH, input_dev->keybit); /* For single touch */ - input_set_abs_params(input_dev, ABS_X, 0, pdata->x_max, 0, 0); - input_set_abs_params(input_dev, ABS_Y, 0, pdata->y_max, 0, 0); + input_set_abs_params(input_dev, ABS_X, 0, ts->x_max, 0, 0); + input_set_abs_params(input_dev, ABS_Y, 0, ts->y_max, 0, 0); /* For multi touch */ - input_set_abs_params(input_dev, ABS_MT_POSITION_X, 0, - pdata->x_max, 0, 0); - input_set_abs_params(input_dev, ABS_MT_POSITION_Y, 0, - pdata->y_max, 0, 0); - input_set_abs_params(input_dev, ABS_MT_TOUCH_MAJOR, 0, - AUO_PIXCIR_MAX_AREA, 0, 0); - input_set_abs_params(input_dev, ABS_MT_TOUCH_MINOR, 0, - AUO_PIXCIR_MAX_AREA, 0, 0); + input_set_abs_params(input_dev, ABS_MT_POSITION_X, 0, ts->x_max, 0, 0); + input_set_abs_params(input_dev, ABS_MT_POSITION_Y, 0, ts->y_max, 0, 0); + input_set_abs_params(input_dev, ABS_MT_TOUCH_MAJOR, + 0, AUO_PIXCIR_MAX_AREA, 0, 0); + input_set_abs_params(input_dev, ABS_MT_TOUCH_MINOR, + 0, AUO_PIXCIR_MAX_AREA, 0, 0); input_set_abs_params(input_dev, ABS_MT_ORIENTATION, 0, 1, 0, 0); input_set_drvdata(ts->input, ts); - 
error = devm_gpio_request_one(&client->dev, pdata->gpio_int, + error = devm_gpio_request_one(&client->dev, ts->gpio_int, GPIOF_DIR_IN, "auo_pixcir_ts_int"); if (error) { dev_err(&client->dev, "request of gpio %d failed, %d\n", - pdata->gpio_int, error); + ts->gpio_int, error); return error; } - error = devm_gpio_request_one(&client->dev, pdata->gpio_rst, + error = devm_gpio_request_one(&client->dev, ts->gpio_rst, GPIOF_DIR_OUT | GPIOF_INIT_HIGH, "auo_pixcir_ts_rst"); if (error) { dev_err(&client->dev, "request of gpio %d failed, %d\n", - pdata->gpio_rst, error); + ts->gpio_rst, error); return error; } @@ -619,7 +612,8 @@ static int auo_pixcir_probe(struct i2c_client *client, dev_info(&client->dev, "firmware version 0x%X\n", version); - error = auo_pixcir_int_config(ts, pdata->int_setting); + /* default to asserting the interrupt when the screen is touched */ + error = auo_pixcir_int_config(ts, AUO_PIXCIR_INT_TOUCH_IND); if (error) return error; diff --git a/include/linux/input/auo-pixcir-ts.h b/include/linux/input/auo-pixcir-ts.h deleted file mode 100644 index ed0776997a7a5..0000000000000 --- a/include/linux/input/auo-pixcir-ts.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Driver for AUO in-cell touchscreens - * - * Copyright (c) 2011 Heiko Stuebner - * - * based on auo_touch.h from Dell Streak kernel - * - * Copyright (c) 2008 QUALCOMM Incorporated. - * Copyright (c) 2008 QUALCOMM USA, INC. 
- */ - -#ifndef __AUO_PIXCIR_TS_H__ -#define __AUO_PIXCIR_TS_H__ - -/* - * Interrupt modes: - * periodical: interrupt is asserted periodicaly - * compare coordinates: interrupt is asserted when coordinates change - * indicate touch: interrupt is asserted during touch - */ -#define AUO_PIXCIR_INT_PERIODICAL 0x00 -#define AUO_PIXCIR_INT_COMP_COORD 0x01 -#define AUO_PIXCIR_INT_TOUCH_IND 0x02 - -/* - * @gpio_int interrupt gpio - * @int_setting one of AUO_PIXCIR_INT_* - * @init_hw hardwarespecific init - * @exit_hw hardwarespecific shutdown - * @x_max x-resolution - * @y_max y-resolution - */ -struct auo_pixcir_ts_platdata { - int gpio_int; - int gpio_rst; - - int int_setting; - - unsigned int x_max; - unsigned int y_max; -}; - -#endif -- GitLab From a750e24a2f2203d5024a905ce6ea8fcd7a9fd8e2 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 14 Sep 2022 07:14:25 -0700 Subject: [PATCH 0336/2223] Input: auo-pixcir-ts - switch to using gpiod API This switches the driver to gpiod API and drops uses of of_get_gpio() API. Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20220914141428.2201784-2-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/auo-pixcir-ts.c | 47 ++++++++++------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/drivers/input/touchscreen/auo-pixcir-ts.c b/drivers/input/touchscreen/auo-pixcir-ts.c index a51d66ebff2bd..c3bce9fb2c94c 100644 --- a/drivers/input/touchscreen/auo-pixcir-ts.c +++ b/drivers/input/touchscreen/auo-pixcir-ts.c @@ -10,6 +10,7 @@ * Copyright (c) 2008 QUALCOMM USA, INC. 
*/ +#include #include #include #include @@ -19,9 +20,8 @@ #include #include #include -#include +#include #include -#include /* * Coordinate calculation: @@ -112,8 +112,8 @@ struct auo_pixcir_ts { struct i2c_client *client; struct input_dev *input; - int gpio_int; - int gpio_rst; + struct gpio_desc *gpio_int; + struct gpio_desc *gpio_rst; char phys[32]; unsigned int x_max; @@ -193,7 +193,7 @@ static irqreturn_t auo_pixcir_interrupt(int irq, void *dev_id) /* check for up event in touch touch_ind_mode */ if (ts->touch_ind_mode) { - if (gpio_get_value(ts->gpio_int) == 0) { + if (gpiod_get_value_cansleep(ts->gpio_int) == 0) { input_mt_sync(ts->input); input_report_key(ts->input, BTN_TOUCH, 0); input_sync(ts->input); @@ -482,18 +482,6 @@ static int auo_pixcir_parse_dt(struct device *dev, struct auo_pixcir_ts *ts) if (!np) return -ENOENT; - ts->gpio_int = of_get_gpio(np, 0); - if (!gpio_is_valid(ts->gpio_int)) { - dev_err(dev, "failed to get interrupt gpio\n"); - return -EINVAL; - } - - ts->gpio_rst = of_get_gpio(np, 1); - if (!gpio_is_valid(ts->gpio_rst)) { - dev_err(dev, "failed to get reset gpio\n"); - return -EINVAL; - } - if (of_property_read_u32(np, "x-size", &ts->x_max)) { dev_err(dev, "failed to get x-size property\n"); return -EINVAL; @@ -517,7 +505,7 @@ static void auo_pixcir_reset(void *data) { struct auo_pixcir_ts *ts = data; - gpio_set_value(ts->gpio_rst, 0); + gpiod_set_value_cansleep(ts->gpio_rst, 1); } static int auo_pixcir_probe(struct i2c_client *client, @@ -578,23 +566,28 @@ static int auo_pixcir_probe(struct i2c_client *client, input_set_drvdata(ts->input, ts); - error = devm_gpio_request_one(&client->dev, ts->gpio_int, - GPIOF_DIR_IN, "auo_pixcir_ts_int"); + ts->gpio_int = devm_gpiod_get_index(&client->dev, NULL, 0, GPIOD_IN); + error = PTR_ERR_OR_ZERO(ts->gpio_int); if (error) { - dev_err(&client->dev, "request of gpio %d failed, %d\n", - ts->gpio_int, error); + dev_err(&client->dev, + "request of int gpio failed: %d\n", error); return error; } - 
error = devm_gpio_request_one(&client->dev, ts->gpio_rst, - GPIOF_DIR_OUT | GPIOF_INIT_HIGH, - "auo_pixcir_ts_rst"); + gpiod_set_consumer_name(ts->gpio_int, "auo_pixcir_ts_int"); + + /* Take the chip out of reset */ + ts->gpio_rst = devm_gpiod_get_index(&client->dev, NULL, 1, + GPIOD_OUT_LOW); + error = PTR_ERR_OR_ZERO(ts->gpio_rst); if (error) { - dev_err(&client->dev, "request of gpio %d failed, %d\n", - ts->gpio_rst, error); + dev_err(&client->dev, + "request of reset gpio failed: %d\n", error); return error; } + gpiod_set_consumer_name(ts->gpio_rst, "auo_pixcir_ts_rst"); + error = devm_add_action_or_reset(&client->dev, auo_pixcir_reset, ts); if (error) { dev_err(&client->dev, "failed to register reset action, %d\n", -- GitLab From 60b7a6d0fdf310f31bc4b9027e3271891b428b0a Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 14 Sep 2022 07:14:26 -0700 Subject: [PATCH 0337/2223] Input: auo-pixcir-ts - do not force rising edge interrupt trigger Instead of hard-coding rising edge as the interrupt trigger, let's rely on the platform (ACPI, DT) to configure the interrupt properly. 
Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20220914141428.2201784-3-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/auo-pixcir-ts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/auo-pixcir-ts.c b/drivers/input/touchscreen/auo-pixcir-ts.c index c3bce9fb2c94c..4960a50f59eab 100644 --- a/drivers/input/touchscreen/auo-pixcir-ts.c +++ b/drivers/input/touchscreen/auo-pixcir-ts.c @@ -612,7 +612,7 @@ static int auo_pixcir_probe(struct i2c_client *client, error = devm_request_threaded_irq(&client->dev, client->irq, NULL, auo_pixcir_interrupt, - IRQF_TRIGGER_RISING | IRQF_ONESHOT, + IRQF_ONESHOT, input_dev->name, ts); if (error) { dev_err(&client->dev, "irq %d requested failed, %d\n", -- GitLab From 770a71b23c29c28ed467af488821b4b144ef6953 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 14 Sep 2022 07:14:27 -0700 Subject: [PATCH 0338/2223] Input: auo-pixcir-ts - switch to using generic device properties Let's use generic device properties API instead of OF-specific one. 
Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20220914141428.2201784-4-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/auo-pixcir-ts.c | 40 ++++++----------------- 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/drivers/input/touchscreen/auo-pixcir-ts.c b/drivers/input/touchscreen/auo-pixcir-ts.c index 4960a50f59eab..2deae5a6823a2 100644 --- a/drivers/input/touchscreen/auo-pixcir-ts.c +++ b/drivers/input/touchscreen/auo-pixcir-ts.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * Coordinate calculation: @@ -474,33 +475,6 @@ unlock: static SIMPLE_DEV_PM_OPS(auo_pixcir_pm_ops, auo_pixcir_suspend, auo_pixcir_resume); -#ifdef CONFIG_OF -static int auo_pixcir_parse_dt(struct device *dev, struct auo_pixcir_ts *ts) -{ - struct device_node *np = dev->of_node; - - if (!np) - return -ENOENT; - - if (of_property_read_u32(np, "x-size", &ts->x_max)) { - dev_err(dev, "failed to get x-size property\n"); - return -EINVAL; - } - - if (of_property_read_u32(np, "y-size", &ts->y_max)) { - dev_err(dev, "failed to get y-size property\n"); - return -EINVAL; - } - - return 0; -} -#else -static int auo_pixcir_parse_dt(struct device *dev, struct auo_pixcir_ts *ts) -{ - return -EINVAL; -} -#endif - static void auo_pixcir_reset(void *data) { struct auo_pixcir_ts *ts = data; @@ -535,9 +509,15 @@ static int auo_pixcir_probe(struct i2c_client *client, snprintf(ts->phys, sizeof(ts->phys), "%s/input0", dev_name(&client->dev)); - error = auo_pixcir_parse_dt(&client->dev, ts); - if (error) - return error; + if (device_property_read_u32(&client->dev, "x-size", &ts->x_max)) { + dev_err(&client->dev, "failed to get x-size property\n"); + return -EINVAL; + } + + if (device_property_read_u32(&client->dev, "y-size", &ts->y_max)) { + dev_err(&client->dev, "failed to get y-size property\n"); + return -EINVAL; + } input_dev->name = "AUO-Pixcir touchscreen"; input_dev->phys = ts->phys; -- GitLab From 
437d49b051e8ca80d2ffa8f3fd98ce58755c2758 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 14 Sep 2022 07:14:28 -0700 Subject: [PATCH 0339/2223] dt-bindings: input: auo-pixcir-ts: fix gpio and interrupt properties Add proper interrupt trigger and gpio polarity data to the binding example. Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20220914141428.2201784-5-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- .../devicetree/bindings/input/touchscreen/auo_pixcir_ts.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/input/touchscreen/auo_pixcir_ts.txt b/Documentation/devicetree/bindings/input/touchscreen/auo_pixcir_ts.txt index f40f21c642b96..b8db975e9f778 100644 --- a/Documentation/devicetree/bindings/input/touchscreen/auo_pixcir_ts.txt +++ b/Documentation/devicetree/bindings/input/touchscreen/auo_pixcir_ts.txt @@ -17,10 +17,10 @@ Example: auo_pixcir_ts@5c { compatible = "auo,auo_pixcir_ts"; reg = <0x5c>; - interrupts = <2 0>; + interrupts = <2 IRQ_TYPE_LEVEL_HIGH>; - gpios = <&gpf 2 0 2>, /* INT */ - <&gpf 5 1 0>; /* RST */ + gpios = <&gpf 2 0 GPIO_ACTIVE_HIGH>, /* INT */ + <&gpf 5 1 GPIO_ACTIVE_LOW>; /* RST */ x-size = <800>; y-size = <600>; -- GitLab From b338bde5a3a9c4ccf6c83e0a20c8de3ad281ef02 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Tue, 13 Sep 2022 00:21:09 -0500 Subject: [PATCH 0340/2223] memblock tests: add simulation of physical memory with multiple NUMA nodes Add function setup_numa_memblock() for setting up a memory layout with multiple NUMA nodes in a previously allocated dummy physical memory. This function can be used in place of setup_memblock() in tests that need to simulate a NUMA system. setup_numa_memblock(): - allows for setting up a memory layout by specifying the fraction of MEM_SIZE in each node Set CONFIG_NODES_SHIFT to 4 when building with NUMA=1 to allow for up to 16 NUMA nodes.
Reviewed-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/4566d816a85f009268d4858d1ef06c7571a960f9.1663046060.git.remckee0@gmail.com --- .../testing/memblock/scripts/Makefile.include | 2 +- tools/testing/memblock/tests/common.c | 31 +++++++++++++++++++ tools/testing/memblock/tests/common.h | 4 ++- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/tools/testing/memblock/scripts/Makefile.include b/tools/testing/memblock/scripts/Makefile.include index aa6d82d56a23a..9982817235907 100644 --- a/tools/testing/memblock/scripts/Makefile.include +++ b/tools/testing/memblock/scripts/Makefile.include @@ -3,7 +3,7 @@ # Simulate CONFIG_NUMA=y ifeq ($(NUMA), 1) - CFLAGS += -D CONFIG_NUMA + CFLAGS += -D CONFIG_NUMA -D CONFIG_NODES_SHIFT=4 endif # Use 32 bit physical addresses. diff --git a/tools/testing/memblock/tests/common.c b/tools/testing/memblock/tests/common.c index eec6901081af3..3f795047bbe18 100644 --- a/tools/testing/memblock/tests/common.c +++ b/tools/testing/memblock/tests/common.c @@ -9,6 +9,7 @@ #define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS #define PREFIXES_MAX 15 #define DELIM ": " +#define BASIS 10000 static struct test_memory memory_block; static const char __maybe_unused *prefixes[PREFIXES_MAX]; @@ -72,6 +73,36 @@ void setup_memblock(void) fill_memblock(); } +/** + * setup_numa_memblock: + * Set up a memory layout with multiple NUMA nodes in a previously allocated + * dummy physical memory. + * @node_fracs: an array representing the fraction of MEM_SIZE contained in + * each node in basis point units (one hundredth of 1% or 1/10000). + * For example, if node 0 should contain 1/8 of MEM_SIZE, + * node_fracs[0] = 1250. + * + * The nids will be set to 0 through NUMA_NODES - 1. 
+ */ +void setup_numa_memblock(const unsigned int node_fracs[]) +{ + phys_addr_t base; + int flags; + + reset_memblock_regions(); + base = (phys_addr_t)memory_block.base; + flags = (movable_node_is_enabled()) ? MEMBLOCK_NONE : MEMBLOCK_HOTPLUG; + + for (int i = 0; i < NUMA_NODES; i++) { + assert(node_fracs[i] <= BASIS); + phys_addr_t size = MEM_SIZE * node_fracs[i] / BASIS; + + memblock_add_node(base, size, i, flags); + base += size; + } + fill_memblock(); +} + void dummy_physical_memory_init(void) { memory_block.base = malloc(MEM_SIZE); diff --git a/tools/testing/memblock/tests/common.h b/tools/testing/memblock/tests/common.h index 78128e109a95c..def71648887f5 100644 --- a/tools/testing/memblock/tests/common.h +++ b/tools/testing/memblock/tests/common.h @@ -10,7 +10,8 @@ #include #include <../selftests/kselftest.h> -#define MEM_SIZE SZ_16K +#define MEM_SIZE SZ_16K +#define NUMA_NODES 8 enum test_flags { /* No special request. */ @@ -102,6 +103,7 @@ struct region { void reset_memblock_regions(void); void reset_memblock_attributes(void); void setup_memblock(void); +void setup_numa_memblock(const unsigned int node_fracs[]); void dummy_physical_memory_init(void); void dummy_physical_memory_cleanup(void); void parse_args(int argc, char **argv); -- GitLab From 50c80241f15890a64b9302187faaeb7cfe78b4b8 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Tue, 13 Sep 2022 00:21:10 -0500 Subject: [PATCH 0341/2223] memblock tests: add top-down NUMA tests for memblock_alloc_try_nid* Add tests for memblock_alloc_try_nid() and memblock_alloc_try_nid_raw() where the simulated physical memory is set up with multiple NUMA nodes. Additionally, all of these tests set nid != NUMA_NO_NODE. These tests are run with a top-down allocation direction. 
The tested scenarios are: Range unrestricted: - region can be allocated in the specific node requested: + there are no previously reserved regions + the requested node is partially reserved but has enough space - the specific node requested cannot accommodate the request, but the region can be allocated in a different node: + there are no previously reserved regions, but node is too small + the requested node is fully reserved + the requested node is partially reserved and does not have enough space Range restricted: - region can be allocated in the specific node requested after dropping min_addr: + range partially overlaps with two different nodes, where the first node is the requested node + range partially overlaps with two different nodes, where the requested node ends before min_addr - region cannot be allocated in the specific node requested, but it can be allocated in the requested range: + range overlaps with multiple nodes along node boundaries, and the requested node ends before min_addr + range overlaps with multiple nodes along node boundaries, and the requested node starts after max_addr - region cannot be allocated in the specific node requested, but it can be allocated after dropping min_addr: + range partially overlaps with two different nodes, where the second node is the requested node Acked-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/84009c5b3969337ccf89df850db56d364f8c228b.1663046060.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_nid_api.c | 701 ++++++++++++++++++- tools/testing/memblock/tests/alloc_nid_api.h | 16 + tools/testing/memblock/tests/common.h | 18 + 3 files changed, 724 insertions(+), 11 deletions(-) diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c index db5daa50fa72e..b13fcbcac4579 100644 --- a/tools/testing/memblock/tests/alloc_nid_api.c +++ b/tools/testing/memblock/tests/alloc_nid_api.c @@ 
-3,6 +3,21 @@ static int alloc_nid_test_flags = TEST_F_NONE; +/* + * contains the fraction of MEM_SIZE contained in each node in basis point + * units (one hundredth of 1% or 1/10000) + */ +static const unsigned int node_fractions[] = { + 2500, /* 1/4 */ + 625, /* 1/16 */ + 1250, /* 1/8 */ + 1250, /* 1/8 */ + 625, /* 1/16 */ + 625, /* 1/16 */ + 2500, /* 1/4 */ + 625, /* 1/16 */ +}; + static inline const char * const get_memblock_alloc_try_nid_name(int flags) { if (flags & TEST_F_RAW) @@ -1054,7 +1069,7 @@ static int alloc_try_nid_bottom_up_cap_min_check(void) return 0; } -/* Test case wrappers */ +/* Test case wrappers for range tests */ static int alloc_try_nid_simple_check(void) { test_print("\tRunning %s...\n", __func__); @@ -1186,17 +1201,10 @@ static int alloc_try_nid_low_max_check(void) return 0; } -static int memblock_alloc_nid_checks_internal(int flags) +static int memblock_alloc_nid_range_checks(void) { - const char *func = get_memblock_alloc_try_nid_name(flags); - - alloc_nid_test_flags = flags; - prefix_reset(); - prefix_push(func); - test_print("Running %s tests...\n", func); - - reset_memblock_attributes(); - dummy_physical_memory_init(); + test_print("Running %s range tests...\n", + get_memblock_alloc_try_nid_name(alloc_nid_test_flags)); alloc_try_nid_simple_check(); alloc_try_nid_misaligned_check(); @@ -1213,6 +1221,677 @@ static int memblock_alloc_nid_checks_internal(int flags) alloc_try_nid_reserved_all_check(); alloc_try_nid_low_max_check(); + return 0; +} + +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * has enough memory to allocate a region of the requested size. + * Expect to allocate an aligned region at the end of the requested node. 
+ */ +static int alloc_try_nid_top_down_numa_simple_check(void) +{ + int nid_req = 3; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + void *allocated_ptr = NULL; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + ASSERT_LE(SZ_4, req_node->size); + size = req_node->size / SZ_4; + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, region_end(req_node) - size); + ASSERT_LE(req_node->base, new_rgn->base); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * does not have enough memory to allocate a region of the requested size: + * + * | +-----+ +------------------+ | + * | | req | | expected | | + * +---+-----+----------+------------------+-----+ + * + * | +---------+ | + * | | rgn | | + * +-----------------------------+---------+-----+ + * + * Expect to allocate an aligned region at the end of the last node that has + * enough memory (in this case, nid = 6) after falling back to NUMA_NO_NODE. 
+ */ +static int alloc_try_nid_top_down_numa_small_node_check(void) +{ + int nid_req = 1; + int nid_exp = 6; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + struct memblock_region *exp_node = &memblock.memory.regions[nid_exp]; + void *allocated_ptr = NULL; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + size = SZ_2 * req_node->size; + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, region_end(exp_node) - size); + ASSERT_LE(exp_node->base, new_rgn->base); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * is fully reserved: + * + * | +---------+ +------------------+ | + * | |requested| | expected | | + * +--------------+---------+------------+------------------+-----+ + * + * | +---------+ +---------+ | + * | | reserved| | new | | + * +--------------+---------+---------------------+---------+-----+ + * + * Expect to allocate an aligned region at the end of the last node that is + * large enough and has enough unreserved memory (in this case, nid = 6) after + * falling back to NUMA_NO_NODE. The region count and total size get updated. 
+ */ +static int alloc_try_nid_top_down_numa_node_reserved_check(void) +{ + int nid_req = 2; + int nid_exp = 6; + struct memblock_region *new_rgn = &memblock.reserved.regions[1]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + struct memblock_region *exp_node = &memblock.memory.regions[nid_exp]; + void *allocated_ptr = NULL; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + size = req_node->size; + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + memblock_reserve(req_node->base, req_node->size); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, region_end(exp_node) - size); + ASSERT_LE(exp_node->base, new_rgn->base); + + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, size + req_node->size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * is partially reserved but has enough memory for the allocated region: + * + * | +---------------------------------------+ | + * | | requested | | + * +-----------+---------------------------------------+----------+ + * + * | +------------------+ +-----+ | + * | | reserved | | new | | + * +-----------+------------------+--------------+-----+----------+ + * + * Expect to allocate an aligned region at the end of the requested node. The + * region count and total size get updated. 
+ */ +static int alloc_try_nid_top_down_numa_part_reserved_check(void) +{ + int nid_req = 4; + struct memblock_region *new_rgn = &memblock.reserved.regions[1]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + void *allocated_ptr = NULL; + struct region r1; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + ASSERT_LE(SZ_8, req_node->size); + r1.base = req_node->base; + r1.size = req_node->size / SZ_2; + size = r1.size / SZ_4; + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + memblock_reserve(r1.base, r1.size); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, region_end(req_node) - size); + ASSERT_LE(req_node->base, new_rgn->base); + + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, size + r1.size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * is partially reserved and does not have enough contiguous memory for the + * allocated region: + * + * | +-----------------------+ +----------------------| + * | | requested | | expected | + * +-----------+-----------------------+---------+----------------------+ + * + * | +----------+ +-----------| + * | | reserved | | new | + * +-----------------+----------+---------------------------+-----------+ + * + * Expect to allocate an aligned region at the end of the last node that is + * large enough and has enough unreserved memory (in this case, + * nid = NUMA_NODES - 1) after falling back to NUMA_NO_NODE. The region count + * and total size get updated. 
+ */ +static int alloc_try_nid_top_down_numa_part_reserved_fallback_check(void) +{ + int nid_req = 4; + int nid_exp = NUMA_NODES - 1; + struct memblock_region *new_rgn = &memblock.reserved.regions[1]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + struct memblock_region *exp_node = &memblock.memory.regions[nid_exp]; + void *allocated_ptr = NULL; + struct region r1; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + ASSERT_LE(SZ_4, req_node->size); + size = req_node->size / SZ_2; + r1.base = req_node->base + (size / SZ_2); + r1.size = size; + + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + memblock_reserve(r1.base, r1.size); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, region_end(exp_node) - size); + ASSERT_LE(exp_node->base, new_rgn->base); + + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, size + r1.size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region that spans over the min_addr + * and max_addr range and overlaps with two different nodes, where the first + * node is the requested node: + * + * min_addr + * | max_addr + * | | + * v v + * | +-----------------------+-----------+ | + * | | requested | node3 | | + * +-----------+-----------------------+-----------+--------------+ + * + + + * | +-----------+ | + * | | rgn | | + * +-----------------------+-----------+--------------------------+ + * + * Expect to drop the lower limit and allocate a memory region that ends at + * the end of the requested node. 
+ */ +static int alloc_try_nid_top_down_numa_split_range_low_check(void) +{ + int nid_req = 2; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + void *allocated_ptr = NULL; + phys_addr_t size = SZ_512; + phys_addr_t min_addr; + phys_addr_t max_addr; + phys_addr_t req_node_end; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + req_node_end = region_end(req_node); + min_addr = req_node_end - SZ_256; + max_addr = min_addr + size; + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, req_node_end - size); + ASSERT_LE(req_node->base, new_rgn->base); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region that spans over the min_addr + * and max_addr range and overlaps with two different nodes, where the second + * node is the requested node: + * + * min_addr + * | max_addr + * | | + * v v + * | +--------------------------+---------+ | + * | | expected |requested| | + * +------+--------------------------+---------+----------------+ + * + + + * | +---------+ | + * | | rgn | | + * +-----------------------+---------+--------------------------+ + * + * Expect to drop the lower limit and allocate a memory region that + * ends at the end of the first node that overlaps with the range. 
+ */ +static int alloc_try_nid_top_down_numa_split_range_high_check(void) +{ + int nid_req = 3; + int nid_exp = nid_req - 1; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *exp_node = &memblock.memory.regions[nid_exp]; + void *allocated_ptr = NULL; + phys_addr_t size = SZ_512; + phys_addr_t min_addr; + phys_addr_t max_addr; + phys_addr_t exp_node_end; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + exp_node_end = region_end(exp_node); + min_addr = exp_node_end - SZ_256; + max_addr = min_addr + size; + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, exp_node_end - size); + ASSERT_LE(exp_node->base, new_rgn->base); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region that spans over the min_addr + * and max_addr range and overlaps with two different nodes, where the requested + * node ends before min_addr: + * + * min_addr + * | max_addr + * | | + * v v + * | +---------------+ +-------------+---------+ | + * | | requested | | node1 | node2 | | + * +----+---------------+--------+-------------+---------+----------+ + * + + + * | +---------+ | + * | | rgn | | + * +----------+---------+-------------------------------------------+ + * + * Expect to drop the lower limit and allocate a memory region that ends at + * the end of the requested node. 
+ */ +static int alloc_try_nid_top_down_numa_no_overlap_split_check(void) +{ + int nid_req = 2; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + struct memblock_region *node2 = &memblock.memory.regions[6]; + void *allocated_ptr = NULL; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + size = SZ_512; + min_addr = node2->base - SZ_256; + max_addr = min_addr + size; + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, region_end(req_node) - size); + ASSERT_LE(req_node->base, new_rgn->base); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate memory within min_addr and max_addr range when + * the requested node and the range do not overlap, and requested node ends + * before min_addr. The range overlaps with multiple nodes along node + * boundaries: + * + * min_addr + * | max_addr + * | | + * v v + * |-----------+ +----------+----...----+----------+ | + * | requested | | min node | ... | max node | | + * +-----------+-----------+----------+----...----+----------+------+ + * + + + * | +-----+ | + * | | rgn | | + * +---------------------------------------------------+-----+------+ + * + * Expect to allocate a memory region at the end of the final node in + * the range after falling back to NUMA_NO_NODE.
+ */ +static int alloc_try_nid_top_down_numa_no_overlap_low_check(void) +{ + int nid_req = 0; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *min_node = &memblock.memory.regions[2]; + struct memblock_region *max_node = &memblock.memory.regions[5]; + void *allocated_ptr = NULL; + phys_addr_t size = SZ_64; + phys_addr_t max_addr; + phys_addr_t min_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + min_addr = min_node->base; + max_addr = region_end(max_node); + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, max_addr - size); + ASSERT_LE(max_node->base, new_rgn->base); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate memory within min_addr and max_addr range when + * the requested node and the range do not overlap, and requested node starts + * after max_addr. The range overlaps with multiple nodes along node + * boundaries: + * + * min_addr + * | max_addr + * | | + * v v + * | +----------+----...----+----------+ +-----------+ | + * | | min node | ... | max node | | requested | | + * +-----+----------+----...----+----------+--------+-----------+---+ + * + + + * | +-----+ | + * | | rgn | | + * +---------------------------------+-----+------------------------+ + * + * Expect to allocate a memory region at the end of the final node in + * the range after falling back to NUMA_NO_NODE.
+ */ +static int alloc_try_nid_top_down_numa_no_overlap_high_check(void) +{ + int nid_req = 7; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *min_node = &memblock.memory.regions[2]; + struct memblock_region *max_node = &memblock.memory.regions[5]; + void *allocated_ptr = NULL; + phys_addr_t size = SZ_64; + phys_addr_t max_addr; + phys_addr_t min_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + min_addr = min_node->base; + max_addr = region_end(max_node); + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, max_addr - size); + ASSERT_LE(max_node->base, new_rgn->base); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* Test case wrappers for NUMA tests */ +static int alloc_try_nid_numa_simple_check(void) +{ + test_print("\tRunning %s...\n", __func__); + memblock_set_bottom_up(false); + alloc_try_nid_top_down_numa_simple_check(); + + return 0; +} + +static int alloc_try_nid_numa_small_node_check(void) +{ + test_print("\tRunning %s...\n", __func__); + memblock_set_bottom_up(false); + alloc_try_nid_top_down_numa_small_node_check(); + + return 0; +} + +static int alloc_try_nid_numa_node_reserved_check(void) +{ + test_print("\tRunning %s...\n", __func__); + memblock_set_bottom_up(false); + alloc_try_nid_top_down_numa_node_reserved_check(); + + return 0; +} + +static int alloc_try_nid_numa_part_reserved_check(void) +{ + test_print("\tRunning %s...\n", __func__); + memblock_set_bottom_up(false); + alloc_try_nid_top_down_numa_part_reserved_check(); + + return 0; +} + +static int alloc_try_nid_numa_part_reserved_fallback_check(void) +{ + test_print("\tRunning %s...\n", __func__); + memblock_set_bottom_up(false); + 
alloc_try_nid_top_down_numa_part_reserved_fallback_check(); + + return 0; +} + +static int alloc_try_nid_numa_split_range_low_check(void) +{ + test_print("\tRunning %s...\n", __func__); + memblock_set_bottom_up(false); + alloc_try_nid_top_down_numa_split_range_low_check(); + + return 0; +} + +static int alloc_try_nid_numa_split_range_high_check(void) +{ + test_print("\tRunning %s...\n", __func__); + memblock_set_bottom_up(false); + alloc_try_nid_top_down_numa_split_range_high_check(); + + return 0; +} + +static int alloc_try_nid_numa_no_overlap_split_check(void) +{ + test_print("\tRunning %s...\n", __func__); + memblock_set_bottom_up(false); + alloc_try_nid_top_down_numa_no_overlap_split_check(); + + return 0; +} + +static int alloc_try_nid_numa_no_overlap_low_check(void) +{ + test_print("\tRunning %s...\n", __func__); + memblock_set_bottom_up(false); + alloc_try_nid_top_down_numa_no_overlap_low_check(); + + return 0; +} + +static int alloc_try_nid_numa_no_overlap_high_check(void) +{ + test_print("\tRunning %s...\n", __func__); + memblock_set_bottom_up(false); + alloc_try_nid_top_down_numa_no_overlap_high_check(); + + return 0; +} + +int __memblock_alloc_nid_numa_checks(void) +{ + test_print("Running %s NUMA tests...\n", + get_memblock_alloc_try_nid_name(alloc_nid_test_flags)); + + alloc_try_nid_numa_simple_check(); + alloc_try_nid_numa_small_node_check(); + alloc_try_nid_numa_node_reserved_check(); + alloc_try_nid_numa_part_reserved_check(); + alloc_try_nid_numa_part_reserved_fallback_check(); + alloc_try_nid_numa_split_range_low_check(); + alloc_try_nid_numa_split_range_high_check(); + + alloc_try_nid_numa_no_overlap_split_check(); + alloc_try_nid_numa_no_overlap_low_check(); + alloc_try_nid_numa_no_overlap_high_check(); + + return 0; +} + +static int memblock_alloc_nid_checks_internal(int flags) +{ + alloc_nid_test_flags = flags; + + prefix_reset(); + prefix_push(get_memblock_alloc_try_nid_name(flags)); + + reset_memblock_attributes(); + 
dummy_physical_memory_init(); + + memblock_alloc_nid_range_checks(); + memblock_alloc_nid_numa_checks(); + dummy_physical_memory_cleanup(); prefix_pop(); diff --git a/tools/testing/memblock/tests/alloc_nid_api.h b/tools/testing/memblock/tests/alloc_nid_api.h index b35cf3c3f4898..92d07d230e18f 100644 --- a/tools/testing/memblock/tests/alloc_nid_api.h +++ b/tools/testing/memblock/tests/alloc_nid_api.h @@ -5,5 +5,21 @@ #include "common.h" int memblock_alloc_nid_checks(void); +int __memblock_alloc_nid_numa_checks(void); + +#ifdef CONFIG_NUMA +static inline int memblock_alloc_nid_numa_checks(void) +{ + __memblock_alloc_nid_numa_checks(); + return 0; +} + +#else +static inline int memblock_alloc_nid_numa_checks(void) +{ + return 0; +} + +#endif /* CONFIG_NUMA */ #endif diff --git a/tools/testing/memblock/tests/common.h b/tools/testing/memblock/tests/common.h index def71648887f5..d6bbbe63bfc36 100644 --- a/tools/testing/memblock/tests/common.h +++ b/tools/testing/memblock/tests/common.h @@ -59,6 +59,19 @@ enum test_flags { assert((_expected) < (_seen)); \ } while (0) +/** + * ASSERT_LE(): + * Check the condition + * @_expected <= @_seen + * If false, print failed test message (if running with --verbose) and then + * assert. + */ +#define ASSERT_LE(_expected, _seen) do { \ + if ((_expected) > (_seen)) \ + test_fail(); \ + assert((_expected) <= (_seen)); \ +} while (0) + /** * ASSERT_MEM_EQ(): * Check that the first @_size bytes of @_seen are all equal to @_expected. 
@@ -100,6 +113,11 @@ struct region { phys_addr_t size; }; +static inline phys_addr_t __maybe_unused region_end(struct memblock_region *rgn) +{ + return rgn->base + rgn->size; +} + void reset_memblock_regions(void); void reset_memblock_attributes(void); void setup_memblock(void); -- GitLab From 4b41046e7c6bd999c1519a8bf2771573bcecf52b Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Tue, 13 Sep 2022 00:21:11 -0500 Subject: [PATCH 0342/2223] memblock tests: add bottom-up NUMA tests for memblock_alloc_try_nid* Add tests for memblock_alloc_try_nid() and memblock_alloc_try_nid_raw() where the simulated physical memory is set up with multiple NUMA nodes. Additionally, all of these tests set nid != NUMA_NO_NODE. These tests are run with a bottom-up allocation direction. The tested scenarios are: Range unrestricted: - region can be allocated in the specific node requested: + there are no previously reserved regions + the requested node is partially reserved but has enough space - the specific node requested cannot accommodate the request, but the region can be allocated in a different node: + there are no previously reserved regions, but node is too small + the requested node is fully reserved + the requested node is partially reserved and does not have enough space Range restricted: - region can be allocated in the specific node requested after dropping min_addr: + range partially overlaps with two different nodes, where the first node is the requested node + range partially overlaps with two different nodes, where the requested node ends before min_addr - region cannot be allocated in the specific node requested, but it can be allocated in the requested range: + range overlaps with multiple nodes along node boundaries, and the requested node ends before min_addr + range overlaps with multiple nodes along node boundaries, and the requested node starts after max_addr - region cannot be allocated in the specific node requested, but it can be allocated after dropping 
min_addr: + range partially overlaps with two different nodes, where the second node is the requested node Acked-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/00c4810daaf5d050abc71915b24ed7419bb16b51.1663046060.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_nid_api.c | 568 +++++++++++++++++++ 1 file changed, 568 insertions(+) diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c index b13fcbcac4579..7247fa145d7d8 100644 --- a/tools/testing/memblock/tests/alloc_nid_api.c +++ b/tools/testing/memblock/tests/alloc_nid_api.c @@ -1768,12 +1768,562 @@ static int alloc_try_nid_top_down_numa_no_overlap_high_check(void) return 0; } +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * has enough memory to allocate a region of the requested size. + * Expect to allocate an aligned region at the beginning of the requested node. + */ +static int alloc_try_nid_bottom_up_numa_simple_check(void) +{ + int nid_req = 3; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + void *allocated_ptr = NULL; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + ASSERT_LE(SZ_4, req_node->size); + size = req_node->size / SZ_4; + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, req_node->base); + ASSERT_LE(region_end(new_rgn), region_end(req_node)); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * 
A test that tries to allocate a memory region in a specific NUMA node that + * does not have enough memory to allocate a region of the requested size: + * + * |----------------------+-----+ | + * | expected | req | | + * +----------------------+-----+----------------+ + * + * |---------+ | + * | rgn | | + * +---------+-----------------------------------+ + * + * Expect to allocate an aligned region at the beginning of the first node that + * has enough memory (in this case, nid = 0) after falling back to NUMA_NO_NODE. + */ +static int alloc_try_nid_bottom_up_numa_small_node_check(void) +{ + int nid_req = 1; + int nid_exp = 0; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + struct memblock_region *exp_node = &memblock.memory.regions[nid_exp]; + void *allocated_ptr = NULL; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + size = SZ_2 * req_node->size; + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, exp_node->base); + ASSERT_LE(region_end(new_rgn), region_end(exp_node)); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * is fully reserved: + * + * |----------------------+ +-----------+ | + * | expected | | requested | | + * +----------------------+-----+-----------+--------------------+ + * + * |-----------+ +-----------+ | + * | new | | reserved | | + * +-----------+----------------+-----------+--------------------+ + * + * Expect to allocate an 
aligned region at the beginning of the first node that + * is large enough and has enough unreserved memory (in this case, nid = 0) + * after falling back to NUMA_NO_NODE. The region count and total size get + * updated. + */ +static int alloc_try_nid_bottom_up_numa_node_reserved_check(void) +{ + int nid_req = 2; + int nid_exp = 0; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + struct memblock_region *exp_node = &memblock.memory.regions[nid_exp]; + void *allocated_ptr = NULL; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + size = req_node->size; + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + memblock_reserve(req_node->base, req_node->size); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, exp_node->base); + ASSERT_LE(region_end(new_rgn), region_end(exp_node)); + + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, size + req_node->size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * is partially reserved but has enough memory for the allocated region: + * + * | +---------------------------------------+ | + * | | requested | | + * +-----------+---------------------------------------+---------+ + * + * | +------------------+-----+ | + * | | reserved | new | | + * +-----------+------------------+-----+------------------------+ + * + * Expect to allocate an aligned region in the requested node that merges with + * the existing reserved region. The total size gets updated. 
+ */ +static int alloc_try_nid_bottom_up_numa_part_reserved_check(void) +{ + int nid_req = 4; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + void *allocated_ptr = NULL; + struct region r1; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + phys_addr_t total_size; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + ASSERT_LE(SZ_8, req_node->size); + r1.base = req_node->base; + r1.size = req_node->size / SZ_2; + size = r1.size / SZ_4; + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + total_size = size + r1.size; + + memblock_reserve(r1.base, r1.size); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, total_size); + ASSERT_EQ(new_rgn->base, req_node->base); + ASSERT_LE(region_end(new_rgn), region_end(req_node)); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * is partially reserved and does not have enough contiguous memory for the + * allocated region: + * + * |----------------------+ +-----------------------+ | + * | expected | | requested | | + * +----------------------+-------+-----------------------+---------+ + * + * |-----------+ +----------+ | + * | new | | reserved | | + * +-----------+------------------------+----------+----------------+ + * + * Expect to allocate an aligned region at the beginning of the first + * node that is large enough and has enough unreserved memory (in this case, + * nid = 0) after falling back to NUMA_NO_NODE. The region count and total size + * get updated. 
+ */ +static int alloc_try_nid_bottom_up_numa_part_reserved_fallback_check(void) +{ + int nid_req = 4; + int nid_exp = 0; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + struct memblock_region *exp_node = &memblock.memory.regions[nid_exp]; + void *allocated_ptr = NULL; + struct region r1; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + ASSERT_LE(SZ_4, req_node->size); + size = req_node->size / SZ_2; + r1.base = req_node->base + (size / SZ_2); + r1.size = size; + + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + memblock_reserve(r1.base, r1.size); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, exp_node->base); + ASSERT_LE(region_end(new_rgn), region_end(exp_node)); + + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, size + r1.size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region that spans over the min_addr + * and max_addr range and overlaps with two different nodes, where the first + * node is the requested node: + * + * min_addr + * | max_addr + * | | + * v v + * | +-----------------------+-----------+ | + * | | requested | node3 | | + * +-----------+-----------------------+-----------+--------------+ + * + + + * | +-----------+ | + * | | rgn | | + * +-----------+-----------+--------------------------------------+ + * + * Expect to drop the lower limit and allocate a memory region at the beginning + * of the requested node. 
+ */ +static int alloc_try_nid_bottom_up_numa_split_range_low_check(void) +{ + int nid_req = 2; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + void *allocated_ptr = NULL; + phys_addr_t size = SZ_512; + phys_addr_t min_addr; + phys_addr_t max_addr; + phys_addr_t req_node_end; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + req_node_end = region_end(req_node); + min_addr = req_node_end - SZ_256; + max_addr = min_addr + size; + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, req_node->base); + ASSERT_LE(region_end(new_rgn), req_node_end); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region that spans over the min_addr + * and max_addr range and overlaps with two different nodes, where the second + * node is the requested node: + * + * min_addr + * | max_addr + * | | + * v v + * |------------------+ +----------------------+---------+ | + * | expected | | previous |requested| | + * +------------------+--------+----------------------+---------+------+ + * + + + * |---------+ | + * | rgn | | + * +---------+---------------------------------------------------------+ + * + * Expect to drop the lower limit and allocate a memory region at the beginning + * of the first node that has enough memory. 
+ */ +static int alloc_try_nid_bottom_up_numa_split_range_high_check(void) +{ + int nid_req = 3; + int nid_exp = 0; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + struct memblock_region *exp_node = &memblock.memory.regions[nid_exp]; + void *allocated_ptr = NULL; + phys_addr_t size = SZ_512; + phys_addr_t min_addr; + phys_addr_t max_addr; + phys_addr_t exp_node_end; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + exp_node_end = region_end(exp_node); + min_addr = req_node->base - SZ_256; + max_addr = min_addr + size; + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, exp_node->base); + ASSERT_LE(region_end(new_rgn), exp_node_end); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region that spans over the min_addr + * and max_addr range and overlaps with two different nodes, where the requested + * node ends before min_addr: + * + * min_addr + * | max_addr + * | | + * v v + * | +---------------+ +-------------+---------+ | + * | | requested | | node1 | node2 | | + * +----+---------------+--------+-------------+---------+---------+ + * + + + * | +---------+ | + * | | rgn | | + * +----+---------+------------------------------------------------+ + * + * Expect to drop the lower limit and allocate a memory region that starts at + * the beginning of the requested node.
+ */ +static int alloc_try_nid_bottom_up_numa_no_overlap_split_check(void) +{ + int nid_req = 2; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + struct memblock_region *node2 = &memblock.memory.regions[6]; + void *allocated_ptr = NULL; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + size = SZ_512; + min_addr = node2->base - SZ_256; + max_addr = min_addr + size; + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, req_node->base); + ASSERT_LE(region_end(new_rgn), region_end(req_node)); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate memory within min_addr and max_addr range when + * the requested node and the range do not overlap, and requested node ends + * before min_addr. The range overlaps with multiple nodes along node + * boundaries: + * + * min_addr + * | max_addr + * | | + * v v + * |-----------+ +----------+----...----+----------+ | + * | requested | | min node | ... | max node | | + * +-----------+-----------+----------+----...----+----------+------+ + * + + + * | +-----+ | + * | | rgn | | + * +-----------------------+-----+----------------------------------+ + * + * Expect to allocate a memory region at the beginning of the first node + * in the range after falling back to NUMA_NO_NODE.
+ */ +static int alloc_try_nid_bottom_up_numa_no_overlap_low_check(void) +{ + int nid_req = 0; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *min_node = &memblock.memory.regions[2]; + struct memblock_region *max_node = &memblock.memory.regions[5]; + void *allocated_ptr = NULL; + phys_addr_t size = SZ_64; + phys_addr_t max_addr; + phys_addr_t min_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + min_addr = min_node->base; + max_addr = region_end(max_node); + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, min_addr); + ASSERT_LE(region_end(new_rgn), region_end(min_node)); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate memory within min_addr and max_addr range when + * the requested node and the range do not overlap, and requested node starts + * after max_addr. The range overlaps with multiple nodes along node + * boundaries: + * + * min_addr + * | max_addr + * | | + * v v + * | +----------+----...----+----------+ +---------+ | + * | | min node | ... | max node | |requested| | + * +-----+----------+----...----+----------+---------+---------+---+ + * + + + * | +-----+ | + * | | rgn | | + * +-----+-----+---------------------------------------------------+ + * + * Expect to allocate a memory region at the beginning of the first node + * in the range after falling back to NUMA_NO_NODE.
+ */ +static int alloc_try_nid_bottom_up_numa_no_overlap_high_check(void) +{ + int nid_req = 7; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *min_node = &memblock.memory.regions[2]; + struct memblock_region *max_node = &memblock.memory.regions[5]; + void *allocated_ptr = NULL; + phys_addr_t size = SZ_64; + phys_addr_t max_addr; + phys_addr_t min_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + min_addr = min_node->base; + max_addr = region_end(max_node); + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, min_addr); + ASSERT_LE(region_end(new_rgn), region_end(min_node)); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + /* Test case wrappers for NUMA tests */ static int alloc_try_nid_numa_simple_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_top_down_numa_simple_check(); + memblock_set_bottom_up(true); + alloc_try_nid_bottom_up_numa_simple_check(); return 0; } @@ -1783,6 +2333,8 @@ static int alloc_try_nid_numa_small_node_check(void) test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_top_down_numa_small_node_check(); + memblock_set_bottom_up(true); + alloc_try_nid_bottom_up_numa_small_node_check(); return 0; } @@ -1792,6 +2344,8 @@ static int alloc_try_nid_numa_node_reserved_check(void) test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_top_down_numa_node_reserved_check(); + memblock_set_bottom_up(true); + alloc_try_nid_bottom_up_numa_node_reserved_check(); return 0; } @@ -1801,6 +2355,8 @@ static int alloc_try_nid_numa_part_reserved_check(void) test_print("\tRunning %s...\n", 
__func__); memblock_set_bottom_up(false); alloc_try_nid_top_down_numa_part_reserved_check(); + memblock_set_bottom_up(true); + alloc_try_nid_bottom_up_numa_part_reserved_check(); return 0; } @@ -1810,6 +2366,8 @@ static int alloc_try_nid_numa_part_reserved_fallback_check(void) test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_top_down_numa_part_reserved_fallback_check(); + memblock_set_bottom_up(true); + alloc_try_nid_bottom_up_numa_part_reserved_fallback_check(); return 0; } @@ -1819,6 +2377,8 @@ static int alloc_try_nid_numa_split_range_low_check(void) test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_top_down_numa_split_range_low_check(); + memblock_set_bottom_up(true); + alloc_try_nid_bottom_up_numa_split_range_low_check(); return 0; } @@ -1828,6 +2388,8 @@ static int alloc_try_nid_numa_split_range_high_check(void) test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_top_down_numa_split_range_high_check(); + memblock_set_bottom_up(true); + alloc_try_nid_bottom_up_numa_split_range_high_check(); return 0; } @@ -1837,6 +2399,8 @@ static int alloc_try_nid_numa_no_overlap_split_check(void) test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_top_down_numa_no_overlap_split_check(); + memblock_set_bottom_up(true); + alloc_try_nid_bottom_up_numa_no_overlap_split_check(); return 0; } @@ -1846,6 +2410,8 @@ static int alloc_try_nid_numa_no_overlap_low_check(void) test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_top_down_numa_no_overlap_low_check(); + memblock_set_bottom_up(true); + alloc_try_nid_bottom_up_numa_no_overlap_low_check(); return 0; } @@ -1855,6 +2421,8 @@ static int alloc_try_nid_numa_no_overlap_high_check(void) test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_top_down_numa_no_overlap_high_check(); + memblock_set_bottom_up(true); + 
alloc_try_nid_bottom_up_numa_no_overlap_high_check(); return 0; } -- GitLab From 3e4519b7afc2f9d99f9303468ee0b23f88399c8d Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Tue, 13 Sep 2022 00:21:12 -0500 Subject: [PATCH 0343/2223] memblock tests: add generic NUMA tests for memblock_alloc_try_nid* Add tests for memblock_alloc_try_nid() and memblock_alloc_try_nid_raw() where the simulated physical memory is set up with multiple NUMA nodes. Additionally, two of these tests set nid != NUMA_NO_NODE. All tests are run for both top-down and bottom-up allocation directions. The tested scenarios are: Range unrestricted: - region cannot be allocated: + none of the nodes have enough memory to allocate the region Range restricted: - region can be allocated in the specific node requested without dropping min_addr: + the range fully overlaps with the node, and there are adjacent reserved regions - region cannot be allocated: + nid is set to NUMA_NO_NODE and the total range can fit the region, but the range is split between two nodes and everything else is reserved Acked-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/4b2c7e6e5f3a9837939e99293c77e0e6fc3ae4f9.1663046060.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_nid_api.c | 197 +++++++++++++++++++ 1 file changed, 197 insertions(+) diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c index 7247fa145d7d8..2c2d60f4e3e3c 100644 --- a/tools/testing/memblock/tests/alloc_nid_api.c +++ b/tools/testing/memblock/tests/alloc_nid_api.c @@ -2316,6 +2316,173 @@ static int alloc_try_nid_bottom_up_numa_no_overlap_high_check(void) return 0; } +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * does not have enough memory to allocate a region of the requested size. 
+ * Additionally, none of the nodes have enough memory to allocate the region: + * + * +-----------------------------------+ + * | new | + * +-----------------------------------+ + * |-------+-------+-------+-------+-------+-------+-------+-------| + * | node0 | node1 | node2 | node3 | node4 | node5 | node6 | node7 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Expect no allocation to happen. + */ +static int alloc_try_nid_numa_large_region_generic_check(void) +{ + int nid_req = 3; + void *allocated_ptr = NULL; + phys_addr_t size = MEM_SIZE / SZ_2; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + ASSERT_EQ(allocated_ptr, NULL); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate memory within min_addr and max_addr range when + * there are two reserved regions at the borders. The requested node starts at + * min_addr and ends at max_addr and is the same size as the region to be + * allocated: + * + * min_addr + * | max_addr + * | | + * v v + * | +-----------+-----------------------+-----------------------| + * | | node5 | requested | node7 | + * +------+-----------+-----------------------+-----------------------+ + * + + + * | +----+-----------------------+----+ | + * | | r2 | new | r1 | | + * +-------------+----+-----------------------+----+------------------+ + * + * Expect to merge all of the regions into one. The region counter and total + * size fields get updated. 
+ */ +static int alloc_try_nid_numa_reserved_full_merge_generic_check(void) +{ + int nid_req = 6; + int nid_next = nid_req + 1; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + struct memblock_region *next_node = &memblock.memory.regions[nid_next]; + void *allocated_ptr = NULL; + struct region r1, r2; + phys_addr_t size = req_node->size; + phys_addr_t total_size; + phys_addr_t max_addr; + phys_addr_t min_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + r1.base = next_node->base; + r1.size = SZ_128; + + r2.size = SZ_128; + r2.base = r1.base - (size + r2.size); + + total_size = r1.size + r2.size + size; + min_addr = r2.base + r2.size; + max_addr = r1.base; + + memblock_reserve(r1.base, r1.size); + memblock_reserve(r2.base, r2.size); + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); + + ASSERT_NE(allocated_ptr, NULL); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); + + ASSERT_EQ(new_rgn->size, total_size); + ASSERT_EQ(new_rgn->base, r2.base); + + ASSERT_LE(new_rgn->base, req_node->base); + ASSERT_LE(region_end(req_node), region_end(new_rgn)); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate memory within min_addr and max_addr range, + * where the total range can fit the region, but it is split between two nodes + * and everything else is reserved.
Additionally, nid is set to NUMA_NO_NODE + * instead of requesting a specific node: + * + * +-----------+ + * | new | + * +-----------+ + * | +---------------------+-----------| + * | | prev node | next node | + * +------+---------------------+-----------+ + * + + + * |----------------------+ +-----| + * | r1 | | r2 | + * +----------------------+-----------+-----+ + * ^ ^ + * | | + * | max_addr + * | + * min_addr + * + * Expect no allocation to happen. + */ +static int alloc_try_nid_numa_split_all_reserved_generic_check(void) +{ + void *allocated_ptr = NULL; + struct memblock_region *next_node = &memblock.memory.regions[7]; + struct region r1, r2; + phys_addr_t size = SZ_256; + phys_addr_t max_addr; + phys_addr_t min_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + r2.base = next_node->base + SZ_128; + r2.size = memblock_end_of_DRAM() - r2.base; + + r1.size = MEM_SIZE - (r2.size + size); + r1.base = memblock_start_of_DRAM(); + + min_addr = r1.base + r1.size; + max_addr = r2.base; + + memblock_reserve(r1.base, r1.size); + memblock_reserve(r2.base, r2.size); + + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); + + ASSERT_EQ(allocated_ptr, NULL); + + test_pass_pop(); + + return 0; +} + /* Test case wrappers for NUMA tests */ static int alloc_try_nid_numa_simple_check(void) { @@ -2427,6 +2594,33 @@ static int alloc_try_nid_numa_no_overlap_high_check(void) return 0; } +static int alloc_try_nid_numa_large_region_check(void) +{ + test_print("\tRunning %s...\n", __func__); + run_top_down(alloc_try_nid_numa_large_region_generic_check); + run_bottom_up(alloc_try_nid_numa_large_region_generic_check); + + return 0; +} + +static int alloc_try_nid_numa_reserved_full_merge_check(void) +{ + test_print("\tRunning %s...\n", __func__); + run_top_down(alloc_try_nid_numa_reserved_full_merge_generic_check); + run_bottom_up(alloc_try_nid_numa_reserved_full_merge_generic_check); + + return 0; +} + +static int 
alloc_try_nid_numa_split_all_reserved_check(void) +{ + test_print("\tRunning %s...\n", __func__); + run_top_down(alloc_try_nid_numa_split_all_reserved_generic_check); + run_bottom_up(alloc_try_nid_numa_split_all_reserved_generic_check); + + return 0; +} + int __memblock_alloc_nid_numa_checks(void) { test_print("Running %s NUMA tests...\n", @@ -2443,6 +2637,9 @@ int __memblock_alloc_nid_numa_checks(void) alloc_try_nid_numa_no_overlap_split_check(); alloc_try_nid_numa_no_overlap_low_check(); alloc_try_nid_numa_no_overlap_high_check(); + alloc_try_nid_numa_large_region_check(); + alloc_try_nid_numa_reserved_full_merge_check(); + alloc_try_nid_numa_split_all_reserved_check(); return 0; } -- GitLab From 460281cf269b02f2caa88ade79c1e7eed29bfe15 Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Mon, 19 Sep 2022 06:45:14 +1000 Subject: [PATCH 0344/2223] xfs: remove the redundant word in comment Just remove the redundant word "being" in comment. Signed-off-by: Zeng Heng Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode_item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6e19ece916bfb..ca2941ab6cbcd 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -550,7 +550,7 @@ xfs_inode_item_push( if (!bp || (ip->i_flags & XFS_ISTALE)) { /* - * Inode item/buffer is being being aborted due to cluster + * Inode item/buffer is being aborted due to cluster * buffer deletion. Trigger a log force to have that operation * completed and items removed from the AIL before the next push * attempt. -- GitLab From 5617104003ae11a1ab383dbd63228b7645c26207 Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Mon, 19 Sep 2022 06:46:14 +1000 Subject: [PATCH 0345/2223] xfs: remove redundant else for clean code "else" is not generally useful after a return, so remove it for clean code. There is no logical changes. Signed-off-by: Zeng Heng Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 386b0307aed85..f6e7e4fd72ae7 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -226,12 +226,12 @@ xlog_ticket_reservation( if (head == &log->l_write_head) { ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); return tic->t_unit_res; - } else { - if (tic->t_flags & XLOG_TIC_PERM_RESERV) - return tic->t_unit_res * tic->t_cnt; - else - return tic->t_unit_res; } + + if (tic->t_flags & XLOG_TIC_PERM_RESERV) + return tic->t_unit_res * tic->t_cnt; + + return tic->t_unit_res; } STATIC bool -- GitLab From 78b0f58bdfef45aa9f3c7fbbd9b4d41abad6d85f Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Mon, 19 Sep 2022 06:47:14 +1000 Subject: [PATCH 0346/2223] xfs: clean up "%Ld/%Lu" which doesn't meet C standard The "%Ld" specifier, which represents long long unsigned, doesn't meet C language standard, and even more, it is easily mistaken for "%ld", which represents long unsigned. So replace "%Ld" with "%lld". Do the same with "%Lu". Signed-off-by: Zeng Heng Reviewed-by: Darrick J.
Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 2 +- fs/xfs/libxfs/xfs_inode_fork.c | 4 ++-- fs/xfs/xfs_inode.c | 8 ++++---- fs/xfs/xfs_inode_item_recover.c | 4 ++-- fs/xfs/xfs_stats.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index e56723dc9cd5b..49d0d4ea63fcd 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -294,7 +294,7 @@ xfs_check_block( else thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); if (*thispa == *pp) { - xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", + xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld", __func__, j, i, (unsigned long long)be64_to_cpu(*thispa)); xfs_err(mp, "%s: ptrs are equal in node\n", diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 9327a4f392065..6b21760184d9e 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -78,7 +78,7 @@ xfs_iformat_local( */ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %d for local fork, size = %zd).", + "corrupt inode %llu (bad size %d for local fork, size = %zd).", (unsigned long long) ip->i_ino, size, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); xfs_inode_verifier_error(ip, -EFSCORRUPTED, @@ -192,7 +192,7 @@ xfs_iformat_btree( XFS_DFORK_SIZE(dip, mp, whichfork) || ifp->if_nextents > ip->i_nblocks) || level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) { - xfs_warn(mp, "corrupt inode %Lu (btree).", + xfs_warn(mp, "corrupt inode %llu (btree).", (unsigned long long) ip->i_ino); xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_btree", dfp, size, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 28493c8e9bb23..b3eeeae3afe1a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3119,7 +3119,7 @@ xfs_iflush( if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1)) { 
xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT, + "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); goto flush_out; } @@ -3129,7 +3129,7 @@ xfs_iflush( ip->i_df.if_format != XFS_DINODE_FMT_BTREE, mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad regular inode %Lu, ptr "PTR_FMT, + "%s: Bad regular inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } @@ -3140,7 +3140,7 @@ xfs_iflush( ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad directory inode %Lu, ptr "PTR_FMT, + "%s: Bad directory inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } @@ -3158,7 +3158,7 @@ xfs_iflush( if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT, + "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_forkoff, ip); goto flush_out; } diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index d28ffaebd0670..0e5dba2343ea1 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -321,7 +321,7 @@ xlog_recover_inode_commit_pass2( */ if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { xfs_alert(mp, - "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", + "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %lld", __func__, dip, bp, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; @@ -329,7 +329,7 @@ xlog_recover_inode_commit_pass2( ldip = item->ri_buf[1].i_addr; if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld", + "%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld", __func__, item, in_f->ilf_ino); error = -EFSCORRUPTED; goto 
out_release; diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 20e0534a772c9..881720c4cf70e 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -74,7 +74,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) defer_relog += per_cpu_ptr(stats, i)->s.defer_relog; } - len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", + len += scnprintf(buf + len, PATH_MAX-len, "xpc %llu %llu %llu\n", xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n", defer_relog); -- GitLab From 92b40768c1a4e01e776cb13ab5357a8b5c78e965 Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Mon, 19 Sep 2022 06:48:14 +1000 Subject: [PATCH 0347/2223] xfs: replace unnecessary seq_printf with seq_puts Replace seq_printf with seq_puts when const string in reference, which would avoid to deal with unnecessary string format. Signed-off-by: Zeng Heng Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 881720c4cf70e..90a77cd3ebade 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -125,7 +125,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v) { int j; - seq_printf(m, "qm"); + seq_puts(m, "qm"); for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++) seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j)); seq_putc(m, '\n'); -- GitLab From de94a2e151bed6884b4f21aa518a100ac9e83af2 Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Mon, 19 Sep 2022 06:49:14 +1000 Subject: [PATCH 0348/2223] xfs: simplify if-else condition in xfs_validate_new_dalign "else" is not generally useful after a return, so remove them which makes if condition a bit more clear. There is no logical changes. Signed-off-by: Zeng Heng Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_mount.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f10c88cee116e..e8bb3c2e847e1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -300,26 +300,28 @@ xfs_validate_new_dalign( "alignment check failed: sunit/swidth vs. blocksize(%d)", mp->m_sb.sb_blocksize); return -EINVAL; - } else { - /* - * Convert the stripe unit and width to FSBs. - */ - mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); - if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { - xfs_warn(mp, - "alignment check failed: sunit/swidth vs. agsize(%d)", - mp->m_sb.sb_agblocks); - return -EINVAL; - } else if (mp->m_dalign) { - mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); - } else { - xfs_warn(mp, - "alignment check failed: sunit(%d) less than bsize(%d)", - mp->m_dalign, mp->m_sb.sb_blocksize); - return -EINVAL; - } } + /* + * Convert the stripe unit and width to FSBs. + */ + mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); + if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. agsize(%d)", + mp->m_sb.sb_agblocks); + return -EINVAL; + } + + if (!mp->m_dalign) { + xfs_warn(mp, + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, mp->m_sb.sb_blocksize); + return -EINVAL; + } + + mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); + if (!xfs_has_dalign(mp)) { xfs_warn(mp, "cannot change alignment: superblock does not support data alignment"); -- GitLab From a0ebf8c46d64ba96b413784f88af0a4dca95b6bc Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Mon, 19 Sep 2022 06:50:14 +1000 Subject: [PATCH 0349/2223] xfs: simplify if-else condition in xfs_reflink_trim_around_shared "else" is not generally useful after a return, so remove it for clean code. There is no logical changes. Signed-off-by: Zeng Heng Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 251f20ddd3683..93bdd25680bc9 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -200,7 +200,9 @@ xfs_reflink_trim_around_shared( if (fbno == NULLAGBLOCK) { /* No shared blocks at all. */ return 0; - } else if (fbno == agbno) { + } + + if (fbno == agbno) { /* * The start of this extent is shared. Truncate the * mapping at the end of the shared region so that a @@ -210,16 +212,16 @@ xfs_reflink_trim_around_shared( irec->br_blockcount = flen; *shared = true; return 0; - } else { - /* - * There's a shared extent midway through this extent. - * Truncate the mapping at the start of the shared - * extent so that a subsequent iteration starts at the - * start of the shared region. - */ - irec->br_blockcount = fbno - agbno; - return 0; } + + /* + * There's a shared extent midway through this extent. + * Truncate the mapping at the start of the shared + * extent so that a subsequent iteration starts at the + * start of the shared region. + */ + irec->br_blockcount = fbno - agbno; + return 0; } int -- GitLab From 8838dafed5d93b3e8a403e57838a43fb09dd6e61 Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Mon, 19 Sep 2022 06:51:14 +1000 Subject: [PATCH 0350/2223] xfs: missing space in xfs trace log Add space between arguments would help someone to locate the key words they want, so break quoted strings at a space character. Such as below: [Before] kworker/1:0-280 [001] ..... 600.782135: xfs_bunmap: dev 7:0 ino 0x85 disize 0x0 fileoff 0x0 fsbcount 0x400000001fffffflags ATTRFORK ... [After] kworker/1:2-564 [001] ..... 23817.906160: xfs_bunmap: dev 7:0 ino 0x85 disize 0x0 fileoff 0x0 fsbcount 0x400000001fffff flags ATTRFORK ... Signed-off-by: Zeng Heng Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_trace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f9057af6e0c80..cb7c81ba7fa38 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1170,7 +1170,7 @@ DECLARE_EVENT_CLASS(xfs_dqtrx_class, __entry->ino_res_used = qtrx->qt_ino_res_used; __entry->icount_delta = qtrx->qt_icount_delta; ), - TP_printk("dev %d:%d dquot id 0x%x type %s flags %s" + TP_printk("dev %d:%d dquot id 0x%x type %s flags %s " "blk_res %llu bcount_delta %lld delbcnt_delta %lld " "rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld " "ino_res %llu ino_res_used %llu icount_delta %lld", @@ -1602,7 +1602,7 @@ TRACE_EVENT(xfs_bunmap, __entry->caller_ip = caller_ip; __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx" + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx " "flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, -- GitLab From abda5271f8ec6e9a84ae8129ddc59226c89def7a Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Mon, 19 Sep 2022 06:52:14 +1000 Subject: [PATCH 0351/2223] xfs: Remove the unneeded result variable Return the value xfs_dir_cilookup_result() directly instead of storing it in another redundant variable. Reported-by: Zeal Robot Signed-off-by: ye xingchen Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2_sf.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 003812fd7d355..8cd37e6e9d387 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -865,7 +865,6 @@ xfs_dir2_sf_lookup( struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; int i; /* entry index */ - int error; xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ enum xfs_dacmp cmp; /* comparison result */ @@ -929,8 +928,7 @@ xfs_dir2_sf_lookup( if (!ci_sfep) return -ENOENT; /* otherwise process the CI match as required by the caller */ - error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); - return error; + return xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); } /* -- GitLab From b0463b9dd7030a766133ad2f1571f97f204d7bdf Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 19 Sep 2022 06:53:14 +1000 Subject: [PATCH 0352/2223] xfs: remove xfs_setattr_time() declaration xfs_setattr_time() has been removed since commit e014f37db1a2 ("xfs: use setattr_copy to set vfs inode attributes"), so remove it. 
Signed-off-by: Gaosheng Cui Reviewed-by: Carlos Maiolino Signed-off-by: Dave Chinner --- fs/xfs/xfs_iops.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index cb5fc68c9ea00..e570dcb5df8d5 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -13,7 +13,6 @@ extern const struct file_operations xfs_dir_file_operations; extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); -extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); int xfs_vn_setattr_size(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *vap); -- GitLab From 42b7cc11023d0aa19dbf4d60bb3b8f7423d24a24 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 19 Sep 2022 06:54:14 +1000 Subject: [PATCH 0353/2223] xfs: port to vfs{g,u}id_t and associated helpers A while ago we introduced a dedicated vfs{g,u}id_t type in commit 1e5267cd0895 ("mnt_idmapping: add vfs{g,u}id_t"). We already switched over a good part of the VFS. Ultimately we will remove all legacy idmapped mount helpers that operate only on k{g,u}id_t in favor of the new type safe helpers that operate on vfs{g,u}id_t. Signed-off-by: Christian Brauner (Microsoft) Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 5 ++--- fs/xfs/xfs_iops.c | 6 ++++-- fs/xfs/xfs_itable.c | 8 ++++++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b3eeeae3afe1a..c000b74dd2035 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -835,9 +835,8 @@ xfs_init_new_inode( * ID or one of the supplementary group IDs, the S_ISGID bit is cleared * (and only if the irix_sgid_inherit compatibility variable is set). 
*/ - if (irix_sgid_inherit && - (inode->i_mode & S_ISGID) && - !in_group_p(i_gid_into_mnt(mnt_userns, inode))) + if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && + !vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode))) inode->i_mode &= ~S_ISGID; ip->i_disk_size = 0; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 45518b8c613c9..5d670c85dcc23 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -558,6 +558,8 @@ xfs_vn_getattr( struct inode *inode = d_inode(path->dentry); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); trace_xfs_getattr(ip); @@ -568,8 +570,8 @@ xfs_vn_getattr( stat->dev = inode->i_sb->s_dev; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; - stat->uid = i_uid_into_mnt(mnt_userns, inode); - stat->gid = i_gid_into_mnt(mnt_userns, inode); + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 36312b00b1642..a1c2bcf65d376 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -66,6 +66,8 @@ xfs_bulkstat_one_int( struct xfs_bulkstat *buf = bc->buf; xfs_extnum_t nextents; int error = -EINVAL; + vfsuid_t vfsuid; + vfsgid_t vfsgid; if (xfs_internal_inum(mp, ino)) goto out_advance; @@ -81,14 +83,16 @@ xfs_bulkstat_one_int( ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); inode = VFS_I(ip); + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); /* xfs_iget returns the following without needing * further change. 
*/ buf->bs_projectid = ip->i_projid; buf->bs_ino = ino; - buf->bs_uid = from_kuid(sb_userns, i_uid_into_mnt(mnt_userns, inode)); - buf->bs_gid = from_kgid(sb_userns, i_gid_into_mnt(mnt_userns, inode)); + buf->bs_uid = from_kuid(sb_userns, vfsuid_into_kuid(vfsuid)); + buf->bs_gid = from_kgid(sb_userns, vfsgid_into_kgid(vfsgid)); buf->bs_size = ip->i_disk_size; buf->bs_nlink = inode->i_nlink; -- GitLab From dc256418235a8355fbdf83b90048d8704b8d1654 Mon Sep 17 00:00:00 2001 From: Zhiqiang Liu Date: Mon, 19 Sep 2022 06:55:14 +1000 Subject: [PATCH 0354/2223] xfs: do not need to check return value of xlog_kvmalloc() In xfs_attri_log_nameval_alloc(), xlog_kvmalloc() is called to allocate memory, which will always return successfully, so we do not need to check the return value. Reviewed-by: Eric Sandeen Signed-off-by: Zhiqiang Liu Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_attr_item.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 5077a7ad56460..cf5ce607dc051 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -86,8 +86,6 @@ xfs_attri_log_nameval_alloc( */ nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) + name_len + value_len); - if (!nv) - return nv; nv->name.i_addr = nv + 1; nv->name.i_len = name_len; @@ -441,8 +439,6 @@ xfs_attr_create_intent( attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name, args->namelen, args->value, args->valuelen); } - if (!attr->xattri_nameval) - return ERR_PTR(-ENOMEM); attrip = xfs_attri_init(mp, attr->xattri_nameval); xfs_trans_add_item(tp, &attrip->attri_item); @@ -762,8 +758,6 @@ xlog_recover_attri_commit_pass2( nv = xfs_attri_log_nameval_alloc(attr_name, attri_formatp->alfi_name_len, attr_value, attri_formatp->alfi_value_len); - if (!nv) - return -ENOMEM; attrip = xfs_attri_init(mp, nv); error = xfs_attri_copy_format(&item->ri_buf[0], &attrip->attri_format); -- GitLab From e5ec1f9da84324d01b7b8ec3a8bf50e8430b99a7 Mon Sep 17
00:00:00 2001 From: Linus Walleij Date: Sat, 17 Sep 2022 22:30:35 +0200 Subject: [PATCH 0355/2223] pinctrl: nomadik: Dereference gpio_chip properly The irq data passed to irq_chip handlers is the struct gpio_chip and nothing else. We are just lucky that the nomadik chip pointer is first in the struct. Use the proper dereferencing and helpers. Reported-by: Marc Zyngier Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20220917203036.167607-1-linus.walleij@linaro.org Signed-off-by: Linus Walleij --- drivers/pinctrl/nomadik/pinctrl-nomadik.c | 30 +++++++++-------------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/drivers/pinctrl/nomadik/pinctrl-nomadik.c b/drivers/pinctrl/nomadik/pinctrl-nomadik.c index 58c7ac8c7d4d1..54852775d6531 100644 --- a/drivers/pinctrl/nomadik/pinctrl-nomadik.c +++ b/drivers/pinctrl/nomadik/pinctrl-nomadik.c @@ -608,8 +608,8 @@ static int __maybe_unused nmk_prcm_gpiocr_get_mode(struct pinctrl_dev *pctldev, static void nmk_gpio_irq_ack(struct irq_data *d) { - struct gpio_chip *chip = irq_data_get_irq_chip_data(d); - struct nmk_gpio_chip *nmk_chip = gpiochip_get_data(chip); + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct nmk_gpio_chip *nmk_chip = gpiochip_get_data(gc); clk_enable(nmk_chip->clk); writel(BIT(d->hwirq), nmk_chip->addr + NMK_GPIO_IC); @@ -677,13 +677,10 @@ static void __nmk_gpio_set_wake(struct nmk_gpio_chip *nmk_chip, static int nmk_gpio_irq_maskunmask(struct irq_data *d, bool enable) { - struct nmk_gpio_chip *nmk_chip; + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct nmk_gpio_chip *nmk_chip = gpiochip_get_data(gc); unsigned long flags; - nmk_chip = irq_data_get_irq_chip_data(d); - if (!nmk_chip) - return -EINVAL; - clk_enable(nmk_chip->clk); spin_lock_irqsave(&nmk_gpio_slpm_lock, flags); spin_lock(&nmk_chip->lock); @@ -712,13 +709,10 @@ static void nmk_gpio_irq_unmask(struct irq_data *d) static int nmk_gpio_irq_set_wake(struct irq_data *d, unsigned int on) { - struct
nmk_gpio_chip *nmk_chip; + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct nmk_gpio_chip *nmk_chip = gpiochip_get_data(gc); unsigned long flags; - nmk_chip = irq_data_get_irq_chip_data(d); - if (!nmk_chip) - return -EINVAL; - clk_enable(nmk_chip->clk); spin_lock_irqsave(&nmk_gpio_slpm_lock, flags); spin_lock(&nmk_chip->lock); @@ -740,14 +734,12 @@ static int nmk_gpio_irq_set_wake(struct irq_data *d, unsigned int on) static int nmk_gpio_irq_set_type(struct irq_data *d, unsigned int type) { + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct nmk_gpio_chip *nmk_chip = gpiochip_get_data(gc); bool enabled = !irqd_irq_disabled(d); bool wake = irqd_is_wakeup_set(d); - struct nmk_gpio_chip *nmk_chip; unsigned long flags; - nmk_chip = irq_data_get_irq_chip_data(d); - if (!nmk_chip) - return -EINVAL; if (type & IRQ_TYPE_LEVEL_HIGH) return -EINVAL; if (type & IRQ_TYPE_LEVEL_LOW) @@ -784,7 +776,8 @@ static int nmk_gpio_irq_set_type(struct irq_data *d, unsigned int type) static unsigned int nmk_gpio_irq_startup(struct irq_data *d) { - struct nmk_gpio_chip *nmk_chip = irq_data_get_irq_chip_data(d); + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct nmk_gpio_chip *nmk_chip = gpiochip_get_data(gc); clk_enable(nmk_chip->clk); nmk_gpio_irq_unmask(d); @@ -793,7 +786,8 @@ static unsigned int nmk_gpio_irq_startup(struct irq_data *d) static void nmk_gpio_irq_shutdown(struct irq_data *d) { - struct nmk_gpio_chip *nmk_chip = irq_data_get_irq_chip_data(d); + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct nmk_gpio_chip *nmk_chip = gpiochip_get_data(gc); nmk_gpio_irq_mask(d); clk_disable(nmk_chip->clk); -- GitLab From 42da71add478b5a6f82520181a4010a3823bced0 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 17 Sep 2022 22:30:36 +0200 Subject: [PATCH 0356/2223] pinctrl: nomadik: Make gpio irqchip immutable This makes the Nomadik GPIO irqchip immutable. Tested on the Samsung Galaxy SIII mini GT-I8190. 
Cc: Marc Zyngier Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20220917203036.167607-2-linus.walleij@linaro.org Signed-off-by: Linus Walleij --- drivers/pinctrl/nomadik/pinctrl-nomadik.c | 59 ++++++++++++++--------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/drivers/pinctrl/nomadik/pinctrl-nomadik.c b/drivers/pinctrl/nomadik/pinctrl-nomadik.c index 54852775d6531..21e6ad1c57b29 100644 --- a/drivers/pinctrl/nomadik/pinctrl-nomadik.c +++ b/drivers/pinctrl/nomadik/pinctrl-nomadik.c @@ -244,7 +244,6 @@ enum nmk_gpio_slpm { struct nmk_gpio_chip { struct gpio_chip chip; - struct irq_chip irqchip; void __iomem *addr; struct clk *clk; unsigned int bank; @@ -675,10 +674,9 @@ static void __nmk_gpio_set_wake(struct nmk_gpio_chip *nmk_chip, __nmk_gpio_irq_modify(nmk_chip, offset, WAKE, on); } -static int nmk_gpio_irq_maskunmask(struct irq_data *d, bool enable) +static void nmk_gpio_irq_maskunmask(struct nmk_gpio_chip *nmk_chip, + struct irq_data *d, bool enable) { - struct gpio_chip *gc = irq_data_get_irq_chip_data(d); - struct nmk_gpio_chip *nmk_chip = gpiochip_get_data(gc); unsigned long flags; clk_enable(nmk_chip->clk); @@ -693,18 +691,24 @@ static int nmk_gpio_irq_maskunmask(struct irq_data *d, bool enable) spin_unlock(&nmk_chip->lock); spin_unlock_irqrestore(&nmk_gpio_slpm_lock, flags); clk_disable(nmk_chip->clk); - - return 0; } static void nmk_gpio_irq_mask(struct irq_data *d) { - nmk_gpio_irq_maskunmask(d, false); + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct nmk_gpio_chip *nmk_chip = gpiochip_get_data(gc); + + nmk_gpio_irq_maskunmask(nmk_chip, d, false); + gpiochip_disable_irq(gc, irqd_to_hwirq(d)); } static void nmk_gpio_irq_unmask(struct irq_data *d) { - nmk_gpio_irq_maskunmask(d, true); + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct nmk_gpio_chip *nmk_chip = gpiochip_get_data(gc); + + gpiochip_enable_irq(gc, irqd_to_hwirq(d)); + nmk_gpio_irq_maskunmask(nmk_chip, d, true); } static int 
nmk_gpio_irq_set_wake(struct irq_data *d, unsigned int on) @@ -1072,13 +1076,34 @@ static struct nmk_gpio_chip *nmk_gpio_populate_chip(struct device_node *np, return nmk_chip; } +static void nmk_gpio_irq_print_chip(struct irq_data *d, struct seq_file *p) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct nmk_gpio_chip *nmk_chip = gpiochip_get_data(gc); + + seq_printf(p, "nmk%u-%u-%u", nmk_chip->bank, + gc->base, gc->base + gc->ngpio - 1); +} + +static const struct irq_chip nmk_irq_chip = { + .irq_ack = nmk_gpio_irq_ack, + .irq_mask = nmk_gpio_irq_mask, + .irq_unmask = nmk_gpio_irq_unmask, + .irq_set_type = nmk_gpio_irq_set_type, + .irq_set_wake = nmk_gpio_irq_set_wake, + .irq_startup = nmk_gpio_irq_startup, + .irq_shutdown = nmk_gpio_irq_shutdown, + .irq_print_chip = nmk_gpio_irq_print_chip, + .flags = IRQCHIP_MASK_ON_SUSPEND | IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; + static int nmk_gpio_probe(struct platform_device *dev) { struct device_node *np = dev->dev.of_node; struct nmk_gpio_chip *nmk_chip; struct gpio_chip *chip; struct gpio_irq_chip *girq; - struct irq_chip *irqchip; bool supports_sleepmode; int irq; int ret; @@ -1119,22 +1144,8 @@ static int nmk_gpio_probe(struct platform_device *dev) chip->can_sleep = false; chip->owner = THIS_MODULE; - irqchip = &nmk_chip->irqchip; - irqchip->irq_ack = nmk_gpio_irq_ack; - irqchip->irq_mask = nmk_gpio_irq_mask; - irqchip->irq_unmask = nmk_gpio_irq_unmask; - irqchip->irq_set_type = nmk_gpio_irq_set_type; - irqchip->irq_set_wake = nmk_gpio_irq_set_wake; - irqchip->irq_startup = nmk_gpio_irq_startup; - irqchip->irq_shutdown = nmk_gpio_irq_shutdown; - irqchip->flags = IRQCHIP_MASK_ON_SUSPEND; - irqchip->name = kasprintf(GFP_KERNEL, "nmk%u-%u-%u", - dev->id, - chip->base, - chip->base + chip->ngpio - 1); - girq = &chip->irq; - girq->chip = irqchip; + gpio_irq_chip_set_chip(girq, &nmk_irq_chip); girq->parent_handler = nmk_gpio_irq_handler; girq->num_parents = 1; girq->parents = 
devm_kcalloc(&dev->dev, 1, -- GitLab From 1c2eb18ef6739c89d13a9b36d19c68b84ab37625 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 19 Sep 2022 08:54:35 +0200 Subject: [PATCH 0357/2223] pinctrl: nomadik: remove dead code after DB8540 pinctrl removal Commit b6d09f780761 ("pinctrl: nomadik: Drop U8540/9540 support") removes the DB8540 pin controller driver and its config PINCTRL_DB8540. There is some code left-over in the generic nomadik pinctrl driver, i.e., drivers/pinctrl/nomadik/pinctrl-nomadik.{ch}, that is still around for the removed DB8540 pin controller driver. Remove this remaining dead code. This issue was discovered with ./scripts/checkkconfigsymbols.py. Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20220919065435.27747-1-lukas.bulwahn@gmail.com Signed-off-by: Linus Walleij --- drivers/pinctrl/nomadik/pinctrl-nomadik.c | 6 ------ drivers/pinctrl/nomadik/pinctrl-nomadik.h | 14 -------------- 2 files changed, 20 deletions(-) diff --git a/drivers/pinctrl/nomadik/pinctrl-nomadik.c b/drivers/pinctrl/nomadik/pinctrl-nomadik.c index 21e6ad1c57b29..f7d02513d8cc1 100644 --- a/drivers/pinctrl/nomadik/pinctrl-nomadik.c +++ b/drivers/pinctrl/nomadik/pinctrl-nomadik.c @@ -1807,10 +1807,6 @@ static const struct of_device_id nmk_pinctrl_match[] = { .compatible = "stericsson,db8500-pinctrl", .data = (void *)PINCTRL_NMK_DB8500, }, - { - .compatible = "stericsson,db8540-pinctrl", - .data = (void *)PINCTRL_NMK_DB8540, - }, {}, }; @@ -1861,8 +1857,6 @@ static int nmk_pinctrl_probe(struct platform_device *pdev) nmk_pinctrl_stn8815_init(&npct->soc); if (version == PINCTRL_NMK_DB8500) nmk_pinctrl_db8500_init(&npct->soc); - if (version == PINCTRL_NMK_DB8540) - nmk_pinctrl_db8540_init(&npct->soc); /* * Since we depend on the GPIO chips to provide clock and register base diff --git a/drivers/pinctrl/nomadik/pinctrl-nomadik.h b/drivers/pinctrl/nomadik/pinctrl-nomadik.h index 820f07f4db328..84e2977573357 100644 --- a/drivers/pinctrl/nomadik/pinctrl-nomadik.h 
+++ b/drivers/pinctrl/nomadik/pinctrl-nomadik.h @@ -5,7 +5,6 @@ /* Package definitions */ #define PINCTRL_NMK_STN8815 0 #define PINCTRL_NMK_DB8500 1 -#define PINCTRL_NMK_DB8540 2 /* Alternate functions: function C is set in hw by setting both A and B */ #define NMK_GPIO_ALT_GPIO 0 @@ -173,17 +172,4 @@ nmk_pinctrl_db8500_init(const struct nmk_pinctrl_soc_data **soc) #endif -#ifdef CONFIG_PINCTRL_DB8540 - -void nmk_pinctrl_db8540_init(const struct nmk_pinctrl_soc_data **soc); - -#else - -static inline void -nmk_pinctrl_db8540_init(const struct nmk_pinctrl_soc_data **soc) -{ -} - -#endif - #endif /* PINCTRL_PINCTRL_NOMADIK_H */ -- GitLab From 34fbdee086cfcc20fe889d2b83afddfbe2ac3096 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Fri, 16 Sep 2022 18:05:57 -0700 Subject: [PATCH 0358/2223] KVM: arm64: Preserve PSTATE.SS for the guest while single-step is enabled Preserve the PSTATE.SS value for the guest while userspace enables single-step (i.e. while KVM manipulates the PSTATE.SS) for the vCPU. Currently, while userspace enables single-step for the vCPU (with KVM_GUESTDBG_SINGLESTEP), KVM sets PSTATE.SS to 1 on every guest entry, not saving its original value. When userspace disables single-step, KVM doesn't restore the original value for the subsequent guest entry (use the current value instead). Exception return instructions copy PSTATE.SS from SPSR_ELx.SS only in certain cases when single-step is enabled (and set it to 0 in other cases). So, the value matters only when the guest enables single-step (and when the guest's Software step state isn't affected by single-step enabled by userspace, practically), though. Fix this by preserving the original PSTATE.SS value while userspace enables single-step, and restoring the value once it is disabled. This fix modifies the behavior of GET_ONE_REG/SET_ONE_REG for the PSTATE.SS while single-step is enabled by userspace. 
Presently, GET_ONE_REG/SET_ONE_REG gets/sets the current PSTATE.SS value, which KVM will override on the next guest entry (i.e. the value userspace gets/sets is not used for the next guest entry). With this patch, GET_ONE_REG/SET_ONE_REG will get/set the guest's preserved value, which KVM will preserve and try to restore after single-step is disabled. Fixes: 337b99bf7edf ("KVM: arm64: guest debug, add support for single-step") Signed-off-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220917010600.532642-2-reijiw@google.com --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/debug.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e9c9388ccc024..ccf8a144f0096 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -393,6 +393,7 @@ struct kvm_vcpu_arch { */ struct { u32 mdscr_el1; + bool pstate_ss; } guest_debug_preserved; /* vcpu power state */ diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 0b28d7db7c766..1bd2a1aee11ce 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -32,6 +32,10 @@ static DEFINE_PER_CPU(u64, mdcr_el2); * * Guest access to MDSCR_EL1 is trapped by the hypervisor and handled * after we have restored the preserved value to the main context. + * + * When single-step is enabled by userspace, we tweak PSTATE.SS on every + * guest entry. Preserve PSTATE.SS so we can restore the original value + * for the vcpu after the single-step is disabled. 
*/ static void save_guest_debug_regs(struct kvm_vcpu *vcpu) { @@ -41,6 +45,9 @@ static void save_guest_debug_regs(struct kvm_vcpu *vcpu) trace_kvm_arm_set_dreg32("Saved MDSCR_EL1", vcpu->arch.guest_debug_preserved.mdscr_el1); + + vcpu->arch.guest_debug_preserved.pstate_ss = + (*vcpu_cpsr(vcpu) & DBG_SPSR_SS); } static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) @@ -51,6 +58,11 @@ static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) trace_kvm_arm_set_dreg32("Restored MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1)); + + if (vcpu->arch.guest_debug_preserved.pstate_ss) + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; + else + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; } /** -- GitLab From 370531d1e95be57c62fdf065fb04fd8db7ade8f9 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Fri, 16 Sep 2022 18:05:58 -0700 Subject: [PATCH 0359/2223] KVM: arm64: Clear PSTATE.SS when the Software Step state was Active-pending While userspace enables single-step, if the Software Step state at the last guest exit was "Active-pending", clear PSTATE.SS on guest entry to restore the state. Currently, KVM sets PSTATE.SS to 1 on every guest entry while userspace enables single-step for the vCPU (with KVM_GUESTDBG_SINGLESTEP). It means KVM always makes the vCPU's Software Step state "Active-not-pending" on the guest entry, which lets the VCPU perform single-step (then Software Step exception is taken). This could cause extra single-step (without returning to userspace) if the Software Step state at the last guest exit was "Active-pending" (i.e. the last exit was triggered by an asynchronous exception after the single-step is performed, but before the Software Step exception is taken. See "Figure D2-3 Software step state machine" and "D2.12.7 Behavior in the active-pending state" in ARM DDI 0487I.a for more info about this behavior). 
Fix this by clearing PSTATE.SS on guest entry if the Software Step state at the last exit was "Active-pending" so that KVM restores the state (and the exception is taken before further single-step is performed). Fixes: 337b99bf7edf ("KVM: arm64: guest debug, add support for single-step") Signed-off-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220917010600.532642-3-reijiw@google.com --- arch/arm64/include/asm/kvm_host.h | 3 +++ arch/arm64/kvm/debug.c | 22 +++++++++++++++++++++- arch/arm64/kvm/guest.c | 1 + arch/arm64/kvm/handle_exit.c | 8 +++++++- 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ccf8a144f0096..45e2136322ba2 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -536,6 +536,9 @@ struct kvm_vcpu_arch { #define IN_WFIT __vcpu_single_flag(sflags, BIT(3)) /* vcpu system registers loaded on physical CPU */ #define SYSREGS_ON_CPU __vcpu_single_flag(sflags, BIT(4)) +/* Software step state is Active-pending */ +#define DBG_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(5)) + /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \ diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 1bd2a1aee11ce..56361e512b8ac 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -200,7 +200,18 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) * debugging the system. */ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; + /* + * If the software step state at the last guest exit + * was Active-pending, we don't set DBG_SPSR_SS so + * that the state is maintained (to not run another + * single-step until the pending Software Step + * exception is taken).
+ */ + if (!vcpu_get_flag(vcpu, DBG_SS_ACTIVE_PENDING)) + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; + else + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; + mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); mdscr |= DBG_MDSCR_SS; vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); @@ -274,6 +285,15 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) * Restore the guest's debug registers if we were using them. */ if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS)) + /* + * Mark the vcpu as ACTIVE_PENDING + * until Software Step exception is taken. + */ + vcpu_set_flag(vcpu, DBG_SS_ACTIVE_PENDING); + } + restore_guest_debug_regs(vcpu); /* diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index f802a3b3f8dbc..2ff13a3f84796 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -937,6 +937,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, } else { /* If not enabled clear all flags */ vcpu->guest_debug = 0; + vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING); } out: diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index bbe5b393d689f..e778eefcf214d 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -152,8 +152,14 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) run->debug.arch.hsr_high = upper_32_bits(esr); run->flags = KVM_DEBUG_ARCH_HSR_HIGH_VALID; - if (ESR_ELx_EC(esr) == ESR_ELx_EC_WATCHPT_LOW) + switch (ESR_ELx_EC(esr)) { + case ESR_ELx_EC_WATCHPT_LOW: run->debug.arch.far = vcpu->arch.fault.far_el2; + break; + case ESR_ELx_EC_SOFTSTP_LOW: + vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING); + break; + } return 0; } -- GitLab From ff00e737090e0f015059e59829aaa58565b16321 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Fri, 16 Sep 2022 18:05:59 -0700 Subject: [PATCH 0360/2223] KVM: arm64: selftests: Refactor debug-exceptions to make it amenable to new test cases Split up the current test into a helper, 
but leave the debug version checking in main(), to make it convenient to add a new debug exception test case in a subsequent patch. Signed-off-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220917010600.532642-4-reijiw@google.com --- .../selftests/kvm/aarch64/debug-exceptions.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 2ee35cf9801e1..e6e83b895fd50 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -246,7 +246,7 @@ static int debug_version(struct kvm_vcpu *vcpu) return id_aa64dfr0 & 0xf; } -int main(int argc, char *argv[]) +static void test_guest_debug_exceptions(void) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; @@ -259,9 +259,6 @@ int main(int argc, char *argv[]) vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vcpu); - __TEST_REQUIRE(debug_version(vcpu) >= 6, - "Armv8 debug architecture not supported."); - vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, ESR_EC_BRK_INS, guest_sw_bp_handler); vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, @@ -294,5 +291,18 @@ int main(int argc, char *argv[]) done: kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(debug_version(vcpu) >= 6, + "Armv8 debug architecture not supported."); + kvm_vm_free(vm); + test_guest_debug_exceptions(); + return 0; } -- GitLab From b18e4d4aebdddd05810ceb2f73d7f72afcd11b41 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Fri, 16 Sep 2022 18:06:00 -0700 Subject: [PATCH 0361/2223] KVM: arm64: selftests: Add a test case for KVM_GUESTDBG_SINGLESTEP Add a test case for KVM_GUESTDBG_SINGLESTEP to the debug-exceptions test. 
The test enables single-step execution from userspace, and checks if the exit to userspace occurs for each instruction that is stepped. Set the default number of the test iterations to a number of iterations sufficient to always reproduce the problem that the previous patch fixes on an Ampere Altra machine. Signed-off-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220917010600.532642-5-reijiw@google.com --- .../selftests/kvm/aarch64/debug-exceptions.c | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index e6e83b895fd50..947bd201435ce 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -22,6 +22,7 @@ #define SPSR_SS (1 << 21) extern unsigned char sw_bp, sw_bp2, hw_bp, hw_bp2, bp_svc, bp_brk, hw_wp, ss_start; +extern unsigned char iter_ss_begin, iter_ss_end; static volatile uint64_t sw_bp_addr, hw_bp_addr; static volatile uint64_t wp_addr, wp_data_addr; static volatile uint64_t svc_addr; @@ -238,6 +239,46 @@ static void guest_svc_handler(struct ex_regs *regs) svc_addr = regs->pc; } +enum single_step_op { + SINGLE_STEP_ENABLE = 0, + SINGLE_STEP_DISABLE = 1, +}; + +static void guest_code_ss(int test_cnt) +{ + uint64_t i; + uint64_t bvr, wvr, w_bvr, w_wvr; + + for (i = 0; i < test_cnt; i++) { + /* Bits [1:0] of dbg{b,w}vr are RES0 */ + w_bvr = i << 2; + w_wvr = i << 2; + + /* Enable Single Step execution */ + GUEST_SYNC(SINGLE_STEP_ENABLE); + + /* + * The userspace will verify that the pc is as expected during + * single step execution between iter_ss_begin and iter_ss_end.
+ */ + asm volatile("iter_ss_begin:nop\n"); + + write_sysreg(w_bvr, dbgbvr0_el1); + write_sysreg(w_wvr, dbgwvr0_el1); + bvr = read_sysreg(dbgbvr0_el1); + wvr = read_sysreg(dbgwvr0_el1); + + asm volatile("iter_ss_end:\n"); + + /* Disable Single Step execution */ + GUEST_SYNC(SINGLE_STEP_DISABLE); + + GUEST_ASSERT(bvr == w_bvr); + GUEST_ASSERT(wvr == w_wvr); + } + GUEST_DONE(); +} + static int debug_version(struct kvm_vcpu *vcpu) { uint64_t id_aa64dfr0; @@ -293,16 +334,106 @@ done: kvm_vm_free(vm); } +void test_single_step_from_userspace(int test_cnt) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + struct kvm_run *run; + uint64_t pc, cmd; + uint64_t test_pc = 0; + bool ss_enable = false; + struct kvm_guest_debug debug = {}; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_ss); + ucall_init(vm, NULL); + run = vcpu->run; + vcpu_args_set(vcpu, 1, test_cnt); + + while (1) { + vcpu_run(vcpu); + if (run->exit_reason != KVM_EXIT_DEBUG) { + cmd = get_ucall(vcpu, &uc); + if (cmd == UCALL_ABORT) { + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + } else if (cmd == UCALL_DONE) { + break; + } + + TEST_ASSERT(cmd == UCALL_SYNC, + "Unexpected ucall cmd 0x%lx", cmd); + + if (uc.args[1] == SINGLE_STEP_ENABLE) { + debug.control = KVM_GUESTDBG_ENABLE | + KVM_GUESTDBG_SINGLESTEP; + ss_enable = true; + } else { + debug.control = SINGLE_STEP_DISABLE; + ss_enable = false; + } + + vcpu_guest_debug_set(vcpu, &debug); + continue; + } + + TEST_ASSERT(ss_enable, "Unexpected KVM_EXIT_DEBUG"); + + /* Check if the current pc is expected. */ + vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc), &pc); + TEST_ASSERT(!test_pc || pc == test_pc, + "Unexpected pc 0x%lx (expected 0x%lx)", + pc, test_pc); + + /* + * If the current pc is between iter_ss_begin and + * iter_ss_end, the pc for the next KVM_EXIT_DEBUG should + * be the current pc + 4.
+ */ + if ((pc >= (uint64_t)&iter_ss_begin) && + (pc < (uint64_t)&iter_ss_end)) + test_pc = pc + 4; + else + test_pc = 0; + } + + kvm_vm_free(vm); +} + +static void help(char *name) +{ + puts(""); + printf("Usage: %s [-h] [-i iterations of the single step test]\n", name); + puts(""); + exit(0); +} + int main(int argc, char *argv[]) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; + int opt; + int ss_iteration = 10000; vm = vm_create_with_one_vcpu(&vcpu, guest_code); __TEST_REQUIRE(debug_version(vcpu) >= 6, "Armv8 debug architecture not supported."); kvm_vm_free(vm); + + while ((opt = getopt(argc, argv, "i:")) != -1) { + switch (opt) { + case 'i': + ss_iteration = atoi(optarg); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + test_guest_debug_exceptions(); + test_single_step_from_userspace(ss_iteration); return 0; } -- GitLab From 4af95d0937144d6df1b4f262d311cf2e0ace569a Mon Sep 17 00:00:00 2001 From: David Collins Date: Mon, 12 Sep 2022 14:06:23 -0700 Subject: [PATCH 0362/2223] pinctrl: qcom: spmi-gpio: add support for LV_VIN2 and MV_VIN3 subtypes Add support for SPMI PMIC GPIO subtypes GPIO_LV_VIN2 and GPIO_MV_VIN3. GPIO_LV_VIN2 GPIOs support two input reference voltages: VIN0 and VIN1. These are typically connected to 1.8 V and 1.2 V supplies respectively. GPIO_MV_VIN3 GPIOs support three input reference voltages: VIN0, VIN1, and VIN2. These are typically connected to Vph, 1.8 V, and 1.2 V supplies respectively. 
Signed-off-by: David Collins Signed-off-by: Anjelique Melendez Link: https://lore.kernel.org/r/20220912210624.4527-2-quic_amelende@quicinc.com Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-spmi-gpio.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c index 8ba3d5021f0b8..9534bdffe6fbd 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2012-2014, 2016-2021 The Linux Foundation. All rights reserved. + * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved. */ #include @@ -36,6 +37,8 @@ #define PMIC_GPIO_SUBTYPE_GPIOC_8CH 0xd #define PMIC_GPIO_SUBTYPE_GPIO_LV 0x10 #define PMIC_GPIO_SUBTYPE_GPIO_MV 0x11 +#define PMIC_GPIO_SUBTYPE_GPIO_LV_VIN2 0x12 +#define PMIC_GPIO_SUBTYPE_GPIO_MV_VIN3 0x13 #define PMIC_MPP_REG_RT_STS 0x10 #define PMIC_MPP_REG_RT_STS_VAL_MASK 0x1 @@ -822,6 +825,16 @@ static int pmic_gpio_populate(struct pmic_gpio_state *state, pad->have_buffer = true; pad->lv_mv_type = true; break; + case PMIC_GPIO_SUBTYPE_GPIO_LV_VIN2: + pad->num_sources = 2; + pad->have_buffer = true; + pad->lv_mv_type = true; + break; + case PMIC_GPIO_SUBTYPE_GPIO_MV_VIN3: + pad->num_sources = 3; + pad->have_buffer = true; + pad->lv_mv_type = true; + break; default: dev_err(state->dev, "unknown GPIO type 0x%x\n", subtype); return -ENODEV; -- GitLab From 723e8462a4fe7138bacac528dcdc7d4484c690fd Mon Sep 17 00:00:00 2001 From: Anjelique Melendez Date: Mon, 12 Sep 2022 14:06:25 -0700 Subject: [PATCH 0363/2223] pinctrl: qcom: spmi-gpio: Fix the GPIO strength mapping The SPMI based PMICs have the HIGH and LOW GPIO output strength mappings interchanged, fix them. 
Signed-off-by: Anjelique Melendez Link: https://lore.kernel.org/r/20220912210624.4527-3-quic_amelende@quicinc.com Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-spmi-gpio.c | 27 ++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c index 9534bdffe6fbd..8f4235f878d5c 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c @@ -101,6 +101,9 @@ #define PMIC_GPIO_OUT_BUF_OPEN_DRAIN_NMOS 1 #define PMIC_GPIO_OUT_BUF_OPEN_DRAIN_PMOS 2 +#define PMIC_GPIO_OUT_STRENGTH_LOW 1 +#define PMIC_GPIO_OUT_STRENGTH_HIGH 3 + /* PMIC_GPIO_REG_EN_CTL */ #define PMIC_GPIO_REG_MASTER_EN_SHIFT 7 @@ -439,7 +442,17 @@ static int pmic_gpio_config_get(struct pinctrl_dev *pctldev, arg = pad->pullup; break; case PMIC_GPIO_CONF_STRENGTH: - arg = pad->strength; + switch (pad->strength) { + case PMIC_GPIO_OUT_STRENGTH_HIGH: + arg = PMIC_GPIO_STRENGTH_HIGH; + break; + case PMIC_GPIO_OUT_STRENGTH_LOW: + arg = PMIC_GPIO_STRENGTH_LOW; + break; + default: + arg = pad->strength; + break; + } break; case PMIC_GPIO_CONF_ATEST: arg = pad->atest; @@ -526,7 +539,17 @@ static int pmic_gpio_config_set(struct pinctrl_dev *pctldev, unsigned int pin, case PMIC_GPIO_CONF_STRENGTH: if (arg > PMIC_GPIO_STRENGTH_LOW) return -EINVAL; - pad->strength = arg; + switch (arg) { + case PMIC_GPIO_STRENGTH_HIGH: + pad->strength = PMIC_GPIO_OUT_STRENGTH_HIGH; + break; + case PMIC_GPIO_STRENGTH_LOW: + pad->strength = PMIC_GPIO_OUT_STRENGTH_LOW; + break; + default: + pad->strength = arg; + break; + } break; case PMIC_GPIO_CONF_ATEST: if (!pad->lv_mv_type || arg > 4) -- GitLab From 3d46ff83df39a62a6b40b55479bfea23838add26 Mon Sep 17 00:00:00 2001 From: Jishnu Prakash Date: Mon, 12 Sep 2022 14:06:27 -0700 Subject: [PATCH 0364/2223] pinctrl: qcom: spmi-gpio: Add compatible for PM7250B Add support for qcom,pm7250b-gpio variant. 
Signed-off-by: Jishnu Prakash Signed-off-by: David Collins Signed-off-by: Anjelique Melendez Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220912210624.4527-4-quic_amelende@quicinc.com Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-spmi-gpio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c index 8f4235f878d5c..8c31a8f6b7e4e 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c @@ -1201,6 +1201,7 @@ static const struct of_device_id pmic_gpio_of_match[] = { { .compatible = "qcom,pm6150-gpio", .data = (void *) 10 }, { .compatible = "qcom,pm6150l-gpio", .data = (void *) 12 }, { .compatible = "qcom,pm6350-gpio", .data = (void *) 9 }, + { .compatible = "qcom,pm7250b-gpio", .data = (void *) 12 }, { .compatible = "qcom,pm7325-gpio", .data = (void *) 10 }, { .compatible = "qcom,pm8005-gpio", .data = (void *) 4 }, { .compatible = "qcom,pm8008-gpio", .data = (void *) 2 }, -- GitLab From a72be048b71c10475d169d3951c49fb8a6a803e3 Mon Sep 17 00:00:00 2001 From: Anjelique Melendez Date: Mon, 12 Sep 2022 14:06:29 -0700 Subject: [PATCH 0365/2223] dt-bindings: qcom-pmic-gpio: Add PM7250B and PM8450 bindings Update the Qualcomm Technologies, Inc. PMIC GPIO binding documentation to include compatible strings for PM7250B and PM8450 PMICs. 
Signed-off-by: Anjelique Melendez Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220912210624.4527-5-quic_amelende@quicinc.com Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.yaml index 694898f382be5..29dd503f95221 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.yaml @@ -24,6 +24,7 @@ properties: - qcom,pm6150-gpio - qcom,pm6150l-gpio - qcom,pm6350-gpio + - qcom,pm7250b-gpio - qcom,pm7325-gpio - qcom,pm8005-gpio - qcom,pm8008-gpio @@ -231,6 +232,7 @@ allOf: enum: - qcom,pm660l-gpio - qcom,pm6150l-gpio + - qcom,pm7250b-gpio - qcom,pm8038-gpio - qcom,pm8150b-gpio - qcom,pm8150l-gpio @@ -392,6 +394,7 @@ $defs: - gpio1-gpio10 for pm6150 - gpio1-gpio12 for pm6150l - gpio1-gpio9 for pm6350 + - gpio1-gpio12 for pm7250b - gpio1-gpio10 for pm7325 - gpio1-gpio4 for pm8005 - gpio1-gpio2 for pm8008 @@ -407,6 +410,7 @@ $defs: - gpio1-gpio10 for pm8350 - gpio1-gpio8 for pm8350b - gpio1-gpio9 for pm8350c + - gpio1-gpio4 for pm8450 - gpio1-gpio38 for pm8917 - gpio1-gpio44 for pm8921 - gpio1-gpio36 for pm8941 -- GitLab From 6a164c646999847b843e651f71c53dfaceb2c2b4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 9 May 2022 16:04:08 +0200 Subject: [PATCH 0366/2223] genirq: Provide generic_handle_domain_irq_safe(). commit 509853f9e1e7b ("genirq: Provide generic_handle_irq_safe()") addressed the problem of demultiplexing interrupt handlers which are force threaded on PREEMPT_RT enabled kernels which means that the demultiplexed handler is invoked with interrupts enabled which triggers a lockdep warning due to a non-irq safe lock acquisition. 
The same problem exists for the irq domain based interrupt handling via generic_handle_domain_irq() which has been reported against the AMD pin-ctrl driver. Provide generic_handle_domain_irq_safe() which can be used from any context. [ tglx: Split the usage sites out and massaged changelog ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/YnkfWFzvusFFktSt@linutronix.de Link: https://bugzilla.kernel.org/show_bug.cgi?id=215954 --- include/linux/irqdesc.h | 1 + kernel/irq/irqdesc.c | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 1cd4e36890fbf..844a8e30e6de5 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -169,6 +169,7 @@ int generic_handle_irq_safe(unsigned int irq); * conversion failed. */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq); +int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq); int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq); #endif diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 5db0230aa6b52..a91f9001103ce 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -705,6 +705,30 @@ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); + /** + * generic_handle_domain_irq_safe - Invoke the handler for a HW irq belonging + * to a domain from any context. + * @domain: The domain where to perform the lookup + * @hwirq: The HW irq number to convert to a logical one + * + * Returns: 0 on success, a negative value on error. + * + * This function can be called from any context (IRQ or process + * context). If the interrupt is marked as 'enforce IRQ-context only' then + * the function must be invoked from hard interrupt context.
+ */ +int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq)); + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe); + /** * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging * to a domain. -- GitLab From f460c70125bcb1b753f152d9d0c9cee3ddbc2d91 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 19 Sep 2022 14:42:54 +0200 Subject: [PATCH 0367/2223] pinctrl: amd: Use generic_handle_irq_safe() On PREEMPT_RT enabled kernels the demultiplex interrupt handler is force threaded and runs with interrupts enabled. The invocation of generic_handle_domain_irq() with interrupts enabled triggers a lockdep warning due to a non-irq safe lock acquisition. Instead of disabling interrupts on the driver level, use generic_handle_domain_irq_safe(). [ tglx: Split out from combo patch ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/YnkfWFzvusFFktSt@linutronix.de Link: https://bugzilla.kernel.org/show_bug.cgi?id=215954 --- drivers/pinctrl/pinctrl-amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 4691a33bc374f..4ed2b4ba95681 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -639,7 +639,7 @@ static bool do_amd_gpio_irq_handler(int irq, void *dev_id) if (!(regval & PIN_IRQ_PENDING) || !(regval & BIT(INTERRUPT_MASK_OFF))) continue; - generic_handle_domain_irq(gc->irq.domain, irqnr + i); + generic_handle_domain_irq_safe(gc->irq.domain, irqnr + i); /* Clear interrupt. 
* We must read the pin register again, in case the -- GitLab From f285de79569f9e674816a67308316206e4eb30ee Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 19 Sep 2022 14:43:46 +0200 Subject: [PATCH 0368/2223] ssb: gpio: Use generic_handle_irq_safe() On PREEMPT_RT enabled kernels the demultiplex interrupt handler is force threaded and runs with interrupts enabled. The invocation of generic_handle_domain_irq() with interrupts enabled triggers a lockdep warning due to a non-irq safe lock acquisition. Instead of disabling interrupts on the driver level, use generic_handle_domain_irq_safe(). [ tglx: Split out from combo patch ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/YnkfWFzvusFFktSt@linutronix.de --- drivers/ssb/driver_gpio.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/ssb/driver_gpio.c b/drivers/ssb/driver_gpio.c index 2de3896489c84..897cb8db5084f 100644 --- a/drivers/ssb/driver_gpio.c +++ b/drivers/ssb/driver_gpio.c @@ -132,7 +132,8 @@ static irqreturn_t ssb_gpio_irq_chipco_handler(int irq, void *dev_id) return IRQ_NONE; for_each_set_bit(gpio, &irqs, bus->gpio.ngpio) - generic_handle_irq(ssb_gpio_to_irq(&bus->gpio, gpio)); + generic_handle_domain_irq_safe(bus->irq_domain, gpio); + ssb_chipco_gpio_polarity(chipco, irqs, val & irqs); return IRQ_HANDLED; @@ -330,7 +331,8 @@ static irqreturn_t ssb_gpio_irq_extif_handler(int irq, void *dev_id) return IRQ_NONE; for_each_set_bit(gpio, &irqs, bus->gpio.ngpio) - generic_handle_irq(ssb_gpio_to_irq(&bus->gpio, gpio)); + generic_handle_domain_irq_safe(bus->irq_domain, gpio); + ssb_extif_gpio_polarity(extif, irqs, val & irqs); return IRQ_HANDLED; -- GitLab From c6a91405ac5cd5baa03fea061e11b05788223160 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 19 Sep 2022 14:44:28 +0200 Subject: [PATCH 0369/2223] platform/x86: intel_int0002_vgpio: Use generic_handle_irq_safe() On PREEMPT_RT enabled 
kernels the demultiplex interrupt handler is force threaded and runs with interrupts enabled. The invocation of generic_handle_irq() with interrupts enabled triggers a lockdep warning due to a non-irq safe lock acquisition. Instead of disabling interrupts on the driver level, use generic_handle_domain_irq_safe(). [ tglx: Split out from combo patch ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/YnkfWFzvusFFktSt@linutronix.de --- drivers/platform/x86/intel/int0002_vgpio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel/int0002_vgpio.c b/drivers/platform/x86/intel/int0002_vgpio.c index 617dbf98980ec..97cfbc520a02c 100644 --- a/drivers/platform/x86/intel/int0002_vgpio.c +++ b/drivers/platform/x86/intel/int0002_vgpio.c @@ -125,8 +125,7 @@ static irqreturn_t int0002_irq(int irq, void *data) if (!(gpe_sts_reg & GPE0A_PME_B0_STS_BIT)) return IRQ_NONE; - generic_handle_irq(irq_find_mapping(chip->irq.domain, - GPE0A_PME_B0_VIRT_GPIO_PIN)); + generic_handle_domain_irq_safe(chip->irq.domain, GPE0A_PME_B0_VIRT_GPIO_PIN); pm_wakeup_hard_event(chip->parent); -- GitLab From 118c3ba24d04f084eadd9d4a0ab7830f495e9106 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 19 Sep 2022 14:45:18 +0200 Subject: [PATCH 0370/2223] gpio: mlxbf2: Use generic_handle_irq_safe() On PREEMPT_RT enabled kernels the demultiplex interrupt handler is force threaded and runs with interrupts enabled. The invocation of generic_handle_irq() with interrupts enabled triggers a lockdep warning due to a non-irq safe lock acquisition. Instead of disabling interrupts on the driver level, use generic_handle_domain_irq_safe(). 
[ tglx: Split out from combo patch ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/YnkfWFzvusFFktSt@linutronix.de --- drivers/gpio/gpio-mlxbf2.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpio-mlxbf2.c b/drivers/gpio/gpio-mlxbf2.c index 64cb060d9d753..77a41151c921b 100644 --- a/drivers/gpio/gpio-mlxbf2.c +++ b/drivers/gpio/gpio-mlxbf2.c @@ -273,10 +273,8 @@ static irqreturn_t mlxbf2_gpio_irq_handler(int irq, void *ptr) pending = readl(gs->gpio_io + YU_GPIO_CAUSE_OR_CAUSE_EVTEN0); writel(pending, gs->gpio_io + YU_GPIO_CAUSE_OR_CLRCAUSE); - for_each_set_bit(level, &pending, gc->ngpio) { - int gpio_irq = irq_find_mapping(gc->irq.domain, level); - generic_handle_irq(gpio_irq); - } + for_each_set_bit(level, &pending, gc->ngpio) + generic_handle_domain_irq_safe(gc->irq.domain, level); return IRQ_RETVAL(pending); } -- GitLab From 94ec234a16cf3acdb319f05917b1efec9642222e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 19 Sep 2022 14:46:16 +0200 Subject: [PATCH 0371/2223] bcma: gpio: Use generic_handle_irq_safe() On PREEMPT_RT enabled kernels the demultiplex interrupt handler is force threaded and runs with interrupts enabled. The invocation of generic_handle_irq() with interrupts enabled triggers a lockdep warning due to a non-irq safe lock acquisition. Instead of disabling interrupts on the driver level, use generic_handle_domain_irq_safe(). 
[ tglx: Split out from combo patch ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/YnkfWFzvusFFktSt@linutronix.de --- drivers/bcma/driver_gpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bcma/driver_gpio.c b/drivers/bcma/driver_gpio.c index fac8ff983aec8..65fb9bad1577a 100644 --- a/drivers/bcma/driver_gpio.c +++ b/drivers/bcma/driver_gpio.c @@ -115,7 +115,7 @@ static irqreturn_t bcma_gpio_irq_handler(int irq, void *dev_id) return IRQ_NONE; for_each_set_bit(gpio, &irqs, gc->ngpio) - generic_handle_irq(irq_find_mapping(gc->irq.domain, gpio)); + generic_handle_domain_irq_safe(gc->irq.domain, gpio); bcma_chipco_gpio_polarity(cc, irqs, val & irqs); return IRQ_HANDLED; -- GitLab From e01bae16a7d68931f0450cb079479c4a8f56d3e3 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 16 Sep 2022 22:03:29 +0800 Subject: [PATCH 0372/2223] PCI/P2PDMA: Use for_each_pci_dev() helper Use for_each_pci_dev() instead of open-coding it. No functional change. 
Link: https://lore.kernel.org/r/20220916140329.679633-1-yangyingliang@huawei.com Signed-off-by: Yang Yingliang Signed-off-by: Bjorn Helgaas Reviewed-by: Logan Gunthorpe --- drivers/pci/p2pdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 4496a7c5c4785..88dc66ee1c467 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -649,7 +649,7 @@ struct pci_dev *pci_p2pmem_find_many(struct device **clients, int num_clients) if (!closest_pdevs) return NULL; - while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev))) { + for_each_pci_dev(pdev) { if (!pci_has_p2pmem(pdev)) continue; -- GitLab From 714e76347a4e0bbd39730ddbb2c7e56971ba7caa Mon Sep 17 00:00:00 2001 From: Benjamin Beichler Date: Tue, 7 Jun 2022 11:27:14 +0000 Subject: [PATCH 0373/2223] um: read multiple msg from virtio slave request fd If VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS is activated, the user mode linux virtio irq handler only read one msg from the corresponding socket. This creates issues, when the device emulation creates multiple call requests (e.g. for multiple virtqueues), as the socket buffer tend to fill up and the call requests are delayed. This creates a deadlock situation, when the device simulation blocks, because of sending a msg and the kernel side blocks because of synchronously waiting for an acknowledge of kick request. Actually inband notifications are meant to be used in combination with the time travel protocol, but it is not required, therefore this corner case needs to be handled. Anyways, in general it seems to be more natural to consume always all messages from a socket, instead of only a single one. 
Fixes: 2cd097ba8c05 ("um: virtio: Implement VHOST_USER_PROTOCOL_F_SLAVE_REQ") Signed-off-by: Benjamin Beichler Reviewed-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/drivers/virtio_uml.c | 71 +++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index e719af8bdf56d..588930a0ced17 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -374,45 +374,48 @@ static irqreturn_t vu_req_read_message(struct virtio_uml_device *vu_dev, u8 extra_payload[512]; } msg; int rc; + irqreturn_t irq_rc = IRQ_NONE; - rc = vhost_user_recv_req(vu_dev, &msg.msg, - sizeof(msg.msg.payload) + - sizeof(msg.extra_payload)); - - vu_dev->recv_rc = rc; - if (rc) - return IRQ_NONE; - - switch (msg.msg.header.request) { - case VHOST_USER_SLAVE_CONFIG_CHANGE_MSG: - vu_dev->config_changed_irq = true; - response = 0; - break; - case VHOST_USER_SLAVE_VRING_CALL: - virtio_device_for_each_vq((&vu_dev->vdev), vq) { - if (vq->index == msg.msg.payload.vring_state.index) { - response = 0; - vu_dev->vq_irq_vq_map |= BIT_ULL(vq->index); - break; + while (1) { + rc = vhost_user_recv_req(vu_dev, &msg.msg, + sizeof(msg.msg.payload) + + sizeof(msg.extra_payload)); + if (rc) + break; + + switch (msg.msg.header.request) { + case VHOST_USER_SLAVE_CONFIG_CHANGE_MSG: + vu_dev->config_changed_irq = true; + response = 0; + break; + case VHOST_USER_SLAVE_VRING_CALL: + virtio_device_for_each_vq((&vu_dev->vdev), vq) { + if (vq->index == msg.msg.payload.vring_state.index) { + response = 0; + vu_dev->vq_irq_vq_map |= BIT_ULL(vq->index); + break; + } } + break; + case VHOST_USER_SLAVE_IOTLB_MSG: + /* not supported - VIRTIO_F_ACCESS_PLATFORM */ + case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG: + /* not supported - VHOST_USER_PROTOCOL_F_HOST_NOTIFIER */ + default: + vu_err(vu_dev, "unexpected slave request %d\n", + msg.msg.header.request); } - break; - case VHOST_USER_SLAVE_IOTLB_MSG: 
- /* not supported - VIRTIO_F_ACCESS_PLATFORM */ - case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG: - /* not supported - VHOST_USER_PROTOCOL_F_HOST_NOTIFIER */ - default: - vu_err(vu_dev, "unexpected slave request %d\n", - msg.msg.header.request); - } - - if (ev && !vu_dev->suspended) - time_travel_add_irq_event(ev); - if (msg.msg.header.flags & VHOST_USER_FLAG_NEED_REPLY) - vhost_user_reply(vu_dev, &msg.msg, response); + if (ev && !vu_dev->suspended) + time_travel_add_irq_event(ev); - return IRQ_HANDLED; + if (msg.msg.header.flags & VHOST_USER_FLAG_NEED_REPLY) + vhost_user_reply(vu_dev, &msg.msg, response); + irq_rc = IRQ_HANDLED; + }; + /* mask EAGAIN as we try non-blocking read until socket is empty */ + vu_dev->recv_rc = (rc == -EAGAIN) ? 0 : rc; + return irq_rc; } static irqreturn_t vu_req_interrupt(int irq, void *data) -- GitLab From 16c546e148fa6d14a019431436a6f7b4087dbccd Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 12 Jul 2022 15:52:55 +0800 Subject: [PATCH 0374/2223] UM: cpuinfo: Fix a warning for CONFIG_CPUMASK_OFFSTACK When CONFIG_CPUMASK_OFFSTACK and CONFIG_DEBUG_PER_CPU_MAPS is selected, cpu_max_bits_warn() generates a runtime warning similar as below while we show /proc/cpuinfo. Fix this by using nr_cpu_ids (the runtime limit) instead of NR_CPUS to iterate CPUs. 
[ 3.052463] ------------[ cut here ]------------ [ 3.059679] WARNING: CPU: 3 PID: 1 at include/linux/cpumask.h:108 show_cpuinfo+0x5e8/0x5f0 [ 3.070072] Modules linked in: efivarfs autofs4 [ 3.076257] CPU: 0 PID: 1 Comm: systemd Not tainted 5.19-rc5+ #1052 [ 3.099465] Stack : 9000000100157b08 9000000000f18530 9000000000cf846c 9000000100154000 [ 3.109127] 9000000100157a50 0000000000000000 9000000100157a58 9000000000ef7430 [ 3.118774] 90000001001578e8 0000000000000040 0000000000000020 ffffffffffffffff [ 3.128412] 0000000000aaaaaa 1ab25f00eec96a37 900000010021de80 900000000101c890 [ 3.138056] 0000000000000000 0000000000000000 0000000000000000 0000000000aaaaaa [ 3.147711] ffff8000339dc220 0000000000000001 0000000006ab4000 0000000000000000 [ 3.157364] 900000000101c998 0000000000000004 9000000000ef7430 0000000000000000 [ 3.167012] 0000000000000009 000000000000006c 0000000000000000 0000000000000000 [ 3.176641] 9000000000d3de08 9000000001639390 90000000002086d8 00007ffff0080286 [ 3.186260] 00000000000000b0 0000000000000004 0000000000000000 0000000000071c1c [ 3.195868] ... 
[ 3.199917] Call Trace: [ 3.203941] [<90000000002086d8>] show_stack+0x38/0x14c [ 3.210666] [<9000000000cf846c>] dump_stack_lvl+0x60/0x88 [ 3.217625] [<900000000023d268>] __warn+0xd0/0x100 [ 3.223958] [<9000000000cf3c90>] warn_slowpath_fmt+0x7c/0xcc [ 3.231150] [<9000000000210220>] show_cpuinfo+0x5e8/0x5f0 [ 3.238080] [<90000000004f578c>] seq_read_iter+0x354/0x4b4 [ 3.245098] [<90000000004c2e90>] new_sync_read+0x17c/0x1c4 [ 3.252114] [<90000000004c5174>] vfs_read+0x138/0x1d0 [ 3.258694] [<90000000004c55f8>] ksys_read+0x70/0x100 [ 3.265265] [<9000000000cfde9c>] do_syscall+0x7c/0x94 [ 3.271820] [<9000000000202fe4>] handle_syscall+0xc4/0x160 [ 3.281824] ---[ end trace 8b484262b4b8c24c ]--- Cc: stable@vger.kernel.org Signed-off-by: Huacai Chen Signed-off-by: Richard Weinberger --- arch/um/kernel/um_arch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index e0de60e503b98..11df937318575 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -96,7 +96,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) static void *c_start(struct seq_file *m, loff_t *pos) { - return *pos < NR_CPUS ? cpu_data + *pos : NULL; + return *pos < nr_cpu_ids ? cpu_data + *pos : NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) -- GitLab From 0d644e918532f7eba2b02e0eaf60ee1a1b20a856 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Sat, 6 Aug 2022 21:52:23 +0200 Subject: [PATCH 0375/2223] um: increase default virtual physical memory to 64 MiB The current 32 MiB of RAM causes OOMs to appear shortly after booting in a minimal OpenWrt 22.03 configuration with a 5.10.134 kernel. Of course, passing a "mem=64M" (from the --help text) parameter works too, but it produces the following (info) message: | [ 0.000000] Unknown kernel command line parameters "mem=64M", will be passed to user space. That's why, I think it would be nicer, if this is working out of the box again :). 
Signed-off-by: Christian Lamparter Signed-off-by: Richard Weinberger --- arch/um/kernel/um_arch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 11df937318575..207f195dff567 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -132,7 +132,7 @@ static int have_root __initdata; static int have_console __initdata; /* Set in uml_mem_setup and modified in linux_main */ -long long physmem_size = 32 * 1024 * 1024; +long long physmem_size = 64 * 1024 * 1024; EXPORT_SYMBOL(physmem_size); static const char *usage_string = -- GitLab From e6e4d33f380fbfd85b909d16c9b639299e5c37a6 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 22:59:51 +0200 Subject: [PATCH 0376/2223] um: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Signed-off-by: Wolfram Sang Signed-off-by: Richard Weinberger --- arch/um/drivers/net_kern.c | 2 +- arch/um/drivers/vector_kern.c | 2 +- arch/um/kernel/um_arch.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index 59331384c2d38..3d7836c465070 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -265,7 +265,7 @@ static void uml_net_poll_controller(struct net_device *dev) static void uml_net_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { - strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver)); + strscpy(info->driver, DRIVER_NAME, sizeof(info->driver)); } static const struct ethtool_ops uml_net_ethtool_ops = { diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 5482653127436..ded7c47d2fbe5 100644 --- a/arch/um/drivers/vector_kern.c +++ 
b/arch/um/drivers/vector_kern.c @@ -1372,7 +1372,7 @@ static void vector_net_poll_controller(struct net_device *dev) static void vector_net_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { - strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver)); + strscpy(info->driver, DRIVER_NAME, sizeof(info->driver)); } static int vector_net_load_bpf_flash(struct net_device *dev, diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 207f195dff567..0f2adc9a95a2a 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -416,7 +416,7 @@ void __init setup_arch(char **cmdline_p) read_initrd(); paging_init(); - strlcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); + strscpy(boot_command_line, command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; setup_hostinfo(host_info, sizeof host_info); -- GitLab From b7f28a37a59fb0ae35dc087b9cdfa77d089b996b Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:01:17 +0200 Subject: [PATCH 0377/2223] hostfs: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. 
Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Signed-off-by: Wolfram Sang Signed-off-by: Richard Weinberger --- fs/hostfs/hostfs_kern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 07881b76d42f9..277468783feee 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -103,7 +103,7 @@ static char *__dentry_name(struct dentry *dentry, char *name) */ BUG_ON(p + strlen(p) + 1 != name + PATH_MAX); - strlcpy(name, root, PATH_MAX); + strscpy(name, root, PATH_MAX); if (len > p - name) { __putname(name); return NULL; -- GitLab From 98639412fee2fda3c9da184825b469e5ac874829 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Sun, 11 Sep 2022 10:51:40 +0800 Subject: [PATCH 0378/2223] um: virt-pci: add __init/__exit annotations to module init/exit funcs Add missing __init/__exit annotations to module init/exit funcs. Signed-off-by: Xiu Jianfeng Signed-off-by: Richard Weinberger --- arch/um/drivers/virt-pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c index 0278470231841..acb55b302b14c 100644 --- a/arch/um/drivers/virt-pci.c +++ b/arch/um/drivers/virt-pci.c @@ -857,7 +857,7 @@ void *pci_root_bus_fwnode(struct pci_bus *bus) return um_pci_fwnode; } -static int um_pci_init(void) +static int __init um_pci_init(void) { int err, i; @@ -940,7 +940,7 @@ free: } module_init(um_pci_init); -static void um_pci_exit(void) +static void __exit um_pci_exit(void) { unregister_virtio_driver(&um_pci_virtio_driver); irq_domain_remove(um_pci_msi_domain); -- GitLab From 7c5c8faeab4db1eecc181e01ccc7c16b1ec24b99 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Sun, 11 Sep 2022 10:52:38 +0800 Subject: [PATCH 0379/2223] um: mmaper: add __exit annotations to module exit funcs Add missing __exit annotations to module exit funcs. 
Signed-off-by: Xiu Jianfeng Signed-off-by: Richard Weinberger --- arch/um/drivers/mmapper_kern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/drivers/mmapper_kern.c b/arch/um/drivers/mmapper_kern.c index 0bf78ff890110..807cd33587405 100644 --- a/arch/um/drivers/mmapper_kern.c +++ b/arch/um/drivers/mmapper_kern.c @@ -122,7 +122,7 @@ static int __init mmapper_init(void) return 0; } -static void mmapper_exit(void) +static void __exit mmapper_exit(void) { misc_deregister(&mmapper_dev); } -- GitLab From c8b2c268b0b77cb6aad676bf215f49212d903b2a Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Wed, 14 Sep 2022 15:30:27 +0800 Subject: [PATCH 0380/2223] um: remove unused reactivate_chan() declaration All uses of reactivate_chan() were removed by commit 940b241d9050 ("um: Remove obsolete reenable_XX calls"), so remove the declaration, too. Signed-off-by: Gaosheng Cui Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/chan.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/um/drivers/chan.h b/arch/um/drivers/chan.h index c37cc4f26f91c..3fec3b8406e98 100644 --- a/arch/um/drivers/chan.h +++ b/arch/um/drivers/chan.h @@ -36,7 +36,6 @@ extern int console_write_chan(struct chan *chan, const char *buf, int len); extern int console_open_chan(struct line *line, struct console *co); extern void deactivate_chan(struct chan *chan, int irq); -extern void reactivate_chan(struct chan *chan, int irq); extern void chan_enable_winch(struct chan *chan, struct tty_port *port); extern int enable_chan(struct line *line); extern void close_chan(struct line *line); -- GitLab From 758dfdb9185cf94160f20e85bbe05583e3cd4ff4 Mon Sep 17 00:00:00 2001 From: "Guilherme G. 
Piccoli" Date: Fri, 19 Aug 2022 19:17:24 -0300 Subject: [PATCH 0381/2223] um: Improve panic notifiers consistency and ordering Currently the panic notifiers from user mode linux don't follow the convention for most of the other notifiers present in the kernel (indentation, priority setting, numeric return). More important, the priorities could be improved, since it's a special case (userspace), hence we could run the notifiers earlier; user mode linux shouldn't care much with other panic notifiers but the ordering among the mconsole and arch notifier is important, given that the arch one effectively triggers a core dump. Fix that by running the mconsole notifier as the first panic notifier, followed by the architecture one (that coredumps). Cc: Anton Ivanov Cc: Johannes Berg Cc: Richard Weinberger Signed-off-by: Guilherme G. Piccoli V3: - No changes. V2: - Kept the notifier header to avoid implicit usage - thanks Johannes for the suggestion! Signed-off-by: Richard Weinberger --- arch/um/drivers/mconsole_kern.c | 7 +++---- arch/um/kernel/um_arch.c | 8 ++++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 8ca67a6926830..69af3ce8407af 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -846,13 +846,12 @@ static int notify_panic(struct notifier_block *self, unsigned long unused1, mconsole_notify(notify_socket, MCONSOLE_PANIC, message, strlen(message) + 1); - return 0; + return NOTIFY_DONE; } static struct notifier_block panic_exit_notifier = { - .notifier_call = notify_panic, - .next = NULL, - .priority = 1 + .notifier_call = notify_panic, + .priority = INT_MAX, /* run as soon as possible */ }; static int add_notifier(void) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 0f2adc9a95a2a..754d29a387a81 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -247,13 +247,13 @@ static int panic_exit(struct 
notifier_block *self, unsigned long unused1, bust_spinlocks(0); uml_exitcode = 1; os_dump_core(); - return 0; + + return NOTIFY_DONE; } static struct notifier_block panic_exit_notifier = { - .notifier_call = panic_exit, - .next = NULL, - .priority = 0 + .notifier_call = panic_exit, + .priority = INT_MAX - 1, /* run as 2nd notifier, won't return */ }; void uml_finishsetup(void) -- GitLab From 3848d470cb881b7954a4a563bf73ffeb9cf4f30e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 15 Jul 2022 12:29:38 +0800 Subject: [PATCH 0382/2223] um: Fix comment typo The double `in' is duplicated in line 172, remove one. Signed-off-by: Jason Wang Signed-off-by: Richard Weinberger --- arch/um/kernel/physmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index e7c7b53a1435b..91485119ae67a 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -169,7 +169,7 @@ __uml_setup("iomem=", parse_iomem, ); /* - * This list is constructed in parse_iomem and addresses filled in in + * This list is constructed in parse_iomem and addresses filled in * setup_iomem, both of which run during early boot. Afterwards, it's * unchanged. */ -- GitLab From 4dc5a328315a6acbb60e772fb4826d87626a793d Mon Sep 17 00:00:00 2001 From: Xin Gao Date: Thu, 21 Jul 2022 03:24:51 +0800 Subject: [PATCH 0383/2223] um: Do not initialise statics to 0. do not initialise statics to 0. 
Signed-off-by: Xin Gao Signed-off-by: Richard Weinberger --- arch/um/drivers/mconsole_kern.c | 2 +- arch/um/drivers/ssl.c | 2 +- arch/um/drivers/stdio_console.c | 2 +- arch/um/drivers/ubd_kern.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 69af3ce8407af..5026e7b9adfe5 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -283,7 +283,7 @@ struct unplugged_pages { }; static DEFINE_MUTEX(plug_mem_mutex); -static unsigned long long unplugged_pages_count = 0; +static unsigned long long unplugged_pages_count; static LIST_HEAD(unplugged_pages); static int unplug_index = UNPLUGGED_PER_PAGE; diff --git a/arch/um/drivers/ssl.c b/arch/um/drivers/ssl.c index 8514966778d53..277cea3d30eb5 100644 --- a/arch/um/drivers/ssl.c +++ b/arch/um/drivers/ssl.c @@ -106,7 +106,7 @@ static const struct tty_operations ssl_ops = { /* Changed by ssl_init and referenced by ssl_exit, which are both serialized * by being an initcall and exitcall, respectively. */ -static int ssl_init_done = 0; +static int ssl_init_done; static void ssl_console_write(struct console *c, const char *string, unsigned len) diff --git a/arch/um/drivers/stdio_console.c b/arch/um/drivers/stdio_console.c index 489d5a746ed33..1c239737d88ec 100644 --- a/arch/um/drivers/stdio_console.c +++ b/arch/um/drivers/stdio_console.c @@ -88,7 +88,7 @@ static int con_remove(int n, char **error_out) } /* Set in an initcall, checked in an exitcall */ -static int con_init_done = 0; +static int con_init_done; static int con_install(struct tty_driver *driver, struct tty_struct *tty) { diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index eb2d2f0f0bcca..f4c1e6e97ad52 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1555,7 +1555,7 @@ static void do_io(struct io_thread_req *req, struct io_desc *desc) int kernel_fd = -1; /* Only changed by the io thread. 
XXX: currently unused. */ -static int io_count = 0; +static int io_count; int io_thread(void *arg) { -- GitLab From 193cb8372424184dde28088a4230a5fed0afb0ad Mon Sep 17 00:00:00 2001 From: Shaomin Deng Date: Sat, 27 Aug 2022 12:26:31 -0400 Subject: [PATCH 0384/2223] uml: Remove the initialization of statics to 0 It is always unnecessary to initialise statics to 0. Signed-off-by: Shaomin Deng Signed-off-by: Richard Weinberger --- arch/um/kernel/umid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/kernel/umid.c b/arch/um/kernel/umid.c index 8031a038eb588..72bc60ade347b 100644 --- a/arch/um/kernel/umid.c +++ b/arch/um/kernel/umid.c @@ -9,7 +9,7 @@ #include /* Changed by set_umid_arg */ -static int umid_inited = 0; +static int umid_inited; static int __init set_umid_arg(char *name, int *add) { -- GitLab From 790cf9e3da3f16d65d389d714f6e18f27cf18704 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Sat, 17 Sep 2022 20:20:15 +0800 Subject: [PATCH 0385/2223] pinctrl: stm32: Switch to use dev_err_probe() helper In the probe path, dev_err() can be replaced with dev_err_probe() which will check if error code is -EPROBE_DEFER and prints the error name.
Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20220917122015.1893880-1-yangyingliang@huawei.com Signed-off-by: Linus Walleij --- drivers/pinctrl/stm32/pinctrl-stm32.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c index 14bcca73238ae..e485506ea599c 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32.c @@ -1603,10 +1603,9 @@ int stm32_pctl_probe(struct platform_device *pdev) bank->clk = of_clk_get_by_name(np, NULL); if (IS_ERR(bank->clk)) { - if (PTR_ERR(bank->clk) != -EPROBE_DEFER) - dev_err(dev, "failed to get clk (%ld)\n", PTR_ERR(bank->clk)); fwnode_handle_put(child); - return PTR_ERR(bank->clk); + return dev_err_probe(dev, PTR_ERR(bank->clk), + "failed to get clk\n"); } i++; } -- GitLab From 35b871f72a5a06dc5a328427a437797ad99c0696 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Sat, 17 Sep 2022 20:22:08 +0800 Subject: [PATCH 0386/2223] pinctrl: sunxi: sun50i-h5: Switch to use dev_err_probe() helper In the probe path, dev_err() can be replaced with dev_err_probe() which will check if error code is -EPROBE_DEFER and prints the error name.
Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20220917122208.1894769-1-yangyingliang@huawei.com Signed-off-by: Linus Walleij --- drivers/pinctrl/sunxi/pinctrl-sun50i-h5.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/pinctrl/sunxi/pinctrl-sun50i-h5.c b/drivers/pinctrl/sunxi/pinctrl-sun50i-h5.c index 31d62bbb7f43f..96a350e70668a 100644 --- a/drivers/pinctrl/sunxi/pinctrl-sun50i-h5.c +++ b/drivers/pinctrl/sunxi/pinctrl-sun50i-h5.c @@ -551,12 +551,9 @@ static int sun50i_h5_pinctrl_probe(struct platform_device *pdev) int ret; ret = platform_irq_count(pdev); - if (ret < 0) { - if (ret != -EPROBE_DEFER) - dev_err(&pdev->dev, "Couldn't determine irq count: %pe\n", - ERR_PTR(ret)); - return ret; - } + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, + "Couldn't determine irq count\n"); switch (ret) { case 2: -- GitLab From 56e380cfcd82a228dc006902b88cf1adaf9851dc Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 16 Sep 2022 23:54:48 +0300 Subject: [PATCH 0387/2223] pinctrl: cy8c95x0: Lock register accesses in cy8c95x0_set_mux() It seems that cy8c95x0_set_mux() missed serialization of IO access. And its implementation looks half-baked. Add locking to the function. 
Fixes: e6cbbe42944d ("pinctrl: Add Cypress cy8c95x0 support") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220916205450.86278-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index 79f73d364f3f9..75be06d29dc14 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -1152,8 +1152,13 @@ static int cy8c95x0_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, unsigned int group) { struct cy8c95x0_pinctrl *chip = pinctrl_dev_get_drvdata(pctldev); + int ret; - return cy8c95x0_pinmux_cfg(chip, selector, group); + mutex_lock(&chip->i2c_lock); + ret = cy8c95x0_pinmux_cfg(chip, selector, group); + mutex_unlock(&chip->i2c_lock); + + return ret; } static const struct pinmux_ops cy8c95x0_pmxops = { -- GitLab From d6afdf8826ef4c719ab78d33e932dc6ad9dedb35 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 16 Sep 2022 23:54:49 +0300 Subject: [PATCH 0388/2223] pinctrl: cy8c95x0: Drop atomicity on operations on push_pull The push_pull member is always accessed under the mutex, hence no need to use atomic operations on it. 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220916205450.86278-2-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index 75be06d29dc14..367a9386dfb7d 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -573,7 +573,8 @@ static int cy8c95x0_gpio_direction_input(struct gpio_chip *gc, unsigned int off) ret = regmap_write_bits(chip->regmap, CY8C95X0_DRV_HIZ, bit, bit); if (ret) goto out; - clear_bit(off, chip->push_pull); + + __clear_bit(off, chip->push_pull); } out: @@ -775,27 +776,27 @@ static int cy8c95x0_gpio_set_pincfg(struct cy8c95x0_pinctrl *chip, switch (param) { case PIN_CONFIG_BIAS_PULL_UP: - clear_bit(off, chip->push_pull); + __clear_bit(off, chip->push_pull); reg = CY8C95X0_DRV_PU; break; case PIN_CONFIG_BIAS_PULL_DOWN: - clear_bit(off, chip->push_pull); + __clear_bit(off, chip->push_pull); reg = CY8C95X0_DRV_PD; break; case PIN_CONFIG_BIAS_DISABLE: - clear_bit(off, chip->push_pull); + __clear_bit(off, chip->push_pull); reg = CY8C95X0_DRV_HIZ; break; case PIN_CONFIG_DRIVE_OPEN_DRAIN: - clear_bit(off, chip->push_pull); + __clear_bit(off, chip->push_pull); reg = CY8C95X0_DRV_ODL; break; case PIN_CONFIG_DRIVE_OPEN_SOURCE: - clear_bit(off, chip->push_pull); + __clear_bit(off, chip->push_pull); reg = CY8C95X0_DRV_ODH; break; case PIN_CONFIG_DRIVE_PUSH_PULL: - set_bit(off, chip->push_pull); + __set_bit(off, chip->push_pull); reg = CY8C95X0_DRV_PP_FAST; break; case PIN_CONFIG_MODE_PWM: -- GitLab From ee6cac37368b7ec2b3f798fb7d6d4ce7a62db537 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 16 Sep 2022 23:54:50 +0300 Subject: [PATCH 0389/2223] pinctrl: cy8c95x0: Align function names in cy8c95x0_pmxops Align the function names in the cy8c95x0_pmxops() to follow the struct pinmux_ops members naming 
schema. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220916205450.86278-3-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index 367a9386dfb7d..68509a2301b8f 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -1103,7 +1103,7 @@ static const struct pinctrl_ops cy8c95x0_pinctrl_ops = { .pin_dbg_show = cy8c95x0_pin_dbg_show, }; -static const char *cy8c95x0_get_functions_name(struct pinctrl_dev *pctldev, unsigned int selector) +static const char *cy8c95x0_get_function_name(struct pinctrl_dev *pctldev, unsigned int selector) { return cy8c95x0_get_fname(selector); } @@ -1113,9 +1113,9 @@ static int cy8c95x0_get_functions_count(struct pinctrl_dev *pctldev) return 2; } -static int cy8c95x0_get_groups(struct pinctrl_dev *pctldev, unsigned int selector, - const char * const **groups, - unsigned int * const num_groups) +static int cy8c95x0_get_function_groups(struct pinctrl_dev *pctldev, unsigned int selector, + const char * const **groups, + unsigned int * const num_groups) { struct cy8c95x0_pinctrl *chip = pinctrl_dev_get_drvdata(pctldev); @@ -1164,8 +1164,8 @@ static int cy8c95x0_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, static const struct pinmux_ops cy8c95x0_pmxops = { .get_functions_count = cy8c95x0_get_functions_count, - .get_function_name = cy8c95x0_get_functions_name, - .get_function_groups = cy8c95x0_get_groups, + .get_function_name = cy8c95x0_get_function_name, + .get_function_groups = cy8c95x0_get_function_groups, .set_mux = cy8c95x0_set_mux, .strict = true, }; -- GitLab From 670f8ce56dd0632dc29a0322e188cc73ce3c6b92 Mon Sep 17 00:00:00 2001 From: Andrew Price Date: Wed, 17 Aug 2022 13:22:00 +0100 Subject: [PATCH 0390/2223] gfs2: Check sb_bsize_shift after reading superblock Fuzzers like 
to scribble over sb_bsize_shift but in reality it's very unlikely that this field would be corrupted on its own. Nevertheless it should be checked to avoid the possibility of messy mount errors due to bad calculations. It's always a fixed value based on the block size so we can just check that it's the expected value. Tested with: mkfs.gfs2 -O -p lock_nolock /dev/vdb for i in 0 -1 64 65 32 33; do gfs2_edit -p sb field sb_bsize_shift $i /dev/vdb mount /dev/vdb /mnt/test && umount /mnt/test done Before this patch we get a withdraw after [ 76.413681] gfs2: fsid=loop0.0: fatal: invalid metadata block [ 76.413681] bh = 19 (type: exp=5, found=4) [ 76.413681] function = gfs2_meta_buffer, file = fs/gfs2/meta_io.c, line = 492 and with UBSAN configured we also get complaints like [ 76.373395] UBSAN: shift-out-of-bounds in fs/gfs2/ops_fstype.c:295:19 [ 76.373815] shift exponent 4294967287 is too large for 64-bit type 'long unsigned int' After the patch, these complaints don't appear, mount fails immediately and we get an explanation in dmesg. Reported-by: syzbot+dcf33a7aae997956fe06@syzkaller.appspotmail.com Signed-off-by: Andrew Price Signed-off-by: Andreas Gruenbacher --- fs/gfs2/ops_fstype.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 236b59ef93b68..c7e2e62383668 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -178,7 +178,10 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) pr_warn("Invalid block size\n"); return -EINVAL; } - + if (sb->sb_bsize_shift != ffs(sb->sb_bsize) - 1) { + pr_warn("Invalid block size shift\n"); + return -EINVAL; + } return 0; } -- GitLab From 74b1b10e29b1f25e1a081fa82733baea65429d53 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 30 Aug 2022 13:52:13 -0500 Subject: [PATCH 0391/2223] gfs2: Register fs after creating workqueues Before this patch, the gfs2 file system was registered prior to creating the three workqueues. 
In some cases this allowed dlm to send recovery work to a workqueue that did not yet exist because gfs2 was still initializing. This patch changes the order of gfs2's initialization routine so it only registers the file system after the work queues are created. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/main.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 14ae9de762772..afcb32854f142 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -151,14 +151,6 @@ static int __init init_gfs2_fs(void) if (error) goto fail_shrinker; - error = register_filesystem(&gfs2_fs_type); - if (error) - goto fail_fs1; - - error = register_filesystem(&gfs2meta_fs_type); - if (error) - goto fail_fs2; - error = -ENOMEM; gfs_recovery_wq = alloc_workqueue("gfs_recovery", WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); @@ -180,11 +172,23 @@ static int __init init_gfs2_fs(void) goto fail_mempool; gfs2_register_debugfs(); + error = register_filesystem(&gfs2_fs_type); + if (error) + goto fail_fs1; + + error = register_filesystem(&gfs2meta_fs_type); + if (error) + goto fail_fs2; + pr_info("GFS2 installed\n"); return 0; +fail_fs2: + unregister_filesystem(&gfs2_fs_type); +fail_fs1: + mempool_destroy(gfs2_page_pool); fail_mempool: destroy_workqueue(gfs2_freeze_wq); fail_wq3: @@ -192,10 +196,6 @@ fail_wq3: fail_wq2: destroy_workqueue(gfs_recovery_wq); fail_wq1: - unregister_filesystem(&gfs2meta_fs_type); -fail_fs2: - unregister_filesystem(&gfs2_fs_type); -fail_fs1: unregister_shrinker(&gfs2_qd_shrinker); fail_shrinker: kmem_cache_destroy(gfs2_trans_cachep); -- GitLab From a84b280f195df83124eea755132df072c1e15c46 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 19 Sep 2022 14:14:28 +0800 Subject: [PATCH 0392/2223] nvdimm/region: Fix kernel-doc drivers/nvdimm/region_devs.c:1103: warning: expecting prototype for nvdimm_flush(). Prototype was for generic_nvdimm_flush() instead. 
Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2209 Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/20220919061428.102883-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Dan Williams --- drivers/nvdimm/region_devs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 473a71bbd9c9e..70f1a23cbe31d 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -1096,7 +1096,7 @@ int nvdimm_flush(struct nd_region *nd_region, struct bio *bio) return rc; } /** - * nvdimm_flush - flush any posted write queues between the cpu and pmem media + * generic_nvdimm_flush() - flush any posted write queues between the cpu and pmem media * @nd_region: interleaved pmem region */ int generic_nvdimm_flush(struct nd_region *nd_region) -- GitLab From 7912d30fbb1a9df7e99eb7a5991582512e65927c Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 14 Sep 2022 14:12:51 +0800 Subject: [PATCH 0393/2223] nvdimm: make __nvdimm_security_overwrite_query static This symbol is not used outside of security.c, so marks it static. drivers/nvdimm/security.c:411:6: warning: no previous prototype for function '__nvdimm_security_overwrite_query'. 
Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2148 Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/20220914061251.42052-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Dan Williams --- drivers/nvdimm/security.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvdimm/security.c b/drivers/nvdimm/security.c index b5aa55c614616..8aefb60c42fff 100644 --- a/drivers/nvdimm/security.c +++ b/drivers/nvdimm/security.c @@ -408,7 +408,7 @@ static int security_overwrite(struct nvdimm *nvdimm, unsigned int keyid) return rc; } -void __nvdimm_security_overwrite_query(struct nvdimm *nvdimm) +static void __nvdimm_security_overwrite_query(struct nvdimm *nvdimm) { struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nvdimm->dev); int rc; -- GitLab From 23a2d0c5944896ce9123f36ab62d7ca64c8b25ff Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 3 Aug 2022 04:19:18 +0800 Subject: [PATCH 0394/2223] nvdimm/namespace: Fix comment typo The double `existing' is duplicated in the comment, remove one. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20220802201918.8408-1-wangborong@cdjrlc.com Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index bf4f5c09d9b1b..847816992b9e6 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -388,7 +388,7 @@ static resource_size_t init_dpa_allocation(struct nd_label_id *label_id, * * BLK-space is valid as long as it does not precede a PMEM * allocation in a given region. PMEM-space must be contiguous - * and adjacent to an existing existing allocation (if one + * and adjacent to an existing allocation (if one * exists). If reserving PMEM any space is valid. 
*/ static void space_valid(struct nd_region *nd_region, struct nvdimm_drvdata *ndd, -- GitLab From 8066cc86b7aaaf6b4b38a81932459c6450440daa Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 5 Sep 2022 11:02:27 +0300 Subject: [PATCH 0395/2223] PCI: Fix used_buses calculation in pci_scan_child_bus_extend() pci_scan_bridge_extend() returns the subordinate bus number needed to cover all the buses below a bridge. pci_scan_child_bus_extend() computes the number of buses to reserve by comparing that with the current max bus number. Previously it did the subtraction in the wrong order, so 'used_buses' was nonsense. Subtract 'max' from 'cmax' as is done for the similar pci_scan_bridge_extend() call in the following block. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216000 Fixes: 3374c545c27c ("PCI: Account for all bridges on bus when distributing bus numbers") Link: https://lore.kernel.org/r/20220905080232.36087-2-mika.westerberg@linux.intel.com Reported-by: Chris Chiu Tested-by: Chris Chiu Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko --- drivers/pci/probe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index c5286b027f00d..4f940dcd102cb 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2920,8 +2920,8 @@ static unsigned int pci_scan_child_bus_extend(struct pci_bus *bus, * hotplug bridges too much during the second scan below. 
*/ used_buses++; - if (cmax - max > 1) - used_buses += cmax - max - 1; + if (max - cmax > 1) + used_buses += max - cmax - 1; } /* Scan bridges that need to be reconfigured */ -- GitLab From 27ef523a6653b35270296114dc50a9f630d896a9 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 13 Jan 2022 09:36:22 +0800 Subject: [PATCH 0396/2223] ubifs: Fix ubifs_check_dir_empty() kernel-doc comment Fix function name in fs/ubifs/dir.c kernel-doc comment to remove warning found by running scripts/kernel-doc, which is caused by using 'make W=1'. fs/ubifs/dir.c:883: warning: expecting prototype for check_dir_empty(). Prototype was for ubifs_check_dir_empty() instead Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 86151889548e3..7306f88b2c7e8 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -872,7 +872,7 @@ out_fname: } /** - * check_dir_empty - check if a directory is empty or not. + * ubifs_check_dir_empty - check if a directory is empty or not. * @dir: VFS inode object of the directory to check * * This function checks if directory @dir is empty. Returns zero if the -- GitLab From 6c97bb345f163e45a8e4a14acc9391be0beaa6bb Mon Sep 17 00:00:00 2001 From: Li zeming Date: Wed, 17 Aug 2022 09:14:06 +0800 Subject: [PATCH 0397/2223] ubi: block: Remove in vain semicolon Remove the repeated ';' from code, it is not needed. 
Signed-off-by: Li zeming Reviewed-by: Zhihao Cheng [rw: Massaged commit message a bit] Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 4cf67a2a0d04b..4fc7e756d8858 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -409,7 +409,7 @@ int ubiblock_create(struct ubi_volume_info *vi) ret = blk_mq_alloc_tag_set(&dev->tag_set); if (ret) { dev_err(disk_to_dev(dev->gd), "blk_mq_alloc_tag_set failed"); - goto out_free_dev;; + goto out_free_dev; } -- GitLab From 818f9e8353c8e60dd0876bdac445e8fad346e50a Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 11 Aug 2022 21:57:30 +0800 Subject: [PATCH 0398/2223] ubi: ubi-media.h: Fix comment typo The double `the' is duplicated in the comment, remove one. Signed-off-by: Jason Wang Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/ubi-media.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/ubi/ubi-media.h b/drivers/mtd/ubi/ubi-media.h index 386db0598e954..2c9cd3b6434f4 100644 --- a/drivers/mtd/ubi/ubi-media.h +++ b/drivers/mtd/ubi/ubi-media.h @@ -131,7 +131,7 @@ enum { * is changed radically. This field is duplicated in the volume identifier * header. * - * The @vid_hdr_offset and @data_offset fields contain the offset of the the + * The @vid_hdr_offset and @data_offset fields contain the offset of the * volume identifier header and user data, relative to the beginning of the * physical eraseblock. These values have to be the same for all physical * eraseblocks. -- GitLab From ec1f97f501a746403990515bbefcaecd7562b042 Mon Sep 17 00:00:00 2001 From: Jilin Yuan Date: Wed, 10 Aug 2022 21:38:56 +0800 Subject: [PATCH 0399/2223] ubi: Fix repeated words in comments Delete the redundant word 'a'. Delete the redundant word 'the'. 
Signed-off-by: Jilin Yuan Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/cdev.c | 2 +- drivers/mtd/ubi/eba.c | 2 +- drivers/mtd/ubi/io.c | 2 +- drivers/mtd/ubi/ubi.h | 6 +++--- drivers/mtd/ubi/wl.c | 6 +++--- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c index cc9a28cf9d827..1d3bbcfb4bb59 100644 --- a/drivers/mtd/ubi/cdev.c +++ b/drivers/mtd/ubi/cdev.c @@ -672,7 +672,7 @@ static int verify_rsvol_req(const struct ubi_device *ubi, * @req: volumes re-name request * * This is a helper function for the volume re-name IOCTL which validates the - * the request, opens the volume and calls corresponding volumes management + * request, opens the volume and calls corresponding volumes management * function. Returns zero in case of success and a negative error code in case * of failure. */ diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c index ccc5979642b78..09c408c45a621 100644 --- a/drivers/mtd/ubi/eba.c +++ b/drivers/mtd/ubi/eba.c @@ -377,7 +377,7 @@ static int leb_write_lock(struct ubi_device *ubi, int vol_id, int lnum) * * This function locks a logical eraseblock for writing if there is no * contention and does nothing if there is contention. Returns %0 in case of - * success, %1 in case of contention, and and a negative error code in case of + * success, %1 in case of contention, and a negative error code in case of * failure. */ static int leb_write_trylock(struct ubi_device *ubi, int vol_id, int lnum) diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c index 8a7306cc19471..01b6448612533 100644 --- a/drivers/mtd/ubi/io.c +++ b/drivers/mtd/ubi/io.c @@ -1147,7 +1147,7 @@ fail: * @ubi: UBI device description object * @pnum: the physical eraseblock number to check * - * This function returns zero if the erase counter header is all right and and + * This function returns zero if the erase counter header is all right and * a negative error code if not or if an error occurred. 
*/ static int self_check_peb_ec_hdr(const struct ubi_device *ubi, int pnum) diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index 078112e23dfd5..0110eb3d4db6e 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -86,7 +86,7 @@ void ubi_err(const struct ubi_device *ubi, const char *fmt, ...); * Error codes returned by the I/O sub-system. * * UBI_IO_FF: the read region of flash contains only 0xFFs - * UBI_IO_FF_BITFLIPS: the same as %UBI_IO_FF, but also also there was a data + * UBI_IO_FF_BITFLIPS: the same as %UBI_IO_FF, but also there was a data * integrity error reported by the MTD driver * (uncorrectable ECC error in case of NAND) * UBI_IO_BAD_HDR: the EC or VID header is corrupted (bad magic or CRC) @@ -281,7 +281,7 @@ struct ubi_eba_leb_desc { /** * struct ubi_volume - UBI volume description data structure. - * @dev: device object to make use of the the Linux device model + * @dev: device object to make use of the Linux device model * @cdev: character device object to create character device * @ubi: reference to the UBI device description object * @vol_id: volume ID @@ -439,7 +439,7 @@ struct ubi_debug_info { /** * struct ubi_device - UBI device description structure - * @dev: UBI device object to use the the Linux device model + * @dev: UBI device object to use the Linux device model * @cdev: character device object to create character device * @ubi_num: UBI device number * @ubi_name: UBI device name diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index 55bae06cf4083..0fadd55beee0f 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -376,7 +376,7 @@ static struct ubi_wl_entry *find_mean_wl_entry(struct ubi_device *ubi, * refill_wl_user_pool(). * @ubi: UBI device description object * - * This function returns a a wear leveling entry in case of success and + * This function returns a wear leveling entry in case of success and * NULL in case of failure. 
*/ static struct ubi_wl_entry *wl_get_wle(struct ubi_device *ubi) @@ -429,7 +429,7 @@ static int prot_queue_del(struct ubi_device *ubi, int pnum) /** * sync_erase - synchronously erase a physical eraseblock. * @ubi: UBI device description object - * @e: the the physical eraseblock to erase + * @e: the physical eraseblock to erase * @torture: if the physical eraseblock has to be tortured * * This function returns zero in case of success and a negative error code in @@ -1016,7 +1016,7 @@ static int ensure_wear_leveling(struct ubi_device *ubi, int nested) /* * If the ubi->scrub tree is not empty, scrubbing is needed, and the - * the WL worker has to be scheduled anyway. + * WL worker has to be scheduled anyway. */ if (!ubi->scrub.rb_node) { #ifdef CONFIG_MTD_UBI_FASTMAP -- GitLab From b58b25280003f078f9b861656ca097074267f75a Mon Sep 17 00:00:00 2001 From: Zhang Jiaming Date: Mon, 4 Jul 2022 10:13:56 +0800 Subject: [PATCH 0400/2223] ubi: fastmap: Fix typo in comments There are a typo(dont't) in comments. Fix it. Signed-off-by: Zhang Jiaming Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/wl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index 0fadd55beee0f..68eb0f21b3fe2 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -1464,7 +1464,7 @@ static bool scrub_possible(struct ubi_device *ubi, struct ubi_wl_entry *e) * ubi_bitflip_check - Check an eraseblock for bitflips and scrub it if needed. * @ubi: UBI device description object * @pnum: the physical eraseblock to schedule - * @force: dont't read the block, assume bitflips happened and take action. + * @force: don't read the block, assume bitflips happened and take action. * * This function reads the given eraseblock and checks if bitflips occured. * In case of bitflips, the eraseblock is scheduled for scrubbing. 
-- GitLab From e079be2c354aede45ce130027264611b276dcef1 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Mon, 14 Mar 2022 12:53:37 +0100 Subject: [PATCH 0401/2223] ubi: block: Fix typos in comments Various spelling mistakes in comments. Detected with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 4fc7e756d8858..75eaecc8639f0 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -441,7 +441,7 @@ int ubiblock_create(struct ubi_volume_info *vi) /* * Create one workqueue per volume (per registered block device). - * Rembember workqueues are cheap, they're not threads. + * Remember workqueues are cheap, they're not threads. */ dev->wq = alloc_workqueue("%s", 0, 0, gd->disk_name); if (!dev->wq) { -- GitLab From 019ac05e4c97f51913318d9b0e2ffd34db917d3c Mon Sep 17 00:00:00 2001 From: Jiang Jian Date: Thu, 23 Jun 2022 15:19:53 +0800 Subject: [PATCH 0402/2223] mtd: ubi: drop unexpected word 'a' in comments there is an unexpected word 'a' in the comments that need to be dropped file - drivers/mtd/ubi/vmt.c line - 626,779 * Returns zero if volume is all right and a a negative error code if not. changed to: * Returns zero if volume is all right and a negative error code if not. Signed-off-by: Jiang Jian Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/vmt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 6ea95ade4ca6b..8fcc0bdf06358 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -623,7 +623,7 @@ void ubi_free_volume(struct ubi_device *ubi, struct ubi_volume *vol) * @ubi: UBI device description object * @vol_id: volume ID * - * Returns zero if volume is all right and a a negative error code if not. + * Returns zero if volume is all right and a negative error code if not. 
*/ static int self_check_volume(struct ubi_device *ubi, int vol_id) { @@ -776,7 +776,7 @@ fail: * self_check_volumes - check information about all volumes. * @ubi: UBI device description object * - * Returns zero if volumes are all right and a a negative error code if not. + * Returns zero if volumes are all right and a negative error code if not. */ static int self_check_volumes(struct ubi_device *ubi) { -- GitLab From 713346ca1db2bebd4c7c4d5ea364ed03d504f5ed Mon Sep 17 00:00:00 2001 From: ZhaoLong Wang Date: Sat, 9 Jul 2022 16:40:32 +0800 Subject: [PATCH 0403/2223] ubifs: Fix UBIFS ro fail due to truncate in the encrypted directory The ubifs_compress() function does not compress the data when the data length is shorter than 128 bytes or the compressed data length is not ideal. As a result, the compressed length of the truncated data in the truncate_data_node() function may be greater than the length of the raw data read from the flash. The above two lengths are transferred to the ubifs_encrypt() function as parameters. This may lead to assertion failures, after which the file system becomes read-only. This patch uses the actual length of the data in the memory as the input parameter for assert comparison, which avoids the problem. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216213 Signed-off-by: ZhaoLong Wang Signed-off-by: Richard Weinberger --- fs/ubifs/crypto.c | 11 +++++++++++ fs/ubifs/journal.c | 28 +++++++++++++++++----------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index c57b46a352d8f..3125e76376ee6 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -24,6 +24,17 @@ static bool ubifs_crypt_empty_dir(struct inode *inode) return ubifs_check_dir_empty(inode) == 0; } +/** + * ubifs_encrypt - Encrypt data.
+ * @inode: inode which refers to the data node + * @dn: data node to encrypt + * @in_len: length of data to be compressed + * @out_len: allocated memory size for the data area of @dn + * @block: logical block number of the block + * + * This function encrypt a possibly-compressed data in the data node. + * The encrypted data length will store in @out_len. + */ int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, unsigned int in_len, unsigned int *out_len, int block) { diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 75dab0ae3939d..2b1d7c4297bf2 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1472,23 +1472,25 @@ out_free: * @block: data block number * @dn: data node to re-compress * @new_len: new length + * @dn_size: size of the data node @dn in memory * * This function is used when an inode is truncated and the last data node of * the inode has to be re-compressed/encrypted and re-written. */ static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode, unsigned int block, struct ubifs_data_node *dn, - int *new_len) + int *new_len, int dn_size) { void *buf; - int err, dlen, compr_type, out_len, old_dlen; + int err, dlen, compr_type, out_len, data_size; out_len = le32_to_cpu(dn->size); buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS); if (!buf) return -ENOMEM; - dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + data_size = dn_size - UBIFS_DATA_NODE_SZ; compr_type = le16_to_cpu(dn->compr_type); if (IS_ENCRYPTED(inode)) { @@ -1508,11 +1510,11 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in } if (IS_ENCRYPTED(inode)) { - err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block); + err = ubifs_encrypt(inode, dn, out_len, &data_size, block); if (err) goto out; - out_len = old_dlen; + out_len = data_size; } else { dn->compr_size = 0; } @@ -1550,6 +1552,7 @@ int ubifs_jnl_truncate(struct 
ubifs_info *c, const struct inode *inode, struct ubifs_trun_node *trun; struct ubifs_data_node *dn; int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode); + int dn_size; struct ubifs_inode *ui = ubifs_inode(inode); ino_t inum = inode->i_ino; unsigned int blk; @@ -1562,10 +1565,13 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, ubifs_assert(c, S_ISREG(inode->i_mode)); ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); - sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + - UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR; + dn_size = COMPRESSED_DATA_NODE_BUF_SZ; - sz += ubifs_auth_node_sz(c); + if (IS_ENCRYPTED(inode)) + dn_size += UBIFS_CIPHER_BLOCK_SIZE; + + sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + + dn_size + ubifs_auth_node_sz(c); ino = kmalloc(sz, GFP_NOFS); if (!ino) @@ -1596,15 +1602,15 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) { ubifs_err(c, "bad data node (block %u, inode %lu)", blk, inode->i_ino); - ubifs_dump_node(c, dn, sz - UBIFS_INO_NODE_SZ - - UBIFS_TRUN_NODE_SZ); + ubifs_dump_node(c, dn, dn_size); goto out_free; } if (dn_len <= dlen) dlen = 0; /* Nothing to do */ else { - err = truncate_data_node(c, inode, blk, dn, &dlen); + err = truncate_data_node(c, inode, blk, dn, + &dlen, dn_size); if (err) goto out_free; } -- GitLab From a0c51565730729f0df2ee886e34b4da6d359a10b Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Tue, 19 Jul 2022 16:00:17 +0800 Subject: [PATCH 0404/2223] ubifs: Fix AA deadlock when setting xattr for encrypted file Following process: vfs_setxattr(host) ubifs_xattr_set down_write(host_ui->xattr_sem) <- lock first time create_xattr ubifs_new_inode(host) fscrypt_prepare_new_inode(host) fscrypt_policy_to_inherit(host) if (IS_ENCRYPTED(inode)) fscrypt_require_key(host) fscrypt_get_encryption_info(host) ubifs_xattr_get(host) down_read(host_ui->xattr_sem) <- AA deadlock , which may trigger an AA deadlock problem: [ 102.620871] INFO: 
task setfattr:1599 blocked for more than 10 seconds. [ 102.625298] Not tainted 5.19.0-rc7-00001-gb666b6823ce0-dirty #711 [ 102.628732] task:setfattr state:D stack: 0 pid: 1599 [ 102.628749] Call Trace: [ 102.628753] [ 102.628776] __schedule+0x482/0x1060 [ 102.629964] schedule+0x92/0x1a0 [ 102.629976] rwsem_down_read_slowpath+0x287/0x8c0 [ 102.629996] down_read+0x84/0x170 [ 102.630585] ubifs_xattr_get+0xd1/0x370 [ubifs] [ 102.630730] ubifs_crypt_get_context+0x1f/0x30 [ubifs] [ 102.630791] fscrypt_get_encryption_info+0x7d/0x1c0 [ 102.630810] fscrypt_policy_to_inherit+0x56/0xc0 [ 102.630817] fscrypt_prepare_new_inode+0x35/0x160 [ 102.630830] ubifs_new_inode+0xcc/0x4b0 [ubifs] [ 102.630873] ubifs_xattr_set+0x591/0x9f0 [ubifs] [ 102.630961] xattr_set+0x8c/0x3e0 [ubifs] [ 102.631003] __vfs_setxattr+0x71/0xc0 [ 102.631026] vfs_setxattr+0x105/0x270 [ 102.631034] do_setxattr+0x6d/0x110 [ 102.631041] setxattr+0xa0/0xd0 [ 102.631087] __x64_sys_setxattr+0x2f/0x40 A reproducer is available in [Link]. ext4 skips encryption for inodes with the EXT4_EA_INODE_FL flag; do the same here and stop encrypting xattr inodes in ubifs. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216260 Fixes: f4e3634a3b64222 ("ubifs: Fix races between xattr_{set|get} ...") Fixes: d475a507457b5ca ("ubifs: Add skeleton for fscrypto") Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 25 ++++++++++++++----------- fs/ubifs/ubifs.h | 2 +- fs/ubifs/xattr.c | 2 +- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 7306f88b2c7e8..8d0f68f8907c8 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -68,13 +68,14 @@ static int inherit_flags(const struct inode *dir, umode_t mode) * @c: UBIFS file-system description object * @dir: parent directory inode * @mode: inode mode flags + * @is_xattr: whether the inode is xattr inode * * This function finds an unused inode number, allocates new inode and * initializes it.
Returns new inode in case of success and an error code in * case of failure. */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode) + umode_t mode, bool is_xattr) { int err; struct inode *inode; @@ -99,10 +100,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, current_time(inode); inode->i_mapping->nrpages = 0; - err = fscrypt_prepare_new_inode(dir, inode, &encrypted); - if (err) { - ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); - goto out_iput; + if (!is_xattr) { + err = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (err) { + ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); + goto out_iput; + } } switch (mode & S_IFMT) { @@ -309,7 +312,7 @@ static int ubifs_create(struct user_namespace *mnt_userns, struct inode *dir, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -370,7 +373,7 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) if (err) return ERR_PTR(err); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_free; @@ -462,7 +465,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return err; } - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; @@ -1004,7 +1007,7 @@ static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, S_IFDIR | mode); + inode = ubifs_new_inode(c, dir, S_IFDIR | mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -1091,7 +1094,7 @@ static int ubifs_mknod(struct user_namespace *mnt_userns, struct inode *dir, sz_change = 
CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { kfree(dev); err = PTR_ERR(inode); @@ -1173,7 +1176,7 @@ static int ubifs_symlink(struct user_namespace *mnt_userns, struct inode *dir, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 7d6d2f152e039..478bbbb5382f8 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -2026,7 +2026,7 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode); + umode_t mode, bool is_xattr); int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int ubifs_check_dir_empty(struct inode *dir); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index e4c4761aff7f8..3db8486e3725e 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -110,7 +110,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, if (err) return err; - inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO); + inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO, true); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; -- GitLab From e7f35da21f6f8c6a8c7d262dd4e4bd32e3083f79 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 4 Jul 2022 20:46:00 +0200 Subject: [PATCH 0405/2223] ubi: fastmap: Use the bitmap API to allocate bitmaps Use bitmap_zalloc()/bitmap_free() instead of hand-writing them. It is less verbose and it improves the semantic. 
Signed-off-by: Christophe JAILLET Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/fastmap.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/mtd/ubi/fastmap.c b/drivers/mtd/ubi/fastmap.c index 6e95c4b1473e6..ca2d9efe62c3c 100644 --- a/drivers/mtd/ubi/fastmap.c +++ b/drivers/mtd/ubi/fastmap.c @@ -20,8 +20,7 @@ static inline unsigned long *init_seen(struct ubi_device *ubi) if (!ubi_dbg_chk_fastmap(ubi)) return NULL; - ret = kcalloc(BITS_TO_LONGS(ubi->peb_count), sizeof(unsigned long), - GFP_KERNEL); + ret = bitmap_zalloc(ubi->peb_count, GFP_KERNEL); if (!ret) return ERR_PTR(-ENOMEM); @@ -34,7 +33,7 @@ static inline unsigned long *init_seen(struct ubi_device *ubi) */ static inline void free_seen(unsigned long *seen) { - kfree(seen); + bitmap_free(seen); } /** @@ -1108,8 +1107,7 @@ int ubi_fastmap_init_checkmap(struct ubi_volume *vol, int leb_count) if (!ubi->fast_attach) return 0; - vol->checkmap = kcalloc(BITS_TO_LONGS(leb_count), sizeof(unsigned long), - GFP_KERNEL); + vol->checkmap = bitmap_zalloc(leb_count, GFP_KERNEL); if (!vol->checkmap) return -ENOMEM; @@ -1118,7 +1116,7 @@ int ubi_fastmap_init_checkmap(struct ubi_volume *vol, int leb_count) void ubi_fastmap_destroy_checkmap(struct ubi_volume *vol) { - kfree(vol->checkmap); + bitmap_free(vol->checkmap); } /** -- GitLab From fd6dd9584ed3ee6debf2e7f9c9e69ef09b368277 Mon Sep 17 00:00:00 2001 From: Bernardo Rodrigues Date: Sun, 5 Dec 2021 18:00:49 -0300 Subject: [PATCH 0406/2223] leds: pca963x: fix blink with hw acceleration LEDs would behave differently depending on the blink hardware acceleration configuration. This commit will make LEDs respond exactly the same independently of the hardware acceleration status. In other words, if you had two pca963x, side by side, one with blink hardware acceleration "ON" and the other "OFF", and performed some arbitrary sequence of API calls (e.g. turn on/off, change brightness, change blink mode, etc.)
you probably would end with not matching LED states. 'pca963x software blink' and 'leds-gpio' behavior were used as reference. Actual chip used to validate this change: pca9634 Some of the unmatched behaviors being fixed are (when hw blink was "ON") - Leds would stop blinking when the brightness was changed. - Leds would persist their blinking mode even after being turned off (brightness = 0). - Leds would only blink if another led was solid (pca963x will be forced out of low power) Signed-off-by: Bernardo Rodrigues Signed-off-by: Pavel Machek --- drivers/leds/leds-pca963x.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/leds/leds-pca963x.c b/drivers/leds/leds-pca963x.c index 00aecd67e3483..d8d866bcda19f 100644 --- a/drivers/leds/leds-pca963x.c +++ b/drivers/leds/leds-pca963x.c @@ -101,6 +101,7 @@ struct pca963x_led { struct pca963x *chip; struct led_classdev led_cdev; int led_num; /* 0 .. 15 potentially */ + bool blinking; u8 gdc; u8 gfrq; }; @@ -129,12 +130,21 @@ static int pca963x_brightness(struct pca963x_led *led, switch (brightness) { case LED_FULL: - val = (ledout & ~mask) | (PCA963X_LED_ON << shift); + if (led->blinking) { + val = (ledout & ~mask) | (PCA963X_LED_GRP_PWM << shift); + ret = i2c_smbus_write_byte_data(client, + PCA963X_PWM_BASE + + led->led_num, + LED_FULL); + } else { + val = (ledout & ~mask) | (PCA963X_LED_ON << shift); + } ret = i2c_smbus_write_byte_data(client, ledout_addr, val); break; case LED_OFF: val = ledout & ~mask; ret = i2c_smbus_write_byte_data(client, ledout_addr, val); + led->blinking = false; break; default: ret = i2c_smbus_write_byte_data(client, @@ -144,7 +154,11 @@ static int pca963x_brightness(struct pca963x_led *led, if (ret < 0) return ret; - val = (ledout & ~mask) | (PCA963X_LED_PWM << shift); + if (led->blinking) + val = (ledout & ~mask) | (PCA963X_LED_GRP_PWM << shift); + else + val = (ledout & ~mask) | (PCA963X_LED_PWM << shift); + ret = 
i2c_smbus_write_byte_data(client, ledout_addr, val); break; } @@ -181,6 +195,7 @@ static void pca963x_blink(struct pca963x_led *led) } mutex_unlock(&led->chip->mutex); + led->blinking = true; } static int pca963x_power_state(struct pca963x_led *led) @@ -275,6 +290,8 @@ static int pca963x_blink_set(struct led_classdev *led_cdev, led->gfrq = gfrq; pca963x_blink(led); + led->led_cdev.brightness = LED_FULL; + pca963x_led_set(led_cdev, LED_FULL); *delay_on = time_on; *delay_off = time_off; @@ -337,6 +354,7 @@ static int pca963x_register_leds(struct i2c_client *client, led->led_cdev.brightness_set_blocking = pca963x_led_set; if (hw_blink) led->led_cdev.blink_set = pca963x_blink_set; + led->blinking = false; init_data.fwnode = child; /* for backwards compatibility */ -- GitLab From 31fd7108302388d732973c58470d4be559d352ec Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Thu, 17 Feb 2022 18:43:57 +0100 Subject: [PATCH 0407/2223] dt-bindings: leds: Document mmc trigger The mmc subsystem supports triggering leds on card activity, document the trigger value here. The value is a pattern in this case. Signed-off-by: Marek Vasut Cc: Jacek Anaszewski Cc: Pavel Machek Cc: Rob Herring Cc: devicetree@vger.kernel.org To: linux-leds@vger.kernel.org Signed-off-by: Pavel Machek Reviewed-by: Rob Herring --- .../devicetree/bindings/leds/common.yaml | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/Documentation/devicetree/bindings/leds/common.yaml b/Documentation/devicetree/bindings/leds/common.yaml index 328952d7acbbc..3c14a98430e19 100644 --- a/Documentation/devicetree/bindings/leds/common.yaml +++ b/Documentation/devicetree/bindings/leds/common.yaml @@ -79,24 +79,27 @@ properties: the LED. 
$ref: /schemas/types.yaml#/definitions/string - enum: - # LED will act as a back-light, controlled by the framebuffer system - - backlight - # LED will turn on (but for leds-gpio see "default-state" property in - # Documentation/devicetree/bindings/leds/leds-gpio.yaml) - - default-on - # LED "double" flashes at a load average based rate - - heartbeat - # LED indicates disk activity - - disk-activity - # LED indicates IDE disk activity (deprecated), in new implementations - # use "disk-activity" - - ide-disk - # LED flashes at a fixed, configurable rate - - timer - # LED alters the brightness for the specified duration with one software - # timer (requires "led-pattern" property) - - pattern + oneOf: + - enum: + # LED will act as a back-light, controlled by the framebuffer system + - backlight + # LED will turn on (but for leds-gpio see "default-state" property in + # Documentation/devicetree/bindings/leds/leds-gpio.yaml) + - default-on + # LED "double" flashes at a load average based rate + - heartbeat + # LED indicates disk activity + - disk-activity + # LED indicates IDE disk activity (deprecated), in new implementations + # use "disk-activity" + - ide-disk + # LED flashes at a fixed, configurable rate + - timer + # LED alters the brightness for the specified duration with one software + # timer (requires "led-pattern" property) + - pattern + # LED is triggered by SD/MMC activity + - pattern: "^mmc[0-9]+$" led-pattern: description: | -- GitLab From 669d204469c46e91d99da24914130f78277a71d3 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 17 Aug 2022 11:27:35 +0800 Subject: [PATCH 0408/2223] ubi: fastmap: Add fastmap control support for 'UBI_IOCATT' ioctl [1] suggests that fastmap is suitable for large flash devices. Module parameter 'fm_autoconvert' is a coarse grained switch to enable all ubi devices to generate fastmap, which may turn on fastmap even for small flash devices. 
This patch imports a new field 'disable_fm' in struct 'ubi_attach_req' to support the following situations by ioctl 'UBI_IOCATT'. [old functions] A. Disable 'fm_autoconvert': Disable fastmap for all ubi devices B. Enable 'fm_autoconvert': Enable fastmap for all ubi devices [new function] C. Enable 'fm_autoconvert', set 'disable_fm' for given device: Don't create new fastmap and do full scan (an existing fastmap will be destroyed) for the given ubi device. A simple test case in [2]. [1] http://www.linux-mtd.infradead.org/doc/ubi.html#L_fastmap [2] https://bugzilla.kernel.org/show_bug.cgi?id=216278 Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/build.c | 14 ++++++++++---- drivers/mtd/ubi/cdev.c | 2 +- drivers/mtd/ubi/ubi.h | 3 ++- include/uapi/mtd/ubi-user.h | 8 +++++++- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index a32050fecabf3..a901f8edfa41d 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -807,6 +807,7 @@ static int autoresize(struct ubi_device *ubi, int vol_id) * @ubi_num: number to assign to the new UBI device * @vid_hdr_offset: VID header offset * @max_beb_per1024: maximum expected number of bad PEB per 1024 PEBs + * @disable_fm: whether disable fastmap * * This function attaches MTD device @mtd_dev to UBI and assign @ubi_num number * to the newly created UBI device, unless @ubi_num is %UBI_DEV_NUM_AUTO, in @@ -814,11 +815,15 @@ static int autoresize(struct ubi_device *ubi, int vol_id) * automatically. Returns the new UBI device number in case of success and a * negative error code in case of failure. * + * If @disable_fm is true, ubi doesn't create new fastmap even the module param + * 'fm_autoconvert' is set, and existed old fastmap will be destroyed after + * doing full scanning. + * * Note, the invocations of this function has to be serialized by the * @ubi_devices_mutex.
*/ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, - int vid_hdr_offset, int max_beb_per1024) + int vid_hdr_offset, int max_beb_per1024, bool disable_fm) { struct ubi_device *ubi; int i, err; @@ -921,7 +926,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, UBI_FM_MIN_POOL_SIZE); ubi->fm_wl_pool.max_size = ubi->fm_pool.max_size / 2; - ubi->fm_disabled = !fm_autoconvert; + ubi->fm_disabled = (!fm_autoconvert || disable_fm) ? 1 : 0; if (fm_debug) ubi_enable_dbg_chk_fastmap(ubi); @@ -962,7 +967,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, if (!ubi->fm_buf) goto out_free; #endif - err = ubi_attach(ubi, 0); + err = ubi_attach(ubi, disable_fm ? 1 : 0); if (err) { ubi_err(ubi, "failed to attach mtd%d, error %d", mtd->index, err); @@ -1242,7 +1247,8 @@ static int __init ubi_init(void) mutex_lock(&ubi_devices_mutex); err = ubi_attach_mtd_dev(mtd, p->ubi_num, - p->vid_hdr_offs, p->max_beb_per1024); + p->vid_hdr_offs, p->max_beb_per1024, + false); mutex_unlock(&ubi_devices_mutex); if (err < 0) { pr_err("UBI error: cannot attach mtd%d\n", diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c index 1d3bbcfb4bb59..f43430b9c1e65 100644 --- a/drivers/mtd/ubi/cdev.c +++ b/drivers/mtd/ubi/cdev.c @@ -1041,7 +1041,7 @@ static long ctrl_cdev_ioctl(struct file *file, unsigned int cmd, */ mutex_lock(&ubi_devices_mutex); err = ubi_attach_mtd_dev(mtd, req.ubi_num, req.vid_hdr_offset, - req.max_beb_per1024); + req.max_beb_per1024, !!req.disable_fm); mutex_unlock(&ubi_devices_mutex); if (err < 0) put_mtd_device(mtd); diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index 0110eb3d4db6e..c8f1bd4fa1008 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -937,7 +937,8 @@ int ubi_io_write_vid_hdr(struct ubi_device *ubi, int pnum, /* build.c */ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, - int vid_hdr_offset, int max_beb_per1024); + int vid_hdr_offset, int max_beb_per1024, + bool disable_fm); int 
ubi_detach_mtd_dev(int ubi_num, int anyway); struct ubi_device *ubi_get_device(int ubi_num); void ubi_put_device(struct ubi_device *ubi); diff --git a/include/uapi/mtd/ubi-user.h b/include/uapi/mtd/ubi-user.h index b69e9ba6742b9..dcb179de43585 100644 --- a/include/uapi/mtd/ubi-user.h +++ b/include/uapi/mtd/ubi-user.h @@ -247,6 +247,7 @@ enum { * @vid_hdr_offset: VID header offset (use defaults if %0) * @max_beb_per1024: maximum expected number of bad PEB per 1024 PEBs * @padding: reserved for future, not used, has to be zeroed + * @disable_fm: whether disable fastmap * * This data structure is used to specify MTD device UBI has to attach and the * parameters it has to use. The number which should be assigned to the new UBI @@ -281,13 +282,18 @@ enum { * eraseblocks for new bad eraseblocks, but attempts to use available * eraseblocks (if any). The accepted range is 0-768. If 0 is given, the * default kernel value of %CONFIG_MTD_UBI_BEB_LIMIT will be used. + * + * If @disable_fm is not zero, ubi doesn't create new fastmap even the module + * param 'fm_autoconvert' is set, and existed old fastmap will be destroyed + * after doing full scanning. */ struct ubi_attach_req { __s32 ubi_num; __s32 mtd_num; __s32 vid_hdr_offset; __s16 max_beb_per1024; - __s8 padding[10]; + __s8 disable_fm; + __s8 padding[9]; }; /* -- GitLab From 49ad31e9d78527045614c534df057cadee487773 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 5 Sep 2022 11:02:28 +0300 Subject: [PATCH 0409/2223] PCI: Pass available buses even if the bridge is already configured If some part of the PCI topology is already configured (by the boot firmware) but not all, and it includes hotplug bridges, we may need to extend the bus resources of those bridges to accommodate any future hotplugs, in the same way we already do with the normal hotplug case. 
Pass the available buses to pci_scan_child_bus_extend() even when the bridge in question is already configured so the bus allocation code can use these available buses to extend the possible hotplug bridges below. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216000 Link: https://lore.kernel.org/r/20220905080232.36087-3-mika.westerberg@linux.intel.com Reported-by: Chris Chiu Tested-by: Chris Chiu Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko --- drivers/pci/probe.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 4f940dcd102cb..86130926a74f0 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1297,7 +1297,7 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev, if ((secondary || subordinate) && !pcibios_assign_all_busses() && !is_cardbus && !broken) { - unsigned int cmax; + unsigned int cmax, buses; /* * Bus already configured by firmware, process it in the @@ -1322,7 +1322,8 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev, child->bridge_ctl = bctl; } - cmax = pci_scan_child_bus(child); + buses = subordinate - secondary; + cmax = pci_scan_child_bus_extend(child, buses); if (cmax > subordinate) pci_warn(dev, "bridge has subordinate %02x but max busn %02x\n", subordinate, cmax); -- GitLab From d1caf229c7587b5c514910fff8dc382e69fdcdf5 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 5 Sep 2022 11:02:29 +0300 Subject: [PATCH 0410/2223] PCI: Move pci_assign_unassigned_root_bus_resources() We need to be able to call pci_bridge_distribute_available_resources() from this function so move it accordingly to avoid need for forward declaration. No functional impact. 
Link: https://lore.kernel.org/r/20220905080232.36087-4-mika.westerberg@linux.intel.com Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko --- drivers/pci/setup-bus.c | 226 ++++++++++++++++++++-------------------- 1 file changed, 113 insertions(+), 113 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 8cb68e6f6ef93..3b981da0fb4ee 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1745,119 +1745,6 @@ static enum enable_type pci_realloc_detect(struct pci_bus *bus, } #endif -/* - * First try will not touch PCI bridge res. - * Second and later try will clear small leaf bridge res. - * Will stop till to the max depth if can not find good one. - */ -void pci_assign_unassigned_root_bus_resources(struct pci_bus *bus) -{ - LIST_HEAD(realloc_head); - /* List of resources that want additional resources */ - struct list_head *add_list = NULL; - int tried_times = 0; - enum release_type rel_type = leaf_only; - LIST_HEAD(fail_head); - struct pci_dev_resource *fail_res; - int pci_try_num = 1; - enum enable_type enable_local; - - /* Don't realloc if asked to do so */ - enable_local = pci_realloc_detect(bus, pci_realloc_enable); - if (pci_realloc_enabled(enable_local)) { - int max_depth = pci_bus_get_depth(bus); - - pci_try_num = max_depth + 1; - dev_info(&bus->dev, "max bus depth: %d pci_try_num: %d\n", - max_depth, pci_try_num); - } - -again: - /* - * Last try will use add_list, otherwise will try good to have as must - * have, so can realloc parent bridge resource - */ - if (tried_times + 1 == pci_try_num) - add_list = &realloc_head; - /* - * Depth first, calculate sizes and alignments of all subordinate buses. - */ - __pci_bus_size_bridges(bus, add_list); - - /* Depth last, allocate resources and update the hardware. */ - __pci_bus_assign_resources(bus, add_list, &fail_head); - if (add_list) - BUG_ON(!list_empty(add_list)); - tried_times++; - - /* Any device complain? 
*/ - if (list_empty(&fail_head)) - goto dump; - - if (tried_times >= pci_try_num) { - if (enable_local == undefined) - dev_info(&bus->dev, "Some PCI device resources are unassigned, try booting with pci=realloc\n"); - else if (enable_local == auto_enabled) - dev_info(&bus->dev, "Automatically enabled pci realloc, if you have problem, try booting with pci=realloc=off\n"); - - free_list(&fail_head); - goto dump; - } - - dev_info(&bus->dev, "No. %d try to assign unassigned res\n", - tried_times + 1); - - /* Third times and later will not check if it is leaf */ - if ((tried_times + 1) > 2) - rel_type = whole_subtree; - - /* - * Try to release leaf bridge's resources that doesn't fit resource of - * child device under that bridge. - */ - list_for_each_entry(fail_res, &fail_head, list) - pci_bus_release_bridge_resources(fail_res->dev->bus, - fail_res->flags & PCI_RES_TYPE_MASK, - rel_type); - - /* Restore size and flags */ - list_for_each_entry(fail_res, &fail_head, list) { - struct resource *res = fail_res->res; - int idx; - - res->start = fail_res->start; - res->end = fail_res->end; - res->flags = fail_res->flags; - - if (pci_is_bridge(fail_res->dev)) { - idx = res - &fail_res->dev->resource[0]; - if (idx >= PCI_BRIDGE_RESOURCES && - idx <= PCI_BRIDGE_RESOURCE_END) - res->flags = 0; - } - } - free_list(&fail_head); - - goto again; - -dump: - /* Dump the resource on buses */ - pci_bus_dump_resources(bus); -} - -void __init pci_assign_unassigned_resources(void) -{ - struct pci_bus *root_bus; - - list_for_each_entry(root_bus, &pci_root_buses, node) { - pci_assign_unassigned_root_bus_resources(root_bus); - - /* Make sure the root bridge has a companion ACPI device */ - if (ACPI_HANDLE(root_bus->bridge)) - acpi_ioapic_add(ACPI_HANDLE(root_bus->bridge)); - } -} - static void adjust_bridge_window(struct pci_dev *bridge, struct resource *res, struct list_head *add_list, resource_size_t new_size) @@ -2047,6 +1934,119 @@ static void 
pci_bridge_distribute_available_resources(struct pci_dev *bridge, available_mmio_pref); } +/* + * First try will not touch PCI bridge res. + * Second and later try will clear small leaf bridge res. + * Will stop till to the max depth if can not find good one. + */ +void pci_assign_unassigned_root_bus_resources(struct pci_bus *bus) +{ + LIST_HEAD(realloc_head); + /* List of resources that want additional resources */ + struct list_head *add_list = NULL; + int tried_times = 0; + enum release_type rel_type = leaf_only; + LIST_HEAD(fail_head); + struct pci_dev_resource *fail_res; + int pci_try_num = 1; + enum enable_type enable_local; + + /* Don't realloc if asked to do so */ + enable_local = pci_realloc_detect(bus, pci_realloc_enable); + if (pci_realloc_enabled(enable_local)) { + int max_depth = pci_bus_get_depth(bus); + + pci_try_num = max_depth + 1; + dev_info(&bus->dev, "max bus depth: %d pci_try_num: %d\n", + max_depth, pci_try_num); + } + +again: + /* + * Last try will use add_list, otherwise will try good to have as must + * have, so can realloc parent bridge resource + */ + if (tried_times + 1 == pci_try_num) + add_list = &realloc_head; + /* + * Depth first, calculate sizes and alignments of all subordinate buses. + */ + __pci_bus_size_bridges(bus, add_list); + + /* Depth last, allocate resources and update the hardware. */ + __pci_bus_assign_resources(bus, add_list, &fail_head); + if (add_list) + BUG_ON(!list_empty(add_list)); + tried_times++; + + /* Any device complain? */ + if (list_empty(&fail_head)) + goto dump; + + if (tried_times >= pci_try_num) { + if (enable_local == undefined) + dev_info(&bus->dev, "Some PCI device resources are unassigned, try booting with pci=realloc\n"); + else if (enable_local == auto_enabled) + dev_info(&bus->dev, "Automatically enabled pci realloc, if you have problem, try booting with pci=realloc=off\n"); + + free_list(&fail_head); + goto dump; + } + + dev_info(&bus->dev, "No. 
%d try to assign unassigned res\n", + tried_times + 1); + + /* Third times and later will not check if it is leaf */ + if ((tried_times + 1) > 2) + rel_type = whole_subtree; + + /* + * Try to release leaf bridge's resources that doesn't fit resource of + * child device under that bridge. + */ + list_for_each_entry(fail_res, &fail_head, list) + pci_bus_release_bridge_resources(fail_res->dev->bus, + fail_res->flags & PCI_RES_TYPE_MASK, + rel_type); + + /* Restore size and flags */ + list_for_each_entry(fail_res, &fail_head, list) { + struct resource *res = fail_res->res; + int idx; + + res->start = fail_res->start; + res->end = fail_res->end; + res->flags = fail_res->flags; + + if (pci_is_bridge(fail_res->dev)) { + idx = res - &fail_res->dev->resource[0]; + if (idx >= PCI_BRIDGE_RESOURCES && + idx <= PCI_BRIDGE_RESOURCE_END) + res->flags = 0; + } + } + free_list(&fail_head); + + goto again; + +dump: + /* Dump the resource on buses */ + pci_bus_dump_resources(bus); +} + +void __init pci_assign_unassigned_resources(void) +{ + struct pci_bus *root_bus; + + list_for_each_entry(root_bus, &pci_root_buses, node) { + pci_assign_unassigned_root_bus_resources(root_bus); + + /* Make sure the root bridge has a companion ACPI device */ + if (ACPI_HANDLE(root_bus->bridge)) + acpi_ioapic_add(ACPI_HANDLE(root_bus->bridge)); + } +} + void pci_assign_unassigned_bridge_resources(struct pci_dev *bridge) { struct pci_bus *parent = bridge->subordinate; -- GitLab From e96e27fc6f7971380283768e9a734af16b1716ee Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 5 Sep 2022 11:02:30 +0300 Subject: [PATCH 0411/2223] PCI: Distribute available resources for root buses, too Previously we distributed spare resources only upon hot-add, so if the initial root bus scan found devices that had not been fully configured by the BIOS, we allocated only enough resources to cover what was then present. 
If some of those devices were hotplug bridges, we did not leave any additional resource space for future expansion. Distribute the available resources for root buses, too, to make this work the same way as the normal hotplug case. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216000 Link: https://lore.kernel.org/r/20220905080232.36087-5-mika.westerberg@linux.intel.com Reported-by: Chris Chiu Tested-by: Chris Chiu Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko --- drivers/pci/setup-bus.c | 62 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 3b981da0fb4ee..df9fc974b3133 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1768,7 +1768,10 @@ static void adjust_bridge_window(struct pci_dev *bridge, struct resource *res, } res->end = res->start + new_size - 1; - remove_from_list(add_list, res); + + /* If the resource is part of the add_list remove it now */ + if (add_list) + remove_from_list(add_list, res); } static void pci_bus_distribute_available_resources(struct pci_bus *bus, @@ -1923,6 +1926,8 @@ static void pci_bridge_distribute_available_resources(struct pci_dev *bridge, if (!bridge->is_hotplug_bridge) return; + pci_dbg(bridge, "distributing available resources\n"); + /* Take the initial extra resources from the hotplug port */ available_io = bridge->resource[PCI_BRIDGE_IO_WINDOW]; available_mmio = bridge->resource[PCI_BRIDGE_MEM_WINDOW]; @@ -1934,6 +1939,59 @@ static void pci_bridge_distribute_available_resources(struct pci_dev *bridge, available_mmio_pref); } +static bool pci_bridge_resources_not_assigned(struct pci_dev *dev) +{ + const struct resource *r; + + /* + * Check the child device's resources and if they are not yet + * assigned it means we are configuring them (not the boot + * firmware) so we should be able to extend the upstream + * bridge's (that's the hotplug downstream PCIe 
port) resources + * in the same way we do with the normal hotplug case. + */ + r = &dev->resource[PCI_BRIDGE_IO_WINDOW]; + if (!r->flags || !(r->flags & IORESOURCE_STARTALIGN)) + return false; + r = &dev->resource[PCI_BRIDGE_MEM_WINDOW]; + if (!r->flags || !(r->flags & IORESOURCE_STARTALIGN)) + return false; + r = &dev->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; + if (!r->flags || !(r->flags & IORESOURCE_STARTALIGN)) + return false; + + return true; +} + +static void pci_root_bus_distribute_available_resources(struct pci_bus *bus, + struct list_head *add_list) +{ + struct pci_dev *dev, *bridge = bus->self; + + for_each_pci_bridge(dev, bus) { + struct pci_bus *b; + + b = dev->subordinate; + if (!b) + continue; + + /* + * Need to check "bridge" here too because it is NULL + * in case of root bus. + */ + if (bridge && pci_bridge_resources_not_assigned(dev)) { + pci_bridge_distribute_available_resources(bridge, add_list); + /* + * There is only PCIe upstream port on the bus + * so we don't need to go futher. + */ + return; + } + + pci_root_bus_distribute_available_resources(b, add_list); + } +} + /* * First try will not touch PCI bridge res. * Second and later try will clear small leaf bridge res. @@ -1973,6 +2031,8 @@ again: */ __pci_bus_size_bridges(bus, add_list); + pci_root_bus_distribute_available_resources(bus, add_list); + /* Depth last, allocate resources and update the hardware. */ __pci_bus_assign_resources(bus, add_list, &fail_head); if (add_list) -- GitLab From 17d2d67d76e41c7fd00608fdad350e1790c5c24a Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 5 Sep 2022 11:02:31 +0300 Subject: [PATCH 0412/2223] PCI: Fix whitespace and indentation Drop two empty lines from pci_scan_child_bus_extend() and correct indentation in pci_bridge_distribute_available_resources() to better follow the kernel coding style. No functional impact. 
Link: https://lore.kernel.org/r/20220905080232.36087-6-mika.westerberg@linux.intel.com Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko --- drivers/pci/probe.c | 2 -- drivers/pci/setup-bus.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 86130926a74f0..8f25deb6b763d 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2930,7 +2930,6 @@ static unsigned int pci_scan_child_bus_extend(struct pci_bus *bus, unsigned int buses = 0; if (!hotplug_bridges && normal_bridges == 1) { - /* * There is only one bridge on the bus (upstream * port) so it gets all available buses which it @@ -2939,7 +2938,6 @@ static unsigned int pci_scan_child_bus_extend(struct pci_bus *bus, */ buses = available_buses; } else if (dev->is_hotplug_bridge) { - /* * Distribute the extra buses between hotplug * bridges if any. diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index df9fc974b3133..dc6a30ee6edfb 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1919,7 +1919,7 @@ static void pci_bus_distribute_available_resources(struct pci_bus *bus, } static void pci_bridge_distribute_available_resources(struct pci_dev *bridge, - struct list_head *add_list) + struct list_head *add_list) { struct resource available_io, available_mmio, available_mmio_pref; -- GitLab From 58e011609c4305fc50674c4610cbe8a8c26261f6 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 5 Sep 2022 11:02:32 +0300 Subject: [PATCH 0413/2223] PCI: Fix typo in pci_scan_child_bus_extend() Should be 'if' not 'of'. Fix this. 
Link: https://lore.kernel.org/r/20220905080232.36087-7-mika.westerberg@linux.intel.com Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko --- drivers/pci/probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 8f25deb6b763d..b66fa42c4b1fa 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2956,7 +2956,7 @@ static unsigned int pci_scan_child_bus_extend(struct pci_bus *bus, /* * Make sure a hotplug bridge has at least the minimum requested * number of buses but allow it to grow up to the maximum available - * bus number of there is room. + * bus number if there is room. */ if (bus->self && bus->self->is_hotplug_bridge) { used_buses = max_t(unsigned int, available_buses, -- GitLab From cb9ff3f3b84c95867856c3be42de73972feb1249 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 21 Sep 2022 18:43:47 +0800 Subject: [PATCH 0414/2223] vfio: Add helpers for unifying vfio_device life cycle The idea is to let vfio core manage the vfio_device life cycle instead of duplicating the logic cross drivers. This is also a preparatory step for adding struct device into vfio_device. New pair of helpers together with a kref in vfio_device: - vfio_alloc_device() - vfio_put_device() Drivers can register @init/@release callbacks to manage any private state wrapping the vfio_device. However vfio-ccw doesn't fit this model due to a life cycle mess that its private structure mixes both parent and mdev info hence must be allocated/freed outside of the life cycle of vfio device. Per prior discussions this won't be fixed in short term by IBM folks. Instead of waiting for those modifications introduce another helper vfio_init_device() so ccw can call it to initialize a pre-allocated vfio_device. Further implication of the ccw trick is that vfio_device cannot be freed uniformly in vfio core. Instead, require *EVERY* driver to implement @release and free vfio_device inside. 
Then ccw can choose to delay the free at its own discretion. Another trick down the road is that kvzalloc() is used to accommodate the need of gvt which uses vzalloc() while all others use kzalloc(). So drivers should call a helper vfio_free_device() to free the vfio_device instead of assuming that kfree() or vfree() is applicable. Later once the ccw mess is fixed we can remove those tricks and fully handle structure alloc/free in vfio core. Existing vfio_{un}init_group_dev() will be deprecated after all existing usages are converted to the new model. Suggested-by: Jason Gunthorpe Co-developed-by: Yi Liu Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Tony Krowiak Reviewed-by: Jason Gunthorpe Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20220921104401.38898-2-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 92 ++++++++++++++++++++++++++++++++++++++++ include/linux/vfio.h | 25 ++++++++++- 2 files changed, 116 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 27d9186f35d5c..b9c6a97d647a7 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -498,6 +498,98 @@ void vfio_uninit_group_dev(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_uninit_group_dev); +/* Release helper called by vfio_put_device() */ +void vfio_device_release(struct kref *kref) +{ + struct vfio_device *device = + container_of(kref, struct vfio_device, kref); + + vfio_uninit_group_dev(device); + + /* + * kvfree() cannot be done here due to a life cycle mess in + * vfio-ccw. Before the ccw part is fixed all drivers are + * required to support @release and call vfio_free_device() + * from there. + */ + device->ops->release(device); +} +EXPORT_SYMBOL_GPL(vfio_device_release); + +/* + * Allocate and initialize vfio_device so it can be registered to vfio + * core. + * + * Drivers should use the wrapper vfio_alloc_device() for allocation.
+ * @size is the size of the structure to be allocated, including any + * private data used by the driver. + * + * Driver may provide an @init callback to cover device private data. + * + * Use vfio_put_device() to release the structure after success return. + */ +struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, + const struct vfio_device_ops *ops) +{ + struct vfio_device *device; + int ret; + + if (WARN_ON(size < sizeof(struct vfio_device))) + return ERR_PTR(-EINVAL); + + device = kvzalloc(size, GFP_KERNEL); + if (!device) + return ERR_PTR(-ENOMEM); + + ret = vfio_init_device(device, dev, ops); + if (ret) + goto out_free; + return device; + +out_free: + kvfree(device); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(_vfio_alloc_device); + +/* + * Initialize a vfio_device so it can be registered to vfio core. + * + * Only vfio-ccw driver should call this interface. + */ +int vfio_init_device(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops) +{ + int ret; + + vfio_init_group_dev(device, dev, ops); + + if (ops->init) { + ret = ops->init(device); + if (ret) + goto out_uninit; + } + + kref_init(&device->kref); + return 0; + +out_uninit: + vfio_uninit_group_dev(device); + return ret; +} +EXPORT_SYMBOL_GPL(vfio_init_device); + +/* + * The helper called by driver @release callback to free the device + * structure. Drivers which don't have private data to clean can + * simply use this helper as its @release. 
+ */ +void vfio_free_device(struct vfio_device *device) +{ + kvfree(device); +} +EXPORT_SYMBOL_GPL(vfio_free_device); + static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, enum vfio_group_type type) { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 0e28265590916..f67cac700e6f9 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -47,7 +47,8 @@ struct vfio_device { struct kvm *kvm; /* Members below here are private, not for driver use */ - refcount_t refcount; + struct kref kref; /* object life cycle */ + refcount_t refcount; /* user count on registered device*/ unsigned int open_count; struct completion comp; struct list_head group_next; @@ -57,6 +58,8 @@ struct vfio_device { /** * struct vfio_device_ops - VFIO bus driver device callbacks * + * @init: initialize private fields in device structure + * @release: Reclaim private fields in device structure * @open_device: Called when the first file descriptor is opened for this device * @close_device: Opposite of open_device * @read: Perform read(2) on device file descriptor @@ -74,6 +77,8 @@ struct vfio_device { */ struct vfio_device_ops { char *name; + int (*init)(struct vfio_device *vdev); + void (*release)(struct vfio_device *vdev); int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -161,6 +166,24 @@ static inline int vfio_check_feature(u32 flags, size_t argsz, u32 supported_ops, return 1; } +struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, + const struct vfio_device_ops *ops); +#define vfio_alloc_device(dev_struct, member, dev, ops) \ + container_of(_vfio_alloc_device(sizeof(struct dev_struct) + \ + BUILD_BUG_ON_ZERO(offsetof( \ + struct dev_struct, member)), \ + dev, ops), \ + struct dev_struct, member) + +int vfio_init_device(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops); +void vfio_free_device(struct 
vfio_device *device); +void vfio_device_release(struct kref *kref); +static inline void vfio_put_device(struct vfio_device *device) +{ + kref_put(&device->kref, vfio_device_release); +} + void vfio_init_group_dev(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops); void vfio_uninit_group_dev(struct vfio_device *device); -- GitLab From 63d7c77989de98d3e92611dbb858028b74dca377 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:48 +0800 Subject: [PATCH 0415/2223] vfio/pci: Use the new device life cycle helpers Also introduce two pci core helpers as @init/@release for pci drivers: - vfio_pci_core_init_dev() - vfio_pci_core_release_dev() Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-3-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci.c | 20 +++++++++--------- drivers/vfio/pci/vfio_pci_core.c | 35 ++++++++++++++++++++++++++++++++ include/linux/vfio_pci_core.h | 2 ++ 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index d9b5c03f8d5b2..1d4919edfbde4 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -127,6 +127,8 @@ static int vfio_pci_open_device(struct vfio_device *core_vdev) static const struct vfio_device_ops vfio_pci_ops = { .name = "vfio-pci", + .init = vfio_pci_core_init_dev, + .release = vfio_pci_core_release_dev, .open_device = vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, @@ -146,20 +148,19 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (vfio_pci_is_denylisted(pdev)) return -EINVAL; - vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); - if (!vdev) - return -ENOMEM; - vfio_pci_core_init_device(vdev, pdev, &vfio_pci_ops); + vdev = vfio_alloc_device(vfio_pci_core_device, vdev, &pdev->dev, + &vfio_pci_ops); + if (IS_ERR(vdev)) + 
return PTR_ERR(vdev); dev_set_drvdata(&pdev->dev, vdev); ret = vfio_pci_core_register_device(vdev); if (ret) - goto out_free; + goto out_put_vdev; return 0; -out_free: - vfio_pci_core_uninit_device(vdev); - kfree(vdev); +out_put_vdev: + vfio_put_device(&vdev->vdev); return ret; } @@ -168,8 +169,7 @@ static void vfio_pci_remove(struct pci_dev *pdev) struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); vfio_pci_core_unregister_device(vdev); - vfio_pci_core_uninit_device(vdev); - kfree(vdev); + vfio_put_device(&vdev->vdev); } static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 0a801aee2f2d1..77d33739c6e87 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2078,6 +2078,41 @@ static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev) VGA_RSRC_LEGACY_MEM); } +int vfio_pci_core_init_dev(struct vfio_device *core_vdev) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + + vdev->pdev = to_pci_dev(core_vdev->dev); + vdev->irq_type = VFIO_PCI_NUM_IRQS; + mutex_init(&vdev->igate); + spin_lock_init(&vdev->irqlock); + mutex_init(&vdev->ioeventfds_lock); + INIT_LIST_HEAD(&vdev->dummy_resources_list); + INIT_LIST_HEAD(&vdev->ioeventfds_list); + mutex_init(&vdev->vma_lock); + INIT_LIST_HEAD(&vdev->vma_list); + INIT_LIST_HEAD(&vdev->sriov_pfs_item); + init_rwsem(&vdev->memory_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_init_dev); + +void vfio_pci_core_release_dev(struct vfio_device *core_vdev) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + + mutex_destroy(&vdev->igate); + mutex_destroy(&vdev->ioeventfds_lock); + mutex_destroy(&vdev->vma_lock); + kfree(vdev->region); + kfree(vdev->pm_save); + vfio_free_device(core_vdev); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev); + void 
vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, struct pci_dev *pdev, const struct vfio_device_ops *vfio_pci_ops) diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 089b603bcfdca..0499ea8360587 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -109,6 +109,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev); void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, struct pci_dev *pdev, const struct vfio_device_ops *vfio_pci_ops); +int vfio_pci_core_init_dev(struct vfio_device *core_vdev); +void vfio_pci_core_release_dev(struct vfio_device *core_vdev); int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev); -- GitLab From d3966e305ac4e0b5a63f784d9152fac4961554de Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:49 +0800 Subject: [PATCH 0416/2223] vfio/mlx5: Use the new device life cycle helpers mlx5 has its own @init/@release for handling migration cap. 
Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-4-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 50 ++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 759a5f5f7b3f4..fd6ccb8454a24 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -585,8 +585,35 @@ static const struct vfio_log_ops mlx5vf_pci_log_ops = { .log_read_and_clear = mlx5vf_tracker_read_and_clear, }; +static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev) +{ + struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev, + struct mlx5vf_pci_core_device, core_device.vdev); + int ret; + + ret = vfio_pci_core_init_dev(core_vdev); + if (ret) + return ret; + + mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops, + &mlx5vf_pci_log_ops); + + return 0; +} + +static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev) +{ + struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev, + struct mlx5vf_pci_core_device, core_device.vdev); + + mlx5vf_cmd_remove_migratable(mvdev); + vfio_pci_core_release_dev(core_vdev); +} + static const struct vfio_device_ops mlx5vf_pci_ops = { .name = "mlx5-vfio-pci", + .init = mlx5vf_pci_init_dev, + .release = mlx5vf_pci_release_dev, .open_device = mlx5vf_pci_open_device, .close_device = mlx5vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, @@ -604,22 +631,19 @@ static int mlx5vf_pci_probe(struct pci_dev *pdev, struct mlx5vf_pci_core_device *mvdev; int ret; - mvdev = kzalloc(sizeof(*mvdev), GFP_KERNEL); - if (!mvdev) - return -ENOMEM; - vfio_pci_core_init_device(&mvdev->core_device, pdev, &mlx5vf_pci_ops); - mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops, - &mlx5vf_pci_log_ops); + mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev, + &pdev->dev, &mlx5vf_pci_ops); + if (IS_ERR(mvdev)) + 
return PTR_ERR(mvdev); + dev_set_drvdata(&pdev->dev, &mvdev->core_device); ret = vfio_pci_core_register_device(&mvdev->core_device); if (ret) - goto out_free; + goto out_put_vdev; return 0; -out_free: - mlx5vf_cmd_remove_migratable(mvdev); - vfio_pci_core_uninit_device(&mvdev->core_device); - kfree(mvdev); +out_put_vdev: + vfio_put_device(&mvdev->core_device.vdev); return ret; } @@ -628,9 +652,7 @@ static void mlx5vf_pci_remove(struct pci_dev *pdev) struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev); vfio_pci_core_unregister_device(&mvdev->core_device); - mlx5vf_cmd_remove_migratable(mvdev); - vfio_pci_core_uninit_device(&mvdev->core_device); - kfree(mvdev); + vfio_put_device(&mvdev->core_device.vdev); } static const struct pci_device_id mlx5vf_pci_table[] = { -- GitLab From 27aeb915595b87165a3004aab05b2b837d01e6ed Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:50 +0800 Subject: [PATCH 0417/2223] vfio/hisi_acc: Use the new device life cycle helpers Tidy up @probe so all migration specific initialization logic is moved to migration specific @init callback. Remove vfio_pci_core_{un}init_device() given no user now. 
Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Shameer Kolothum Link: https://lore.kernel.org/r/20220921104401.38898-5-kevin.tian@intel.com Signed-off-by: Alex Williamson --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 80 +++++++++---------- drivers/vfio/pci/vfio_pci_core.c | 30 ------- include/linux/vfio_pci_core.h | 4 - 3 files changed, 37 insertions(+), 77 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 258cae0863eae..47174e2b61bd3 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1213,8 +1213,28 @@ static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = { .migration_get_state = hisi_acc_vfio_pci_get_device_state, }; +static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev, + struct hisi_acc_vf_core_device, core_device.vdev); + struct pci_dev *pdev = to_pci_dev(core_vdev->dev); + struct hisi_qm *pf_qm = hisi_acc_get_pf_qm(pdev); + + hisi_acc_vdev->vf_id = pci_iov_vf_id(pdev) + 1; + hisi_acc_vdev->pf_qm = pf_qm; + hisi_acc_vdev->vf_dev = pdev; + mutex_init(&hisi_acc_vdev->state_mutex); + + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY; + core_vdev->mig_ops = &hisi_acc_vfio_pci_migrn_state_ops; + + return vfio_pci_core_init_dev(core_vdev); +} + static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .name = "hisi-acc-vfio-pci-migration", + .init = hisi_acc_vfio_pci_migrn_init_dev, + .release = vfio_pci_core_release_dev, .open_device = hisi_acc_vfio_pci_open_device, .close_device = hisi_acc_vfio_pci_close_device, .ioctl = hisi_acc_vfio_pci_ioctl, @@ -1228,6 +1248,8 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .name = "hisi-acc-vfio-pci", + .init = 
vfio_pci_core_init_dev, + .release = vfio_pci_core_release_dev, .open_device = hisi_acc_vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, @@ -1239,63 +1261,36 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .match = vfio_pci_core_match, }; -static int -hisi_acc_vfio_pci_migrn_init(struct hisi_acc_vf_core_device *hisi_acc_vdev, - struct pci_dev *pdev, struct hisi_qm *pf_qm) -{ - int vf_id; - - vf_id = pci_iov_vf_id(pdev); - if (vf_id < 0) - return vf_id; - - hisi_acc_vdev->vf_id = vf_id + 1; - hisi_acc_vdev->core_device.vdev.migration_flags = - VFIO_MIGRATION_STOP_COPY; - hisi_acc_vdev->pf_qm = pf_qm; - hisi_acc_vdev->vf_dev = pdev; - mutex_init(&hisi_acc_vdev->state_mutex); - - return 0; -} - static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct hisi_acc_vf_core_device *hisi_acc_vdev; + const struct vfio_device_ops *ops = &hisi_acc_vfio_pci_ops; struct hisi_qm *pf_qm; + int vf_id; int ret; - hisi_acc_vdev = kzalloc(sizeof(*hisi_acc_vdev), GFP_KERNEL); - if (!hisi_acc_vdev) - return -ENOMEM; - pf_qm = hisi_acc_get_pf_qm(pdev); if (pf_qm && pf_qm->ver >= QM_HW_V3) { - ret = hisi_acc_vfio_pci_migrn_init(hisi_acc_vdev, pdev, pf_qm); - if (!ret) { - vfio_pci_core_init_device(&hisi_acc_vdev->core_device, pdev, - &hisi_acc_vfio_pci_migrn_ops); - hisi_acc_vdev->core_device.vdev.mig_ops = - &hisi_acc_vfio_pci_migrn_state_ops; - } else { + vf_id = pci_iov_vf_id(pdev); + if (vf_id >= 0) + ops = &hisi_acc_vfio_pci_migrn_ops; + else pci_warn(pdev, "migration support failed, continue with generic interface\n"); - vfio_pci_core_init_device(&hisi_acc_vdev->core_device, pdev, - &hisi_acc_vfio_pci_ops); - } - } else { - vfio_pci_core_init_device(&hisi_acc_vdev->core_device, pdev, - &hisi_acc_vfio_pci_ops); } + hisi_acc_vdev = vfio_alloc_device(hisi_acc_vf_core_device, + core_device.vdev, &pdev->dev, ops); + if (IS_ERR(hisi_acc_vdev)) + return PTR_ERR(hisi_acc_vdev); + 
dev_set_drvdata(&pdev->dev, &hisi_acc_vdev->core_device); ret = vfio_pci_core_register_device(&hisi_acc_vdev->core_device); if (ret) - goto out_free; + goto out_put_vdev; return 0; -out_free: - vfio_pci_core_uninit_device(&hisi_acc_vdev->core_device); - kfree(hisi_acc_vdev); +out_put_vdev: + vfio_put_device(&hisi_acc_vdev->core_device.vdev); return ret; } @@ -1304,8 +1299,7 @@ static void hisi_acc_vfio_pci_remove(struct pci_dev *pdev) struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev); vfio_pci_core_unregister_device(&hisi_acc_vdev->core_device); - vfio_pci_core_uninit_device(&hisi_acc_vdev->core_device); - kfree(hisi_acc_vdev); + vfio_put_device(&hisi_acc_vdev->core_device.vdev); } static const struct pci_device_id hisi_acc_vfio_pci_table[] = { diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 77d33739c6e87..59a28251bb0b9 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2113,36 +2113,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev) } EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev); -void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, - struct pci_dev *pdev, - const struct vfio_device_ops *vfio_pci_ops) -{ - vfio_init_group_dev(&vdev->vdev, &pdev->dev, vfio_pci_ops); - vdev->pdev = pdev; - vdev->irq_type = VFIO_PCI_NUM_IRQS; - mutex_init(&vdev->igate); - spin_lock_init(&vdev->irqlock); - mutex_init(&vdev->ioeventfds_lock); - INIT_LIST_HEAD(&vdev->dummy_resources_list); - INIT_LIST_HEAD(&vdev->ioeventfds_list); - mutex_init(&vdev->vma_lock); - INIT_LIST_HEAD(&vdev->vma_list); - INIT_LIST_HEAD(&vdev->sriov_pfs_item); - init_rwsem(&vdev->memory_lock); -} -EXPORT_SYMBOL_GPL(vfio_pci_core_init_device); - -void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev) -{ - mutex_destroy(&vdev->igate); - mutex_destroy(&vdev->ioeventfds_lock); - mutex_destroy(&vdev->vma_lock); - vfio_uninit_group_dev(&vdev->vdev); - kfree(vdev->region); - 
kfree(vdev->pm_save); -} -EXPORT_SYMBOL_GPL(vfio_pci_core_uninit_device); - int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 0499ea8360587..367fd79226a30 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -106,13 +106,9 @@ int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, bool is_disable_idle_d3); void vfio_pci_core_close_device(struct vfio_device *core_vdev); -void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, - struct pci_dev *pdev, - const struct vfio_device_ops *vfio_pci_ops); int vfio_pci_core_init_dev(struct vfio_device *core_vdev); void vfio_pci_core_release_dev(struct vfio_device *core_vdev); int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); -void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev); extern const struct pci_error_handlers vfio_pci_core_err_handlers; int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev, -- GitLab From 603c09f2873d6e86dce636c9e5ae330e2c033940 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:51 +0800 Subject: [PATCH 0418/2223] vfio/mdpy: Use the new device life cycle helpers and manage mdpy_count inside @init/@release. 
Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-6-kevin.tian@intel.com Signed-off-by: Alex Williamson --- samples/vfio-mdev/mdpy.c | 81 +++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index e8c46eb2e2468..bb2af1ec0f7c6 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -216,61 +216,77 @@ static int mdpy_reset(struct mdev_state *mdev_state) return 0; } -static int mdpy_probe(struct mdev_device *mdev) +static int mdpy_init_dev(struct vfio_device *vdev) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct mdev_device *mdev = to_mdev_device(vdev->dev); const struct mdpy_type *type = &mdpy_types[mdev_get_type_group_id(mdev)]; - struct device *dev = mdev_dev(mdev); - struct mdev_state *mdev_state; u32 fbsize; - int ret; + int ret = -ENOMEM; if (mdpy_count >= max_devices) - return -ENOMEM; - - mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); - if (mdev_state == NULL) - return -ENOMEM; - vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mdpy_dev_ops); + return ret; mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL); - if (mdev_state->vconfig == NULL) { - ret = -ENOMEM; - goto err_state; - } + if (!mdev_state->vconfig) + return ret; fbsize = roundup_pow_of_two(type->width * type->height * type->bytepp); mdev_state->memblk = vmalloc_user(fbsize); - if (!mdev_state->memblk) { - ret = -ENOMEM; - goto err_vconfig; - } - dev_info(dev, "%s: %s (%dx%d)\n", __func__, type->name, type->width, - type->height); + if (!mdev_state->memblk) + goto out_vconfig; mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; - mdev_state->type = type; + mdev_state->type = type; mdev_state->memsize = fbsize; mdpy_create_config_space(mdev_state); mdpy_reset(mdev_state); + dev_info(vdev->dev, "%s: %s (%dx%d)\n", __func__, 
type->name, type->width, + type->height); + mdpy_count++; + return 0; + +out_vconfig: + kfree(mdev_state->vconfig); + return ret; +} + +static int mdpy_probe(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state; + int ret; + + mdev_state = vfio_alloc_device(mdev_state, vdev, &mdev->dev, + &mdpy_dev_ops); + if (IS_ERR(mdev_state)) + return PTR_ERR(mdev_state); ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); if (ret) - goto err_mem; + goto err_put_vdev; dev_set_drvdata(&mdev->dev, mdev_state); return 0; -err_mem: + +err_put_vdev: + vfio_put_device(&mdev_state->vdev); + return ret; +} + +static void mdpy_release_dev(struct vfio_device *vdev) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + mdpy_count--; vfree(mdev_state->memblk); -err_vconfig: kfree(mdev_state->vconfig); -err_state: - vfio_uninit_group_dev(&mdev_state->vdev); - kfree(mdev_state); - return ret; + vfio_free_device(vdev); } static void mdpy_remove(struct mdev_device *mdev) @@ -280,12 +296,7 @@ static void mdpy_remove(struct mdev_device *mdev) dev_info(&mdev->dev, "%s\n", __func__); vfio_unregister_group_dev(&mdev_state->vdev); - vfree(mdev_state->memblk); - kfree(mdev_state->vconfig); - vfio_uninit_group_dev(&mdev_state->vdev); - kfree(mdev_state); - - mdpy_count--; + vfio_put_device(&mdev_state->vdev); } static ssize_t mdpy_read(struct vfio_device *vdev, char __user *buf, @@ -708,6 +719,8 @@ static struct attribute_group *mdev_type_groups[] = { }; static const struct vfio_device_ops mdpy_dev_ops = { + .init = mdpy_init_dev, + .release = mdpy_release_dev, .read = mdpy_read, .write = mdpy_write, .ioctl = mdpy_ioctl, -- GitLab From 67c5a1814f4c1ad0e61f11112010057191730853 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:52 +0800 Subject: [PATCH 0419/2223] vfio/mtty: Use the new device life cycle helpers and manage available ports inside @init/@release. 
Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-7-kevin.tian@intel.com Signed-off-by: Alex Williamson --- samples/vfio-mdev/mtty.c | 67 +++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index f42a59ed2e3fe..d151928e4f21d 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -703,9 +703,11 @@ accessfailed: return ret; } -static int mtty_probe(struct mdev_device *mdev) +static int mtty_init_dev(struct vfio_device *vdev) { - struct mdev_state *mdev_state; + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct mdev_device *mdev = to_mdev_device(vdev->dev); int nr_ports = mdev_get_type_group_id(mdev) + 1; int avail_ports = atomic_read(&mdev_avail_ports); int ret; @@ -716,58 +718,65 @@ static int mtty_probe(struct mdev_device *mdev) } while (!atomic_try_cmpxchg(&mdev_avail_ports, &avail_ports, avail_ports - nr_ports)); - mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); - if (mdev_state == NULL) { - ret = -ENOMEM; - goto err_nr_ports; - } - - vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mtty_dev_ops); - mdev_state->nr_ports = nr_ports; mdev_state->irq_index = -1; mdev_state->s[0].max_fifo_size = MAX_FIFO_SIZE; mdev_state->s[1].max_fifo_size = MAX_FIFO_SIZE; mutex_init(&mdev_state->rxtx_lock); - mdev_state->vconfig = kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL); - if (mdev_state->vconfig == NULL) { + mdev_state->vconfig = kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL); + if (!mdev_state->vconfig) { ret = -ENOMEM; - goto err_state; + goto err_nr_ports; } mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; - mtty_create_config_space(mdev_state); + return 0; + +err_nr_ports: + atomic_add(nr_ports, &mdev_avail_ports); + return ret; +} + +static int mtty_probe(struct mdev_device *mdev) +{ + struct mdev_state 
*mdev_state; + int ret; + + mdev_state = vfio_alloc_device(mdev_state, vdev, &mdev->dev, + &mtty_dev_ops); + if (IS_ERR(mdev_state)) + return PTR_ERR(mdev_state); ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); if (ret) - goto err_vconfig; + goto err_put_vdev; dev_set_drvdata(&mdev->dev, mdev_state); return 0; -err_vconfig: - kfree(mdev_state->vconfig); -err_state: - vfio_uninit_group_dev(&mdev_state->vdev); - kfree(mdev_state); -err_nr_ports: - atomic_add(nr_ports, &mdev_avail_ports); +err_put_vdev: + vfio_put_device(&mdev_state->vdev); return ret; } +static void mtty_release_dev(struct vfio_device *vdev) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + atomic_add(mdev_state->nr_ports, &mdev_avail_ports); + kfree(mdev_state->vconfig); + vfio_free_device(vdev); +} + static void mtty_remove(struct mdev_device *mdev) { struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); - int nr_ports = mdev_state->nr_ports; vfio_unregister_group_dev(&mdev_state->vdev); - - kfree(mdev_state->vconfig); - vfio_uninit_group_dev(&mdev_state->vdev); - kfree(mdev_state); - atomic_add(nr_ports, &mdev_avail_ports); + vfio_put_device(&mdev_state->vdev); } static int mtty_reset(struct mdev_state *mdev_state) @@ -1287,6 +1296,8 @@ static struct attribute_group *mdev_type_groups[] = { static const struct vfio_device_ops mtty_dev_ops = { .name = "vfio-mtty", + .init = mtty_init_dev, + .release = mtty_release_dev, .read = mtty_read, .write = mtty_write, .ioctl = mtty_ioctl, -- GitLab From 3d5d18e1f899ac3b03e108aef8560f4cb0969da1 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:53 +0800 Subject: [PATCH 0420/2223] vfio/mbochs: Use the new device life cycle helpers and manage avail_mbytes inside @init/@release. 
Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-8-kevin.tian@intel.com Signed-off-by: Alex Williamson --- samples/vfio-mdev/mbochs.c | 73 ++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 344c2901a82bf..6901947e27d2d 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -505,13 +505,14 @@ static int mbochs_reset(struct mdev_state *mdev_state) return 0; } -static int mbochs_probe(struct mdev_device *mdev) +static int mbochs_init_dev(struct vfio_device *vdev) { - int avail_mbytes = atomic_read(&mbochs_avail_mbytes); + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct mdev_device *mdev = to_mdev_device(vdev->dev); const struct mbochs_type *type = &mbochs_types[mdev_get_type_group_id(mdev)]; - struct device *dev = mdev_dev(mdev); - struct mdev_state *mdev_state; + int avail_mbytes = atomic_read(&mbochs_avail_mbytes); int ret = -ENOMEM; do { @@ -520,14 +521,9 @@ static int mbochs_probe(struct mdev_device *mdev) } while (!atomic_try_cmpxchg(&mbochs_avail_mbytes, &avail_mbytes, avail_mbytes - type->mbytes)); - mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); - if (mdev_state == NULL) - goto err_avail; - vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mbochs_dev_ops); - mdev_state->vconfig = kzalloc(MBOCHS_CONFIG_SPACE_SIZE, GFP_KERNEL); - if (mdev_state->vconfig == NULL) - goto err_mem; + if (!mdev_state->vconfig) + goto err_avail; mdev_state->memsize = type->mbytes * 1024 * 1024; mdev_state->pagecount = mdev_state->memsize >> PAGE_SHIFT; @@ -535,10 +531,7 @@ static int mbochs_probe(struct mdev_device *mdev) sizeof(struct page *), GFP_KERNEL); if (!mdev_state->pages) - goto err_mem; - - dev_info(dev, "%s: %s, %d MB, %ld pages\n", __func__, - type->name, type->mbytes, mdev_state->pagecount); + goto 
err_vconfig; mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; @@ -553,19 +546,47 @@ static int mbochs_probe(struct mdev_device *mdev) mbochs_create_config_space(mdev_state); mbochs_reset(mdev_state); + dev_info(vdev->dev, "%s: %s, %d MB, %ld pages\n", __func__, + type->name, type->mbytes, mdev_state->pagecount); + return 0; + +err_vconfig: + kfree(mdev_state->vconfig); +err_avail: + atomic_add(type->mbytes, &mbochs_avail_mbytes); + return ret; +} + +static int mbochs_probe(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state; + int ret = -ENOMEM; + + mdev_state = vfio_alloc_device(mdev_state, vdev, &mdev->dev, + &mbochs_dev_ops); + if (IS_ERR(mdev_state)) + return PTR_ERR(mdev_state); + ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); if (ret) - goto err_mem; + goto err_put_vdev; dev_set_drvdata(&mdev->dev, mdev_state); return 0; -err_mem: - vfio_uninit_group_dev(&mdev_state->vdev); + +err_put_vdev: + vfio_put_device(&mdev_state->vdev); + return ret; +} + +static void mbochs_release_dev(struct vfio_device *vdev) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + atomic_add(mdev_state->type->mbytes, &mbochs_avail_mbytes); kfree(mdev_state->pages); kfree(mdev_state->vconfig); - kfree(mdev_state); -err_avail: - atomic_add(type->mbytes, &mbochs_avail_mbytes); - return ret; + vfio_free_device(vdev); } static void mbochs_remove(struct mdev_device *mdev) @@ -573,11 +594,7 @@ static void mbochs_remove(struct mdev_device *mdev) struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); vfio_unregister_group_dev(&mdev_state->vdev); - vfio_uninit_group_dev(&mdev_state->vdev); - atomic_add(mdev_state->type->mbytes, &mbochs_avail_mbytes); - kfree(mdev_state->pages); - kfree(mdev_state->vconfig); - kfree(mdev_state); + vfio_put_device(&mdev_state->vdev); } static ssize_t mbochs_read(struct vfio_device *vdev, char __user *buf, @@ -1397,6 +1414,8 @@ static struct attribute_group *mdev_type_groups[] = { static 
const struct vfio_device_ops mbochs_dev_ops = { .close_device = mbochs_close_device, + .init = mbochs_init_dev, + .release = mbochs_release_dev, .read = mbochs_read, .write = mbochs_write, .ioctl = mbochs_ioctl, -- GitLab From a5ddd2a99a7a393ceb023b83d7e78fbb3284bcfd Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 21 Sep 2022 18:43:54 +0800 Subject: [PATCH 0421/2223] drm/i915/gvt: Use the new device life cycle helpers Move vfio_device to the start of intel_vgpu as required by the new helpers. Change intel_gvt_create_vgpu() to use intel_vgpu as the first param as other vgpu helpers do. Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Zhenyu Wang Link: https://lore.kernel.org/r/20220921104401.38898-9-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/gvt.h | 5 ++- drivers/gpu/drm/i915/gvt/kvmgt.c | 52 ++++++++++++++++++++++---------- drivers/gpu/drm/i915/gvt/vgpu.c | 33 ++++++++------------ 3 files changed, 50 insertions(+), 40 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 705689e640119..89fab7896fc6a 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -172,6 +172,7 @@ struct intel_vgpu_submission { #define KVMGT_DEBUGFS_FILENAME "kvmgt_nr_cache_entries" struct intel_vgpu { + struct vfio_device vfio_device; struct intel_gvt *gvt; struct mutex vgpu_lock; int id; @@ -211,7 +212,6 @@ struct intel_vgpu { u32 scan_nonprivbb; - struct vfio_device vfio_device; struct vfio_region *region; int num_regions; struct eventfd_ctx *intx_trigger; @@ -494,8 +494,7 @@ void intel_gvt_clean_vgpu_types(struct intel_gvt *gvt); struct intel_vgpu *intel_gvt_create_idle_vgpu(struct intel_gvt *gvt); void intel_gvt_destroy_idle_vgpu(struct intel_vgpu *vgpu); -struct intel_vgpu *intel_gvt_create_vgpu(struct intel_gvt *gvt, - struct intel_vgpu_type *type); +int intel_gvt_create_vgpu(struct intel_vgpu *vgpu, struct intel_vgpu_type *type); void 
intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu); void intel_gvt_release_vgpu(struct intel_vgpu *vgpu); void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, bool dmlr, diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index e3cd589464777..41bba40feef8f 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1546,7 +1546,33 @@ static const struct attribute_group *intel_vgpu_groups[] = { NULL, }; +static int intel_vgpu_init_dev(struct vfio_device *vfio_dev) +{ + struct mdev_device *mdev = to_mdev_device(vfio_dev->dev); + struct device *pdev = mdev_parent_dev(mdev); + struct intel_gvt *gvt = kdev_to_i915(pdev)->gvt; + struct intel_vgpu_type *type; + struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); + + type = &gvt->types[mdev_get_type_group_id(mdev)]; + if (!type) + return -EINVAL; + + vgpu->gvt = gvt; + return intel_gvt_create_vgpu(vgpu, type); +} + +static void intel_vgpu_release_dev(struct vfio_device *vfio_dev) +{ + struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); + + intel_gvt_destroy_vgpu(vgpu); + vfio_free_device(vfio_dev); +} + static const struct vfio_device_ops intel_vgpu_dev_ops = { + .init = intel_vgpu_init_dev, + .release = intel_vgpu_release_dev, .open_device = intel_vgpu_open_device, .close_device = intel_vgpu_close_device, .read = intel_vgpu_read, @@ -1558,35 +1584,28 @@ static const struct vfio_device_ops intel_vgpu_dev_ops = { static int intel_vgpu_probe(struct mdev_device *mdev) { - struct device *pdev = mdev_parent_dev(mdev); - struct intel_gvt *gvt = kdev_to_i915(pdev)->gvt; - struct intel_vgpu_type *type; struct intel_vgpu *vgpu; int ret; - type = &gvt->types[mdev_get_type_group_id(mdev)]; - if (!type) - return -EINVAL; - - vgpu = intel_gvt_create_vgpu(gvt, type); + vgpu = vfio_alloc_device(intel_vgpu, vfio_device, &mdev->dev, + &intel_vgpu_dev_ops); if (IS_ERR(vgpu)) { gvt_err("failed to create intel vgpu: %ld\n", PTR_ERR(vgpu)); return PTR_ERR(vgpu); } - 
vfio_init_group_dev(&vgpu->vfio_device, &mdev->dev, - &intel_vgpu_dev_ops); - dev_set_drvdata(&mdev->dev, vgpu); ret = vfio_register_emulated_iommu_dev(&vgpu->vfio_device); - if (ret) { - intel_gvt_destroy_vgpu(vgpu); - return ret; - } + if (ret) + goto out_put_vdev; gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n", dev_name(mdev_dev(mdev))); return 0; + +out_put_vdev: + vfio_put_device(&vgpu->vfio_device); + return ret; } static void intel_vgpu_remove(struct mdev_device *mdev) @@ -1595,7 +1614,8 @@ static void intel_vgpu_remove(struct mdev_device *mdev) if (WARN_ON_ONCE(vgpu->attached)) return; - intel_gvt_destroy_vgpu(vgpu); + + vfio_put_device(&vgpu->vfio_device); } static struct mdev_driver intel_vgpu_mdev_driver = { diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index 46da19b3225d2..5c533fbc2c8da 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -302,8 +302,6 @@ void intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu) mutex_lock(&gvt->lock); intel_gvt_update_vgpu_types(gvt); mutex_unlock(&gvt->lock); - - vfree(vgpu); } #define IDLE_VGPU_IDR 0 @@ -363,28 +361,23 @@ void intel_gvt_destroy_idle_vgpu(struct intel_vgpu *vgpu) vfree(vgpu); } -static struct intel_vgpu *__intel_gvt_create_vgpu(struct intel_gvt *gvt, - struct intel_vgpu_creation_params *param) +static int __intel_gvt_create_vgpu(struct intel_vgpu *vgpu, + struct intel_vgpu_creation_params *param) { + struct intel_gvt *gvt = vgpu->gvt; struct drm_i915_private *dev_priv = gvt->gt->i915; - struct intel_vgpu *vgpu; int ret; gvt_dbg_core("low %llu MB high %llu MB fence %llu\n", param->low_gm_sz, param->high_gm_sz, param->fence_sz); - vgpu = vzalloc(sizeof(*vgpu)); - if (!vgpu) - return ERR_PTR(-ENOMEM); - ret = idr_alloc(&gvt->vgpu_idr, vgpu, IDLE_VGPU_IDR + 1, GVT_MAX_VGPU, GFP_KERNEL); if (ret < 0) - goto out_free_vgpu; + return ret; vgpu->id = ret; - vgpu->gvt = gvt; vgpu->sched_ctl.weight = param->weight; 
mutex_init(&vgpu->vgpu_lock); mutex_init(&vgpu->dmabuf_lock); @@ -437,7 +430,7 @@ static struct intel_vgpu *__intel_gvt_create_vgpu(struct intel_gvt *gvt, if (ret) goto out_clean_sched_policy; - return vgpu; + return 0; out_clean_sched_policy: intel_vgpu_clean_sched_policy(vgpu); @@ -455,9 +448,7 @@ out_clean_vgpu_mmio: intel_vgpu_clean_mmio(vgpu); out_clean_idr: idr_remove(&gvt->vgpu_idr, vgpu->id); -out_free_vgpu: - vfree(vgpu); - return ERR_PTR(ret); + return ret; } /** @@ -470,11 +461,11 @@ out_free_vgpu: * Returns: * pointer to intel_vgpu, error pointer if failed. */ -struct intel_vgpu *intel_gvt_create_vgpu(struct intel_gvt *gvt, - struct intel_vgpu_type *type) +int intel_gvt_create_vgpu(struct intel_vgpu *vgpu, struct intel_vgpu_type *type) { + struct intel_gvt *gvt = vgpu->gvt; struct intel_vgpu_creation_params param; - struct intel_vgpu *vgpu; + int ret; param.primary = 1; param.low_gm_sz = type->low_gm_size; @@ -488,15 +479,15 @@ struct intel_vgpu *intel_gvt_create_vgpu(struct intel_gvt *gvt, param.high_gm_sz = BYTES_TO_MB(param.high_gm_sz); mutex_lock(&gvt->lock); - vgpu = __intel_gvt_create_vgpu(gvt, &param); - if (!IS_ERR(vgpu)) { + ret = __intel_gvt_create_vgpu(vgpu, &param); + if (!ret) { /* calculate left instance change for types */ intel_gvt_update_vgpu_types(gvt); intel_gvt_update_reg_whitelist(vgpu); } mutex_unlock(&gvt->lock); - return vgpu; + return ret; } /** -- GitLab From 7cb5a82eb162d268f65c7b0fbec4a5f6495bab79 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:55 +0800 Subject: [PATCH 0422/2223] vfio/ap: Use the new device life cycle helpers and manage available_instances inside @init/@release.
Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Tony Krowiak Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-10-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/s390/crypto/vfio_ap_ops.c | 50 ++++++++++++++++++------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 6c8c41fac4e14..161597357a642 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -684,42 +684,44 @@ static bool vfio_ap_mdev_filter_matrix(unsigned long *apm, unsigned long *aqm, AP_DOMAINS); } -static int vfio_ap_mdev_probe(struct mdev_device *mdev) +static int vfio_ap_mdev_init_dev(struct vfio_device *vdev) { - struct ap_matrix_mdev *matrix_mdev; - int ret; + struct ap_matrix_mdev *matrix_mdev = + container_of(vdev, struct ap_matrix_mdev, vdev); if ((atomic_dec_if_positive(&matrix_dev->available_instances) < 0)) return -EPERM; - matrix_mdev = kzalloc(sizeof(*matrix_mdev), GFP_KERNEL); - if (!matrix_mdev) { - ret = -ENOMEM; - goto err_dec_available; - } - vfio_init_group_dev(&matrix_mdev->vdev, &mdev->dev, - &vfio_ap_matrix_dev_ops); - - matrix_mdev->mdev = mdev; + matrix_mdev->mdev = to_mdev_device(vdev->dev); vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->matrix); matrix_mdev->pqap_hook = handle_pqap; vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->shadow_apcb); hash_init(matrix_mdev->qtable.queues); + return 0; +} + +static int vfio_ap_mdev_probe(struct mdev_device *mdev) +{ + struct ap_matrix_mdev *matrix_mdev; + int ret; + + matrix_mdev = vfio_alloc_device(ap_matrix_mdev, vdev, &mdev->dev, + &vfio_ap_matrix_dev_ops); + if (IS_ERR(matrix_mdev)) + return PTR_ERR(matrix_mdev); + ret = vfio_register_emulated_iommu_dev(&matrix_mdev->vdev); if (ret) - goto err_list; + goto err_put_vdev; dev_set_drvdata(&mdev->dev, matrix_mdev); mutex_lock(&matrix_dev->mdevs_lock); list_add(&matrix_mdev->node, 
&matrix_dev->mdev_list); mutex_unlock(&matrix_dev->mdevs_lock); return 0; -err_list: - vfio_uninit_group_dev(&matrix_mdev->vdev); - kfree(matrix_mdev); -err_dec_available: - atomic_inc(&matrix_dev->available_instances); +err_put_vdev: + vfio_put_device(&matrix_mdev->vdev); return ret; } @@ -766,6 +768,12 @@ static void vfio_ap_mdev_unlink_fr_queues(struct ap_matrix_mdev *matrix_mdev) } } +static void vfio_ap_mdev_release_dev(struct vfio_device *vdev) +{ + atomic_inc(&matrix_dev->available_instances); + vfio_free_device(vdev); +} + static void vfio_ap_mdev_remove(struct mdev_device *mdev) { struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(&mdev->dev); @@ -779,9 +787,7 @@ static void vfio_ap_mdev_remove(struct mdev_device *mdev) list_del(&matrix_mdev->node); mutex_unlock(&matrix_dev->mdevs_lock); mutex_unlock(&matrix_dev->guests_lock); - vfio_uninit_group_dev(&matrix_mdev->vdev); - kfree(matrix_mdev); - atomic_inc(&matrix_dev->available_instances); + vfio_put_device(&matrix_mdev->vdev); } static ssize_t name_show(struct mdev_type *mtype, @@ -1794,6 +1800,8 @@ static const struct attribute_group vfio_queue_attr_group = { }; static const struct vfio_device_ops vfio_ap_matrix_dev_ops = { + .init = vfio_ap_mdev_init_dev, + .release = vfio_ap_mdev_release_dev, .open_device = vfio_ap_mdev_open_device, .close_device = vfio_ap_mdev_close_device, .ioctl = vfio_ap_mdev_ioctl, -- GitLab From 7566692c571dced7208b7cc26c1d3b898a233487 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:56 +0800 Subject: [PATCH 0423/2223] vfio/fsl-mc: Use the new device life cycle helpers Also add a comment to mark that vfio core releases device_set if @init fails. 
Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-11-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/fsl-mc/vfio_fsl_mc.c | 85 ++++++++++++++++++------------- 1 file changed, 49 insertions(+), 36 deletions(-) diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index 42b344bd7cd5b..b16874e913e4f 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -418,16 +418,7 @@ static int vfio_fsl_mc_mmap(struct vfio_device *core_vdev, return vfio_fsl_mc_mmap_mmio(vdev->regions[index], vma); } -static const struct vfio_device_ops vfio_fsl_mc_ops = { - .name = "vfio-fsl-mc", - .open_device = vfio_fsl_mc_open_device, - .close_device = vfio_fsl_mc_close_device, - .ioctl = vfio_fsl_mc_ioctl, - .read = vfio_fsl_mc_read, - .write = vfio_fsl_mc_write, - .mmap = vfio_fsl_mc_mmap, -}; - +static const struct vfio_device_ops vfio_fsl_mc_ops; static int vfio_fsl_mc_bus_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -518,35 +509,43 @@ static void vfio_fsl_uninit_device(struct vfio_fsl_mc_device *vdev) bus_unregister_notifier(&fsl_mc_bus_type, &vdev->nb); } -static int vfio_fsl_mc_probe(struct fsl_mc_device *mc_dev) +static int vfio_fsl_mc_init_dev(struct vfio_device *core_vdev) { - struct vfio_fsl_mc_device *vdev; - struct device *dev = &mc_dev->dev; + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); + struct fsl_mc_device *mc_dev = to_fsl_mc_device(core_vdev->dev); int ret; - vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); - if (!vdev) - return -ENOMEM; - - vfio_init_group_dev(&vdev->vdev, dev, &vfio_fsl_mc_ops); vdev->mc_dev = mc_dev; mutex_init(&vdev->igate); if (is_fsl_mc_bus_dprc(mc_dev)) - ret = vfio_assign_device_set(&vdev->vdev, &mc_dev->dev); + ret = vfio_assign_device_set(core_vdev, &mc_dev->dev); else - ret = vfio_assign_device_set(&vdev->vdev, 
mc_dev->dev.parent); - if (ret) - goto out_uninit; + ret = vfio_assign_device_set(core_vdev, mc_dev->dev.parent); - ret = vfio_fsl_mc_init_device(vdev); if (ret) - goto out_uninit; + return ret; + + /* device_set is released by vfio core if @init fails */ + return vfio_fsl_mc_init_device(vdev); +} + +static int vfio_fsl_mc_probe(struct fsl_mc_device *mc_dev) +{ + struct vfio_fsl_mc_device *vdev; + struct device *dev = &mc_dev->dev; + int ret; + + vdev = vfio_alloc_device(vfio_fsl_mc_device, vdev, dev, + &vfio_fsl_mc_ops); + if (IS_ERR(vdev)) + return PTR_ERR(vdev); ret = vfio_register_group_dev(&vdev->vdev); if (ret) { dev_err(dev, "VFIO_FSL_MC: Failed to add to vfio group\n"); - goto out_device; + goto out_put_vdev; } ret = vfio_fsl_mc_scan_container(mc_dev); @@ -557,30 +556,44 @@ static int vfio_fsl_mc_probe(struct fsl_mc_device *mc_dev) out_group_dev: vfio_unregister_group_dev(&vdev->vdev); -out_device: - vfio_fsl_uninit_device(vdev); -out_uninit: - vfio_uninit_group_dev(&vdev->vdev); - kfree(vdev); +out_put_vdev: + vfio_put_device(&vdev->vdev); return ret; } +static void vfio_fsl_mc_release_dev(struct vfio_device *core_vdev) +{ + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); + + vfio_fsl_uninit_device(vdev); + mutex_destroy(&vdev->igate); + vfio_free_device(core_vdev); +} + static int vfio_fsl_mc_remove(struct fsl_mc_device *mc_dev) { struct device *dev = &mc_dev->dev; struct vfio_fsl_mc_device *vdev = dev_get_drvdata(dev); vfio_unregister_group_dev(&vdev->vdev); - mutex_destroy(&vdev->igate); - dprc_remove_devices(mc_dev, NULL, 0); - vfio_fsl_uninit_device(vdev); - - vfio_uninit_group_dev(&vdev->vdev); - kfree(vdev); + vfio_put_device(&vdev->vdev); return 0; } +static const struct vfio_device_ops vfio_fsl_mc_ops = { + .name = "vfio-fsl-mc", + .init = vfio_fsl_mc_init_dev, + .release = vfio_fsl_mc_release_dev, + .open_device = vfio_fsl_mc_open_device, + .close_device = vfio_fsl_mc_close_device, + .ioctl = 
vfio_fsl_mc_ioctl, + .read = vfio_fsl_mc_read, + .write = vfio_fsl_mc_write, + .mmap = vfio_fsl_mc_mmap, +}; + static struct fsl_mc_driver vfio_fsl_mc_driver = { .probe = vfio_fsl_mc_probe, .remove = vfio_fsl_mc_remove, -- GitLab From 5f6c7e0831a1f1faffad43bb8dbc260b49f2d3dc Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 21 Sep 2022 18:43:57 +0800 Subject: [PATCH 0424/2223] vfio/platform: Use the new device life cycle helpers Move vfio_device_ops from platform core to platform drivers so device specific init/cleanup can be added. Introduce two new helpers vfio_platform_init/release_common() for the use in driver @init/@release. vfio_platform_probe/remove_common() will be deprecated. Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Eric Auger Tested-by: Eric Auger Link: https://lore.kernel.org/r/20220921104401.38898-12-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_platform.c | 66 +++++++++++++++---- drivers/vfio/platform/vfio_platform_common.c | 53 ++++++++++++--- drivers/vfio/platform/vfio_platform_private.h | 15 +++++ 3 files changed, 111 insertions(+), 23 deletions(-) diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 04f40c5acfd67..82cedcebfd902 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "vfio_platform_private.h" @@ -36,14 +37,11 @@ static int get_platform_irq(struct vfio_platform_device *vdev, int i) return platform_get_irq_optional(pdev, i); } -static int vfio_platform_probe(struct platform_device *pdev) +static int vfio_platform_init_dev(struct vfio_device *core_vdev) { - struct vfio_platform_device *vdev; - int ret; - - vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); - if (!vdev) - return -ENOMEM; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + struct platform_device *pdev = 
to_platform_device(core_vdev->dev); vdev->opaque = (void *) pdev; vdev->name = pdev->name; @@ -52,24 +50,64 @@ static int vfio_platform_probe(struct platform_device *pdev) vdev->get_irq = get_platform_irq; vdev->reset_required = reset_required; - ret = vfio_platform_probe_common(vdev, &pdev->dev); - if (ret) { - kfree(vdev); - return ret; - } + return vfio_platform_init_common(vdev); +} + +static const struct vfio_device_ops vfio_platform_ops; +static int vfio_platform_probe(struct platform_device *pdev) +{ + struct vfio_platform_device *vdev; + int ret; + + vdev = vfio_alloc_device(vfio_platform_device, vdev, &pdev->dev, + &vfio_platform_ops); + if (IS_ERR(vdev)) + return PTR_ERR(vdev); + + ret = vfio_register_group_dev(&vdev->vdev); + if (ret) + goto out_put_vdev; + + pm_runtime_enable(&pdev->dev); dev_set_drvdata(&pdev->dev, vdev); return 0; + +out_put_vdev: + vfio_put_device(&vdev->vdev); + return ret; +} + +static void vfio_platform_release_dev(struct vfio_device *core_vdev) +{ + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + + vfio_platform_release_common(vdev); + vfio_free_device(core_vdev); } static int vfio_platform_remove(struct platform_device *pdev) { struct vfio_platform_device *vdev = dev_get_drvdata(&pdev->dev); - vfio_platform_remove_common(vdev); - kfree(vdev); + vfio_unregister_group_dev(&vdev->vdev); + pm_runtime_disable(vdev->device); + vfio_put_device(&vdev->vdev); return 0; } +static const struct vfio_device_ops vfio_platform_ops = { + .name = "vfio-platform", + .init = vfio_platform_init_dev, + .release = vfio_platform_release_dev, + .open_device = vfio_platform_open_device, + .close_device = vfio_platform_close_device, + .ioctl = vfio_platform_ioctl, + .read = vfio_platform_read, + .write = vfio_platform_write, + .mmap = vfio_platform_mmap, +}; + static struct platform_driver vfio_platform_driver = { .probe = vfio_platform_probe, .remove = vfio_platform_remove, diff --git 
a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 256f55b84e70a..4c01bf0adebba 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -218,7 +218,7 @@ static int vfio_platform_call_reset(struct vfio_platform_device *vdev, return -EINVAL; } -static void vfio_platform_close_device(struct vfio_device *core_vdev) +void vfio_platform_close_device(struct vfio_device *core_vdev) { struct vfio_platform_device *vdev = container_of(core_vdev, struct vfio_platform_device, vdev); @@ -236,8 +236,9 @@ static void vfio_platform_close_device(struct vfio_device *core_vdev) vfio_platform_regions_cleanup(vdev); vfio_platform_irq_cleanup(vdev); } +EXPORT_SYMBOL_GPL(vfio_platform_close_device); -static int vfio_platform_open_device(struct vfio_device *core_vdev) +int vfio_platform_open_device(struct vfio_device *core_vdev) { struct vfio_platform_device *vdev = container_of(core_vdev, struct vfio_platform_device, vdev); @@ -273,9 +274,10 @@ err_irq: vfio_platform_regions_cleanup(vdev); return ret; } +EXPORT_SYMBOL_GPL(vfio_platform_open_device); -static long vfio_platform_ioctl(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg) +long vfio_platform_ioctl(struct vfio_device *core_vdev, + unsigned int cmd, unsigned long arg) { struct vfio_platform_device *vdev = container_of(core_vdev, struct vfio_platform_device, vdev); @@ -382,6 +384,7 @@ static long vfio_platform_ioctl(struct vfio_device *core_vdev, return -ENOTTY; } +EXPORT_SYMBOL_GPL(vfio_platform_ioctl); static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg, char __user *buf, size_t count, @@ -438,8 +441,8 @@ err: return -EFAULT; } -static ssize_t vfio_platform_read(struct vfio_device *core_vdev, - char __user *buf, size_t count, loff_t *ppos) +ssize_t vfio_platform_read(struct vfio_device *core_vdev, + char __user *buf, size_t count, loff_t *ppos) { struct vfio_platform_device *vdev = 
container_of(core_vdev, struct vfio_platform_device, vdev); @@ -460,6 +463,7 @@ static ssize_t vfio_platform_read(struct vfio_device *core_vdev, return -EINVAL; } +EXPORT_SYMBOL_GPL(vfio_platform_read); static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg, const char __user *buf, size_t count, @@ -515,8 +519,8 @@ err: return -EFAULT; } -static ssize_t vfio_platform_write(struct vfio_device *core_vdev, const char __user *buf, - size_t count, loff_t *ppos) +ssize_t vfio_platform_write(struct vfio_device *core_vdev, const char __user *buf, + size_t count, loff_t *ppos) { struct vfio_platform_device *vdev = container_of(core_vdev, struct vfio_platform_device, vdev); @@ -537,6 +541,7 @@ static ssize_t vfio_platform_write(struct vfio_device *core_vdev, const char __u return -EINVAL; } +EXPORT_SYMBOL_GPL(vfio_platform_write); static int vfio_platform_mmap_mmio(struct vfio_platform_region region, struct vm_area_struct *vma) @@ -558,7 +563,7 @@ static int vfio_platform_mmap_mmio(struct vfio_platform_region region, req_len, vma->vm_page_prot); } -static int vfio_platform_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) +int vfio_platform_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { struct vfio_platform_device *vdev = container_of(core_vdev, struct vfio_platform_device, vdev); @@ -598,6 +603,7 @@ static int vfio_platform_mmap(struct vfio_device *core_vdev, struct vm_area_stru return -EINVAL; } +EXPORT_SYMBOL_GPL(vfio_platform_mmap); static const struct vfio_device_ops vfio_platform_ops = { .name = "vfio-platform", @@ -639,6 +645,35 @@ static int vfio_platform_of_probe(struct vfio_platform_device *vdev, * If the firmware is ACPI type, then acpi_disabled is 0. All other checks are * valid checks. We cannot claim that this system is DT. 
*/ +int vfio_platform_init_common(struct vfio_platform_device *vdev) +{ + int ret; + struct device *dev = vdev->vdev.dev; + + ret = vfio_platform_acpi_probe(vdev, dev); + if (ret) + ret = vfio_platform_of_probe(vdev, dev); + + if (ret) + return ret; + + vdev->device = dev; + mutex_init(&vdev->igate); + + ret = vfio_platform_get_reset(vdev); + if (ret && vdev->reset_required) + dev_err(dev, "No reset function found for device %s\n", + vdev->name); + return ret; +} +EXPORT_SYMBOL_GPL(vfio_platform_init_common); + +void vfio_platform_release_common(struct vfio_platform_device *vdev) +{ + vfio_platform_put_reset(vdev); +} +EXPORT_SYMBOL_GPL(vfio_platform_release_common); + int vfio_platform_probe_common(struct vfio_platform_device *vdev, struct device *dev) { diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index 691b43f4b2b29..a769d649fb97d 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -81,6 +81,21 @@ struct vfio_platform_reset_node { int vfio_platform_probe_common(struct vfio_platform_device *vdev, struct device *dev); void vfio_platform_remove_common(struct vfio_platform_device *vdev); +int vfio_platform_init_common(struct vfio_platform_device *vdev); +void vfio_platform_release_common(struct vfio_platform_device *vdev); + +int vfio_platform_open_device(struct vfio_device *core_vdev); +void vfio_platform_close_device(struct vfio_device *core_vdev); +long vfio_platform_ioctl(struct vfio_device *core_vdev, + unsigned int cmd, unsigned long arg); +ssize_t vfio_platform_read(struct vfio_device *core_vdev, + char __user *buf, size_t count, + loff_t *ppos); +ssize_t vfio_platform_write(struct vfio_device *core_vdev, + const char __user *buf, + size_t count, loff_t *ppos); +int vfio_platform_mmap(struct vfio_device *core_vdev, + struct vm_area_struct *vma); int vfio_platform_irq_init(struct vfio_platform_device *vdev); void 
vfio_platform_irq_cleanup(struct vfio_platform_device *vdev); -- GitLab From ac1237912fbd0f2503344aa268ceb43628cdffa8 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 21 Sep 2022 18:43:58 +0800 Subject: [PATCH 0425/2223] vfio/amba: Use the new device life cycle helpers Implement amba's own vfio_device_ops. Remove vfio_platform_probe/remove_common() given no user now. Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20220921104401.38898-13-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_amba.c | 72 ++++++++++++++----- drivers/vfio/platform/vfio_platform_common.c | 60 ---------------- drivers/vfio/platform/vfio_platform_private.h | 3 - 3 files changed, 55 insertions(+), 80 deletions(-) diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index 1aaa4f721bd2c..eaea63e5294c5 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "vfio_platform_private.h" @@ -40,20 +41,16 @@ static int get_amba_irq(struct vfio_platform_device *vdev, int i) return ret ? 
ret : -ENXIO; } -static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id) +static int vfio_amba_init_dev(struct vfio_device *core_vdev) { - struct vfio_platform_device *vdev; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + struct amba_device *adev = to_amba_device(core_vdev->dev); int ret; - vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); - if (!vdev) - return -ENOMEM; - vdev->name = kasprintf(GFP_KERNEL, "vfio-amba-%08x", adev->periphid); - if (!vdev->name) { - kfree(vdev); + if (!vdev->name) return -ENOMEM; - } vdev->opaque = (void *) adev; vdev->flags = VFIO_DEVICE_FLAGS_AMBA; @@ -61,26 +58,67 @@ static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id) vdev->get_irq = get_amba_irq; vdev->reset_required = false; - ret = vfio_platform_probe_common(vdev, &adev->dev); - if (ret) { + ret = vfio_platform_init_common(vdev); + if (ret) kfree(vdev->name); - kfree(vdev); - return ret; - } + return ret; +} + +static const struct vfio_device_ops vfio_amba_ops; +static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id) +{ + struct vfio_platform_device *vdev; + int ret; + + vdev = vfio_alloc_device(vfio_platform_device, vdev, &adev->dev, + &vfio_amba_ops); + if (IS_ERR(vdev)) + return PTR_ERR(vdev); + ret = vfio_register_group_dev(&vdev->vdev); + if (ret) + goto out_put_vdev; + + pm_runtime_enable(&adev->dev); dev_set_drvdata(&adev->dev, vdev); return 0; + +out_put_vdev: + vfio_put_device(&vdev->vdev); + return ret; +} + +static void vfio_amba_release_dev(struct vfio_device *core_vdev) +{ + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + + vfio_platform_release_common(vdev); + kfree(vdev->name); + vfio_free_device(core_vdev); } static void vfio_amba_remove(struct amba_device *adev) { struct vfio_platform_device *vdev = dev_get_drvdata(&adev->dev); - vfio_platform_remove_common(vdev); - kfree(vdev->name); - 
kfree(vdev); + vfio_unregister_group_dev(&vdev->vdev); + pm_runtime_disable(vdev->device); + vfio_put_device(&vdev->vdev); } +static const struct vfio_device_ops vfio_amba_ops = { + .name = "vfio-amba", + .init = vfio_amba_init_dev, + .release = vfio_amba_release_dev, + .open_device = vfio_platform_open_device, + .close_device = vfio_platform_close_device, + .ioctl = vfio_platform_ioctl, + .read = vfio_platform_read, + .write = vfio_platform_write, + .mmap = vfio_platform_mmap, +}; + static const struct amba_id pl330_ids[] = { { 0, 0 }, }; diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 4c01bf0adebba..55dc4f43c31e3 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -605,16 +605,6 @@ int vfio_platform_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma } EXPORT_SYMBOL_GPL(vfio_platform_mmap); -static const struct vfio_device_ops vfio_platform_ops = { - .name = "vfio-platform", - .open_device = vfio_platform_open_device, - .close_device = vfio_platform_close_device, - .ioctl = vfio_platform_ioctl, - .read = vfio_platform_read, - .write = vfio_platform_write, - .mmap = vfio_platform_mmap, -}; - static int vfio_platform_of_probe(struct vfio_platform_device *vdev, struct device *dev) { @@ -674,56 +664,6 @@ void vfio_platform_release_common(struct vfio_platform_device *vdev) } EXPORT_SYMBOL_GPL(vfio_platform_release_common); -int vfio_platform_probe_common(struct vfio_platform_device *vdev, - struct device *dev) -{ - int ret; - - vfio_init_group_dev(&vdev->vdev, dev, &vfio_platform_ops); - - ret = vfio_platform_acpi_probe(vdev, dev); - if (ret) - ret = vfio_platform_of_probe(vdev, dev); - - if (ret) - goto out_uninit; - - vdev->device = dev; - - ret = vfio_platform_get_reset(vdev); - if (ret && vdev->reset_required) { - dev_err(dev, "No reset function found for device %s\n", - vdev->name); - goto out_uninit; - } - - ret = 
vfio_register_group_dev(&vdev->vdev); - if (ret) - goto put_reset; - - mutex_init(&vdev->igate); - - pm_runtime_enable(dev); - return 0; - -put_reset: - vfio_platform_put_reset(vdev); -out_uninit: - vfio_uninit_group_dev(&vdev->vdev); - return ret; -} -EXPORT_SYMBOL_GPL(vfio_platform_probe_common); - -void vfio_platform_remove_common(struct vfio_platform_device *vdev) -{ - vfio_unregister_group_dev(&vdev->vdev); - - pm_runtime_disable(vdev->device); - vfio_platform_put_reset(vdev); - vfio_uninit_group_dev(&vdev->vdev); -} -EXPORT_SYMBOL_GPL(vfio_platform_remove_common); - void __vfio_platform_register_reset(struct vfio_platform_reset_node *node) { mutex_lock(&driver_lock); diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index a769d649fb97d..8d8fab5168490 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -78,9 +78,6 @@ struct vfio_platform_reset_node { vfio_platform_reset_fn_t of_reset; }; -int vfio_platform_probe_common(struct vfio_platform_device *vdev, - struct device *dev); -void vfio_platform_remove_common(struct vfio_platform_device *vdev); int vfio_platform_init_common(struct vfio_platform_device *vdev); void vfio_platform_release_common(struct vfio_platform_device *vdev); -- GitLab From ebb72b765fb49685c4603d2bff47a4ab5d2580a9 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 21 Sep 2022 18:43:59 +0800 Subject: [PATCH 0426/2223] vfio/ccw: Use the new device life cycle helpers ccw is the only exception which cannot use vfio_alloc_device() because its private device structure is designed to serve both mdev and parent. Life cycle of the parent is managed by css_driver so vfio_ccw_private must be allocated/freed in css_driver probe/remove path instead of conforming to vfio core life cycle for mdev. 
Given that use a wait/completion scheme so the mdev remove path waits after vfio_put_device() until receiving a completion notification from @release. The completion indicates that all active references on vfio_device have been released. After that point although free of vfio_ccw_private is delayed to css_driver it's at least guaranteed to have no parallel reference on released vfio device part from other code paths. memset() in @probe is removed. vfio_device is either already cleared when probed for the first time or cleared in @release from last probe. The right fix is to introduce separate structures for mdev and parent, but this won't happen in short term per prior discussions. Remove vfio_init/uninit_group_dev() as no user now. Suggested-by: Jason Gunthorpe Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220921104401.38898-14-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_ops.c | 52 +++++++++++++++++++++++++---- drivers/s390/cio/vfio_ccw_private.h | 3 ++ drivers/vfio/vfio_main.c | 23 +++---------- include/linux/vfio.h | 3 -- 4 files changed, 53 insertions(+), 28 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 4a806a2273b54..9f8486c0d3d37 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -87,6 +87,15 @@ static struct attribute_group *mdev_type_groups[] = { NULL, }; +static int vfio_ccw_mdev_init_dev(struct vfio_device *vdev) +{ + struct vfio_ccw_private *private = + container_of(vdev, struct vfio_ccw_private, vdev); + + init_completion(&private->release_comp); + return 0; +} + static int vfio_ccw_mdev_probe(struct mdev_device *mdev) { struct vfio_ccw_private *private = dev_get_drvdata(mdev->dev.parent); @@ -98,9 +107,9 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) if (atomic_dec_if_positive(&private->avail) < 0) return -EPERM; - memset(&private->vdev, 0, 
sizeof(private->vdev)); - vfio_init_group_dev(&private->vdev, &mdev->dev, - &vfio_ccw_dev_ops); + ret = vfio_init_device(&private->vdev, &mdev->dev, &vfio_ccw_dev_ops); + if (ret) + return ret; VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: create\n", private->sch->schid.cssid, @@ -109,16 +118,33 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) ret = vfio_register_emulated_iommu_dev(&private->vdev); if (ret) - goto err_atomic; + goto err_put_vdev; dev_set_drvdata(&mdev->dev, private); return 0; -err_atomic: - vfio_uninit_group_dev(&private->vdev); +err_put_vdev: + vfio_put_device(&private->vdev); atomic_inc(&private->avail); return ret; } +static void vfio_ccw_mdev_release_dev(struct vfio_device *vdev) +{ + struct vfio_ccw_private *private = + container_of(vdev, struct vfio_ccw_private, vdev); + + /* + * We cannot free vfio_ccw_private here because it includes + * parent info which must be free'ed by css driver. + * + * Use a workaround by memset'ing the core device part and + * then notifying the remove path that all active references + * to this device have been released. + */ + memset(vdev, 0, sizeof(*vdev)); + complete(&private->release_comp); +} + static void vfio_ccw_mdev_remove(struct mdev_device *mdev) { struct vfio_ccw_private *private = dev_get_drvdata(mdev->dev.parent); @@ -130,7 +156,17 @@ static void vfio_ccw_mdev_remove(struct mdev_device *mdev) vfio_unregister_group_dev(&private->vdev); - vfio_uninit_group_dev(&private->vdev); + vfio_put_device(&private->vdev); + /* + * Wait for all active references on mdev are released so it + * is safe to defer kfree() to a later point. + * + * TODO: the clean fix is to split parent/mdev info from ccw + * private structure so each can be managed in its own life + * cycle. 
+ */ + wait_for_completion(&private->release_comp); + atomic_inc(&private->avail); } @@ -592,6 +628,8 @@ static void vfio_ccw_mdev_request(struct vfio_device *vdev, unsigned int count) } static const struct vfio_device_ops vfio_ccw_dev_ops = { + .init = vfio_ccw_mdev_init_dev, + .release = vfio_ccw_mdev_release_dev, .open_device = vfio_ccw_mdev_open_device, .close_device = vfio_ccw_mdev_close_device, .read = vfio_ccw_mdev_read, diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index cd24b7fada91c..63d9202b29c7f 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -88,6 +88,7 @@ struct vfio_ccw_crw { * @req_trigger: eventfd ctx for signaling userspace to return device * @io_work: work for deferral process of I/O handling * @crw_work: work for deferral process of CRW handling + * @release_comp: synchronization helper for vfio device release */ struct vfio_ccw_private { struct vfio_device vdev; @@ -113,6 +114,8 @@ struct vfio_ccw_private { struct eventfd_ctx *req_trigger; struct work_struct io_work; struct work_struct crw_work; + + struct completion release_comp; } __aligned(8); int vfio_ccw_sch_quiesce(struct subchannel *sch); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index b9c6a97d647a7..12952858d9039 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -483,28 +483,13 @@ static struct vfio_device *vfio_group_get_device(struct vfio_group *group, /* * VFIO driver API */ -void vfio_init_group_dev(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops) -{ - init_completion(&device->comp); - device->dev = dev; - device->ops = ops; -} -EXPORT_SYMBOL_GPL(vfio_init_group_dev); - -void vfio_uninit_group_dev(struct vfio_device *device) -{ - vfio_release_device_set(device); -} -EXPORT_SYMBOL_GPL(vfio_uninit_group_dev); - /* Release helper called by vfio_put_device() */ void vfio_device_release(struct kref *kref) { struct 
vfio_device *device = container_of(kref, struct vfio_device, kref); - vfio_uninit_group_dev(device); + vfio_release_device_set(device); /* * kvfree() cannot be done here due to a life cycle mess in @@ -562,7 +547,9 @@ int vfio_init_device(struct vfio_device *device, struct device *dev, { int ret; - vfio_init_group_dev(device, dev, ops); + init_completion(&device->comp); + device->dev = dev; + device->ops = ops; if (ops->init) { ret = ops->init(device); @@ -574,7 +561,7 @@ int vfio_init_device(struct vfio_device *device, struct device *dev, return 0; out_uninit: - vfio_uninit_group_dev(device); + vfio_release_device_set(device); return ret; } EXPORT_SYMBOL_GPL(vfio_init_device); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index f67cac700e6f9..3cf857b1eec71 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -184,9 +184,6 @@ static inline void vfio_put_device(struct vfio_device *device) kref_put(&device->kref, vfio_device_release); } -void vfio_init_group_dev(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops); -void vfio_uninit_group_dev(struct vfio_device *device); int vfio_register_group_dev(struct vfio_device *device); int vfio_register_emulated_iommu_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); -- GitLab From 4a725b8de4cc5e88c00f7607d9ba0e97151251e5 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 21 Sep 2022 18:44:00 +0800 Subject: [PATCH 0427/2223] vfio: Rename vfio_device_put() and vfio_device_try_get() With the addition of vfio_put_device() now the names become confusing. vfio_put_device() is clear from object life cycle p.o.v given kref. vfio_device_put()/vfio_device_try_get() are helpers for tracking users on a registered device. 
Now rename them: - vfio_device_put() -> vfio_device_put_registration() - vfio_device_try_get() -> vfio_device_try_get_registration() Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20220921104401.38898-15-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 12952858d9039..c27449613a1d2 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -453,13 +453,13 @@ static void vfio_group_get(struct vfio_group *group) * Device objects - create, release, get, put, search */ /* Device reference always implies a group reference */ -static void vfio_device_put(struct vfio_device *device) +static void vfio_device_put_registration(struct vfio_device *device) { if (refcount_dec_and_test(&device->refcount)) complete(&device->comp); } -static bool vfio_device_try_get(struct vfio_device *device) +static bool vfio_device_try_get_registration(struct vfio_device *device) { return refcount_inc_not_zero(&device->refcount); } @@ -471,7 +471,8 @@ static struct vfio_device *vfio_group_get_device(struct vfio_group *group, mutex_lock(&group->device_lock); list_for_each_entry(device, &group->device_list, group_next) { - if (device->dev == dev && vfio_device_try_get(device)) { + if (device->dev == dev && + vfio_device_try_get_registration(device)) { mutex_unlock(&group->device_lock); return device; } @@ -673,7 +674,7 @@ static int __vfio_register_dev(struct vfio_device *device, if (existing_device) { dev_WARN(device->dev, "Device already exists on group %d\n", iommu_group_id(group->iommu_group)); - vfio_device_put(existing_device); + vfio_device_put_registration(existing_device); if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) iommu_group_remove_device(device->dev); @@ -731,7 +732,7 @@ static struct vfio_device 
*vfio_device_get_from_name(struct vfio_group *group, ret = !strcmp(dev_name(it->dev), buf); } - if (ret && vfio_device_try_get(it)) { + if (ret && vfio_device_try_get_registration(it)) { device = it; break; } @@ -751,7 +752,7 @@ void vfio_unregister_group_dev(struct vfio_device *device) bool interrupted = false; long rc; - vfio_device_put(device); + vfio_device_put_registration(device); rc = try_wait_for_completion(&device->comp); while (rc <= 0) { if (device->ops->request) @@ -1311,7 +1312,7 @@ static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, err_put_fdno: put_unused_fd(fdno); err_put_device: - vfio_device_put(device); + vfio_device_put_registration(device); return ret; } @@ -1493,7 +1494,7 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) vfio_device_unassign_container(device); - vfio_device_put(device); + vfio_device_put_registration(device); return 0; } -- GitLab From 3c28a76124b25882411f005924be73795b6ef078 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:44:01 +0800 Subject: [PATCH 0428/2223] vfio: Add struct device to vfio_device and replace kref. With it a 'vfio-dev/vfioX' node is created under the sysfs path of the parent, indicating the device is bound to a vfio driver, e.g.: /sys/devices/pci0000\:6f/0000\:6f\:01.0/vfio-dev/vfio0 It is also a preparatory step toward adding cdev for supporting future device-oriented uAPI. Add Documentation/ABI/testing/sysfs-devices-vfio-dev. 
Suggested-by: Jason Gunthorpe Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-16-kevin.tian@intel.com Signed-off-by: Alex Williamson --- .../ABI/testing/sysfs-devices-vfio-dev | 8 +++ MAINTAINERS | 1 + drivers/vfio/vfio_main.c | 64 +++++++++++++++---- include/linux/vfio.h | 6 +- 4 files changed, 65 insertions(+), 14 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-devices-vfio-dev diff --git a/Documentation/ABI/testing/sysfs-devices-vfio-dev b/Documentation/ABI/testing/sysfs-devices-vfio-dev new file mode 100644 index 0000000000000..e21424fd96662 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-vfio-dev @@ -0,0 +1,8 @@ +What: /sys/...//vfio-dev/vfioX/ +Date: September 2022 +Contact: Yi Liu +Description: + This directory is created when the device is bound to a + vfio driver. The layout under this directory matches what + exists for a standard 'struct device'. 'X' is a unique + index marking this device in vfio. 
diff --git a/MAINTAINERS b/MAINTAINERS index d30f26e07cd39..02c8f11b1c17a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21312,6 +21312,7 @@ R: Cornelia Huck L: kvm@vger.kernel.org S: Maintained T: git git://github.com/awilliam/linux-vfio.git +F: Documentation/ABI/testing/sysfs-devices-vfio-dev F: Documentation/driver-api/vfio.rst F: drivers/vfio/ F: include/linux/vfio.h diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index c27449613a1d2..f9d10dbcf3e61 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -49,6 +49,8 @@ static struct vfio { struct mutex group_lock; /* locks group_list */ struct ida group_ida; dev_t group_devt; + struct class *device_class; + struct ida device_ida; } vfio; struct vfio_iommu_driver { @@ -485,12 +487,13 @@ static struct vfio_device *vfio_group_get_device(struct vfio_group *group, * VFIO driver API */ /* Release helper called by vfio_put_device() */ -void vfio_device_release(struct kref *kref) +static void vfio_device_release(struct device *dev) { struct vfio_device *device = - container_of(kref, struct vfio_device, kref); + container_of(dev, struct vfio_device, device); vfio_release_device_set(device); + ida_free(&vfio.device_ida, device->index); /* * kvfree() cannot be done here due to a life cycle mess in @@ -500,7 +503,6 @@ void vfio_device_release(struct kref *kref) */ device->ops->release(device); } -EXPORT_SYMBOL_GPL(vfio_device_release); /* * Allocate and initialize vfio_device so it can be registered to vfio @@ -548,6 +550,13 @@ int vfio_init_device(struct vfio_device *device, struct device *dev, { int ret; + ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL); + if (ret < 0) { + dev_dbg(dev, "Error to alloc index\n"); + return ret; + } + + device->index = ret; init_completion(&device->comp); device->dev = dev; device->ops = ops; @@ -558,11 +567,15 @@ int vfio_init_device(struct vfio_device *device, struct device *dev, goto out_uninit; } - kref_init(&device->kref); + 
device_initialize(&device->device); + device->device.release = vfio_device_release; + device->device.class = vfio.device_class; + device->device.parent = device->dev; return 0; out_uninit: vfio_release_device_set(device); + ida_free(&vfio.device_ida, device->index); return ret; } EXPORT_SYMBOL_GPL(vfio_init_device); @@ -659,6 +672,7 @@ static int __vfio_register_dev(struct vfio_device *device, struct vfio_group *group) { struct vfio_device *existing_device; + int ret; if (IS_ERR(group)) return PTR_ERR(group); @@ -675,16 +689,21 @@ static int __vfio_register_dev(struct vfio_device *device, dev_WARN(device->dev, "Device already exists on group %d\n", iommu_group_id(group->iommu_group)); vfio_device_put_registration(existing_device); - if (group->type == VFIO_NO_IOMMU || - group->type == VFIO_EMULATED_IOMMU) - iommu_group_remove_device(device->dev); - vfio_group_put(group); - return -EBUSY; + ret = -EBUSY; + goto err_out; } /* Our reference on group is moved to the device */ device->group = group; + ret = dev_set_name(&device->device, "vfio%d", device->index); + if (ret) + goto err_out; + + ret = device_add(&device->device); + if (ret) + goto err_out; + /* Refcounting can't start until the driver calls register */ refcount_set(&device->refcount, 1); @@ -693,6 +712,12 @@ static int __vfio_register_dev(struct vfio_device *device, mutex_unlock(&group->device_lock); return 0; +err_out: + if (group->type == VFIO_NO_IOMMU || + group->type == VFIO_EMULATED_IOMMU) + iommu_group_remove_device(device->dev); + vfio_group_put(group); + return ret; } int vfio_register_group_dev(struct vfio_device *device) @@ -779,6 +804,9 @@ void vfio_unregister_group_dev(struct vfio_device *device) list_del(&device->group_next); mutex_unlock(&group->device_lock); + /* Balances device_add in register path */ + device_del(&device->device); + if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) iommu_group_remove_device(device->dev); @@ -2362,6 +2390,7 @@ static int __init 
vfio_init(void) int ret; ida_init(&vfio.group_ida); + ida_init(&vfio.device_ida); mutex_init(&vfio.group_lock); mutex_init(&vfio.iommu_drivers_lock); INIT_LIST_HEAD(&vfio.group_list); @@ -2377,11 +2406,18 @@ static int __init vfio_init(void) vfio.class = class_create(THIS_MODULE, "vfio"); if (IS_ERR(vfio.class)) { ret = PTR_ERR(vfio.class); - goto err_class; + goto err_group_class; } vfio.class->devnode = vfio_devnode; + /* /sys/class/vfio-dev/vfioX */ + vfio.device_class = class_create(THIS_MODULE, "vfio-dev"); + if (IS_ERR(vfio.device_class)) { + ret = PTR_ERR(vfio.device_class); + goto err_dev_class; + } + ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio"); if (ret) goto err_alloc_chrdev; @@ -2398,9 +2434,12 @@ static int __init vfio_init(void) err_driver_register: unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); err_alloc_chrdev: + class_destroy(vfio.device_class); + vfio.device_class = NULL; +err_dev_class: class_destroy(vfio.class); vfio.class = NULL; -err_class: +err_group_class: misc_deregister(&vfio_dev); return ret; } @@ -2412,8 +2451,11 @@ static void __exit vfio_cleanup(void) #ifdef CONFIG_VFIO_NOIOMMU vfio_unregister_iommu_driver(&vfio_noiommu_ops); #endif + ida_destroy(&vfio.device_ida); ida_destroy(&vfio.group_ida); unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); + class_destroy(vfio.device_class); + vfio.device_class = NULL; class_destroy(vfio.class); vfio.class = NULL; misc_deregister(&vfio_dev); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 3cf857b1eec71..ee399a768070d 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -47,7 +47,8 @@ struct vfio_device { struct kvm *kvm; /* Members below here are private, not for driver use */ - struct kref kref; /* object life cycle */ + unsigned int index; + struct device device; /* device.kref covers object life circle */ refcount_t refcount; /* user count on registered device*/ unsigned int open_count; struct completion comp; @@ -178,10 +179,9 
@@ struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, int vfio_init_device(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops); void vfio_free_device(struct vfio_device *device); -void vfio_device_release(struct kref *kref); static inline void vfio_put_device(struct vfio_device *device) { - kref_put(&device->kref, vfio_device_release); + put_device(&device->device); } int vfio_register_group_dev(struct vfio_device *device); -- GitLab From 0e32818397426a688f598f35d3bc762eca6d7592 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Wed, 21 Sep 2022 20:49:16 +0100 Subject: [PATCH 0429/2223] PCI: Sanitise firmware BAR assignments behind a PCI-PCI bridge When pci_assign_resource() is unable to assign resources to a BAR, it uses pci_revert_fw_address() to fall back to a firmware assignment (if any). Previously pci_revert_fw_address() assumed all addresses could reach the device, but this is not true if the device is below a bridge that only forwards addresses within its windows. 
This problem was observed on a Tyan Tomcat IV S1564D system where the BIOS did not assign valid addresses to several bridges and USB devices: pci 0000:00:11.0: PCI-to-PCIe bridge to [bus 01-ff] pci 0000:00:11.0: bridge window [io 0xe000-0xefff] pci 0000:01:00.0: PCIe Upstream Port to [bus 02-ff] pci 0000:01:00.0: bridge window [io 0x0000-0x0fff] # unreachable pci 0000:02:02.0: PCIe Downstream Port to [bus 05-ff] pci 0000:02:02.0: bridge window [io 0x0000-0x0fff] # unreachable pci 0000:05:00.0: PCIe-to-PCI bridge to [bus 06-ff] pci 0000:05:00.0: bridge window [io 0x0000-0x0fff] # unreachable pci 0000:06:08.0: USB UHCI 1.1 pci 0000:06:08.0: BAR 4: [io 0xfce0-0xfcff] # unreachable pci 0000:06:08.1: USB UHCI 1.1 pci 0000:06:08.1: BAR 4: [io 0xfce0-0xfcff] # unreachable pci 0000:06:08.0: can't claim BAR 4 [io 0xfce0-0xfcff]: no compatible bridge window pci 0000:06:08.1: can't claim BAR 4 [io 0xfce0-0xfcff]: no compatible bridge window During the first pass of assigning unassigned resources, there was not enough I/O space available, so we couldn't assign the 06:08.0 BAR and reverted to the firmware assignment (still unreachable). 
Reverting the 06:08.1 assignment failed because it conflicted with 06:08.0: pci 0000:00:11.0: bridge window [io 0xe000-0xefff] pci 0000:01:00.0: no space for bridge window [io size 0x2000] pci 0000:02:02.0: no space for bridge window [io size 0x1000] pci 0000:05:00.0: no space for bridge window [io size 0x1000] pci 0000:06:08.0: BAR 4: no space for [io size 0x0020] pci 0000:06:08.0: BAR 4: trying firmware assignment [io 0xfce0-0xfcff] pci 0000:06:08.1: BAR 4: no space for [io size 0x0020] pci 0000:06:08.1: BAR 4: trying firmware assignment [io 0xfce0-0xfcff] pci 0000:06:08.1: BAR 4: [io 0xfce0-0xfcff] conflicts with 0000:06:08.0 [io 0xfce0-0xfcff] A subsequent pass assigned valid bridge windows and a valid 06:08.1 BAR, but left the 06:08.0 BAR alone, so the UHCI device was still unusable: pci 0000:00:11.0: bridge window [io 0xe000-0xefff] released pci 0000:00:11.0: bridge window [io 0x1000-0x2fff] # reassigned pci 0000:01:00.0: bridge window [io 0x1000-0x2fff] # reassigned pci 0000:02:02.0: bridge window [io 0x2000-0x2fff] # reassigned pci 0000:05:00.0: bridge window [io 0x2000-0x2fff] # reassigned pci 0000:06:08.0: BAR 4: assigned [io 0xfce0-0xfcff] # left alone pci 0000:06:08.1: BAR 4: assigned [io 0x2000-0x201f] ... uhci_hcd 0000:06:08.0: host system error, PCI problems? uhci_hcd 0000:06:08.0: host controller process error, something bad happened! uhci_hcd 0000:06:08.0: host controller halted, very bad! uhci_hcd 0000:06:08.0: HCRESET not completed yet! uhci_hcd 0000:06:08.0: HC died; cleaning up If the address assigned by firmware is not reachable because it's not within upstream bridge windows, fail instead of assigning the unusable address from firmware. 
[bhelgaas: commit log, use pci_upstream_bridge()] Link: https://bugzilla.kernel.org/show_bug.cgi?id=16263 Link: https://lore.kernel.org/r/alpine.DEB.2.21.2203012338460.46819@angie.orcam.me.uk Link: https://lore.kernel.org/r/alpine.DEB.2.21.2209211921250.29493@angie.orcam.me.uk Fixes: 58c84eda0756 ("PCI: fall back to original BIOS BAR addresses") Signed-off-by: Maciej W. Rozycki Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # v2.6.35+ --- drivers/pci/setup-res.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index 439ac5f5907a6..b492e67c3d871 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -214,6 +214,17 @@ static int pci_revert_fw_address(struct resource *res, struct pci_dev *dev, root = pci_find_parent_resource(dev, res); if (!root) { + /* + * If dev is behind a bridge, accesses will only reach it + * if res is inside the relevant bridge window. + */ + if (pci_upstream_bridge(dev)) + return -ENXIO; + + /* + * On the root bus, assume the host bridge will forward + * everything. + */ if (res->flags & IORESOURCE_IO) root = &ioport_resource; else -- GitLab From ec7174f637d75abe5ada8482b9947898db231cd2 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Thu, 22 Sep 2022 19:19:24 +0800 Subject: [PATCH 0430/2223] ipmi: Add __init/__exit annotations to module init/exit funcs Add missing __init/__exit annotations to module init/exit funcs. 
Signed-off-by: Xiu Jianfeng Message-Id: <20220922111924.36044-1-xiujianfeng@huawei.com> Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_ssif.c | 4 ++-- drivers/char/ipmi/kcs_bmc_cdev_ipmi.c | 4 ++-- drivers/char/ipmi/kcs_bmc_serio.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c index fc742ee9c0468..00e9439db0a4e 100644 --- a/drivers/char/ipmi/ipmi_ssif.c +++ b/drivers/char/ipmi/ipmi_ssif.c @@ -2100,7 +2100,7 @@ static struct platform_driver ipmi_driver = { .id_table = ssif_plat_ids }; -static int init_ipmi_ssif(void) +static int __init init_ipmi_ssif(void) { int i; int rv; @@ -2142,7 +2142,7 @@ static int init_ipmi_ssif(void) } module_init(init_ipmi_ssif); -static void cleanup_ipmi_ssif(void) +static void __exit cleanup_ipmi_ssif(void) { if (!initialized) return; diff --git a/drivers/char/ipmi/kcs_bmc_cdev_ipmi.c b/drivers/char/ipmi/kcs_bmc_cdev_ipmi.c index 486834a962c3d..cf670e891966d 100644 --- a/drivers/char/ipmi/kcs_bmc_cdev_ipmi.c +++ b/drivers/char/ipmi/kcs_bmc_cdev_ipmi.c @@ -548,7 +548,7 @@ static struct kcs_bmc_driver kcs_bmc_ipmi_driver = { .ops = &kcs_bmc_ipmi_driver_ops, }; -static int kcs_bmc_ipmi_init(void) +static int __init kcs_bmc_ipmi_init(void) { kcs_bmc_register_driver(&kcs_bmc_ipmi_driver); @@ -556,7 +556,7 @@ static int kcs_bmc_ipmi_init(void) } module_init(kcs_bmc_ipmi_init); -static void kcs_bmc_ipmi_exit(void) +static void __exit kcs_bmc_ipmi_exit(void) { kcs_bmc_unregister_driver(&kcs_bmc_ipmi_driver); } diff --git a/drivers/char/ipmi/kcs_bmc_serio.c b/drivers/char/ipmi/kcs_bmc_serio.c index 7e2067628a6ce..1793358be7822 100644 --- a/drivers/char/ipmi/kcs_bmc_serio.c +++ b/drivers/char/ipmi/kcs_bmc_serio.c @@ -140,7 +140,7 @@ static struct kcs_bmc_driver kcs_bmc_serio_driver = { .ops = &kcs_bmc_serio_driver_ops, }; -static int kcs_bmc_serio_init(void) +static int __init kcs_bmc_serio_init(void) { kcs_bmc_register_driver(&kcs_bmc_serio_driver); @@ 
-148,7 +148,7 @@ static int kcs_bmc_serio_init(void) } module_init(kcs_bmc_serio_init); -static void kcs_bmc_serio_exit(void) +static void __exit kcs_bmc_serio_exit(void) { kcs_bmc_unregister_driver(&kcs_bmc_serio_driver); } -- GitLab From 7ab72c597356be1e7f0f3d856e54ce78527f43c8 Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Thu, 15 Sep 2022 15:37:01 -0400 Subject: [PATCH 0431/2223] riscv: Make VM_WRITE imply VM_READ RISC-V does not presently have write-only mappings as that PTE bit pattern is considered reserved in the privileged spec, so allow handling of read faults in VMAs that have VM_WRITE without VM_READ in order to be consistent with other architectures that have similar limitations. Fixes: 2139619bcad7 ("riscv: mmap with PROT_WRITE but no PROT_READ is invalid") Reviewed-by: Atish Patra Signed-off-by: Andrew Bresticker Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220915193702.2201018-2-abrestic@rivosinc.com/ Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index f2fbd1400b7c9..d86f7cebd4a7e 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -184,7 +184,8 @@ static inline bool access_error(unsigned long cause, struct vm_area_struct *vma) } break; case EXC_LOAD_PAGE_FAULT: - if (!(vma->vm_flags & VM_READ)) { + /* Write implies read */ + if (!(vma->vm_flags & (VM_READ | VM_WRITE))) { return true; } break; -- GitLab From 9e2e6042a7ec6504fe8e366717afa2f40cf16488 Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Thu, 15 Sep 2022 15:37:02 -0400 Subject: [PATCH 0432/2223] riscv: Allow PROT_WRITE-only mmap() Commit 2139619bcad7 ("riscv: mmap with PROT_WRITE but no PROT_READ is invalid") made mmap() return EINVAL if PROT_WRITE was set without PROT_READ with the justification that a write-only PTE is considered a reserved PTE permission bit pattern in the privileged spec.
This check is unnecessary since we let VM_WRITE imply VM_READ on RISC-V, and it is inconsistent with other architectures that don't support write-only PTEs, creating a potential software portability issue. Just remove the check altogether and let PROT_WRITE imply PROT_READ as is the case on other architectures. Note that this also allows PROT_WRITE|PROT_EXEC mappings which were disallowed prior to the aforementioned commit; PROT_READ is implied in such mappings as well. Fixes: 2139619bcad7 ("riscv: mmap with PROT_WRITE but no PROT_READ is invalid") Reviewed-by: Atish Patra Signed-off-by: Andrew Bresticker Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220915193702.2201018-3-abrestic@rivosinc.com/ Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/sys_riscv.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c index 571556bb9261a..5d3f2fbeb33c7 100644 --- a/arch/riscv/kernel/sys_riscv.c +++ b/arch/riscv/kernel/sys_riscv.c @@ -18,9 +18,6 @@ static long riscv_sys_mmap(unsigned long addr, unsigned long len, if (unlikely(offset & (~PAGE_MASK >> page_shift_offset))) return -EINVAL; - if (unlikely((prot & PROT_WRITE) && !(prot & PROT_READ))) - return -EINVAL; - return ksys_mmap_pgoff(addr, len, prot, flags, fd, offset >> (PAGE_SHIFT - page_shift_offset)); } -- GitLab From e3bb4de0a0380910180e758a30ccfda65f8e286e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 16:20:19 -0300 Subject: [PATCH 0433/2223] vfio: Add header guards and includes to drivers/vfio/vfio.h As is normal for headers. 
Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v3-297af71838d2+b9-vfio_container_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 503bea6c843d5..093784f1dea7a 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -3,6 +3,14 @@ * Copyright (C) 2012 Red Hat, Inc. All rights reserved. * Author: Alex Williamson */ +#ifndef __VFIO_VFIO_H__ +#define __VFIO_VFIO_H__ + +#include +#include +#include + +struct iommu_group; enum vfio_group_type { /* @@ -69,3 +77,5 @@ struct vfio_iommu_driver_ops { int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops); + +#endif -- GitLab From 429a781c8e01c24ebb2b9da0a63a14e6fd9e0837 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 16:20:20 -0300 Subject: [PATCH 0434/2223] vfio: Rename __vfio_group_unset_container() To vfio_group_detach_container(). This function is really a container function. Fold the WARN_ON() into it as a precondition assertion. A following patch will move the vfio_container functions to their own .c file. 
Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v3-297af71838d2+b9-vfio_container_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index f9d10dbcf3e61..3d8813125358a 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1036,12 +1036,13 @@ static const struct file_operations vfio_fops = { /* * VFIO Group fd, /dev/vfio/$GROUP */ -static void __vfio_group_unset_container(struct vfio_group *group) +static void vfio_group_detach_container(struct vfio_group *group) { struct vfio_container *container = group->container; struct vfio_iommu_driver *driver; lockdep_assert_held_write(&group->group_rwsem); + WARN_ON(group->container_users != 1); down_write(&container->group_lock); @@ -1089,7 +1090,7 @@ static int vfio_group_ioctl_unset_container(struct vfio_group *group) ret = -EBUSY; goto out_unlock; } - __vfio_group_unset_container(group); + vfio_group_detach_container(group); out_unlock: up_write(&group->group_rwsem); @@ -1441,10 +1442,8 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) * is only called when there are no open devices. */ WARN_ON(group->notifier.head); - if (group->container) { - WARN_ON(group->container_users != 1); - __vfio_group_unset_container(group); - } + if (group->container) + vfio_group_detach_container(group); group->opened_file = NULL; up_write(&group->group_rwsem); -- GitLab From 03e650f6611563c0ccbd0d769d5748fd10d8ee8e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 16:20:21 -0300 Subject: [PATCH 0435/2223] vfio: Split the container logic into vfio_container_attach_group() This splits up the ioctl of vfio_group_ioctl_set_container() so it determines the type of file then invokes a type specific attachment function. 
Future patches will add iommufd to this function as an alternative type. A following patch will move the vfio_container functions to their own .c file. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v3-297af71838d2+b9-vfio_container_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 78 ++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 3d8813125358a..879c5d27c7127 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1097,40 +1097,29 @@ out_unlock: return ret; } -static int vfio_group_ioctl_set_container(struct vfio_group *group, - int __user *arg) +static struct vfio_container *vfio_container_from_file(struct file *file) { - struct fd f; struct vfio_container *container; - struct vfio_iommu_driver *driver; - int container_fd; - int ret = 0; - - if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - if (get_user(container_fd, arg)) - return -EFAULT; - if (container_fd < 0) - return -EINVAL; - f = fdget(container_fd); - if (!f.file) - return -EBADF; /* Sanity check, is this really our fd? 
*/ - if (f.file->f_op != &vfio_fops) { - ret = -EINVAL; - goto out_fdput; - } - container = f.file->private_data; + if (file->f_op != &vfio_fops) + return NULL; + + container = file->private_data; WARN_ON(!container); /* fget ensures we don't race vfio_release */ + return container; +} - down_write(&group->group_rwsem); +static int vfio_container_attach_group(struct vfio_container *container, + struct vfio_group *group) +{ + struct vfio_iommu_driver *driver; + int ret = 0; - if (group->container || WARN_ON(group->container_users)) { - ret = -EINVAL; - goto out_unlock_group; - } + lockdep_assert_held_write(&group->group_rwsem); + + if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) + return -EPERM; down_write(&container->group_lock); @@ -1142,7 +1131,7 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, } if (group->type == VFIO_IOMMU) { - ret = iommu_group_claim_dma_owner(group->iommu_group, f.file); + ret = iommu_group_claim_dma_owner(group->iommu_group, group); if (ret) goto out_unlock_container; } @@ -1170,9 +1159,38 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, out_unlock_container: up_write(&container->group_lock); -out_unlock_group: + return ret; +} + +static int vfio_group_ioctl_set_container(struct vfio_group *group, + int __user *arg) +{ + struct vfio_container *container; + struct fd f; + int ret; + int fd; + + if (get_user(fd, arg)) + return -EFAULT; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + down_write(&group->group_rwsem); + if (group->container || WARN_ON(group->container_users)) { + ret = -EINVAL; + goto out_unlock; + } + container = vfio_container_from_file(f.file); + ret = -EINVAL; + if (container) { + ret = vfio_container_attach_group(container, group); + goto out_unlock; + } + +out_unlock: up_write(&group->group_rwsem); -out_fdput: fdput(f); return ret; } -- GitLab From 444d43ecd01033a758e73b8ae154ee7f3e827f7b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 
16:20:22 -0300 Subject: [PATCH 0436/2223] vfio: Remove #ifdefs around CONFIG_VFIO_NOIOMMU This can all be accomplished using typical IS_ENABLED techniques, drop it all. Also rename the variable to vfio_noiommu so this can be made global in following patches. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v3-297af71838d2+b9-vfio_container_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 43 ++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 879c5d27c7127..f79e7eb02931b 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -86,10 +86,12 @@ struct vfio_group { }; #ifdef CONFIG_VFIO_NOIOMMU -static bool noiommu __read_mostly; +static bool vfio_noiommu __read_mostly; module_param_named(enable_unsafe_noiommu_mode, - noiommu, bool, S_IRUGO | S_IWUSR); + vfio_noiommu, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)"); +#else +enum { vfio_noiommu = false }; #endif static DEFINE_XARRAY(vfio_device_set_xa); @@ -166,7 +168,6 @@ static void vfio_release_device_set(struct vfio_device *device) xa_unlock(&vfio_device_set_xa); } -#ifdef CONFIG_VFIO_NOIOMMU static void *vfio_noiommu_open(unsigned long arg) { if (arg != VFIO_NOIOMMU_IOMMU) @@ -185,7 +186,7 @@ static long vfio_noiommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { if (cmd == VFIO_CHECK_EXTENSION) - return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0; + return vfio_noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 
1 : 0; return -ENOTTY; } @@ -215,18 +216,13 @@ static const struct vfio_iommu_driver_ops vfio_noiommu_ops = { * Only noiommu containers can use vfio-noiommu and noiommu containers can only * use vfio-noiommu. */ -static inline bool vfio_iommu_driver_allowed(struct vfio_container *container, - const struct vfio_iommu_driver *driver) +static bool vfio_iommu_driver_allowed(struct vfio_container *container, + const struct vfio_iommu_driver *driver) { + if (!IS_ENABLED(CONFIG_VFIO_NOIOMMU)) + return true; return container->noiommu == (driver->ops == &vfio_noiommu_ops); } -#else -static inline bool vfio_iommu_driver_allowed(struct vfio_container *container, - const struct vfio_iommu_driver *driver) -{ - return true; -} -#endif /* CONFIG_VFIO_NOIOMMU */ /* * IOMMU driver registration @@ -630,8 +626,7 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) struct vfio_group *group; iommu_group = iommu_group_get(dev); -#ifdef CONFIG_VFIO_NOIOMMU - if (!iommu_group && noiommu) { + if (!iommu_group && vfio_noiommu) { /* * With noiommu enabled, create an IOMMU group for devices that * don't already have one, implying no IOMMU hardware/driver @@ -645,7 +640,7 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) } return group; } -#endif + if (!iommu_group) return ERR_PTR(-EINVAL); @@ -2439,11 +2434,11 @@ static int __init vfio_init(void) if (ret) goto err_alloc_chrdev; -#ifdef CONFIG_VFIO_NOIOMMU - ret = vfio_register_iommu_driver(&vfio_noiommu_ops); -#endif - if (ret) - goto err_driver_register; + if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) { + ret = vfio_register_iommu_driver(&vfio_noiommu_ops); + if (ret) + goto err_driver_register; + } pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); return 0; @@ -2465,9 +2460,9 @@ static void __exit vfio_cleanup(void) { WARN_ON(!list_empty(&vfio.group_list)); -#ifdef CONFIG_VFIO_NOIOMMU - vfio_unregister_iommu_driver(&vfio_noiommu_ops); -#endif + if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) + 
vfio_unregister_iommu_driver(&vfio_noiommu_ops); + ida_destroy(&vfio.device_ida); ida_destroy(&vfio.group_ida); unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); -- GitLab From c41da4622e08f874ab02e12eb6b6aaa9ac21daa7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 16:20:23 -0300 Subject: [PATCH 0437/2223] vfio: Split out container code from the init/cleanup functions This miscdev, noiommu driver and a couple of globals are all container items. Move this init into its own functions. A following patch will move the vfio_container functions to their own .c file. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v3-297af71838d2+b9-vfio_container_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 54 ++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index f79e7eb02931b..3cb52e9ab035a 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -2397,15 +2397,11 @@ static struct miscdevice vfio_dev = { .mode = S_IRUGO | S_IWUGO, }; -static int __init vfio_init(void) +static int __init vfio_container_init(void) { int ret; - ida_init(&vfio.group_ida); - ida_init(&vfio.device_ida); - mutex_init(&vfio.group_lock); mutex_init(&vfio.iommu_drivers_lock); - INIT_LIST_HEAD(&vfio.group_list); INIT_LIST_HEAD(&vfio.iommu_drivers_list); ret = misc_register(&vfio_dev); @@ -2414,6 +2410,39 @@ static int __init vfio_init(void) return ret; } + if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) { + ret = vfio_register_iommu_driver(&vfio_noiommu_ops); + if (ret) + goto err_misc; + } + return 0; + +err_misc: + misc_deregister(&vfio_dev); + return ret; +} + +static void vfio_container_cleanup(void) +{ + if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) + vfio_unregister_iommu_driver(&vfio_noiommu_ops); + misc_deregister(&vfio_dev); + mutex_destroy(&vfio.iommu_drivers_lock); +} + +static int __init 
vfio_init(void) +{ + int ret; + + ida_init(&vfio.group_ida); + ida_init(&vfio.device_ida); + mutex_init(&vfio.group_lock); + INIT_LIST_HEAD(&vfio.group_list); + + ret = vfio_container_init(); + if (ret) + return ret; + /* /dev/vfio/$GROUP */ vfio.class = class_create(THIS_MODULE, "vfio"); if (IS_ERR(vfio.class)) { @@ -2434,17 +2463,9 @@ static int __init vfio_init(void) if (ret) goto err_alloc_chrdev; - if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) { - ret = vfio_register_iommu_driver(&vfio_noiommu_ops); - if (ret) - goto err_driver_register; - } - pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); return 0; -err_driver_register: - unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); err_alloc_chrdev: class_destroy(vfio.device_class); vfio.device_class = NULL; @@ -2452,7 +2473,7 @@ err_dev_class: class_destroy(vfio.class); vfio.class = NULL; err_group_class: - misc_deregister(&vfio_dev); + vfio_container_cleanup(); return ret; } @@ -2460,17 +2481,14 @@ static void __exit vfio_cleanup(void) { WARN_ON(!list_empty(&vfio.group_list)); - if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) - vfio_unregister_iommu_driver(&vfio_noiommu_ops); - ida_destroy(&vfio.device_ida); ida_destroy(&vfio.group_ida); unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); class_destroy(vfio.device_class); vfio.device_class = NULL; class_destroy(vfio.class); + vfio_container_cleanup(); vfio.class = NULL; - misc_deregister(&vfio_dev); xa_destroy(&vfio_device_set_xa); } -- GitLab From 1408640d578887d7860737221043d91fc6d5a723 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 16:20:24 -0300 Subject: [PATCH 0438/2223] vfio: Rename vfio_ioctl_check_extension() To vfio_container_ioctl_check_extension(). A following patch will turn this into a non-static function, make it clear it is related to the container. 
Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v3-297af71838d2+b9-vfio_container_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 3cb52e9ab035a..33e55e40c4169 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -813,8 +813,9 @@ EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); /* * VFIO base fd, /dev/vfio/vfio */ -static long vfio_ioctl_check_extension(struct vfio_container *container, - unsigned long arg) +static long +vfio_container_ioctl_check_extension(struct vfio_container *container, + unsigned long arg) { struct vfio_iommu_driver *driver; long ret = 0; @@ -971,7 +972,7 @@ static long vfio_fops_unl_ioctl(struct file *filep, ret = VFIO_API_VERSION; break; case VFIO_CHECK_EXTENSION: - ret = vfio_ioctl_check_extension(container, arg); + ret = vfio_container_ioctl_check_extension(container, arg); break; case VFIO_SET_IOMMU: ret = vfio_ioctl_set_iommu(container, arg); @@ -2100,8 +2101,8 @@ bool vfio_file_enforced_coherent(struct file *file) down_read(&group->group_rwsem); if (group->container) { - ret = vfio_ioctl_check_extension(group->container, - VFIO_DMA_CC_IOMMU); + ret = vfio_container_ioctl_check_extension(group->container, + VFIO_DMA_CC_IOMMU); } else { /* * Since the coherency state is determined only once a container -- GitLab From 9446162e740aefff95c324ac0887f0b68c739695 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 16:20:25 -0300 Subject: [PATCH 0439/2223] vfio: Split the register_device ops call into functions This is a container item. A following patch will move the vfio_container functions to their own .c file. 
Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v3-297af71838d2+b9-vfio_container_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 33e55e40c4169..1ac7160f9329c 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1226,9 +1226,28 @@ static void vfio_device_unassign_container(struct vfio_device *device) up_write(&device->group->group_rwsem); } +static void vfio_device_container_register(struct vfio_device *device) +{ + struct vfio_iommu_driver *iommu_driver = + device->group->container->iommu_driver; + + if (iommu_driver && iommu_driver->ops->register_device) + iommu_driver->ops->register_device( + device->group->container->iommu_data, device); +} + +static void vfio_device_container_unregister(struct vfio_device *device) +{ + struct vfio_iommu_driver *iommu_driver = + device->group->container->iommu_driver; + + if (iommu_driver && iommu_driver->ops->unregister_device) + iommu_driver->ops->unregister_device( + device->group->container->iommu_data, device); +} + static struct file *vfio_device_open(struct vfio_device *device) { - struct vfio_iommu_driver *iommu_driver; struct file *filep; int ret; @@ -1259,12 +1278,7 @@ static struct file *vfio_device_open(struct vfio_device *device) if (ret) goto err_undo_count; } - - iommu_driver = device->group->container->iommu_driver; - if (iommu_driver && iommu_driver->ops->register_device) - iommu_driver->ops->register_device( - device->group->container->iommu_data, device); - + vfio_device_container_register(device); up_read(&device->group->group_rwsem); } mutex_unlock(&device->dev_set->lock); @@ -1302,10 +1316,7 @@ err_close_device: if (device->open_count == 1 && device->ops->close_device) { device->ops->close_device(device); - iommu_driver = 
device->group->container->iommu_driver; - if (iommu_driver && iommu_driver->ops->unregister_device) - iommu_driver->ops->unregister_device( - device->group->container->iommu_data, device); + vfio_device_container_unregister(device); } err_undo_count: up_read(&device->group->group_rwsem); @@ -1513,7 +1524,6 @@ static inline void vfio_device_pm_runtime_put(struct vfio_device *device) static int vfio_device_fops_release(struct inode *inode, struct file *filep) { struct vfio_device *device = filep->private_data; - struct vfio_iommu_driver *iommu_driver; mutex_lock(&device->dev_set->lock); vfio_assert_device_open(device); @@ -1521,10 +1531,7 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) if (device->open_count == 1 && device->ops->close_device) device->ops->close_device(device); - iommu_driver = device->group->container->iommu_driver; - if (iommu_driver && iommu_driver->ops->unregister_device) - iommu_driver->ops->unregister_device( - device->group->container->iommu_data, device); + vfio_device_container_unregister(device); up_read(&device->group->group_rwsem); device->open_count--; if (device->open_count == 0) -- GitLab From cdc71fe4ecbf48f7292ae8b7e4ff4a2a8b5bdbca Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 16:20:26 -0300 Subject: [PATCH 0440/2223] vfio: Move container code into drivers/vfio/container.c All the functions that dereference struct vfio_container are moved into container.c. Simple code motion, no functional change. 
Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v3-297af71838d2+b9-vfio_container_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/Makefile | 1 + drivers/vfio/container.c | 680 ++++++++++++++++++++++++++++++++++++++ drivers/vfio/vfio.h | 46 +++ drivers/vfio/vfio_main.c | 692 +-------------------------------------- 4 files changed, 728 insertions(+), 691 deletions(-) create mode 100644 drivers/vfio/container.c diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index d67c604d0407e..b693a1169286f 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_VFIO) += vfio.o vfio-y += vfio_main.o \ iova_bitmap.o \ + container.o obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c new file mode 100644 index 0000000000000..db7c071ee3de1 --- /dev/null +++ b/drivers/vfio/container.c @@ -0,0 +1,680 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * + * VFIO container (/dev/vfio/vfio) + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vfio.h" + +struct vfio_container { + struct kref kref; + struct list_head group_list; + struct rw_semaphore group_lock; + struct vfio_iommu_driver *iommu_driver; + void *iommu_data; + bool noiommu; +}; + +static struct vfio { + struct list_head iommu_drivers_list; + struct mutex iommu_drivers_lock; +} vfio; + +#ifdef CONFIG_VFIO_NOIOMMU +bool vfio_noiommu __read_mostly; +module_param_named(enable_unsafe_noiommu_mode, + vfio_noiommu, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. 
If you do not know what this is for, step away. (default: false)"); +#endif + +static void *vfio_noiommu_open(unsigned long arg) +{ + if (arg != VFIO_NOIOMMU_IOMMU) + return ERR_PTR(-EINVAL); + if (!capable(CAP_SYS_RAWIO)) + return ERR_PTR(-EPERM); + + return NULL; +} + +static void vfio_noiommu_release(void *iommu_data) +{ +} + +static long vfio_noiommu_ioctl(void *iommu_data, + unsigned int cmd, unsigned long arg) +{ + if (cmd == VFIO_CHECK_EXTENSION) + return vfio_noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0; + + return -ENOTTY; +} + +static int vfio_noiommu_attach_group(void *iommu_data, + struct iommu_group *iommu_group, enum vfio_group_type type) +{ + return 0; +} + +static void vfio_noiommu_detach_group(void *iommu_data, + struct iommu_group *iommu_group) +{ +} + +static const struct vfio_iommu_driver_ops vfio_noiommu_ops = { + .name = "vfio-noiommu", + .owner = THIS_MODULE, + .open = vfio_noiommu_open, + .release = vfio_noiommu_release, + .ioctl = vfio_noiommu_ioctl, + .attach_group = vfio_noiommu_attach_group, + .detach_group = vfio_noiommu_detach_group, +}; + +/* + * Only noiommu containers can use vfio-noiommu and noiommu containers can only + * use vfio-noiommu. 
+ */ +static bool vfio_iommu_driver_allowed(struct vfio_container *container, + const struct vfio_iommu_driver *driver) +{ + if (!IS_ENABLED(CONFIG_VFIO_NOIOMMU)) + return true; + return container->noiommu == (driver->ops == &vfio_noiommu_ops); +} + +/* + * IOMMU driver registration + */ +int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops) +{ + struct vfio_iommu_driver *driver, *tmp; + + if (WARN_ON(!ops->register_device != !ops->unregister_device)) + return -EINVAL; + + driver = kzalloc(sizeof(*driver), GFP_KERNEL); + if (!driver) + return -ENOMEM; + + driver->ops = ops; + + mutex_lock(&vfio.iommu_drivers_lock); + + /* Check for duplicates */ + list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) { + if (tmp->ops == ops) { + mutex_unlock(&vfio.iommu_drivers_lock); + kfree(driver); + return -EINVAL; + } + } + + list_add(&driver->vfio_next, &vfio.iommu_drivers_list); + + mutex_unlock(&vfio.iommu_drivers_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_register_iommu_driver); + +void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops) +{ + struct vfio_iommu_driver *driver; + + mutex_lock(&vfio.iommu_drivers_lock); + list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { + if (driver->ops == ops) { + list_del(&driver->vfio_next); + mutex_unlock(&vfio.iommu_drivers_lock); + kfree(driver); + return; + } + } + mutex_unlock(&vfio.iommu_drivers_lock); +} +EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver); + +/* + * Container objects - containers are created when /dev/vfio/vfio is + * opened, but their lifecycle extends until the last user is done, so + * it's freed via kref. Must support container/group/device being + * closed in any order. 
+ */ +static void vfio_container_release(struct kref *kref) +{ + struct vfio_container *container; + container = container_of(kref, struct vfio_container, kref); + + kfree(container); +} + +static void vfio_container_get(struct vfio_container *container) +{ + kref_get(&container->kref); +} + +static void vfio_container_put(struct vfio_container *container) +{ + kref_put(&container->kref, vfio_container_release); +} + +void vfio_device_container_register(struct vfio_device *device) +{ + struct vfio_iommu_driver *iommu_driver = + device->group->container->iommu_driver; + + if (iommu_driver && iommu_driver->ops->register_device) + iommu_driver->ops->register_device( + device->group->container->iommu_data, device); +} + +void vfio_device_container_unregister(struct vfio_device *device) +{ + struct vfio_iommu_driver *iommu_driver = + device->group->container->iommu_driver; + + if (iommu_driver && iommu_driver->ops->unregister_device) + iommu_driver->ops->unregister_device( + device->group->container->iommu_data, device); +} + +long vfio_container_ioctl_check_extension(struct vfio_container *container, + unsigned long arg) +{ + struct vfio_iommu_driver *driver; + long ret = 0; + + down_read(&container->group_lock); + + driver = container->iommu_driver; + + switch (arg) { + /* No base extensions yet */ + default: + /* + * If no driver is set, poll all registered drivers for + * extensions and return the first positive result. If + * a driver is already set, further queries will be passed + * only to that driver. 
+ */ + if (!driver) { + mutex_lock(&vfio.iommu_drivers_lock); + list_for_each_entry(driver, &vfio.iommu_drivers_list, + vfio_next) { + + if (!list_empty(&container->group_list) && + !vfio_iommu_driver_allowed(container, + driver)) + continue; + if (!try_module_get(driver->ops->owner)) + continue; + + ret = driver->ops->ioctl(NULL, + VFIO_CHECK_EXTENSION, + arg); + module_put(driver->ops->owner); + if (ret > 0) + break; + } + mutex_unlock(&vfio.iommu_drivers_lock); + } else + ret = driver->ops->ioctl(container->iommu_data, + VFIO_CHECK_EXTENSION, arg); + } + + up_read(&container->group_lock); + + return ret; +} + +/* hold write lock on container->group_lock */ +static int __vfio_container_attach_groups(struct vfio_container *container, + struct vfio_iommu_driver *driver, + void *data) +{ + struct vfio_group *group; + int ret = -ENODEV; + + list_for_each_entry(group, &container->group_list, container_next) { + ret = driver->ops->attach_group(data, group->iommu_group, + group->type); + if (ret) + goto unwind; + } + + return ret; + +unwind: + list_for_each_entry_continue_reverse(group, &container->group_list, + container_next) { + driver->ops->detach_group(data, group->iommu_group); + } + + return ret; +} + +static long vfio_ioctl_set_iommu(struct vfio_container *container, + unsigned long arg) +{ + struct vfio_iommu_driver *driver; + long ret = -ENODEV; + + down_write(&container->group_lock); + + /* + * The container is designed to be an unprivileged interface while + * the group can be assigned to specific users. Therefore, only by + * adding a group to a container does the user get the privilege of + * enabling the iommu, which may allocate finite resources. There + * is no unset_iommu, but by removing all the groups from a container, + * the container is deprivileged and returns to an unset state. 
+ */ + if (list_empty(&container->group_list) || container->iommu_driver) { + up_write(&container->group_lock); + return -EINVAL; + } + + mutex_lock(&vfio.iommu_drivers_lock); + list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { + void *data; + + if (!vfio_iommu_driver_allowed(container, driver)) + continue; + if (!try_module_get(driver->ops->owner)) + continue; + + /* + * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION, + * so test which iommu driver reported support for this + * extension and call open on them. We also pass them the + * magic, allowing a single driver to support multiple + * interfaces if they'd like. + */ + if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) { + module_put(driver->ops->owner); + continue; + } + + data = driver->ops->open(arg); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + module_put(driver->ops->owner); + continue; + } + + ret = __vfio_container_attach_groups(container, driver, data); + if (ret) { + driver->ops->release(data); + module_put(driver->ops->owner); + continue; + } + + container->iommu_driver = driver; + container->iommu_data = data; + break; + } + + mutex_unlock(&vfio.iommu_drivers_lock); + up_write(&container->group_lock); + + return ret; +} + +static long vfio_fops_unl_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + struct vfio_container *container = filep->private_data; + struct vfio_iommu_driver *driver; + void *data; + long ret = -EINVAL; + + if (!container) + return ret; + + switch (cmd) { + case VFIO_GET_API_VERSION: + ret = VFIO_API_VERSION; + break; + case VFIO_CHECK_EXTENSION: + ret = vfio_container_ioctl_check_extension(container, arg); + break; + case VFIO_SET_IOMMU: + ret = vfio_ioctl_set_iommu(container, arg); + break; + default: + driver = container->iommu_driver; + data = container->iommu_data; + + if (driver) /* passthrough all unrecognized ioctls */ + ret = driver->ops->ioctl(data, cmd, arg); + } + + return ret; +} + +static int 
vfio_fops_open(struct inode *inode, struct file *filep) +{ + struct vfio_container *container; + + container = kzalloc(sizeof(*container), GFP_KERNEL); + if (!container) + return -ENOMEM; + + INIT_LIST_HEAD(&container->group_list); + init_rwsem(&container->group_lock); + kref_init(&container->kref); + + filep->private_data = container; + + return 0; +} + +static int vfio_fops_release(struct inode *inode, struct file *filep) +{ + struct vfio_container *container = filep->private_data; + struct vfio_iommu_driver *driver = container->iommu_driver; + + if (driver && driver->ops->notify) + driver->ops->notify(container->iommu_data, + VFIO_IOMMU_CONTAINER_CLOSE); + + filep->private_data = NULL; + + vfio_container_put(container); + + return 0; +} + +static const struct file_operations vfio_fops = { + .owner = THIS_MODULE, + .open = vfio_fops_open, + .release = vfio_fops_release, + .unlocked_ioctl = vfio_fops_unl_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +struct vfio_container *vfio_container_from_file(struct file *file) +{ + struct vfio_container *container; + + /* Sanity check, is this really our fd? 
*/ + if (file->f_op != &vfio_fops) + return NULL; + + container = file->private_data; + WARN_ON(!container); /* fget ensures we don't race vfio_release */ + return container; +} + +static struct miscdevice vfio_dev = { + .minor = VFIO_MINOR, + .name = "vfio", + .fops = &vfio_fops, + .nodename = "vfio/vfio", + .mode = S_IRUGO | S_IWUGO, +}; + +int vfio_container_attach_group(struct vfio_container *container, + struct vfio_group *group) +{ + struct vfio_iommu_driver *driver; + int ret = 0; + + lockdep_assert_held_write(&group->group_rwsem); + + if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + down_write(&container->group_lock); + + /* Real groups and fake groups cannot mix */ + if (!list_empty(&container->group_list) && + container->noiommu != (group->type == VFIO_NO_IOMMU)) { + ret = -EPERM; + goto out_unlock_container; + } + + if (group->type == VFIO_IOMMU) { + ret = iommu_group_claim_dma_owner(group->iommu_group, group); + if (ret) + goto out_unlock_container; + } + + driver = container->iommu_driver; + if (driver) { + ret = driver->ops->attach_group(container->iommu_data, + group->iommu_group, + group->type); + if (ret) { + if (group->type == VFIO_IOMMU) + iommu_group_release_dma_owner( + group->iommu_group); + goto out_unlock_container; + } + } + + group->container = container; + group->container_users = 1; + container->noiommu = (group->type == VFIO_NO_IOMMU); + list_add(&group->container_next, &container->group_list); + + /* Get a reference on the container and mark a user within the group */ + vfio_container_get(container); + +out_unlock_container: + up_write(&container->group_lock); + return ret; +} + +void vfio_group_detach_container(struct vfio_group *group) +{ + struct vfio_container *container = group->container; + struct vfio_iommu_driver *driver; + + lockdep_assert_held_write(&group->group_rwsem); + WARN_ON(group->container_users != 1); + + down_write(&container->group_lock); + + driver = container->iommu_driver; + if 
(driver) + driver->ops->detach_group(container->iommu_data, + group->iommu_group); + + if (group->type == VFIO_IOMMU) + iommu_group_release_dma_owner(group->iommu_group); + + group->container = NULL; + group->container_users = 0; + list_del(&group->container_next); + + /* Detaching the last group deprivileges a container, remove iommu */ + if (driver && list_empty(&container->group_list)) { + driver->ops->release(container->iommu_data); + module_put(driver->ops->owner); + container->iommu_driver = NULL; + container->iommu_data = NULL; + } + + up_write(&container->group_lock); + + vfio_container_put(container); +} + +int vfio_device_assign_container(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + + lockdep_assert_held_write(&group->group_rwsem); + + if (!group->container || !group->container->iommu_driver || + WARN_ON(!group->container_users)) + return -EINVAL; + + if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + get_file(group->opened_file); + group->container_users++; + return 0; +} + +void vfio_device_unassign_container(struct vfio_device *device) +{ + down_write(&device->group->group_rwsem); + WARN_ON(device->group->container_users <= 1); + device->group->container_users--; + fput(device->group->opened_file); + up_write(&device->group->group_rwsem); +} + +/* + * Pin contiguous user pages and return their associated host pages for local + * domain only. + * @device [in] : device + * @iova [in] : starting IOVA of user pages to be pinned. + * @npage [in] : count of pages to be pinned. This count should not + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. + * @prot [in] : protection flags + * @pages[out] : array of host pages + * Return error or number of pages pinned. + * + * A driver may only call this function if the vfio_device was created + * by vfio_register_emulated_iommu_dev(). 
+ */ +int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, + int npage, int prot, struct page **pages) +{ + struct vfio_container *container; + struct vfio_group *group = device->group; + struct vfio_iommu_driver *driver; + int ret; + + if (!pages || !npage || !vfio_assert_device_open(device)) + return -EINVAL; + + if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) + return -E2BIG; + + /* group->container cannot change while a vfio device is open */ + container = group->container; + driver = container->iommu_driver; + if (likely(driver && driver->ops->pin_pages)) + ret = driver->ops->pin_pages(container->iommu_data, + group->iommu_group, iova, + npage, prot, pages); + else + ret = -ENOTTY; + + return ret; +} +EXPORT_SYMBOL(vfio_pin_pages); + +/* + * Unpin contiguous host pages for local domain only. + * @device [in] : device + * @iova [in] : starting address of user pages to be unpinned. + * @npage [in] : count of pages to be unpinned. This count should not + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. + */ +void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + + if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES)) + return; + + if (WARN_ON(!vfio_assert_device_open(device))) + return; + + /* group->container cannot change while a vfio device is open */ + container = device->group->container; + driver = container->iommu_driver; + + driver->ops->unpin_pages(container->iommu_data, iova, npage); +} +EXPORT_SYMBOL(vfio_unpin_pages); + +/* + * This interface allows the CPUs to perform some sort of virtual DMA on + * behalf of the device. + * + * CPUs read/write from/into a range of IOVAs pointing to user space memory + * into/from a kernel buffer. + * + * As the read/write of user space memory is conducted via the CPUs and is + * not a real device DMA, it is not necessary to pin the user space memory. 
+ * + * @device [in] : VFIO device + * @iova [in] : base IOVA of a user space buffer + * @data [in] : pointer to kernel buffer + * @len [in] : kernel buffer length + * @write : indicate read or write + * Return error code on failure or 0 on success. + */ +int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, + size_t len, bool write) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + int ret = 0; + + if (!data || len <= 0 || !vfio_assert_device_open(device)) + return -EINVAL; + + /* group->container cannot change while a vfio device is open */ + container = device->group->container; + driver = container->iommu_driver; + + if (likely(driver && driver->ops->dma_rw)) + ret = driver->ops->dma_rw(container->iommu_data, + iova, data, len, write); + else + ret = -ENOTTY; + return ret; +} +EXPORT_SYMBOL(vfio_dma_rw); + +int __init vfio_container_init(void) +{ + int ret; + + mutex_init(&vfio.iommu_drivers_lock); + INIT_LIST_HEAD(&vfio.iommu_drivers_list); + + ret = misc_register(&vfio_dev); + if (ret) { + pr_err("vfio: misc device register failed\n"); + return ret; + } + + if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) { + ret = vfio_register_iommu_driver(&vfio_noiommu_ops); + if (ret) + goto err_misc; + } + return 0; + +err_misc: + misc_deregister(&vfio_dev); + return ret; +} + +void vfio_container_cleanup(void) +{ + if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) + vfio_unregister_iommu_driver(&vfio_noiommu_ops); + misc_deregister(&vfio_dev); + mutex_destroy(&vfio.iommu_drivers_lock); +} diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 093784f1dea7a..56fab31f8e0ff 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -11,6 +11,8 @@ #include struct iommu_group; +struct vfio_device; +struct vfio_container; enum vfio_group_type { /* @@ -36,6 +38,24 @@ enum vfio_group_type { VFIO_NO_IOMMU, }; +struct vfio_group { + struct device dev; + struct cdev cdev; + refcount_t users; + unsigned int container_users; + struct iommu_group 
*iommu_group; + struct vfio_container *container; + struct list_head device_list; + struct mutex device_lock; + struct list_head vfio_next; + struct list_head container_next; + enum vfio_group_type type; + struct rw_semaphore group_rwsem; + struct kvm *kvm; + struct file *opened_file; + struct blocking_notifier_head notifier; +}; + /* events for the backend driver notify callback */ enum vfio_iommu_notify_type { VFIO_IOMMU_CONTAINER_CLOSE = 0, @@ -75,7 +95,33 @@ struct vfio_iommu_driver_ops { enum vfio_iommu_notify_type event); }; +struct vfio_iommu_driver { + const struct vfio_iommu_driver_ops *ops; + struct list_head vfio_next; +}; + int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops); +bool vfio_assert_device_open(struct vfio_device *device); + +struct vfio_container *vfio_container_from_file(struct file *filep); +int vfio_device_assign_container(struct vfio_device *device); +void vfio_device_unassign_container(struct vfio_device *device); +int vfio_container_attach_group(struct vfio_container *container, + struct vfio_group *group); +void vfio_group_detach_container(struct vfio_group *group); +void vfio_device_container_register(struct vfio_device *device); +void vfio_device_container_unregister(struct vfio_device *device); +long vfio_container_ioctl_check_extension(struct vfio_container *container, + unsigned long arg); +int __init vfio_container_init(void); +void vfio_container_cleanup(void); + +#ifdef CONFIG_VFIO_NOIOMMU +extern bool vfio_noiommu __read_mostly; +#else +enum { vfio_noiommu = false }; +#endif + #endif diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 1ac7160f9329c..af5945c71c417 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -43,8 +43,6 @@ static struct vfio { struct class *class; - struct list_head iommu_drivers_list; - struct mutex iommu_drivers_lock; struct list_head group_list; struct mutex 
group_lock; /* locks group_list */ struct ida group_ida; @@ -53,47 +51,6 @@ static struct vfio { struct ida device_ida; } vfio; -struct vfio_iommu_driver { - const struct vfio_iommu_driver_ops *ops; - struct list_head vfio_next; -}; - -struct vfio_container { - struct kref kref; - struct list_head group_list; - struct rw_semaphore group_lock; - struct vfio_iommu_driver *iommu_driver; - void *iommu_data; - bool noiommu; -}; - -struct vfio_group { - struct device dev; - struct cdev cdev; - refcount_t users; - unsigned int container_users; - struct iommu_group *iommu_group; - struct vfio_container *container; - struct list_head device_list; - struct mutex device_lock; - struct list_head vfio_next; - struct list_head container_next; - enum vfio_group_type type; - struct rw_semaphore group_rwsem; - struct kvm *kvm; - struct file *opened_file; - struct blocking_notifier_head notifier; -}; - -#ifdef CONFIG_VFIO_NOIOMMU -static bool vfio_noiommu __read_mostly; -module_param_named(enable_unsafe_noiommu_mode, - vfio_noiommu, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. 
(default: false)"); -#else -enum { vfio_noiommu = false }; -#endif - static DEFINE_XARRAY(vfio_device_set_xa); static const struct file_operations vfio_group_fops; @@ -168,140 +125,8 @@ static void vfio_release_device_set(struct vfio_device *device) xa_unlock(&vfio_device_set_xa); } -static void *vfio_noiommu_open(unsigned long arg) -{ - if (arg != VFIO_NOIOMMU_IOMMU) - return ERR_PTR(-EINVAL); - if (!capable(CAP_SYS_RAWIO)) - return ERR_PTR(-EPERM); - - return NULL; -} - -static void vfio_noiommu_release(void *iommu_data) -{ -} - -static long vfio_noiommu_ioctl(void *iommu_data, - unsigned int cmd, unsigned long arg) -{ - if (cmd == VFIO_CHECK_EXTENSION) - return vfio_noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0; - - return -ENOTTY; -} - -static int vfio_noiommu_attach_group(void *iommu_data, - struct iommu_group *iommu_group, enum vfio_group_type type) -{ - return 0; -} - -static void vfio_noiommu_detach_group(void *iommu_data, - struct iommu_group *iommu_group) -{ -} - -static const struct vfio_iommu_driver_ops vfio_noiommu_ops = { - .name = "vfio-noiommu", - .owner = THIS_MODULE, - .open = vfio_noiommu_open, - .release = vfio_noiommu_release, - .ioctl = vfio_noiommu_ioctl, - .attach_group = vfio_noiommu_attach_group, - .detach_group = vfio_noiommu_detach_group, -}; - -/* - * Only noiommu containers can use vfio-noiommu and noiommu containers can only - * use vfio-noiommu. 
- */ -static bool vfio_iommu_driver_allowed(struct vfio_container *container, - const struct vfio_iommu_driver *driver) -{ - if (!IS_ENABLED(CONFIG_VFIO_NOIOMMU)) - return true; - return container->noiommu == (driver->ops == &vfio_noiommu_ops); -} - -/* - * IOMMU driver registration - */ -int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops) -{ - struct vfio_iommu_driver *driver, *tmp; - - if (WARN_ON(!ops->register_device != !ops->unregister_device)) - return -EINVAL; - - driver = kzalloc(sizeof(*driver), GFP_KERNEL); - if (!driver) - return -ENOMEM; - - driver->ops = ops; - - mutex_lock(&vfio.iommu_drivers_lock); - - /* Check for duplicates */ - list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) { - if (tmp->ops == ops) { - mutex_unlock(&vfio.iommu_drivers_lock); - kfree(driver); - return -EINVAL; - } - } - - list_add(&driver->vfio_next, &vfio.iommu_drivers_list); - - mutex_unlock(&vfio.iommu_drivers_lock); - - return 0; -} -EXPORT_SYMBOL_GPL(vfio_register_iommu_driver); - -void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops) -{ - struct vfio_iommu_driver *driver; - - mutex_lock(&vfio.iommu_drivers_lock); - list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { - if (driver->ops == ops) { - list_del(&driver->vfio_next); - mutex_unlock(&vfio.iommu_drivers_lock); - kfree(driver); - return; - } - } - mutex_unlock(&vfio.iommu_drivers_lock); -} -EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver); - static void vfio_group_get(struct vfio_group *group); -/* - * Container objects - containers are created when /dev/vfio/vfio is - * opened, but their lifecycle extends until the last user is done, so - * it's freed via kref. Must support container/group/device being - * closed in any order. 
- */ -static void vfio_container_get(struct vfio_container *container) -{ - kref_get(&container->kref); -} - -static void vfio_container_release(struct kref *kref) -{ - struct vfio_container *container; - container = container_of(kref, struct vfio_container, kref); - - kfree(container); -} - -static void vfio_container_put(struct vfio_container *container) -{ - kref_put(&container->kref, vfio_container_release); -} - /* * Group objects - create, release, get, put, search */ @@ -810,263 +635,9 @@ void vfio_unregister_group_dev(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); -/* - * VFIO base fd, /dev/vfio/vfio - */ -static long -vfio_container_ioctl_check_extension(struct vfio_container *container, - unsigned long arg) -{ - struct vfio_iommu_driver *driver; - long ret = 0; - - down_read(&container->group_lock); - - driver = container->iommu_driver; - - switch (arg) { - /* No base extensions yet */ - default: - /* - * If no driver is set, poll all registered drivers for - * extensions and return the first positive result. If - * a driver is already set, further queries will be passed - * only to that driver. 
- */ - if (!driver) { - mutex_lock(&vfio.iommu_drivers_lock); - list_for_each_entry(driver, &vfio.iommu_drivers_list, - vfio_next) { - - if (!list_empty(&container->group_list) && - !vfio_iommu_driver_allowed(container, - driver)) - continue; - if (!try_module_get(driver->ops->owner)) - continue; - - ret = driver->ops->ioctl(NULL, - VFIO_CHECK_EXTENSION, - arg); - module_put(driver->ops->owner); - if (ret > 0) - break; - } - mutex_unlock(&vfio.iommu_drivers_lock); - } else - ret = driver->ops->ioctl(container->iommu_data, - VFIO_CHECK_EXTENSION, arg); - } - - up_read(&container->group_lock); - - return ret; -} - -/* hold write lock on container->group_lock */ -static int __vfio_container_attach_groups(struct vfio_container *container, - struct vfio_iommu_driver *driver, - void *data) -{ - struct vfio_group *group; - int ret = -ENODEV; - - list_for_each_entry(group, &container->group_list, container_next) { - ret = driver->ops->attach_group(data, group->iommu_group, - group->type); - if (ret) - goto unwind; - } - - return ret; - -unwind: - list_for_each_entry_continue_reverse(group, &container->group_list, - container_next) { - driver->ops->detach_group(data, group->iommu_group); - } - - return ret; -} - -static long vfio_ioctl_set_iommu(struct vfio_container *container, - unsigned long arg) -{ - struct vfio_iommu_driver *driver; - long ret = -ENODEV; - - down_write(&container->group_lock); - - /* - * The container is designed to be an unprivileged interface while - * the group can be assigned to specific users. Therefore, only by - * adding a group to a container does the user get the privilege of - * enabling the iommu, which may allocate finite resources. There - * is no unset_iommu, but by removing all the groups from a container, - * the container is deprivileged and returns to an unset state. 
- */ - if (list_empty(&container->group_list) || container->iommu_driver) { - up_write(&container->group_lock); - return -EINVAL; - } - - mutex_lock(&vfio.iommu_drivers_lock); - list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { - void *data; - - if (!vfio_iommu_driver_allowed(container, driver)) - continue; - if (!try_module_get(driver->ops->owner)) - continue; - - /* - * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION, - * so test which iommu driver reported support for this - * extension and call open on them. We also pass them the - * magic, allowing a single driver to support multiple - * interfaces if they'd like. - */ - if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) { - module_put(driver->ops->owner); - continue; - } - - data = driver->ops->open(arg); - if (IS_ERR(data)) { - ret = PTR_ERR(data); - module_put(driver->ops->owner); - continue; - } - - ret = __vfio_container_attach_groups(container, driver, data); - if (ret) { - driver->ops->release(data); - module_put(driver->ops->owner); - continue; - } - - container->iommu_driver = driver; - container->iommu_data = data; - break; - } - - mutex_unlock(&vfio.iommu_drivers_lock); - up_write(&container->group_lock); - - return ret; -} - -static long vfio_fops_unl_ioctl(struct file *filep, - unsigned int cmd, unsigned long arg) -{ - struct vfio_container *container = filep->private_data; - struct vfio_iommu_driver *driver; - void *data; - long ret = -EINVAL; - - if (!container) - return ret; - - switch (cmd) { - case VFIO_GET_API_VERSION: - ret = VFIO_API_VERSION; - break; - case VFIO_CHECK_EXTENSION: - ret = vfio_container_ioctl_check_extension(container, arg); - break; - case VFIO_SET_IOMMU: - ret = vfio_ioctl_set_iommu(container, arg); - break; - default: - driver = container->iommu_driver; - data = container->iommu_data; - - if (driver) /* passthrough all unrecognized ioctls */ - ret = driver->ops->ioctl(data, cmd, arg); - } - - return ret; -} - -static int 
vfio_fops_open(struct inode *inode, struct file *filep) -{ - struct vfio_container *container; - - container = kzalloc(sizeof(*container), GFP_KERNEL); - if (!container) - return -ENOMEM; - - INIT_LIST_HEAD(&container->group_list); - init_rwsem(&container->group_lock); - kref_init(&container->kref); - - filep->private_data = container; - - return 0; -} - -static int vfio_fops_release(struct inode *inode, struct file *filep) -{ - struct vfio_container *container = filep->private_data; - struct vfio_iommu_driver *driver = container->iommu_driver; - - if (driver && driver->ops->notify) - driver->ops->notify(container->iommu_data, - VFIO_IOMMU_CONTAINER_CLOSE); - - filep->private_data = NULL; - - vfio_container_put(container); - - return 0; -} - -static const struct file_operations vfio_fops = { - .owner = THIS_MODULE, - .open = vfio_fops_open, - .release = vfio_fops_release, - .unlocked_ioctl = vfio_fops_unl_ioctl, - .compat_ioctl = compat_ptr_ioctl, -}; - /* * VFIO Group fd, /dev/vfio/$GROUP */ -static void vfio_group_detach_container(struct vfio_group *group) -{ - struct vfio_container *container = group->container; - struct vfio_iommu_driver *driver; - - lockdep_assert_held_write(&group->group_rwsem); - WARN_ON(group->container_users != 1); - - down_write(&container->group_lock); - - driver = container->iommu_driver; - if (driver) - driver->ops->detach_group(container->iommu_data, - group->iommu_group); - - if (group->type == VFIO_IOMMU) - iommu_group_release_dma_owner(group->iommu_group); - - group->container = NULL; - group->container_users = 0; - list_del(&group->container_next); - - /* Detaching the last group deprivileges a container, remove iommu */ - if (driver && list_empty(&container->group_list)) { - driver->ops->release(container->iommu_data); - module_put(driver->ops->owner); - container->iommu_driver = NULL; - container->iommu_data = NULL; - } - - up_write(&container->group_lock); - - vfio_container_put(container); -} - /* * VFIO_GROUP_UNSET_CONTAINER 
should fail if there are other users or * if there was no container to unset. Since the ioctl is called on @@ -1093,71 +664,6 @@ out_unlock: return ret; } -static struct vfio_container *vfio_container_from_file(struct file *file) -{ - struct vfio_container *container; - - /* Sanity check, is this really our fd? */ - if (file->f_op != &vfio_fops) - return NULL; - - container = file->private_data; - WARN_ON(!container); /* fget ensures we don't race vfio_release */ - return container; -} - -static int vfio_container_attach_group(struct vfio_container *container, - struct vfio_group *group) -{ - struct vfio_iommu_driver *driver; - int ret = 0; - - lockdep_assert_held_write(&group->group_rwsem); - - if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - down_write(&container->group_lock); - - /* Real groups and fake groups cannot mix */ - if (!list_empty(&container->group_list) && - container->noiommu != (group->type == VFIO_NO_IOMMU)) { - ret = -EPERM; - goto out_unlock_container; - } - - if (group->type == VFIO_IOMMU) { - ret = iommu_group_claim_dma_owner(group->iommu_group, group); - if (ret) - goto out_unlock_container; - } - - driver = container->iommu_driver; - if (driver) { - ret = driver->ops->attach_group(container->iommu_data, - group->iommu_group, - group->type); - if (ret) { - if (group->type == VFIO_IOMMU) - iommu_group_release_dma_owner( - group->iommu_group); - goto out_unlock_container; - } - } - - group->container = container; - group->container_users = 1; - container->noiommu = (group->type == VFIO_NO_IOMMU); - list_add(&group->container_next, &container->group_list); - - /* Get a reference on the container and mark a user within the group */ - vfio_container_get(container); - -out_unlock_container: - up_write(&container->group_lock); - return ret; -} - static int vfio_group_ioctl_set_container(struct vfio_group *group, int __user *arg) { @@ -1194,58 +700,11 @@ out_unlock: static const struct file_operations 
vfio_device_fops; /* true if the vfio_device has open_device() called but not close_device() */ -static bool vfio_assert_device_open(struct vfio_device *device) +bool vfio_assert_device_open(struct vfio_device *device) { return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); } -static int vfio_device_assign_container(struct vfio_device *device) -{ - struct vfio_group *group = device->group; - - lockdep_assert_held_write(&group->group_rwsem); - - if (!group->container || !group->container->iommu_driver || - WARN_ON(!group->container_users)) - return -EINVAL; - - if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - get_file(group->opened_file); - group->container_users++; - return 0; -} - -static void vfio_device_unassign_container(struct vfio_device *device) -{ - down_write(&device->group->group_rwsem); - WARN_ON(device->group->container_users <= 1); - device->group->container_users--; - fput(device->group->opened_file); - up_write(&device->group->group_rwsem); -} - -static void vfio_device_container_register(struct vfio_device *device) -{ - struct vfio_iommu_driver *iommu_driver = - device->group->container->iommu_driver; - - if (iommu_driver && iommu_driver->ops->register_device) - iommu_driver->ops->register_device( - device->group->container->iommu_data, device); -} - -static void vfio_device_container_unregister(struct vfio_device *device) -{ - struct vfio_iommu_driver *iommu_driver = - device->group->container->iommu_driver; - - if (iommu_driver && iommu_driver->ops->unregister_device) - iommu_driver->ops->unregister_device( - device->group->container->iommu_data, device); -} - static struct file *vfio_device_open(struct vfio_device *device) { struct file *filep; @@ -2281,114 +1740,6 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, } EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); -/* - * Pin contiguous user pages and return their associated host pages for local - * domain only. 
- * @device [in] : device - * @iova [in] : starting IOVA of user pages to be pinned. - * @npage [in] : count of pages to be pinned. This count should not - * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. - * @prot [in] : protection flags - * @pages[out] : array of host pages - * Return error or number of pages pinned. - * - * A driver may only call this function if the vfio_device was created - * by vfio_register_emulated_iommu_dev(). - */ -int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, - int npage, int prot, struct page **pages) -{ - struct vfio_container *container; - struct vfio_group *group = device->group; - struct vfio_iommu_driver *driver; - int ret; - - if (!pages || !npage || !vfio_assert_device_open(device)) - return -EINVAL; - - if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) - return -E2BIG; - - /* group->container cannot change while a vfio device is open */ - container = group->container; - driver = container->iommu_driver; - if (likely(driver && driver->ops->pin_pages)) - ret = driver->ops->pin_pages(container->iommu_data, - group->iommu_group, iova, - npage, prot, pages); - else - ret = -ENOTTY; - - return ret; -} -EXPORT_SYMBOL(vfio_pin_pages); - -/* - * Unpin contiguous host pages for local domain only. - * @device [in] : device - * @iova [in] : starting address of user pages to be unpinned. - * @npage [in] : count of pages to be unpinned. This count should not - * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. 
- */ -void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) -{ - struct vfio_container *container; - struct vfio_iommu_driver *driver; - - if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES)) - return; - - if (WARN_ON(!vfio_assert_device_open(device))) - return; - - /* group->container cannot change while a vfio device is open */ - container = device->group->container; - driver = container->iommu_driver; - - driver->ops->unpin_pages(container->iommu_data, iova, npage); -} -EXPORT_SYMBOL(vfio_unpin_pages); - -/* - * This interface allows the CPUs to perform some sort of virtual DMA on - * behalf of the device. - * - * CPUs read/write from/into a range of IOVAs pointing to user space memory - * into/from a kernel buffer. - * - * As the read/write of user space memory is conducted via the CPUs and is - * not a real device DMA, it is not necessary to pin the user space memory. - * - * @device [in] : VFIO device - * @iova [in] : base IOVA of a user space buffer - * @data [in] : pointer to kernel buffer - * @len [in] : kernel buffer length - * @write : indicate read or write - * Return error code on failure or 0 on success. 
- */ -int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, - size_t len, bool write) -{ - struct vfio_container *container; - struct vfio_iommu_driver *driver; - int ret = 0; - - if (!data || len <= 0 || !vfio_assert_device_open(device)) - return -EINVAL; - - /* group->container cannot change while a vfio device is open */ - container = device->group->container; - driver = container->iommu_driver; - - if (likely(driver && driver->ops->dma_rw)) - ret = driver->ops->dma_rw(container->iommu_data, - iova, data, len, write); - else - ret = -ENOTTY; - return ret; -} -EXPORT_SYMBOL(vfio_dma_rw); - /* * Module/class support */ @@ -2397,47 +1748,6 @@ static char *vfio_devnode(struct device *dev, umode_t *mode) return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); } -static struct miscdevice vfio_dev = { - .minor = VFIO_MINOR, - .name = "vfio", - .fops = &vfio_fops, - .nodename = "vfio/vfio", - .mode = S_IRUGO | S_IWUGO, -}; - -static int __init vfio_container_init(void) -{ - int ret; - - mutex_init(&vfio.iommu_drivers_lock); - INIT_LIST_HEAD(&vfio.iommu_drivers_list); - - ret = misc_register(&vfio_dev); - if (ret) { - pr_err("vfio: misc device register failed\n"); - return ret; - } - - if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) { - ret = vfio_register_iommu_driver(&vfio_noiommu_ops); - if (ret) - goto err_misc; - } - return 0; - -err_misc: - misc_deregister(&vfio_dev); - return ret; -} - -static void vfio_container_cleanup(void) -{ - if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) - vfio_unregister_iommu_driver(&vfio_noiommu_ops); - misc_deregister(&vfio_dev); - mutex_destroy(&vfio.iommu_drivers_lock); -} - static int __init vfio_init(void) { int ret; -- GitLab From 6f65540b22afaa9c3d621bfb8b2a2958fedf6179 Mon Sep 17 00:00:00 2001 From: Chia-Wei Wang Date: Tue, 20 Sep 2022 10:03:33 +0800 Subject: [PATCH 0441/2223] ipmi: kcs: aspeed: Update port address comments Remove AST_usrGuide_KCS.pdf as it is no longer maintained. 
Add more descriptions as the driver now supports the I/O address configurations for both the KCS Data and Cmd/Status interface registers. Signed-off-by: Chia-Wei Wang Message-Id: <20220920020333.601-1-chiawei_wang@aspeedtech.com> [I don't like removing documentation, but the document in question was a personal note by an employee and nothing official and not necessarily guaranteed to be accurate in the future. So go ahead and remove it.] Signed-off-by: Corey Minyard --- drivers/char/ipmi/kcs_bmc_aspeed.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/char/ipmi/kcs_bmc_aspeed.c b/drivers/char/ipmi/kcs_bmc_aspeed.c index cdc88cde1e9aa..19c32bf50e0e9 100644 --- a/drivers/char/ipmi/kcs_bmc_aspeed.c +++ b/drivers/char/ipmi/kcs_bmc_aspeed.c @@ -207,17 +207,24 @@ static void aspeed_kcs_updateb(struct kcs_bmc_device *kcs_bmc, u32 reg, u8 mask, } /* - * AST_usrGuide_KCS.pdf - * 2. Background: - * we note D for Data, and C for Cmd/Status, default rules are - * A. KCS1 / KCS2 ( D / C:X / X+4 ) - * D / C : CA0h / CA4h - * D / C : CA8h / CACh - * B. KCS3 ( D / C:XX2h / XX3h ) - * D / C : CA2h / CA3h - * D / C : CB2h / CB3h - * C. KCS4 - * D / C : CA4h / CA5h + * We note D for Data, and C for Cmd/Status, default rules are + * + * 1. Only the D address is given: + * A. KCS1/KCS2 (D/C: X/X+4) + * D/C: CA0h/CA4h + * D/C: CA8h/CACh + * B. KCS3 (D/C: XX2/XX3h) + * D/C: CA2h/CA3h + * C. KCS4 (D/C: X/X+1) + * D/C: CA4h/CA5h + * + * 2. Both the D/C addresses are given: + * A. KCS1/KCS2/KCS4 (D/C: X/Y) + * D/C: CA0h/CA1h + * D/C: CA8h/CA9h + * D/C: CA4h/CA5h + * B. 
KCS3 (D/C: XX2/XX3h) + * D/C: CA2h/CA3h */ static int aspeed_kcs_set_address(struct kcs_bmc_device *kcs_bmc, u32 addrs[2], int nr_addrs) { -- GitLab From 13a0ac816d22aa47d6c393f14a99f39e49b960df Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 23 Sep 2022 14:53:14 +0200 Subject: [PATCH 0442/2223] firmware: dmi: Fortify entry point length checks Ensure that the SMBIOS entry point is long enough to include all the fields we need. Otherwise it is pointless to even attempt to verify its checksum. Also fix the maximum length check, which is technically 32, not 31. It does not matter in practice as the only valid values are 31 (for SMBIOS 2.x) and 24 (for SMBIOS 3.x), but let's still have the check right in case new fields are added to either structure in the future. Signed-off-by: Jean Delvare Reported-by: Linus Torvalds Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/lkml/20220823094857.27f3d924@endymion.delvare/T/ --- drivers/firmware/dmi_scan.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 0eb6b617f709a..015c95a825d31 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -567,8 +567,13 @@ static int __init dmi_present(const u8 *buf) { u32 smbios_ver; + /* + * The size of this structure is 31 bytes, but we also accept value + * 30 due to a mistake in SMBIOS specification version 2.1. 
+ */ if (memcmp(buf, "_SM_", 4) == 0 && - buf[5] < 32 && dmi_checksum(buf, buf[5])) { + buf[5] >= 30 && buf[5] <= 32 && + dmi_checksum(buf, buf[5])) { smbios_ver = get_unaligned_be16(buf + 6); smbios_entry_point_size = buf[5]; memcpy(smbios_entry_point, buf, smbios_entry_point_size); @@ -629,7 +634,8 @@ static int __init dmi_present(const u8 *buf) static int __init dmi_smbios3_present(const u8 *buf) { if (memcmp(buf, "_SM3_", 5) == 0 && - buf[6] < 32 && dmi_checksum(buf, buf[6])) { + buf[6] >= 24 && buf[6] <= 32 && + dmi_checksum(buf, buf[6])) { dmi_ver = get_unaligned_be24(buf + 7); dmi_num = 0; /* No longer specified */ dmi_len = get_unaligned_le32(buf + 12); -- GitLab From 1a8339c6bdcf7d66a83152ee5ff13c50da761295 Mon Sep 17 00:00:00 2001 From: Yunlong Jia Date: Fri, 23 Sep 2022 09:35:48 -0700 Subject: [PATCH 0443/2223] dt-bindings: input: touchscreen: elants_i2c: Add compatible for eth3915n chip This adds a new compatible string for Elan eth3915n touchscreen controller, which is compatible with ekth3500. 
Signed-off-by: Yunlong Jia Suggested-by: Douglas Anderson Reviewed-by: Krzysztof Kozlowski Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20220923083657.v5.2.Ic4e8f03868f88b8027a81bc3d414bae68978e6b7@changeid Signed-off-by: Dmitry Torokhov --- .../bindings/input/touchscreen/elan,elants_i2c.yaml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/input/touchscreen/elan,elants_i2c.yaml b/Documentation/devicetree/bindings/input/touchscreen/elan,elants_i2c.yaml index a9b53c2e6f0ab..f9053e5e9b240 100644 --- a/Documentation/devicetree/bindings/input/touchscreen/elan,elants_i2c.yaml +++ b/Documentation/devicetree/bindings/input/touchscreen/elan,elants_i2c.yaml @@ -14,9 +14,13 @@ allOf: properties: compatible: - enum: - - elan,ektf3624 - - elan,ekth3500 + oneOf: + - enum: + - elan,ektf3624 + - elan,ekth3500 + - items: + - const: elan,ekth3915 + - const: elan,ekth3500 reg: maxItems: 1 -- GitLab From 7aacc42f8d7d3bc9efde159b534d9199d9e2cc87 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 23 Sep 2022 21:46:31 +0300 Subject: [PATCH 0444/2223] Input: matrix_keypad - add missed header inclusion The gpiod_count() API is defined in gpio/consumer.h. Include it. 
Fixes: f8f7f47d576f ("Input: matrix_keypad - replace of_gpio_named_count() by gpiod_count()") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220923184632.2157-1-andriy.shevchenko@linux.intel.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/matrix_keypad.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/keyboard/matrix_keypad.c b/drivers/input/keyboard/matrix_keypad.c index 63f078f2bc4ae..7dd3f3eda834e 100644 --- a/drivers/input/keyboard/matrix_keypad.c +++ b/drivers/input/keyboard/matrix_keypad.c @@ -9,6 +9,7 @@ #include #include +#include <linux/gpio/consumer.h> #include #include #include -- GitLab From 28f677e9d15181556c1f2103d93b9cc093e7b91f Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Wed, 8 Jun 2022 14:48:00 +0200 Subject: [PATCH 0445/2223] Input: synaptics-rmi4 - fix firmware update operations with bootloader v8 Commit a6977d758fed ("Input: synaptics-rmi4 - support bootloader v8 in f34v7") allowed the F34v7 driver to probe with bootloader v8, but it did not update various other bootloader version checks in the F34 code.
Fixes: a6977d758fed ("Input: synaptics-rmi4 - support bootloader v8 in f34v7") Signed-off-by: Matthias Schiffer Reviewed-by: Lyude Paul Link: https://lore.kernel.org/r/20220608124808.51402-2-matthias.schiffer@ew.tq-group.com Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/rmi_f34.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/input/rmi4/rmi_f34.c b/drivers/input/rmi4/rmi_f34.c index e5dca9868f87f..3afc94f679edc 100644 --- a/drivers/input/rmi4/rmi_f34.c +++ b/drivers/input/rmi4/rmi_f34.c @@ -370,7 +370,7 @@ static int rmi_firmware_update(struct rmi_driver_data *data, f34 = dev_get_drvdata(&data->f34_container->dev); - if (f34->bl_version == 7) { + if (f34->bl_version >= 7) { if (data->pdt_props & HAS_BSR) { dev_err(dev, "%s: LTS not supported\n", __func__); return -ENODEV; @@ -382,7 +382,7 @@ static int rmi_firmware_update(struct rmi_driver_data *data, } /* Enter flash mode */ - if (f34->bl_version == 7) + if (f34->bl_version >= 7) ret = rmi_f34v7_start_reflash(f34, fw); else ret = rmi_f34_enable_flash(f34); @@ -413,7 +413,7 @@ static int rmi_firmware_update(struct rmi_driver_data *data, f34 = dev_get_drvdata(&data->f34_container->dev); /* Perform firmware update */ - if (f34->bl_version == 7) + if (f34->bl_version >= 7) ret = rmi_f34v7_do_reflash(f34, fw); else ret = rmi_f34_update_firmware(f34, fw); -- GitLab From 33fe4d976ff2c1fb6caf961b2e7bbfa66b8a9bf6 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Wed, 8 Jun 2022 14:48:01 +0200 Subject: [PATCH 0446/2223] Input: synaptics-rmi4 - introduce rmi_f34v7_check_command_status() helper Add a function that waits for the last command to complete and checks the status, and use it where appropriate. This prepares for the subsequent fix of the completion condition in rmi_f34_attention(), which would previously lead to a timeout instead of a more detailed error message whenever a command was unsuccessful with v7/v8 bootloaders. 
Signed-off-by: Matthias Schiffer Reviewed-by: Lyude Paul Link: https://lore.kernel.org/r/20220608124808.51402-3-matthias.schiffer@ew.tq-group.com Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/rmi_f34v7.c | 36 +++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/drivers/input/rmi4/rmi_f34v7.c b/drivers/input/rmi4/rmi_f34v7.c index 8d7ec9d89b185..9049acb3a994a 100644 --- a/drivers/input/rmi4/rmi_f34v7.c +++ b/drivers/input/rmi4/rmi_f34v7.c @@ -72,6 +72,24 @@ static int rmi_f34v7_wait_for_idle(struct f34_data *f34, int timeout_ms) return 0; } +static int rmi_f34v7_check_command_status(struct f34_data *f34, int timeout_ms) +{ + int ret; + + ret = rmi_f34v7_wait_for_idle(f34, timeout_ms); + if (ret < 0) + return ret; + + ret = rmi_f34v7_read_flash_status(f34); + if (ret < 0) + return ret; + + if (f34->v7.flash_status != 0x00) + return -EIO; + + return 0; +} + static int rmi_f34v7_write_command_single_transaction(struct f34_data *f34, u8 cmd) { @@ -318,6 +336,10 @@ static int rmi_f34v7_read_partition_table(struct f34_data *f34) return ret; } + /* + * rmi_f34v7_check_command_status() can't be used here, as this + * function is called before IRQs are available + */ timeout = msecs_to_jiffies(F34_WRITE_WAIT_MS); while (time_before(jiffies, timeout)) { usleep_range(5000, 6000); @@ -674,7 +696,7 @@ static int rmi_f34v7_erase_config(struct f34_data *f34) break; } - ret = rmi_f34v7_wait_for_idle(f34, F34_ERASE_WAIT_MS); + ret = rmi_f34v7_check_command_status(f34, F34_ERASE_WAIT_MS); if (ret < 0) return ret; @@ -693,7 +715,7 @@ static int rmi_f34v7_erase_guest_code(struct f34_data *f34) if (ret < 0) return ret; - ret = rmi_f34v7_wait_for_idle(f34, F34_ERASE_WAIT_MS); + ret = rmi_f34v7_check_command_status(f34, F34_ERASE_WAIT_MS); if (ret < 0) return ret; @@ -712,7 +734,7 @@ static int rmi_f34v7_erase_all(struct f34_data *f34) if (ret < 0) return ret; - ret = rmi_f34v7_wait_for_idle(f34, F34_ERASE_WAIT_MS); + ret = 
rmi_f34v7_check_command_status(f34, F34_ERASE_WAIT_MS); if (ret < 0) return ret; @@ -787,7 +809,7 @@ static int rmi_f34v7_read_blocks(struct f34_data *f34, if (ret < 0) return ret; - ret = rmi_f34v7_wait_for_idle(f34, F34_ENABLE_WAIT_MS); + ret = rmi_f34v7_check_command_status(f34, F34_ENABLE_WAIT_MS); if (ret < 0) return ret; @@ -871,7 +893,7 @@ static int rmi_f34v7_write_f34v7_blocks(struct f34_data *f34, return ret; } - ret = rmi_f34v7_wait_for_idle(f34, F34_ENABLE_WAIT_MS); + ret = rmi_f34v7_check_command_status(f34, F34_ENABLE_WAIT_MS); if (ret < 0) return ret; @@ -944,7 +966,7 @@ static int rmi_f34v7_write_flash_config(struct f34_data *f34) rmi_dbg(RMI_DEBUG_FN, &f34->fn->dev, "%s: Erase flash config command written\n", __func__); - ret = rmi_f34v7_wait_for_idle(f34, F34_WRITE_WAIT_MS); + ret = rmi_f34v7_check_command_status(f34, F34_WRITE_WAIT_MS); if (ret < 0) return ret; @@ -1297,7 +1319,7 @@ static int rmi_f34v7_enter_flash_prog(struct f34_data *f34) if (ret < 0) return ret; - ret = rmi_f34v7_wait_for_idle(f34, F34_ENABLE_WAIT_MS); + ret = rmi_f34v7_check_command_status(f34, F34_ENABLE_WAIT_MS); if (ret < 0) return ret; -- GitLab From b4d6c6a07faa5c860421182d7599f12acfc7dfd0 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Wed, 8 Jun 2022 14:48:02 +0200 Subject: [PATCH 0447/2223] Input: synaptics-rmi4 - fix command completion check for bootloader v7/v8 The command register is reset to 0 when a command has completed. Check for this condition instead of the error status, which will not accurately reflect completion. In particular, the incorrect condition caused every command error to be reported as a timeout. 
Signed-off-by: Matthias Schiffer Reviewed-by: Lyude Paul Link: https://lore.kernel.org/r/20220608124808.51402-4-matthias.schiffer@ew.tq-group.com Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/rmi_f34.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/input/rmi4/rmi_f34.c b/drivers/input/rmi4/rmi_f34.c index 3afc94f679edc..b811706fb77b5 100644 --- a/drivers/input/rmi4/rmi_f34.c +++ b/drivers/input/rmi4/rmi_f34.c @@ -114,13 +114,13 @@ static irqreturn_t rmi_f34_attention(int irq, void *ctx) complete(&f34->v5.cmd_done); } else { ret = rmi_read_block(f34->fn->rmi_dev, - f34->fn->fd.data_base_addr + - f34->v7.off.flash_status, - &status, sizeof(status)); - rmi_dbg(RMI_DEBUG_FN, &fn->dev, "%s: status: %#02x, ret: %d\n", + f34->fn->fd.data_base_addr + + f34->v7.off.flash_cmd, + &status, sizeof(status)); + rmi_dbg(RMI_DEBUG_FN, &f34->fn->dev, "%s: cmd: %#02x, ret: %d\n", __func__, status, ret); - if (!ret && !(status & 0x1f)) + if (!ret && status == CMD_V7_IDLE) complete(&f34->v7.cmd_done); } -- GitLab From 0113b49bd9634ea96cb1237a35308d41f72175af Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Wed, 8 Jun 2022 14:48:03 +0200 Subject: [PATCH 0448/2223] Input: synaptics-rmi4 - rewrite partition table unconditionally Preparation for use of the "erase application" command, which is required to recover from a bad partition table error condition. Rather than adding complex fallback error paths for such errors, it seems more robust to do the full erase unconditionally. 
Signed-off-by: Matthias Schiffer Reviewed-by: Lyude Paul Link: https://lore.kernel.org/r/20220608124808.51402-5-matthias.schiffer@ew.tq-group.com Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/rmi_f34.h | 2 - drivers/input/rmi4/rmi_f34v7.c | 153 +++------------------------------ 2 files changed, 13 insertions(+), 142 deletions(-) diff --git a/drivers/input/rmi4/rmi_f34.h b/drivers/input/rmi4/rmi_f34.h index 99faa8c2269df..9495c85428240 100644 --- a/drivers/input/rmi4/rmi_f34.h +++ b/drivers/input/rmi4/rmi_f34.h @@ -262,7 +262,6 @@ struct f34v5_data { struct f34v7_data { bool has_display_cfg; bool has_guest_code; - bool force_update; bool in_bl_mode; u8 *read_config_buf; size_t read_config_buf_size; @@ -276,7 +275,6 @@ struct f34v7_data { u16 payload_length; u8 partitions; u16 partition_table_bytes; - bool new_partition_table; struct register_offset off; struct block_count blkcount; diff --git a/drivers/input/rmi4/rmi_f34v7.c b/drivers/input/rmi4/rmi_f34v7.c index 9049acb3a994a..19b94b1c1a339 100644 --- a/drivers/input/rmi4/rmi_f34v7.c +++ b/drivers/input/rmi4/rmi_f34v7.c @@ -593,68 +593,6 @@ static int rmi_f34v7_read_queries(struct f34_data *f34) return 0; } -static int rmi_f34v7_check_ui_firmware_size(struct f34_data *f34) -{ - u16 block_count; - - block_count = f34->v7.img.ui_firmware.size / f34->v7.block_size; - f34->update_size += block_count; - - if (block_count != f34->v7.blkcount.ui_firmware) { - dev_err(&f34->fn->dev, - "UI firmware size mismatch: %d != %d\n", - block_count, f34->v7.blkcount.ui_firmware); - return -EINVAL; - } - - return 0; -} - -static int rmi_f34v7_check_ui_config_size(struct f34_data *f34) -{ - u16 block_count; - - block_count = f34->v7.img.ui_config.size / f34->v7.block_size; - f34->update_size += block_count; - - if (block_count != f34->v7.blkcount.ui_config) { - dev_err(&f34->fn->dev, "UI config size mismatch\n"); - return -EINVAL; - } - - return 0; -} - -static int rmi_f34v7_check_dp_config_size(struct f34_data *f34) -{ - u16 
block_count; - - block_count = f34->v7.img.dp_config.size / f34->v7.block_size; - f34->update_size += block_count; - - if (block_count != f34->v7.blkcount.dp_config) { - dev_err(&f34->fn->dev, "Display config size mismatch\n"); - return -EINVAL; - } - - return 0; -} - -static int rmi_f34v7_check_guest_code_size(struct f34_data *f34) -{ - u16 block_count; - - block_count = f34->v7.img.guest_code.size / f34->v7.block_size; - f34->update_size += block_count; - - if (block_count != f34->v7.blkcount.guest_code) { - dev_err(&f34->fn->dev, "Guest code size mismatch\n"); - return -EINVAL; - } - - return 0; -} - static int rmi_f34v7_check_bl_config_size(struct f34_data *f34) { u16 block_count; @@ -750,7 +688,7 @@ static int rmi_f34v7_erase_all(struct f34_data *f34) return ret; } - if (f34->v7.new_partition_table && f34->v7.has_guest_code) { + if (f34->v7.has_guest_code) { ret = rmi_f34v7_erase_guest_code(f34); if (ret < 0) return ret; @@ -1029,33 +967,6 @@ static int rmi_f34v7_write_firmware(struct f34_data *f34) blk_count, v7_CMD_WRITE_FW); } -static void rmi_f34v7_compare_partition_tables(struct f34_data *f34) -{ - if (f34->v7.phyaddr.ui_firmware != f34->v7.img.phyaddr.ui_firmware) { - f34->v7.new_partition_table = true; - return; - } - - if (f34->v7.phyaddr.ui_config != f34->v7.img.phyaddr.ui_config) { - f34->v7.new_partition_table = true; - return; - } - - if (f34->v7.has_display_cfg && - f34->v7.phyaddr.dp_config != f34->v7.img.phyaddr.dp_config) { - f34->v7.new_partition_table = true; - return; - } - - if (f34->v7.has_guest_code && - f34->v7.phyaddr.guest_code != f34->v7.img.phyaddr.guest_code) { - f34->v7.new_partition_table = true; - return; - } - - f34->v7.new_partition_table = false; -} - static void rmi_f34v7_parse_img_header_10_bl_container(struct f34_data *f34, const void *image) { @@ -1202,8 +1113,6 @@ static int rmi_f34v7_parse_image_info(struct f34_data *f34) rmi_f34v7_parse_partition_table(f34, f34->v7.img.fl_config.data, &f34->v7.img.blkcount, 
&f34->v7.img.phyaddr); - rmi_f34v7_compare_partition_tables(f34); - return 0; } @@ -1224,44 +1133,18 @@ int rmi_f34v7_do_reflash(struct f34_data *f34, const struct firmware *fw) if (ret < 0) goto fail; - if (!f34->v7.new_partition_table) { - ret = rmi_f34v7_check_ui_firmware_size(f34); - if (ret < 0) - goto fail; - - ret = rmi_f34v7_check_ui_config_size(f34); - if (ret < 0) - goto fail; - - if (f34->v7.has_display_cfg && - f34->v7.img.contains_display_cfg) { - ret = rmi_f34v7_check_dp_config_size(f34); - if (ret < 0) - goto fail; - } - - if (f34->v7.has_guest_code && f34->v7.img.contains_guest_code) { - ret = rmi_f34v7_check_guest_code_size(f34); - if (ret < 0) - goto fail; - } - } else { - ret = rmi_f34v7_check_bl_config_size(f34); - if (ret < 0) - goto fail; - } + ret = rmi_f34v7_check_bl_config_size(f34); + if (ret < 0) + goto fail; ret = rmi_f34v7_erase_all(f34); if (ret < 0) goto fail; - if (f34->v7.new_partition_table) { - ret = rmi_f34v7_write_partition_table(f34); - if (ret < 0) - goto fail; - dev_info(&f34->fn->dev, "%s: Partition table programmed\n", - __func__); - } + ret = rmi_f34v7_write_partition_table(f34); + if (ret < 0) + goto fail; + dev_info(&f34->fn->dev, "%s: Partition table programmed\n", __func__); dev_info(&f34->fn->dev, "Writing firmware (%d bytes)...\n", f34->v7.img.ui_firmware.size); @@ -1286,14 +1169,12 @@ int rmi_f34v7_do_reflash(struct f34_data *f34, const struct firmware *fw) goto fail; } - if (f34->v7.new_partition_table) { - if (f34->v7.has_guest_code && f34->v7.img.contains_guest_code) { - dev_info(&f34->fn->dev, "Writing guest code...\n"); + if (f34->v7.has_guest_code && f34->v7.img.contains_guest_code) { + dev_info(&f34->fn->dev, "Writing guest code...\n"); - ret = rmi_f34v7_write_guest_code(f34); - if (ret < 0) - goto fail; - } + ret = rmi_f34v7_write_guest_code(f34); + if (ret < 0) + goto fail; } fail: @@ -1339,13 +1220,6 @@ int rmi_f34v7_start_reflash(struct f34_data *f34, const struct firmware *fw) if (ret < 0) goto exit; - 
if (!f34->v7.force_update && f34->v7.new_partition_table) { - dev_err(&f34->fn->dev, "%s: Partition table mismatch\n", - __func__); - ret = -EINVAL; - goto exit; - } - dev_info(&f34->fn->dev, "Firmware image OK\n"); ret = rmi_f34v7_read_flash_status(f34); @@ -1406,6 +1280,5 @@ int rmi_f34v7_probe(struct f34_data *f34) if (ret < 0) return ret; - f34->v7.force_update = true; return 0; } -- GitLab From d316e709cd7e3a84fa6a9b93d0c25754c0cb707e Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Wed, 8 Jun 2022 14:48:04 +0200 Subject: [PATCH 0449/2223] Input: synaptics-rmi4 - reset after writing partition table When recovering from a bad partition table (for example after an interrupted update), a reset is necessary for the new partition table to become effective. Without this reset, writing the core code partition will fail with status 0x03 (Invalid Command). Signed-off-by: Matthias Schiffer Reviewed-by: Lyude Paul Link: https://lore.kernel.org/r/20220608124808.51402-6-matthias.schiffer@ew.tq-group.com Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/rmi_f34v7.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/input/rmi4/rmi_f34v7.c b/drivers/input/rmi4/rmi_f34v7.c index 19b94b1c1a339..9b78f98bb21cc 100644 --- a/drivers/input/rmi4/rmi_f34v7.c +++ b/drivers/input/rmi4/rmi_f34v7.c @@ -1146,6 +1146,14 @@ int rmi_f34v7_do_reflash(struct f34_data *f34, const struct firmware *fw) goto fail; dev_info(&f34->fn->dev, "%s: Partition table programmed\n", __func__); + /* + * Reset to reload partition table - as the previous firmware has been + * erased, we remain in bootloader mode. 
+ */ + ret = rmi_scan_pdt(f34->fn->rmi_dev, NULL, rmi_initial_reset); + if (ret < 0) + dev_warn(&f34->fn->dev, "RMI reset failed!\n"); + dev_info(&f34->fn->dev, "Writing firmware (%d bytes)...\n", f34->v7.img.ui_firmware.size); -- GitLab From d8d007f25cb6a25b25c0b6447b620638f1febcd3 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Wed, 8 Jun 2022 14:48:05 +0200 Subject: [PATCH 0450/2223] Input: synaptics-rmi4 - make rmi_f34v7_erase_all() use the "erase all" command A full erase is required to recover from error conditions like "Bad Partition Table". Various individual partition erase commands can be (and need to be) omitted, as they will fail until a new partition table has been written. Signed-off-by: Matthias Schiffer Reviewed-by: Lyude Paul Link: https://lore.kernel.org/r/20220608124808.51402-7-matthias.schiffer@ew.tq-group.com Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/rmi_f34v7.c | 87 +--------------------------------- 1 file changed, 1 insertion(+), 86 deletions(-) diff --git a/drivers/input/rmi4/rmi_f34v7.c b/drivers/input/rmi4/rmi_f34v7.c index 9b78f98bb21cc..9c1a736117616 100644 --- a/drivers/input/rmi4/rmi_f34v7.c +++ b/drivers/input/rmi4/rmi_f34v7.c @@ -608,58 +608,6 @@ static int rmi_f34v7_check_bl_config_size(struct f34_data *f34) return 0; } -static int rmi_f34v7_erase_config(struct f34_data *f34) -{ - int ret; - - dev_info(&f34->fn->dev, "Erasing config...\n"); - - init_completion(&f34->v7.cmd_done); - - switch (f34->v7.config_area) { - case v7_UI_CONFIG_AREA: - ret = rmi_f34v7_write_command(f34, v7_CMD_ERASE_UI_CONFIG); - if (ret < 0) - return ret; - break; - case v7_DP_CONFIG_AREA: - ret = rmi_f34v7_write_command(f34, v7_CMD_ERASE_DISP_CONFIG); - if (ret < 0) - return ret; - break; - case v7_BL_CONFIG_AREA: - ret = rmi_f34v7_write_command(f34, v7_CMD_ERASE_BL_CONFIG); - if (ret < 0) - return ret; - break; - } - - ret = rmi_f34v7_check_command_status(f34, F34_ERASE_WAIT_MS); - if (ret < 0) - return ret; - - return 0; -} - -static 
int rmi_f34v7_erase_guest_code(struct f34_data *f34) -{ - int ret; - - dev_info(&f34->fn->dev, "Erasing guest code...\n"); - - init_completion(&f34->v7.cmd_done); - - ret = rmi_f34v7_write_command(f34, v7_CMD_ERASE_GUEST_CODE); - if (ret < 0) - return ret; - - ret = rmi_f34v7_check_command_status(f34, F34_ERASE_WAIT_MS); - if (ret < 0) - return ret; - - return 0; -} - static int rmi_f34v7_erase_all(struct f34_data *f34) { int ret; @@ -668,7 +616,7 @@ static int rmi_f34v7_erase_all(struct f34_data *f34) init_completion(&f34->v7.cmd_done); - ret = rmi_f34v7_write_command(f34, v7_CMD_ERASE_UI_FIRMWARE); + ret = rmi_f34v7_write_command(f34, v7_CMD_ERASE_ALL); if (ret < 0) return ret; @@ -676,24 +624,6 @@ static int rmi_f34v7_erase_all(struct f34_data *f34) if (ret < 0) return ret; - f34->v7.config_area = v7_UI_CONFIG_AREA; - ret = rmi_f34v7_erase_config(f34); - if (ret < 0) - return ret; - - if (f34->v7.has_display_cfg) { - f34->v7.config_area = v7_DP_CONFIG_AREA; - ret = rmi_f34v7_erase_config(f34); - if (ret < 0) - return ret; - } - - if (f34->v7.has_guest_code) { - ret = rmi_f34v7_erase_guest_code(f34); - if (ret < 0) - return ret; - } - return 0; } @@ -897,17 +827,6 @@ static int rmi_f34v7_write_flash_config(struct f34_data *f34) init_completion(&f34->v7.cmd_done); - ret = rmi_f34v7_write_command(f34, v7_CMD_ERASE_FLASH_CONFIG); - if (ret < 0) - return ret; - - rmi_dbg(RMI_DEBUG_FN, &f34->fn->dev, - "%s: Erase flash config command written\n", __func__); - - ret = rmi_f34v7_check_command_status(f34, F34_WRITE_WAIT_MS); - if (ret < 0) - return ret; - ret = rmi_f34v7_write_config(f34); if (ret < 0) return ret; @@ -937,10 +856,6 @@ static int rmi_f34v7_write_partition_table(struct f34_data *f34) if (ret < 0) return ret; - ret = rmi_f34v7_erase_config(f34); - if (ret < 0) - return ret; - ret = rmi_f34v7_write_flash_config(f34); if (ret < 0) return ret; -- GitLab From b077d523d4d91e7239864997ffe2a30ac30055c1 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Wed, 8 
Jun 2022 14:48:06 +0200 Subject: [PATCH 0451/2223] Input: synaptics-rmi4 - remove unneeded struct register_offset All register offsets are fixed, and a number of places even read or write multiple registers as a block, so there is no way to support reordering them without more involved changes. Remove the unneeded level of indirection in the register access. Signed-off-by: Matthias Schiffer Reviewed-by: Lyude Paul Link: https://lore.kernel.org/r/20220608124808.51402-8-matthias.schiffer@ew.tq-group.com Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/rmi_f34.c | 2 +- drivers/input/rmi4/rmi_f34.h | 15 --------------- drivers/input/rmi4/rmi_f34v7.c | 35 ++++++++++++++-------------------- 3 files changed, 15 insertions(+), 37 deletions(-) diff --git a/drivers/input/rmi4/rmi_f34.c b/drivers/input/rmi4/rmi_f34.c index b811706fb77b5..30169b584573c 100644 --- a/drivers/input/rmi4/rmi_f34.c +++ b/drivers/input/rmi4/rmi_f34.c @@ -115,7 +115,7 @@ static irqreturn_t rmi_f34_attention(int irq, void *ctx) } else { ret = rmi_read_block(f34->fn->rmi_dev, f34->fn->fd.data_base_addr + - f34->v7.off.flash_cmd, + V7_COMMAND_OFFSET, &status, sizeof(status)); rmi_dbg(RMI_DEBUG_FN, &f34->fn->dev, "%s: cmd: %#02x, ret: %d\n", __func__, status, ret); diff --git a/drivers/input/rmi4/rmi_f34.h b/drivers/input/rmi4/rmi_f34.h index 9495c85428240..cfa3039804fd2 100644 --- a/drivers/input/rmi4/rmi_f34.h +++ b/drivers/input/rmi4/rmi_f34.h @@ -222,20 +222,6 @@ struct image_metadata { struct physical_address phyaddr; }; -struct register_offset { - u8 properties; - u8 properties_2; - u8 block_size; - u8 block_count; - u8 gc_block_count; - u8 flash_status; - u8 partition_id; - u8 block_number; - u8 transfer_length; - u8 flash_cmd; - u8 payload; -}; - struct rmi_f34_firmware { __le32 checksum; u8 pad1[3]; @@ -276,7 +262,6 @@ struct f34v7_data { u8 partitions; u16 partition_table_bytes; - struct register_offset off; struct block_count blkcount; struct physical_address phyaddr; struct
image_metadata img; diff --git a/drivers/input/rmi4/rmi_f34v7.c b/drivers/input/rmi4/rmi_f34v7.c index 9c1a736117616..5c22ad4bcc746 100644 --- a/drivers/input/rmi4/rmi_f34v7.c +++ b/drivers/input/rmi4/rmi_f34v7.c @@ -25,7 +25,7 @@ static int rmi_f34v7_read_flash_status(struct f34_data *f34) int ret; ret = rmi_read_block(f34->fn->rmi_dev, - f34->fn->fd.data_base_addr + f34->v7.off.flash_status, + f34->fn->fd.data_base_addr + V7_FLASH_STATUS_OFFSET, &status, sizeof(status)); if (ret < 0) { @@ -43,7 +43,7 @@ static int rmi_f34v7_read_flash_status(struct f34_data *f34) } ret = rmi_read_block(f34->fn->rmi_dev, - f34->fn->fd.data_base_addr + f34->v7.off.flash_cmd, + f34->fn->fd.data_base_addr + V7_COMMAND_OFFSET, &command, sizeof(command)); if (ret < 0) { @@ -140,7 +140,7 @@ static int rmi_f34v7_write_command_single_transaction(struct f34_data *f34, data_1_5.payload[1] = f34->bootloader_id[1]; ret = rmi_write_block(f34->fn->rmi_dev, - base + f34->v7.off.partition_id, + base + V7_PARTITION_ID_OFFSET, &data_1_5, sizeof(data_1_5)); if (ret < 0) { dev_err(&f34->fn->dev, @@ -213,7 +213,7 @@ static int rmi_f34v7_write_command(struct f34_data *f34, u8 cmd) __func__, command); ret = rmi_write_block(f34->fn->rmi_dev, - base + f34->v7.off.flash_cmd, + base + V7_COMMAND_OFFSET, &command, sizeof(command)); if (ret < 0) { dev_err(&f34->fn->dev, "%s: Failed to write flash command\n", @@ -280,7 +280,7 @@ static int rmi_f34v7_write_partition_id(struct f34_data *f34, u8 cmd) } ret = rmi_write_block(f34->fn->rmi_dev, - base + f34->v7.off.partition_id, + base + V7_PARTITION_ID_OFFSET, &partition, sizeof(partition)); if (ret < 0) { dev_err(&f34->fn->dev, "%s: Failed to write partition ID\n", @@ -308,7 +308,7 @@ static int rmi_f34v7_read_partition_table(struct f34_data *f34) return ret; ret = rmi_write_block(f34->fn->rmi_dev, - base + f34->v7.off.block_number, + base + V7_BLOCK_NUMBER_OFFSET, &block_number, sizeof(block_number)); if (ret < 0) { dev_err(&f34->fn->dev, "%s: Failed to write 
block number\n", @@ -319,7 +319,7 @@ static int rmi_f34v7_read_partition_table(struct f34_data *f34) put_unaligned_le16(f34->v7.flash_config_length, &length); ret = rmi_write_block(f34->fn->rmi_dev, - base + f34->v7.off.transfer_length, + base + V7_TRANSFER_LENGTH_OFFSET, &length, sizeof(length)); if (ret < 0) { dev_err(&f34->fn->dev, "%s: Failed to write transfer length\n", @@ -352,7 +352,7 @@ static int rmi_f34v7_read_partition_table(struct f34_data *f34) } ret = rmi_read_block(f34->fn->rmi_dev, - base + f34->v7.off.payload, + base + V7_PAYLOAD_OFFSET, f34->v7.read_config_buf, f34->v7.partition_table_bytes); if (ret < 0) { @@ -526,13 +526,6 @@ static int rmi_f34v7_read_queries(struct f34_data *f34) rmi_dbg(RMI_DEBUG_FN, &f34->fn->dev, "%s: f34->v7.block_size = %d\n", __func__, f34->v7.block_size); - f34->v7.off.flash_status = V7_FLASH_STATUS_OFFSET; - f34->v7.off.partition_id = V7_PARTITION_ID_OFFSET; - f34->v7.off.block_number = V7_BLOCK_NUMBER_OFFSET; - f34->v7.off.transfer_length = V7_TRANSFER_LENGTH_OFFSET; - f34->v7.off.flash_cmd = V7_COMMAND_OFFSET; - f34->v7.off.payload = V7_PAYLOAD_OFFSET; - f34->v7.has_display_cfg = query_1_7.partition_support[1] & HAS_DISP_CFG; f34->v7.has_guest_code = query_1_7.partition_support[1] & HAS_GUEST_CODE; @@ -646,7 +639,7 @@ static int rmi_f34v7_read_blocks(struct f34_data *f34, return ret; ret = rmi_write_block(f34->fn->rmi_dev, - base + f34->v7.off.block_number, + base + V7_BLOCK_NUMBER_OFFSET, &block_number, sizeof(block_number)); if (ret < 0) { dev_err(&f34->fn->dev, "%s: Failed to write block number\n", @@ -662,7 +655,7 @@ static int rmi_f34v7_read_blocks(struct f34_data *f34, put_unaligned_le16(transfer, &length); ret = rmi_write_block(f34->fn->rmi_dev, - base + f34->v7.off.transfer_length, + base + V7_TRANSFER_LENGTH_OFFSET, &length, sizeof(length)); if (ret < 0) { dev_err(&f34->fn->dev, @@ -682,7 +675,7 @@ static int rmi_f34v7_read_blocks(struct f34_data *f34, return ret; ret = rmi_read_block(f34->fn->rmi_dev, - base 
+ f34->v7.off.payload, + base + V7_PAYLOAD_OFFSET, &f34->v7.read_config_buf[index], transfer * f34->v7.block_size); if (ret < 0) { @@ -718,7 +711,7 @@ static int rmi_f34v7_write_f34v7_blocks(struct f34_data *f34, return ret; ret = rmi_write_block(f34->fn->rmi_dev, - base + f34->v7.off.block_number, + base + V7_BLOCK_NUMBER_OFFSET, &block_number, sizeof(block_number)); if (ret < 0) { dev_err(&f34->fn->dev, "%s: Failed to write block number\n", @@ -738,7 +731,7 @@ static int rmi_f34v7_write_f34v7_blocks(struct f34_data *f34, init_completion(&f34->v7.cmd_done); ret = rmi_write_block(f34->fn->rmi_dev, - base + f34->v7.off.transfer_length, + base + V7_TRANSFER_LENGTH_OFFSET, &length, sizeof(length)); if (ret < 0) { dev_err(&f34->fn->dev, @@ -752,7 +745,7 @@ static int rmi_f34v7_write_f34v7_blocks(struct f34_data *f34, return ret; ret = rmi_write_block(f34->fn->rmi_dev, - base + f34->v7.off.payload, + base + V7_PAYLOAD_OFFSET, block_ptr, transfer * f34->v7.block_size); if (ret < 0) { dev_err(&f34->fn->dev, -- GitLab From 7d128a8d4107084a1df548d9304b9e49153808b8 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Wed, 8 Jun 2022 14:48:07 +0200 Subject: [PATCH 0452/2223] Input: synaptics-rmi4 - simplify rmi_f34v7_start_reflash() rmi_f34v7_enter_flash_prog() already enables IRQs and checks the flash status - there's no need for rmi_f34v7_start_reflash() to do the same just before calling rmi_f34v7_enter_flash_prog(). 
Signed-off-by: Matthias Schiffer Reviewed-by: Lyude Paul Link: https://lore.kernel.org/r/20220608124808.51402-9-matthias.schiffer@ew.tq-group.com Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/rmi_f34v7.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/drivers/input/rmi4/rmi_f34v7.c b/drivers/input/rmi4/rmi_f34v7.c index 5c22ad4bcc746..f16c67eb6cc62 100644 --- a/drivers/input/rmi4/rmi_f34v7.c +++ b/drivers/input/rmi4/rmi_f34v7.c @@ -1107,8 +1107,11 @@ static int rmi_f34v7_enter_flash_prog(struct f34_data *f34) if (ret < 0) return ret; - if (f34->v7.in_bl_mode) + if (f34->v7.in_bl_mode) { + dev_info(&f34->fn->dev, "%s: Device in bootloader mode\n", + __func__); return 0; + } init_completion(&f34->v7.cmd_done); @@ -1127,32 +1130,16 @@ int rmi_f34v7_start_reflash(struct f34_data *f34, const struct firmware *fw) { int ret = 0; - f34->fn->rmi_dev->driver->set_irq_bits(f34->fn->rmi_dev, f34->fn->irq_mask); - f34->v7.config_area = v7_UI_CONFIG_AREA; f34->v7.image = fw->data; ret = rmi_f34v7_parse_image_info(f34); if (ret < 0) - goto exit; + return ret; dev_info(&f34->fn->dev, "Firmware image OK\n"); - ret = rmi_f34v7_read_flash_status(f34); - if (ret < 0) - goto exit; - - if (f34->v7.in_bl_mode) { - dev_info(&f34->fn->dev, "%s: Device in bootloader mode\n", - __func__); - } - - rmi_f34v7_enter_flash_prog(f34); - - return 0; - -exit: - return ret; + return rmi_f34v7_enter_flash_prog(f34); } int rmi_f34v7_probe(struct f34_data *f34) -- GitLab From 87d3d1b1403ba079cf9b1541a247156863af07f0 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Wed, 8 Jun 2022 14:48:08 +0200 Subject: [PATCH 0453/2223] Input: synaptics-rmi4 - drop useless gotos in rmi_f34v7_do_reflash() Returning directly makes the code clearer. 
Signed-off-by: Matthias Schiffer Reviewed-by: Lyude Paul Link: https://lore.kernel.org/r/20220608124808.51402-10-matthias.schiffer@ew.tq-group.com Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/rmi_f34v7.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/input/rmi4/rmi_f34v7.c b/drivers/input/rmi4/rmi_f34v7.c index f16c67eb6cc62..886557b01ebab 100644 --- a/drivers/input/rmi4/rmi_f34v7.c +++ b/drivers/input/rmi4/rmi_f34v7.c @@ -1039,19 +1039,19 @@ int rmi_f34v7_do_reflash(struct f34_data *f34, const struct firmware *fw) ret = rmi_f34v7_parse_image_info(f34); if (ret < 0) - goto fail; + return ret; ret = rmi_f34v7_check_bl_config_size(f34); if (ret < 0) - goto fail; + return ret; ret = rmi_f34v7_erase_all(f34); if (ret < 0) - goto fail; + return ret; ret = rmi_f34v7_write_partition_table(f34); if (ret < 0) - goto fail; + return ret; dev_info(&f34->fn->dev, "%s: Partition table programmed\n", __func__); /* @@ -1067,7 +1067,7 @@ int rmi_f34v7_do_reflash(struct f34_data *f34, const struct firmware *fw) ret = rmi_f34v7_write_firmware(f34); if (ret < 0) - goto fail; + return ret; dev_info(&f34->fn->dev, "Writing config (%d bytes)...\n", f34->v7.img.ui_config.size); @@ -1075,14 +1075,14 @@ int rmi_f34v7_do_reflash(struct f34_data *f34, const struct firmware *fw) f34->v7.config_area = v7_UI_CONFIG_AREA; ret = rmi_f34v7_write_ui_config(f34); if (ret < 0) - goto fail; + return ret; if (f34->v7.has_display_cfg && f34->v7.img.contains_display_cfg) { dev_info(&f34->fn->dev, "Writing display config...\n"); ret = rmi_f34v7_write_dp_config(f34); if (ret < 0) - goto fail; + return ret; } if (f34->v7.has_guest_code && f34->v7.img.contains_guest_code) { @@ -1090,11 +1090,10 @@ int rmi_f34v7_do_reflash(struct f34_data *f34, const struct firmware *fw) ret = rmi_f34v7_write_guest_code(f34); if (ret < 0) - goto fail; + return ret; } -fail: - return ret; + return 0; } static int rmi_f34v7_enter_flash_prog(struct f34_data *f34) -- GitLab 
From 22873deac9e7b273bbf17eee515c8170510d861a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 06:59:59 +0200 Subject: [PATCH 0454/2223] vfs: add vfs_tmpfile_open() helper This helper unifies tmpfile creation with opening. Existing vfs_tmpfile() callers outside of fs/namei.c will be converted to using this helper. There are two such callers: cachefiles and overlayfs. The cachefiles code currently uses the open_with_fake_path() helper to open the tmpfile, presumably to disable accounting of the open file. Overlayfs uses tmpfile for copy_up, which means these struct file instances will be short lived, hence it doesn't really matter if they are accounted or not. Disable accounting in this helper too, which should be okay for both callers. Add MAY_OPEN permission checking for consistency. Like for create(2), read/write permissions are not checked. Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- fs/namei.c | 41 +++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 4 ++++ 2 files changed, 45 insertions(+) diff --git a/fs/namei.c b/fs/namei.c index 53b4bc094db23..81c388a813d35 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3624,6 +3624,47 @@ out_err: } EXPORT_SYMBOL(vfs_tmpfile); +/** + * vfs_tmpfile_open - open a tmpfile for kernel internal use + * @mnt_userns: user namespace of the mount the inode was found from + * @parentpath: path of the base directory + * @mode: mode of the new tmpfile + * @open_flag: flags + * @cred: credentials for open + * + * Create and open a temporary file. The file is not accounted in nr_files, + * hence this is only for kernel internal use, and must not be installed into + * file tables or such.
+ */ +struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, + const struct path *parentpath, + umode_t mode, int open_flag, const struct cred *cred) +{ + struct file *file; + int error; + struct path path = { .mnt = parentpath->mnt }; + + path.dentry = vfs_tmpfile(mnt_userns, parentpath->dentry, mode, open_flag); + if (IS_ERR(path.dentry)) + return ERR_CAST(path.dentry); + + error = may_open(mnt_userns, &path, 0, open_flag); + file = ERR_PTR(error); + if (error) + goto out_dput; + + /* + * This relies on the "noaccount" property of fake open, otherwise + * equivalent to dentry_open(). + */ + file = open_with_fake_path(&path, open_flag, d_inode(path.dentry), cred); +out_dput: + dput(path.dentry); + + return file; +} +EXPORT_SYMBOL(vfs_tmpfile_open); + static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eced4cc286ee..15fafda95dd3c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2007,6 +2007,10 @@ static inline int vfs_whiteout(struct user_namespace *mnt_userns, struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode, int open_flag); +struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, + const struct path *parentpath, + umode_t mode, int open_flag, const struct cred *cred); + int vfs_mkobj(struct dentry *, umode_t, int (*f)(struct dentry *, umode_t, void *), void *); -- GitLab From 19ee5345f23423d5bbd84a59112433592d584b4c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 24 Sep 2022 06:59:59 +0200 Subject: [PATCH 0455/2223] hugetlbfs: cleanup mknod and tmpfile Duplicate the few lines that are shared between hugetlbfs_mknod() and hugetlbfs_tmpfile(). This is a prerequisite for sanely changing the signature of ->tmpfile(). 
Signed-off-by: Al Viro Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- fs/hugetlbfs/inode.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f7a5b5124d8a9..0b458beb318cc 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -885,33 +885,18 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, /* * File creation. Allocate an inode, and we're done.. */ -static int do_hugetlbfs_mknod(struct inode *dir, - struct dentry *dentry, - umode_t mode, - dev_t dev, - bool tmpfile) +static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; - int error = -ENOSPC; inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); - if (inode) { - dir->i_ctime = dir->i_mtime = current_time(dir); - if (tmpfile) { - d_tmpfile(dentry, inode); - } else { - d_instantiate(dentry, inode); - dget(dentry);/* Extra count - pin the dentry in core */ - } - error = 0; - } - return error; -} - -static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t dev) -{ - return do_hugetlbfs_mknod(dir, dentry, mode, dev, false); + if (!inode) + return -ENOSPC; + dir->i_ctime = dir->i_mtime = current_time(dir); + d_instantiate(dentry, inode); + dget(dentry);/* Extra count - pin the dentry in core */ + return 0; } static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, @@ -935,7 +920,14 @@ static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { - return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true); + struct inode *inode; + + inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0); + if (!inode) + return -ENOSPC; + dir->i_ctime = dir->i_mtime = current_time(dir); + d_tmpfile(dentry, inode); 
+ return 0; } static int hugetlbfs_symlink(struct user_namespace *mnt_userns, -- GitLab From 38017d44441efa695997d4f09d09d9d312f40088 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 06:59:59 +0200 Subject: [PATCH 0456/2223] cachefiles: tmpfile error handling cleanup Separate the error labels from the success path and use 'ret' to store the error value before jumping to the error label. Signed-off-by: Miklos Szeredi --- fs/cachefiles/namei.c | 55 ++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index facf2ebe464b3..d3a5884fe5c9f 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -460,31 +460,27 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) path.mnt = cache->mnt; ret = cachefiles_inject_write_error(); - if (ret == 0) + if (ret == 0) { path.dentry = vfs_tmpfile(&init_user_ns, fan, S_IFREG, O_RDWR); - else - path.dentry = ERR_PTR(ret); - if (IS_ERR(path.dentry)) { - trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(path.dentry), + ret = PTR_ERR_OR_ZERO(path.dentry); + } + if (ret) { + trace_cachefiles_vfs_error(object, d_inode(fan), ret, cachefiles_trace_tmpfile_error); - if (PTR_ERR(path.dentry) == -EIO) + if (ret == -EIO) cachefiles_io_error_obj(object, "Failed to create tmpfile"); - file = ERR_CAST(path.dentry); - goto out; + goto err; } trace_cachefiles_tmpfile(object, d_backing_inode(path.dentry)); - if (!cachefiles_mark_inode_in_use(object, path.dentry)) { - file = ERR_PTR(-EBUSY); - goto out_dput; - } + ret = -EBUSY; + if (!cachefiles_mark_inode_in_use(object, path.dentry)) + goto err_dput; ret = cachefiles_ondemand_init_object(object); - if (ret < 0) { - file = ERR_PTR(ret); - goto out_unuse; - } + if (ret < 0) + goto err_unuse; ni_size = object->cookie->object_size; ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE); @@ -499,36 +495,37 @@ struct file 
*cachefiles_create_tmpfile(struct cachefiles_object *object) trace_cachefiles_vfs_error( object, d_backing_inode(path.dentry), ret, cachefiles_trace_trunc_error); - file = ERR_PTR(ret); - goto out_unuse; + goto err_unuse; } } file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, d_backing_inode(path.dentry), cache->cache_cred); + ret = PTR_ERR(file); if (IS_ERR(file)) { trace_cachefiles_vfs_error(object, d_backing_inode(path.dentry), - PTR_ERR(file), - cachefiles_trace_open_error); - goto out_unuse; + ret, cachefiles_trace_open_error); + goto err_unuse; } + ret = -EINVAL; if (unlikely(!file->f_op->read_iter) || unlikely(!file->f_op->write_iter)) { fput(file); pr_notice("Cache does not support read_iter and write_iter\n"); - file = ERR_PTR(-EINVAL); - goto out_unuse; + goto err_unuse; } - - goto out_dput; - -out_unuse: - cachefiles_do_unmark_inode_in_use(object, path.dentry); -out_dput: dput(path.dentry); out: cachefiles_end_secure(cache, saved_cred); return file; + +err_unuse: + cachefiles_do_unmark_inode_in_use(object, path.dentry); +err_dput: + dput(path.dentry); +err: + file = ERR_PTR(ret); + goto out; } /* -- GitLab From 08d7a6fb7e44ae0f54f7903888dc41e31dfbc9da Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 06:59:59 +0200 Subject: [PATCH 0457/2223] cachefiles: only pass inode to *mark_inode_inuse() helpers The only reason to pass dentry was because of a pr_notice() text. Move that to the two callers where it makes sense and add a WARN_ON() to the third. file_inode(file) is never NULL on an opened file. Remove check in cachefiles_unmark_inode_in_use(). Do not open code cachefiles_do_unmark_inode_in_use() in cachefiles_put_directory(). 
Signed-off-by: Miklos Szeredi --- fs/cachefiles/namei.c | 59 +++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 33 deletions(-) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index d3a5884fe5c9f..1bf816181fbb8 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -15,9 +15,8 @@ * file or directory. The caller must hold the inode lock. */ static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object, - struct dentry *dentry) + struct inode *inode) { - struct inode *inode = d_backing_inode(dentry); bool can_use = false; if (!(inode->i_flags & S_KERNEL_FILE)) { @@ -26,21 +25,18 @@ static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object, can_use = true; } else { trace_cachefiles_mark_failed(object, inode); - pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", - dentry, inode->i_ino); } return can_use; } static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object, - struct dentry *dentry) + struct inode *inode) { - struct inode *inode = d_backing_inode(dentry); bool can_use; inode_lock(inode); - can_use = __cachefiles_mark_inode_in_use(object, dentry); + can_use = __cachefiles_mark_inode_in_use(object, inode); inode_unlock(inode); return can_use; } @@ -49,21 +45,17 @@ static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object, * Unmark a backing inode. The caller must hold the inode lock. 
*/ static void __cachefiles_unmark_inode_in_use(struct cachefiles_object *object, - struct dentry *dentry) + struct inode *inode) { - struct inode *inode = d_backing_inode(dentry); - inode->i_flags &= ~S_KERNEL_FILE; trace_cachefiles_mark_inactive(object, inode); } static void cachefiles_do_unmark_inode_in_use(struct cachefiles_object *object, - struct dentry *dentry) + struct inode *inode) { - struct inode *inode = d_backing_inode(dentry); - inode_lock(inode); - __cachefiles_unmark_inode_in_use(object, dentry); + __cachefiles_unmark_inode_in_use(object, inode); inode_unlock(inode); } @@ -77,14 +69,12 @@ void cachefiles_unmark_inode_in_use(struct cachefiles_object *object, struct cachefiles_cache *cache = object->volume->cache; struct inode *inode = file_inode(file); - if (inode) { - cachefiles_do_unmark_inode_in_use(object, file->f_path.dentry); + cachefiles_do_unmark_inode_in_use(object, inode); - if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { - atomic_long_add(inode->i_blocks, &cache->b_released); - if (atomic_inc_return(&cache->f_released)) - cachefiles_state_changed(cache); - } + if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { + atomic_long_add(inode->i_blocks, &cache->b_released); + if (atomic_inc_return(&cache->f_released)) + cachefiles_state_changed(cache); } } @@ -164,8 +154,11 @@ retry: inode_lock(d_inode(subdir)); inode_unlock(d_inode(dir)); - if (!__cachefiles_mark_inode_in_use(NULL, subdir)) + if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) { + pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + subdir, d_inode(subdir)->i_ino); goto mark_error; + } inode_unlock(d_inode(subdir)); @@ -224,9 +217,7 @@ nomem_d_alloc: void cachefiles_put_directory(struct dentry *dir) { if (dir) { - inode_lock(dir->d_inode); - __cachefiles_unmark_inode_in_use(NULL, dir); - inode_unlock(dir->d_inode); + cachefiles_do_unmark_inode_in_use(NULL, d_inode(dir)); dput(dir); } } @@ -410,7 +401,7 @@ try_again: "Rename failed 
with error %d", ret); } - __cachefiles_unmark_inode_in_use(object, rep); + __cachefiles_unmark_inode_in_use(object, d_inode(rep)); unlock_rename(cache->graveyard, dir); dput(grave); _leave(" = 0"); @@ -474,9 +465,9 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) trace_cachefiles_tmpfile(object, d_backing_inode(path.dentry)); - ret = -EBUSY; - if (!cachefiles_mark_inode_in_use(object, path.dentry)) - goto err_dput; + /* This is a newly created file with no other possible user */ + if (!cachefiles_mark_inode_in_use(object, d_inode(path.dentry))) + WARN_ON(1); ret = cachefiles_ondemand_init_object(object); if (ret < 0) @@ -520,8 +511,7 @@ out: return file; err_unuse: - cachefiles_do_unmark_inode_in_use(object, path.dentry); -err_dput: + cachefiles_do_unmark_inode_in_use(object, d_inode(path.dentry)); dput(path.dentry); err: file = ERR_PTR(ret); @@ -566,8 +556,11 @@ static bool cachefiles_open_file(struct cachefiles_object *object, _enter("%pd", dentry); - if (!cachefiles_mark_inode_in_use(object, dentry)) + if (!cachefiles_mark_inode_in_use(object, d_inode(dentry))) { + pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + dentry, d_inode(dentry)->i_ino); return false; + } /* We need to open a file interface onto a data file now as we can't do * it on demand because writeback called from do_exit() sees @@ -621,7 +614,7 @@ check_failed: error_fput: fput(file); error: - cachefiles_do_unmark_inode_in_use(object, dentry); + cachefiles_do_unmark_inode_in_use(object, d_inode(dentry)); dput(dentry); return false; } -- GitLab From 24a81759b65fa85767739999d91523691c5e2ea5 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 07:00:00 +0200 Subject: [PATCH 0458/2223] cachefiles: use vfs_tmpfile_open() helper Use the vfs_tmpfile_open() helper instead of doing tmpfile creation and opening separately. 
The only minor difference is that previously no permission checking was done, while vfs_tmpfile_open() will call may_open() with zero access mask (i.e. no access is checked). Even if this would make a difference with the caller's caps (I don't see how it could, even in the LSM codepaths), cachefiles raises caps before performing the tmpfile creation, so this extra permission check will not result in any regression. Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- fs/cachefiles/namei.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 1bf816181fbb8..03ca8f2f657ab 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -442,18 +442,19 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) const struct cred *saved_cred; struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash]; struct file *file; - struct path path; + const struct path parentpath = { .mnt = cache->mnt, .dentry = fan }; uint64_t ni_size; long ret; cachefiles_begin_secure(cache, &saved_cred); - path.mnt = cache->mnt; ret = cachefiles_inject_write_error(); if (ret == 0) { - path.dentry = vfs_tmpfile(&init_user_ns, fan, S_IFREG, O_RDWR); - ret = PTR_ERR_OR_ZERO(path.dentry); + file = vfs_tmpfile_open(&init_user_ns, &parentpath, S_IFREG, + O_RDWR | O_LARGEFILE | O_DIRECT, + cache->cache_cred); + ret = PTR_ERR_OR_ZERO(file); } if (ret) { trace_cachefiles_vfs_error(object, d_inode(fan), ret, @@ -463,10 +464,10 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) goto err; } - trace_cachefiles_tmpfile(object, d_backing_inode(path.dentry)); + trace_cachefiles_tmpfile(object, file_inode(file)); /* This is a newly created file with no other possible user */ - if (!cachefiles_mark_inode_in_use(object, d_inode(path.dentry))) + if (!cachefiles_mark_inode_in_use(object, file_inode(file))) WARN_ON(1); ret =
cachefiles_ondemand_init_object(object); @@ -477,27 +478,19 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE); if (ni_size > 0) { - trace_cachefiles_trunc(object, d_backing_inode(path.dentry), 0, ni_size, + trace_cachefiles_trunc(object, file_inode(file), 0, ni_size, cachefiles_trunc_expand_tmpfile); ret = cachefiles_inject_write_error(); if (ret == 0) - ret = vfs_truncate(&path, ni_size); + ret = vfs_truncate(&file->f_path, ni_size); if (ret < 0) { trace_cachefiles_vfs_error( - object, d_backing_inode(path.dentry), ret, + object, file_inode(file), ret, cachefiles_trace_trunc_error); goto err_unuse; } } - file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, - d_backing_inode(path.dentry), cache->cache_cred); - ret = PTR_ERR(file); - if (IS_ERR(file)) { - trace_cachefiles_vfs_error(object, d_backing_inode(path.dentry), - ret, cachefiles_trace_open_error); - goto err_unuse; - } ret = -EINVAL; if (unlikely(!file->f_op->read_iter) || unlikely(!file->f_op->write_iter)) { @@ -505,14 +498,13 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) pr_notice("Cache does not support read_iter and write_iter\n"); goto err_unuse; } - dput(path.dentry); out: cachefiles_end_secure(cache, saved_cred); return file; err_unuse: - cachefiles_do_unmark_inode_in_use(object, d_inode(path.dentry)); - dput(path.dentry); + cachefiles_do_unmark_inode_in_use(object, file_inode(file)); + fput(file); err: file = ERR_PTR(ret); goto out; -- GitLab From 2b1a77461f1602f870d6fe61fc65610bb8c8dd05 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 07:00:00 +0200 Subject: [PATCH 0459/2223] ovl: use vfs_tmpfile_open() helper If tmpfile is used for copy up, then use this helper to create the tmpfile and open it at the same time. This will later allow filesystems such as fuse to do this operation atomically. 
Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- fs/overlayfs/copy_up.c | 108 +++++++++++++++++++++------------------ fs/overlayfs/overlayfs.h | 14 ++--- fs/overlayfs/super.c | 10 ++-- fs/overlayfs/util.c | 2 +- 4 files changed, 72 insertions(+), 62 deletions(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index fdde6c56cc3dc..62a63e9ca57d3 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -193,11 +193,11 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old, return ovl_real_fileattr_set(new, &newfa); } -static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old, - struct path *new, loff_t len) +static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, + struct file *new_file, loff_t len) { + struct path datapath; struct file *old_file; - struct file *new_file; loff_t old_pos = 0; loff_t new_pos = 0; loff_t cloned; @@ -206,23 +206,18 @@ static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old, bool skip_hole = false; int error = 0; - if (len == 0) - return 0; + ovl_path_lowerdata(dentry, &datapath); + if (WARN_ON(datapath.dentry == NULL)) + return -EIO; - old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY); + old_file = ovl_path_open(&datapath, O_LARGEFILE | O_RDONLY); if (IS_ERR(old_file)) return PTR_ERR(old_file); - new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY); - if (IS_ERR(new_file)) { - error = PTR_ERR(new_file); - goto out_fput; - } - /* Try to use clone_file_range to clone up within the same fs */ cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); if (cloned == len) - goto out; + goto out_fput; /* Couldn't clone, so now we try to copy the data */ /* Check if lower fs supports seek operation */ @@ -282,10 +277,8 @@ static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old, len -= bytes; } -out: if (!error && ovl_should_sync(ofs)) error = vfs_fsync(new_file, 0); - fput(new_file); out_fput: fput(old_file); return error; @@ 
-556,30 +549,31 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) return err; } -static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) +static int ovl_copy_up_data(struct ovl_copy_up_ctx *c, const struct path *temp) { struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); - struct inode *inode = d_inode(c->dentry); - struct path upperpath, datapath; + struct file *new_file; int err; - ovl_path_upper(c->dentry, &upperpath); - if (WARN_ON(upperpath.dentry != NULL)) - return -EIO; + if (!S_ISREG(c->stat.mode) || c->metacopy || !c->stat.size) + return 0; - upperpath.dentry = temp; + new_file = ovl_path_open(temp, O_LARGEFILE | O_WRONLY); + if (IS_ERR(new_file)) + return PTR_ERR(new_file); - /* - * Copy up data first and then xattrs. Writing data after - * xattrs will remove security.capability xattr automatically. - */ - if (S_ISREG(c->stat.mode) && !c->metacopy) { - ovl_path_lowerdata(c->dentry, &datapath); - err = ovl_copy_up_data(ofs, &datapath, &upperpath, - c->stat.size); - if (err) - return err; - } + err = ovl_copy_up_file(ofs, c->dentry, new_file, c->stat.size); + fput(new_file); + + return err; +} + +static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) +{ + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); + struct inode *inode = d_inode(c->dentry); + struct path upperpath = { .mnt = ovl_upper_mnt(ofs), .dentry = temp }; + int err; err = ovl_copy_xattr(c->dentry->d_sb, &c->lowerpath, temp); if (err) @@ -662,6 +656,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *inode; struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir); + struct path path = { .mnt = ovl_upper_mnt(ofs) }; struct dentry *temp, *upper; struct ovl_cu_creds cc; int err; @@ -688,7 +683,16 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) if (IS_ERR(temp)) goto unlock; - err = ovl_copy_up_inode(c, temp); + /* + * Copy up data first and then xattrs. 
Writing data after + * xattrs will remove security.capability xattr automatically. + */ + path.dentry = temp; + err = ovl_copy_up_data(c, &path); + if (err) + goto cleanup; + + err = ovl_copy_up_metadata(c, temp); if (err) goto cleanup; @@ -732,6 +736,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *udir = d_inode(c->destdir); struct dentry *temp, *upper; + struct file *tmpfile; struct ovl_cu_creds cc; int err; @@ -739,15 +744,22 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) if (err) return err; - temp = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode); + tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode); ovl_revert_cu_creds(&cc); - if (IS_ERR(temp)) - return PTR_ERR(temp); + if (IS_ERR(tmpfile)) + return PTR_ERR(tmpfile); - err = ovl_copy_up_inode(c, temp); + temp = tmpfile->f_path.dentry; + if (!c->metacopy && c->stat.size) { + err = ovl_copy_up_file(ofs, c->dentry, tmpfile, c->stat.size); + if (err) + return err; + } + + err = ovl_copy_up_metadata(c, temp); if (err) - goto out_dput; + goto out_fput; inode_lock_nested(udir, I_MUTEX_PARENT); @@ -761,16 +773,14 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) inode_unlock(udir); if (err) - goto out_dput; + goto out_fput; if (!c->metacopy) ovl_set_upperdata(d_inode(c->dentry)); - ovl_inode_update(d_inode(c->dentry), temp); + ovl_inode_update(d_inode(c->dentry), dget(temp)); - return 0; - -out_dput: - dput(temp); +out_fput: + fput(tmpfile); return err; } @@ -899,7 +909,7 @@ static ssize_t ovl_getxattr_value(struct path *path, char *name, char **value) static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) { struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); - struct path upperpath, datapath; + struct path upperpath; int err; char *capability = NULL; ssize_t cap_size; @@ -908,10 +918,6 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) if (WARN_ON(upperpath.dentry == NULL)) return -EIO; - 
ovl_path_lowerdata(c->dentry, &datapath); - if (WARN_ON(datapath.dentry == NULL)) - return -EIO; - if (c->stat.size) { err = cap_size = ovl_getxattr_value(&upperpath, XATTR_NAME_CAPS, &capability); @@ -919,7 +925,7 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) goto out; } - err = ovl_copy_up_data(ofs, &datapath, &upperpath, c->stat.size); + err = ovl_copy_up_data(c, &upperpath); if (err) goto out_free; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 87759165d32b6..0f9dbd0e2ff59 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -310,14 +310,16 @@ static inline int ovl_do_whiteout(struct ovl_fs *ofs, return err; } -static inline struct dentry *ovl_do_tmpfile(struct ovl_fs *ofs, - struct dentry *dentry, umode_t mode) +static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs, + struct dentry *dentry, umode_t mode) { - struct dentry *ret = vfs_tmpfile(ovl_upper_mnt_userns(ofs), dentry, mode, 0); - int err = PTR_ERR_OR_ZERO(ret); + struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry }; + struct file *file = vfs_tmpfile_open(ovl_upper_mnt_userns(ofs), &path, mode, + O_LARGEFILE | O_WRONLY, current_cred()); + int err = PTR_ERR_OR_ZERO(file); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); - return ret; + return file; } static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs, @@ -401,7 +403,7 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); void ovl_dir_modified(struct dentry *dentry, bool impurity); u64 ovl_dentry_version_get(struct dentry *dentry); bool ovl_is_whiteout(struct dentry *dentry); -struct file *ovl_path_open(struct path *path, int flags); +struct file *ovl_path_open(const struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); void ovl_copy_up_end(struct dentry *dentry); bool ovl_already_copied_up(struct dentry *dentry, int flags); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 
ec746d447f1bb..7837223689c15 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "overlayfs.h" MODULE_AUTHOR("Miklos Szeredi "); @@ -1356,7 +1357,8 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, struct path *workpath) { struct vfsmount *mnt = ovl_upper_mnt(ofs); - struct dentry *temp, *workdir; + struct dentry *workdir; + struct file *tmpfile; bool rename_whiteout; bool d_type; int fh_type; @@ -1392,10 +1394,10 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, pr_warn("upper fs needs to support d_type.\n"); /* Check if upper/work fs supports O_TMPFILE */ - temp = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0); - ofs->tmpfile = !IS_ERR(temp); + tmpfile = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0); + ofs->tmpfile = !IS_ERR(tmpfile); if (ofs->tmpfile) - dput(temp); + fput(tmpfile); else pr_warn("upper fs does not support tmpfile.\n"); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 87f811c089e4f..968926c0c7abf 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -490,7 +490,7 @@ bool ovl_is_whiteout(struct dentry *dentry) return inode && IS_WHITEOUT(inode); } -struct file *ovl_path_open(struct path *path, int flags) +struct file *ovl_path_open(const struct path *path, int flags) { struct inode *inode = d_inode(path->dentry); struct user_namespace *real_mnt_userns = mnt_user_ns(path->mnt); -- GitLab From 3e9d4c593558ea86f49e10e62373a54c7f5a63e4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 07:00:00 +0200 Subject: [PATCH 0460/2223] vfs: make vfs_tmpfile() static No callers outside of fs/namei.c anymore. 
Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- fs/namei.c | 3 +-- include/linux/fs.h | 3 --- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 81c388a813d35..03ad4e55fb265 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3583,7 +3583,7 @@ static int do_open(struct nameidata *nd, * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply passs init_user_ns. */ -struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns, +static struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode, int open_flag) { struct dentry *child = NULL; @@ -3622,7 +3622,6 @@ out_err: dput(child); return ERR_PTR(error); } -EXPORT_SYMBOL(vfs_tmpfile); /** * vfs_tmpfile_open - open a tmpfile for kernel internal use diff --git a/include/linux/fs.h b/include/linux/fs.h index 15fafda95dd3c..02646542f6bb7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2004,9 +2004,6 @@ static inline int vfs_whiteout(struct user_namespace *mnt_userns, WHITEOUT_DEV); } -struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns, - struct dentry *dentry, umode_t mode, int open_flag); - struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred); -- GitLab From 9751b338656f05a0ce918befd5118fcd970c71c6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 07:00:00 +0200 Subject: [PATCH 0461/2223] vfs: move open right after ->tmpfile() Create a helper finish_open_simple() that opens the file with the original dentry. Handle the error case here as well to simplify callers. Call this helper right after ->tmpfile() is called. Next patch will change the tmpfile API and move this call into tmpfile instances. 
Signed-off-by: Miklos Szeredi --- fs/namei.c | 83 ++++++++++++++++++---------------------------- include/linux/fs.h | 9 +++++ 2 files changed, 42 insertions(+), 50 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 03ad4e55fb265..fea56fe9f3069 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3583,44 +3583,44 @@ static int do_open(struct nameidata *nd, * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply passs init_user_ns. */ -static struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns, - struct dentry *dentry, umode_t mode, int open_flag) +static int vfs_tmpfile(struct user_namespace *mnt_userns, + const struct path *parentpath, + struct file *file, umode_t mode) { - struct dentry *child = NULL; - struct inode *dir = dentry->d_inode; + struct dentry *child; + struct inode *dir = d_inode(parentpath->dentry); struct inode *inode; int error; /* we want directory to be writable */ error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); if (error) - goto out_err; - error = -EOPNOTSUPP; + return error; if (!dir->i_op->tmpfile) - goto out_err; - error = -ENOMEM; - child = d_alloc(dentry, &slash_name); + return -EOPNOTSUPP; + child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) - goto out_err; + return -ENOMEM; + file->f_path.mnt = parentpath->mnt; + file->f_path.dentry = child; mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode); error = dir->i_op->tmpfile(mnt_userns, dir, child, mode); + error = finish_open_simple(file, error); + dput(child); if (error) - goto out_err; - error = -ENOENT; - inode = child->d_inode; - if (unlikely(!inode)) - goto out_err; - if (!(open_flag & O_EXCL)) { + return error; + /* Don't check for other permissions, the inode was just created */ + error = may_open(mnt_userns, &file->f_path, 0, file->f_flags); + if (error) + return error; + inode = file_inode(file); + if (!(file->f_flags & O_EXCL)) { spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; 
spin_unlock(&inode->i_lock); } ima_post_create_tmpfile(mnt_userns, inode); - return child; - -out_err: - dput(child); - return ERR_PTR(error); + return 0; } /** @@ -3641,25 +3641,15 @@ struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, { struct file *file; int error; - struct path path = { .mnt = parentpath->mnt }; - - path.dentry = vfs_tmpfile(mnt_userns, parentpath->dentry, mode, open_flag); - if (IS_ERR(path.dentry)) - return ERR_CAST(path.dentry); - - error = may_open(mnt_userns, &path, 0, open_flag); - file = ERR_PTR(error); - if (error) - goto out_dput; - - /* - * This relies on the "noaccount" property of fake open, otherwise - * equivalent to dentry_open(). - */ - file = open_with_fake_path(&path, open_flag, d_inode(path.dentry), cred); -out_dput: - dput(path.dentry); + file = alloc_empty_file_noaccount(open_flag, cred); + if (!IS_ERR(file)) { + error = vfs_tmpfile(mnt_userns, parentpath, file, mode); + if (error) { + fput(file); + file = ERR_PTR(error); + } + } return file; } EXPORT_SYMBOL(vfs_tmpfile_open); @@ -3669,26 +3659,19 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, struct file *file) { struct user_namespace *mnt_userns; - struct dentry *child; struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); + if (unlikely(error)) return error; error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; mnt_userns = mnt_user_ns(path.mnt); - child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag); - error = PTR_ERR(child); - if (IS_ERR(child)) + error = vfs_tmpfile(mnt_userns, &path, file, op->mode); + if (error) goto out2; - dput(path.dentry); - path.dentry = child; - audit_inode(nd->name, child, 0); - /* Don't check for other permissions, the inode was just created */ - error = may_open(mnt_userns, &path, 0, op->open_flag); - if (!error) - error = vfs_open(&path, file); + audit_inode(nd->name, file->f_path.dentry, 0); out2: mnt_drop_write(path.mnt); out: diff --git 
a/include/linux/fs.h b/include/linux/fs.h index 02646542f6bb7..a3c50869e79bf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2780,6 +2780,15 @@ extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); extern int finish_no_open(struct file *file, struct dentry *dentry); +/* Helper for the simple case when original dentry is used */ +static inline int finish_open_simple(struct file *file, int error) +{ + if (error) + return error; + + return finish_open(file, file->f_path.dentry, NULL); +} + /* fs/dcache.c */ extern void __init vfs_caches_init_early(void); extern void __init vfs_caches_init(void); -- GitLab From 863f144f12add1f4eab80b70561a90857c524a8b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 07:00:00 +0200 Subject: [PATCH 0462/2223] vfs: open inside ->tmpfile() This is in preparation for adding tmpfile support to fuse, which requires that the tmpfile creation and opening are done as a single operation. Replace the 'struct dentry *' argument of i_op->tmpfile with 'struct file *'. Call finish_open_simple() as the last thing in ->tmpfile() instances (may be omitted in the error case). Change d_tmpfile() argument to 'struct file *' as well to make callers more readable. 
Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- Documentation/filesystems/locking.rst | 3 ++- Documentation/filesystems/porting.rst | 10 ++++++++++ Documentation/filesystems/vfs.rst | 6 ++++-- fs/bad_inode.c | 2 +- fs/btrfs/inode.c | 8 ++++---- fs/dcache.c | 4 +++- fs/ext2/namei.c | 6 +++--- fs/ext4/namei.c | 6 +++--- fs/f2fs/namei.c | 13 ++++++++----- fs/hugetlbfs/inode.c | 6 +++--- fs/minix/namei.c | 6 +++--- fs/namei.c | 3 +-- fs/ramfs/inode.c | 6 +++--- fs/ubifs/dir.c | 7 ++++--- fs/udf/namei.c | 6 +++--- fs/xfs/xfs_iops.c | 16 +++++++++------- include/linux/dcache.h | 3 ++- include/linux/fs.h | 2 +- mm/shmem.c | 6 +++--- 19 files changed, 70 insertions(+), 49 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 4bb2627026ec8..8f737e76935ce 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -79,7 +79,8 @@ prototypes:: int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); - int (*tmpfile) (struct inode *, struct dentry *, umode_t); + int (*tmpfile) (struct user_namespace *, struct inode *, + struct file *, umode_t); int (*fileattr_set)(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa); int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index aee9aaf9f3df7..af138241bb4b1 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -922,3 +922,13 @@ is provided - file_open_root_mnt(). In-tree users adjusted. no_llseek is gone; don't set .llseek to that - just leave it NULL instead. Checks for "does that file have llseek(2), or should it fail with ESPIPE" should be done by looking at FMODE_LSEEK in file->f_mode. + +--- + +**mandatory** + +Calling conventions for ->tmpfile() have changed. 
It now takes a struct +file pointer instead of struct dentry pointer. d_tmpfile() is similarly +changed to simplify callers. The passed file is in a non-open state and on +success must be opened before returning (e.g. by calling +finish_open_simple()). diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 6cd6953e175b3..71b0b8114b186 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -439,7 +439,7 @@ As of kernel 2.6.22, the following members are defined: void (*update_time)(struct inode *, struct timespec *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); - int (*tmpfile) (struct user_namespace *, struct inode *, struct dentry *, umode_t); + int (*tmpfile) (struct user_namespace *, struct inode *, struct file *, umode_t); int (*set_acl)(struct user_namespace *, struct inode *, struct posix_acl *, int); int (*fileattr_set)(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa); @@ -589,7 +589,9 @@ otherwise noted. ``tmpfile`` called in the end of O_TMPFILE open(). Optional, equivalent to atomically creating, opening and unlinking a file in given - directory. + directory. On success needs to return with the file already + open; this can be done by calling finish_open_simple() right at + the end. 
``fileattr_get`` called on ioctl(FS_IOC_GETFLAGS) and ioctl(FS_IOC_FSGETXATTR) to diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 12b8fdcc445bb..9d1cde8066cf8 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -147,7 +147,7 @@ static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, } static int bad_inode_tmpfile(struct user_namespace *mnt_userns, - struct inode *inode, struct dentry *dentry, + struct inode *inode, struct file *file, umode_t mode) { return -EIO; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1372210869b14..4163737210850 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10168,7 +10168,7 @@ static int btrfs_permission(struct user_namespace *mnt_userns, } static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; @@ -10176,7 +10176,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, struct inode *inode; struct btrfs_new_inode_args new_inode_args = { .dir = dir, - .dentry = dentry, + .dentry = file->f_path.dentry, .orphan = true, }; unsigned int trans_num_items; @@ -10213,7 +10213,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, set_nlink(inode, 1); if (!ret) { - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); unlock_new_inode(inode); mark_inode_dirty(inode); } @@ -10225,7 +10225,7 @@ out_new_inode_args: out_inode: if (ret) iput(inode); - return ret; + return finish_open_simple(file, ret); } void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) diff --git a/fs/dcache.c b/fs/dcache.c index bb0c4d0038dbd..89dc613891028 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3258,8 +3258,10 @@ void d_genocide(struct dentry *parent) EXPORT_SYMBOL(d_genocide); -void d_tmpfile(struct dentry *dentry, struct inode *inode) +void d_tmpfile(struct file *file, 
struct inode *inode) { + struct dentry *dentry = file->f_path.dentry; + inode_dec_link_count(inode); BUG_ON(dentry->d_name.name != dentry->d_iname || !hlist_unhashed(&dentry->d_u.d_alias) || diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 5fd9a22d2b70c..9125eab85146a 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -120,7 +120,7 @@ static int ext2_create (struct user_namespace * mnt_userns, } static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct inode *inode = ext2_new_inode(dir, mode, NULL); if (IS_ERR(inode)) @@ -128,9 +128,9 @@ static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, ext2_set_file_ops(inode); mark_inode_dirty(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); unlock_new_inode(inode); - return 0; + return finish_open_simple(file, 0); } static int ext2_mknod (struct user_namespace * mnt_userns, struct inode * dir, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 3a31b662f6619..9c3fde633a6ef 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2849,7 +2849,7 @@ retry: } static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { handle_t *handle; struct inode *inode; @@ -2871,7 +2871,7 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); err = ext4_orphan_add(handle, inode); if (err) goto err_unlock_inode; @@ -2882,7 +2882,7 @@ retry: ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; - return err; + return finish_open_simple(file, err); err_unlock_inode: ext4_journal_stop(handle); unlock_new_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index bf00d5057abb8..d5065a5af1f8a 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c 
@@ -845,7 +845,7 @@ out: } static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, bool is_whiteout, + struct file *file, umode_t mode, bool is_whiteout, struct inode **new_inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -892,8 +892,8 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } else { - if (dentry) - d_tmpfile(dentry, inode); + if (file) + d_tmpfile(file, inode); else f2fs_i_links_write(inode, false); } @@ -915,16 +915,19 @@ out: } static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + int err; if (unlikely(f2fs_cp_error(sbi))) return -EIO; if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, false, NULL); + err = __f2fs_tmpfile(mnt_userns, dir, file, mode, false, NULL); + + return finish_open_simple(file, err); } static int f2fs_create_whiteout(struct user_namespace *mnt_userns, diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0b458beb318cc..026daa8fc221d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -917,7 +917,7 @@ static int hugetlbfs_create(struct user_namespace *mnt_userns, } static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns, - struct inode *dir, struct dentry *dentry, + struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; @@ -926,8 +926,8 @@ static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns, if (!inode) return -ENOSPC; dir->i_ctime = dir->i_mtime = current_time(dir); - d_tmpfile(dentry, inode); - return 0; + d_tmpfile(file, inode); + return finish_open_simple(file, 0); } static int hugetlbfs_symlink(struct user_namespace *mnt_userns, diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 937fa5fae2b8e..8afdc408ca4fd 
100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -53,16 +53,16 @@ static int minix_mknod(struct user_namespace *mnt_userns, struct inode *dir, } static int minix_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { int error; struct inode *inode = minix_new_inode(dir, mode, &error); if (inode) { minix_set_inode(inode, 0); mark_inode_dirty(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); } - return error; + return finish_open_simple(file, error); } static int minix_create(struct user_namespace *mnt_userns, struct inode *dir, diff --git a/fs/namei.c b/fs/namei.c index fea56fe9f3069..c4ca2c3e4c4b6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3604,8 +3604,7 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns, file->f_path.mnt = parentpath->mnt; file->f_path.dentry = child; mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode); - error = dir->i_op->tmpfile(mnt_userns, dir, child, mode); - error = finish_open_simple(file, error); + error = dir->i_op->tmpfile(mnt_userns, dir, file, mode); dput(child); if (error) return error; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index bc66d0173e330..b3257e8528200 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -146,15 +146,15 @@ static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, } static int ramfs_tmpfile(struct user_namespace *mnt_userns, - struct inode *dir, struct dentry *dentry, umode_t mode) + struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; inode = ramfs_get_inode(dir->i_sb, dir, mode, 0); if (!inode) return -ENOSPC; - d_tmpfile(dentry, inode); - return 0; + d_tmpfile(file, inode); + return finish_open_simple(file, 0); } static const struct inode_operations ramfs_dir_inode_operations = { diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 86151889548e3..f59acd6a3615e 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -424,8 +424,9 @@ static 
void unlock_2_inodes(struct inode *inode1, struct inode *inode2) } static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { + struct dentry *dentry = file->f_path.dentry; struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, @@ -475,7 +476,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, mutex_lock(&ui->ui_mutex); insert_inode_hash(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); ubifs_assert(c, ui->dirty); instantiated = 1; @@ -489,7 +490,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, ubifs_release_budget(c, &req); - return 0; + return finish_open_simple(file, 0); out_cancel: unlock_2_inodes(dir, inode); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index b3d5f97f16cdb..fb4c30e052453 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -626,7 +626,7 @@ static int udf_create(struct user_namespace *mnt_userns, struct inode *dir, } static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct inode *inode = udf_new_inode(dir, mode); @@ -640,9 +640,9 @@ static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; mark_inode_dirty(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); unlock_new_inode(inode); - return 0; + return finish_open_simple(file, 0); } static int udf_mknod(struct user_namespace *mnt_userns, struct inode *dir, diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 45518b8c613c9..764409c466fde 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -167,7 +167,7 @@ xfs_generic_create( struct dentry *dentry, umode_t mode, dev_t rdev, - bool tmpfile) /* unnamed file */ + struct file *tmpfile) /* 
unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; @@ -234,7 +234,7 @@ xfs_generic_create( * d_tmpfile can immediately set it back to zero. */ set_nlink(inode, 1); - d_tmpfile(dentry, inode); + d_tmpfile(tmpfile, inode); } else d_instantiate(dentry, inode); @@ -261,7 +261,7 @@ xfs_vn_mknod( umode_t mode, dev_t rdev) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, false); + return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, NULL); } STATIC int @@ -272,7 +272,7 @@ xfs_vn_create( umode_t mode, bool flags) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, false); + return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, NULL); } STATIC int @@ -283,7 +283,7 @@ xfs_vn_mkdir( umode_t mode) { return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0, - false); + NULL); } STATIC struct dentry * @@ -1080,10 +1080,12 @@ STATIC int xfs_vn_tmpfile( struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, + struct file *file, umode_t mode) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, true); + int err = xfs_generic_create(mnt_userns, dir, file->f_path.dentry, mode, 0, file); + + return finish_open_simple(file, err); } static const struct inode_operations xfs_inode_operations = { diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 92c78ed02b54d..bde9f8ff88695 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -16,6 +16,7 @@ #include struct path; +struct file; struct vfsmount; /* @@ -250,7 +251,7 @@ extern struct dentry * d_make_root(struct inode *); /* - the ramfs-type tree */ extern void d_genocide(struct dentry *); -extern void d_tmpfile(struct dentry *, struct inode *); +extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); diff --git a/include/linux/fs.h b/include/linux/fs.h index a3c50869e79bf..8218d9964ff82 100644 --- 
a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2168,7 +2168,7 @@ struct inode_operations { struct file *, unsigned open_flag, umode_t create_mode); int (*tmpfile) (struct user_namespace *, struct inode *, - struct dentry *, umode_t); + struct file *, umode_t); int (*set_acl)(struct user_namespace *, struct inode *, struct posix_acl *, int); int (*fileattr_set)(struct user_namespace *mnt_userns, diff --git a/mm/shmem.c b/mm/shmem.c index 42e5888bf84d8..f63c51bc373ef 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2912,7 +2912,7 @@ out_iput: static int shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct inode *inode; int error = -ENOSPC; @@ -2927,9 +2927,9 @@ shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, error = simple_acl_create(dir, inode); if (error) goto out_iput; - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); } - return error; + return finish_open_simple(file, error); out_iput: iput(inode); return error; -- GitLab From 7d37539037c2fca70346fbedc219f655253d5cff Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 07:00:00 +0200 Subject: [PATCH 0463/2223] fuse: implement ->tmpfile() This is basically equivalent to the FUSE_CREATE operation which creates and opens a regular file. Add a new FUSE_TMPFILE operation, otherwise just reuse the protocol and the code for FUSE_CREATE. 
Acked-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 24 +++++++++++++++++++++--- fs/fuse/fuse_i.h | 3 +++ include/uapi/linux/fuse.h | 6 +++++- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b585b04e815e0..bb97a384dc5dd 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -529,7 +529,7 @@ out_err: */ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned int flags, - umode_t mode) + umode_t mode, u32 opcode) { int err; struct inode *inode; @@ -573,7 +573,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; } - args.opcode = FUSE_CREATE; + args.opcode = opcode; args.nodeid = get_node_id(dir); args.in_numargs = 2; args.in_args[0].size = sizeof(inarg); @@ -676,7 +676,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, if (fc->no_create) goto mknod; - err = fuse_create_open(dir, entry, file, flags, mode); + err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE); if (err == -ENOSYS) { fc->no_create = 1; goto mknod; @@ -802,6 +802,23 @@ static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir, return fuse_mknod(&init_user_ns, dir, entry, mode, 0); } +static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct file *file, umode_t mode) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + int err; + + if (fc->no_tmpfile) + return -EOPNOTSUPP; + + err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE); + if (err == -ENOSYS) { + fc->no_tmpfile = 1; + err = -EOPNOTSUPP; + } + return err; +} + static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *entry, umode_t mode) { @@ -1913,6 +1930,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .setattr = fuse_setattr, .create = fuse_create, .atomic_open = fuse_atomic_open, + 
.tmpfile = fuse_tmpfile, .mknod = fuse_mknod, .permission = fuse_permission, .getattr = fuse_getattr, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 488b460e046f4..98a9cf5318731 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -784,6 +784,9 @@ struct fuse_conn { /* Does the filesystem support per inode DAX? */ unsigned int inode_dax:1; + /* Is tmpfile not implemented by fs? */ + unsigned int no_tmpfile:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index d6ccee9618917..76ee8f9e024af 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -194,6 +194,9 @@ * - add FUSE_SECURITY_CTX init flag * - add security context to create, mkdir, symlink, and mknod requests * - add FUSE_HAS_INODE_DAX, FUSE_ATTR_DAX + * + * 7.37 + * - add FUSE_TMPFILE */ #ifndef _LINUX_FUSE_H @@ -229,7 +232,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 36 +#define FUSE_KERNEL_MINOR_VERSION 37 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -537,6 +540,7 @@ enum fuse_opcode { FUSE_SETUPMAPPING = 48, FUSE_REMOVEMAPPING = 49, FUSE_SYNCFS = 50, + FUSE_TMPFILE = 51, /* CUSE specific operations */ CUSE_INIT = 4096, -- GitLab From 41fd1cb6151439b205ac7611883d85ae14250172 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 26 Aug 2022 21:31:40 +0200 Subject: [PATCH 0464/2223] media: mceusb: Use new usb_control_msg_*() routines Automatic kernel fuzzing led to a WARN about invalid pipe direction in the mceusb driver: ------------[ cut here ]------------ usb 6-1: BOGUS control dir, pipe 80000380 doesn't match bRequestType 40 WARNING: CPU: 0 PID: 2465 at drivers/usb/core/urb.c:410 usb_submit_urb+0x1326/0x1820 drivers/usb/core/urb.c:410 Modules linked in: CPU: 0 PID: 2465 Comm: kworker/0:2 Not tainted 5.19.0-rc4-00208-g69cb6c6556ad #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.13.0-1ubuntu1.1 04/01/2014 Workqueue: usb_hub_wq hub_event RIP: 0010:usb_submit_urb+0x1326/0x1820 drivers/usb/core/urb.c:410 Code: 7c 24 40 e8 ac 23 91 fd 48 8b 7c 24 40 e8 b2 70 1b ff 45 89 e8 44 89 f1 4c 89 e2 48 89 c6 48 c7 c7 a0 30 a9 86 e8 48 07 11 02 <0f> 0b e9 1c f0 ff ff e8 7e 23 91 fd 0f b6 1d 63 22 83 05 31 ff 41 RSP: 0018:ffffc900032becf0 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff8881100f3058 RCX: 0000000000000000 RDX: ffffc90004961000 RSI: ffff888114c6d580 RDI: fffff52000657d90 RBP: ffff888105ad90f0 R08: ffffffff812c3638 R09: 0000000000000000 R10: 0000000000000005 R11: ffffed1023504ef1 R12: ffff888105ad9000 R13: 0000000000000040 R14: 0000000080000380 R15: ffff88810ba96500 FS: 0000000000000000(0000) GS:ffff88811a800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffe810bda58 CR3: 000000010b720000 CR4: 0000000000350ef0 Call Trace: usb_start_wait_urb+0x101/0x4c0 drivers/usb/core/message.c:58 usb_internal_control_msg drivers/usb/core/message.c:102 [inline] usb_control_msg+0x31c/0x4a0 drivers/usb/core/message.c:153 mceusb_gen1_init drivers/media/rc/mceusb.c:1431 [inline] mceusb_dev_probe+0x258e/0x33f0 drivers/media/rc/mceusb.c:1807 The reason for the warning is clear enough; the driver sends an unusual read request on endpoint 0 but does not set the USB_DIR_IN bit in the bRequestType field. More importantly, the whole situation can be avoided and the driver simplified by converting it over to the relatively new usb_control_msg_recv() and usb_control_msg_send() routines. That's what this fix does. 
Reported-and-tested-by: Rondreis Link: https://lore.kernel.org/all/CAB7eexLLApHJwZfMQ=X-PtRhw0BgO+5KcSMS05FNUYejJXqtSA@mail.gmail.com/ Signed-off-by: Alan Stern Cc: stable@vger.kernel.org Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/mceusb.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index 0834d5f866fd8..39d2b03e26317 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -1416,42 +1416,37 @@ static void mceusb_gen1_init(struct mceusb_dev *ir) { int ret; struct device *dev = ir->dev; - char *data; - - data = kzalloc(USB_CTRL_MSG_SZ, GFP_KERNEL); - if (!data) { - dev_err(dev, "%s: memory allocation failed!", __func__); - return; - } + char data[USB_CTRL_MSG_SZ]; /* * This is a strange one. Windows issues a set address to the device * on the receive control pipe and expect a certain value pair back */ - ret = usb_control_msg(ir->usbdev, usb_rcvctrlpipe(ir->usbdev, 0), - USB_REQ_SET_ADDRESS, USB_TYPE_VENDOR, 0, 0, - data, USB_CTRL_MSG_SZ, 3000); + ret = usb_control_msg_recv(ir->usbdev, 0, USB_REQ_SET_ADDRESS, + USB_DIR_IN | USB_TYPE_VENDOR, + 0, 0, data, USB_CTRL_MSG_SZ, 3000, + GFP_KERNEL); dev_dbg(dev, "set address - ret = %d", ret); dev_dbg(dev, "set address - data[0] = %d, data[1] = %d", data[0], data[1]); /* set feature: bit rate 38400 bps */ - ret = usb_control_msg(ir->usbdev, usb_sndctrlpipe(ir->usbdev, 0), - USB_REQ_SET_FEATURE, USB_TYPE_VENDOR, - 0xc04e, 0x0000, NULL, 0, 3000); + ret = usb_control_msg_send(ir->usbdev, 0, + USB_REQ_SET_FEATURE, USB_TYPE_VENDOR, + 0xc04e, 0x0000, NULL, 0, 3000, GFP_KERNEL); dev_dbg(dev, "set feature - ret = %d", ret); /* bRequest 4: set char length to 8 bits */ - ret = usb_control_msg(ir->usbdev, usb_sndctrlpipe(ir->usbdev, 0), - 4, USB_TYPE_VENDOR, - 0x0808, 0x0000, NULL, 0, 3000); + ret = usb_control_msg_send(ir->usbdev, 0, + 4, USB_TYPE_VENDOR, + 0x0808, 
0x0000, NULL, 0, 3000, GFP_KERNEL); dev_dbg(dev, "set char length - retB = %d", ret); /* bRequest 2: set handshaking to use DTR/DSR */ - ret = usb_control_msg(ir->usbdev, usb_sndctrlpipe(ir->usbdev, 0), - 2, USB_TYPE_VENDOR, - 0x0000, 0x0100, NULL, 0, 3000); + ret = usb_control_msg_send(ir->usbdev, 0, + 2, USB_TYPE_VENDOR, + 0x0000, 0x0100, NULL, 0, 3000, GFP_KERNEL); dev_dbg(dev, "set handshake - retC = %d", ret); /* device resume */ @@ -1459,8 +1454,6 @@ static void mceusb_gen1_init(struct mceusb_dev *ir) /* get hw/sw revision? */ mce_command_out(ir, GET_REVISION, sizeof(GET_REVISION)); - - kfree(data); } static void mceusb_gen2_init(struct mceusb_dev *ir) -- GitLab From 2dfe2c4f1720b6b0860d36d25107ffa57f0bbc63 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Tue, 30 Aug 2022 10:30:27 +0200 Subject: [PATCH 0465/2223] media: imon: Remove the unneeded result variable Return the value send_packet() directly instead of storing it in another redundant variable. Reported-by: Zeal Robot Signed-off-by: ye xingchen Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/imon.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index 735b925da9984..5edfd8a9e8494 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -684,7 +684,6 @@ static int send_packet(struct imon_context *ictx) */ static int send_associate_24g(struct imon_context *ictx) { - int retval; const unsigned char packet[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20 }; @@ -699,9 +698,8 @@ static int send_associate_24g(struct imon_context *ictx) } memcpy(ictx->usb_tx_buf, packet, sizeof(packet)); - retval = send_packet(ictx); - return retval; + return send_packet(ictx); } /* -- GitLab From 20b794ddce475ed012deb365000527c17b3e93e6 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 2 Sep 2022 12:32:21 +0200 Subject: [PATCH 0466/2223] media: mceusb: set timeout to at least timeout provided By 
rounding down, the actual timeout can be lower than requested. As a result, long spaces just below the requested timeout can be incorrectly reported as timeout and truncated. Fixes: 877f1a7cee3f ("media: rc: mceusb: allow the timeout to be configurable") Cc: stable@vger.kernel.org Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/mceusb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index 39d2b03e26317..c76ba24c1f559 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -1077,7 +1077,7 @@ static int mceusb_set_timeout(struct rc_dev *dev, unsigned int timeout) struct mceusb_dev *ir = dev->priv; unsigned int units; - units = DIV_ROUND_CLOSEST(timeout, MCE_TIME_UNIT); + units = DIV_ROUND_UP(timeout, MCE_TIME_UNIT); cmdbuf[2] = units >> 8; cmdbuf[3] = units; -- GitLab From 596fa6e71896e632832804a9648aa123af4afef9 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Thu, 9 Jun 2022 12:31:13 +0200 Subject: [PATCH 0467/2223] media: rockchip: rkisp1: Set DPCC methods enable bits inside loop The rkisp1_dpcc_config() function loops over the method sets to configure them, but sets the RKISP1_CIF_ISP_DPCC_METHODS_SET_* registers outside of the loop with hand-unrolled code. Move this to the loop to simplify the code.
Signed-off-by: Laurent Pinchart Reviewed-by: Paul Elder Reviewed-by: Dafna Hirschfeld Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/rockchip/rkisp1/rkisp1-params.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c index 9da7dc1bc6909..ee91cb36c44b3 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c @@ -18,6 +18,8 @@ #define RKISP1_ISP_PARAMS_REQ_BUFS_MIN 2 #define RKISP1_ISP_PARAMS_REQ_BUFS_MAX 8 +#define RKISP1_ISP_DPCC_METHODS_SET(n) \ + (RKISP1_CIF_ISP_DPCC_METHODS_SET_1 + 0x4 * (n)) #define RKISP1_ISP_DPCC_LINE_THRESH(n) \ (RKISP1_CIF_ISP_DPCC_LINE_THRESH_1 + 0x14 * (n)) #define RKISP1_ISP_DPCC_LINE_MAD_FAC(n) \ @@ -66,13 +68,9 @@ static void rkisp1_dpcc_config(struct rkisp1_params *params, rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_DPCC_SET_USE, arg->set_use); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_DPCC_METHODS_SET_1, - arg->methods[0].method); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_DPCC_METHODS_SET_2, - arg->methods[1].method); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_DPCC_METHODS_SET_3, - arg->methods[2].method); for (i = 0; i < RKISP1_CIF_ISP_DPCC_METHODS_MAX; i++) { + rkisp1_write(params->rkisp1, RKISP1_ISP_DPCC_METHODS_SET(i), + arg->methods[i].method); rkisp1_write(params->rkisp1, RKISP1_ISP_DPCC_LINE_THRESH(i), arg->methods[i].line_thresh); rkisp1_write(params->rkisp1, RKISP1_ISP_DPCC_LINE_MAD_FAC(i), -- GitLab From 9daa2b843f046de6d7113890838155f02c2e60a5 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Thu, 9 Jun 2022 12:31:13 +0200 Subject: [PATCH 0468/2223] media: rockchip: rkisp1: Mask invalid bits in DPCC parameters Restrict the DPCC configuration that can be set by userspace to valid register bits. 
To do so, reorganize the related register macros to define valid bitmasks, as well as bits of the DPCC mode register. Signed-off-by: Laurent Pinchart Reviewed-by: Paul Elder Reviewed-by: Dafna Hirschfeld Signed-off-by: Mauro Carvalho Chehab --- .../platform/rockchip/rkisp1/rkisp1-params.c | 44 ++++++++++++------- .../platform/rockchip/rkisp1/rkisp1-regs.h | 26 +++++------ 2 files changed, 41 insertions(+), 29 deletions(-) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c index ee91cb36c44b3..8b4eea77af0d7 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c @@ -58,35 +58,47 @@ static void rkisp1_dpcc_config(struct rkisp1_params *params, unsigned int i; u32 mode; - /* avoid to override the old enable value */ + /* + * The enable bit is controlled in rkisp1_isp_isr_other_config() and + * must be preserved. The grayscale mode should be configured + * automatically based on the media bus code on the ISP sink pad, so + * only the STAGE1_ENABLE bit can be set by userspace. 
+ */ mode = rkisp1_read(params->rkisp1, RKISP1_CIF_ISP_DPCC_MODE); - mode &= RKISP1_CIF_ISP_DPCC_ENA; - mode |= arg->mode & ~RKISP1_CIF_ISP_DPCC_ENA; + mode &= RKISP1_CIF_ISP_DPCC_MODE_DPCC_ENABLE; + mode |= arg->mode & RKISP1_CIF_ISP_DPCC_MODE_STAGE1_ENABLE; rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_DPCC_MODE, mode); + rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_DPCC_OUTPUT_MODE, - arg->output_mode); + arg->output_mode & RKISP1_CIF_ISP_DPCC_OUTPUT_MODE_MASK); rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_DPCC_SET_USE, - arg->set_use); + arg->set_use & RKISP1_CIF_ISP_DPCC_SET_USE_MASK); for (i = 0; i < RKISP1_CIF_ISP_DPCC_METHODS_MAX; i++) { rkisp1_write(params->rkisp1, RKISP1_ISP_DPCC_METHODS_SET(i), - arg->methods[i].method); + arg->methods[i].method & + RKISP1_CIF_ISP_DPCC_METHODS_SET_MASK); rkisp1_write(params->rkisp1, RKISP1_ISP_DPCC_LINE_THRESH(i), - arg->methods[i].line_thresh); + arg->methods[i].line_thresh & + RKISP1_CIF_ISP_DPCC_LINE_THRESH_MASK); rkisp1_write(params->rkisp1, RKISP1_ISP_DPCC_LINE_MAD_FAC(i), - arg->methods[i].line_mad_fac); + arg->methods[i].line_mad_fac & + RKISP1_CIF_ISP_DPCC_LINE_MAD_FAC_MASK); rkisp1_write(params->rkisp1, RKISP1_ISP_DPCC_PG_FAC(i), - arg->methods[i].pg_fac); + arg->methods[i].pg_fac & + RKISP1_CIF_ISP_DPCC_PG_FAC_MASK); rkisp1_write(params->rkisp1, RKISP1_ISP_DPCC_RND_THRESH(i), - arg->methods[i].rnd_thresh); + arg->methods[i].rnd_thresh & + RKISP1_CIF_ISP_DPCC_RND_THRESH_MASK); rkisp1_write(params->rkisp1, RKISP1_ISP_DPCC_RG_FAC(i), - arg->methods[i].rg_fac); + arg->methods[i].rg_fac & + RKISP1_CIF_ISP_DPCC_RG_FAC_MASK); } rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_DPCC_RND_OFFS, - arg->rnd_offs); + arg->rnd_offs & RKISP1_CIF_ISP_DPCC_RND_OFFS_MASK); rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_DPCC_RO_LIMITS, - arg->ro_limits); + arg->ro_limits & RKISP1_CIF_ISP_DPCC_RO_LIMIT_MASK); } /* ISP black level subtraction interface function */ @@ -1214,11 +1226,11 @@ rkisp1_isp_isr_other_config(struct rkisp1_params *params, 
if (module_ens & RKISP1_CIF_ISP_MODULE_DPCC) rkisp1_param_set_bits(params, RKISP1_CIF_ISP_DPCC_MODE, - RKISP1_CIF_ISP_DPCC_ENA); + RKISP1_CIF_ISP_DPCC_MODE_DPCC_ENABLE); else rkisp1_param_clear_bits(params, RKISP1_CIF_ISP_DPCC_MODE, - RKISP1_CIF_ISP_DPCC_ENA); + RKISP1_CIF_ISP_DPCC_MODE_DPCC_ENABLE); } /* update bls config */ @@ -1580,7 +1592,7 @@ void rkisp1_params_configure(struct rkisp1_params *params, void rkisp1_params_disable(struct rkisp1_params *params) { rkisp1_param_clear_bits(params, RKISP1_CIF_ISP_DPCC_MODE, - RKISP1_CIF_ISP_DPCC_ENA); + RKISP1_CIF_ISP_DPCC_MODE_DPCC_ENABLE); rkisp1_param_clear_bits(params, RKISP1_CIF_ISP_LSC_CTRL, RKISP1_CIF_ISP_LSC_CTRL_ENA); rkisp1_param_clear_bits(params, RKISP1_CIF_ISP_BLS_CTRL, diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h b/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h index dd3e6c38be677..dc01f968c19d3 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h @@ -618,19 +618,19 @@ #define RKISP1_CIF_ISP_CTRL_ISP_GAMMA_OUT_ENA_READ(x) (((x) >> 11) & 1) /* DPCC */ -/* ISP_DPCC_MODE */ -#define RKISP1_CIF_ISP_DPCC_ENA BIT(0) -#define RKISP1_CIF_ISP_DPCC_MODE_MAX 0x07 -#define RKISP1_CIF_ISP_DPCC_OUTPUTMODE_MAX 0x0F -#define RKISP1_CIF_ISP_DPCC_SETUSE_MAX 0x0F -#define RKISP1_CIF_ISP_DPCC_METHODS_SET_RESERVED 0xFFFFE000 -#define RKISP1_CIF_ISP_DPCC_LINE_THRESH_RESERVED 0xFFFF0000 -#define RKISP1_CIF_ISP_DPCC_LINE_MAD_FAC_RESERVED 0xFFFFC0C0 -#define RKISP1_CIF_ISP_DPCC_PG_FAC_RESERVED 0xFFFFC0C0 -#define RKISP1_CIF_ISP_DPCC_RND_THRESH_RESERVED 0xFFFF0000 -#define RKISP1_CIF_ISP_DPCC_RG_FAC_RESERVED 0xFFFFC0C0 -#define RKISP1_CIF_ISP_DPCC_RO_LIMIT_RESERVED 0xFFFFF000 -#define RKISP1_CIF_ISP_DPCC_RND_OFFS_RESERVED 0xFFFFF000 +#define RKISP1_CIF_ISP_DPCC_MODE_DPCC_ENABLE BIT(0) +#define RKISP1_CIF_ISP_DPCC_MODE_GRAYSCALE_MODE BIT(1) +#define RKISP1_CIF_ISP_DPCC_MODE_STAGE1_ENABLE BIT(2) +#define 
RKISP1_CIF_ISP_DPCC_OUTPUT_MODE_MASK GENMASK(3, 0) +#define RKISP1_CIF_ISP_DPCC_SET_USE_MASK GENMASK(3, 0) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_MASK 0x00001f1f +#define RKISP1_CIF_ISP_DPCC_LINE_THRESH_MASK 0x0000ffff +#define RKISP1_CIF_ISP_DPCC_LINE_MAD_FAC_MASK 0x00003f3f +#define RKISP1_CIF_ISP_DPCC_PG_FAC_MASK 0x00003f3f +#define RKISP1_CIF_ISP_DPCC_RND_THRESH_MASK 0x0000ffff +#define RKISP1_CIF_ISP_DPCC_RG_FAC_MASK 0x00003f3f +#define RKISP1_CIF_ISP_DPCC_RO_LIMIT_MASK 0x00000fff +#define RKISP1_CIF_ISP_DPCC_RND_OFFS_MASK 0x00000fff /* BLS */ /* ISP_BLS_CTRL */ -- GitLab From 8e2b7442d27ca2a56a116d44d597e77ca21dfed3 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Thu, 9 Jun 2022 12:31:13 +0200 Subject: [PATCH 0469/2223] media: rockchip: rkisp1: Define macros for DPCC configurations in UAPI Extend the UAPI rkisp1-config.h header with macros for all DPCC configuration fields. While at it, clarify or fix issues in the DPCC documentation. Signed-off-by: Laurent Pinchart Reviewed-by: Paul Elder Reviewed-by: Dafna Hirschfeld Signed-off-by: Mauro Carvalho Chehab --- .../platform/rockchip/rkisp1/rkisp1-regs.h | 1 - include/uapi/linux/rkisp1-config.h | 77 +++++++++++++++---- 2 files changed, 61 insertions(+), 17 deletions(-) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h b/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h index dc01f968c19d3..a931f7216e9bd 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h @@ -620,7 +620,6 @@ /* DPCC */ #define RKISP1_CIF_ISP_DPCC_MODE_DPCC_ENABLE BIT(0) #define RKISP1_CIF_ISP_DPCC_MODE_GRAYSCALE_MODE BIT(1) -#define RKISP1_CIF_ISP_DPCC_MODE_STAGE1_ENABLE BIT(2) #define RKISP1_CIF_ISP_DPCC_OUTPUT_MODE_MASK GENMASK(3, 0) #define RKISP1_CIF_ISP_DPCC_SET_USE_MASK GENMASK(3, 0) #define RKISP1_CIF_ISP_DPCC_METHODS_SET_MASK 0x00001f1f diff --git a/include/uapi/linux/rkisp1-config.h b/include/uapi/linux/rkisp1-config.h index 
583ca0d9a79d2..730673ecc63d0 100644 --- a/include/uapi/linux/rkisp1-config.h +++ b/include/uapi/linux/rkisp1-config.h @@ -117,7 +117,46 @@ /* * Defect Pixel Cluster Correction */ -#define RKISP1_CIF_ISP_DPCC_METHODS_MAX 3 +#define RKISP1_CIF_ISP_DPCC_METHODS_MAX 3 + +#define RKISP1_CIF_ISP_DPCC_MODE_STAGE1_ENABLE (1U << 2) + +#define RKISP1_CIF_ISP_DPCC_OUTPUT_MODE_STAGE1_INCL_G_CENTER (1U << 0) +#define RKISP1_CIF_ISP_DPCC_OUTPUT_MODE_STAGE1_INCL_RB_CENTER (1U << 1) +#define RKISP1_CIF_ISP_DPCC_OUTPUT_MODE_STAGE1_G_3X3 (1U << 2) +#define RKISP1_CIF_ISP_DPCC_OUTPUT_MODE_STAGE1_RB_3X3 (1U << 3) + +/* 0-2 for sets 1-3 */ +#define RKISP1_CIF_ISP_DPCC_SET_USE_STAGE1_USE_SET(n) ((n) << 0) +#define RKISP1_CIF_ISP_DPCC_SET_USE_STAGE1_USE_FIX_SET (1U << 3) + +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_PG_GREEN_ENABLE (1U << 0) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_LC_GREEN_ENABLE (1U << 1) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_RO_GREEN_ENABLE (1U << 2) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_RND_GREEN_ENABLE (1U << 3) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_RG_GREEN_ENABLE (1U << 4) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_PG_RED_BLUE_ENABLE (1U << 8) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_LC_RED_BLUE_ENABLE (1U << 9) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_RO_RED_BLUE_ENABLE (1U << 10) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_RND_RED_BLUE_ENABLE (1U << 11) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_RG_RED_BLUE_ENABLE (1U << 12) + +#define RKISP1_CIF_ISP_DPCC_LINE_THRESH_G(v) ((v) << 0) +#define RKISP1_CIF_ISP_DPCC_LINE_THRESH_RB(v) ((v) << 8) +#define RKISP1_CIF_ISP_DPCC_LINE_MAD_FAC_G(v) ((v) << 0) +#define RKISP1_CIF_ISP_DPCC_LINE_MAD_FAC_RB(v) ((v) << 8) +#define RKISP1_CIF_ISP_DPCC_PG_FAC_G(v) ((v) << 0) +#define RKISP1_CIF_ISP_DPCC_PG_FAC_RB(v) ((v) << 8) +#define RKISP1_CIF_ISP_DPCC_RND_THRESH_G(v) ((v) << 0) +#define RKISP1_CIF_ISP_DPCC_RND_THRESH_RB(v) ((v) << 8) +#define RKISP1_CIF_ISP_DPCC_RG_FAC_G(v) ((v) << 0) +#define RKISP1_CIF_ISP_DPCC_RG_FAC_RB(v) 
((v) << 8) + +#define RKISP1_CIF_ISP_DPCC_RO_LIMITS_n_G(n, v) ((v) << ((n) * 4)) +#define RKISP1_CIF_ISP_DPCC_RO_LIMITS_n_RB(n, v) ((v) << ((n) * 4 + 2)) + +#define RKISP1_CIF_ISP_DPCC_RND_OFFS_n_G(n, v) ((v) << ((n) * 4)) +#define RKISP1_CIF_ISP_DPCC_RND_OFFS_n_RB(n, v) ((v) << ((n) * 4 + 2)) /* * Denoising pre filter @@ -249,16 +288,20 @@ struct rkisp1_cif_isp_bls_config { }; /** - * struct rkisp1_cif_isp_dpcc_methods_config - Methods Configuration used by DPCC + * struct rkisp1_cif_isp_dpcc_methods_config - DPCC methods set configuration * - * Methods Configuration used by Defect Pixel Cluster Correction + * This structure stores the configuration of one set of methods for the DPCC + * algorithm. Multiple methods can be selected in each set (independently for + * the Green and Red/Blue components) through the @method field, the result is + * the logical AND of all enabled methods. The remaining fields set thresholds + * and factors for each method. * - * @method: Method enable bits - * @line_thresh: Line threshold - * @line_mad_fac: Line MAD factor - * @pg_fac: Peak gradient factor - * @rnd_thresh: Rank Neighbor Difference threshold - * @rg_fac: Rank gradient factor + * @method: Method enable bits (RKISP1_CIF_ISP_DPCC_METHODS_SET_*) + * @line_thresh: Line threshold (RKISP1_CIF_ISP_DPCC_LINE_THRESH_*) + * @line_mad_fac: Line Mean Absolute Difference factor (RKISP1_CIF_ISP_DPCC_LINE_MAD_FAC_*) + * @pg_fac: Peak gradient factor (RKISP1_CIF_ISP_DPCC_PG_FAC_*) + * @rnd_thresh: Rank Neighbor Difference threshold (RKISP1_CIF_ISP_DPCC_RND_THRESH_*) + * @rg_fac: Rank gradient factor (RKISP1_CIF_ISP_DPCC_RG_FAC_*) */ struct rkisp1_cif_isp_dpcc_methods_config { __u32 method; @@ -272,14 +315,16 @@ struct rkisp1_cif_isp_dpcc_methods_config { /** * struct rkisp1_cif_isp_dpcc_config - Configuration used by DPCC * - * Configuration used by Defect Pixel Cluster Correction + * Configuration used by Defect Pixel Cluster Correction. 
Three sets of methods + * can be configured and selected through the @set_use field. The result is the + * logical OR of all enabled sets. * - * @mode: dpcc output mode - * @output_mode: whether use hard coded methods - * @set_use: stage1 methods set - * @methods: methods config - * @ro_limits: rank order limits - * @rnd_offs: differential rank offsets for rank neighbor difference + * @mode: DPCC mode (RKISP1_CIF_ISP_DPCC_MODE_*) + * @output_mode: Interpolation output mode (RKISP1_CIF_ISP_DPCC_OUTPUT_MODE_*) + * @set_use: Methods sets selection (RKISP1_CIF_ISP_DPCC_SET_USE_*) + * @methods: Methods sets configuration + * @ro_limits: Rank order limits (RKISP1_CIF_ISP_DPCC_RO_LIMITS_*) + * @rnd_offs: Differential rank offsets for rank neighbor difference (RKISP1_CIF_ISP_DPCC_RND_OFFS_*) */ struct rkisp1_cif_isp_dpcc_config { __u32 mode; -- GitLab From 87bfaa1a167be995d6dfacde08adb7e32f06d558 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sat, 13 Aug 2022 00:44:14 +0200 Subject: [PATCH 0470/2223] media: rkisp1: Initialize color space on ISP sink and source pads Initialize the four color space fields on the sink and source video pads of the ISP in the .init_cfg() operation. As the main use case for the ISP is to convert Bayer data to YUV, select a raw color space on the sink pad and a limited range quantization of SYCC on the source pad by default. 
Signed-off-by: Laurent Pinchart Reviewed-by: Dafna Hirschfeld Reviewed-by: Paul Elder Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c index 383a3ec83ca9f..9fcf272734ee6 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c @@ -431,12 +431,17 @@ static int rkisp1_isp_init_config(struct v4l2_subdev *sd, struct v4l2_mbus_framefmt *sink_fmt, *src_fmt; struct v4l2_rect *sink_crop, *src_crop; + /* Video. */ sink_fmt = v4l2_subdev_get_try_format(sd, sd_state, RKISP1_ISP_PAD_SINK_VIDEO); sink_fmt->width = RKISP1_DEFAULT_WIDTH; sink_fmt->height = RKISP1_DEFAULT_HEIGHT; sink_fmt->field = V4L2_FIELD_NONE; sink_fmt->code = RKISP1_DEF_SINK_PAD_FMT; + sink_fmt->colorspace = V4L2_COLORSPACE_RAW; + sink_fmt->xfer_func = V4L2_XFER_FUNC_NONE; + sink_fmt->ycbcr_enc = V4L2_YCBCR_ENC_601; + sink_fmt->quantization = V4L2_QUANTIZATION_FULL_RANGE; sink_crop = v4l2_subdev_get_try_crop(sd, sd_state, RKISP1_ISP_PAD_SINK_VIDEO); @@ -449,11 +454,16 @@ static int rkisp1_isp_init_config(struct v4l2_subdev *sd, RKISP1_ISP_PAD_SOURCE_VIDEO); *src_fmt = *sink_fmt; src_fmt->code = RKISP1_DEF_SRC_PAD_FMT; + src_fmt->colorspace = V4L2_COLORSPACE_SRGB; + src_fmt->xfer_func = V4L2_XFER_FUNC_SRGB; + src_fmt->ycbcr_enc = V4L2_YCBCR_ENC_601; + src_fmt->quantization = V4L2_QUANTIZATION_LIM_RANGE; src_crop = v4l2_subdev_get_try_crop(sd, sd_state, RKISP1_ISP_PAD_SOURCE_VIDEO); *src_crop = *sink_crop; + /* Parameters and statistics. 
*/ sink_fmt = v4l2_subdev_get_try_format(sd, sd_state, RKISP1_ISP_PAD_SINK_PARAMS); src_fmt = v4l2_subdev_get_try_format(sd, sd_state, -- GitLab From 6844cebbf60ac61296a96f1bf57966f98d8d2d6a Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sat, 13 Aug 2022 00:44:14 +0200 Subject: [PATCH 0471/2223] media: rkisp1: Allow setting color space on ISP sink pad The ISP accepts different color spaces on its input: for YUV input, it doesn't set any restrictions, and for Bayer inputs, any color primaries or transfer function can be accepted (YCbCr encoding isn't applicable there, and quantization range can only be full). Allow setting a color space on the ISP sink pad, with the aforementioned restrictions. The settings don't influence hardware yet (only the YUV quantization range will, anything else has no direct effect on the ISP configuration), but can already be set to allow color space information to be coherent across the ISP sink link. Signed-off-by: Laurent Pinchart Reviewed-by: Paul Elder Reviewed-by: Dafna Hirschfeld Signed-off-by: Mauro Carvalho Chehab --- .../platform/rockchip/rkisp1/rkisp1-isp.c | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c index 9fcf272734ee6..d3e13abcddb17 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c @@ -583,6 +583,7 @@ static void rkisp1_isp_set_sink_fmt(struct rkisp1_isp *isp, const struct rkisp1_mbus_info *mbus_info; struct v4l2_mbus_framefmt *sink_fmt; struct v4l2_rect *sink_crop; + bool is_yuv; sink_fmt = rkisp1_isp_get_pad_fmt(isp, sd_state, RKISP1_ISP_PAD_SINK_VIDEO, @@ -603,6 +604,36 @@ static void rkisp1_isp_set_sink_fmt(struct rkisp1_isp *isp, RKISP1_ISP_MIN_HEIGHT, RKISP1_ISP_MAX_HEIGHT); + /* + * Adjust the color space fields. Accept any color primaries and + * transfer function for both YUV and Bayer. 
For YUV any YCbCr encoding + * and quantization range is also accepted. For Bayer formats, the YCbCr + * encoding isn't applicable, and the quantization range can only be + * full. + */ + is_yuv = mbus_info->pixel_enc == V4L2_PIXEL_ENC_YUV; + + sink_fmt->colorspace = format->colorspace ? : + (is_yuv ? V4L2_COLORSPACE_SRGB : + V4L2_COLORSPACE_RAW); + sink_fmt->xfer_func = format->xfer_func ? : + V4L2_MAP_XFER_FUNC_DEFAULT(sink_fmt->colorspace); + if (is_yuv) { + sink_fmt->ycbcr_enc = format->ycbcr_enc ? : + V4L2_MAP_YCBCR_ENC_DEFAULT(sink_fmt->colorspace); + sink_fmt->quantization = format->quantization ? : + V4L2_MAP_QUANTIZATION_DEFAULT(false, sink_fmt->colorspace, + sink_fmt->ycbcr_enc); + } else { + /* + * The YCbCr encoding isn't applicable for non-YUV formats, but + * V4L2 has no "no encoding" value. Hardcode it to Rec. 601, it + * should be ignored by userspace. + */ + sink_fmt->ycbcr_enc = V4L2_YCBCR_ENC_601; + sink_fmt->quantization = V4L2_QUANTIZATION_FULL_RANGE; + } + *format = *sink_fmt; /* Propagate to in crop */ -- GitLab From cb00f3a4421d5c7d7155bd4bded7fb2ff8eec211 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sat, 13 Aug 2022 00:44:14 +0200 Subject: [PATCH 0472/2223] media: rkisp1: Fix source pad format configuration The ISP converts Bayer data to YUV when operating normally, and can also operate in pass-through mode where the input and output formats must match. Converting from YUV to Bayer isn't possible. If such an invalid configuration is attempted, adjust it by copying the sink pad media bus code to the source pad. 
Signed-off-by: Laurent Pinchart Reviewed-by: Dafna Hirschfeld Reviewed-by: Paul Elder Signed-off-by: Mauro Carvalho Chehab --- .../platform/rockchip/rkisp1/rkisp1-isp.c | 40 +++++++++++++++---- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c index d3e13abcddb17..e6359f9b5b26e 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c @@ -482,23 +482,43 @@ static void rkisp1_isp_set_src_fmt(struct rkisp1_isp *isp, struct v4l2_mbus_framefmt *format, unsigned int which) { - const struct rkisp1_mbus_info *mbus_info; + const struct rkisp1_mbus_info *sink_info; + const struct rkisp1_mbus_info *src_info; + struct v4l2_mbus_framefmt *sink_fmt; struct v4l2_mbus_framefmt *src_fmt; const struct v4l2_rect *src_crop; + sink_fmt = rkisp1_isp_get_pad_fmt(isp, sd_state, + RKISP1_ISP_PAD_SINK_VIDEO, which); src_fmt = rkisp1_isp_get_pad_fmt(isp, sd_state, RKISP1_ISP_PAD_SOURCE_VIDEO, which); src_crop = rkisp1_isp_get_pad_crop(isp, sd_state, RKISP1_ISP_PAD_SOURCE_VIDEO, which); + /* + * Media bus code. The ISP can operate in pass-through mode (Bayer in, + * Bayer out or YUV in, YUV out) or process Bayer data to YUV, but + * can't convert from YUV to Bayer. 
+ */ + sink_info = rkisp1_mbus_info_get_by_code(sink_fmt->code); + src_fmt->code = format->code; - mbus_info = rkisp1_mbus_info_get_by_code(src_fmt->code); - if (!mbus_info || !(mbus_info->direction & RKISP1_ISP_SD_SRC)) { + src_info = rkisp1_mbus_info_get_by_code(src_fmt->code); + if (!src_info || !(src_info->direction & RKISP1_ISP_SD_SRC)) { src_fmt->code = RKISP1_DEF_SRC_PAD_FMT; - mbus_info = rkisp1_mbus_info_get_by_code(src_fmt->code); + src_info = rkisp1_mbus_info_get_by_code(src_fmt->code); } - if (which == V4L2_SUBDEV_FORMAT_ACTIVE) - isp->src_fmt = mbus_info; + + if (sink_info->pixel_enc == V4L2_PIXEL_ENC_YUV && + src_info->pixel_enc == V4L2_PIXEL_ENC_BAYER) { + src_fmt->code = sink_fmt->code; + src_info = sink_info; + } + + /* + * The source width and height must be identical to the source crop + * size. + */ src_fmt->width = src_crop->width; src_fmt->height = src_crop->height; @@ -508,14 +528,18 @@ static void rkisp1_isp_set_src_fmt(struct rkisp1_isp *isp, */ if (format->flags & V4L2_MBUS_FRAMEFMT_SET_CSC && format->quantization == V4L2_QUANTIZATION_FULL_RANGE && - mbus_info->pixel_enc == V4L2_PIXEL_ENC_YUV) + src_info->pixel_enc == V4L2_PIXEL_ENC_YUV) src_fmt->quantization = V4L2_QUANTIZATION_FULL_RANGE; - else if (mbus_info->pixel_enc == V4L2_PIXEL_ENC_YUV) + else if (src_info->pixel_enc == V4L2_PIXEL_ENC_YUV) src_fmt->quantization = V4L2_QUANTIZATION_LIM_RANGE; else src_fmt->quantization = V4L2_QUANTIZATION_FULL_RANGE; *format = *src_fmt; + + /* Store the source format info when setting the active format. */ + if (which == V4L2_SUBDEV_FORMAT_ACTIVE) + isp->src_fmt = src_info; } static void rkisp1_isp_set_src_crop(struct rkisp1_isp *isp, -- GitLab From c1ec5efba08079820b0bfc5149891f364934440d Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sat, 13 Aug 2022 00:56:22 +0200 Subject: [PATCH 0473/2223] media: rkisp1: Allow setting all color space fields on ISP source pad The ISP output color space is configured through the ISP source pad. 
At the moment, only the quantization can be set. Extend it to the three other color space fields: - The ycbcr_enc field will be used to configure the RGB to YUV matrix (currently hardcoded to Rec. 601). - The colorspace (which controls the color primaries) and xfer_func fields will not be used to configure the ISP, as the corresponding hardware blocks (the cross-talk RGB to RGB matrix and the tone mapping curve) are programmed directly by applications through ISP parameters. Nonetheless, those two fields should be set by applications to match the ISP configuration, in order to propagate the correct color space down the pipeline up to the capture video nodes. Signed-off-by: Laurent Pinchart Reviewed-by: Paul Elder Reviewed-by: Dafna Hirschfeld Signed-off-by: Mauro Carvalho Chehab --- .../platform/rockchip/rkisp1/rkisp1-isp.c | 55 ++++++++++++++++--- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c index e6359f9b5b26e..1798ef8529503 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c @@ -487,6 +487,7 @@ static void rkisp1_isp_set_src_fmt(struct rkisp1_isp *isp, struct v4l2_mbus_framefmt *sink_fmt; struct v4l2_mbus_framefmt *src_fmt; const struct v4l2_rect *src_crop; + bool set_csc; sink_fmt = rkisp1_isp_get_pad_fmt(isp, sd_state, RKISP1_ISP_PAD_SINK_VIDEO, which); @@ -523,20 +524,60 @@ static void rkisp1_isp_set_src_fmt(struct rkisp1_isp *isp, src_fmt->height = src_crop->height; /* - * The CSC API is used to allow userspace to force full - * quantization on YUV formats. + * Copy the color space for the sink pad. When converting from Bayer to + * YUV, default to a limited quantization range. 
*/ - if (format->flags & V4L2_MBUS_FRAMEFMT_SET_CSC && - format->quantization == V4L2_QUANTIZATION_FULL_RANGE && + src_fmt->colorspace = sink_fmt->colorspace; + src_fmt->xfer_func = sink_fmt->xfer_func; + src_fmt->ycbcr_enc = sink_fmt->ycbcr_enc; + + if (sink_info->pixel_enc == V4L2_PIXEL_ENC_BAYER && src_info->pixel_enc == V4L2_PIXEL_ENC_YUV) - src_fmt->quantization = V4L2_QUANTIZATION_FULL_RANGE; - else if (src_info->pixel_enc == V4L2_PIXEL_ENC_YUV) src_fmt->quantization = V4L2_QUANTIZATION_LIM_RANGE; else - src_fmt->quantization = V4L2_QUANTIZATION_FULL_RANGE; + src_fmt->quantization = sink_fmt->quantization; + + /* + * Allow setting the source color space fields when the SET_CSC flag is + * set and the source format is YUV. If the sink format is YUV, don't + * set the color primaries, transfer function or YCbCr encoding as the + * ISP is bypassed in that case and passes YUV data through without + * modifications. + * + * The color primaries and transfer function are configured through the + * cross-talk matrix and tone curve respectively. Settings for those + * hardware blocks are conveyed through the ISP parameters buffer, as + * they need to combine color space information with other image tuning + * characteristics and can't thus be computed by the kernel based on the + * color space. The source pad colorspace and xfer_func fields are thus + * ignored by the driver, but can be set by userspace to propagate + * accurate color space information down the pipeline. 
+ */ + set_csc = format->flags & V4L2_MBUS_FRAMEFMT_SET_CSC; + + if (set_csc && src_info->pixel_enc == V4L2_PIXEL_ENC_YUV) { + if (sink_info->pixel_enc == V4L2_PIXEL_ENC_BAYER) { + if (format->colorspace != V4L2_COLORSPACE_DEFAULT) + src_fmt->colorspace = format->colorspace; + if (format->xfer_func != V4L2_XFER_FUNC_DEFAULT) + src_fmt->xfer_func = format->xfer_func; + if (format->ycbcr_enc != V4L2_YCBCR_ENC_DEFAULT) + src_fmt->ycbcr_enc = format->ycbcr_enc; + } + + if (format->quantization != V4L2_QUANTIZATION_DEFAULT) + src_fmt->quantization = format->quantization; + } *format = *src_fmt; + /* + * Restore the SET_CSC flag if it was set to indicate support for the + * CSC setting API. + */ + if (set_csc) + format->flags |= V4L2_MBUS_FRAMEFMT_SET_CSC; + /* Store the source format info when setting the active format. */ if (which == V4L2_SUBDEV_FORMAT_ACTIVE) isp->src_fmt = src_info; -- GitLab From 81303962da12a1dbb7f81779c2847d055779c46b Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sat, 13 Aug 2022 00:44:14 +0200 Subject: [PATCH 0474/2223] media: rkisp1: Configure quantization using ISP source pad The rkisp1_config_isp() function uses the format on the sink pad of the ISP to configure quantization at the output of the ISP. This is incorrect, as hinted by the src_frm variable name that stores the format. Fix it by using the source pad. 
Signed-off-by: Laurent Pinchart Reviewed-by: Dafna Hirschfeld Reviewed-by: Paul Elder Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c index 1798ef8529503..51134f642ef97 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c @@ -231,7 +231,7 @@ static int rkisp1_config_isp(struct rkisp1_isp *isp, struct v4l2_mbus_framefmt *src_frm; src_frm = rkisp1_isp_get_pad_fmt(isp, NULL, - RKISP1_ISP_PAD_SINK_VIDEO, + RKISP1_ISP_PAD_SOURCE_VIDEO, V4L2_SUBDEV_FORMAT_ACTIVE); rkisp1_params_configure(&rkisp1->params, sink_fmt->bayer_pat, src_frm->quantization); -- GitLab From 711d91497e203b058cf0a08c0f7d41c04efbde76 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sat, 13 Aug 2022 00:44:14 +0200 Subject: [PATCH 0475/2223] media: rkisp1: Don't pass the quantization to rkisp1_csm_config() The rkisp1_csm_config() function takes a pointer to the rkisp1_params structure which contains the quantization value. There's no need to pass it separately to the function. Drop it from the function parameters. 
Signed-off-by: Laurent Pinchart Reviewed-by: Dafna Hirschfeld Reviewed-by: Paul Elder Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/rockchip/rkisp1/rkisp1-params.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c index 8b4eea77af0d7..163419624370f 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c @@ -1076,7 +1076,7 @@ static void rkisp1_ie_enable(struct rkisp1_params *params, bool en) } } -static void rkisp1_csm_config(struct rkisp1_params *params, bool full_range) +static void rkisp1_csm_config(struct rkisp1_params *params) { static const u16 full_range_coeff[] = { 0x0026, 0x004b, 0x000f, @@ -1090,7 +1090,7 @@ static void rkisp1_csm_config(struct rkisp1_params *params, bool full_range) }; unsigned int i; - if (full_range) { + if (params->quantization == V4L2_QUANTIZATION_FULL_RANGE) { for (i = 0; i < ARRAY_SIZE(full_range_coeff); i++) rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_CC_COEFF_0 + i * 4, @@ -1562,11 +1562,7 @@ static void rkisp1_params_config_parameter(struct rkisp1_params *params) rkisp1_param_set_bits(params, RKISP1_CIF_ISP_HIST_PROP_V10, rkisp1_hst_params_default_config.mode); - /* set the range */ - if (params->quantization == V4L2_QUANTIZATION_FULL_RANGE) - rkisp1_csm_config(params, true); - else - rkisp1_csm_config(params, false); + rkisp1_csm_config(params); spin_lock_irq(&params->config_lock); -- GitLab From f7aa2d234377c8e69bfcc52d7384b784ab513460 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sat, 13 Aug 2022 00:44:14 +0200 Subject: [PATCH 0476/2223] media: rkisp1: Configure CSM based on YCbCr encoding The driver currently only implements the Rec. 601 YCbCr encoding, extend it with support for the other encodings defined by V4L2 (Rec. 709, Rec. 2020 and SMPTE240m).
The coefficients have been calculated by rounding the floating-point values to the nearest Q1.7 fixed-point value, adjusting the rounding to ensure that the sum of each line in the matrix is preserved to avoid overflows. At the hardware level, the RGB to YUV conversion matrix is fully configurable; custom encodings could be supported by extending the ISP parameters if desired. Signed-off-by: Laurent Pinchart Reviewed-by: Dafna Hirschfeld Reviewed-by: Paul Elder Signed-off-by: Mauro Carvalho Chehab --- .../platform/rockchip/rkisp1/rkisp1-common.h | 5 +- .../platform/rockchip/rkisp1/rkisp1-isp.c | 3 +- .../platform/rockchip/rkisp1/rkisp1-params.c | 97 +++++++++++++++---- 3 files changed, 84 insertions(+), 21 deletions(-) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-common.h b/drivers/media/platform/rockchip/rkisp1/rkisp1-common.h index 8056997d5c29a..b704e955cb287 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-common.h +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-common.h @@ -378,6 +378,7 @@ struct rkisp1_params { struct v4l2_format vdev_fmt; enum v4l2_quantization quantization; + enum v4l2_ycbcr_encoding ycbcr_encoding; enum rkisp1_fmt_raw_pat_type raw_type; }; @@ -563,10 +564,12 @@ const struct rkisp1_mbus_info *rkisp1_mbus_info_get_by_code(u32 mbus_code); * @params: pointer to rkisp1_params. * @bayer_pat: the bayer pattern on the isp video sink pad * @quantization: the quantization configured on the isp's src pad + * @ycbcr_encoding: the ycbcr_encoding configured on the isp's src pad */ void rkisp1_params_configure(struct rkisp1_params *params, enum rkisp1_fmt_raw_pat_type bayer_pat, - enum v4l2_quantization quantization); + enum v4l2_quantization quantization, + enum v4l2_ycbcr_encoding ycbcr_encoding); /* rkisp1_params_disable - disable all parameters.
* This function is called by the isp entity upon stream start diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c index 51134f642ef97..f19c0718963fe 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c @@ -234,7 +234,8 @@ static int rkisp1_config_isp(struct rkisp1_isp *isp, RKISP1_ISP_PAD_SOURCE_VIDEO, V4L2_SUBDEV_FORMAT_ACTIVE); rkisp1_params_configure(&rkisp1->params, sink_fmt->bayer_pat, - src_frm->quantization); + src_frm->quantization, + src_frm->ycbcr_enc); } return 0; diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c index 163419624370f..246a6faa1fc1b 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c @@ -1078,37 +1078,94 @@ static void rkisp1_ie_enable(struct rkisp1_params *params, bool en) static void rkisp1_csm_config(struct rkisp1_params *params) { - static const u16 full_range_coeff[] = { - 0x0026, 0x004b, 0x000f, - 0x01ea, 0x01d6, 0x0040, - 0x0040, 0x01ca, 0x01f6 + struct csm_coeffs { + u16 limited[9]; + u16 full[9]; }; - static const u16 limited_range_coeff[] = { - 0x0021, 0x0040, 0x000d, - 0x01ed, 0x01db, 0x0038, - 0x0038, 0x01d1, 0x01f7, + static const struct csm_coeffs rec601_coeffs = { + .limited = { + 0x0021, 0x0042, 0x000d, + 0x01ed, 0x01db, 0x0038, + 0x0038, 0x01d1, 0x01f7, + }, + .full = { + 0x0026, 0x004b, 0x000f, + 0x01ea, 0x01d6, 0x0040, + 0x0040, 0x01ca, 0x01f6, + }, }; + static const struct csm_coeffs rec709_coeffs = { + .limited = { + 0x0018, 0x0050, 0x0008, + 0x01f3, 0x01d5, 0x0038, + 0x0038, 0x01cd, 0x01fb, + }, + .full = { + 0x001b, 0x005c, 0x0009, + 0x01f1, 0x01cf, 0x0040, + 0x0040, 0x01c6, 0x01fa, + }, + }; + static const struct csm_coeffs rec2020_coeffs = { + .limited = { + 0x001d, 0x004c, 0x0007, + 0x01f0, 0x01d8, 0x0038, + 0x0038, 0x01cd, 
0x01fb, + }, + .full = { + 0x0022, 0x0057, 0x0008, + 0x01ee, 0x01d2, 0x0040, + 0x0040, 0x01c5, 0x01fb, + }, + }; + static const struct csm_coeffs smpte240m_coeffs = { + .limited = { + 0x0018, 0x004f, 0x000a, + 0x01f3, 0x01d5, 0x0038, + 0x0038, 0x01ce, 0x01fa, + }, + .full = { + 0x001b, 0x005a, 0x000b, + 0x01f1, 0x01cf, 0x0040, + 0x0040, 0x01c7, 0x01f9, + }, + }; + + const struct csm_coeffs *coeffs; + const u16 *csm; unsigned int i; - if (params->quantization == V4L2_QUANTIZATION_FULL_RANGE) { - for (i = 0; i < ARRAY_SIZE(full_range_coeff); i++) - rkisp1_write(params->rkisp1, - RKISP1_CIF_ISP_CC_COEFF_0 + i * 4, - full_range_coeff[i]); + switch (params->ycbcr_encoding) { + case V4L2_YCBCR_ENC_601: + default: + coeffs = &rec601_coeffs; + break; + case V4L2_YCBCR_ENC_709: + coeffs = &rec709_coeffs; + break; + case V4L2_YCBCR_ENC_BT2020: + coeffs = &rec2020_coeffs; + break; + case V4L2_YCBCR_ENC_SMPTE240M: + coeffs = &smpte240m_coeffs; + break; + } + if (params->quantization == V4L2_QUANTIZATION_FULL_RANGE) { + csm = coeffs->full; rkisp1_param_set_bits(params, RKISP1_CIF_ISP_CTRL, RKISP1_CIF_ISP_CTRL_ISP_CSM_Y_FULL_ENA | RKISP1_CIF_ISP_CTRL_ISP_CSM_C_FULL_ENA); } else { - for (i = 0; i < ARRAY_SIZE(limited_range_coeff); i++) - rkisp1_write(params->rkisp1, - RKISP1_CIF_ISP_CC_COEFF_0 + i * 4, - limited_range_coeff[i]); - + csm = coeffs->limited; rkisp1_param_clear_bits(params, RKISP1_CIF_ISP_CTRL, RKISP1_CIF_ISP_CTRL_ISP_CSM_Y_FULL_ENA | RKISP1_CIF_ISP_CTRL_ISP_CSM_C_FULL_ENA); } + + for (i = 0; i < 9; i++) + rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_CC_COEFF_0 + i * 4, + csm[i]); } /* ISP De-noise Pre-Filter(DPF) function */ @@ -1574,9 +1631,11 @@ static void rkisp1_params_config_parameter(struct rkisp1_params *params) void rkisp1_params_configure(struct rkisp1_params *params, enum rkisp1_fmt_raw_pat_type bayer_pat, - enum v4l2_quantization quantization) + enum v4l2_quantization quantization, + enum v4l2_ycbcr_encoding ycbcr_encoding) { params->quantization = 
quantization; + params->ycbcr_encoding = ycbcr_encoding; params->raw_type = bayer_pat; rkisp1_params_config_parameter(params); } -- GitLab From 83b9296e399367862845d3b19984444fc756bd61 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sat, 13 Aug 2022 00:44:14 +0200 Subject: [PATCH 0477/2223] media: rkisp1: Initialize color space on resizer sink and source pads Initialize the four color space fields on the sink and source video pads of the resizer in the .init_cfg() operation. The resizer can't perform any color space conversion, so set the sink and source color spaces to the same defaults, which match the ISP source video pad default. Signed-off-by: Laurent Pinchart Reviewed-by: Paul Elder Reviewed-by: Dafna Hirschfeld Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/rockchip/rkisp1/rkisp1-resizer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-resizer.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-resizer.c index f4caa8f684aad..a2dc6f60d9cf6 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-resizer.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-resizer.c @@ -411,6 +411,10 @@ static int rkisp1_rsz_init_config(struct v4l2_subdev *sd, sink_fmt->height = RKISP1_DEFAULT_HEIGHT; sink_fmt->field = V4L2_FIELD_NONE; sink_fmt->code = RKISP1_DEF_FMT; + sink_fmt->colorspace = V4L2_COLORSPACE_SRGB; + sink_fmt->xfer_func = V4L2_XFER_FUNC_SRGB; + sink_fmt->ycbcr_enc = V4L2_YCBCR_ENC_601; + sink_fmt->quantization = V4L2_QUANTIZATION_LIM_RANGE; sink_crop = v4l2_subdev_get_try_crop(sd, sd_state, RKISP1_RSZ_PAD_SINK); -- GitLab From faab2929515224f32a3cc080e8a6d44ae6e0d4ec Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sat, 13 Aug 2022 00:44:14 +0200 Subject: [PATCH 0478/2223] media: rkisp1: Allow setting color space on resizer sink pad The resizer doesn't deal with color spaces, so it can accept any color space on its input, and propagates it unchanged to its output. 
When operating with a Bayer input format (in pass-through mode) further restrict the YCbCr encoding and quantization to Rec 601 and full range respectively, as for raw data the former ought to be ignored and the latter is always full range. Signed-off-by: Laurent Pinchart Reviewed-by: Paul Elder Reviewed-by: Dafna Hirschfeld Signed-off-by: Mauro Carvalho Chehab --- .../platform/rockchip/rkisp1/rkisp1-resizer.c | 41 +++++++++++++++++-- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-resizer.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-resizer.c index a2dc6f60d9cf6..f76afd8112b21 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-resizer.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-resizer.c @@ -507,6 +507,7 @@ static void rkisp1_rsz_set_sink_fmt(struct rkisp1_resizer *rsz, const struct rkisp1_mbus_info *mbus_info; struct v4l2_mbus_framefmt *sink_fmt, *src_fmt; struct v4l2_rect *sink_crop; + bool is_yuv; sink_fmt = rkisp1_rsz_get_pad_fmt(rsz, sd_state, RKISP1_RSZ_PAD_SINK, which); @@ -528,9 +529,6 @@ static void rkisp1_rsz_set_sink_fmt(struct rkisp1_resizer *rsz, if (which == V4L2_SUBDEV_FORMAT_ACTIVE) rsz->pixel_enc = mbus_info->pixel_enc; - /* Propagete to source pad */ - src_fmt->code = sink_fmt->code; - sink_fmt->width = clamp_t(u32, format->width, RKISP1_ISP_MIN_WIDTH, RKISP1_ISP_MAX_WIDTH); @@ -538,8 +536,45 @@ static void rkisp1_rsz_set_sink_fmt(struct rkisp1_resizer *rsz, RKISP1_ISP_MIN_HEIGHT, RKISP1_ISP_MAX_HEIGHT); + /* + * Adjust the color space fields. Accept any color primaries and + * transfer function for both YUV and Bayer. For YUV any YCbCr encoding + * and quantization range is also accepted. For Bayer formats, the YCbCr + * encoding isn't applicable, and the quantization range can only be + * full. + */ + is_yuv = mbus_info->pixel_enc == V4L2_PIXEL_ENC_YUV; + + sink_fmt->colorspace = format->colorspace ? : + (is_yuv ? 
V4L2_COLORSPACE_SRGB : + V4L2_COLORSPACE_RAW); + sink_fmt->xfer_func = format->xfer_func ? : + V4L2_MAP_XFER_FUNC_DEFAULT(sink_fmt->colorspace); + if (is_yuv) { + sink_fmt->ycbcr_enc = format->ycbcr_enc ? : + V4L2_MAP_YCBCR_ENC_DEFAULT(sink_fmt->colorspace); + sink_fmt->quantization = format->quantization ? : + V4L2_MAP_QUANTIZATION_DEFAULT(false, sink_fmt->colorspace, + sink_fmt->ycbcr_enc); + } else { + /* + * The YCbCr encoding isn't applicable for non-YUV formats, but + * V4L2 has no "no encoding" value. Hardcode it to Rec. 601, it + * should be ignored by userspace. + */ + sink_fmt->ycbcr_enc = V4L2_YCBCR_ENC_601; + sink_fmt->quantization = V4L2_QUANTIZATION_FULL_RANGE; + } + *format = *sink_fmt; + /* Propagate the media bus code and color space to the source pad. */ + src_fmt->code = sink_fmt->code; + src_fmt->colorspace = sink_fmt->colorspace; + src_fmt->xfer_func = sink_fmt->xfer_func; + src_fmt->ycbcr_enc = sink_fmt->ycbcr_enc; + src_fmt->quantization = sink_fmt->quantization; + /* Update sink crop */ rkisp1_rsz_set_sink_crop(rsz, sd_state, sink_crop, which); } -- GitLab From cf655faff5815986bb4393c303590f6f432644bb Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 17 Aug 2022 01:53:57 +0200 Subject: [PATCH 0479/2223] media: rkisp1: Clean up LSC configuration code Clean up the LSC configuration code to improve its readability by shortening lines, using extra local variables and renaming long variables. No functional change intended. 
Signed-off-by: Laurent Pinchart Reviewed-by: Dafna Hirschfeld Signed-off-by: Mauro Carvalho Chehab --- .../platform/rockchip/rkisp1/rkisp1-params.c | 199 ++++++++---------- 1 file changed, 86 insertions(+), 113 deletions(-) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c index 246a6faa1fc1b..fbbaf55052911 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c @@ -198,149 +198,129 @@ static void rkisp1_lsc_matrix_config_v10(struct rkisp1_params *params, const struct rkisp1_cif_isp_lsc_config *pconfig) { - unsigned int isp_lsc_status, sram_addr, isp_lsc_table_sel, i, j, data; + struct rkisp1_device *rkisp1 = params->rkisp1; + unsigned int lsc_status, sram_addr, lsc_table_sel, i, j; - isp_lsc_status = rkisp1_read(params->rkisp1, RKISP1_CIF_ISP_LSC_STATUS); + lsc_status = rkisp1_read(rkisp1, RKISP1_CIF_ISP_LSC_STATUS); /* RKISP1_CIF_ISP_LSC_TABLE_ADDRESS_153 = ( 17 * 18 ) >> 1 */ - sram_addr = (isp_lsc_status & RKISP1_CIF_ISP_LSC_ACTIVE_TABLE) ? + sram_addr = lsc_status & RKISP1_CIF_ISP_LSC_ACTIVE_TABLE ? 
RKISP1_CIF_ISP_LSC_TABLE_ADDRESS_0 : RKISP1_CIF_ISP_LSC_TABLE_ADDRESS_153; - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_R_TABLE_ADDR, sram_addr); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_GR_TABLE_ADDR, sram_addr); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_GB_TABLE_ADDR, sram_addr); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_B_TABLE_ADDR, sram_addr); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_R_TABLE_ADDR, sram_addr); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_GR_TABLE_ADDR, sram_addr); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_GB_TABLE_ADDR, sram_addr); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_B_TABLE_ADDR, sram_addr); /* program data tables (table size is 9 * 17 = 153) */ for (i = 0; i < RKISP1_CIF_ISP_LSC_SAMPLES_MAX; i++) { + const __u16 *r_tbl = pconfig->r_data_tbl[i]; + const __u16 *gr_tbl = pconfig->gr_data_tbl[i]; + const __u16 *gb_tbl = pconfig->gb_data_tbl[i]; + const __u16 *b_tbl = pconfig->b_data_tbl[i]; + /* * 17 sectors with 2 values in one DWORD = 9 * DWORDs (2nd value of last DWORD unused) */ for (j = 0; j < RKISP1_CIF_ISP_LSC_SAMPLES_MAX - 1; j += 2) { - data = RKISP1_CIF_ISP_LSC_TABLE_DATA_V10(pconfig->r_data_tbl[i][j], - pconfig->r_data_tbl[i][j + 1]); - rkisp1_write(params->rkisp1, - RKISP1_CIF_ISP_LSC_R_TABLE_DATA, data); - - data = RKISP1_CIF_ISP_LSC_TABLE_DATA_V10(pconfig->gr_data_tbl[i][j], - pconfig->gr_data_tbl[i][j + 1]); - rkisp1_write(params->rkisp1, - RKISP1_CIF_ISP_LSC_GR_TABLE_DATA, data); - - data = RKISP1_CIF_ISP_LSC_TABLE_DATA_V10(pconfig->gb_data_tbl[i][j], - pconfig->gb_data_tbl[i][j + 1]); - rkisp1_write(params->rkisp1, - RKISP1_CIF_ISP_LSC_GB_TABLE_DATA, data); - - data = RKISP1_CIF_ISP_LSC_TABLE_DATA_V10(pconfig->b_data_tbl[i][j], - pconfig->b_data_tbl[i][j + 1]); - rkisp1_write(params->rkisp1, - RKISP1_CIF_ISP_LSC_B_TABLE_DATA, data); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_R_TABLE_DATA, + RKISP1_CIF_ISP_LSC_TABLE_DATA_V10( + r_tbl[j], r_tbl[j + 1])); + rkisp1_write(rkisp1, 
RKISP1_CIF_ISP_LSC_GR_TABLE_DATA, + RKISP1_CIF_ISP_LSC_TABLE_DATA_V10( + gr_tbl[j], gr_tbl[j + 1])); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_GB_TABLE_DATA, + RKISP1_CIF_ISP_LSC_TABLE_DATA_V10( + gb_tbl[j], gb_tbl[j + 1])); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_B_TABLE_DATA, + RKISP1_CIF_ISP_LSC_TABLE_DATA_V10( + b_tbl[j], b_tbl[j + 1])); } - data = RKISP1_CIF_ISP_LSC_TABLE_DATA_V10(pconfig->r_data_tbl[i][j], 0); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_R_TABLE_DATA, - data); - data = RKISP1_CIF_ISP_LSC_TABLE_DATA_V10(pconfig->gr_data_tbl[i][j], 0); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_GR_TABLE_DATA, - data); - - data = RKISP1_CIF_ISP_LSC_TABLE_DATA_V10(pconfig->gb_data_tbl[i][j], 0); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_GB_TABLE_DATA, - data); - - data = RKISP1_CIF_ISP_LSC_TABLE_DATA_V10(pconfig->b_data_tbl[i][j], 0); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_B_TABLE_DATA, - data); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_R_TABLE_DATA, + RKISP1_CIF_ISP_LSC_TABLE_DATA_V10(r_tbl[j], 0)); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_GR_TABLE_DATA, + RKISP1_CIF_ISP_LSC_TABLE_DATA_V10(gr_tbl[j], 0)); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_GB_TABLE_DATA, + RKISP1_CIF_ISP_LSC_TABLE_DATA_V10(gb_tbl[j], 0)); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_B_TABLE_DATA, + RKISP1_CIF_ISP_LSC_TABLE_DATA_V10(b_tbl[j], 0)); } - isp_lsc_table_sel = (isp_lsc_status & RKISP1_CIF_ISP_LSC_ACTIVE_TABLE) ? - RKISP1_CIF_ISP_LSC_TABLE_0 : - RKISP1_CIF_ISP_LSC_TABLE_1; - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_TABLE_SEL, - isp_lsc_table_sel); + + lsc_table_sel = lsc_status & RKISP1_CIF_ISP_LSC_ACTIVE_TABLE ? 
+ RKISP1_CIF_ISP_LSC_TABLE_0 : RKISP1_CIF_ISP_LSC_TABLE_1; + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_TABLE_SEL, lsc_table_sel); } static void rkisp1_lsc_matrix_config_v12(struct rkisp1_params *params, const struct rkisp1_cif_isp_lsc_config *pconfig) { - unsigned int isp_lsc_status, sram_addr, isp_lsc_table_sel, i, j, data; + struct rkisp1_device *rkisp1 = params->rkisp1; + unsigned int lsc_status, sram_addr, lsc_table_sel, i, j; - isp_lsc_status = rkisp1_read(params->rkisp1, RKISP1_CIF_ISP_LSC_STATUS); + lsc_status = rkisp1_read(rkisp1, RKISP1_CIF_ISP_LSC_STATUS); /* RKISP1_CIF_ISP_LSC_TABLE_ADDRESS_153 = ( 17 * 18 ) >> 1 */ - sram_addr = (isp_lsc_status & RKISP1_CIF_ISP_LSC_ACTIVE_TABLE) ? - RKISP1_CIF_ISP_LSC_TABLE_ADDRESS_0 : - RKISP1_CIF_ISP_LSC_TABLE_ADDRESS_153; - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_R_TABLE_ADDR, sram_addr); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_GR_TABLE_ADDR, sram_addr); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_GB_TABLE_ADDR, sram_addr); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_B_TABLE_ADDR, sram_addr); + sram_addr = lsc_status & RKISP1_CIF_ISP_LSC_ACTIVE_TABLE ? 
+ RKISP1_CIF_ISP_LSC_TABLE_ADDRESS_0 : + RKISP1_CIF_ISP_LSC_TABLE_ADDRESS_153; + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_R_TABLE_ADDR, sram_addr); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_GR_TABLE_ADDR, sram_addr); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_GB_TABLE_ADDR, sram_addr); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_B_TABLE_ADDR, sram_addr); /* program data tables (table size is 9 * 17 = 153) */ for (i = 0; i < RKISP1_CIF_ISP_LSC_SAMPLES_MAX; i++) { + const __u16 *r_tbl = pconfig->r_data_tbl[i]; + const __u16 *gr_tbl = pconfig->gr_data_tbl[i]; + const __u16 *gb_tbl = pconfig->gb_data_tbl[i]; + const __u16 *b_tbl = pconfig->b_data_tbl[i]; + /* * 17 sectors with 2 values in one DWORD = 9 * DWORDs (2nd value of last DWORD unused) */ for (j = 0; j < RKISP1_CIF_ISP_LSC_SAMPLES_MAX - 1; j += 2) { - data = RKISP1_CIF_ISP_LSC_TABLE_DATA_V12( - pconfig->r_data_tbl[i][j], - pconfig->r_data_tbl[i][j + 1]); - rkisp1_write(params->rkisp1, - RKISP1_CIF_ISP_LSC_R_TABLE_DATA, data); - - data = RKISP1_CIF_ISP_LSC_TABLE_DATA_V12( - pconfig->gr_data_tbl[i][j], - pconfig->gr_data_tbl[i][j + 1]); - rkisp1_write(params->rkisp1, - RKISP1_CIF_ISP_LSC_GR_TABLE_DATA, data); - - data = RKISP1_CIF_ISP_LSC_TABLE_DATA_V12( - pconfig->gb_data_tbl[i][j], - pconfig->gb_data_tbl[i][j + 1]); - rkisp1_write(params->rkisp1, - RKISP1_CIF_ISP_LSC_GB_TABLE_DATA, data); - - data = RKISP1_CIF_ISP_LSC_TABLE_DATA_V12( - pconfig->b_data_tbl[i][j], - pconfig->b_data_tbl[i][j + 1]); - rkisp1_write(params->rkisp1, - RKISP1_CIF_ISP_LSC_B_TABLE_DATA, data); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_R_TABLE_DATA, + RKISP1_CIF_ISP_LSC_TABLE_DATA_V12( + r_tbl[j], r_tbl[j + 1])); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_GR_TABLE_DATA, + RKISP1_CIF_ISP_LSC_TABLE_DATA_V12( + gr_tbl[j], gr_tbl[j + 1])); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_GB_TABLE_DATA, + RKISP1_CIF_ISP_LSC_TABLE_DATA_V12( + gb_tbl[j], gb_tbl[j + 1])); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_B_TABLE_DATA, + 
RKISP1_CIF_ISP_LSC_TABLE_DATA_V12( + b_tbl[j], b_tbl[j + 1])); } - data = RKISP1_CIF_ISP_LSC_TABLE_DATA_V12(pconfig->r_data_tbl[i][j], 0); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_R_TABLE_DATA, - data); - - data = RKISP1_CIF_ISP_LSC_TABLE_DATA_V12(pconfig->gr_data_tbl[i][j], 0); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_GR_TABLE_DATA, - data); - - data = RKISP1_CIF_ISP_LSC_TABLE_DATA_V12(pconfig->gb_data_tbl[i][j], 0); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_GB_TABLE_DATA, - data); - - data = RKISP1_CIF_ISP_LSC_TABLE_DATA_V12(pconfig->b_data_tbl[i][j], 0); - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_B_TABLE_DATA, - data); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_R_TABLE_DATA, + RKISP1_CIF_ISP_LSC_TABLE_DATA_V12(r_tbl[j], 0)); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_GR_TABLE_DATA, + RKISP1_CIF_ISP_LSC_TABLE_DATA_V12(gr_tbl[j], 0)); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_GB_TABLE_DATA, + RKISP1_CIF_ISP_LSC_TABLE_DATA_V12(gb_tbl[j], 0)); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_B_TABLE_DATA, + RKISP1_CIF_ISP_LSC_TABLE_DATA_V12(b_tbl[j], 0)); } - isp_lsc_table_sel = (isp_lsc_status & RKISP1_CIF_ISP_LSC_ACTIVE_TABLE) ? - RKISP1_CIF_ISP_LSC_TABLE_0 : - RKISP1_CIF_ISP_LSC_TABLE_1; - rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_LSC_TABLE_SEL, - isp_lsc_table_sel); + + lsc_table_sel = lsc_status & RKISP1_CIF_ISP_LSC_ACTIVE_TABLE ? 
+ RKISP1_CIF_ISP_LSC_TABLE_0 : RKISP1_CIF_ISP_LSC_TABLE_1; + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_TABLE_SEL, lsc_table_sel); } static void rkisp1_lsc_config(struct rkisp1_params *params, const struct rkisp1_cif_isp_lsc_config *arg) { + struct rkisp1_device *rkisp1 = params->rkisp1; unsigned int i, data; u32 lsc_ctrl; /* To config must be off , store the current status firstly */ - lsc_ctrl = rkisp1_read(params->rkisp1, RKISP1_CIF_ISP_LSC_CTRL); + lsc_ctrl = rkisp1_read(rkisp1, RKISP1_CIF_ISP_LSC_CTRL); rkisp1_param_clear_bits(params, RKISP1_CIF_ISP_LSC_CTRL, RKISP1_CIF_ISP_LSC_CTRL_ENA); params->ops->lsc_matrix_config(params, arg); @@ -349,38 +329,31 @@ static void rkisp1_lsc_config(struct rkisp1_params *params, /* program x size tables */ data = RKISP1_CIF_ISP_LSC_SECT_SIZE(arg->x_size_tbl[i * 2], arg->x_size_tbl[i * 2 + 1]); - rkisp1_write(params->rkisp1, - RKISP1_CIF_ISP_LSC_XSIZE_01 + i * 4, data); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_XSIZE_01 + i * 4, data); /* program x grad tables */ data = RKISP1_CIF_ISP_LSC_SECT_SIZE(arg->x_grad_tbl[i * 2], arg->x_grad_tbl[i * 2 + 1]); - rkisp1_write(params->rkisp1, - RKISP1_CIF_ISP_LSC_XGRAD_01 + i * 4, data); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_XGRAD_01 + i * 4, data); /* program y size tables */ data = RKISP1_CIF_ISP_LSC_SECT_SIZE(arg->y_size_tbl[i * 2], arg->y_size_tbl[i * 2 + 1]); - rkisp1_write(params->rkisp1, - RKISP1_CIF_ISP_LSC_YSIZE_01 + i * 4, data); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_YSIZE_01 + i * 4, data); /* program y grad tables */ data = RKISP1_CIF_ISP_LSC_SECT_SIZE(arg->y_grad_tbl[i * 2], arg->y_grad_tbl[i * 2 + 1]); - rkisp1_write(params->rkisp1, - RKISP1_CIF_ISP_LSC_YGRAD_01 + i * 4, data); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_YGRAD_01 + i * 4, data); } /* restore the lsc ctrl status */ - if (lsc_ctrl & RKISP1_CIF_ISP_LSC_CTRL_ENA) { - rkisp1_param_set_bits(params, - RKISP1_CIF_ISP_LSC_CTRL, + if (lsc_ctrl & RKISP1_CIF_ISP_LSC_CTRL_ENA) + rkisp1_param_set_bits(params, 
RKISP1_CIF_ISP_LSC_CTRL, RKISP1_CIF_ISP_LSC_CTRL_ENA); - } else { - rkisp1_param_clear_bits(params, - RKISP1_CIF_ISP_LSC_CTRL, + else + rkisp1_param_clear_bits(params, RKISP1_CIF_ISP_LSC_CTRL, RKISP1_CIF_ISP_LSC_CTRL_ENA); - } } /* ISP Filtering function */ -- GitLab From da57dffa098c8d8e46cda4c891e106bdc849712d Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 17 Aug 2022 01:53:57 +0200 Subject: [PATCH 0480/2223] media: rkisp1: Store LSC register values in u32 variables Use the u32 type instead of unsigned int to store register values in the LSC configuration code, to make the variables' size more explicit. No functional change intended. Signed-off-by: Laurent Pinchart Reviewed-by: Dafna Hirschfeld Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/rockchip/rkisp1/rkisp1-params.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c index fbbaf55052911..dbe826fd02d29 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c @@ -199,7 +199,8 @@ rkisp1_lsc_matrix_config_v10(struct rkisp1_params *params, const struct rkisp1_cif_isp_lsc_config *pconfig) { struct rkisp1_device *rkisp1 = params->rkisp1; - unsigned int lsc_status, sram_addr, lsc_table_sel, i, j; + u32 lsc_status, sram_addr, lsc_table_sel; + unsigned int i, j; lsc_status = rkisp1_read(rkisp1, RKISP1_CIF_ISP_LSC_STATUS); @@ -258,7 +259,8 @@ rkisp1_lsc_matrix_config_v12(struct rkisp1_params *params, const struct rkisp1_cif_isp_lsc_config *pconfig) { struct rkisp1_device *rkisp1 = params->rkisp1; - unsigned int lsc_status, sram_addr, lsc_table_sel, i, j; + u32 lsc_status, sram_addr, lsc_table_sel; + unsigned int i, j; lsc_status = rkisp1_read(rkisp1, RKISP1_CIF_ISP_LSC_STATUS); @@ -316,8 +318,8 @@ static void rkisp1_lsc_config(struct rkisp1_params *params, const struct 
rkisp1_cif_isp_lsc_config *arg) { struct rkisp1_device *rkisp1 = params->rkisp1; - unsigned int i, data; - u32 lsc_ctrl; + u32 lsc_ctrl, data; + unsigned int i; /* To config must be off , store the current status firstly */ lsc_ctrl = rkisp1_read(rkisp1, RKISP1_CIF_ISP_LSC_CTRL); -- GitLab From 10e36b2191401ffe98373e144580e0a9288e71e9 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 17 Aug 2022 01:53:57 +0200 Subject: [PATCH 0481/2223] media: rkisp1: Simplify LSC x/y size and grad register macros The LSC module x/y size and grad configuration is stored in a set of 4 indexed registers each. The rkisp1-regs.h header defines all those registers, but only the first one in each set is used, with manual calculation of addresses of subsequent registers. Simplify this by merging all 4 register macros into one that takes the index as a parameter. No functional change intended. Signed-off-by: Laurent Pinchart Reviewed-by: Dafna Hirschfeld Signed-off-by: Mauro Carvalho Chehab --- .../platform/rockchip/rkisp1/rkisp1-params.c | 8 ++++---- .../platform/rockchip/rkisp1/rkisp1-regs.h | 20 ++++--------------- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c index dbe826fd02d29..aa6efa4c6e9ef 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c @@ -331,22 +331,22 @@ static void rkisp1_lsc_config(struct rkisp1_params *params, /* program x size tables */ data = RKISP1_CIF_ISP_LSC_SECT_SIZE(arg->x_size_tbl[i * 2], arg->x_size_tbl[i * 2 + 1]); - rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_XSIZE_01 + i * 4, data); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_XSIZE(i), data); /* program x grad tables */ data = RKISP1_CIF_ISP_LSC_SECT_SIZE(arg->x_grad_tbl[i * 2], arg->x_grad_tbl[i * 2 + 1]); - rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_XGRAD_01 + i * 4, data); + rkisp1_write(rkisp1,
RKISP1_CIF_ISP_LSC_XGRAD(i), data); /* program y size tables */ data = RKISP1_CIF_ISP_LSC_SECT_SIZE(arg->y_size_tbl[i * 2], arg->y_size_tbl[i * 2 + 1]); - rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_YSIZE_01 + i * 4, data); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_YSIZE(i), data); /* program y grad tables */ data = RKISP1_CIF_ISP_LSC_SECT_SIZE(arg->y_grad_tbl[i * 2], arg->y_grad_tbl[i * 2 + 1]); - rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_YGRAD_01 + i * 4, data); + rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_YGRAD(i), data); } /* restore the lsc ctrl status */ diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h b/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h index a931f7216e9bd..b3d8c10163d45 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h @@ -1072,22 +1072,10 @@ #define RKISP1_CIF_ISP_LSC_GR_TABLE_DATA (RKISP1_CIF_ISP_LSC_BASE + 0x00000018) #define RKISP1_CIF_ISP_LSC_B_TABLE_DATA (RKISP1_CIF_ISP_LSC_BASE + 0x0000001C) #define RKISP1_CIF_ISP_LSC_GB_TABLE_DATA (RKISP1_CIF_ISP_LSC_BASE + 0x00000020) -#define RKISP1_CIF_ISP_LSC_XGRAD_01 (RKISP1_CIF_ISP_LSC_BASE + 0x00000024) -#define RKISP1_CIF_ISP_LSC_XGRAD_23 (RKISP1_CIF_ISP_LSC_BASE + 0x00000028) -#define RKISP1_CIF_ISP_LSC_XGRAD_45 (RKISP1_CIF_ISP_LSC_BASE + 0x0000002C) -#define RKISP1_CIF_ISP_LSC_XGRAD_67 (RKISP1_CIF_ISP_LSC_BASE + 0x00000030) -#define RKISP1_CIF_ISP_LSC_YGRAD_01 (RKISP1_CIF_ISP_LSC_BASE + 0x00000034) -#define RKISP1_CIF_ISP_LSC_YGRAD_23 (RKISP1_CIF_ISP_LSC_BASE + 0x00000038) -#define RKISP1_CIF_ISP_LSC_YGRAD_45 (RKISP1_CIF_ISP_LSC_BASE + 0x0000003C) -#define RKISP1_CIF_ISP_LSC_YGRAD_67 (RKISP1_CIF_ISP_LSC_BASE + 0x00000040) -#define RKISP1_CIF_ISP_LSC_XSIZE_01 (RKISP1_CIF_ISP_LSC_BASE + 0x00000044) -#define RKISP1_CIF_ISP_LSC_XSIZE_23 (RKISP1_CIF_ISP_LSC_BASE + 0x00000048) -#define RKISP1_CIF_ISP_LSC_XSIZE_45 (RKISP1_CIF_ISP_LSC_BASE + 0x0000004C) -#define RKISP1_CIF_ISP_LSC_XSIZE_67 (RKISP1_CIF_ISP_LSC_BASE 
+ 0x00000050) -#define RKISP1_CIF_ISP_LSC_YSIZE_01 (RKISP1_CIF_ISP_LSC_BASE + 0x00000054) -#define RKISP1_CIF_ISP_LSC_YSIZE_23 (RKISP1_CIF_ISP_LSC_BASE + 0x00000058) -#define RKISP1_CIF_ISP_LSC_YSIZE_45 (RKISP1_CIF_ISP_LSC_BASE + 0x0000005C) -#define RKISP1_CIF_ISP_LSC_YSIZE_67 (RKISP1_CIF_ISP_LSC_BASE + 0x00000060) +#define RKISP1_CIF_ISP_LSC_XGRAD(n) (RKISP1_CIF_ISP_LSC_BASE + 0x00000024 + (n) * 4) +#define RKISP1_CIF_ISP_LSC_YGRAD(n) (RKISP1_CIF_ISP_LSC_BASE + 0x00000034 + (n) * 4) +#define RKISP1_CIF_ISP_LSC_XSIZE(n) (RKISP1_CIF_ISP_LSC_BASE + 0x00000044 + (n) * 4) +#define RKISP1_CIF_ISP_LSC_YSIZE(n) (RKISP1_CIF_ISP_LSC_BASE + 0x00000054 + (n) * 4) #define RKISP1_CIF_ISP_LSC_TABLE_SEL (RKISP1_CIF_ISP_LSC_BASE + 0x00000064) #define RKISP1_CIF_ISP_LSC_STATUS (RKISP1_CIF_ISP_LSC_BASE + 0x00000068) -- GitLab From 4c3501f13e8e60f6e7e7308c77ac4404e1007c18 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 17 Aug 2022 01:53:57 +0200 Subject: [PATCH 0482/2223] media: rkisp1: Use correct macro for gradient registers The rkisp1_lsc_config() function incorrectly uses the RKISP1_CIF_ISP_LSC_SECT_SIZE() macro for the gradient registers. Replace it with the correct macro, and rename it from RKISP1_CIF_ISP_LSC_GRAD_SIZE() to RKISP1_CIF_ISP_LSC_SECT_GRAD() as the corresponding registers store the gradients for each sector, not a size. This doesn't cause any functional change as the two macros are defined identically (the size and gradient registers store fields in the same number of bits at the same positions). 
Signed-off-by: Laurent Pinchart Reviewed-by: Dafna Hirschfeld Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/rockchip/rkisp1/rkisp1-params.c | 4 ++-- drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c index aa6efa4c6e9ef..123c26fc1679e 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c @@ -334,7 +334,7 @@ static void rkisp1_lsc_config(struct rkisp1_params *params, rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_XSIZE(i), data); /* program x grad tables */ - data = RKISP1_CIF_ISP_LSC_SECT_SIZE(arg->x_grad_tbl[i * 2], + data = RKISP1_CIF_ISP_LSC_SECT_GRAD(arg->x_grad_tbl[i * 2], arg->x_grad_tbl[i * 2 + 1]); rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_XGRAD(i), data); @@ -344,7 +344,7 @@ static void rkisp1_lsc_config(struct rkisp1_params *params, rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_YSIZE(i), data); /* program y grad tables */ - data = RKISP1_CIF_ISP_LSC_SECT_SIZE(arg->y_grad_tbl[i * 2], + data = RKISP1_CIF_ISP_LSC_SECT_GRAD(arg->y_grad_tbl[i * 2], arg->y_grad_tbl[i * 2 + 1]); rkisp1_write(rkisp1, RKISP1_CIF_ISP_LSC_YGRAD(i), data); } diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h b/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h index b3d8c10163d45..421cc73355dbf 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h @@ -576,7 +576,7 @@ (((v0) & 0x1FFF) | (((v1) & 0x1FFF) << 13)) #define RKISP1_CIF_ISP_LSC_SECT_SIZE(v0, v1) \ (((v0) & 0xFFF) | (((v1) & 0xFFF) << 16)) -#define RKISP1_CIF_ISP_LSC_GRAD_SIZE(v0, v1) \ +#define RKISP1_CIF_ISP_LSC_SECT_GRAD(v0, v1) \ (((v0) & 0xFFF) | (((v1) & 0xFFF) << 16)) /* LSC: ISP_LSC_TABLE_SEL */ -- GitLab From 4b07e2b8f7b53e929a483320cd6c9c1cbd76e329 Mon Sep 17 00:00:00 2001 
From: Laurent Pinchart Date: Wed, 17 Aug 2022 01:53:57 +0200 Subject: [PATCH 0483/2223] media: rkisp1: Configure LSC after enabling the ISP The ISP8000Nano v18.02 (found in the i.MX8MP) requires the ISP to be enabled (as indicated by the ISP_CTRL.ISP_ENABLE bit) to configure the lens shading table in internal RAM. The driver currently configures all ISP initial parameters before enabling the ISP, which causes the LSC RAM to not be initialized properly. To fix this, split the rkisp1_params_configure() function into a rkisp1_params_pre_configure() and a rkisp1_params_post_configure(). The former configures all ISP parameters but LSC, while the latter configures LSC. To implement this, the rkisp1_params_apply_params_cfg() function is deconstructed, with two small helpers created to deal with the parameters buffers, which are then used in rkisp1_params_isr(), rkisp1_params_pre_configure() and rkisp1_params_post_configure(). While this initialization ordering is only needed for the ISP8000Nano v18.02, it doesn't affect other ISP versions negatively, and can thus be followed unconditionally. Signed-off-by: Laurent Pinchart Reviewed-by: Dafna Hirschfeld Signed-off-by: Mauro Carvalho Chehab --- .../platform/rockchip/rkisp1/rkisp1-common.h | 29 ++- .../platform/rockchip/rkisp1/rkisp1-isp.c | 9 +- .../platform/rockchip/rkisp1/rkisp1-params.c | 169 ++++++++++++------ 3 files changed, 143 insertions(+), 64 deletions(-) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-common.h b/drivers/media/platform/rockchip/rkisp1/rkisp1-common.h index b704e955cb287..a1293c45aae11 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-common.h +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-common.h @@ -557,19 +557,32 @@ void rkisp1_sd_adjust_crop(struct v4l2_rect *crop, */ const struct rkisp1_mbus_info *rkisp1_mbus_info_get_by_code(u32 mbus_code); -/* rkisp1_params_configure - configure the params when stream starts. 
- * This function is called by the isp entity upon stream starts. - * The function applies the initial configuration of the parameters. +/* + * rkisp1_params_pre_configure - Configure the params before stream start * - * @params: pointer to rkisp1_params. + * @params: pointer to rkisp1_params * @bayer_pat: the bayer pattern on the isp video sink pad * @quantization: the quantization configured on the isp's src pad * @ycbcr_encoding: the ycbcr_encoding configured on the isp's src pad + * + * This function is called by the ISP entity just before the ISP gets started. + * It applies the initial ISP parameters from the first params buffer, but + * skips LSC as it needs to be configured after the ISP is started. + */ +void rkisp1_params_pre_configure(struct rkisp1_params *params, + enum rkisp1_fmt_raw_pat_type bayer_pat, + enum v4l2_quantization quantization, + enum v4l2_ycbcr_encoding ycbcr_encoding); + +/* + * rkisp1_params_post_configure - Configure the params after stream start + * + * @params: pointer to rkisp1_params + * + * This function is called by the ISP entity just after the ISP gets started. + * It applies the initial ISP LSC parameters from the first params buffer. */ -void rkisp1_params_configure(struct rkisp1_params *params, - enum rkisp1_fmt_raw_pat_type bayer_pat, - enum v4l2_quantization quantization, - enum v4l2_ycbcr_encoding ycbcr_encoding); +void rkisp1_params_post_configure(struct rkisp1_params *params); /* rkisp1_params_disable - disable all parameters. 
* This function is called by the isp entity upon stream start diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c index f19c0718963fe..585cf3f534692 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-isp.c @@ -233,9 +233,9 @@ static int rkisp1_config_isp(struct rkisp1_isp *isp, src_frm = rkisp1_isp_get_pad_fmt(isp, NULL, RKISP1_ISP_PAD_SOURCE_VIDEO, V4L2_SUBDEV_FORMAT_ACTIVE); - rkisp1_params_configure(&rkisp1->params, sink_fmt->bayer_pat, - src_frm->quantization, - src_frm->ycbcr_enc); + rkisp1_params_pre_configure(&rkisp1->params, sink_fmt->bayer_pat, + src_frm->quantization, + src_frm->ycbcr_enc); } return 0; @@ -341,6 +341,9 @@ static void rkisp1_isp_start(struct rkisp1_isp *isp) RKISP1_CIF_ISP_CTRL_ISP_ENABLE | RKISP1_CIF_ISP_CTRL_ISP_INFORM_ENABLE; rkisp1_write(rkisp1, RKISP1_CIF_ISP_CTRL, val); + + if (isp->src_fmt->pixel_enc != V4L2_PIXEL_ENC_BAYER) + rkisp1_params_post_configure(&rkisp1->params); } /* ---------------------------------------------------------------------------- diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c index 123c26fc1679e..d8731ebbf479e 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c @@ -1297,22 +1297,6 @@ rkisp1_isp_isr_other_config(struct rkisp1_params *params, RKISP1_CIF_ISP_CTRL_ISP_GAMMA_IN_ENA); } - /* update lsc config */ - if (module_cfg_update & RKISP1_CIF_ISP_MODULE_LSC) - rkisp1_lsc_config(params, - &new_params->others.lsc_config); - - if (module_en_update & RKISP1_CIF_ISP_MODULE_LSC) { - if (module_ens & RKISP1_CIF_ISP_MODULE_LSC) - rkisp1_param_set_bits(params, - RKISP1_CIF_ISP_LSC_CTRL, - RKISP1_CIF_ISP_LSC_CTRL_ENA); - else - rkisp1_param_clear_bits(params, - RKISP1_CIF_ISP_LSC_CTRL, - RKISP1_CIF_ISP_LSC_CTRL_ENA); - } - /* update awb 
gains */ if (module_cfg_update & RKISP1_CIF_ISP_MODULE_AWB_GAIN) params->ops->awb_gain_config(params, &new_params->others.awb_gain_config); @@ -1429,6 +1413,33 @@ rkisp1_isp_isr_other_config(struct rkisp1_params *params, } } +static void +rkisp1_isp_isr_lsc_config(struct rkisp1_params *params, + const struct rkisp1_params_cfg *new_params) +{ + unsigned int module_en_update, module_cfg_update, module_ens; + + module_en_update = new_params->module_en_update; + module_cfg_update = new_params->module_cfg_update; + module_ens = new_params->module_ens; + + /* update lsc config */ + if (module_cfg_update & RKISP1_CIF_ISP_MODULE_LSC) + rkisp1_lsc_config(params, + &new_params->others.lsc_config); + + if (module_en_update & RKISP1_CIF_ISP_MODULE_LSC) { + if (module_ens & RKISP1_CIF_ISP_MODULE_LSC) + rkisp1_param_set_bits(params, + RKISP1_CIF_ISP_LSC_CTRL, + RKISP1_CIF_ISP_LSC_CTRL_ENA); + else + rkisp1_param_clear_bits(params, + RKISP1_CIF_ISP_LSC_CTRL, + RKISP1_CIF_ISP_LSC_CTRL_ENA); + } +} + static void rkisp1_isp_isr_meas_config(struct rkisp1_params *params, struct rkisp1_params_cfg *new_params) { @@ -1490,47 +1501,60 @@ static void rkisp1_isp_isr_meas_config(struct rkisp1_params *params, } } -static void rkisp1_params_apply_params_cfg(struct rkisp1_params *params, - unsigned int frame_sequence) +static bool rkisp1_params_get_buffer(struct rkisp1_params *params, + struct rkisp1_buffer **buf, + struct rkisp1_params_cfg **cfg) { - struct rkisp1_params_cfg *new_params; - struct rkisp1_buffer *cur_buf = NULL; - if (list_empty(&params->params)) - return; - - cur_buf = list_first_entry(&params->params, - struct rkisp1_buffer, queue); + return false; - new_params = (struct rkisp1_params_cfg *)vb2_plane_vaddr(&cur_buf->vb.vb2_buf, 0); + *buf = list_first_entry(&params->params, struct rkisp1_buffer, queue); + *cfg = vb2_plane_vaddr(&(*buf)->vb.vb2_buf, 0); - rkisp1_isp_isr_other_config(params, new_params); - rkisp1_isp_isr_meas_config(params, new_params); - - /* update shadow register
immediately */ - rkisp1_param_set_bits(params, RKISP1_CIF_ISP_CTRL, RKISP1_CIF_ISP_CTRL_ISP_CFG_UPD); + return true; +} - list_del(&cur_buf->queue); +static void rkisp1_params_complete_buffer(struct rkisp1_params *params, + struct rkisp1_buffer *buf, + unsigned int frame_sequence) +{ + list_del(&buf->queue); - cur_buf->vb.sequence = frame_sequence; - vb2_buffer_done(&cur_buf->vb.vb2_buf, VB2_BUF_STATE_DONE); + buf->vb.sequence = frame_sequence; + vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_DONE); } void rkisp1_params_isr(struct rkisp1_device *rkisp1) { - /* - * This isr is called when the ISR finishes processing a frame (RKISP1_CIF_ISP_FRAME). - * Configurations performed here will be applied on the next frame. - * Since frame_sequence is updated on the vertical sync signal, we should use - * frame_sequence + 1 here to indicate to userspace on which frame these parameters - * are being applied. - */ - unsigned int frame_sequence = rkisp1->isp.frame_sequence + 1; struct rkisp1_params *params = &rkisp1->params; + struct rkisp1_params_cfg *new_params; + struct rkisp1_buffer *cur_buf; spin_lock(&params->config_lock); - rkisp1_params_apply_params_cfg(params, frame_sequence); + if (!rkisp1_params_get_buffer(params, &cur_buf, &new_params)) + goto unlock; + + rkisp1_isp_isr_other_config(params, new_params); + rkisp1_isp_isr_lsc_config(params, new_params); + rkisp1_isp_isr_meas_config(params, new_params); + + /* update shadow register immediately */ + rkisp1_param_set_bits(params, RKISP1_CIF_ISP_CTRL, + RKISP1_CIF_ISP_CTRL_ISP_CFG_UPD); + + /* + * This isr is called when the ISR finishes processing a frame + * (RKISP1_CIF_ISP_FRAME). Configurations performed here will be + * applied on the next frame. Since frame_sequence is updated on the + * vertical sync signal, we should use frame_sequence + 1 here to + * indicate to userspace on which frame these parameters are being + * applied.
+ */ + rkisp1_params_complete_buffer(params, cur_buf, + rkisp1->isp.frame_sequence + 1); + +unlock: spin_unlock(&params->config_lock); } @@ -1573,9 +1597,18 @@ static const struct rkisp1_cif_isp_afc_config rkisp1_afc_params_default_config = 14 }; -static void rkisp1_params_config_parameter(struct rkisp1_params *params) +void rkisp1_params_pre_configure(struct rkisp1_params *params, + enum rkisp1_fmt_raw_pat_type bayer_pat, + enum v4l2_quantization quantization, + enum v4l2_ycbcr_encoding ycbcr_encoding) { struct rkisp1_cif_isp_hst_config hst = rkisp1_hst_params_default_config; + struct rkisp1_params_cfg *new_params; + struct rkisp1_buffer *cur_buf; + + params->quantization = quantization; + params->ycbcr_encoding = ycbcr_encoding; + params->raw_type = bayer_pat; params->ops->awb_meas_config(params, &rkisp1_awb_params_default_config); params->ops->awb_meas_enable(params, &rkisp1_awb_params_default_config, @@ -1599,20 +1632,50 @@ static void rkisp1_params_config_parameter(struct rkisp1_params *params) spin_lock_irq(&params->config_lock); /* apply the first buffer if there is one already */ - rkisp1_params_apply_params_cfg(params, 0); + if (!rkisp1_params_get_buffer(params, &cur_buf, &new_params)) + goto unlock; + + rkisp1_isp_isr_other_config(params, new_params); + rkisp1_isp_isr_meas_config(params, new_params); + + /* update shadow register immediately */ + rkisp1_param_set_bits(params, RKISP1_CIF_ISP_CTRL, + RKISP1_CIF_ISP_CTRL_ISP_CFG_UPD); + +unlock: spin_unlock_irq(&params->config_lock); } -void rkisp1_params_configure(struct rkisp1_params *params, - enum rkisp1_fmt_raw_pat_type bayer_pat, - enum v4l2_quantization quantization, - enum v4l2_ycbcr_encoding ycbcr_encoding) +void rkisp1_params_post_configure(struct rkisp1_params *params) { - params->quantization = quantization; - params->ycbcr_encoding = ycbcr_encoding; - params->raw_type = bayer_pat; - rkisp1_params_config_parameter(params); + struct rkisp1_params_cfg *new_params; + struct rkisp1_buffer *cur_buf; + +
spin_lock_irq(&params->config_lock); + + /* + * Apply LSC parameters from the first buffer (if any is already + * available. This must be done after the ISP gets started in the + * ISP8000Nano v18.02 (found in the i.MX8MP) as access to the LSC RAM + * is gated by the ISP_CTRL.ISP_ENABLE bit. As this initialization + * ordering doesn't affect other ISP versions negatively, do so + * unconditionally. + */ + + if (!rkisp1_params_get_buffer(params, &cur_buf, &new_params)) + goto unlock; + + rkisp1_isp_isr_lsc_config(params, new_params); + + /* update shadow register immediately */ + rkisp1_param_set_bits(params, RKISP1_CIF_ISP_CTRL, + RKISP1_CIF_ISP_CTRL_ISP_CFG_UPD); + + rkisp1_params_complete_buffer(params, cur_buf, 0); + +unlock: + spin_unlock_irq(&params->config_lock); } /* -- GitLab From c53e3a049f35978a150526671587fd46b1ae7ca1 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Tue, 23 Aug 2022 17:11:36 +0200 Subject: [PATCH 0484/2223] media: rkisp1: Zero v4l2_subdev_format fields in when validating links The local sd_fmt variable in rkisp1_capture_link_validate() has uninitialized fields, which causes random failures when calling the subdev .get_fmt() operation. Fix it by initializing the variable when declaring it, which zeros all other fields.
Signed-off-by: Laurent Pinchart Reviewed-by: Paul Elder Reviewed-by: Dafna Hirschfeld Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/rockchip/rkisp1/rkisp1-capture.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-capture.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-capture.c index d5904c96ff3fc..c66963a2ccd99 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-capture.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-capture.c @@ -1273,11 +1273,12 @@ static int rkisp1_capture_link_validate(struct media_link *link) struct rkisp1_capture *cap = video_get_drvdata(vdev); const struct rkisp1_capture_fmt_cfg *fmt = rkisp1_find_fmt_cfg(cap, cap->pix.fmt.pixelformat); - struct v4l2_subdev_format sd_fmt; + struct v4l2_subdev_format sd_fmt = { + .which = V4L2_SUBDEV_FORMAT_ACTIVE, + .pad = link->source->index, + }; int ret; - sd_fmt.which = V4L2_SUBDEV_FORMAT_ACTIVE; - sd_fmt.pad = link->source->index; ret = v4l2_subdev_call(sd, pad, get_fmt, NULL, &sd_fmt); if (ret) return ret; -- GitLab From 93f65ce036863893c164ca410938e0968964b26c Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 24 Aug 2022 09:02:42 +0200 Subject: [PATCH 0485/2223] media: s5p_cec: limit msg.len to CEC_MAX_MSG_SIZE I expect that the hardware will have limited this to 16, but just in case it hasn't, check for this corner case. 
Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/platform/s5p/s5p_cec.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/cec/platform/s5p/s5p_cec.c b/drivers/media/cec/platform/s5p/s5p_cec.c index ce9a9d922f116..0a30e7acdc10e 100644 --- a/drivers/media/cec/platform/s5p/s5p_cec.c +++ b/drivers/media/cec/platform/s5p/s5p_cec.c @@ -115,6 +115,8 @@ static irqreturn_t s5p_cec_irq_handler(int irq, void *priv) dev_dbg(cec->dev, "Buffer overrun (worker did not process previous message)\n"); cec->rx = STATE_BUSY; cec->msg.len = status >> 24; + if (cec->msg.len > CEC_MAX_MSG_SIZE) + cec->msg.len = CEC_MAX_MSG_SIZE; cec->msg.rx_status = CEC_RX_STATUS_OK; s5p_cec_get_rx_buf(cec, cec->msg.len, cec->msg.msg); -- GitLab From 2dc73b48665411a08c4e5f0f823dea8510761603 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 24 Aug 2022 09:06:19 +0200 Subject: [PATCH 0486/2223] media: cros-ec-cec: limit msg.len to CEC_MAX_MSG_SIZE I expect that the hardware will have limited this to 16, but just in case it hasn't, check for this corner case. 
Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/platform/cros-ec/cros-ec-cec.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/cec/platform/cros-ec/cros-ec-cec.c b/drivers/media/cec/platform/cros-ec/cros-ec-cec.c index 3b583ed4da9df..e5ebaa58be457 100644 --- a/drivers/media/cec/platform/cros-ec/cros-ec-cec.c +++ b/drivers/media/cec/platform/cros-ec/cros-ec-cec.c @@ -44,6 +44,8 @@ static void handle_cec_message(struct cros_ec_cec *cros_ec_cec) uint8_t *cec_message = cros_ec->event_data.data.cec_message; unsigned int len = cros_ec->event_size; + if (len > CEC_MAX_MSG_SIZE) + len = CEC_MAX_MSG_SIZE; cros_ec_cec->rx_msg.len = len; memcpy(cros_ec_cec->rx_msg.msg, cec_message, len); -- GitLab From 91d0092a6958640c362d0913ed6f933e514eee68 Mon Sep 17 00:00:00 2001 From: Jilin Yuan Date: Wed, 24 Aug 2022 15:10:16 +0200 Subject: [PATCH 0487/2223] media: usb/msi2500: fix repeated words in comments Delete the redundant word 'for'. Signed-off-by: Jilin Yuan Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/usb/msi2500/msi2500.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/usb/msi2500/msi2500.c b/drivers/media/usb/msi2500/msi2500.c index 5a1f2698efb7b..9759996ee6a4c 100644 --- a/drivers/media/usb/msi2500/msi2500.c +++ b/drivers/media/usb/msi2500/msi2500.c @@ -209,7 +209,7 @@ leave: * * Control bits for previous samples is 32-bit field, containing 16 x 2-bit * numbers. This results one 2-bit number for 8 samples. It is likely used for - * for bit shifting sample by given bits, increasing actual sampling resolution. + * bit shifting sample by given bits, increasing actual sampling resolution. * Number 2 (0b10) was never seen. * * 6 * 16 * 2 * 4 = 768 samples. 
768 * 4 = 3072 bytes -- GitLab From 9dcd063656a9225c8ac6341c0931b34ceebf9214 Mon Sep 17 00:00:00 2001 From: Jilin Yuan Date: Wed, 24 Aug 2022 15:13:50 +0200 Subject: [PATCH 0488/2223] media: usb/dvb-usb-v2: fix repeated words in comments Delete the redundant word 'my'. Signed-off-by: Jilin Yuan Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/usb/dvb-usb-v2/af9035.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/usb/dvb-usb-v2/af9035.c b/drivers/media/usb/dvb-usb-v2/af9035.c index 5eef37b00a520..1e9c8d01523be 100644 --- a/drivers/media/usb/dvb-usb-v2/af9035.c +++ b/drivers/media/usb/dvb-usb-v2/af9035.c @@ -1497,7 +1497,7 @@ static int af9035_tuner_attach(struct dvb_usb_adapter *adap) /* * AF9035 gpiot2 = FC0012 enable * XXX: there seems to be something on gpioh8 too, but on my - * my test I didn't find any difference. + * test I didn't find any difference. */ if (adap->id == 0) { -- GitLab From 2d0c052c09d5a074f4806d60b20b0addd8561994 Mon Sep 17 00:00:00 2001 From: Jilin Yuan Date: Wed, 24 Aug 2022 15:27:42 +0200 Subject: [PATCH 0489/2223] media: pci/cx18: fix repeated words in comments Delete the redundant word 'is'. Signed-off-by: Jilin Yuan Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/pci/cx18/cx18-av-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/pci/cx18/cx18-av-core.c b/drivers/media/pci/cx18/cx18-av-core.c index d3358643fb7d6..ee6e71157786e 100644 --- a/drivers/media/pci/cx18/cx18-av-core.c +++ b/drivers/media/pci/cx18/cx18-av-core.c @@ -339,7 +339,7 @@ void cx18_av_std_setup(struct cx18 *cx) /* * For a 13.5 Mpps clock and 15,625 Hz line rate, a line is - * is 864 pixels = 720 active + 144 blanking. ITU-R BT.601 + * 864 pixels = 720 active + 144 blanking. 
ITU-R BT.601 * specifies 12 luma clock periods or ~ 0.9 * 13.5 Mpps after * the end of active video to start a horizontal line, so that * leaves 132 pixels of hblank to ignore. @@ -399,7 +399,7 @@ void cx18_av_std_setup(struct cx18 *cx) /* * For a 13.5 Mpps clock and 15,734.26 Hz line rate, a line is - * is 858 pixels = 720 active + 138 blanking. The Hsync leading + * 858 pixels = 720 active + 138 blanking. The Hsync leading * edge should happen 1.2 us * 13.5 Mpps ~= 16 pixels after the * end of active video, leaving 122 pixels of hblank to ignore * before active video starts. -- GitLab From cfeacb5d46ed9c747cad7fdffc254afced50c1ef Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Thu, 25 Aug 2022 12:41:49 +0200 Subject: [PATCH 0490/2223] media: coda: jpeg: drop coda9_jpeg_dec_huff_setup() return value coda9_jpeg_dec_huff_setup() never returns anything but 0. Drop return value and superfluous error handling at the call site. Signed-off-by: Philipp Zabel Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/chips-media/coda-jpeg.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/media/platform/chips-media/coda-jpeg.c b/drivers/media/platform/chips-media/coda-jpeg.c index a0b22b07f69ac..435e7030fc2a8 100644 --- a/drivers/media/platform/chips-media/coda-jpeg.c +++ b/drivers/media/platform/chips-media/coda-jpeg.c @@ -421,7 +421,7 @@ static inline void coda9_jpeg_write_huff_values(struct coda_dev *dev, u8 *bits, coda_write(dev, (s32)values[i], CODA9_REG_JPEG_HUFF_DATA); } -static int coda9_jpeg_dec_huff_setup(struct coda_ctx *ctx) +static void coda9_jpeg_dec_huff_setup(struct coda_ctx *ctx) { struct coda_huff_tab *huff_tab = ctx->params.jpeg_huff_tab; struct coda_dev *dev = ctx->dev; @@ -455,7 +455,6 @@ static int coda9_jpeg_dec_huff_setup(struct coda_ctx *ctx) coda9_jpeg_write_huff_values(dev, huff_tab->luma_ac, 162); coda9_jpeg_write_huff_values(dev, huff_tab->chroma_ac, 162); 
coda_write(dev, 0x000, CODA9_REG_JPEG_HUFF_CTRL); - return 0; } static inline void coda9_jpeg_write_qmat_tab(struct coda_dev *dev, @@ -1394,14 +1393,8 @@ static int coda9_jpeg_prepare_decode(struct coda_ctx *ctx) coda_write(dev, ctx->params.jpeg_restart_interval, CODA9_REG_JPEG_RST_INTVAL); - if (ctx->params.jpeg_huff_tab) { - ret = coda9_jpeg_dec_huff_setup(ctx); - if (ret < 0) { - v4l2_err(&dev->v4l2_dev, - "failed to set up Huffman tables: %d\n", ret); - return ret; - } - } + if (ctx->params.jpeg_huff_tab) + coda9_jpeg_dec_huff_setup(ctx); coda9_jpeg_qmat_setup(ctx); -- GitLab From d91d7bc85062309aae6d8064563ddf17947cb6bc Mon Sep 17 00:00:00 2001 From: Ming Qian Date: Mon, 29 Aug 2022 07:33:16 +0200 Subject: [PATCH 0491/2223] media: amphion: release m2m ctx when releasing vpu instance release m2m ctx in the callback function that release the vpu instance, then there is no need to add lock around releasing m2m ctx. Fixes: 3cd084519c6f ("media: amphion: add vpu v4l2 m2m support") Signed-off-by: Ming Qian Reviewed-by: Tommaso Merciai Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/amphion/vpu_v4l2.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/media/platform/amphion/vpu_v4l2.c b/drivers/media/platform/amphion/vpu_v4l2.c index 8a3eed957ae6e..b779e0ba916ca 100644 --- a/drivers/media/platform/amphion/vpu_v4l2.c +++ b/drivers/media/platform/amphion/vpu_v4l2.c @@ -603,6 +603,10 @@ static int vpu_v4l2_release(struct vpu_inst *inst) inst->workqueue = NULL; } + if (inst->fh.m2m_ctx) { + v4l2_m2m_ctx_release(inst->fh.m2m_ctx); + inst->fh.m2m_ctx = NULL; + } v4l2_ctrl_handler_free(&inst->ctrl_handler); mutex_destroy(&inst->lock); v4l2_fh_del(&inst->fh); @@ -685,13 +689,6 @@ int vpu_v4l2_close(struct file *file) vpu_trace(vpu->dev, "tgid = %d, pid = %d, inst = %p\n", inst->tgid, inst->pid, inst); - vpu_inst_lock(inst); - if (inst->fh.m2m_ctx) { - v4l2_m2m_ctx_release(inst->fh.m2m_ctx); - 
inst->fh.m2m_ctx = NULL; - } - vpu_inst_unlock(inst); - call_void_vop(inst, release); vpu_inst_unregister(inst); vpu_inst_put(inst); -- GitLab From cd75981ec93a3abf717d0182ff5d56b650873215 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 29 Aug 2022 16:05:14 +0200 Subject: [PATCH 0492/2223] media: v4l2-ctrls: drop 'elems' argument from control type ops. The type ops for equal, init and validate have an elems argument, but this can be taken from struct v4l2_ctrl: ctrl->elems for equal and init, and ctrl->new_elems for validate (since you are validating a new control value). So drop this argument and update all callers. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/nxp/dw100/dw100.c | 4 ++-- drivers/media/v4l2-core/v4l2-ctrls-api.c | 8 +++---- drivers/media/v4l2-core/v4l2-ctrls-core.c | 19 +++++++-------- include/media/v4l2-ctrls.h | 28 +++++++++-------------- 4 files changed, 27 insertions(+), 32 deletions(-) diff --git a/drivers/media/platform/nxp/dw100/dw100.c b/drivers/media/platform/nxp/dw100/dw100.c index b3b057798ab67..f6d48c36f3860 100644 --- a/drivers/media/platform/nxp/dw100/dw100.c +++ b/drivers/media/platform/nxp/dw100/dw100.c @@ -373,7 +373,7 @@ static const struct v4l2_ctrl_ops dw100_ctrl_ops = { * The coordinates are saved in UQ12.4 fixed point format. 
*/ static void dw100_ctrl_dewarping_map_init(const struct v4l2_ctrl *ctrl, - u32 from_idx, u32 elems, + u32 from_idx, union v4l2_ctrl_ptr ptr) { struct dw100_ctx *ctx = @@ -398,7 +398,7 @@ static void dw100_ctrl_dewarping_map_init(const struct v4l2_ctrl *ctrl, ctx->map_height = mh; ctx->map_size = mh * mw * sizeof(u32); - for (idx = from_idx; idx < elems; idx++) { + for (idx = from_idx; idx < ctrl->elems; idx++) { qy = min_t(u32, (idx / mw) * qdy, qsh); qx = min_t(u32, (idx % mw) * qdx, qsw); map[idx] = dw100_map_format_coordinates(qx, qy); diff --git a/drivers/media/v4l2-core/v4l2-ctrls-api.c b/drivers/media/v4l2-core/v4l2-ctrls-api.c index a8c354ad3d234..d0a3aa3806fbd 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-api.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-api.c @@ -89,7 +89,7 @@ static int req_to_user(struct v4l2_ext_control *c, /* Helper function: copy the initial control value back to the caller */ static int def_to_user(struct v4l2_ext_control *c, struct v4l2_ctrl *ctrl) { - ctrl->type_ops->init(ctrl, 0, ctrl->elems, ctrl->p_new); + ctrl->type_ops->init(ctrl, 0, ctrl->p_new); return ptr_to_user(c, ctrl, ctrl->p_new); } @@ -126,7 +126,7 @@ static int user_to_new(struct v4l2_ext_control *c, struct v4l2_ctrl *ctrl) if (ctrl->is_dyn_array) ctrl->new_elems = elems; else if (ctrl->is_array) - ctrl->type_ops->init(ctrl, elems, ctrl->elems, ctrl->p_new); + ctrl->type_ops->init(ctrl, elems, ctrl->p_new); return 0; } @@ -494,7 +494,7 @@ EXPORT_SYMBOL(v4l2_g_ext_ctrls); /* Validate a new control */ static int validate_new(const struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr p_new) { - return ctrl->type_ops->validate(ctrl, ctrl->new_elems, p_new); + return ctrl->type_ops->validate(ctrl, p_new); } /* Validate controls. 
*/ @@ -1007,7 +1007,7 @@ int __v4l2_ctrl_modify_dimensions(struct v4l2_ctrl *ctrl, ctrl->p_cur.p = p_array + elems * ctrl->elem_size; for (i = 0; i < ctrl->nr_of_dims; i++) ctrl->dims[i] = dims[i]; - ctrl->type_ops->init(ctrl, 0, elems, ctrl->p_cur); + ctrl->type_ops->init(ctrl, 0, ctrl->p_cur); cur_to_new(ctrl); send_event(NULL, ctrl, V4L2_EVENT_CTRL_CH_VALUE | V4L2_EVENT_CTRL_CH_DIMENSIONS); diff --git a/drivers/media/v4l2-core/v4l2-ctrls-core.c b/drivers/media/v4l2-core/v4l2-ctrls-core.c index 01f00093f2591..0dab1d7b90f0e 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-core.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-core.c @@ -65,7 +65,7 @@ void send_event(struct v4l2_fh *fh, struct v4l2_ctrl *ctrl, u32 changes) v4l2_event_queue_fh(sev->fh, &ev); } -bool v4l2_ctrl_type_op_equal(const struct v4l2_ctrl *ctrl, u32 elems, +bool v4l2_ctrl_type_op_equal(const struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr ptr1, union v4l2_ctrl_ptr ptr2) { unsigned int i; @@ -74,7 +74,7 @@ bool v4l2_ctrl_type_op_equal(const struct v4l2_ctrl *ctrl, u32 elems, case V4L2_CTRL_TYPE_BUTTON: return false; case V4L2_CTRL_TYPE_STRING: - for (i = 0; i < elems; i++) { + for (i = 0; i < ctrl->elems; i++) { unsigned int idx = i * ctrl->elem_size; /* strings are always 0-terminated */ @@ -84,7 +84,7 @@ bool v4l2_ctrl_type_op_equal(const struct v4l2_ctrl *ctrl, u32 elems, return true; default: return !memcmp(ptr1.p_const, ptr2.p_const, - elems * ctrl->elem_size); + ctrl->elems * ctrl->elem_size); } } EXPORT_SYMBOL(v4l2_ctrl_type_op_equal); @@ -178,9 +178,10 @@ static void std_init_compound(const struct v4l2_ctrl *ctrl, u32 idx, } void v4l2_ctrl_type_op_init(const struct v4l2_ctrl *ctrl, u32 from_idx, - u32 tot_elems, union v4l2_ctrl_ptr ptr) + union v4l2_ctrl_ptr ptr) { unsigned int i; + u32 tot_elems = ctrl->elems; u32 elems = tot_elems - from_idx; if (from_idx >= tot_elems) @@ -995,7 +996,7 @@ static int std_validate_elem(const struct v4l2_ctrl *ctrl, u32 idx, } } -int 
v4l2_ctrl_type_op_validate(const struct v4l2_ctrl *ctrl, u32 elems, +int v4l2_ctrl_type_op_validate(const struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr ptr) { unsigned int i; @@ -1017,11 +1018,11 @@ int v4l2_ctrl_type_op_validate(const struct v4l2_ctrl *ctrl, u32 elems, case V4L2_CTRL_TYPE_BUTTON: case V4L2_CTRL_TYPE_CTRL_CLASS: - memset(ptr.p_s32, 0, elems * sizeof(s32)); + memset(ptr.p_s32, 0, ctrl->new_elems * sizeof(s32)); return 0; } - for (i = 0; !ret && i < elems; i++) + for (i = 0; !ret && i < ctrl->new_elems; i++) ret = std_validate_elem(ctrl, i, ptr); return ret; } @@ -1724,7 +1725,7 @@ static struct v4l2_ctrl *v4l2_ctrl_new(struct v4l2_ctrl_handler *hdl, memcpy(ctrl->p_def.p, p_def.p_const, elem_size); } - ctrl->type_ops->init(ctrl, 0, elems, ctrl->p_cur); + ctrl->type_ops->init(ctrl, 0, ctrl->p_cur); cur_to_new(ctrl); if (handler_new_ref(hdl, ctrl, NULL, false, false)) { @@ -2069,7 +2070,7 @@ static int cluster_changed(struct v4l2_ctrl *master) ctrl_changed = true; if (!ctrl_changed) ctrl_changed = !ctrl->type_ops->equal(ctrl, - ctrl->elems, ctrl->p_cur, ctrl->p_new); + ctrl->p_cur, ctrl->p_new); ctrl->has_changed = ctrl_changed; changed |= ctrl->has_changed; } diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h index b76a0714d4254..e59d9a234631d 100644 --- a/include/media/v4l2-ctrls.h +++ b/include/media/v4l2-ctrls.h @@ -121,21 +121,19 @@ struct v4l2_ctrl_ops { * struct v4l2_ctrl_type_ops - The control type operations that the driver * has to provide. * - * @equal: return true if both values are equal. - * @init: initialize the value. + * @equal: return true if all ctrl->elems array elements are equal. + * @init: initialize the value for array elements from from_idx to ctrl->elems. * @log: log the value. - * @validate: validate the value. Return 0 on success and a negative value - * otherwise. + * @validate: validate the value for ctrl->new_elems array elements. + * Return 0 on success and a negative value otherwise. 
*/ struct v4l2_ctrl_type_ops { - bool (*equal)(const struct v4l2_ctrl *ctrl, u32 elems, - union v4l2_ctrl_ptr ptr1, - union v4l2_ctrl_ptr ptr2); - void (*init)(const struct v4l2_ctrl *ctrl, u32 from_idx, u32 tot_elems, + bool (*equal)(const struct v4l2_ctrl *ctrl, + union v4l2_ctrl_ptr ptr1, union v4l2_ctrl_ptr ptr2); + void (*init)(const struct v4l2_ctrl *ctrl, u32 from_idx, union v4l2_ctrl_ptr ptr); void (*log)(const struct v4l2_ctrl *ctrl); - int (*validate)(const struct v4l2_ctrl *ctrl, u32 elems, - union v4l2_ctrl_ptr ptr); + int (*validate)(const struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr ptr); }; /** @@ -1543,13 +1541,12 @@ int v4l2_ctrl_new_fwnode_properties(struct v4l2_ctrl_handler *hdl, * v4l2_ctrl_type_op_equal - Default v4l2_ctrl_type_ops equal callback. * * @ctrl: The v4l2_ctrl pointer. - * @elems: The number of elements to compare. * @ptr1: A v4l2 control value. * @ptr2: A v4l2 control value. * * Return: true if values are equal, otherwise false. */ -bool v4l2_ctrl_type_op_equal(const struct v4l2_ctrl *ctrl, u32 elems, +bool v4l2_ctrl_type_op_equal(const struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr ptr1, union v4l2_ctrl_ptr ptr2); /** @@ -1557,13 +1554,12 @@ bool v4l2_ctrl_type_op_equal(const struct v4l2_ctrl *ctrl, u32 elems, * * @ctrl: The v4l2_ctrl pointer. * @from_idx: Starting element index. - * @elems: The number of elements to initialize. * @ptr: The v4l2 control value. * * Return: void */ void v4l2_ctrl_type_op_init(const struct v4l2_ctrl *ctrl, u32 from_idx, - u32 elems, union v4l2_ctrl_ptr ptr); + union v4l2_ctrl_ptr ptr); /** * v4l2_ctrl_type_op_log - Default v4l2_ctrl_type_ops log callback. @@ -1578,12 +1574,10 @@ void v4l2_ctrl_type_op_log(const struct v4l2_ctrl *ctrl); * v4l2_ctrl_type_op_validate - Default v4l2_ctrl_type_ops validate callback. * * @ctrl: The v4l2_ctrl pointer. - * @elems: The number of elements in the control. * @ptr: The v4l2 control value. * * Return: 0 on success, a negative error code on failure. 
*/ -int v4l2_ctrl_type_op_validate(const struct v4l2_ctrl *ctrl, u32 elems, - union v4l2_ctrl_ptr ptr); +int v4l2_ctrl_type_op_validate(const struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr ptr); #endif -- GitLab From 20694e96ca089ce6693c2348f8f628ee621e4e74 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 30 Aug 2022 07:59:24 +0200 Subject: [PATCH 0493/2223] media: dvb-frontends/drxk: initialize err to 0 Fix a compiler warning: drivers/media/dvb-frontends/drxk_hard.c: In function 'drxk_read_ucblocks': drivers/media/dvb-frontends/drxk_hard.c:6673:21: warning: 'err' may be used uninitialized [-Wmaybe-uninitialized] 6673 | *ucblocks = (u32) err; | ^~~~~~~~~ drivers/media/dvb-frontends/drxk_hard.c:6663:13: note: 'err' was declared here 6663 | u16 err; | ^~~ Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/dvb-frontends/drxk_hard.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/dvb-frontends/drxk_hard.c b/drivers/media/dvb-frontends/drxk_hard.c index 47d83e0a470c7..9807f54119965 100644 --- a/drivers/media/dvb-frontends/drxk_hard.c +++ b/drivers/media/dvb-frontends/drxk_hard.c @@ -6660,7 +6660,7 @@ static int drxk_read_snr(struct dvb_frontend *fe, u16 *snr) static int drxk_read_ucblocks(struct dvb_frontend *fe, u32 *ucblocks) { struct drxk_state *state = fe->demodulator_priv; - u16 err; + u16 err = 0; dprintk(1, "\n"); -- GitLab From 479747caa5bfa94b856bf47249006e6c8aa8be37 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 30 Aug 2022 12:37:24 +0200 Subject: [PATCH 0494/2223] media: cec: add support for Absolute Volume Control Add support for this new CEC message. This was added in HDMI 2.1a. 
Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/cec.h.rst.exceptions | 2 ++ drivers/media/cec/core/cec-adap.c | 1 + include/uapi/linux/cec-funcs.h | 14 ++++++++++++++ include/uapi/linux/cec.h | 2 ++ 4 files changed, 19 insertions(+) diff --git a/Documentation/userspace-api/media/cec.h.rst.exceptions b/Documentation/userspace-api/media/cec.h.rst.exceptions index 13de01d9555eb..15fa1752d4ef8 100644 --- a/Documentation/userspace-api/media/cec.h.rst.exceptions +++ b/Documentation/userspace-api/media/cec.h.rst.exceptions @@ -239,6 +239,7 @@ ignore define CEC_OP_FEAT_DEV_HAS_DECK_CONTROL ignore define CEC_OP_FEAT_DEV_HAS_SET_AUDIO_RATE ignore define CEC_OP_FEAT_DEV_SINK_HAS_ARC_TX ignore define CEC_OP_FEAT_DEV_SOURCE_HAS_ARC_RX +ignore define CEC_OP_FEAT_DEV_HAS_SET_AUDIO_VOLUME_LEVEL ignore define CEC_MSG_GIVE_FEATURES @@ -487,6 +488,7 @@ ignore define CEC_OP_SYS_AUD_STATUS_ON ignore define CEC_MSG_SYSTEM_AUDIO_MODE_REQUEST ignore define CEC_MSG_SYSTEM_AUDIO_MODE_STATUS +ignore define CEC_MSG_SET_AUDIO_VOLUME_LEVEL ignore define CEC_OP_AUD_FMT_ID_CEA861 ignore define CEC_OP_AUD_FMT_ID_CEA861_CXT diff --git a/drivers/media/cec/core/cec-adap.c b/drivers/media/cec/core/cec-adap.c index 41a79293ee02d..4f5ab3cae8a71 100644 --- a/drivers/media/cec/core/cec-adap.c +++ b/drivers/media/cec/core/cec-adap.c @@ -1027,6 +1027,7 @@ static const u8 cec_msg_size[256] = { [CEC_MSG_REPORT_SHORT_AUDIO_DESCRIPTOR] = 2 | DIRECTED, [CEC_MSG_REQUEST_SHORT_AUDIO_DESCRIPTOR] = 2 | DIRECTED, [CEC_MSG_SET_SYSTEM_AUDIO_MODE] = 3 | BOTH, + [CEC_MSG_SET_AUDIO_VOLUME_LEVEL] = 3 | DIRECTED, [CEC_MSG_SYSTEM_AUDIO_MODE_REQUEST] = 2 | DIRECTED, [CEC_MSG_SYSTEM_AUDIO_MODE_STATUS] = 3 | DIRECTED, [CEC_MSG_SET_AUDIO_RATE] = 3 | DIRECTED, diff --git a/include/uapi/linux/cec-funcs.h b/include/uapi/linux/cec-funcs.h index c3baaea0b8ef6..d58fa1cdcb084 100644 --- a/include/uapi/linux/cec-funcs.h +++ b/include/uapi/linux/cec-funcs.h @@ -1568,6 +1568,20 @@ static inline 
void cec_ops_request_short_audio_descriptor(const struct cec_msg * } } +static inline void cec_msg_set_audio_volume_level(struct cec_msg *msg, + __u8 audio_volume_level) +{ + msg->len = 3; + msg->msg[1] = CEC_MSG_SET_AUDIO_VOLUME_LEVEL; + msg->msg[2] = audio_volume_level; +} + +static inline void cec_ops_set_audio_volume_level(const struct cec_msg *msg, + __u8 *audio_volume_level) +{ + *audio_volume_level = msg->msg[2]; +} + /* Audio Rate Control Feature */ static inline void cec_msg_set_audio_rate(struct cec_msg *msg, diff --git a/include/uapi/linux/cec.h b/include/uapi/linux/cec.h index 1d48da9262163..b8e071abaea5a 100644 --- a/include/uapi/linux/cec.h +++ b/include/uapi/linux/cec.h @@ -768,6 +768,7 @@ struct cec_event { #define CEC_OP_FEAT_DEV_HAS_SET_AUDIO_RATE 0x08 #define CEC_OP_FEAT_DEV_SINK_HAS_ARC_TX 0x04 #define CEC_OP_FEAT_DEV_SOURCE_HAS_ARC_RX 0x02 +#define CEC_OP_FEAT_DEV_HAS_SET_AUDIO_VOLUME_LEVEL 0x01 #define CEC_MSG_GIVE_FEATURES 0xa5 /* HDMI 2.0 */ @@ -1059,6 +1060,7 @@ struct cec_event { #define CEC_OP_AUD_FMT_ID_CEA861 0 #define CEC_OP_AUD_FMT_ID_CEA861_CXT 1 +#define CEC_MSG_SET_AUDIO_VOLUME_LEVEL 0x73 /* Audio Rate Control Feature */ #define CEC_MSG_SET_AUDIO_RATE 0x9a -- GitLab From f001cc8dc16e966e9f0b1e91a5e9264b7da550b4 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 30 Aug 2022 23:11:10 +0200 Subject: [PATCH 0495/2223] media: i2c: isl7998x: Use right include This driver is using GPIO descriptors but uses the legacy include header <linux/gpio.h>. Fix it by including the intended <linux/gpio/consumer.h>.
Cc: Marek Vasut Signed-off-by: Linus Walleij Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/i2c/isl7998x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/i2c/isl7998x.c b/drivers/media/i2c/isl7998x.c index dc3068549dfa8..27feefe1dfcde 100644 --- a/drivers/media/i2c/isl7998x.c +++ b/drivers/media/i2c/isl7998x.c @@ -8,7 +8,7 @@ #include #include -#include <linux/gpio.h> +#include <linux/gpio/consumer.h> #include #include #include -- GitLab From 74997c55aafc105dfc8ef62f3c9979cccbcd8dff Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 30 Aug 2022 23:28:06 +0200 Subject: [PATCH 0496/2223] media: si4713: Use the right include The driver includes the legacy header <linux/gpio.h> but uses <linux/gpio/consumer.h>. Cc: Dinesh Ram Cc: Eduardo Valentin Signed-off-by: Linus Walleij Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/radio/si4713/si4713.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/radio/si4713/si4713.c b/drivers/media/radio/si4713/si4713.c index adbf43ff6a21e..60e72c8c643bb 100644 --- a/drivers/media/radio/si4713/si4713.c +++ b/drivers/media/radio/si4713/si4713.c @@ -14,7 +14,7 @@ #include #include #include -#include <linux/gpio.h> +#include <linux/gpio/consumer.h> #include #include #include -- GitLab From 74869a88f9d551add870412b53c2be8a192b8d12 Mon Sep 17 00:00:00 2001 From: Moudy Ho Date: Wed, 31 Aug 2022 10:56:04 +0200 Subject: [PATCH 0497/2223] media: platform: mtk-mdp3: add pointer checks and use devm_kfree Fix two errors reported by smatch: drivers/media/platform/mediatek/mdp3/mtk-mdp3-core.c:292 mdp_probe() error: we previously assumed 'mdp' could be null drivers/media/platform/mediatek/mdp3/mtk-mdp3-cmdq.c:460 mdp_cmdq_send() error: we previously assumed 'cmd' could be null Also, avoid warnings reported by smatch: drivers/media/platform/mediatek/mdp3/mtk-mdp3-comp.c:872 mdp_comp_create() warn: passing devm_ allocated variable to kfree.
'comp' [hverkuil: fix devm_kfree call] Signed-off-by: Moudy Ho Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/mediatek/mdp3/mtk-mdp3-cmdq.c | 2 +- drivers/media/platform/mediatek/mdp3/mtk-mdp3-comp.c | 2 +- drivers/media/platform/mediatek/mdp3/mtk-mdp3-core.c | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/media/platform/mediatek/mdp3/mtk-mdp3-cmdq.c b/drivers/media/platform/mediatek/mdp3/mtk-mdp3-cmdq.c index 29f6c1cd3de79..86c054600a08c 100644 --- a/drivers/media/platform/mediatek/mdp3/mtk-mdp3-cmdq.c +++ b/drivers/media/platform/mediatek/mdp3/mtk-mdp3-cmdq.c @@ -457,7 +457,7 @@ err_cmdq_data: kfree(path); atomic_dec(&mdp->job_count); wake_up(&mdp->callback_wq); - if (cmd->pkt.buf_size > 0) + if (cmd && cmd->pkt.buf_size > 0) mdp_cmdq_pkt_destroy(&cmd->pkt); kfree(comps); kfree(cmd); diff --git a/drivers/media/platform/mediatek/mdp3/mtk-mdp3-comp.c b/drivers/media/platform/mediatek/mdp3/mtk-mdp3-comp.c index e62abf3587bff..43455755a5ac3 100644 --- a/drivers/media/platform/mediatek/mdp3/mtk-mdp3-comp.c +++ b/drivers/media/platform/mediatek/mdp3/mtk-mdp3-comp.c @@ -869,7 +869,7 @@ static struct mdp_comp *mdp_comp_create(struct mdp_dev *mdp, ret = mdp_comp_init(mdp, node, comp, id); if (ret) { - kfree(comp); + devm_kfree(dev, comp); return ERR_PTR(ret); } mdp->comp[id] = comp; diff --git a/drivers/media/platform/mediatek/mdp3/mtk-mdp3-core.c b/drivers/media/platform/mediatek/mdp3/mtk-mdp3-core.c index cde59579b7aeb..c413e59d42860 100644 --- a/drivers/media/platform/mediatek/mdp3/mtk-mdp3-core.c +++ b/drivers/media/platform/mediatek/mdp3/mtk-mdp3-core.c @@ -289,7 +289,8 @@ err_deinit_comp: mdp_comp_destroy(mdp); err_return: for (i = 0; i < MDP_PIPE_MAX; i++) - mtk_mutex_put(mdp->mdp_mutex[i]); + if (mdp) + mtk_mutex_put(mdp->mdp_mutex[i]); kfree(mdp); dev_dbg(dev, "Errno %d\n", ret); return ret; -- GitLab From 5cd5f1344434e49a20a6e165e1cee3ead095b32e Mon Sep 17 00:00:00 2001 From: Daniel 
Lundberg Pedersen Date: Wed, 31 Aug 2022 16:54:59 +0200 Subject: [PATCH 0498/2223] media: docs: libv4l-introduction.rst: Fix function signature and link v4l2_mmap returns a void*, also link to mmap instead of munmap Signed-off-by: Daniel Lundberg Pedersen Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/v4l/libv4l-introduction.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/userspace-api/media/v4l/libv4l-introduction.rst b/Documentation/userspace-api/media/v4l/libv4l-introduction.rst index 90215313b9657..7c8bf160e1c6e 100644 --- a/Documentation/userspace-api/media/v4l/libv4l-introduction.rst +++ b/Documentation/userspace-api/media/v4l/libv4l-introduction.rst @@ -136,9 +136,9 @@ V4L2 functions operates like the :c:func:`read()` function. -.. c:function:: void v4l2_mmap(void *start, size_t length, int prot, int flags, int fd, int64_t offset); +.. c:function:: void *v4l2_mmap(void *start, size_t length, int prot, int flags, int fd, int64_t offset); - operates like the :c:func:`munmap()` function. + operates like the :c:func:`mmap()` function. .. c:function:: int v4l2_munmap(void *_start, size_t length); -- GitLab From bf4ed9e3283b3db26089f7c265e65435af4d3e11 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Thu, 1 Sep 2022 09:45:07 +0200 Subject: [PATCH 0499/2223] media: radio-si476x: Remove the unneeded result variable Return the value v4l2_fh_release() directly instead of storing it in another redundant variable. 
Signed-off-by: ye xingchen Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/radio/radio-si476x.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/media/radio/radio-si476x.c b/drivers/media/radio/radio-si476x.c index 0bf99e1cd1d8d..171f9cc9ee5ea 100644 --- a/drivers/media/radio/radio-si476x.c +++ b/drivers/media/radio/radio-si476x.c @@ -1072,7 +1072,6 @@ done: static int si476x_radio_fops_release(struct file *file) { - int err; struct si476x_radio *radio = video_drvdata(file); if (v4l2_fh_is_singular_file(file) && @@ -1080,9 +1079,7 @@ static int si476x_radio_fops_release(struct file *file) si476x_core_set_power_state(radio->core, SI476X_POWER_DOWN); - err = v4l2_fh_release(file); - - return err; + return v4l2_fh_release(file); } static ssize_t si476x_radio_fops_read(struct file *file, char __user *buf, -- GitLab From dec7920e55db35ac429b002025aa1fafc0ec7d57 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 1 Sep 2022 16:47:48 +0200 Subject: [PATCH 0500/2223] media: platform: mtk-mdp3: fix error code in mdp_vpu_dev_init() Return a negative error code if mdp_vpu_shared_mem_alloc() fails. 
Fixes: 61890ccaefaf ("media: platform: mtk-mdp3: add MediaTek MDP3 driver") Signed-off-by: Dan Carpenter Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Matthias Brugger Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/mediatek/mdp3/mtk-mdp3-vpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/platform/mediatek/mdp3/mtk-mdp3-vpu.c b/drivers/media/platform/mediatek/mdp3/mtk-mdp3-vpu.c index 9f5844385c8fc..a72bed927bb64 100644 --- a/drivers/media/platform/mediatek/mdp3/mtk-mdp3-vpu.c +++ b/drivers/media/platform/mediatek/mdp3/mtk-mdp3-vpu.c @@ -173,7 +173,8 @@ int mdp_vpu_dev_init(struct mdp_vpu_dev *vpu, struct mtk_scp *scp, /* vpu work_size was set in mdp_vpu_ipi_handle_init_ack */ mem_size = vpu_alloc_size; - if (mdp_vpu_shared_mem_alloc(vpu)) { + err = mdp_vpu_shared_mem_alloc(vpu); + if (err) { dev_err(&mdp->pdev->dev, "VPU memory alloc fail!"); goto err_mem_alloc; } -- GitLab From ff464745e4576ed8670bc2fc8da27e022f0ea56c Mon Sep 17 00:00:00 2001 From: Sun Ke Date: Fri, 2 Sep 2022 10:58:19 +0200 Subject: [PATCH 0501/2223] media: platform: mtk-mdp3: fix PM reference leak in mdp_comp_clock_on() mdp_comp_clock_on will increase runtime PM usage counter, and mdp_comp_clock_off will decrease the runtime PM usage counter. so, if mdp_comp_clock_on failed after increment runtime PM usage counter, it should decrease it before return a error code. pm_runtime_get_sync will increment pm usage counter even it failed. Forgetting to putting operation will result in reference leak here. Fix it by replacing it with pm_runtime_resume_and_get to keep usage counter balanced. And if failed to enable clk, add pm_runtime_put() to decrease the runtime PM usage counter. 
Fixes: 61890ccaefaf ("media: platform: mtk-mdp3: add MediaTek MDP3 driver") Signed-off-by: Sun Ke Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/mediatek/mdp3/mtk-mdp3-comp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/media/platform/mediatek/mdp3/mtk-mdp3-comp.c b/drivers/media/platform/mediatek/mdp3/mtk-mdp3-comp.c index 43455755a5ac3..d3eaf8884412d 100644 --- a/drivers/media/platform/mediatek/mdp3/mtk-mdp3-comp.c +++ b/drivers/media/platform/mediatek/mdp3/mtk-mdp3-comp.c @@ -682,7 +682,7 @@ int mdp_comp_clock_on(struct device *dev, struct mdp_comp *comp) int i, ret; if (comp->comp_dev) { - ret = pm_runtime_get_sync(comp->comp_dev); + ret = pm_runtime_resume_and_get(comp->comp_dev); if (ret < 0) { dev_err(dev, "Failed to get power, err %d. type:%d id:%d\n", @@ -699,6 +699,7 @@ int mdp_comp_clock_on(struct device *dev, struct mdp_comp *comp) dev_err(dev, "Failed to enable clk %d. type:%d id:%d\n", i, comp->type, comp->id); + pm_runtime_put(comp->comp_dev); return ret; } } @@ -930,7 +931,7 @@ void mdp_comp_destroy(struct mdp_dev *mdp) if (mdp->comp[i]) { pm_runtime_disable(mdp->comp[i]->comp_dev); mdp_comp_deinit(mdp->comp[i]); - kfree(mdp->comp[i]); + devm_kfree(mdp->comp[i]->comp_dev, mdp->comp[i]); mdp->comp[i] = NULL; } } -- GitLab From 08b91227471f85ba241707994b9cf3c006d2620e Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Mon, 5 Sep 2022 11:02:03 +0200 Subject: [PATCH 0502/2223] media: tuners: Remove the unneeded result variable Return the value xc_send_i2c_data() directly instead of storing it in another redundant variable. 
Signed-off-by: ye xingchen Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/tuners/xc4000.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/media/tuners/xc4000.c b/drivers/media/tuners/xc4000.c index a04dfd5799f77..d59b4ab774302 100644 --- a/drivers/media/tuners/xc4000.c +++ b/drivers/media/tuners/xc4000.c @@ -282,15 +282,13 @@ static int xc4000_tuner_reset(struct dvb_frontend *fe) static int xc_write_reg(struct xc4000_priv *priv, u16 regAddr, u16 i2cData) { u8 buf[4]; - int result; buf[0] = (regAddr >> 8) & 0xFF; buf[1] = regAddr & 0xFF; buf[2] = (i2cData >> 8) & 0xFF; buf[3] = i2cData & 0xFF; - result = xc_send_i2c_data(priv, buf, 4); - return result; + return xc_send_i2c_data(priv, buf, 4); } static int xc_load_i2c_sequence(struct dvb_frontend *fe, const u8 *i2c_sequence) -- GitLab From d630f17a4efb69d9125ce66671cf87a71b8d0b69 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 5 Sep 2022 12:14:16 +0200 Subject: [PATCH 0503/2223] media: MAINTAINERS: adjust entry to zoran driver movement Commit 2a0c28063de2 ("media: zoran: move to mainline") moves the zoran driver from the staging to the media subsystem, but does not adjust the entry in MAINTAINERS. Hence, ./scripts/get_maintainer.pl --self-test=patterns complains about a broken reference. Repair this file reference in ZR36067 VIDEO FOR LINUX DRIVER. 
Signed-off-by: Lukas Bulwahn Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a58f1fc6dd47f..e4bcd9665dbd3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22508,7 +22508,7 @@ S: Maintained W: http://mjpeg.sourceforge.net/driver-zoran/ Q: https://patchwork.linuxtv.org/project/linux-media/list/ F: Documentation/driver-api/media/drivers/zoran.rst -F: drivers/staging/media/zoran/ +F: drivers/media/pci/zoran/ ZRAM COMPRESSED RAM BLOCK DEVICE DRVIER M: Minchan Kim -- GitLab From a1f32d288224ad9cf02968819e6800f6366d0d8f Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 5 Sep 2022 12:29:39 +0200 Subject: [PATCH 0504/2223] media: MAINTAINERS: rectify entry in SAA7146 VIDEO4LINUX-2 DRIVER Commit e33fdb5a0249 ("media: saa7146: deprecate hexium_gemini/orion, mxb and ttpci") moves some media drivers to the staging subsystem, and unfortunately only partially adjusts the entry in MAINTAINERS. Hence, ./scripts/get_maintainer.pl --self-test=patterns complains about a broken reference. As the files matching include/media/drv-intf/saa7146* are moved to drivers/staging/media/deprecated/saa7146/common, this directory is already covered by the existing file entry drivers/staging/media/deprecated/saa7146/. Repair this file reference in SAA7146 VIDEO4LINUX-2 DRIVER. 
Signed-off-by: Lukas Bulwahn Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index e4bcd9665dbd3..5135aa7d713cc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17911,7 +17911,6 @@ L: linux-media@vger.kernel.org S: Maintained T: git git://linuxtv.org/media_tree.git F: drivers/staging/media/deprecated/saa7146/ -F: include/media/drv-intf/saa7146* SAFESETID SECURITY MODULE M: Micah Morton -- GitLab From 594b6bdde2e7833a56413de5092b6e4188d33ff7 Mon Sep 17 00:00:00 2001 From: Rory Liu Date: Tue, 6 Sep 2022 05:30:16 +0200 Subject: [PATCH 0505/2223] media: platform: cros-ec: Add Kuldax to the match table The Google Kuldax device uses the same approach as the Google Brask which enables the HDMI CEC via the cros-ec-cec driver. Signed-off-by: Rory Liu Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/platform/cros-ec/cros-ec-cec.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/cec/platform/cros-ec/cros-ec-cec.c b/drivers/media/cec/platform/cros-ec/cros-ec-cec.c index e5ebaa58be457..6ebedc71d67d4 100644 --- a/drivers/media/cec/platform/cros-ec/cros-ec-cec.c +++ b/drivers/media/cec/platform/cros-ec/cros-ec-cec.c @@ -223,6 +223,8 @@ static const struct cec_dmi_match cec_dmi_match_table[] = { { "Google", "Moli", "0000:00:02.0", "Port B" }, /* Google Kinox */ { "Google", "Kinox", "0000:00:02.0", "Port B" }, + /* Google Kuldax */ + { "Google", "Kuldax", "0000:00:02.0", "Port B" }, }; static struct device *cros_ec_cec_find_hdmi_dev(struct device *dev, -- GitLab From 7718999356234d9cc6a11b4641bb773928f1390f Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Tue, 6 Sep 2022 09:46:30 +0200 Subject: [PATCH 0506/2223] media: meson: vdec: fix possible refcount leak in vdec_probe() v4l2_device_unregister need to be called to put the refcount got by v4l2_device_register when vdec_probe fails or vdec_remove is called. 
Signed-off-by: Hangyu Hua Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/meson/vdec/vdec.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/staging/media/meson/vdec/vdec.c b/drivers/staging/media/meson/vdec/vdec.c index 8549d95be0f25..52f224d8def10 100644 --- a/drivers/staging/media/meson/vdec/vdec.c +++ b/drivers/staging/media/meson/vdec/vdec.c @@ -1102,6 +1102,7 @@ static int vdec_probe(struct platform_device *pdev) err_vdev_release: video_device_release(vdev); + v4l2_device_unregister(&core->v4l2_dev); return ret; } @@ -1110,6 +1111,7 @@ static int vdec_remove(struct platform_device *pdev) struct amvdec_core *core = platform_get_drvdata(pdev); video_unregister_device(core->vdev_dec); + v4l2_device_unregister(&core->v4l2_dev); return 0; } -- GitLab From 4bec03301ecd81760c159402467dbb2cfd527684 Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Mon, 29 Aug 2022 18:21:53 +0200 Subject: [PATCH 0507/2223] media: hantro: Store HEVC bit depth in context Store HEVC bit depth in context. Bit depth is equal to hevc sps bit_depth_luma_minus8 + 8. 
Signed-off-by: Benjamin Gaignard Reviewed-by: Ezequiel Garcia Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/verisilicon/hantro_drv.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/media/platform/verisilicon/hantro_drv.c b/drivers/media/platform/verisilicon/hantro_drv.c index 2036f72eeb4af..1dd8312d824ce 100644 --- a/drivers/media/platform/verisilicon/hantro_drv.c +++ b/drivers/media/platform/verisilicon/hantro_drv.c @@ -251,6 +251,11 @@ queue_init(void *priv, struct vb2_queue *src_vq, struct vb2_queue *dst_vq) static int hantro_try_ctrl(struct v4l2_ctrl *ctrl) { + struct hantro_ctx *ctx; + + ctx = container_of(ctrl->handler, + struct hantro_ctx, ctrl_handler); + if (ctrl->id == V4L2_CID_STATELESS_H264_SPS) { const struct v4l2_ctrl_h264_sps *sps = ctrl->p_new.p_h264_sps; @@ -272,6 +277,8 @@ static int hantro_try_ctrl(struct v4l2_ctrl *ctrl) if (sps->bit_depth_luma_minus8 != 0) /* Only 8-bit is supported */ return -EINVAL; + + ctx->bit_depth = sps->bit_depth_luma_minus8 + 8; } else if (ctrl->id == V4L2_CID_STATELESS_VP9_FRAME) { const struct v4l2_ctrl_vp9_frame *dec_params = ctrl->p_new.p_vp9_frame; -- GitLab From 8a438580a09ecef78cd6c5825d628b4d5ae1c127 Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Mon, 29 Aug 2022 18:21:54 +0200 Subject: [PATCH 0508/2223] media: hantro: HEVC: Fix auxilary buffer size calculation The SAO and FILTER buffer sizes depend on the bit depth. Make sure we have enough space for 10bit bitstreams.
Signed-off-by: Benjamin Gaignard Reviewed-by: Ezequiel Garcia Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/verisilicon/hantro_hevc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/platform/verisilicon/hantro_hevc.c b/drivers/media/platform/verisilicon/hantro_hevc.c index b990bc98164c3..9383fb7081f6c 100644 --- a/drivers/media/platform/verisilicon/hantro_hevc.c +++ b/drivers/media/platform/verisilicon/hantro_hevc.c @@ -104,7 +104,7 @@ static int tile_buffer_reallocate(struct hantro_ctx *ctx) hevc_dec->tile_bsd.cpu = NULL; } - size = VERT_FILTER_RAM_SIZE * height64 * (num_tile_cols - 1); + size = (VERT_FILTER_RAM_SIZE * height64 * (num_tile_cols - 1) * ctx->bit_depth) / 8; hevc_dec->tile_filter.cpu = dma_alloc_coherent(vpu->dev, size, &hevc_dec->tile_filter.dma, GFP_KERNEL); @@ -112,7 +112,7 @@ static int tile_buffer_reallocate(struct hantro_ctx *ctx) goto err_free_tile_buffers; hevc_dec->tile_filter.size = size; - size = VERT_SAO_RAM_SIZE * height64 * (num_tile_cols - 1); + size = (VERT_SAO_RAM_SIZE * height64 * (num_tile_cols - 1) * ctx->bit_depth) / 8; hevc_dec->tile_sao.cpu = dma_alloc_coherent(vpu->dev, size, &hevc_dec->tile_sao.dma, GFP_KERNEL); -- GitLab From f64853ad7f964b3bf7c1d63b27ca7ef972797a1c Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Mon, 29 Aug 2022 18:21:55 +0200 Subject: [PATCH 0509/2223] media: hantro: HEVC: Fix chroma offset computation The chroma offset depends on the bitstream depth. Make sure that ctx->bit_depth is used to compute it.
Signed-off-by: Benjamin Gaignard Reviewed-by: Ezequiel Garcia Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/verisilicon/hantro_g2_hevc_dec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/verisilicon/hantro_g2_hevc_dec.c b/drivers/media/platform/verisilicon/hantro_g2_hevc_dec.c index 233ecd863d5f1..a917079a6ed30 100644 --- a/drivers/media/platform/verisilicon/hantro_g2_hevc_dec.c +++ b/drivers/media/platform/verisilicon/hantro_g2_hevc_dec.c @@ -12,7 +12,7 @@ static size_t hantro_hevc_chroma_offset(struct hantro_ctx *ctx) { - return ctx->dst_fmt.width * ctx->dst_fmt.height; + return ctx->dst_fmt.width * ctx->dst_fmt.height * ctx->bit_depth / 8; } static size_t hantro_hevc_motion_vectors_offset(struct hantro_ctx *ctx) -- GitLab From 5aa24d729999c3d80034b29fc48f9957ad61fce8 Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Mon, 29 Aug 2022 18:21:56 +0200 Subject: [PATCH 0510/2223] media: hantro: postproc: Configure output regs to support 10bit Move output format setting in postproc and make sure that 8/10bit configuration is correctly set. 
Signed-off-by: Benjamin Gaignard Reviewed-by: Ezequiel Garcia Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/verisilicon/hantro_g2_hevc_dec.c | 2 -- drivers/media/platform/verisilicon/hantro_postproc.c | 7 ++++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/media/platform/verisilicon/hantro_g2_hevc_dec.c b/drivers/media/platform/verisilicon/hantro_g2_hevc_dec.c index a917079a6ed30..a9d4ac84a8d8d 100644 --- a/drivers/media/platform/verisilicon/hantro_g2_hevc_dec.c +++ b/drivers/media/platform/verisilicon/hantro_g2_hevc_dec.c @@ -167,8 +167,6 @@ static void set_params(struct hantro_ctx *ctx) hantro_reg_write(vpu, &g2_bit_depth_y_minus8, sps->bit_depth_luma_minus8); hantro_reg_write(vpu, &g2_bit_depth_c_minus8, sps->bit_depth_chroma_minus8); - hantro_reg_write(vpu, &g2_output_8_bits, 0); - hantro_reg_write(vpu, &g2_hdr_skip_length, compute_header_skip_length(ctx)); min_log2_cb_size = sps->log2_min_luma_coding_block_size_minus3 + 3; diff --git a/drivers/media/platform/verisilicon/hantro_postproc.c b/drivers/media/platform/verisilicon/hantro_postproc.c index a0928c5084342..09d8cf9426895 100644 --- a/drivers/media/platform/verisilicon/hantro_postproc.c +++ b/drivers/media/platform/verisilicon/hantro_postproc.c @@ -114,6 +114,7 @@ static void hantro_postproc_g2_enable(struct hantro_ctx *ctx) struct hantro_dev *vpu = ctx->dev; struct vb2_v4l2_buffer *dst_buf; int down_scale = down_scale_factor(ctx); + int out_depth; size_t chroma_offset; dma_addr_t dst_dma; @@ -132,8 +133,9 @@ static void hantro_postproc_g2_enable(struct hantro_ctx *ctx) hantro_write_addr(vpu, G2_RS_OUT_LUMA_ADDR, dst_dma); hantro_write_addr(vpu, G2_RS_OUT_CHROMA_ADDR, dst_dma + chroma_offset); } + + out_depth = hantro_get_format_depth(ctx->dst_fmt.pixelformat); if (ctx->dev->variant->legacy_regs) { - int out_depth = hantro_get_format_depth(ctx->dst_fmt.pixelformat); u8 pp_shift = 0; if (out_depth > 8) @@ -141,6 +143,9 @@ static void 
hantro_postproc_g2_enable(struct hantro_ctx *ctx) hantro_reg_write(ctx->dev, &g2_rs_out_bit_depth, out_depth); hantro_reg_write(ctx->dev, &g2_pp_pix_shift, pp_shift); + } else { + hantro_reg_write(vpu, &g2_output_8_bits, out_depth > 8 ? 0 : 1); + hantro_reg_write(vpu, &g2_output_format, out_depth > 8 ? 1 : 0); } hantro_reg_write(vpu, &g2_out_rs_e, 1); } -- GitLab From d040a24b5aaede6049fe27f2ea29773ada16a9e3 Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Mon, 29 Aug 2022 18:21:57 +0200 Subject: [PATCH 0511/2223] media: Hantro: HEVC: Allows 10-bit bitstream Stop limiting HEVC support to 8-bits bitstreams also accept 10-bits bitstreams. Signed-off-by: Benjamin Gaignard Reviewed-by: Ezequiel Garcia Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/verisilicon/hantro_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/platform/verisilicon/hantro_drv.c b/drivers/media/platform/verisilicon/hantro_drv.c index 1dd8312d824ce..7c75922e2e983 100644 --- a/drivers/media/platform/verisilicon/hantro_drv.c +++ b/drivers/media/platform/verisilicon/hantro_drv.c @@ -274,8 +274,8 @@ static int hantro_try_ctrl(struct v4l2_ctrl *ctrl) if (sps->bit_depth_luma_minus8 != sps->bit_depth_chroma_minus8) /* Luma and chroma bit depth mismatch */ return -EINVAL; - if (sps->bit_depth_luma_minus8 != 0) - /* Only 8-bit is supported */ + if (sps->bit_depth_luma_minus8 != 0 && sps->bit_depth_luma_minus8 != 2) + /* Only 8-bit and 10-bit are supported */ return -EINVAL; ctx->bit_depth = sps->bit_depth_luma_minus8 + 8; -- GitLab From dc39473d0340071bc04c07ba95c40f2bcf9f8ded Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Mon, 29 Aug 2022 18:21:58 +0200 Subject: [PATCH 0512/2223] media: hantro: imx8m: Enable 10bit decoding Expose 10bit pixel formats to enable 10bit decoding in IMX8M SoCs. 
Signed-off-by: Benjamin Gaignard Reviewed-by: Ezequiel Garcia Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../media/platform/verisilicon/imx8m_vpu_hw.c | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/media/platform/verisilicon/imx8m_vpu_hw.c b/drivers/media/platform/verisilicon/imx8m_vpu_hw.c index 77f574fdfa77b..b390228fd3b4a 100644 --- a/drivers/media/platform/verisilicon/imx8m_vpu_hw.c +++ b/drivers/media/platform/verisilicon/imx8m_vpu_hw.c @@ -162,12 +162,39 @@ static const struct hantro_fmt imx8m_vpu_g2_postproc_fmts[] = { .step_height = MB_DIM, }, }, + { + .fourcc = V4L2_PIX_FMT_P010, + .codec_mode = HANTRO_MODE_NONE, + .postprocessed = true, + .frmsize = { + .min_width = FMT_MIN_WIDTH, + .max_width = FMT_UHD_WIDTH, + .step_width = MB_DIM, + .min_height = FMT_MIN_HEIGHT, + .max_height = FMT_UHD_HEIGHT, + .step_height = MB_DIM, + }, + }, }; static const struct hantro_fmt imx8m_vpu_g2_dec_fmts[] = { { .fourcc = V4L2_PIX_FMT_NV12_4L4, .codec_mode = HANTRO_MODE_NONE, + .match_depth = true, + .frmsize = { + .min_width = FMT_MIN_WIDTH, + .max_width = FMT_UHD_WIDTH, + .step_width = TILE_MB_DIM, + .min_height = FMT_MIN_HEIGHT, + .max_height = FMT_UHD_HEIGHT, + .step_height = TILE_MB_DIM, + }, + }, + { + .fourcc = V4L2_PIX_FMT_P010_4L4, + .codec_mode = HANTRO_MODE_NONE, + .match_depth = true, .frmsize = { .min_width = FMT_MIN_WIDTH, .max_width = FMT_UHD_WIDTH, -- GitLab From 39434d42e87fe23bff9f39d7b7485ad7764297d1 Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Mon, 29 Aug 2022 18:21:59 +0200 Subject: [PATCH 0513/2223] media: hantro: Allows luma and chroma depth to be different Luma and chroma depth are set on different hardware registers. Even if they aren't identical the bitstream can be compliant to HEVC specifications and decoded by the hardware. With this patch TSUNEQBD_A_MAIN10_Technicolor_2 conformance test is successfully decoded. 
Signed-off-by: Benjamin Gaignard Reviewed-by: Ezequiel Garcia Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/verisilicon/hantro_drv.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/media/platform/verisilicon/hantro_drv.c b/drivers/media/platform/verisilicon/hantro_drv.c index 7c75922e2e983..8cb4a68c9119e 100644 --- a/drivers/media/platform/verisilicon/hantro_drv.c +++ b/drivers/media/platform/verisilicon/hantro_drv.c @@ -271,9 +271,6 @@ static int hantro_try_ctrl(struct v4l2_ctrl *ctrl) } else if (ctrl->id == V4L2_CID_STATELESS_HEVC_SPS) { const struct v4l2_ctrl_hevc_sps *sps = ctrl->p_new.p_hevc_sps; - if (sps->bit_depth_luma_minus8 != sps->bit_depth_chroma_minus8) - /* Luma and chroma bit depth mismatch */ - return -EINVAL; if (sps->bit_depth_luma_minus8 != 0 && sps->bit_depth_luma_minus8 != 2) /* Only 8-bit and 10-bit are supported */ return -EINVAL; -- GitLab From 4b7444ff13250d2b10e940978bd72aef7a5561f2 Mon Sep 17 00:00:00 2001 From: Jean-Michel Hautbois Date: Fri, 17 Jun 2022 10:45:19 +0200 Subject: [PATCH 0514/2223] media: staging: ipu3-imgu: Fix BNR wb gain documentation The documentation states that the BNR factor is a multiplier coded as u3.13 and with a range of (0, 8). This is not correct, as the isp is adding 1.0 to the gain applied, ie Pout = { Pin * (1 + Gx) }. It means that a gain of 1.0 should be coded as 0. 
Signed-off-by: Jean-Michel Hautbois Reviewed-by: Kieran Bingham Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/ipu3/include/uapi/intel-ipu3.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/staging/media/ipu3/include/uapi/intel-ipu3.h b/drivers/staging/media/ipu3/include/uapi/intel-ipu3.h index dbdd015ce2201..caa358e0bae40 100644 --- a/drivers/staging/media/ipu3/include/uapi/intel-ipu3.h +++ b/drivers/staging/media/ipu3/include/uapi/intel-ipu3.h @@ -626,8 +626,11 @@ struct ipu3_uapi_stats_3a { * @b: white balance gain for B channel. * @gb: white balance gain for Gb channel. * - * Precision u3.13, range [0, 8). White balance correction is done by applying - * a multiplicative gain to each color channels prior to BNR. + * For BNR parameters WB gain factor for the three channels [Ggr, Ggb, Gb, Gr]. + * Their precision is U3.13 and the range is (0, 8) and the actual gain is + * Gx + 1, it is typically Gx = 1. + * + * Pout = {Pin * (1 + Gx)}. */ struct ipu3_uapi_bnr_static_config_wb_gains_config { __u16 gr; -- GitLab From 85644a9b37ec00912e2ca7bfd58ce22079dd7681 Mon Sep 17 00:00:00 2001 From: Paul Elder Date: Thu, 21 Jul 2022 09:41:38 +0200 Subject: [PATCH 0515/2223] media: ov5640: Use runtime PM Switch to using runtime PM for power management. Make it optional, however, to support ACPI. 
Signed-off-by: Paul Elder Reviewed-by: Laurent Pinchart Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/i2c/ov5640.c | 123 +++++++++++++++++++++++-------------- 1 file changed, 76 insertions(+), 47 deletions(-) diff --git a/drivers/media/i2c/ov5640.c b/drivers/media/i2c/ov5640.c index 502f0b62e9505..94db427e21de2 100644 --- a/drivers/media/i2c/ov5640.c +++ b/drivers/media/i2c/ov5640.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -447,8 +448,6 @@ struct ov5640_dev { /* lock to protect all members below */ struct mutex lock; - int power_count; - struct v4l2_mbus_framefmt fmt; bool pending_fmt_change; @@ -2696,39 +2695,24 @@ power_off: return ret; } -/* --------------- Subdev Operations --------------- */ - -static int ov5640_s_power(struct v4l2_subdev *sd, int on) +static int ov5640_sensor_suspend(struct device *dev) { - struct ov5640_dev *sensor = to_ov5640_dev(sd); - int ret = 0; - - mutex_lock(&sensor->lock); - - /* - * If the power count is modified from 0 to != 0 or from != 0 to 0, - * update the power state. - */ - if (sensor->power_count == !on) { - ret = ov5640_set_power(sensor, !!on); - if (ret) - goto out; - } + struct v4l2_subdev *sd = dev_get_drvdata(dev); + struct ov5640_dev *ov5640 = to_ov5640_dev(sd); - /* Update the power count. */ - sensor->power_count += on ? 
1 : -1; - WARN_ON(sensor->power_count < 0); -out: - mutex_unlock(&sensor->lock); + return ov5640_set_power(ov5640, false); +} - if (on && !ret && sensor->power_count == 1) { - /* restore controls */ - ret = v4l2_ctrl_handler_setup(&sensor->ctrls.handler); - } +static int ov5640_sensor_resume(struct device *dev) +{ + struct v4l2_subdev *sd = dev_get_drvdata(dev); + struct ov5640_dev *ov5640 = to_ov5640_dev(sd); - return ret; + return ov5640_set_power(ov5640, true); } +/* --------------- Subdev Operations --------------- */ + static int ov5640_try_frame_interval(struct ov5640_dev *sensor, struct v4l2_fract *fi, u32 width, u32 height) @@ -3314,6 +3298,9 @@ static int ov5640_g_volatile_ctrl(struct v4l2_ctrl *ctrl) /* v4l2_ctrl_lock() locks our own mutex */ + if (!pm_runtime_get_if_in_use(&sensor->i2c_client->dev)) + return 0; + switch (ctrl->id) { case V4L2_CID_AUTOGAIN: val = ov5640_get_gain(sensor); @@ -3329,6 +3316,8 @@ static int ov5640_g_volatile_ctrl(struct v4l2_ctrl *ctrl) break; } + pm_runtime_put_autosuspend(&sensor->i2c_client->dev); + return 0; } @@ -3358,9 +3347,9 @@ static int ov5640_s_ctrl(struct v4l2_ctrl *ctrl) /* * If the device is not powered up by the host driver do * not apply any controls to H/W at this time. Instead - * the controls will be restored right after power-up. + * the controls will be restored at start streaming time. 
*/ - if (sensor->power_count == 0) + if (!pm_runtime_get_if_in_use(&sensor->i2c_client->dev)) return 0; switch (ctrl->id) { @@ -3402,6 +3391,8 @@ static int ov5640_s_ctrl(struct v4l2_ctrl *ctrl) break; } + pm_runtime_put_autosuspend(&sensor->i2c_client->dev); + return ret; } @@ -3677,6 +3668,18 @@ static int ov5640_s_stream(struct v4l2_subdev *sd, int enable) struct ov5640_dev *sensor = to_ov5640_dev(sd); int ret = 0; + if (enable) { + ret = pm_runtime_resume_and_get(&sensor->i2c_client->dev); + if (ret < 0) + return ret; + + ret = v4l2_ctrl_handler_setup(&sensor->ctrls.handler); + if (ret) { + pm_runtime_put(&sensor->i2c_client->dev); + return ret; + } + } + mutex_lock(&sensor->lock); if (sensor->streaming == !enable) { @@ -3701,8 +3704,13 @@ static int ov5640_s_stream(struct v4l2_subdev *sd, int enable) if (!ret) sensor->streaming = enable; } + out: mutex_unlock(&sensor->lock); + + if (!enable || ret) + pm_runtime_put_autosuspend(&sensor->i2c_client->dev); + return ret; } @@ -3724,7 +3732,6 @@ static int ov5640_init_cfg(struct v4l2_subdev *sd, } static const struct v4l2_subdev_core_ops ov5640_core_ops = { - .s_power = ov5640_s_power, .log_status = v4l2_ctrl_subdev_log_status, .subscribe_event = v4l2_ctrl_subdev_subscribe_event, .unsubscribe_event = v4l2_event_subdev_unsubscribe, @@ -3770,26 +3777,20 @@ static int ov5640_check_chip_id(struct ov5640_dev *sensor) int ret = 0; u16 chip_id; - ret = ov5640_set_power_on(sensor); - if (ret) - return ret; - ret = ov5640_read_reg16(sensor, OV5640_REG_CHIP_ID, &chip_id); if (ret) { dev_err(&client->dev, "%s: failed to read chip identifier\n", __func__); - goto power_off; + return ret; } if (chip_id != 0x5640) { dev_err(&client->dev, "%s: wrong chip identifier, expected 0x5640, got 0x%x\n", __func__, chip_id); - ret = -ENXIO; + return -ENXIO; } -power_off: - ov5640_set_power_off(sensor); - return ret; + return 0; } static int ov5640_probe(struct i2c_client *client) @@ -3880,26 +3881,43 @@ static int ov5640_probe(struct 
i2c_client *client) ret = ov5640_get_regulators(sensor); if (ret) - return ret; + goto entity_cleanup; mutex_init(&sensor->lock); - ret = ov5640_check_chip_id(sensor); + ret = ov5640_init_controls(sensor); if (ret) goto entity_cleanup; - ret = ov5640_init_controls(sensor); - if (ret) + ret = ov5640_sensor_resume(dev); + if (ret) { + dev_err(dev, "failed to power on\n"); goto entity_cleanup; + } + + pm_runtime_set_active(dev); + pm_runtime_get_noresume(dev); + pm_runtime_enable(dev); + + ret = ov5640_check_chip_id(sensor); + if (ret) + goto err_pm_runtime; ret = v4l2_async_register_subdev_sensor(&sensor->sd); if (ret) - goto free_ctrls; + goto err_pm_runtime; + + pm_runtime_set_autosuspend_delay(dev, 1000); + pm_runtime_use_autosuspend(dev); + pm_runtime_put_autosuspend(dev); return 0; -free_ctrls: +err_pm_runtime: + pm_runtime_put_noidle(dev); + pm_runtime_disable(dev); v4l2_ctrl_handler_free(&sensor->ctrls.handler); + ov5640_sensor_suspend(dev); entity_cleanup: media_entity_cleanup(&sensor->sd.entity); mutex_destroy(&sensor->lock); @@ -3910,6 +3928,12 @@ static int ov5640_remove(struct i2c_client *client) { struct v4l2_subdev *sd = i2c_get_clientdata(client); struct ov5640_dev *sensor = to_ov5640_dev(sd); + struct device *dev = &client->dev; + + pm_runtime_disable(dev); + if (!pm_runtime_status_suspended(dev)) + ov5640_sensor_suspend(dev); + pm_runtime_set_suspended(dev); v4l2_async_unregister_subdev(&sensor->sd); media_entity_cleanup(&sensor->sd.entity); @@ -3919,6 +3943,10 @@ static int ov5640_remove(struct i2c_client *client) return 0; } +static const struct dev_pm_ops ov5640_pm_ops = { + SET_RUNTIME_PM_OPS(ov5640_sensor_suspend, ov5640_sensor_resume, NULL) +}; + static const struct i2c_device_id ov5640_id[] = { {"ov5640", 0}, {}, @@ -3935,6 +3963,7 @@ static struct i2c_driver ov5640_i2c_driver = { .driver = { .name = "ov5640", .of_match_table = ov5640_dt_ids, + .pm = &ov5640_pm_ops, }, .id_table = ov5640_id, .probe_new = ov5640_probe, -- GitLab From 
bb9ea2c31fa11b789ade4c3abcdda3c5370a76ab Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Fri, 22 Jul 2022 09:11:31 +0200 Subject: [PATCH 0516/2223] media: v4l2: Fix v4l2_i2c_subdev_set_name function documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The doc says the I²C device's name is used if devname is NULL, but actually the I²C device driver's name is used. Fixes: 0658293012af ("media: v4l: subdev: Add a function to set an I²C sub-device's name") Signed-off-by: Alexander Stein Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-common.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h index b708d63995f45..2ae2be4c87e5a 100644 --- a/include/media/v4l2-common.h +++ b/include/media/v4l2-common.h @@ -175,7 +175,8 @@ struct v4l2_subdev *v4l2_i2c_new_subdev_board(struct v4l2_device *v4l2_dev, * * @sd: pointer to &struct v4l2_subdev * @client: pointer to struct i2c_client - * @devname: the name of the device; if NULL, the I²C device's name will be used + * @devname: the name of the device; if NULL, the I²C device driver's name + * will be used * @postfix: sub-device specific string to put right after the I²C device name; * may be NULL */ -- GitLab From 9c6dee9ac62931987bc45add5dfe6d535a1d9f80 Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Fri, 22 Jul 2022 03:52:11 +0200 Subject: [PATCH 0517/2223] media: i2c: mt9v111: Fix typo 'the the' in comment Replace 'the the' with 'the' in the comment.
Signed-off-by: Slark Xiao Reviewed-by: Jacopo Mondi Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/i2c/mt9v111.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/i2c/mt9v111.c b/drivers/media/i2c/mt9v111.c index 2dc4a0f24ce86..7beca0b70b720 100644 --- a/drivers/media/i2c/mt9v111.c +++ b/drivers/media/i2c/mt9v111.c @@ -633,7 +633,7 @@ static int mt9v111_hw_config(struct mt9v111_dev *mt9v111) /* * Set pixel integration time to the whole frame time. - * This value controls the the shutter delay when running with AE + * This value controls the shutter delay when running with AE * disabled. If longer than frame time, it affects the output * frame rate. */ -- GitLab From b5f8fa876931c1adfd2c5eca5b189fd2be893238 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 15 Jul 2022 10:59:24 +0200 Subject: [PATCH 0518/2223] media: ar0521: fix error return code in ar0521_power_on() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return error code if ar0521_write_regs() fails in ar0521_power_on(). 
Fixes: 852b50aeed15 ("media: On Semi AR0521 sensor driver") Signed-off-by: Yang Yingliang Acked-by: Krzysztof Hałasa Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/i2c/ar0521.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/media/i2c/ar0521.c b/drivers/media/i2c/ar0521.c index c7bdfc69b9be8..e850c92d847e4 100644 --- a/drivers/media/i2c/ar0521.c +++ b/drivers/media/i2c/ar0521.c @@ -757,8 +757,9 @@ static int ar0521_power_on(struct device *dev) usleep_range(4500, 5000); /* min 45000 clocks */ for (cnt = 0; cnt < ARRAY_SIZE(initial_regs); cnt++) - if (ar0521_write_regs(sensor, initial_regs[cnt].data, - initial_regs[cnt].count)) + ret = ar0521_write_regs(sensor, initial_regs[cnt].data, + initial_regs[cnt].count); + if (ret) goto off; ret = ar0521_write_reg(sensor, AR0521_REG_SERIAL_FORMAT, -- GitLab From 8fcccd2f37b27503694c6cd82a6c5ab911e69d73 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Sun, 14 Aug 2022 19:42:13 +0200 Subject: [PATCH 0519/2223] media: ar0521: Remove redundant variable ret ret in ar0521_set_fmt is never set to values other than 0. Replace it with plain 0. 
Reported-by: kernel test robot Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/i2c/ar0521.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/media/i2c/ar0521.c b/drivers/media/i2c/ar0521.c index e850c92d847e4..9e90b02b15e36 100644 --- a/drivers/media/i2c/ar0521.c +++ b/drivers/media/i2c/ar0521.c @@ -406,7 +406,6 @@ static int ar0521_set_fmt(struct v4l2_subdev *sd, struct v4l2_subdev_format *format) { struct ar0521_dev *sensor = to_ar0521_dev(sd); - int ret = 0; ar0521_adj_fmt(&format->format); @@ -423,7 +422,7 @@ static int ar0521_set_fmt(struct v4l2_subdev *sd, } mutex_unlock(&sensor->lock); - return ret; + return 0; } static int ar0521_s_ctrl(struct v4l2_ctrl *ctrl) -- GitLab From b9eb3ab6f30bf32f7326909f17949ccb11bab514 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 25 Aug 2022 20:36:37 +0200 Subject: [PATCH 0520/2223] media: ipu3-imgu: Fix NULL pointer dereference in active selection access What the IMGU driver did was that it first acquired the pointers to active and try V4L2 subdev state, and only then figured out which one to use. The problem with that approach and a later patch (see Fixes: tag) is that as sd_state argument to v4l2_subdev_get_try_crop() et al is NULL, there is now an attempt to dereference that. Fix this. Also rewrap lines a little. 
Fixes: 0d346d2a6f54 ("media: v4l2-subdev: add subdev-wide state struct") Cc: stable@vger.kernel.org # for v5.14 and later Signed-off-by: Sakari Ailus Reviewed-by: Bingbu Cao Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/ipu3/ipu3-v4l2.c | 31 ++++++++++++-------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/drivers/staging/media/ipu3/ipu3-v4l2.c b/drivers/staging/media/ipu3/ipu3-v4l2.c index d1c539cefba87..2234bb8d48b34 100644 --- a/drivers/staging/media/ipu3/ipu3-v4l2.c +++ b/drivers/staging/media/ipu3/ipu3-v4l2.c @@ -192,33 +192,30 @@ static int imgu_subdev_get_selection(struct v4l2_subdev *sd, struct v4l2_subdev_state *sd_state, struct v4l2_subdev_selection *sel) { - struct v4l2_rect *try_sel, *r; - struct imgu_v4l2_subdev *imgu_sd = container_of(sd, - struct imgu_v4l2_subdev, - subdev); + struct imgu_v4l2_subdev *imgu_sd = + container_of(sd, struct imgu_v4l2_subdev, subdev); if (sel->pad != IMGU_NODE_IN) return -EINVAL; switch (sel->target) { case V4L2_SEL_TGT_CROP: - try_sel = v4l2_subdev_get_try_crop(sd, sd_state, sel->pad); - r = &imgu_sd->rect.eff; - break; + if (sel->which == V4L2_SUBDEV_FORMAT_TRY) + sel->r = *v4l2_subdev_get_try_crop(sd, sd_state, + sel->pad); + else + sel->r = imgu_sd->rect.eff; + return 0; case V4L2_SEL_TGT_COMPOSE: - try_sel = v4l2_subdev_get_try_compose(sd, sd_state, sel->pad); - r = &imgu_sd->rect.bds; - break; + if (sel->which == V4L2_SUBDEV_FORMAT_TRY) + sel->r = *v4l2_subdev_get_try_compose(sd, sd_state, + sel->pad); + else + sel->r = imgu_sd->rect.bds; + return 0; default: return -EINVAL; } - - if (sel->which == V4L2_SUBDEV_FORMAT_TRY) - sel->r = *try_sel; - else - sel->r = *r; - - return 0; } static int imgu_subdev_set_selection(struct v4l2_subdev *sd, -- GitLab From 2ba3e38517f5a4ebf9c997168079dca01b7f9fc6 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 26 Aug 2022 13:53:58 +0200 Subject: [PATCH 0521/2223] media: v4l: subdev: Fail graciously when getting try data for NULL state 
The state argument for the functions for obtaining various parts of the state is NULL if it is called by drivers for active state. Fail gracefully in that case instead of dereferencing a NULL pointer. Suggested-by: Bingbu Cao Signed-off-by: Sakari Ailus Reviewed-by: Tomi Valkeinen Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-subdev.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h index 9689f38a0af1f..ec1896886dbd6 100644 --- a/include/media/v4l2-subdev.h +++ b/include/media/v4l2-subdev.h @@ -1046,6 +1046,8 @@ v4l2_subdev_get_pad_format(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, unsigned int pad) { + if (WARN_ON(!state)) + return NULL; if (WARN_ON(pad >= sd->entity.num_pads)) pad = 0; return &state->pads[pad].try_fmt; @@ -1064,6 +1066,8 @@ v4l2_subdev_get_pad_crop(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, unsigned int pad) { + if (WARN_ON(!state)) + return NULL; if (WARN_ON(pad >= sd->entity.num_pads)) pad = 0; return &state->pads[pad].try_crop; @@ -1082,6 +1086,8 @@ v4l2_subdev_get_pad_compose(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, unsigned int pad) { + if (WARN_ON(!state)) + return NULL; if (WARN_ON(pad >= sd->entity.num_pads)) pad = 0; return &state->pads[pad].try_compose; -- GitLab From 54bb7671ca6de58929b3994468c330bedb9a3b7e Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 30 Aug 2022 12:32:36 +0200 Subject: [PATCH 0522/2223] media: ar0521: Fix return value check in writing initial registers The return value from register writes is ignored apart from the last value. Fix this.
Reported-by: kernel test robot Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/i2c/ar0521.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/i2c/ar0521.c b/drivers/media/i2c/ar0521.c index 9e90b02b15e36..a586e0fc0ea3f 100644 --- a/drivers/media/i2c/ar0521.c +++ b/drivers/media/i2c/ar0521.c @@ -755,11 +755,12 @@ static int ar0521_power_on(struct device *dev) gpiod_set_value(sensor->reset_gpio, 0); usleep_range(4500, 5000); /* min 45000 clocks */ - for (cnt = 0; cnt < ARRAY_SIZE(initial_regs); cnt++) + for (cnt = 0; cnt < ARRAY_SIZE(initial_regs); cnt++) { ret = ar0521_write_regs(sensor, initial_regs[cnt].data, initial_regs[cnt].count); if (ret) goto off; + } ret = ar0521_write_reg(sensor, AR0521_REG_SERIAL_FORMAT, AR0521_REG_SERIAL_FORMAT_MIPI | -- GitLab From 080e0b7404850406628674b07286f16cc389a892 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 7 Aug 2022 08:43:29 +0200 Subject: [PATCH 0523/2223] media: ov8865: Fix an error handling path in ov8865_probe() The commit in Fixes also introduced some new error handling which should goto the existing error handling path. Otherwise some resources leak. 
Fixes: 73dcffeb2ff9 ("media: i2c: Support 19.2MHz input clock in ov8865") Signed-off-by: Christophe JAILLET Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/i2c/ov8865.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/media/i2c/ov8865.c b/drivers/media/i2c/ov8865.c index b8f4f0d3e33d7..15d0f79231dd6 100644 --- a/drivers/media/i2c/ov8865.c +++ b/drivers/media/i2c/ov8865.c @@ -3034,11 +3034,13 @@ static int ov8865_probe(struct i2c_client *client) &rate); if (!ret && sensor->extclk) { ret = clk_set_rate(sensor->extclk, rate); - if (ret) - return dev_err_probe(dev, ret, - "failed to set clock rate\n"); + if (ret) { + dev_err_probe(dev, ret, "failed to set clock rate\n"); + goto error_endpoint; + } } else if (ret && !sensor->extclk) { - return dev_err_probe(dev, ret, "invalid clock config\n"); + dev_err_probe(dev, ret, "invalid clock config\n"); + goto error_endpoint; } sensor->extclk_rate = rate ? rate : clk_get_rate(sensor->extclk); -- GitLab From ff37bc8c7099b673e9838bfbd0de78eff740316b Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 5 Sep 2022 11:52:35 +0200 Subject: [PATCH 0524/2223] media: sun6i-mipi-csi2: Depend on PHY_SUN6I_MIPI_DPHY PHY_SUN6I_MIPI_DPHY is not a freely selectable option and so may not always be available. Depend on it instead. 
Fixes: 94d7fd9692b5 ("media: sunxi: Depend on GENERIC_PHY_MIPI_DPHY") Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/sunxi/sun6i-mipi-csi2/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/sunxi/sun6i-mipi-csi2/Kconfig b/drivers/media/platform/sunxi/sun6i-mipi-csi2/Kconfig index eb982466abd30..4d072abdfb705 100644 --- a/drivers/media/platform/sunxi/sun6i-mipi-csi2/Kconfig +++ b/drivers/media/platform/sunxi/sun6i-mipi-csi2/Kconfig @@ -4,10 +4,10 @@ config VIDEO_SUN6I_MIPI_CSI2 depends on V4L_PLATFORM_DRIVERS && VIDEO_DEV depends on ARCH_SUNXI || COMPILE_TEST depends on PM && COMMON_CLK + depends on PHY_SUN6I_MIPI_DPHY select MEDIA_CONTROLLER select VIDEO_V4L2_SUBDEV_API select V4L2_FWNODE - select PHY_SUN6I_MIPI_DPHY select GENERIC_PHY_MIPI_DPHY select REGMAP_MMIO help -- GitLab From ac5d4d87e1eb8534c4b8148f77a0771825e780f4 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 5 Sep 2022 14:35:34 +0200 Subject: [PATCH 0525/2223] media: Remove incorrect comment from struct v4l2_fwnode_endpoint struct v4l2_fwnode_endpoint was zeroed previously apart from the endpoint information itself when the endpoint properties were parsed. Now this hasn't been the case for a few years so remove the comment. 
Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-fwnode.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/media/v4l2-fwnode.h b/include/media/v4l2-fwnode.h index 15e4ab6722232..394d798f3dfa4 100644 --- a/include/media/v4l2-fwnode.h +++ b/include/media/v4l2-fwnode.h @@ -45,10 +45,6 @@ struct v4l2_async_subdev; */ struct v4l2_fwnode_endpoint { struct fwnode_endpoint base; - /* - * Fields below this line will be zeroed by - * v4l2_fwnode_endpoint_parse() - */ enum v4l2_mbus_type bus_type; struct { struct v4l2_mbus_config_parallel parallel; -- GitLab From b558ce56b434d8ca633ccc31a4acfee79a29a7a5 Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Wed, 31 Aug 2022 16:13:24 +0200 Subject: [PATCH 0526/2223] media: Documentation: mc: add definitions for stream and pipeline The doc talks about streams and pipelines, but doesn't really define them. This is an attempt to define them according to my understanding. Signed-off-by: Tomi Valkeinen Reviewed-by: Laurent Pinchart Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- Documentation/driver-api/media/mc-core.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/driver-api/media/mc-core.rst b/Documentation/driver-api/media/mc-core.rst index 84aa7cdb53419..4bb062d5c2e77 100644 --- a/Documentation/driver-api/media/mc-core.rst +++ b/Documentation/driver-api/media/mc-core.rst @@ -214,6 +214,18 @@ Link properties can be modified at runtime by calling Pipelines and media streams ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +A media stream is a stream of pixels or metadata originating from one or more +source devices (such as sensors) and flowing through media entity pads +towards the final sinks. The stream can be modified on the route by the +devices (e.g. scaling or pixel format conversions), or it can be split into +multiple branches, or multiple branches can be merged. + +A media pipeline is a set of media streams which are interdependent. 
This +interdependency can be caused by the hardware (e.g. configuration of a second +stream cannot be changed if the first stream has been enabled) or by the driver +due to the software design. Most commonly a media pipeline consists of a single +stream which does not branch. + When starting streaming, drivers must notify all entities in the pipeline to prevent link states from being modified during streaming by calling :c:func:`media_pipeline_start()`. -- GitLab From c7097c80ca684f6476a55b2e202b975cf3e36b46 Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Wed, 31 Aug 2022 16:13:25 +0200 Subject: [PATCH 0527/2223] media: media-entity.h: add include for min() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ./include/media/media-entity.h:595:34: error: implicit declaration of function ‘min’ Include minmax.h to get the definition for min(). Signed-off-by: Tomi Valkeinen Reviewed-by: Laurent Pinchart Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- include/media/media-entity.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/media/media-entity.h b/include/media/media-entity.h index f16ffe70f7a64..4a67b1dfdc696 100644 --- a/include/media/media-entity.h +++ b/include/media/media-entity.h @@ -17,6 +17,7 @@ #include #include #include +#include #include /* Enums used internally at the media controller to represent graphs */ -- GitLab From 87d36eb84d4f45657bb422af36c9eed0161cd032 Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Wed, 31 Aug 2022 16:13:26 +0200 Subject: [PATCH 0528/2223] media: subdev: increase V4L2_FRAME_DESC_ENTRY_MAX to 8 V4L2_FRAME_DESC_ENTRY_MAX is currently set to 4. In theory it's possible to have an arbitrary amount of streams in a single pad, so preferably there should be no hardcoded maximum number. However, I believe a reasonable max is 8, which would cover a CSI-2 pad with 4 streams of pixel data and 4 streams of metadata. 
Signed-off-by: Tomi Valkeinen Reviewed-by: Laurent Pinchart Reviewed-by: Hans Verkuil Reviewed-by: Jacopo Mondi Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-subdev.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h index ec1896886dbd6..2f80c9c818ed0 100644 --- a/include/media/v4l2-subdev.h +++ b/include/media/v4l2-subdev.h @@ -358,7 +358,11 @@ struct v4l2_mbus_frame_desc_entry { } bus; }; -#define V4L2_FRAME_DESC_ENTRY_MAX 4 + /* + * If this number is too small, it should be dropped altogether and the + * API switched to a dynamic number of frame descriptor entries. + */ +#define V4L2_FRAME_DESC_ENTRY_MAX 8 /** * enum v4l2_mbus_frame_desc_type - media bus frame description type -- GitLab From b7319e2bd7bd7740a405719727e6fc01be1363ef Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Wed, 31 Aug 2022 16:13:27 +0200 Subject: [PATCH 0529/2223] media: mc: entity: Rename streaming_count -> start_count 'streaming_count' is a somewhat misleading name, as the count is increased with media_pipeline_start(). Let's rename it to 'start_count' instead. 
Signed-off-by: Tomi Valkeinen Reviewed-by: Laurent Pinchart Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/mc/mc-entity.c | 8 ++++---- drivers/media/platform/rockchip/rkisp1/rkisp1-capture.c | 4 ++-- include/media/media-entity.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/media/mc/mc-entity.c b/drivers/media/mc/mc-entity.c index afd1bd7ff7b6e..67d009b617ce1 100644 --- a/drivers/media/mc/mc-entity.c +++ b/drivers/media/mc/mc-entity.c @@ -415,8 +415,8 @@ __must_check int __media_pipeline_start(struct media_entity *entity, struct media_link *link; int ret; - if (pipe->streaming_count) { - pipe->streaming_count++; + if (pipe->start_count) { + pipe->start_count++; return 0; } @@ -499,7 +499,7 @@ __must_check int __media_pipeline_start(struct media_entity *entity, } } - pipe->streaming_count++; + pipe->start_count++; return 0; @@ -552,7 +552,7 @@ void __media_pipeline_stop(struct media_entity *entity) if (WARN_ON(!pipe)) return; - if (--pipe->streaming_count) + if (--pipe->start_count) return; media_graph_walk_start(graph, entity); diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-capture.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-capture.c index c66963a2ccd99..6ef09579dc21b 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-capture.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-capture.c @@ -926,7 +926,7 @@ static void rkisp1_pipeline_stream_disable(struct rkisp1_capture *cap) * If the other capture is streaming, isp and sensor nodes shouldn't * be disabled, skip them. */ - if (rkisp1->pipe.streaming_count < 2) + if (rkisp1->pipe.start_count < 2) v4l2_subdev_call(&rkisp1->isp.sd, video, s_stream, false); v4l2_subdev_call(&rkisp1->resizer_devs[cap->id].sd, video, s_stream, @@ -956,7 +956,7 @@ static int rkisp1_pipeline_stream_enable(struct rkisp1_capture *cap) * If the other capture is streaming, isp and sensor nodes are already * enabled, skip them. 
*/ - if (rkisp1->pipe.streaming_count > 1) + if (rkisp1->pipe.start_count > 1) return 0; ret = v4l2_subdev_call(&rkisp1->isp.sd, video, s_stream, true); diff --git a/include/media/media-entity.h b/include/media/media-entity.h index 4a67b1dfdc696..198ea1416ddd5 100644 --- a/include/media/media-entity.h +++ b/include/media/media-entity.h @@ -100,11 +100,11 @@ struct media_graph { /** * struct media_pipeline - Media pipeline related information * - * @streaming_count: Streaming start count - streaming stop count + * @start_count: Media pipeline start - stop count * @graph: Media graph walk during pipeline start / stop */ struct media_pipeline { - int streaming_count; + int start_count; struct media_graph graph; }; -- GitLab From 8db465f7d6a0fb573d8f7c953d336b8470c0e831 Mon Sep 17 00:00:00 2001 From: Jacopo Mondi Date: Wed, 31 Aug 2022 16:13:28 +0200 Subject: [PATCH 0530/2223] media: mc: entity: Add iterator helper for entity pads Add an iterator helper to easily cycle through all pads in an entity and use it in media-entity and media-device code where appropriate. 
Signed-off-by: Jacopo Mondi Reviewed-by: Laurent Pinchart Signed-off-by: Tomi Valkeinen Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/mc/mc-device.c | 13 ++++++------- drivers/media/mc/mc-entity.c | 11 ++++++----- include/media/media-entity.h | 12 ++++++++++++ 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/drivers/media/mc/mc-device.c b/drivers/media/mc/mc-device.c index b8176a3b76d3b..25020d58eb06e 100644 --- a/drivers/media/mc/mc-device.c +++ b/drivers/media/mc/mc-device.c @@ -581,7 +581,7 @@ static void __media_device_unregister_entity(struct media_entity *entity) struct media_device *mdev = entity->graph_obj.mdev; struct media_link *link, *tmp; struct media_interface *intf; - unsigned int i; + struct media_pad *iter; ida_free(&mdev->entity_internal_idx, entity->internal_idx); @@ -597,8 +597,8 @@ static void __media_device_unregister_entity(struct media_entity *entity) __media_entity_remove_links(entity); /* Remove all pads that belong to this entity */ - for (i = 0; i < entity->num_pads; i++) - media_gobj_destroy(&entity->pads[i].graph_obj); + media_entity_for_each_pad(entity, iter) + media_gobj_destroy(&iter->graph_obj); /* Remove the entity */ media_gobj_destroy(&entity->graph_obj); @@ -610,7 +610,7 @@ int __must_check media_device_register_entity(struct media_device *mdev, struct media_entity *entity) { struct media_entity_notify *notify, *next; - unsigned int i; + struct media_pad *iter; int ret; if (entity->function == MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN || @@ -639,9 +639,8 @@ int __must_check media_device_register_entity(struct media_device *mdev, media_gobj_create(mdev, MEDIA_GRAPH_ENTITY, &entity->graph_obj); /* Initialize objects at the pads */ - for (i = 0; i < entity->num_pads; i++) - media_gobj_create(mdev, MEDIA_GRAPH_PAD, - &entity->pads[i].graph_obj); + media_entity_for_each_pad(entity, iter) + media_gobj_create(mdev, MEDIA_GRAPH_PAD, &iter->graph_obj); /* invoke entity_notify callbacks */ 
list_for_each_entry_safe(notify, next, &mdev->entity_notify, list) diff --git a/drivers/media/mc/mc-entity.c b/drivers/media/mc/mc-entity.c index 67d009b617ce1..682f424a15caf 100644 --- a/drivers/media/mc/mc-entity.c +++ b/drivers/media/mc/mc-entity.c @@ -193,7 +193,8 @@ int media_entity_pads_init(struct media_entity *entity, u16 num_pads, struct media_pad *pads) { struct media_device *mdev = entity->graph_obj.mdev; - unsigned int i; + struct media_pad *iter; + unsigned int i = 0; if (num_pads >= MEDIA_ENTITY_MAX_PADS) return -E2BIG; @@ -204,12 +205,12 @@ int media_entity_pads_init(struct media_entity *entity, u16 num_pads, if (mdev) mutex_lock(&mdev->graph_mutex); - for (i = 0; i < num_pads; i++) { - pads[i].entity = entity; - pads[i].index = i; + media_entity_for_each_pad(entity, iter) { + iter->entity = entity; + iter->index = i++; if (mdev) media_gobj_create(mdev, MEDIA_GRAPH_PAD, - &entity->pads[i].graph_obj); + &iter->graph_obj); } if (mdev) diff --git a/include/media/media-entity.h b/include/media/media-entity.h index 198ea1416ddd5..a5a50350e954d 100644 --- a/include/media/media-entity.h +++ b/include/media/media-entity.h @@ -316,6 +316,18 @@ struct media_entity { } info; }; +/** + * media_entity_for_each_pad - Iterate on all pads in an entity + * @entity: The entity the pads belong to + * @iter: The iterator pad + * + * Iterate on all pads in a media entity. + */ +#define media_entity_for_each_pad(entity, iter) \ + for (iter = (entity)->pads; \ + iter < &(entity)->pads[(entity)->num_pads]; \ + ++iter) + /** * struct media_interface - A media interface graph object. 
* -- GitLab From 49b38947d7841abb6e60c15968f03b2daa2d54d7 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 31 Aug 2022 16:13:29 +0200 Subject: [PATCH 0531/2223] media: mc: entity: Merge media_entity_enum_init and __media_entity_enum_init The media_entity_enum_init() function is a wrapper around __media_entity_enum_init() that turns a media_device pointer argument into the maximum entity ID in the corresponding media graph. __media_entity_enum_init() is never used outside of media_entity_enum_init(), so the two functions can be merged together. Signed-off-by: Laurent Pinchart Signed-off-by: Tomi Valkeinen Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/mc/mc-entity.c | 10 ++++++---- include/media/media-device.h | 15 --------------- include/media/media-entity.h | 10 +++++----- 3 files changed, 11 insertions(+), 24 deletions(-) diff --git a/drivers/media/mc/mc-entity.c b/drivers/media/mc/mc-entity.c index 682f424a15caf..48d8cc98ae043 100644 --- a/drivers/media/mc/mc-entity.c +++ b/drivers/media/mc/mc-entity.c @@ -59,10 +59,12 @@ static inline const char *link_type_name(struct media_link *link) } } -__must_check int __media_entity_enum_init(struct media_entity_enum *ent_enum, - int idx_max) +__must_check int media_entity_enum_init(struct media_entity_enum *ent_enum, + struct media_device *mdev) { - idx_max = ALIGN(idx_max, BITS_PER_LONG); + int idx_max; + + idx_max = ALIGN(mdev->entity_internal_idx_max + 1, BITS_PER_LONG); ent_enum->bmap = bitmap_zalloc(idx_max, GFP_KERNEL); if (!ent_enum->bmap) return -ENOMEM; @@ -71,7 +73,7 @@ __must_check int __media_entity_enum_init(struct media_entity_enum *ent_enum, return 0; } -EXPORT_SYMBOL_GPL(__media_entity_enum_init); +EXPORT_SYMBOL_GPL(media_entity_enum_init); void media_entity_enum_cleanup(struct media_entity_enum *ent_enum) { diff --git a/include/media/media-device.h b/include/media/media-device.h index a10b305075242..86716ee7cc6ce 100644 --- a/include/media/media-device.h +++ 
b/include/media/media-device.h @@ -191,21 +191,6 @@ struct usb_device; #define MEDIA_DEV_NOTIFY_PRE_LINK_CH 0 #define MEDIA_DEV_NOTIFY_POST_LINK_CH 1 -/** - * media_entity_enum_init - Initialise an entity enumeration - * - * @ent_enum: Entity enumeration to be initialised - * @mdev: The related media device - * - * Return: zero on success or a negative error code. - */ -static inline __must_check int media_entity_enum_init( - struct media_entity_enum *ent_enum, struct media_device *mdev) -{ - return __media_entity_enum_init(ent_enum, - mdev->entity_internal_idx_max + 1); -} - /** * media_device_init() - Initializes a media device element * diff --git a/include/media/media-entity.h b/include/media/media-entity.h index a5a50350e954d..1030e45e8ee64 100644 --- a/include/media/media-entity.h +++ b/include/media/media-entity.h @@ -439,15 +439,15 @@ static inline bool is_media_entity_v4l2_subdev(struct media_entity *entity) } /** - * __media_entity_enum_init - Initialise an entity enumeration + * media_entity_enum_init - Initialise an entity enumeration * * @ent_enum: Entity enumeration to be initialised - * @idx_max: Maximum number of entities in the enumeration + * @mdev: The related media device * - * Return: Returns zero on success or a negative error code. + * Return: zero on success or a negative error code. */ -__must_check int __media_entity_enum_init(struct media_entity_enum *ent_enum, - int idx_max); +__must_check int media_entity_enum_init(struct media_entity_enum *ent_enum, + struct media_device *mdev); /** * media_entity_enum_cleanup - Release resources of an entity enumeration -- GitLab From 612589a35e99fcbb7c85d8ba21b01f0249cc188d Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 31 Aug 2022 16:13:30 +0200 Subject: [PATCH 0532/2223] media: mc: entity: Move media_entity_get_fwnode_pad() out of graph walk section The media_entity_get_fwnode_pad() function is unrelated to the graph traversal code that it is currently bundled with. 
Move it with the media_entity_remote_pad() function. Signed-off-by: Laurent Pinchart Signed-off-by: Tomi Valkeinen Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/mc/mc-entity.c | 70 ++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/drivers/media/mc/mc-entity.c b/drivers/media/mc/mc-entity.c index 48d8cc98ae043..c5c66befed0f2 100644 --- a/drivers/media/mc/mc-entity.c +++ b/drivers/media/mc/mc-entity.c @@ -370,41 +370,6 @@ struct media_entity *media_graph_walk_next(struct media_graph *graph) } EXPORT_SYMBOL_GPL(media_graph_walk_next); -int media_entity_get_fwnode_pad(struct media_entity *entity, - struct fwnode_handle *fwnode, - unsigned long direction_flags) -{ - struct fwnode_endpoint endpoint; - unsigned int i; - int ret; - - if (!entity->ops || !entity->ops->get_fwnode_pad) { - for (i = 0; i < entity->num_pads; i++) { - if (entity->pads[i].flags & direction_flags) - return i; - } - - return -ENXIO; - } - - ret = fwnode_graph_parse_endpoint(fwnode, &endpoint); - if (ret) - return ret; - - ret = entity->ops->get_fwnode_pad(entity, &endpoint); - if (ret < 0) - return ret; - - if (ret >= entity->num_pads) - return -ENXIO; - - if (!(entity->pads[ret].flags & direction_flags)) - return -ENXIO; - - return ret; -} -EXPORT_SYMBOL_GPL(media_entity_get_fwnode_pad); - /* ----------------------------------------------------------------------------- * Pipeline management */ @@ -994,6 +959,41 @@ struct media_pad *media_pad_remote_pad_unique(const struct media_pad *pad) } EXPORT_SYMBOL_GPL(media_pad_remote_pad_unique); +int media_entity_get_fwnode_pad(struct media_entity *entity, + struct fwnode_handle *fwnode, + unsigned long direction_flags) +{ + struct fwnode_endpoint endpoint; + unsigned int i; + int ret; + + if (!entity->ops || !entity->ops->get_fwnode_pad) { + for (i = 0; i < entity->num_pads; i++) { + if (entity->pads[i].flags & direction_flags) + return i; + } + + return -ENXIO; + } + + 
ret = fwnode_graph_parse_endpoint(fwnode, &endpoint); + if (ret) + return ret; + + ret = entity->ops->get_fwnode_pad(entity, &endpoint); + if (ret < 0) + return ret; + + if (ret >= entity->num_pads) + return -ENXIO; + + if (!(entity->pads[ret].flags & direction_flags)) + return -ENXIO; + + return ret; +} +EXPORT_SYMBOL_GPL(media_entity_get_fwnode_pad); + static void media_interface_init(struct media_device *mdev, struct media_interface *intf, u32 gobj_type, -- GitLab From 72b603357ae461c0f19ca05d6624b4afd5c74b47 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 31 Aug 2022 16:13:31 +0200 Subject: [PATCH 0533/2223] media: mc: entity: Add media_entity_pipeline() to access the media pipeline Replace direct access to the pipe field in drivers with a new helper function. This will allow easier refactoring of media pipeline handling in the MC core behind the scenes without affecting drivers. Signed-off-by: Laurent Pinchart Signed-off-by: Tomi Valkeinen Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/mc/mc-entity.c | 6 ++++++ .../platform/renesas/rcar-vin/rcar-core.c | 5 ++--- .../media/platform/renesas/rcar-vin/rcar-dma.c | 2 +- drivers/media/platform/ti/omap3isp/isp.c | 4 +--- drivers/media/platform/ti/omap3isp/ispvideo.c | 3 +-- drivers/media/platform/ti/omap3isp/ispvideo.h | 11 +++++++++-- drivers/media/platform/xilinx/xilinx-dma.c | 3 +-- drivers/media/platform/xilinx/xilinx-dma.h | 7 ++++++- drivers/staging/media/imx/imx-media-utils.c | 2 +- drivers/staging/media/omap4iss/iss.c | 4 +--- drivers/staging/media/omap4iss/iss_video.c | 3 +-- drivers/staging/media/omap4iss/iss_video.h | 11 +++++++++-- include/media/media-entity.h | 18 ++++++++++++++++++ 13 files changed, 57 insertions(+), 22 deletions(-) diff --git a/drivers/media/mc/mc-entity.c b/drivers/media/mc/mc-entity.c index c5c66befed0f2..7fb97c6dc897a 100644 --- a/drivers/media/mc/mc-entity.c +++ b/drivers/media/mc/mc-entity.c @@ -994,6 +994,12 @@ int 
media_entity_get_fwnode_pad(struct media_entity *entity, } EXPORT_SYMBOL_GPL(media_entity_get_fwnode_pad); +struct media_pipeline *media_entity_pipeline(struct media_entity *entity) +{ + return entity->pipe; +} +EXPORT_SYMBOL_GPL(media_entity_pipeline); + static void media_interface_init(struct media_device *mdev, struct media_interface *intf, u32 gobj_type, diff --git a/drivers/media/platform/renesas/rcar-vin/rcar-core.c b/drivers/media/platform/renesas/rcar-vin/rcar-core.c index 968a74234e929..2f7daa853ed8b 100644 --- a/drivers/media/platform/renesas/rcar-vin/rcar-core.c +++ b/drivers/media/platform/renesas/rcar-vin/rcar-core.c @@ -786,9 +786,8 @@ static int rvin_csi2_link_notify(struct media_link *link, u32 flags, return 0; /* - * Don't allow link changes if any entity in the graph is - * streaming, modifying the CHSEL register fields can disrupt - * running streams. + * Don't allow link changes if any stream in the graph is active as + * modifying the CHSEL register fields can disrupt running streams. */ media_device_for_each_entity(entity, &group->mdev) if (media_entity_is_streaming(entity)) diff --git a/drivers/media/platform/renesas/rcar-vin/rcar-dma.c b/drivers/media/platform/renesas/rcar-vin/rcar-dma.c index 8d37fbdc266a0..e72bc6fa049f4 100644 --- a/drivers/media/platform/renesas/rcar-vin/rcar-dma.c +++ b/drivers/media/platform/renesas/rcar-vin/rcar-dma.c @@ -1281,7 +1281,7 @@ static int rvin_set_stream(struct rvin_dev *vin, int on) */ mdev = vin->vdev.entity.graph_obj.mdev; mutex_lock(&mdev->graph_mutex); - pipe = sd->entity.pipe ? sd->entity.pipe : &vin->vdev.pipe; + pipe = media_entity_pipeline(&sd->entity) ? 
: &vin->vdev.pipe; ret = __media_pipeline_start(&vin->vdev.entity, pipe); mutex_unlock(&mdev->graph_mutex); if (ret) diff --git a/drivers/media/platform/ti/omap3isp/isp.c b/drivers/media/platform/ti/omap3isp/isp.c index a6052df9bb19e..24d2383400b0a 100644 --- a/drivers/media/platform/ti/omap3isp/isp.c +++ b/drivers/media/platform/ti/omap3isp/isp.c @@ -937,10 +937,8 @@ static int isp_pipeline_is_last(struct media_entity *me) struct isp_pipeline *pipe; struct media_pad *pad; - if (!me->pipe) - return 0; pipe = to_isp_pipeline(me); - if (pipe->stream_state == ISP_PIPELINE_STREAM_STOPPED) + if (!pipe || pipe->stream_state == ISP_PIPELINE_STREAM_STOPPED) return 0; pad = media_pad_remote_pad_first(&pipe->output->pad); return pad->entity == me; diff --git a/drivers/media/platform/ti/omap3isp/ispvideo.c b/drivers/media/platform/ti/omap3isp/ispvideo.c index cc9a97d5d5051..2e7f90603a5a8 100644 --- a/drivers/media/platform/ti/omap3isp/ispvideo.c +++ b/drivers/media/platform/ti/omap3isp/ispvideo.c @@ -1093,8 +1093,7 @@ isp_video_streamon(struct file *file, void *fh, enum v4l2_buf_type type) /* Start streaming on the pipeline. No link touching an entity in the * pipeline can be activated or deactivated once streaming is started. */ - pipe = video->video.entity.pipe - ? to_isp_pipeline(&video->video.entity) : &video->pipe; + pipe = to_isp_pipeline(&video->video.entity) ? 
: &video->pipe; ret = media_entity_enum_init(&pipe->ent_enum, &video->isp->media_dev); if (ret) diff --git a/drivers/media/platform/ti/omap3isp/ispvideo.h b/drivers/media/platform/ti/omap3isp/ispvideo.h index a0908670c0cf3..1d23df576e6b3 100644 --- a/drivers/media/platform/ti/omap3isp/ispvideo.h +++ b/drivers/media/platform/ti/omap3isp/ispvideo.h @@ -99,8 +99,15 @@ struct isp_pipeline { unsigned int external_width; }; -#define to_isp_pipeline(__e) \ - container_of((__e)->pipe, struct isp_pipeline, pipe) +static inline struct isp_pipeline *to_isp_pipeline(struct media_entity *entity) +{ + struct media_pipeline *pipe = media_entity_pipeline(entity); + + if (!pipe) + return NULL; + + return container_of(pipe, struct isp_pipeline, pipe); +} static inline int isp_pipeline_ready(struct isp_pipeline *pipe) { diff --git a/drivers/media/platform/xilinx/xilinx-dma.c b/drivers/media/platform/xilinx/xilinx-dma.c index 2d1ef7a25c338..3a4d62be0f277 100644 --- a/drivers/media/platform/xilinx/xilinx-dma.c +++ b/drivers/media/platform/xilinx/xilinx-dma.c @@ -402,8 +402,7 @@ static int xvip_dma_start_streaming(struct vb2_queue *vq, unsigned int count) * Use the pipeline object embedded in the first DMA object that starts * streaming. */ - pipe = dma->video.entity.pipe - ? to_xvip_pipeline(&dma->video.entity) : &dma->pipe; + pipe = to_xvip_pipeline(&dma->video.entity) ? 
: &dma->pipe; ret = media_pipeline_start(&dma->video.entity, &pipe->pipe); if (ret < 0) diff --git a/drivers/media/platform/xilinx/xilinx-dma.h b/drivers/media/platform/xilinx/xilinx-dma.h index 2378bdae57aea..3ea10f6b0bb9b 100644 --- a/drivers/media/platform/xilinx/xilinx-dma.h +++ b/drivers/media/platform/xilinx/xilinx-dma.h @@ -47,7 +47,12 @@ struct xvip_pipeline { static inline struct xvip_pipeline *to_xvip_pipeline(struct media_entity *e) { - return container_of(e->pipe, struct xvip_pipeline, pipe); + struct media_pipeline *pipe = media_entity_pipeline(e); + + if (!pipe) + return NULL; + + return container_of(pipe, struct xvip_pipeline, pipe); } /** diff --git a/drivers/staging/media/imx/imx-media-utils.c b/drivers/staging/media/imx/imx-media-utils.c index 294c808b2ebe1..e9a3c6d2c66fb 100644 --- a/drivers/staging/media/imx/imx-media-utils.c +++ b/drivers/staging/media/imx/imx-media-utils.c @@ -871,7 +871,7 @@ int imx_media_pipeline_set_stream(struct imx_media_dev *imxmd, __media_pipeline_stop(entity); } else { v4l2_subdev_call(sd, video, s_stream, 0); - if (entity->pipe) + if (media_entity_pipeline(entity)) __media_pipeline_stop(entity); } diff --git a/drivers/staging/media/omap4iss/iss.c b/drivers/staging/media/omap4iss/iss.c index 28aacda0f5a7d..fa2a36d829d3d 100644 --- a/drivers/staging/media/omap4iss/iss.c +++ b/drivers/staging/media/omap4iss/iss.c @@ -548,10 +548,8 @@ static int iss_pipeline_is_last(struct media_entity *me) struct iss_pipeline *pipe; struct media_pad *pad; - if (!me->pipe) - return 0; pipe = to_iss_pipeline(me); - if (pipe->stream_state == ISS_PIPELINE_STREAM_STOPPED) + if (!pipe || pipe->stream_state == ISS_PIPELINE_STREAM_STOPPED) return 0; pad = media_pad_remote_pad_first(&pipe->output->pad); return pad->entity == me; diff --git a/drivers/staging/media/omap4iss/iss_video.c b/drivers/staging/media/omap4iss/iss_video.c index 842509dcfedff..7967a42a3ffac 100644 --- a/drivers/staging/media/omap4iss/iss_video.c +++ 
b/drivers/staging/media/omap4iss/iss_video.c @@ -870,8 +870,7 @@ iss_video_streamon(struct file *file, void *fh, enum v4l2_buf_type type) * Start streaming on the pipeline. No link touching an entity in the * pipeline can be activated or deactivated once streaming is started. */ - pipe = entity->pipe - ? to_iss_pipeline(entity) : &video->pipe; + pipe = to_iss_pipeline(&video->video.entity) ? : &video->pipe; pipe->external = NULL; pipe->external_rate = 0; pipe->external_bpp = 0; diff --git a/drivers/staging/media/omap4iss/iss_video.h b/drivers/staging/media/omap4iss/iss_video.h index 526281bf00513..ca2d5edb6261a 100644 --- a/drivers/staging/media/omap4iss/iss_video.h +++ b/drivers/staging/media/omap4iss/iss_video.h @@ -90,8 +90,15 @@ struct iss_pipeline { int external_bpp; }; -#define to_iss_pipeline(__e) \ - container_of((__e)->pipe, struct iss_pipeline, pipe) +static inline struct iss_pipeline *to_iss_pipeline(struct media_entity *entity) +{ + struct media_pipeline *pipe = media_entity_pipeline(entity); + + if (!pipe) + return NULL; + + return container_of(pipe, struct iss_pipeline, pipe); +} static inline int iss_pipeline_ready(struct iss_pipeline *pipe) { diff --git a/include/media/media-entity.h b/include/media/media-entity.h index 1030e45e8ee64..aaf276f765cf0 100644 --- a/include/media/media-entity.h +++ b/include/media/media-entity.h @@ -948,6 +948,24 @@ static inline bool media_entity_is_streaming(const struct media_entity *entity) return entity->pipe; } +/** + * media_entity_pipeline - Get the media pipeline an entity is part of + * @entity: The entity + * + * This function returns the media pipeline that an entity has been associated + * with when constructing the pipeline with media_pipeline_start(). The pointer + * remains valid until media_pipeline_stop() is called. + * + * In general, entities can be part of multiple pipelines, when carrying + * multiple streams (either on different pads, or on the same pad using + * multiplexed streams). 
This function is to be used only for entities that + * do not support multiple pipelines. + * + * Return: The media_pipeline the entity is part of, or NULL if the entity is + * not part of any pipeline. + */ +struct media_pipeline *media_entity_pipeline(struct media_entity *entity); + /** * media_entity_get_fwnode_pad - Get pad number from fwnode * -- GitLab From 340eba477f0e51bed997e94bd3c2b728a0c6e1ac Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Wed, 31 Aug 2022 16:13:32 +0200 Subject: [PATCH 0534/2223] media: v4l2-dev: Add videodev wrappers for media pipelines With the upcoming stream related improvements to the pipelines, the pipelines are moved from media entities to media pads. As the drivers currently use the pipelines with the entity based model, moving the pipelines to pads will cause changes to the drivers. However, most of the uses of media pipelines are related to a video device (a DMA engine) with a single pad, and thus there's never a need to support multiple pads in these use cases. We can avoid pushing the complexities of the pad based model to the drivers by adding video device wrappers for the pipeline related functions. This patch adds a number of wrappers to media_pipeline functions, all of which take a video_device as a parameter (instead of a media_entity), and verify that there's just one pad. 
Signed-off-by: Tomi Valkeinen Reviewed-by: Laurent Pinchart Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/v4l2-dev.c | 61 +++++++++++++++++++++ include/media/v4l2-dev.h | 88 ++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+) diff --git a/drivers/media/v4l2-core/v4l2-dev.c b/drivers/media/v4l2-core/v4l2-dev.c index d00237ee4caee..7f933ff89fd49 100644 --- a/drivers/media/v4l2-core/v4l2-dev.c +++ b/drivers/media/v4l2-core/v4l2-dev.c @@ -1095,6 +1095,67 @@ void video_unregister_device(struct video_device *vdev) } EXPORT_SYMBOL(video_unregister_device); +#if defined(CONFIG_MEDIA_CONTROLLER) + +__must_check int video_device_pipeline_start(struct video_device *vdev, + struct media_pipeline *pipe) +{ + struct media_entity *entity = &vdev->entity; + + if (entity->num_pads != 1) + return -ENODEV; + + return media_pipeline_start(entity, pipe); +} +EXPORT_SYMBOL_GPL(video_device_pipeline_start); + +__must_check int __video_device_pipeline_start(struct video_device *vdev, + struct media_pipeline *pipe) +{ + struct media_entity *entity = &vdev->entity; + + if (entity->num_pads != 1) + return -ENODEV; + + return __media_pipeline_start(entity, pipe); +} +EXPORT_SYMBOL_GPL(__video_device_pipeline_start); + +void video_device_pipeline_stop(struct video_device *vdev) +{ + struct media_entity *entity = &vdev->entity; + + if (WARN_ON(entity->num_pads != 1)) + return; + + return media_pipeline_stop(entity); +} +EXPORT_SYMBOL_GPL(video_device_pipeline_stop); + +void __video_device_pipeline_stop(struct video_device *vdev) +{ + struct media_entity *entity = &vdev->entity; + + if (WARN_ON(entity->num_pads != 1)) + return; + + return __media_pipeline_stop(entity); +} +EXPORT_SYMBOL_GPL(__video_device_pipeline_stop); + +struct media_pipeline *video_device_pipeline(struct video_device *vdev) +{ + struct media_entity *entity = &vdev->entity; + + if (WARN_ON(entity->num_pads != 1)) + return NULL; + + return 
media_entity_pipeline(entity); +} +EXPORT_SYMBOL_GPL(video_device_pipeline); + +#endif /* CONFIG_MEDIA_CONTROLLER */ + /* * Initialise video for linux */ diff --git a/include/media/v4l2-dev.h b/include/media/v4l2-dev.h index 5cf1edefb822d..4946858722540 100644 --- a/include/media/v4l2-dev.h +++ b/include/media/v4l2-dev.h @@ -539,4 +539,92 @@ static inline int video_is_registered(struct video_device *vdev) return test_bit(V4L2_FL_REGISTERED, &vdev->flags); } +#if defined(CONFIG_MEDIA_CONTROLLER) + +/** + * video_device_pipeline_start - Mark a pipeline as streaming + * @vdev: Starting video device + * @pipe: Media pipeline to be assigned to all entities in the pipeline. + * + * Mark all entities connected to a given video device through enabled links, + * either directly or indirectly, as streaming. The given pipeline object is + * assigned to every entity in the pipeline and stored in the media_entity pipe + * field. + * + * Calls to this function can be nested, in which case the same number of + * video_device_pipeline_stop() calls will be required to stop streaming. The + * pipeline pointer must be identical for all nested calls to + * video_device_pipeline_start(). + * + * The video device must contain a single pad. + * + * This is a convenience wrapper around media_pipeline_start(). + */ +__must_check int video_device_pipeline_start(struct video_device *vdev, + struct media_pipeline *pipe); + +/** + * __video_device_pipeline_start - Mark a pipeline as streaming + * @vdev: Starting video device + * @pipe: Media pipeline to be assigned to all entities in the pipeline. + * + * .. note:: This is the non-locking version of video_device_pipeline_start() + * + * The video device must contain a single pad. + * + * This is a convenience wrapper around __media_pipeline_start(). 
+ */ +__must_check int __video_device_pipeline_start(struct video_device *vdev, + struct media_pipeline *pipe); + +/** + * video_device_pipeline_stop - Mark a pipeline as not streaming + * @vdev: Starting video device + * + * Mark all entities connected to a given video device through enabled links, + * either directly or indirectly, as not streaming. The media_entity pipe field + * is reset to %NULL. + * + * If multiple calls to video_device_pipeline_start() have been made, the same + * number of calls to this function are required to mark the pipeline as not + * streaming. + * + * The video device must contain a single pad. + * + * This is a convenience wrapper around media_pipeline_stop(). + */ +void video_device_pipeline_stop(struct video_device *vdev); + +/** + * __video_device_pipeline_stop - Mark a pipeline as not streaming + * @vdev: Starting video device + * + * .. note:: This is the non-locking version of video_device_pipeline_stop() + * + * The video device must contain a single pad. + * + * This is a convenience wrapper around __media_pipeline_stop(). + */ +void __video_device_pipeline_stop(struct video_device *vdev); + +/** + * video_device_pipeline - Get the media pipeline a video device is part of + * @vdev: The video device + * + * This function returns the media pipeline that a video device has been + * associated with when constructing the pipeline with + * video_device_pipeline_start(). The pointer remains valid until + * video_device_pipeline_stop() is called. + * + * Return: The media_pipeline the video device is part of, or NULL if the video + * device is not part of any pipeline. + * + * The video device must contain a single pad. + * + * This is a convenience wrapper around media_entity_pipeline(). 
+ */ +struct media_pipeline *video_device_pipeline(struct video_device *vdev); + +#endif /* CONFIG_MEDIA_CONTROLLER */ + #endif /* _V4L2_DEV_H */ -- GitLab From 12cecbf9150f67b0ce7d88bc2e243e67637726c2 Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Wed, 31 Aug 2022 16:13:33 +0200 Subject: [PATCH 0535/2223] media: drivers: use video device pipeline start/stop Convert the media drivers to use video device based pipeline start/stop where possible. Signed-off-by: Tomi Valkeinen Reviewed-by: Laurent Pinchart Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/pci/intel/ipu3/ipu3-cio2-main.c | 6 +++--- drivers/media/platform/qcom/camss/camss-video.c | 6 +++--- drivers/media/platform/renesas/rcar-vin/rcar-dma.c | 6 +++--- drivers/media/platform/renesas/vsp1/vsp1_video.c | 6 +++--- .../media/platform/rockchip/rkisp1/rkisp1-capture.c | 10 +++++----- .../media/platform/samsung/exynos4-is/fimc-capture.c | 9 ++++----- .../media/platform/samsung/exynos4-is/fimc-isp-video.c | 9 ++++----- drivers/media/platform/samsung/exynos4-is/fimc-lite.c | 9 ++++----- drivers/media/platform/st/stm32/stm32-dcmi.c | 6 +++--- drivers/media/platform/sunxi/sun4i-csi/sun4i_dma.c | 6 +++--- drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c | 6 +++--- drivers/media/platform/ti/cal/cal-video.c | 6 +++--- drivers/media/platform/ti/omap3isp/ispvideo.c | 6 +++--- drivers/media/platform/xilinx/xilinx-dma.c | 6 +++--- drivers/media/test-drivers/vimc/vimc-capture.c | 7 +++---- drivers/staging/media/imx/imx7-media-csi.c | 6 +++--- drivers/staging/media/ipu3/ipu3-v4l2.c | 6 +++--- drivers/staging/media/omap4iss/iss_video.c | 6 +++--- drivers/staging/media/tegra-video/tegra210.c | 6 +++--- 19 files changed, 62 insertions(+), 66 deletions(-) diff --git a/drivers/media/pci/intel/ipu3/ipu3-cio2-main.c b/drivers/media/pci/intel/ipu3/ipu3-cio2-main.c index a3fe547b7fcec..390bd5ea34724 100644 --- a/drivers/media/pci/intel/ipu3/ipu3-cio2-main.c +++ 
b/drivers/media/pci/intel/ipu3/ipu3-cio2-main.c @@ -989,7 +989,7 @@ static int cio2_vb2_start_streaming(struct vb2_queue *vq, unsigned int count) return r; } - r = media_pipeline_start(&q->vdev.entity, &q->pipe); + r = video_device_pipeline_start(&q->vdev, &q->pipe); if (r) goto fail_pipeline; @@ -1009,7 +1009,7 @@ static int cio2_vb2_start_streaming(struct vb2_queue *vq, unsigned int count) fail_csi2_subdev: cio2_hw_exit(cio2, q); fail_hw: - media_pipeline_stop(&q->vdev.entity); + video_device_pipeline_stop(&q->vdev); fail_pipeline: dev_dbg(dev, "failed to start streaming (%d)\n", r); cio2_vb2_return_all_buffers(q, VB2_BUF_STATE_QUEUED); @@ -1030,7 +1030,7 @@ static void cio2_vb2_stop_streaming(struct vb2_queue *vq) cio2_hw_exit(cio2, q); synchronize_irq(cio2->pci_dev->irq); cio2_vb2_return_all_buffers(q, VB2_BUF_STATE_ERROR); - media_pipeline_stop(&q->vdev.entity); + video_device_pipeline_stop(&q->vdev); pm_runtime_put(dev); cio2->streaming = false; } diff --git a/drivers/media/platform/qcom/camss/camss-video.c b/drivers/media/platform/qcom/camss/camss-video.c index 290df04c4d02c..81fb3a5bc1d51 100644 --- a/drivers/media/platform/qcom/camss/camss-video.c +++ b/drivers/media/platform/qcom/camss/camss-video.c @@ -493,7 +493,7 @@ static int video_start_streaming(struct vb2_queue *q, unsigned int count) struct v4l2_subdev *subdev; int ret; - ret = media_pipeline_start(&vdev->entity, &video->pipe); + ret = video_device_pipeline_start(vdev, &video->pipe); if (ret < 0) return ret; @@ -522,7 +522,7 @@ static int video_start_streaming(struct vb2_queue *q, unsigned int count) return 0; error: - media_pipeline_stop(&vdev->entity); + video_device_pipeline_stop(vdev); video->ops->flush_buffers(video, VB2_BUF_STATE_QUEUED); @@ -553,7 +553,7 @@ static void video_stop_streaming(struct vb2_queue *q) v4l2_subdev_call(subdev, video, s_stream, 0); } - media_pipeline_stop(&vdev->entity); + video_device_pipeline_stop(vdev); video->ops->flush_buffers(video, VB2_BUF_STATE_ERROR); } diff 
--git a/drivers/media/platform/renesas/rcar-vin/rcar-dma.c b/drivers/media/platform/renesas/rcar-vin/rcar-dma.c index e72bc6fa049f4..879dd02bbb554 100644 --- a/drivers/media/platform/renesas/rcar-vin/rcar-dma.c +++ b/drivers/media/platform/renesas/rcar-vin/rcar-dma.c @@ -1265,7 +1265,7 @@ static int rvin_set_stream(struct rvin_dev *vin, int on) sd = media_entity_to_v4l2_subdev(pad->entity); if (!on) { - media_pipeline_stop(&vin->vdev.entity); + video_device_pipeline_stop(&vin->vdev); return v4l2_subdev_call(sd, video, s_stream, 0); } @@ -1282,7 +1282,7 @@ static int rvin_set_stream(struct rvin_dev *vin, int on) mdev = vin->vdev.entity.graph_obj.mdev; mutex_lock(&mdev->graph_mutex); pipe = media_entity_pipeline(&sd->entity) ? : &vin->vdev.pipe; - ret = __media_pipeline_start(&vin->vdev.entity, pipe); + ret = __video_device_pipeline_start(&vin->vdev, pipe); mutex_unlock(&mdev->graph_mutex); if (ret) return ret; @@ -1291,7 +1291,7 @@ static int rvin_set_stream(struct rvin_dev *vin, int on) if (ret == -ENOIOCTLCMD) ret = 0; if (ret) - media_pipeline_stop(&vin->vdev.entity); + video_device_pipeline_stop(&vin->vdev); return ret; } diff --git a/drivers/media/platform/renesas/vsp1/vsp1_video.c b/drivers/media/platform/renesas/vsp1/vsp1_video.c index df1606b49d77a..9d24647c8f324 100644 --- a/drivers/media/platform/renesas/vsp1/vsp1_video.c +++ b/drivers/media/platform/renesas/vsp1/vsp1_video.c @@ -927,7 +927,7 @@ static void vsp1_video_stop_streaming(struct vb2_queue *vq) } mutex_unlock(&pipe->lock); - media_pipeline_stop(&video->video.entity); + video_device_pipeline_stop(&video->video); vsp1_video_release_buffers(video); vsp1_video_pipeline_put(pipe); } @@ -1046,7 +1046,7 @@ vsp1_video_streamon(struct file *file, void *fh, enum v4l2_buf_type type) return PTR_ERR(pipe); } - ret = __media_pipeline_start(&video->video.entity, &pipe->pipe); + ret = __video_device_pipeline_start(&video->video, &pipe->pipe); if (ret < 0) { mutex_unlock(&mdev->graph_mutex); goto err_pipe; @@ 
-1070,7 +1070,7 @@ vsp1_video_streamon(struct file *file, void *fh, enum v4l2_buf_type type) return 0; err_stop: - media_pipeline_stop(&video->video.entity); + video_device_pipeline_stop(&video->video); err_pipe: vsp1_video_pipeline_put(pipe); return ret; diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-capture.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-capture.c index 6ef09579dc21b..d4540684ea9af 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-capture.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-capture.c @@ -913,7 +913,7 @@ static void rkisp1_cap_stream_disable(struct rkisp1_capture *cap) * * Call s_stream(false) in the reverse order from * rkisp1_pipeline_stream_enable() and disable the DMA engine. - * Should be called before media_pipeline_stop() + * Should be called before video_device_pipeline_stop() */ static void rkisp1_pipeline_stream_disable(struct rkisp1_capture *cap) __must_hold(&cap->rkisp1->stream_lock) @@ -937,7 +937,7 @@ static void rkisp1_pipeline_stream_disable(struct rkisp1_capture *cap) * rkisp1_pipeline_stream_enable - enable nodes in the pipeline * * Enable the DMA Engine and call s_stream(true) through the pipeline. 
- * Should be called after media_pipeline_start() + * Should be called after video_device_pipeline_start() */ static int rkisp1_pipeline_stream_enable(struct rkisp1_capture *cap) __must_hold(&cap->rkisp1->stream_lock) @@ -994,7 +994,7 @@ static void rkisp1_vb2_stop_streaming(struct vb2_queue *queue) rkisp1_dummy_buf_destroy(cap); - media_pipeline_stop(&node->vdev.entity); + video_device_pipeline_stop(&node->vdev); mutex_unlock(&cap->rkisp1->stream_lock); } @@ -1008,7 +1008,7 @@ rkisp1_vb2_start_streaming(struct vb2_queue *queue, unsigned int count) mutex_lock(&cap->rkisp1->stream_lock); - ret = media_pipeline_start(entity, &cap->rkisp1->pipe); + ret = video_device_pipeline_start(&cap->vnode.vdev, &cap->rkisp1->pipe); if (ret) { dev_err(cap->rkisp1->dev, "start pipeline failed %d\n", ret); goto err_ret_buffers; @@ -1044,7 +1044,7 @@ err_pipe_pm_put: err_destroy_dummy: rkisp1_dummy_buf_destroy(cap); err_pipeline_stop: - media_pipeline_stop(entity); + video_device_pipeline_stop(&cap->vnode.vdev); err_ret_buffers: rkisp1_return_all_buffers(cap, VB2_BUF_STATE_QUEUED); mutex_unlock(&cap->rkisp1->stream_lock); diff --git a/drivers/media/platform/samsung/exynos4-is/fimc-capture.c b/drivers/media/platform/samsung/exynos4-is/fimc-capture.c index 03638c8f772d0..e3b95a2b7e040 100644 --- a/drivers/media/platform/samsung/exynos4-is/fimc-capture.c +++ b/drivers/media/platform/samsung/exynos4-is/fimc-capture.c @@ -524,7 +524,7 @@ static int fimc_capture_release(struct file *file) mutex_lock(&fimc->lock); if (close && vc->streaming) { - media_pipeline_stop(&vc->ve.vdev.entity); + video_device_pipeline_stop(&vc->ve.vdev); vc->streaming = false; } @@ -1176,7 +1176,6 @@ static int fimc_cap_streamon(struct file *file, void *priv, { struct fimc_dev *fimc = video_drvdata(file); struct fimc_vid_cap *vc = &fimc->vid_cap; - struct media_entity *entity = &vc->ve.vdev.entity; struct fimc_source_info *si = NULL; struct v4l2_subdev *sd; int ret; @@ -1184,7 +1183,7 @@ static int 
fimc_cap_streamon(struct file *file, void *priv, if (fimc_capture_active(fimc)) return -EBUSY; - ret = media_pipeline_start(entity, &vc->ve.pipe->mp); + ret = video_device_pipeline_start(&vc->ve.vdev, &vc->ve.pipe->mp); if (ret < 0) return ret; @@ -1218,7 +1217,7 @@ static int fimc_cap_streamon(struct file *file, void *priv, } err_p_stop: - media_pipeline_stop(entity); + video_device_pipeline_stop(&vc->ve.vdev); return ret; } @@ -1234,7 +1233,7 @@ static int fimc_cap_streamoff(struct file *file, void *priv, return ret; if (vc->streaming) { - media_pipeline_stop(&vc->ve.vdev.entity); + video_device_pipeline_stop(&vc->ve.vdev); vc->streaming = false; } diff --git a/drivers/media/platform/samsung/exynos4-is/fimc-isp-video.c b/drivers/media/platform/samsung/exynos4-is/fimc-isp-video.c index 8f12240b0eb7a..f6a302fa8d377 100644 --- a/drivers/media/platform/samsung/exynos4-is/fimc-isp-video.c +++ b/drivers/media/platform/samsung/exynos4-is/fimc-isp-video.c @@ -312,7 +312,7 @@ static int isp_video_release(struct file *file) is_singular_file = v4l2_fh_is_singular_file(file); if (is_singular_file && ivc->streaming) { - media_pipeline_stop(entity); + video_device_pipeline_stop(&ivc->ve.vdev); ivc->streaming = 0; } @@ -490,10 +490,9 @@ static int isp_video_streamon(struct file *file, void *priv, { struct fimc_isp *isp = video_drvdata(file); struct exynos_video_entity *ve = &isp->video_capture.ve; - struct media_entity *me = &ve->vdev.entity; int ret; - ret = media_pipeline_start(me, &ve->pipe->mp); + ret = video_device_pipeline_start(&ve->vdev, &ve->pipe->mp); if (ret < 0) return ret; @@ -508,7 +507,7 @@ static int isp_video_streamon(struct file *file, void *priv, isp->video_capture.streaming = 1; return 0; p_stop: - media_pipeline_stop(me); + video_device_pipeline_stop(&ve->vdev); return ret; } @@ -523,7 +522,7 @@ static int isp_video_streamoff(struct file *file, void *priv, if (ret < 0) return ret; - media_pipeline_stop(&video->ve.vdev.entity); + 
video_device_pipeline_stop(&video->ve.vdev); video->streaming = 0; return 0; } diff --git a/drivers/media/platform/samsung/exynos4-is/fimc-lite.c b/drivers/media/platform/samsung/exynos4-is/fimc-lite.c index 41b0a4a5929a7..e185a40305a8f 100644 --- a/drivers/media/platform/samsung/exynos4-is/fimc-lite.c +++ b/drivers/media/platform/samsung/exynos4-is/fimc-lite.c @@ -516,7 +516,7 @@ static int fimc_lite_release(struct file *file) if (v4l2_fh_is_singular_file(file) && atomic_read(&fimc->out_path) == FIMC_IO_DMA) { if (fimc->streaming) { - media_pipeline_stop(entity); + video_device_pipeline_stop(&fimc->ve.vdev); fimc->streaming = false; } fimc_lite_stop_capture(fimc, false); @@ -812,13 +812,12 @@ static int fimc_lite_streamon(struct file *file, void *priv, enum v4l2_buf_type type) { struct fimc_lite *fimc = video_drvdata(file); - struct media_entity *entity = &fimc->ve.vdev.entity; int ret; if (fimc_lite_active(fimc)) return -EBUSY; - ret = media_pipeline_start(entity, &fimc->ve.pipe->mp); + ret = video_device_pipeline_start(&fimc->ve.vdev, &fimc->ve.pipe->mp); if (ret < 0) return ret; @@ -835,7 +834,7 @@ static int fimc_lite_streamon(struct file *file, void *priv, } err_p_stop: - media_pipeline_stop(entity); + video_device_pipeline_stop(&fimc->ve.vdev); return 0; } @@ -849,7 +848,7 @@ static int fimc_lite_streamoff(struct file *file, void *priv, if (ret < 0) return ret; - media_pipeline_stop(&fimc->ve.vdev.entity); + video_device_pipeline_stop(&fimc->ve.vdev); fimc->streaming = false; return 0; } diff --git a/drivers/media/platform/st/stm32/stm32-dcmi.c b/drivers/media/platform/st/stm32/stm32-dcmi.c index 2ca95ab2b0fe4..37458d4d9564b 100644 --- a/drivers/media/platform/st/stm32/stm32-dcmi.c +++ b/drivers/media/platform/st/stm32/stm32-dcmi.c @@ -751,7 +751,7 @@ static int dcmi_start_streaming(struct vb2_queue *vq, unsigned int count) goto err_unlocked; } - ret = media_pipeline_start(&dcmi->vdev->entity, &dcmi->pipeline); + ret = video_device_pipeline_start(dcmi->vdev, 
&dcmi->pipeline); if (ret < 0) { dev_err(dcmi->dev, "%s: Failed to start streaming, media pipeline start error (%d)\n", __func__, ret); @@ -865,7 +865,7 @@ err_pipeline_stop: dcmi_pipeline_stop(dcmi); err_media_pipeline_stop: - media_pipeline_stop(&dcmi->vdev->entity); + video_device_pipeline_stop(dcmi->vdev); err_pm_put: pm_runtime_put(dcmi->dev); @@ -892,7 +892,7 @@ static void dcmi_stop_streaming(struct vb2_queue *vq) dcmi_pipeline_stop(dcmi); - media_pipeline_stop(&dcmi->vdev->entity); + video_device_pipeline_stop(dcmi->vdev); spin_lock_irq(&dcmi->irqlock); diff --git a/drivers/media/platform/sunxi/sun4i-csi/sun4i_dma.c b/drivers/media/platform/sunxi/sun4i-csi/sun4i_dma.c index 0912a1b6d5257..17ad9a3caaa5f 100644 --- a/drivers/media/platform/sunxi/sun4i-csi/sun4i_dma.c +++ b/drivers/media/platform/sunxi/sun4i-csi/sun4i_dma.c @@ -266,7 +266,7 @@ static int sun4i_csi_start_streaming(struct vb2_queue *vq, unsigned int count) goto err_clear_dma_queue; } - ret = media_pipeline_start(&csi->vdev.entity, &csi->vdev.pipe); + ret = video_device_pipeline_start(&csi->vdev, &csi->vdev.pipe); if (ret < 0) goto err_free_scratch_buffer; @@ -330,7 +330,7 @@ err_disable_device: sun4i_csi_capture_stop(csi); err_disable_pipeline: - media_pipeline_stop(&csi->vdev.entity); + video_device_pipeline_stop(&csi->vdev); err_free_scratch_buffer: dma_free_coherent(csi->dev, csi->scratch.size, csi->scratch.vaddr, @@ -359,7 +359,7 @@ static void sun4i_csi_stop_streaming(struct vb2_queue *vq) return_all_buffers(csi, VB2_BUF_STATE_ERROR); spin_unlock_irqrestore(&csi->qlock, flags); - media_pipeline_stop(&csi->vdev.entity); + video_device_pipeline_stop(&csi->vdev); dma_free_coherent(csi->dev, csi->scratch.size, csi->scratch.vaddr, csi->scratch.paddr); diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c index 74d64a20ba5bf..a6873fdb84388 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c +++ 
b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c @@ -141,7 +141,7 @@ static int sun6i_video_start_streaming(struct vb2_queue *vq, unsigned int count) video->sequence = 0; - ret = media_pipeline_start(&video->vdev.entity, &video->vdev.pipe); + ret = video_device_pipeline_start(&video->vdev, &video->vdev.pipe); if (ret < 0) goto clear_dma_queue; @@ -207,7 +207,7 @@ static int sun6i_video_start_streaming(struct vb2_queue *vq, unsigned int count) stop_csi_stream: sun6i_csi_set_stream(video->csi, false); stop_media_pipeline: - media_pipeline_stop(&video->vdev.entity); + video_device_pipeline_stop(&video->vdev); clear_dma_queue: spin_lock_irqsave(&video->dma_queue_lock, flags); list_for_each_entry(buf, &video->dma_queue, list) @@ -231,7 +231,7 @@ static void sun6i_video_stop_streaming(struct vb2_queue *vq) sun6i_csi_set_stream(video->csi, false); - media_pipeline_stop(&video->vdev.entity); + video_device_pipeline_stop(&video->vdev); /* Release all active buffers */ spin_lock_irqsave(&video->dma_queue_lock, flags); diff --git a/drivers/media/platform/ti/cal/cal-video.c b/drivers/media/platform/ti/cal/cal-video.c index 21e3d0aabf706..0ac54d7618e30 100644 --- a/drivers/media/platform/ti/cal/cal-video.c +++ b/drivers/media/platform/ti/cal/cal-video.c @@ -708,7 +708,7 @@ static int cal_start_streaming(struct vb2_queue *vq, unsigned int count) dma_addr_t addr; int ret; - ret = media_pipeline_start(&ctx->vdev.entity, &ctx->phy->pipe); + ret = video_device_pipeline_start(&ctx->vdev, &ctx->phy->pipe); if (ret < 0) { ctx_err(ctx, "Failed to start media pipeline: %d\n", ret); goto error_release_buffers; @@ -761,7 +761,7 @@ error_stop: cal_ctx_unprepare(ctx); error_pipeline: - media_pipeline_stop(&ctx->vdev.entity); + video_device_pipeline_stop(&ctx->vdev); error_release_buffers: cal_release_buffers(ctx, VB2_BUF_STATE_QUEUED); @@ -782,7 +782,7 @@ static void cal_stop_streaming(struct vb2_queue *vq) cal_release_buffers(ctx, VB2_BUF_STATE_ERROR); - 
media_pipeline_stop(&ctx->vdev.entity); + video_device_pipeline_stop(&ctx->vdev); } static const struct vb2_ops cal_video_qops = { diff --git a/drivers/media/platform/ti/omap3isp/ispvideo.c b/drivers/media/platform/ti/omap3isp/ispvideo.c index 2e7f90603a5a8..3e5348c63773a 100644 --- a/drivers/media/platform/ti/omap3isp/ispvideo.c +++ b/drivers/media/platform/ti/omap3isp/ispvideo.c @@ -1103,7 +1103,7 @@ isp_video_streamon(struct file *file, void *fh, enum v4l2_buf_type type) pipe->l3_ick = clk_get_rate(video->isp->clock[ISP_CLK_L3_ICK]); pipe->max_rate = pipe->l3_ick; - ret = media_pipeline_start(&video->video.entity, &pipe->pipe); + ret = video_device_pipeline_start(&video->video, &pipe->pipe); if (ret < 0) goto err_pipeline_start; @@ -1160,7 +1160,7 @@ isp_video_streamon(struct file *file, void *fh, enum v4l2_buf_type type) return 0; err_check_format: - media_pipeline_stop(&video->video.entity); + video_device_pipeline_stop(&video->video); err_pipeline_start: /* TODO: Implement PM QoS */ /* The DMA queue must be emptied here, otherwise CCDC interrupts that @@ -1227,7 +1227,7 @@ isp_video_streamoff(struct file *file, void *fh, enum v4l2_buf_type type) video->error = false; /* TODO: Implement PM QoS */ - media_pipeline_stop(&video->video.entity); + video_device_pipeline_stop(&video->video); media_entity_enum_cleanup(&pipe->ent_enum); diff --git a/drivers/media/platform/xilinx/xilinx-dma.c b/drivers/media/platform/xilinx/xilinx-dma.c index 3a4d62be0f277..9a177337e9342 100644 --- a/drivers/media/platform/xilinx/xilinx-dma.c +++ b/drivers/media/platform/xilinx/xilinx-dma.c @@ -404,7 +404,7 @@ static int xvip_dma_start_streaming(struct vb2_queue *vq, unsigned int count) */ pipe = to_xvip_pipeline(&dma->video.entity) ? 
: &dma->pipe; - ret = media_pipeline_start(&dma->video.entity, &pipe->pipe); + ret = video_device_pipeline_start(&dma->video, &pipe->pipe); if (ret < 0) goto error; @@ -430,7 +430,7 @@ static int xvip_dma_start_streaming(struct vb2_queue *vq, unsigned int count) return 0; error_stop: - media_pipeline_stop(&dma->video.entity); + video_device_pipeline_stop(&dma->video); error: /* Give back all queued buffers to videobuf2. */ @@ -458,7 +458,7 @@ static void xvip_dma_stop_streaming(struct vb2_queue *vq) /* Cleanup the pipeline and mark it as being stopped. */ xvip_pipeline_cleanup(pipe); - media_pipeline_stop(&dma->video.entity); + video_device_pipeline_stop(&dma->video); /* Give back all queued buffers to videobuf2. */ spin_lock_irq(&dma->queued_lock); diff --git a/drivers/media/test-drivers/vimc/vimc-capture.c b/drivers/media/test-drivers/vimc/vimc-capture.c index 6c437802f91f6..aa944270e716c 100644 --- a/drivers/media/test-drivers/vimc/vimc-capture.c +++ b/drivers/media/test-drivers/vimc/vimc-capture.c @@ -241,13 +241,12 @@ static void vimc_capture_return_all_buffers(struct vimc_capture_device *vcapture static int vimc_capture_start_streaming(struct vb2_queue *vq, unsigned int count) { struct vimc_capture_device *vcapture = vb2_get_drv_priv(vq); - struct media_entity *entity = &vcapture->vdev.entity; int ret; vcapture->sequence = 0; /* Start the media pipeline */ - ret = media_pipeline_start(entity, &vcapture->stream.pipe); + ret = video_device_pipeline_start(&vcapture->vdev, &vcapture->stream.pipe); if (ret) { vimc_capture_return_all_buffers(vcapture, VB2_BUF_STATE_QUEUED); return ret; @@ -255,7 +254,7 @@ static int vimc_capture_start_streaming(struct vb2_queue *vq, unsigned int count ret = vimc_streamer_s_stream(&vcapture->stream, &vcapture->ved, 1); if (ret) { - media_pipeline_stop(entity); + video_device_pipeline_stop(&vcapture->vdev); vimc_capture_return_all_buffers(vcapture, VB2_BUF_STATE_QUEUED); return ret; } @@ -274,7 +273,7 @@ static void 
vimc_capture_stop_streaming(struct vb2_queue *vq) vimc_streamer_s_stream(&vcapture->stream, &vcapture->ved, 0); /* Stop the media pipeline */ - media_pipeline_stop(&vcapture->vdev.entity); + video_device_pipeline_stop(&vcapture->vdev); /* Release all active buffers */ vimc_capture_return_all_buffers(vcapture, VB2_BUF_STATE_ERROR); diff --git a/drivers/staging/media/imx/imx7-media-csi.c b/drivers/staging/media/imx/imx7-media-csi.c index cbc66ef0eda8e..e5b550ccfa22d 100644 --- a/drivers/staging/media/imx/imx7-media-csi.c +++ b/drivers/staging/media/imx/imx7-media-csi.c @@ -1360,7 +1360,7 @@ static int imx7_csi_video_start_streaming(struct vb2_queue *vq, mutex_lock(&csi->mdev.graph_mutex); - ret = __media_pipeline_start(&csi->sd.entity, &csi->pipe); + ret = __video_device_pipeline_start(csi->vdev, &csi->pipe); if (ret) goto err_unlock; @@ -1373,7 +1373,7 @@ static int imx7_csi_video_start_streaming(struct vb2_queue *vq, return 0; err_stop: - __media_pipeline_stop(&csi->sd.entity); + __video_device_pipeline_stop(csi->vdev); err_unlock: mutex_unlock(&csi->mdev.graph_mutex); dev_err(csi->dev, "pipeline start failed with %d\n", ret); @@ -1396,7 +1396,7 @@ static void imx7_csi_video_stop_streaming(struct vb2_queue *vq) mutex_lock(&csi->mdev.graph_mutex); v4l2_subdev_call(&csi->sd, video, s_stream, 0); - __media_pipeline_stop(&csi->sd.entity); + __video_device_pipeline_stop(csi->vdev); mutex_unlock(&csi->mdev.graph_mutex); /* release all active buffers */ diff --git a/drivers/staging/media/ipu3/ipu3-v4l2.c b/drivers/staging/media/ipu3/ipu3-v4l2.c index 2234bb8d48b34..ce13e746c15f3 100644 --- a/drivers/staging/media/ipu3/ipu3-v4l2.c +++ b/drivers/staging/media/ipu3/ipu3-v4l2.c @@ -483,7 +483,7 @@ static int imgu_vb2_start_streaming(struct vb2_queue *vq, unsigned int count) pipe = node->pipe; imgu_pipe = &imgu->imgu_pipe[pipe]; atomic_set(&node->sequence, 0); - r = media_pipeline_start(&node->vdev.entity, &imgu_pipe->pipeline); + r = video_device_pipeline_start(&node->vdev, 
&imgu_pipe->pipeline); if (r < 0) goto fail_return_bufs; @@ -508,7 +508,7 @@ static int imgu_vb2_start_streaming(struct vb2_queue *vq, unsigned int count) return 0; fail_stop_pipeline: - media_pipeline_stop(&node->vdev.entity); + video_device_pipeline_stop(&node->vdev); fail_return_bufs: imgu_return_all_buffers(imgu, node, VB2_BUF_STATE_QUEUED); @@ -548,7 +548,7 @@ static void imgu_vb2_stop_streaming(struct vb2_queue *vq) imgu_return_all_buffers(imgu, node, VB2_BUF_STATE_ERROR); mutex_unlock(&imgu->streaming_lock); - media_pipeline_stop(&node->vdev.entity); + video_device_pipeline_stop(&node->vdev); } /******************** v4l2_ioctl_ops ********************/ diff --git a/drivers/staging/media/omap4iss/iss_video.c b/drivers/staging/media/omap4iss/iss_video.c index 7967a42a3ffac..60f3d84be8285 100644 --- a/drivers/staging/media/omap4iss/iss_video.c +++ b/drivers/staging/media/omap4iss/iss_video.c @@ -886,7 +886,7 @@ iss_video_streamon(struct file *file, void *fh, enum v4l2_buf_type type) if (video->iss->pdata->set_constraints) video->iss->pdata->set_constraints(video->iss, true); - ret = media_pipeline_start(entity, &pipe->pipe); + ret = video_device_pipeline_start(&video->video, &pipe->pipe); if (ret < 0) goto err_media_pipeline_start; @@ -977,7 +977,7 @@ iss_video_streamon(struct file *file, void *fh, enum v4l2_buf_type type) err_omap4iss_set_stream: vb2_streamoff(&vfh->queue, type); err_iss_video_check_format: - media_pipeline_stop(&video->video.entity); + video_device_pipeline_stop(&video->video); err_media_pipeline_start: if (video->iss->pdata->set_constraints) video->iss->pdata->set_constraints(video->iss, false); @@ -1031,7 +1031,7 @@ iss_video_streamoff(struct file *file, void *fh, enum v4l2_buf_type type) if (video->iss->pdata->set_constraints) video->iss->pdata->set_constraints(video->iss, false); - media_pipeline_stop(&video->video.entity); + video_device_pipeline_stop(&video->video); done: mutex_unlock(&video->stream_lock); diff --git 
a/drivers/staging/media/tegra-video/tegra210.c b/drivers/staging/media/tegra-video/tegra210.c index f10a041e3e6c0..d58370a84737a 100644 --- a/drivers/staging/media/tegra-video/tegra210.c +++ b/drivers/staging/media/tegra-video/tegra210.c @@ -547,7 +547,7 @@ static int tegra210_vi_start_streaming(struct vb2_queue *vq, u32 count) VI_INCR_SYNCPT_NO_STALL); /* start the pipeline */ - ret = media_pipeline_start(&chan->video.entity, pipe); + ret = video_device_pipeline_start(&chan->video, pipe); if (ret < 0) goto error_pipeline_start; @@ -595,7 +595,7 @@ error_kthread_done: error_kthread_start: tegra_channel_set_stream(chan, false); error_set_stream: - media_pipeline_stop(&chan->video.entity); + video_device_pipeline_stop(&chan->video); error_pipeline_start: tegra_channel_release_buffers(chan, VB2_BUF_STATE_QUEUED); return ret; @@ -617,7 +617,7 @@ static void tegra210_vi_stop_streaming(struct vb2_queue *vq) tegra_channel_release_buffers(chan, VB2_BUF_STATE_ERROR); tegra_channel_set_stream(chan, false); - media_pipeline_stop(&chan->video.entity); + video_device_pipeline_stop(&chan->video); } /* -- GitLab From 98d79dc34798cb5b3bdbc49cfc17ff63b3044b64 Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Wed, 31 Aug 2022 16:13:34 +0200 Subject: [PATCH 0536/2223] media: drivers: use video_device_pipeline() Use video_device_pipeline() in the drivers instead of media_entity_pipeline(). 
Signed-off-by: Tomi Valkeinen Reviewed-by: Laurent Pinchart Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/xilinx/xilinx-dma.c | 4 ++-- drivers/media/platform/xilinx/xilinx-dma.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/media/platform/xilinx/xilinx-dma.c b/drivers/media/platform/xilinx/xilinx-dma.c index 9a177337e9342..0a7fd8642a659 100644 --- a/drivers/media/platform/xilinx/xilinx-dma.c +++ b/drivers/media/platform/xilinx/xilinx-dma.c @@ -402,7 +402,7 @@ static int xvip_dma_start_streaming(struct vb2_queue *vq, unsigned int count) * Use the pipeline object embedded in the first DMA object that starts * streaming. */ - pipe = to_xvip_pipeline(&dma->video.entity) ? : &dma->pipe; + pipe = to_xvip_pipeline(&dma->video) ? : &dma->pipe; ret = video_device_pipeline_start(&dma->video, &pipe->pipe); if (ret < 0) @@ -447,7 +447,7 @@ error: static void xvip_dma_stop_streaming(struct vb2_queue *vq) { struct xvip_dma *dma = vb2_get_drv_priv(vq); - struct xvip_pipeline *pipe = to_xvip_pipeline(&dma->video.entity); + struct xvip_pipeline *pipe = to_xvip_pipeline(&dma->video); struct xvip_dma_buffer *buf, *nbuf; /* Stop the pipeline. 
*/ diff --git a/drivers/media/platform/xilinx/xilinx-dma.h b/drivers/media/platform/xilinx/xilinx-dma.h index 3ea10f6b0bb9b..9c6d4c18d1a95 100644 --- a/drivers/media/platform/xilinx/xilinx-dma.h +++ b/drivers/media/platform/xilinx/xilinx-dma.h @@ -45,9 +45,9 @@ struct xvip_pipeline { struct xvip_dma *output; }; -static inline struct xvip_pipeline *to_xvip_pipeline(struct media_entity *e) +static inline struct xvip_pipeline *to_xvip_pipeline(struct video_device *vdev) { - struct media_pipeline *pipe = media_entity_pipeline(e); + struct media_pipeline *pipe = video_device_pipeline(vdev); if (!pipe) return NULL; -- GitLab From d9f4434513b499ddb8ba8617fba787b1ce98274e Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Wed, 31 Aug 2022 16:13:35 +0200 Subject: [PATCH 0537/2223] media: mc: entity: add alloc variant of pipeline_start Add new variant of media_pipeline_start(), media_pipeline_alloc_start(). media_pipeline_alloc_start() can be used by drivers that do not need to extend the media_pipeline. The function will either use the pipeline already associated with the entity, if such exists, or allocate a new pipeline. When media_pipeline_stop() is called and the pipeline's use count drops to zero, the pipeline is automatically freed. 
Signed-off-by: Tomi Valkeinen Reviewed-by: Laurent Pinchart Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/mc/mc-entity.c | 38 ++++++++++++++++++++++++++++++ drivers/media/v4l2-core/v4l2-dev.c | 11 +++++++++ include/media/media-entity.h | 15 ++++++++++++ include/media/v4l2-dev.h | 14 +++++++++++ 4 files changed, 78 insertions(+) diff --git a/drivers/media/mc/mc-entity.c b/drivers/media/mc/mc-entity.c index 7fb97c6dc897a..ad153a426a36c 100644 --- a/drivers/media/mc/mc-entity.c +++ b/drivers/media/mc/mc-entity.c @@ -530,6 +530,8 @@ void __media_pipeline_stop(struct media_entity *entity) media_graph_walk_cleanup(graph); + if (pipe->allocated) + kfree(pipe); } EXPORT_SYMBOL_GPL(__media_pipeline_stop); @@ -543,6 +545,42 @@ void media_pipeline_stop(struct media_entity *entity) } EXPORT_SYMBOL_GPL(media_pipeline_stop); +__must_check int media_pipeline_alloc_start(struct media_entity *entity) +{ + struct media_device *mdev = entity->graph_obj.mdev; + struct media_pipeline *new_pipe = NULL; + struct media_pipeline *pipe; + int ret; + + mutex_lock(&mdev->graph_mutex); + + /* + * Is the entity already part of a pipeline? If not, we need to allocate + * a pipe. 
+ */ + pipe = media_entity_pipeline(entity); + if (!pipe) { + new_pipe = kzalloc(sizeof(*new_pipe), GFP_KERNEL); + if (!new_pipe) { + ret = -ENOMEM; + goto out; + } + + pipe = new_pipe; + pipe->allocated = true; + } + + ret = __media_pipeline_start(entity, pipe); + if (ret) + kfree(new_pipe); + +out: + mutex_unlock(&mdev->graph_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(media_pipeline_alloc_start); + /* ----------------------------------------------------------------------------- * Links management */ diff --git a/drivers/media/v4l2-core/v4l2-dev.c b/drivers/media/v4l2-core/v4l2-dev.c index 7f933ff89fd49..945bb867a4c19 100644 --- a/drivers/media/v4l2-core/v4l2-dev.c +++ b/drivers/media/v4l2-core/v4l2-dev.c @@ -1143,6 +1143,17 @@ void __video_device_pipeline_stop(struct video_device *vdev) } EXPORT_SYMBOL_GPL(__video_device_pipeline_stop); +__must_check int video_device_pipeline_alloc_start(struct video_device *vdev) +{ + struct media_entity *entity = &vdev->entity; + + if (entity->num_pads != 1) + return -ENODEV; + + return media_pipeline_alloc_start(entity); +} +EXPORT_SYMBOL_GPL(video_device_pipeline_alloc_start); + struct media_pipeline *video_device_pipeline(struct video_device *vdev) { struct media_entity *entity = &vdev->entity; diff --git a/include/media/media-entity.h b/include/media/media-entity.h index aaf276f765cf0..a77933afaa48f 100644 --- a/include/media/media-entity.h +++ b/include/media/media-entity.h @@ -100,10 +100,12 @@ struct media_graph { /** * struct media_pipeline - Media pipeline related information * + * @allocated: Media pipeline allocated and freed by the framework * @start_count: Media pipeline start - stop count * @graph: Media graph walk during pipeline start / stop */ struct media_pipeline { + bool allocated; int start_count; struct media_graph graph; }; @@ -1092,6 +1094,19 @@ void media_pipeline_stop(struct media_entity *entity); */ void __media_pipeline_stop(struct media_entity *entity); +/** + * media_pipeline_alloc_start - Mark 
a pipeline as streaming + * @entity: Starting entity + * + * media_pipeline_alloc_start() is similar to media_pipeline_start() but instead + * of working on a given pipeline the function will use an existing pipeline if + * the entity is already part of a pipeline, or allocate a new pipeline. + * + * Calls to media_pipeline_alloc_start() must be matched with + * media_pipeline_stop(). + */ +__must_check int media_pipeline_alloc_start(struct media_entity *entity); + /** * media_devnode_create() - creates and initializes a device node interface * diff --git a/include/media/v4l2-dev.h b/include/media/v4l2-dev.h index 4946858722540..643da0740ab06 100644 --- a/include/media/v4l2-dev.h +++ b/include/media/v4l2-dev.h @@ -607,6 +607,20 @@ void video_device_pipeline_stop(struct video_device *vdev); */ void __video_device_pipeline_stop(struct video_device *vdev); +/** + * video_device_pipeline_alloc_start - Mark a pipeline as streaming + * @vdev: Starting video device + * + * video_device_pipeline_alloc_start() is similar to video_device_pipeline_start() + * but instead of working on a given pipeline the function will use an + * existing pipeline if the video device is already part of a pipeline, or + * allocate a new pipeline. + * + * Calls to video_device_pipeline_alloc_start() must be matched with + * video_device_pipeline_stop(). + */ +__must_check int video_device_pipeline_alloc_start(struct video_device *vdev); + /** * video_device_pipeline - Get the media pipeline a video device is part of * @vdev: The video device -- GitLab From 6eaff06ad064f5182eda361c3615a0e10c032f74 Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Wed, 31 Aug 2022 16:13:36 +0200 Subject: [PATCH 0538/2223] media: drivers: use video_device_pipeline_alloc_start() Use video_device_pipeline_alloc_start() instead of manually allocating/managing the media pipeline storage. 
Signed-off-by: Tomi Valkeinen Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/renesas/rcar-vin/rcar-dma.c | 14 +------------- drivers/media/platform/sunxi/sun4i-csi/sun4i_dma.c | 2 +- .../media/platform/sunxi/sun6i-csi/sun6i_video.c | 2 +- drivers/media/platform/ti/cal/cal-video.c | 2 +- drivers/media/platform/ti/cal/cal.h | 1 - 5 files changed, 4 insertions(+), 17 deletions(-) diff --git a/drivers/media/platform/renesas/rcar-vin/rcar-dma.c b/drivers/media/platform/renesas/rcar-vin/rcar-dma.c index 879dd02bbb554..3aea96d85165a 100644 --- a/drivers/media/platform/renesas/rcar-vin/rcar-dma.c +++ b/drivers/media/platform/renesas/rcar-vin/rcar-dma.c @@ -1244,8 +1244,6 @@ static int rvin_mc_validate_format(struct rvin_dev *vin, struct v4l2_subdev *sd, static int rvin_set_stream(struct rvin_dev *vin, int on) { - struct media_pipeline *pipe; - struct media_device *mdev; struct v4l2_subdev *sd; struct media_pad *pad; int ret; @@ -1273,17 +1271,7 @@ static int rvin_set_stream(struct rvin_dev *vin, int on) if (ret) return ret; - /* - * The graph lock needs to be taken to protect concurrent - * starts of multiple VIN instances as they might share - * a common subdevice down the line and then should use - * the same pipe. - */ - mdev = vin->vdev.entity.graph_obj.mdev; - mutex_lock(&mdev->graph_mutex); - pipe = media_entity_pipeline(&sd->entity) ? 
: &vin->vdev.pipe; - ret = __video_device_pipeline_start(&vin->vdev, pipe); - mutex_unlock(&mdev->graph_mutex); + ret = video_device_pipeline_alloc_start(&vin->vdev); if (ret) return ret; diff --git a/drivers/media/platform/sunxi/sun4i-csi/sun4i_dma.c b/drivers/media/platform/sunxi/sun4i-csi/sun4i_dma.c index 17ad9a3caaa5f..a3e826a755fc3 100644 --- a/drivers/media/platform/sunxi/sun4i-csi/sun4i_dma.c +++ b/drivers/media/platform/sunxi/sun4i-csi/sun4i_dma.c @@ -266,7 +266,7 @@ static int sun4i_csi_start_streaming(struct vb2_queue *vq, unsigned int count) goto err_clear_dma_queue; } - ret = video_device_pipeline_start(&csi->vdev, &csi->vdev.pipe); + ret = video_device_pipeline_alloc_start(&csi->vdev); if (ret < 0) goto err_free_scratch_buffer; diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c index a6873fdb84388..da4b7f9557a10 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c @@ -141,7 +141,7 @@ static int sun6i_video_start_streaming(struct vb2_queue *vq, unsigned int count) video->sequence = 0; - ret = video_device_pipeline_start(&video->vdev, &video->vdev.pipe); + ret = video_device_pipeline_alloc_start(&video->vdev); if (ret < 0) goto clear_dma_queue; diff --git a/drivers/media/platform/ti/cal/cal-video.c b/drivers/media/platform/ti/cal/cal-video.c index 0ac54d7618e30..4eade409d5d36 100644 --- a/drivers/media/platform/ti/cal/cal-video.c +++ b/drivers/media/platform/ti/cal/cal-video.c @@ -708,7 +708,7 @@ static int cal_start_streaming(struct vb2_queue *vq, unsigned int count) dma_addr_t addr; int ret; - ret = video_device_pipeline_start(&ctx->vdev, &ctx->phy->pipe); + ret = video_device_pipeline_alloc_start(&ctx->vdev); if (ret < 0) { ctx_err(ctx, "Failed to start media pipeline: %d\n", ret); goto error_release_buffers; diff --git a/drivers/media/platform/ti/cal/cal.h b/drivers/media/platform/ti/cal/cal.h index 
80f2c9c73c719..de73d6d21b6f1 100644 --- a/drivers/media/platform/ti/cal/cal.h +++ b/drivers/media/platform/ti/cal/cal.h @@ -174,7 +174,6 @@ struct cal_camerarx { struct device_node *source_ep_node; struct device_node *source_node; struct v4l2_subdev *source; - struct media_pipeline pipe; struct v4l2_subdev subdev; struct media_pad pads[CAL_CAMERARX_NUM_PADS]; -- GitLab From ae219872834a32da88408a92a4b4745c11f5a7ce Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 31 Aug 2022 16:13:37 +0200 Subject: [PATCH 0539/2223] media: mc: entity: Rewrite media_pipeline_start() [Note: the code is mostly from Laurent but the patch description is from Tomi] The media_pipeline_start() and media_pipeline_stop() functions use the media graph walk API to traverse the graph and validate the pipeline. The graph walk traverses the media graph following links between the entities. Also, while the pipeline can't change between the start and stop calls, the graph is walked again from scratch at stop time, or any time a driver needs to inspect the pipeline. With the upcoming multiplexed streams support we will need a bit more intelligent pipeline construction, as e.g. two independent streams may be passing through a single entity via separate pads in which case those pads should not be part of the same pipeline. This patch essentially rewrites the media_pipeline_start/stop so that a pipeline is defined as a set of pads instead of entities and the media graph traversal considers the pad interdependencies when choosing which links to follow. Currently all the entity's pads are considered as interdependent. This means that the behavior with all the current drivers stays the same, but in the future we can define a more fine-grained pipeline construction. Additionally the media pipeline's pads are cached at media_pipeline_start() time, and re-used at media_pipeline_stop() which avoid the need to re-walk the whole graph as the previous implementation did. 
Also, caching pads in the pipeline can serve in the future as the foundation to provide a better API than the media graph walk to drivers to iterate over pads and entities in the pipeline. Note that the old media_pipeline_start/stop used the media graph walk API. The new version does not use the media graph walk API, but instead a new implementation. There are two reasons for not changing the graph walk: it proved to be rather difficult to change the graph walk to have the features implemented in this patch, and second, this keeps the backward compatibility of the graph walk as there are users of the graph walk API. The long term plan is that all the existing code would be converted to use the new cached pipeline, thus allowing us to remove the graph walk. Signed-off-by: Laurent Pinchart Signed-off-by: Tomi Valkeinen Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- Documentation/driver-api/media/mc-core.rst | 7 +- drivers/media/mc/mc-entity.c | 517 ++++++++++++++++++--- include/media/media-entity.h | 71 ++- 3 files changed, 509 insertions(+), 86 deletions(-) diff --git a/Documentation/driver-api/media/mc-core.rst b/Documentation/driver-api/media/mc-core.rst index 4bb062d5c2e77..400b8ca29367e 100644 --- a/Documentation/driver-api/media/mc-core.rst +++ b/Documentation/driver-api/media/mc-core.rst @@ -230,14 +230,13 @@ When starting streaming, drivers must notify all entities in the pipeline to prevent link states from being modified during streaming by calling :c:func:`media_pipeline_start()`. -The function will mark all entities connected to the given entity through -enabled links, either directly or indirectly, as streaming. +The function will mark all the pads which are part of the pipeline as streaming. The struct media_pipeline instance pointed to by -the pipe argument will be stored in every entity in the pipeline. +the pipe argument will be stored in every pad in the pipeline. 
Drivers should embed the struct media_pipeline in higher-level pipeline structures and can then access the -pipeline through the struct media_entity +pipeline through the struct media_pad pipe field. Calls to :c:func:`media_pipeline_start()` can be nested. diff --git a/drivers/media/mc/mc-entity.c b/drivers/media/mc/mc-entity.c index ad153a426a36c..0a5c92b8bbce2 100644 --- a/drivers/media/mc/mc-entity.c +++ b/drivers/media/mc/mc-entity.c @@ -226,6 +226,27 @@ EXPORT_SYMBOL_GPL(media_entity_pads_init); * Graph traversal */ +/* + * This function checks the interdependency inside the entity between @pad0 + * and @pad1. If two pads are interdependent they are part of the same pipeline + * and enabling one of the pads means that the other pad will become "locked" + * and doesn't allow configuration changes. + * + * For the time being all pads are considered interdependent. + */ +static bool media_entity_has_pad_interdep(struct media_entity *entity, + unsigned int pad0, unsigned int pad1) +{ + if (pad0 >= entity->num_pads || pad1 >= entity->num_pads) + return false; + + if (entity->pads[pad0].flags & entity->pads[pad1].flags & + (MEDIA_PAD_FL_SINK | MEDIA_PAD_FL_SOURCE)) + return false; + + return true; +} + static struct media_entity * media_entity_other(struct media_entity *entity, struct media_link *link) { @@ -374,97 +395,436 @@ EXPORT_SYMBOL_GPL(media_graph_walk_next); * Pipeline management */ +/* + * The pipeline traversal stack stores pads that are reached during graph + * traversal, with a list of links to be visited to continue the traversal. + * When a new pad is reached, an entry is pushed on the top of the stack and + * points to the incoming pad and the first link of the entity. + * + * To find further pads in the pipeline, the traversal algorithm follows + * internal pad dependencies in the entity, and then links in the graph. 
It + * does so by iterating over all links of the entity, and following enabled + * links that originate from a pad that is internally connected to the incoming + * pad, as reported by the media_entity_has_pad_interdep() function. + */ + +/** + * struct media_pipeline_walk_entry - Entry in the pipeline traversal stack + * + * @pad: The media pad being visited + * @links: Links left to be visited + */ +struct media_pipeline_walk_entry { + struct media_pad *pad; + struct list_head *links; +}; + +/** + * struct media_pipeline_walk - State used by the media pipeline traversal + * algorithm + * + * @mdev: The media device + * @stack: Depth-first search stack + * @stack.size: Number of allocated entries in @stack.entries + * @stack.top: Index of the top stack entry (-1 if the stack is empty) + * @stack.entries: Stack entries + */ +struct media_pipeline_walk { + struct media_device *mdev; + + struct { + unsigned int size; + int top; + struct media_pipeline_walk_entry *entries; + } stack; +}; + +#define MEDIA_PIPELINE_STACK_GROW_STEP 16 + +static struct media_pipeline_walk_entry * +media_pipeline_walk_top(struct media_pipeline_walk *walk) +{ + return &walk->stack.entries[walk->stack.top]; +} + +static bool media_pipeline_walk_empty(struct media_pipeline_walk *walk) +{ + return walk->stack.top == -1; +} + +/* Increase the stack size by MEDIA_PIPELINE_STACK_GROW_STEP elements. */ +static int media_pipeline_walk_resize(struct media_pipeline_walk *walk) +{ + struct media_pipeline_walk_entry *entries; + unsigned int new_size; + + /* Safety check, to avoid stack overflows in case of bugs. */ + if (walk->stack.size >= 256) + return -E2BIG; + + new_size = walk->stack.size + MEDIA_PIPELINE_STACK_GROW_STEP; + + entries = krealloc(walk->stack.entries, + new_size * sizeof(*walk->stack.entries), + GFP_KERNEL); + if (!entries) + return -ENOMEM; + + walk->stack.entries = entries; + walk->stack.size = new_size; + + return 0; +} + +/* Push a new entry on the stack. 
*/ +static int media_pipeline_walk_push(struct media_pipeline_walk *walk, + struct media_pad *pad) +{ + struct media_pipeline_walk_entry *entry; + int ret; + + if (walk->stack.top + 1 >= walk->stack.size) { + ret = media_pipeline_walk_resize(walk); + if (ret) + return ret; + } + + walk->stack.top++; + entry = media_pipeline_walk_top(walk); + entry->pad = pad; + entry->links = pad->entity->links.next; + + dev_dbg(walk->mdev->dev, + "media pipeline: pushed entry %u: '%s':%u\n", + walk->stack.top, pad->entity->name, pad->index); + + return 0; +} + +/* + * Move the top entry link cursor to the next link. If all links of the entry + * have been visited, pop the entry itself. + */ +static void media_pipeline_walk_pop(struct media_pipeline_walk *walk) +{ + struct media_pipeline_walk_entry *entry; + + if (WARN_ON(walk->stack.top < 0)) + return; + + entry = media_pipeline_walk_top(walk); + + if (entry->links->next == &entry->pad->entity->links) { + dev_dbg(walk->mdev->dev, + "media pipeline: entry %u has no more links, popping\n", + walk->stack.top); + + walk->stack.top--; + return; + } + + entry->links = entry->links->next; + + dev_dbg(walk->mdev->dev, + "media pipeline: moved entry %u to next link\n", + walk->stack.top); +} + +/* Free all memory allocated while walking the pipeline. */ +static void media_pipeline_walk_destroy(struct media_pipeline_walk *walk) +{ + kfree(walk->stack.entries); +} + +/* Add a pad to the pipeline and push it to the stack. 
*/ +static int media_pipeline_add_pad(struct media_pipeline *pipe, + struct media_pipeline_walk *walk, + struct media_pad *pad) +{ + struct media_pipeline_pad *ppad; + + list_for_each_entry(ppad, &pipe->pads, list) { + if (ppad->pad == pad) { + dev_dbg(pad->graph_obj.mdev->dev, + "media pipeline: already contains pad '%s':%u\n", + pad->entity->name, pad->index); + return 0; + } + } + + ppad = kzalloc(sizeof(*ppad), GFP_KERNEL); + if (!ppad) + return -ENOMEM; + + ppad->pipe = pipe; + ppad->pad = pad; + + list_add_tail(&ppad->list, &pipe->pads); + + dev_dbg(pad->graph_obj.mdev->dev, + "media pipeline: added pad '%s':%u\n", + pad->entity->name, pad->index); + + return media_pipeline_walk_push(walk, pad); +} + +/* Explore the next link of the entity at the top of the stack. */ +static int media_pipeline_explore_next_link(struct media_pipeline *pipe, + struct media_pipeline_walk *walk) +{ + struct media_pipeline_walk_entry *entry = media_pipeline_walk_top(walk); + struct media_pad *pad; + struct media_link *link; + struct media_pad *local; + struct media_pad *remote; + int ret; + + pad = entry->pad; + link = list_entry(entry->links, typeof(*link), list); + media_pipeline_walk_pop(walk); + + dev_dbg(walk->mdev->dev, + "media pipeline: exploring link '%s':%u -> '%s':%u\n", + link->source->entity->name, link->source->index, + link->sink->entity->name, link->sink->index); + + /* Skip links that are not enabled. */ + if (!(link->flags & MEDIA_LNK_FL_ENABLED)) { + dev_dbg(walk->mdev->dev, + "media pipeline: skipping link (disabled)\n"); + return 0; + } + + /* Get the local pad and remote pad. */ + if (link->source->entity == pad->entity) { + local = link->source; + remote = link->sink; + } else { + local = link->sink; + remote = link->source; + } + + /* + * Skip links that originate from a different pad than the incoming pad + * that is not connected internally in the entity to the incoming pad. 
+ */ + if (pad != local && + !media_entity_has_pad_interdep(pad->entity, pad->index, local->index)) { + dev_dbg(walk->mdev->dev, + "media pipeline: skipping link (no route)\n"); + return 0; + } + + /* + * Add the local and remote pads of the link to the pipeline and push + * them to the stack, if they're not already present. + */ + ret = media_pipeline_add_pad(pipe, walk, local); + if (ret) + return ret; + + ret = media_pipeline_add_pad(pipe, walk, remote); + if (ret) + return ret; + + return 0; +} + +static void media_pipeline_cleanup(struct media_pipeline *pipe) +{ + while (!list_empty(&pipe->pads)) { + struct media_pipeline_pad *ppad; + + ppad = list_first_entry(&pipe->pads, typeof(*ppad), list); + list_del(&ppad->list); + kfree(ppad); + } +} + +static int media_pipeline_populate(struct media_pipeline *pipe, + struct media_pad *pad) +{ + struct media_pipeline_walk walk = { }; + struct media_pipeline_pad *ppad; + int ret; + + /* + * Populate the media pipeline by walking the media graph, starting + * from @pad. + */ + INIT_LIST_HEAD(&pipe->pads); + pipe->mdev = pad->graph_obj.mdev; + + walk.mdev = pipe->mdev; + walk.stack.top = -1; + ret = media_pipeline_add_pad(pipe, &walk, pad); + if (ret) + goto done; + + /* + * Use a depth-first search algorithm: as long as the stack is not + * empty, explore the next link of the top entry. The + * media_pipeline_explore_next_link() function will either move to the + * next link, pop the entry if fully visited, or add new entries on + * top. 
+ */ + while (!media_pipeline_walk_empty(&walk)) { + ret = media_pipeline_explore_next_link(pipe, &walk); + if (ret) + goto done; + } + + dev_dbg(pad->graph_obj.mdev->dev, + "media pipeline populated, found pads:\n"); + + list_for_each_entry(ppad, &pipe->pads, list) + dev_dbg(pad->graph_obj.mdev->dev, "- '%s':%u\n", + ppad->pad->entity->name, ppad->pad->index); + + WARN_ON(walk.stack.top != -1); + + ret = 0; + +done: + media_pipeline_walk_destroy(&walk); + + if (ret) + media_pipeline_cleanup(pipe); + + return ret; +} + __must_check int __media_pipeline_start(struct media_entity *entity, struct media_pipeline *pipe) { struct media_device *mdev = entity->graph_obj.mdev; - struct media_graph *graph = &pipe->graph; - struct media_entity *entity_err = entity; - struct media_link *link; + struct media_pipeline_pad *err_ppad; + struct media_pipeline_pad *ppad; int ret; + lockdep_assert_held(&mdev->graph_mutex); + + /* + * media_pipeline_start(entity) only makes sense with entities that have + * a single pad. + */ + + if (WARN_ON(entity->num_pads != 1)) + return -EINVAL; + + /* + * If the entity is already part of a pipeline, that pipeline must + * be the same as the pipe given to media_pipeline_start(). + */ + if (WARN_ON(entity->pads->pipe && entity->pads->pipe != pipe)) + return -EINVAL; + + /* + * If the pipeline has already been started, it is guaranteed to be + * valid, so just increase the start count. + */ if (pipe->start_count) { pipe->start_count++; return 0; } - ret = media_graph_walk_init(&pipe->graph, mdev); + /* + * Populate the pipeline. This populates the media_pipeline pads list + * with media_pipeline_pad instances for each pad found during graph + * walk. + */ + ret = media_pipeline_populate(pipe, entity->pads); if (ret) return ret; - media_graph_walk_start(&pipe->graph, entity); + /* + * Now that all the pads in the pipeline have been gathered, perform + * the validation steps. 
+ */ + + list_for_each_entry(ppad, &pipe->pads, list) { + struct media_pad *pad = ppad->pad; + struct media_entity *entity = pad->entity; + bool has_enabled_link = false; + bool has_link = false; + struct media_link *link; - while ((entity = media_graph_walk_next(graph))) { - DECLARE_BITMAP(active, MEDIA_ENTITY_MAX_PADS); - DECLARE_BITMAP(has_no_links, MEDIA_ENTITY_MAX_PADS); + dev_dbg(mdev->dev, "Validating pad '%s':%u\n", pad->entity->name, + pad->index); - if (entity->pipe && entity->pipe != pipe) { - pr_err("Pipe active for %s. Can't start for %s\n", - entity->name, - entity_err->name); + /* + * 1. Ensure that the pad doesn't already belong to a different + * pipeline. + */ + if (pad->pipe) { + dev_dbg(mdev->dev, "Failed to start pipeline: pad '%s':%u busy\n", + pad->entity->name, pad->index); ret = -EBUSY; goto error; } - /* Already streaming --- no need to check. */ - if (entity->pipe) - continue; - - entity->pipe = pipe; - - if (!entity->ops || !entity->ops->link_validate) - continue; - - bitmap_zero(active, entity->num_pads); - bitmap_fill(has_no_links, entity->num_pads); - + /* + * 2. Validate all active links whose sink is the current pad. + * Validation of the source pads is performed in the context of + * the connected sink pad to avoid duplicating checks. + */ for_each_media_entity_data_link(entity, link) { - struct media_pad *pad = link->sink->entity == entity - ? link->sink : link->source; + /* Skip links unrelated to the current pad. */ + if (link->sink != pad && link->source != pad) + continue; - /* Mark that a pad is connected by a link. */ - bitmap_clear(has_no_links, pad->index, 1); + /* Record if the pad has links and enabled links. */ + if (link->flags & MEDIA_LNK_FL_ENABLED) + has_enabled_link = true; + has_link = true; /* - * Pads that either do not need to connect or - * are connected through an enabled link are - * fine. + * Validate the link if it's enabled and has the + * current pad as its sink. 
*/ - if (!(pad->flags & MEDIA_PAD_FL_MUST_CONNECT) || - link->flags & MEDIA_LNK_FL_ENABLED) - bitmap_set(active, pad->index, 1); + if (!(link->flags & MEDIA_LNK_FL_ENABLED)) + continue; - /* - * Link validation will only take place for - * sink ends of the link that are enabled. - */ - if (link->sink != pad || - !(link->flags & MEDIA_LNK_FL_ENABLED)) + if (link->sink != pad) + continue; + + if (!entity->ops || !entity->ops->link_validate) continue; ret = entity->ops->link_validate(link); - if (ret < 0 && ret != -ENOIOCTLCMD) { - dev_dbg(entity->graph_obj.mdev->dev, - "link validation failed for '%s':%u -> '%s':%u, error %d\n", + if (ret) { + dev_dbg(mdev->dev, + "Link '%s':%u -> '%s':%u failed validation: %d\n", link->source->entity->name, link->source->index, - entity->name, link->sink->index, ret); + link->sink->entity->name, + link->sink->index, ret); goto error; } - } - /* Either no links or validated links are fine. */ - bitmap_or(active, active, has_no_links, entity->num_pads); + dev_dbg(mdev->dev, + "Link '%s':%u -> '%s':%u is valid\n", + link->source->entity->name, + link->source->index, + link->sink->entity->name, + link->sink->index); + } - if (!bitmap_full(active, entity->num_pads)) { + /* + * 3. If the pad has the MEDIA_PAD_FL_MUST_CONNECT flag set, + * ensure that it has either no link or an enabled link. + */ + if ((pad->flags & MEDIA_PAD_FL_MUST_CONNECT) && has_link && + !has_enabled_link) { + dev_dbg(mdev->dev, + "Pad '%s':%u must be connected by an enabled link\n", + pad->entity->name, pad->index); ret = -ENOLINK; - dev_dbg(entity->graph_obj.mdev->dev, - "'%s':%u must be connected by an enabled link\n", - entity->name, - (unsigned)find_first_zero_bit( - active, entity->num_pads)); goto error; } + + /* Validation passed, store the pipe pointer in the pad. */ + pad->pipe = pipe; } pipe->start_count++; @@ -476,20 +836,15 @@ error: * Link validation on graph failed. We revert what we did and * return the error. 
*/ - media_graph_walk_start(graph, entity_err); - - while ((entity_err = media_graph_walk_next(graph))) { - entity_err->pipe = NULL; - /* - * We haven't started entities further than this so we quit - * here. - */ - if (entity_err == entity) + list_for_each_entry(err_ppad, &pipe->pads, list) { + if (err_ppad == ppad) break; + + err_ppad->pad->pipe = NULL; } - media_graph_walk_cleanup(graph); + media_pipeline_cleanup(pipe); return ret; } @@ -510,8 +865,8 @@ EXPORT_SYMBOL_GPL(media_pipeline_start); void __media_pipeline_stop(struct media_entity *entity) { - struct media_graph *graph = &entity->pipe->graph; - struct media_pipeline *pipe = entity->pipe; + struct media_pipeline *pipe = entity->pads->pipe; + struct media_pipeline_pad *ppad; /* * If the following check fails, the driver has performed an @@ -523,12 +878,10 @@ void __media_pipeline_stop(struct media_entity *entity) if (--pipe->start_count) return; - media_graph_walk_start(graph, entity); + list_for_each_entry(ppad, &pipe->pads, list) + ppad->pad->pipe = NULL; - while ((entity = media_graph_walk_next(graph))) - entity->pipe = NULL; - - media_graph_walk_cleanup(graph); + media_pipeline_cleanup(pipe); if (pipe->allocated) kfree(pipe); @@ -835,7 +1188,7 @@ int __media_entity_setup_link(struct media_link *link, u32 flags) { const u32 mask = MEDIA_LNK_FL_ENABLED; struct media_device *mdev; - struct media_entity *source, *sink; + struct media_pad *source, *sink; int ret = -EBUSY; if (link == NULL) @@ -851,12 +1204,11 @@ int __media_entity_setup_link(struct media_link *link, u32 flags) if (link->flags == flags) return 0; - source = link->source->entity; - sink = link->sink->entity; + source = link->source; + sink = link->sink; if (!(link->flags & MEDIA_LNK_FL_DYNAMIC) && - (media_entity_is_streaming(source) || - media_entity_is_streaming(sink))) + (media_pad_is_streaming(source) || media_pad_is_streaming(sink))) return -EBUSY; mdev = source->graph_obj.mdev; @@ -1034,10 +1386,23 @@ 
EXPORT_SYMBOL_GPL(media_entity_get_fwnode_pad); struct media_pipeline *media_entity_pipeline(struct media_entity *entity) { - return entity->pipe; + struct media_pad *pad; + + media_entity_for_each_pad(entity, pad) { + if (pad->pipe) + return pad->pipe; + } + + return NULL; } EXPORT_SYMBOL_GPL(media_entity_pipeline); +struct media_pipeline *media_pad_pipeline(struct media_pad *pad) +{ + return pad->pipe; +} +EXPORT_SYMBOL_GPL(media_pad_pipeline); + static void media_interface_init(struct media_device *mdev, struct media_interface *intf, u32 gobj_type, diff --git a/include/media/media-entity.h b/include/media/media-entity.h index a77933afaa48f..00990b20b3d5a 100644 --- a/include/media/media-entity.h +++ b/include/media/media-entity.h @@ -101,13 +101,33 @@ struct media_graph { * struct media_pipeline - Media pipeline related information * * @allocated: Media pipeline allocated and freed by the framework + * @mdev: The media device the pipeline is part of + * @pads: List of media_pipeline_pad * @start_count: Media pipeline start - stop count - * @graph: Media graph walk during pipeline start / stop */ struct media_pipeline { bool allocated; + struct media_device *mdev; + struct list_head pads; int start_count; - struct media_graph graph; +}; + +/** + * struct media_pipeline_pad - A pad part of a media pipeline + * + * @list: Entry in the media_pipeline pads list + * @pipe: The media_pipeline that the pad is part of + * @pad: The media pad + * + * This structure associates a pad with a media pipeline. Instances of + * media_pipeline_pad are created by media_pipeline_start() when it builds the + * pipeline, and stored in the &media_pipeline.pads list. media_pipeline_stop() + * removes the entries from the list and deletes them. 
+ */ +struct media_pipeline_pad { + struct list_head list; + struct media_pipeline *pipe; + struct media_pad *pad; }; /** @@ -189,6 +209,8 @@ enum media_pad_signal_type { * @flags: Pad flags, as defined in * :ref:`include/uapi/linux/media.h ` * (seek for ``MEDIA_PAD_FL_*``) + * @pipe: Pipeline this pad belongs to. Use media_pad_pipeline() to + * access this field. */ struct media_pad { struct media_gobj graph_obj; /* must be first field in struct */ @@ -196,6 +218,12 @@ struct media_pad { u16 index; enum media_pad_signal_type sig_type; unsigned long flags; + + /* + * The fields below are private, and should only be accessed via + * appropriate functions. + */ + struct media_pipeline *pipe; }; /** @@ -272,7 +300,6 @@ enum media_entity_type { * @links: List of data links. * @ops: Entity operations. * @use_count: Use count for the entity. - * @pipe: Pipeline this entity belongs to. * @info: Union with devnode information. Kept just for backward * compatibility. * @info.dev: Contains device major and minor info. @@ -308,8 +335,6 @@ struct media_entity { int use_count; - struct media_pipeline *pipe; - union { struct { u32 major; @@ -938,6 +963,18 @@ media_entity_remote_source_pad_unique(const struct media_entity *entity) return media_entity_remote_pad_unique(entity, MEDIA_PAD_FL_SOURCE); } +/** + * media_pad_is_streaming - Test if a pad is part of a streaming pipeline + * @pad: The pad + * + * Return: True if the pad is part of a pipeline started with the + * media_pipeline_start() function, false otherwise. 
+ */ +static inline bool media_pad_is_streaming(const struct media_pad *pad) +{ + return pad->pipe; +} + /** * media_entity_is_streaming - Test if an entity is part of a streaming pipeline * @entity: The entity @@ -947,13 +984,22 @@ media_entity_remote_source_pad_unique(const struct media_entity *entity) */ static inline bool media_entity_is_streaming(const struct media_entity *entity) { - return entity->pipe; + struct media_pad *pad; + + media_entity_for_each_pad(entity, pad) { + if (media_pad_is_streaming(pad)) + return true; + } + + return false; } /** * media_entity_pipeline - Get the media pipeline an entity is part of * @entity: The entity * + * DEPRECATED: use media_pad_pipeline() instead. + * * This function returns the media pipeline that an entity has been associated * with when constructing the pipeline with media_pipeline_start(). The pointer * remains valid until media_pipeline_stop() is called. @@ -968,6 +1014,19 @@ static inline bool media_entity_is_streaming(const struct media_entity *entity) */ struct media_pipeline *media_entity_pipeline(struct media_entity *entity); +/** + * media_pad_pipeline - Get the media pipeline a pad is part of + * @pad: The pad + * + * This function returns the media pipeline that a pad has been associated + * with when constructing the pipeline with media_pipeline_start(). The pointer + * remains valid until media_pipeline_stop() is called. + * + * Return: The media_pipeline the pad is part of, or NULL if the pad is + * not part of any pipeline. + */ +struct media_pipeline *media_pad_pipeline(struct media_pad *pad); + /** * media_entity_get_fwnode_pad - Get pad number from fwnode * -- GitLab From 5b4f9a727532ff9732ffc1bceb2017260b81a0ff Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Wed, 31 Aug 2022 16:13:38 +0200 Subject: [PATCH 0540/2223] media: mc: entity: Add has_pad_interdep entity operation Add a new media entity operation, has_pad_interdep. 
The optional op is used to discover the pad interdependencies inside an entity during the pipeline construction. Signed-off-by: Tomi Valkeinen Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/mc/mc-entity.c | 10 ++++++++-- include/media/media-entity.h | 10 ++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/media/mc/mc-entity.c b/drivers/media/mc/mc-entity.c index 0a5c92b8bbce2..831076b368476 100644 --- a/drivers/media/mc/mc-entity.c +++ b/drivers/media/mc/mc-entity.c @@ -232,7 +232,10 @@ EXPORT_SYMBOL_GPL(media_entity_pads_init); * and enabling one of the pads means that the other pad will become "locked" * and doesn't allow configuration changes. * - * For the time being all pads are considered interdependent. + * This function uses the &media_entity_operations.has_pad_interdep() operation + * to check the dependency inside the entity between @pad0 and @pad1. If the + * has_pad_interdep operation is not implemented, all pads of the entity are + * considered to be interdependent. */ static bool media_entity_has_pad_interdep(struct media_entity *entity, unsigned int pad0, unsigned int pad1) @@ -244,7 +247,10 @@ static bool media_entity_has_pad_interdep(struct media_entity *entity, (MEDIA_PAD_FL_SINK | MEDIA_PAD_FL_SOURCE)) return false; - return true; + if (!entity->ops || !entity->ops->has_pad_interdep) + return true; + + return entity->ops->has_pad_interdep(entity, pad0, pad1); } static struct media_entity * diff --git a/include/media/media-entity.h b/include/media/media-entity.h index 00990b20b3d5a..8e9fd309aa653 100644 --- a/include/media/media-entity.h +++ b/include/media/media-entity.h @@ -237,6 +237,14 @@ struct media_pad { * @link_validate: Return whether a link is valid from the entity point of * view. The media_pipeline_start() function * validates all links by calling this operation. Optional. + * @has_pad_interdep: Return whether two pads inside the entity are + * interdependent. 
If two pads are interdependent they are + * part of the same pipeline and enabling one of the pads + * means that the other pad will become "locked" and + * doesn't allow configuration changes. pad0 and pad1 are + * guaranteed to not both be sinks or sources. + * Optional: If the operation isn't implemented all pads + * will be considered as interdependent. * * .. note:: * @@ -250,6 +258,8 @@ struct media_entity_operations { const struct media_pad *local, const struct media_pad *remote, u32 flags); int (*link_validate)(struct media_link *link); + bool (*has_pad_interdep)(struct media_entity *entity, unsigned int pad0, + unsigned int pad1); }; /** -- GitLab From 9e3576a1ae2bb67c4d09d5e6c002fb793c300b58 Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Wed, 31 Aug 2022 16:13:39 +0200 Subject: [PATCH 0541/2223] media: mc: convert pipeline funcs to take media_pad Now that the pipeline is stored into pads instead of entities, we can change the relevant functions to take pads instead of entities. 
Signed-off-by: Tomi Valkeinen Reviewed-by: Laurent Pinchart Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/mc/mc-entity.c | 40 ++++++++----------- .../samsung/s3c-camif/camif-capture.c | 6 +-- drivers/media/usb/au0828/au0828-core.c | 8 ++-- drivers/media/v4l2-core/v4l2-dev.c | 12 +++--- drivers/staging/media/imx/imx-media-utils.c | 8 ++-- include/media/media-entity.h | 34 ++++++++-------- include/media/v4l2-dev.h | 4 +- 7 files changed, 52 insertions(+), 60 deletions(-) diff --git a/drivers/media/mc/mc-entity.c b/drivers/media/mc/mc-entity.c index 831076b368476..b8bcbc734eaf4 100644 --- a/drivers/media/mc/mc-entity.c +++ b/drivers/media/mc/mc-entity.c @@ -700,29 +700,21 @@ done: return ret; } -__must_check int __media_pipeline_start(struct media_entity *entity, +__must_check int __media_pipeline_start(struct media_pad *pad, struct media_pipeline *pipe) { - struct media_device *mdev = entity->graph_obj.mdev; + struct media_device *mdev = pad->entity->graph_obj.mdev; struct media_pipeline_pad *err_ppad; struct media_pipeline_pad *ppad; int ret; lockdep_assert_held(&mdev->graph_mutex); - /* - * media_pipeline_start(entity) only makes sense with entities that have - * a single pad. - */ - - if (WARN_ON(entity->num_pads != 1)) - return -EINVAL; - /* * If the entity is already part of a pipeline, that pipeline must * be the same as the pipe given to media_pipeline_start(). */ - if (WARN_ON(entity->pads->pipe && entity->pads->pipe != pipe)) + if (WARN_ON(pad->pipe && pad->pipe != pipe)) return -EINVAL; /* @@ -739,7 +731,7 @@ __must_check int __media_pipeline_start(struct media_entity *entity, * with media_pipeline_pad instances for each pad found during graph * walk. 
*/ - ret = media_pipeline_populate(pipe, entity->pads); + ret = media_pipeline_populate(pipe, pad); if (ret) return ret; @@ -856,22 +848,22 @@ error: } EXPORT_SYMBOL_GPL(__media_pipeline_start); -__must_check int media_pipeline_start(struct media_entity *entity, +__must_check int media_pipeline_start(struct media_pad *pad, struct media_pipeline *pipe) { - struct media_device *mdev = entity->graph_obj.mdev; + struct media_device *mdev = pad->entity->graph_obj.mdev; int ret; mutex_lock(&mdev->graph_mutex); - ret = __media_pipeline_start(entity, pipe); + ret = __media_pipeline_start(pad, pipe); mutex_unlock(&mdev->graph_mutex); return ret; } EXPORT_SYMBOL_GPL(media_pipeline_start); -void __media_pipeline_stop(struct media_entity *entity) +void __media_pipeline_stop(struct media_pad *pad) { - struct media_pipeline *pipe = entity->pads->pipe; + struct media_pipeline *pipe = pad->pipe; struct media_pipeline_pad *ppad; /* @@ -894,19 +886,19 @@ void __media_pipeline_stop(struct media_entity *entity) } EXPORT_SYMBOL_GPL(__media_pipeline_stop); -void media_pipeline_stop(struct media_entity *entity) +void media_pipeline_stop(struct media_pad *pad) { - struct media_device *mdev = entity->graph_obj.mdev; + struct media_device *mdev = pad->entity->graph_obj.mdev; mutex_lock(&mdev->graph_mutex); - __media_pipeline_stop(entity); + __media_pipeline_stop(pad); mutex_unlock(&mdev->graph_mutex); } EXPORT_SYMBOL_GPL(media_pipeline_stop); -__must_check int media_pipeline_alloc_start(struct media_entity *entity) +__must_check int media_pipeline_alloc_start(struct media_pad *pad) { - struct media_device *mdev = entity->graph_obj.mdev; + struct media_device *mdev = pad->entity->graph_obj.mdev; struct media_pipeline *new_pipe = NULL; struct media_pipeline *pipe; int ret; @@ -917,7 +909,7 @@ __must_check int media_pipeline_alloc_start(struct media_entity *entity) * Is the entity already part of a pipeline? If not, we need to allocate * a pipe. 
*/ - pipe = media_entity_pipeline(entity); + pipe = media_pad_pipeline(pad); if (!pipe) { new_pipe = kzalloc(sizeof(*new_pipe), GFP_KERNEL); if (!new_pipe) { @@ -929,7 +921,7 @@ __must_check int media_pipeline_alloc_start(struct media_entity *entity) pipe->allocated = true; } - ret = __media_pipeline_start(entity, pipe); + ret = __media_pipeline_start(pad, pipe); if (ret) kfree(new_pipe); diff --git a/drivers/media/platform/samsung/s3c-camif/camif-capture.c b/drivers/media/platform/samsung/s3c-camif/camif-capture.c index c2d8f1e425d87..db106ebdf870a 100644 --- a/drivers/media/platform/samsung/s3c-camif/camif-capture.c +++ b/drivers/media/platform/samsung/s3c-camif/camif-capture.c @@ -848,13 +848,13 @@ static int s3c_camif_streamon(struct file *file, void *priv, if (s3c_vp_active(vp)) return 0; - ret = media_pipeline_start(sensor, camif->m_pipeline); + ret = media_pipeline_start(sensor->pads, camif->m_pipeline); if (ret < 0) return ret; ret = camif_pipeline_validate(camif); if (ret < 0) { - media_pipeline_stop(sensor); + media_pipeline_stop(sensor->pads); return ret; } @@ -878,7 +878,7 @@ static int s3c_camif_streamoff(struct file *file, void *priv, ret = vb2_streamoff(&vp->vb_queue, type); if (ret == 0) - media_pipeline_stop(&camif->sensor.sd->entity); + media_pipeline_stop(camif->sensor.sd->entity.pads); return ret; } diff --git a/drivers/media/usb/au0828/au0828-core.c b/drivers/media/usb/au0828/au0828-core.c index caefac07af927..877e85a451cbe 100644 --- a/drivers/media/usb/au0828/au0828-core.c +++ b/drivers/media/usb/au0828/au0828-core.c @@ -410,7 +410,7 @@ static int au0828_enable_source(struct media_entity *entity, goto end; } - ret = __media_pipeline_start(entity, pipe); + ret = __media_pipeline_start(entity->pads, pipe); if (ret) { pr_err("Start Pipeline: %s->%s Error %d\n", source->name, entity->name, ret); @@ -501,12 +501,12 @@ static void au0828_disable_source(struct media_entity *entity) return; /* stop pipeline */ - 
__media_pipeline_stop(dev->active_link_owner); + __media_pipeline_stop(dev->active_link_owner->pads); pr_debug("Pipeline stop for %s\n", dev->active_link_owner->name); ret = __media_pipeline_start( - dev->active_link_user, + dev->active_link_user->pads, dev->active_link_user_pipe); if (ret) { pr_err("Start Pipeline: %s->%s %d\n", @@ -532,7 +532,7 @@ static void au0828_disable_source(struct media_entity *entity) return; /* stop pipeline */ - __media_pipeline_stop(dev->active_link_owner); + __media_pipeline_stop(dev->active_link_owner->pads); pr_debug("Pipeline stop for %s\n", dev->active_link_owner->name); diff --git a/drivers/media/v4l2-core/v4l2-dev.c b/drivers/media/v4l2-core/v4l2-dev.c index 945bb867a4c19..397d553177fa7 100644 --- a/drivers/media/v4l2-core/v4l2-dev.c +++ b/drivers/media/v4l2-core/v4l2-dev.c @@ -1105,7 +1105,7 @@ __must_check int video_device_pipeline_start(struct video_device *vdev, if (entity->num_pads != 1) return -ENODEV; - return media_pipeline_start(entity, pipe); + return media_pipeline_start(&entity->pads[0], pipe); } EXPORT_SYMBOL_GPL(video_device_pipeline_start); @@ -1117,7 +1117,7 @@ __must_check int __video_device_pipeline_start(struct video_device *vdev, if (entity->num_pads != 1) return -ENODEV; - return __media_pipeline_start(entity, pipe); + return __media_pipeline_start(&entity->pads[0], pipe); } EXPORT_SYMBOL_GPL(__video_device_pipeline_start); @@ -1128,7 +1128,7 @@ void video_device_pipeline_stop(struct video_device *vdev) if (WARN_ON(entity->num_pads != 1)) return; - return media_pipeline_stop(entity); + return media_pipeline_stop(&entity->pads[0]); } EXPORT_SYMBOL_GPL(video_device_pipeline_stop); @@ -1139,7 +1139,7 @@ void __video_device_pipeline_stop(struct video_device *vdev) if (WARN_ON(entity->num_pads != 1)) return; - return __media_pipeline_stop(entity); + return __media_pipeline_stop(&entity->pads[0]); } EXPORT_SYMBOL_GPL(__video_device_pipeline_stop); @@ -1150,7 +1150,7 @@ __must_check int 
video_device_pipeline_alloc_start(struct video_device *vdev) if (entity->num_pads != 1) return -ENODEV; - return media_pipeline_alloc_start(entity); + return media_pipeline_alloc_start(&entity->pads[0]); } EXPORT_SYMBOL_GPL(video_device_pipeline_alloc_start); @@ -1161,7 +1161,7 @@ struct media_pipeline *video_device_pipeline(struct video_device *vdev) if (WARN_ON(entity->num_pads != 1)) return NULL; - return media_entity_pipeline(entity); + return media_pad_pipeline(&entity->pads[0]); } EXPORT_SYMBOL_GPL(video_device_pipeline); diff --git a/drivers/staging/media/imx/imx-media-utils.c b/drivers/staging/media/imx/imx-media-utils.c index e9a3c6d2c66fb..3e7462112649d 100644 --- a/drivers/staging/media/imx/imx-media-utils.c +++ b/drivers/staging/media/imx/imx-media-utils.c @@ -863,16 +863,16 @@ int imx_media_pipeline_set_stream(struct imx_media_dev *imxmd, mutex_lock(&imxmd->md.graph_mutex); if (on) { - ret = __media_pipeline_start(entity, &imxmd->pipe); + ret = __media_pipeline_start(entity->pads, &imxmd->pipe); if (ret) goto out; ret = v4l2_subdev_call(sd, video, s_stream, 1); if (ret) - __media_pipeline_stop(entity); + __media_pipeline_stop(entity->pads); } else { v4l2_subdev_call(sd, video, s_stream, 0); - if (media_entity_pipeline(entity)) - __media_pipeline_stop(entity); + if (media_pad_pipeline(entity->pads)) + __media_pipeline_stop(entity->pads); } out: diff --git a/include/media/media-entity.h b/include/media/media-entity.h index 8e9fd309aa653..28c9de8a1f348 100644 --- a/include/media/media-entity.h +++ b/include/media/media-entity.h @@ -1115,66 +1115,66 @@ struct media_entity *media_graph_walk_next(struct media_graph *graph); /** * media_pipeline_start - Mark a pipeline as streaming - * @entity: Starting entity - * @pipe: Media pipeline to be assigned to all entities in the pipeline. + * @pad: Starting pad + * @pipe: Media pipeline to be assigned to all pads in the pipeline. 
* - * Mark all entities connected to a given entity through enabled links, either + * Mark all pads connected to a given pad through enabled links, either * directly or indirectly, as streaming. The given pipeline object is assigned - * to every entity in the pipeline and stored in the media_entity pipe field. + * to every pad in the pipeline and stored in the media_pad pipe field. * * Calls to this function can be nested, in which case the same number of * media_pipeline_stop() calls will be required to stop streaming. The * pipeline pointer must be identical for all nested calls to * media_pipeline_start(). */ -__must_check int media_pipeline_start(struct media_entity *entity, +__must_check int media_pipeline_start(struct media_pad *pad, struct media_pipeline *pipe); /** * __media_pipeline_start - Mark a pipeline as streaming * - * @entity: Starting entity - * @pipe: Media pipeline to be assigned to all entities in the pipeline. + * @pad: Starting pad + * @pipe: Media pipeline to be assigned to all pads in the pipeline. * * ..note:: This is the non-locking version of media_pipeline_start() */ -__must_check int __media_pipeline_start(struct media_entity *entity, +__must_check int __media_pipeline_start(struct media_pad *pad, struct media_pipeline *pipe); /** * media_pipeline_stop - Mark a pipeline as not streaming - * @entity: Starting entity + * @pad: Starting pad * - * Mark all entities connected to a given entity through enabled links, either - * directly or indirectly, as not streaming. The media_entity pipe field is + * Mark all pads connected to a given pad through enabled links, either + * directly or indirectly, as not streaming. The media_pad pipe field is * reset to %NULL. * * If multiple calls to media_pipeline_start() have been made, the same * number of calls to this function are required to mark the pipeline as not * streaming.
*/ -void media_pipeline_stop(struct media_entity *entity); +void media_pipeline_stop(struct media_pad *pad); /** * __media_pipeline_stop - Mark a pipeline as not streaming * - * @entity: Starting entity + * @pad: Starting pad * * .. note:: This is the non-locking version of media_pipeline_stop() */ -void __media_pipeline_stop(struct media_entity *entity); +void __media_pipeline_stop(struct media_pad *pad); /** * media_pipeline_alloc_start - Mark a pipeline as streaming - * @entity: Starting entity + * @pad: Starting pad * * media_pipeline_alloc_start() is similar to media_pipeline_start() but instead * of working on a given pipeline the function will use an existing pipeline if - * the entity is already part of a pipeline, or allocate a new pipeline. + * the pad is already part of a pipeline, or allocate a new pipeline. * * Calls to media_pipeline_alloc_start() must be matched with * media_pipeline_stop(). */ -__must_check int media_pipeline_alloc_start(struct media_entity *entity); +__must_check int media_pipeline_alloc_start(struct media_pad *pad); /** * media_devnode_create() - creates and initializes a device node interface diff --git a/include/media/v4l2-dev.h b/include/media/v4l2-dev.h index 643da0740ab06..e0a13505f88da 100644 --- a/include/media/v4l2-dev.h +++ b/include/media/v4l2-dev.h @@ -548,7 +548,7 @@ static inline int video_is_registered(struct video_device *vdev) * * Mark all entities connected to a given video device through enabled links, * either directly or indirectly, as streaming. The given pipeline object is - * assigned to every entity in the pipeline and stored in the media_entity pipe + * assigned to every pad in the pipeline and stored in the media_pad pipe * field. 
* * Calls to this function can be nested, in which case the same number of @@ -582,7 +582,7 @@ __must_check int __video_device_pipeline_start(struct video_device *vdev, * @vdev: Starting video device * * Mark all entities connected to a given video device through enabled links, - * either directly or indirectly, as not streaming. The media_entity pipe field + * either directly or indirectly, as not streaming. The media_pad pipe field * is reset to %NULL. * * If multiple calls to media_pipeline_start() have been made, the same -- GitLab From 2a96b40f366abb9c07e4a5cf89434a5f45f0f1e7 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 1 Sep 2022 09:14:37 +0200 Subject: [PATCH 0542/2223] media: dt-bindings: dongwoon,dw9714: convert to dtschema Convert Dongwoon Anatech DW9714 camera voice coil lens driver to DT schema and extend the bindings with vcc-supply (already used by driver) and powerdown-gpios (based on datasheet, not used by the driver). Signed-off-by: Krzysztof Kozlowski Reviewed-by: Rob Herring Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- .../bindings/media/i2c/dongwoon,dw9714.txt | 9 ---- .../bindings/media/i2c/dongwoon,dw9714.yaml | 47 +++++++++++++++++++ MAINTAINERS | 2 +- 3 files changed, 48 insertions(+), 10 deletions(-) delete mode 100644 Documentation/devicetree/bindings/media/i2c/dongwoon,dw9714.txt create mode 100644 Documentation/devicetree/bindings/media/i2c/dongwoon,dw9714.yaml diff --git a/Documentation/devicetree/bindings/media/i2c/dongwoon,dw9714.txt b/Documentation/devicetree/bindings/media/i2c/dongwoon,dw9714.txt deleted file mode 100644 index b88dcdd41def0..0000000000000 --- a/Documentation/devicetree/bindings/media/i2c/dongwoon,dw9714.txt +++ /dev/null @@ -1,9 +0,0 @@ -Dongwoon Anatech DW9714 camera voice coil lens driver - -DW9174 is a 10-bit DAC with current sink capability. It is intended -for driving voice coil lenses in camera modules. 
- - Mandatory properties: - -- compatible: "dongwoon,dw9714" -- reg: I²C slave address diff --git a/Documentation/devicetree/bindings/media/i2c/dongwoon,dw9714.yaml b/Documentation/devicetree/bindings/media/i2c/dongwoon,dw9714.yaml new file mode 100644 index 0000000000000..66229a3dc05d6 --- /dev/null +++ b/Documentation/devicetree/bindings/media/i2c/dongwoon,dw9714.yaml @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/media/i2c/dongwoon,dw9714.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Dongwoon Anatech DW9714 camera voice coil lens driver + +maintainers: + - Krzysztof Kozlowski + +description: + DW9714 is a 10-bit DAC with current sink capability. It is intended for + driving voice coil lenses in camera modules. + +properties: + compatible: + const: dongwoon,dw9714 + + reg: + maxItems: 1 + + powerdown-gpios: + description: + XSD pin for shutdown (active low) + + vcc-supply: + description: VDD power supply + +required: + - compatible + - reg + +additionalProperties: false + +examples: + - | + i2c { + #address-cells = <1>; + #size-cells = <0>; + + camera-lens@c { + compatible = "dongwoon,dw9714"; + reg = <0x0c>; + vcc-supply = <&reg_csi_1v8>; + }; + }; diff --git a/MAINTAINERS b/MAINTAINERS index 5135aa7d713cc..d0f844857447e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6244,7 +6244,7 @@ M: Sakari Ailus L: linux-media@vger.kernel.org S: Maintained T: git git://linuxtv.org/media_tree.git -F: Documentation/devicetree/bindings/media/i2c/dongwoon,dw9714.txt +F: Documentation/devicetree/bindings/media/i2c/dongwoon,dw9714.yaml F: drivers/media/i2c/dw9714.c DONGWOON DW9768 LENS VOICE COIL DRIVER -- GitLab From b53ad42566e0b31e295233cdc556045946929d16 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 26 Jun 2022 18:33:00 +0200 Subject: [PATCH 0543/2223] media: dt-bindings: media: samsung,exynos5250-gsc: convert to dtschema Convert the Samsung Exynos SoC G-Scaler
bindings to DT schema. Changes done during conversion: 1. A typical (already used) properties like clocks, iommus and power-domains. 2. Require clocks, because they are essential for the block to operate. 3. Describe the differences in clocks between the Exynos5250/5420 and the Exynos5433 G-Scalers. This includes the fifth Exynos5433 clock "gsd" (GSCL Smart Deck) which was added to the DTS, but not to the bindings and Linux driver. Similarly to Exynos5433 DECON change [1], the clock should be used. [1] https://lore.kernel.org/all/6270db2d-667d-8d6f-9289-be92da486c25@samsung.com/ Signed-off-by: Krzysztof Kozlowski Reviewed-by: Rob Herring Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- .../devicetree/bindings/media/exynos5-gsc.txt | 38 ------ .../media/samsung,exynos5250-gsc.yaml | 109 ++++++++++++++++++ 2 files changed, 109 insertions(+), 38 deletions(-) delete mode 100644 Documentation/devicetree/bindings/media/exynos5-gsc.txt create mode 100644 Documentation/devicetree/bindings/media/samsung,exynos5250-gsc.yaml diff --git a/Documentation/devicetree/bindings/media/exynos5-gsc.txt b/Documentation/devicetree/bindings/media/exynos5-gsc.txt deleted file mode 100644 index 1872688fa4089..0000000000000 --- a/Documentation/devicetree/bindings/media/exynos5-gsc.txt +++ /dev/null @@ -1,38 +0,0 @@ -* Samsung Exynos5 G-Scaler device - -G-Scaler is used for scaling and color space conversion on Exynos5 SoCs. - -Required properties: -- compatible: should be one of - "samsung,exynos5250-gsc" - "samsung,exynos5420-gsc" - "samsung,exynos5433-gsc" - "samsung,exynos5-gsc" (deprecated) -- reg: should contain G-Scaler physical address location and length. 
-- interrupts: should contain G-Scaler interrupt number - -Optional properties: -- samsung,sysreg: handle to syscon used to control the system registers to - set writeback input and destination - -Example: - -gsc_0: gsc@13e00000 { - compatible = "samsung,exynos5250-gsc"; - reg = <0x13e00000 0x1000>; - interrupts = <0 85 0>; -}; - -Aliases: -Each G-Scaler node should have a numbered alias in the aliases node, -in the form of gscN, N = 0...3. G-Scaler driver uses these aliases -to retrieve the device IDs using "of_alias_get_id()" call. - -Example: - -aliases { - gsc0 =&gsc_0; - gsc1 =&gsc_1; - gsc2 =&gsc_2; - gsc3 =&gsc_3; -}; diff --git a/Documentation/devicetree/bindings/media/samsung,exynos5250-gsc.yaml b/Documentation/devicetree/bindings/media/samsung,exynos5250-gsc.yaml new file mode 100644 index 0000000000000..878397830a4dd --- /dev/null +++ b/Documentation/devicetree/bindings/media/samsung,exynos5250-gsc.yaml @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/media/samsung,exynos5250-gsc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Samsung Exynos SoC G-Scaler + +maintainers: + - Inki Dae + - Krzysztof Kozlowski + - Seung-Woo Kim + #include + + video-scaler@13e00000 { + compatible = "samsung,exynos5250-gsc", "samsung,exynos5-gsc"; + reg = <0x13e00000 0x1000>; + interrupts = ; + power-domains = <&pd_gsc>; + clocks = <&clock CLK_GSCL0>; + clock-names = "gscl"; + iommus = <&sysmmu_gsc0>; + }; -- GitLab From a25a64b50e325787828007123988e09b1d2d4303 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Sat, 30 Jul 2022 17:59:04 +0200 Subject: [PATCH 0544/2223] media: atomisp_gmin_platform: Switch to use acpi_evaluate_dsm_typed() The acpi_evaluate_dsm_typed() provides a way to check the type of the object evaluated by _DSM call. Use it instead of open coded variant. 
Link: https://lore.kernel.org/r/20220730155905.90091-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Sakari Ailus Tested-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/atomisp_gmin_platform.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_gmin_platform.c b/drivers/staging/media/atomisp/pci/atomisp_gmin_platform.c index bf527b366ab34..f7fc5137199cf 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_gmin_platform.c +++ b/drivers/staging/media/atomisp/pci/atomisp_gmin_platform.c @@ -1207,16 +1207,14 @@ static int gmin_get_config_dsm_var(struct device *dev, if (!strcmp(var, "CamClk")) return -EINVAL; - obj = acpi_evaluate_dsm(handle, &atomisp_dsm_guid, 0, 0, NULL); + /* Return on unexpected object type */ + obj = acpi_evaluate_dsm_typed(handle, &atomisp_dsm_guid, 0, 0, NULL, + ACPI_TYPE_PACKAGE); if (!obj) { dev_info_once(dev, "Didn't find ACPI _DSM table.\n"); return -EINVAL; } - /* Return on unexpected object type */ - if (obj->type != ACPI_TYPE_PACKAGE) - return -EINVAL; - #if 0 /* Just for debugging purposes */ for (i = 0; i < obj->package.count; i++) { union acpi_object *cur = &obj->package.elements[i]; -- GitLab From adea153b4f6537f367fe77abada263fde8a1f7b6 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 13 Aug 2022 10:12:39 +0200 Subject: [PATCH 0545/2223] media: atomisp-ov2680: Fix ov2680_set_fmt() On sets actually store the set (closest) format inside ov2680_device.dev, so that it also properly gets returned by get_fmt. This fixes the following problem: 1. App does an VIDIOC_SET_FMT 640x480, calling ov2680_set_fmt() 2. Internal buffers (atomisp_create_pipes_stream()) get allocated at 640x480 size by atomisp_set_fmt() 3. ov2680_get_fmt() gets called later on and returns 1600x1200 since ov2680_device.dev was not updated. 
So things get configured to stream at 1600x1200, but the internal buffers created during atomisp_create_pipes_stream() do not get updated in size 4. streaming starts, internal buffers overflow and the entire machine freezes eventually due to memory being corrupted Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/i2c/atomisp-ov2680.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/staging/media/atomisp/i2c/atomisp-ov2680.c b/drivers/staging/media/atomisp/i2c/atomisp-ov2680.c index 4ba99c6606814..ab52e35266bb1 100644 --- a/drivers/staging/media/atomisp/i2c/atomisp-ov2680.c +++ b/drivers/staging/media/atomisp/i2c/atomisp-ov2680.c @@ -894,11 +894,7 @@ static int ov2680_set_fmt(struct v4l2_subdev *sd, if (v_flag) ov2680_v_flip(sd, v_flag); - /* - * ret = startup(sd); - * if (ret) - * dev_err(&client->dev, "ov2680 startup err\n"); - */ + dev->res = res; err: mutex_unlock(&dev->input_lock); return ret; -- GitLab From 44a11920ac39fe7fe5191d72c0822ff2a4c3b83d Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 13 Aug 2022 10:30:34 +0200 Subject: [PATCH 0546/2223] media: atomisp-ov2680: Don't take the input_lock for try_fmt calls. On ov2680_set_fmt() calls with format->which == V4L2_SUBDEV_FORMAT_TRY, ov2680_set_fmt() does not talk to the sensor, so there is no need to lock the dev->input_lock mutex in this case. 
Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/i2c/atomisp-ov2680.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/staging/media/atomisp/i2c/atomisp-ov2680.c b/drivers/staging/media/atomisp/i2c/atomisp-ov2680.c index ab52e35266bb1..9ac469878eea9 100644 --- a/drivers/staging/media/atomisp/i2c/atomisp-ov2680.c +++ b/drivers/staging/media/atomisp/i2c/atomisp-ov2680.c @@ -841,8 +841,6 @@ static int ov2680_set_fmt(struct v4l2_subdev *sd, if (!ov2680_info) return -EINVAL; - mutex_lock(&dev->input_lock); - res = v4l2_find_nearest_size(ov2680_res_preview, ARRAY_SIZE(ov2680_res_preview), width, height, fmt->width, fmt->height); @@ -855,13 +853,14 @@ static int ov2680_set_fmt(struct v4l2_subdev *sd, fmt->code = MEDIA_BUS_FMT_SBGGR10_1X10; if (format->which == V4L2_SUBDEV_FORMAT_TRY) { sd_state->pads->try_fmt = *fmt; - mutex_unlock(&dev->input_lock); return 0; } dev_dbg(&client->dev, "%s: %dx%d\n", __func__, fmt->width, fmt->height); + mutex_lock(&dev->input_lock); + /* s_power has not been called yet for std v4l2 clients (camorama) */ power_up(sd); ret = ov2680_write_reg_array(client, dev->res->regs); -- GitLab From cbd5b438f8c2b5ed0af57869402320bc9891ccb5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 13 Aug 2022 11:03:23 +0200 Subject: [PATCH 0547/2223] media: atomisp-ov2680: Improve ov2680_set_fmt() error handling Exit with an error on any i2c-write errors, rather than only exiting with an error when ov2680_get_intg_factor() fails.
Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/i2c/atomisp-ov2680.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/staging/media/atomisp/i2c/atomisp-ov2680.c b/drivers/staging/media/atomisp/i2c/atomisp-ov2680.c index 9ac469878eea9..5ba4c52a06a2d 100644 --- a/drivers/staging/media/atomisp/i2c/atomisp-ov2680.c +++ b/drivers/staging/media/atomisp/i2c/atomisp-ov2680.c @@ -864,9 +864,11 @@ static int ov2680_set_fmt(struct v4l2_subdev *sd, /* s_power has not been called yet for std v4l2 clients (camorama) */ power_up(sd); ret = ov2680_write_reg_array(client, dev->res->regs); - if (ret) + if (ret) { dev_err(&client->dev, "ov2680 write resolution register err: %d\n", ret); + goto err; + } vts = dev->res->lines_per_frame; @@ -875,8 +877,10 @@ static int ov2680_set_fmt(struct v4l2_subdev *sd, vts = dev->exposure + OV2680_INTEGRATION_TIME_MARGIN; ret = ov2680_write_reg(client, 2, OV2680_TIMING_VTS_H, vts); - if (ret) + if (ret) { dev_err(&client->dev, "ov2680 write vts err: %d\n", ret); + goto err; + } ret = ov2680_get_intg_factor(client, ov2680_info, res); if (ret) { -- GitLab From e0565e23796e8260613b7aa81606cb42dcdcf68c Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 13 Aug 2022 11:01:13 +0200 Subject: [PATCH 0548/2223] media: atomisp-notes: Add info about sensors v4l2_get_subdev_hostdata() use Add info about sensors v4l2_get_subdev_hostdata() use, to notes.txt. 
Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/notes.txt | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/staging/media/atomisp/notes.txt b/drivers/staging/media/atomisp/notes.txt index d128b792e05fc..d3cf6ed547ae0 100644 --- a/drivers/staging/media/atomisp/notes.txt +++ b/drivers/staging/media/atomisp/notes.txt @@ -28,3 +28,22 @@ Since getting a picture requires multiple processing steps, this means that unlike in fixed pipelines the soft pipelines on the ISP can do multiple processing steps in a single pipeline element (in a single binary). + +### + +The sensor drivers make use of v4l2_get_subdev_hostdata(), which returns +a camera_mipi_info struct. This struct is allocated/managed by +the core atomisp code. The most important parts of the struct +are filled by the atomisp core itself, like e.g. the port number. + +The sensor drivers on a set_fmt call do fill in camera_mipi_info.data +which is an atomisp_sensor_mode_data struct. This gets filled from +a function called <sensor>_get_intg_factor(). This struct is not +used by the atomisp code at all. It is returned to userspace by +an ATOMISP_IOC_G_SENSOR_MODE_DATA call and the Android userspace does use this. + +Other members of camera_mipi_info which are set by some drivers are: +-metadata_width, metadata_height, metadata_effective_width, set by + the ov5693 driver (and used by the atomisp core) +-raw_bayer_order, adjusted by the ov2680 driver when flipping since + flipping can change the bayer order -- GitLab From 4d3aafb9c9bba59c9b6f6df8ea6c89483bfed8d4 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 13 Aug 2022 16:53:00 +0200 Subject: [PATCH 0549/2223] media: atomisp: Fix VIDIOC_TRY_FMT atomisp_try_fmt() calls the sensor's try_fmt handler but it does not copy the result back to the passed in v4l2_pix_format under some circumstances.
Potentially returning an unsupported resolution to userspace, which VIDIOC_TRY_FMT is not supposed to do. atomisp_set_fmt() also uses atomisp_try_fmt() and relies on this wrong behavior. The VIDIOC_TRY_FMT call passes NULL for the res_overflow argument where as the atomisp_set_fmt() call passes non NULL. Use the res_overflow argument to differentiate between the 2 callers and always propagate the sensors result in the VIDIOC_TRY_FMT case. This fixes the resolution list in camorama showing resolutions like e.g. 1584x1184 instead of 1600x1200. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/atomisp_cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.c b/drivers/staging/media/atomisp/pci/atomisp_cmd.c index c932f340068f1..db6465756e497 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.c +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.c @@ -4886,8 +4886,8 @@ int atomisp_try_fmt(struct video_device *vdev, struct v4l2_pix_format *f, return 0; } - if (snr_mbus_fmt->width < f->width - && snr_mbus_fmt->height < f->height) { + if (!res_overflow || (snr_mbus_fmt->width < f->width && + snr_mbus_fmt->height < f->height)) { f->width = snr_mbus_fmt->width; f->height = snr_mbus_fmt->height; /* Set the flag when resolution requested is -- GitLab From e0ae3048b3db5a380a7196f58d5eeebb6770bad2 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 13 Aug 2022 17:08:44 +0200 Subject: [PATCH 0550/2223] media: atomisp: Make atomisp_try_fmt_cap() take padding into account atomisp_try_fmt() gives results with padding included. So when userspace asks for e.g. 1600x1200 then we should pass 1616x1216 to atomisp_try_fmt() this will then get adjusted back to 1600x1200 before returning it to userspace by the atomisp_adjust_fmt() call at the end of atomisp_try_fmt(). 
This fixes the resolution list in camorama showing resolutions like e.g. 1584x1184 instead of 1600x1200. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/atomisp_ioctl.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index 459645c2e2a73..7ecee39ef5a47 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -960,6 +960,13 @@ static int atomisp_try_fmt_cap(struct file *file, void *fh, struct atomisp_device *isp = video_get_drvdata(vdev); int ret; + /* + * atomisp_try_fmt() gives results with padding included, note + * (this gets removed again by the atomisp_adjust_fmt() call below). + */ + f->fmt.pix.width += pad_w; + f->fmt.pix.height += pad_h; + rt_mutex_lock(&isp->mutex); ret = atomisp_try_fmt(vdev, &f->fmt.pix, NULL); rt_mutex_unlock(&isp->mutex); -- GitLab From 8519635cb292ee4e804e3f465a54b13447180366 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 13 Aug 2022 17:38:45 +0200 Subject: [PATCH 0551/2223] media: atomisp: hmm_bo: Simplify alloc_private_pages() Since lack_mem starts initialized to true, alloc_private_pages() will always set order to HMM_MIN_ORDER aka 0 / will always alloc 1 page at a time. So all the magic to decrease order if allocs fail is not necessary and can be removed.
Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../media/atomisp/include/hmm/hmm_bo.h | 3 - .../staging/media/atomisp/pci/hmm/hmm_bo.c | 83 +++---------------- 2 files changed, 10 insertions(+), 76 deletions(-) diff --git a/drivers/staging/media/atomisp/include/hmm/hmm_bo.h b/drivers/staging/media/atomisp/include/hmm/hmm_bo.h index 385e22fc4a46a..901dc37c80bcf 100644 --- a/drivers/staging/media/atomisp/include/hmm/hmm_bo.h +++ b/drivers/staging/media/atomisp/include/hmm/hmm_bo.h @@ -65,9 +65,6 @@ #define check_bo_null_return_void(bo) \ check_null_return_void(bo, "NULL hmm buffer object.\n") -#define HMM_MAX_ORDER 3 -#define HMM_MIN_ORDER 0 - #define ISP_VM_START 0x0 #define ISP_VM_SIZE (0x7FFFFFFF) /* 2G address space */ #define ISP_PTR_NULL NULL diff --git a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c index f50494123f039..2753142412637 100644 --- a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c +++ b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c @@ -44,16 +44,6 @@ #include "hmm/hmm_common.h" #include "hmm/hmm_bo.h" -static unsigned int order_to_nr(unsigned int order) -{ - return 1U << order; -} - -static unsigned int nr_to_order_bottom(unsigned int nr) -{ - return fls(nr) - 1; -} - static int __bo_init(struct hmm_bo_device *bdev, struct hmm_buffer_object *bo, unsigned int pgnr) { @@ -653,13 +643,10 @@ static void free_private_bo_pages(struct hmm_buffer_object *bo, static int alloc_private_pages(struct hmm_buffer_object *bo) { int ret; - unsigned int pgnr, order, blk_pgnr, alloc_pgnr; + unsigned int pgnr, blk_pgnr, alloc_pgnr; struct page *pages; gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN; /* REVISIT: need __GFP_FS too? 
*/ int i, j; - int failure_number = 0; - bool reduce_order = false; - bool lack_mem = true; pgnr = bo->pgnr; @@ -667,58 +654,17 @@ static int alloc_private_pages(struct hmm_buffer_object *bo) alloc_pgnr = 0; while (pgnr) { - order = nr_to_order_bottom(pgnr); - /* - * if be short of memory, we will set order to 0 - * everytime. - */ - if (lack_mem) - order = HMM_MIN_ORDER; - else if (order > HMM_MAX_ORDER) - order = HMM_MAX_ORDER; -retry: - /* - * When order > HMM_MIN_ORDER, for performance reasons we don't - * want alloc_pages() to sleep. In case it fails and fallbacks - * to HMM_MIN_ORDER or in case the requested order is originally - * the minimum value, we can allow alloc_pages() to sleep for - * robustness purpose. - * - * REVISIT: why __GFP_FS is necessary? - */ - if (order == HMM_MIN_ORDER) { - gfp &= ~GFP_NOWAIT; - gfp |= __GFP_RECLAIM | __GFP_FS; - } + gfp &= ~GFP_NOWAIT; + gfp |= __GFP_RECLAIM | __GFP_FS; - pages = alloc_pages(gfp, order); + pages = alloc_pages(gfp, 0); // alloc 1 page if (unlikely(!pages)) { - /* - * in low memory case, if allocation page fails, - * we turn to try if order=0 allocation could - * succeed. if order=0 fails too, that means there is - * no memory left. - */ - if (order == HMM_MIN_ORDER) { - dev_err(atomisp_dev, - "%s: cannot allocate pages\n", - __func__); - goto cleanup; - } - order = HMM_MIN_ORDER; - failure_number++; - reduce_order = true; - /* - * if fail two times continuously, we think be short - * of memory now. 
- */ - if (failure_number == 2) { - lack_mem = true; - failure_number = 0; - } - goto retry; + dev_err(atomisp_dev, + "%s: cannot allocate pages\n", + __func__); + goto cleanup; } else { - blk_pgnr = order_to_nr(order); + blk_pgnr = 1; /* * set memory to uncacheable -- UC_MINUS @@ -728,7 +674,7 @@ retry: dev_err(atomisp_dev, "set page uncacheablefailed.\n"); - __free_pages(pages, order); + __free_pages(pages, 0); goto cleanup; } @@ -738,15 +684,6 @@ retry: } pgnr -= blk_pgnr; - - /* - * if order is not reduced this time, clear - * failure_number. - */ - if (reduce_order) - reduce_order = false; - else - failure_number = 0; } } -- GitLab From fce48bf10141953e55bd9ffb34de22dde7fdca03 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 13 Aug 2022 17:47:37 +0200 Subject: [PATCH 0552/2223] media: atomisp: hmm_bo: Further simplify alloc_private_pages() Further simplify alloc_private_pages(). Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/hmm/hmm_bo.c | 29 ++++--------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c index 2753142412637..bb52171a9d870 100644 --- a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c +++ b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c @@ -642,21 +642,11 @@ static void free_private_bo_pages(struct hmm_buffer_object *bo, /*Allocate pages which will be used only by ISP*/ static int alloc_private_pages(struct hmm_buffer_object *bo) { - int ret; - unsigned int pgnr, blk_pgnr, alloc_pgnr; + const gfp_t gfp = __GFP_NOWARN | __GFP_RECLAIM | __GFP_FS; struct page *pages; - gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN; /* REVISIT: need __GFP_FS too? 
*/ - int i, j; - - pgnr = bo->pgnr; - - i = 0; - alloc_pgnr = 0; - - while (pgnr) { - gfp &= ~GFP_NOWAIT; - gfp |= __GFP_RECLAIM | __GFP_FS; + int i, ret; + for (i = 0; i < bo->pgnr; i++) { pages = alloc_pages(gfp, 0); // alloc 1 page if (unlikely(!pages)) { dev_err(atomisp_dev, @@ -664,12 +654,10 @@ static int alloc_private_pages(struct hmm_buffer_object *bo) __func__); goto cleanup; } else { - blk_pgnr = 1; - /* * set memory to uncacheable -- UC_MINUS */ - ret = set_pages_uc(pages, blk_pgnr); + ret = set_pages_uc(pages, 1); if (ret) { dev_err(atomisp_dev, "set page uncacheablefailed.\n"); @@ -679,18 +667,13 @@ static int alloc_private_pages(struct hmm_buffer_object *bo) goto cleanup; } - for (j = 0; j < blk_pgnr; j++, i++) { - bo->pages[i] = pages + j; - } - - pgnr -= blk_pgnr; + bo->pages[i] = pages; } } return 0; cleanup: - alloc_pgnr = i; - free_private_bo_pages(bo, alloc_pgnr); + free_private_bo_pages(bo, i); return -ENOMEM; } -- GitLab From 2691ecc089ca934dd6d4c03a4b410a81a0d87351 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 13 Aug 2022 18:00:38 +0200 Subject: [PATCH 0553/2223] media: atomisp: hmm_bo: Rewrite alloc_private_pages() using pages_array helper funcs Rewrite alloc_private_pages() using pages_array helper funcs. Note alloc_pages_bulk_array() skips non NULL pages, so switch the allocating of the pages pointer array to kcalloc to ensure the pages are initially all set to NULL. 
Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/hmm/hmm_bo.c | 48 ++++++++----------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c index bb52171a9d870..40b1137dcc313 100644 --- a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c +++ b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c @@ -615,6 +615,14 @@ found: return bo; } +static void free_pages_bulk_array(unsigned long nr_pages, struct page **page_array) +{ + unsigned long i; + + for (i = 0; i < nr_pages; i++) + __free_pages(page_array[i], 0); +} + static void free_private_bo_pages(struct hmm_buffer_object *bo, int free_pgnr) { @@ -643,38 +651,22 @@ static void free_private_bo_pages(struct hmm_buffer_object *bo, static int alloc_private_pages(struct hmm_buffer_object *bo) { const gfp_t gfp = __GFP_NOWARN | __GFP_RECLAIM | __GFP_FS; - struct page *pages; - int i, ret; - - for (i = 0; i < bo->pgnr; i++) { - pages = alloc_pages(gfp, 0); // alloc 1 page - if (unlikely(!pages)) { - dev_err(atomisp_dev, - "%s: cannot allocate pages\n", - __func__); - goto cleanup; - } else { - /* - * set memory to uncacheable -- UC_MINUS - */ - ret = set_pages_uc(pages, 1); - if (ret) { - dev_err(atomisp_dev, - "set page uncacheablefailed.\n"); - - __free_pages(pages, 0); + int ret; - goto cleanup; - } + ret = alloc_pages_bulk_array(gfp, bo->pgnr, bo->pages); + if (ret != bo->pgnr) { + free_pages_bulk_array(ret, bo->pages); + return -ENOMEM; + } - bo->pages[i] = pages; - } + ret = set_pages_array_uc(bo->pages, bo->pgnr); + if (ret) { + dev_err(atomisp_dev, "set pages uncacheable failed.\n"); + free_pages_bulk_array(bo->pgnr, bo->pages); + return ret; } return 0; -cleanup: - free_private_bo_pages(bo, i); - return -ENOMEM; } static void free_user_pages(struct hmm_buffer_object *bo, @@ -774,7 +766,7 @@ int hmm_bo_alloc_pages(struct hmm_buffer_object 
*bo, mutex_lock(&bo->mutex); check_bo_status_no_goto(bo, HMM_BO_PAGE_ALLOCED, status_err); - bo->pages = kmalloc_array(bo->pgnr, sizeof(struct page *), GFP_KERNEL); + bo->pages = kcalloc(bo->pgnr, sizeof(struct page *), GFP_KERNEL); if (unlikely(!bo->pages)) { ret = -ENOMEM; goto alloc_err; -- GitLab From 3df52e584ed13a4451930d2d712580e3f2e63b6a Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 13 Aug 2022 18:06:10 +0200 Subject: [PATCH 0554/2223] media: atomisp: hmm_bo: Rewrite free_private_pages() using pages_array helper funcs Rewrite free_private_pages() using pages_array helper funcs. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/hmm/hmm_bo.c | 26 +++---------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c index 40b1137dcc313..d7f42a4ce40a3 100644 --- a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c +++ b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c @@ -623,28 +623,10 @@ static void free_pages_bulk_array(unsigned long nr_pages, struct page **page_arr __free_pages(page_array[i], 0); } -static void free_private_bo_pages(struct hmm_buffer_object *bo, - int free_pgnr) +static void free_private_bo_pages(struct hmm_buffer_object *bo) { - int i, ret; - - for (i = 0; i < free_pgnr; i++) { - ret = set_pages_wb(bo->pages[i], 1); - if (ret) - dev_err(atomisp_dev, - "set page to WB err ...ret = %d\n", - ret); - /* - W/A: set_pages_wb seldom return value = -EFAULT - indicate that address of page is not in valid - range(0xffff880000000000~0xffffc7ffffffffff) - then, _free_pages would panic; Do not know why page - address be valid,it maybe memory corruption by lowmemory - */ - if (!ret) { - __free_pages(bo->pages[i], 0); - } - } + set_pages_array_wb(bo->pages, bo->pgnr); + free_pages_bulk_array(bo->pgnr, bo->pages); } /*Allocate pages which will be used only by ISP*/ @@ 
-822,7 +804,7 @@ void hmm_bo_free_pages(struct hmm_buffer_object *bo) bo->status &= (~HMM_BO_PAGE_ALLOCED); if (bo->type == HMM_BO_PRIVATE) - free_private_bo_pages(bo, bo->pgnr); + free_private_bo_pages(bo); else if (bo->type == HMM_BO_USER) free_user_pages(bo, bo->pgnr); else -- GitLab From 30cf7e90f0ea7adb1f125a286f2273f9faa162fe Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 21 Aug 2022 20:43:57 +0200 Subject: [PATCH 0555/2223] media: atomisp: hmm_bo: Drop PFN code path from alloc_user_pages() alloc_user_pages() is only ever called on qbuf for USERPTR buffers which always hits the get_user_pages_fast() path, so the pin_user_pages() path can be removed. Getting the vma then also is no longer necessary since that is only done to determine which path to use. And this also removes the only users of the mem_type struct hmm_bo member, so remove that as well. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../media/atomisp/include/hmm/hmm_bo.h | 3 -- .../staging/media/atomisp/pci/hmm/hmm_bo.c | 46 +++---------------- 2 files changed, 6 insertions(+), 43 deletions(-) diff --git a/drivers/staging/media/atomisp/include/hmm/hmm_bo.h b/drivers/staging/media/atomisp/include/hmm/hmm_bo.h index 901dc37c80bcf..c5cbae1d9cf9c 100644 --- a/drivers/staging/media/atomisp/include/hmm/hmm_bo.h +++ b/drivers/staging/media/atomisp/include/hmm/hmm_bo.h @@ -86,8 +86,6 @@ enum hmm_bo_type { #define HMM_BO_VMAPED 0x10 #define HMM_BO_VMAPED_CACHED 0x20 #define HMM_BO_ACTIVE 0x1000 -#define HMM_BO_MEM_TYPE_USER 0x1 -#define HMM_BO_MEM_TYPE_PFN 0x2 struct hmm_bo_device { struct isp_mmu mmu; @@ -123,7 +121,6 @@ struct hmm_buffer_object { enum hmm_bo_type type; int mmap_count; int status; - int mem_type; void *vmap_addr; /* kernel virtual address by vmap */ struct rb_node node; diff --git a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c index d7f42a4ce40a3..a5fd6d38d3c41 100644 --- 
a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c +++ b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c @@ -656,12 +656,8 @@ static void free_user_pages(struct hmm_buffer_object *bo, { int i; - if (bo->mem_type == HMM_BO_MEM_TYPE_PFN) { - unpin_user_pages(bo->pages, page_nr); - } else { - for (i = 0; i < page_nr; i++) - put_page(bo->pages[i]); - } + for (i = 0; i < page_nr; i++) + put_page(bo->pages[i]); } /* @@ -671,43 +667,13 @@ static int alloc_user_pages(struct hmm_buffer_object *bo, const void __user *userptr) { int page_nr; - struct vm_area_struct *vma; - - mutex_unlock(&bo->mutex); - mmap_read_lock(current->mm); - vma = find_vma(current->mm, (unsigned long)userptr); - mmap_read_unlock(current->mm); - if (!vma) { - dev_err(atomisp_dev, "find_vma failed\n"); - mutex_lock(&bo->mutex); - return -EFAULT; - } - mutex_lock(&bo->mutex); - /* - * Handle frame buffer allocated in other kerenl space driver - * and map to user space - */ userptr = untagged_addr(userptr); - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { - page_nr = pin_user_pages((unsigned long)userptr, bo->pgnr, - FOLL_LONGTERM | FOLL_WRITE, - bo->pages, NULL); - bo->mem_type = HMM_BO_MEM_TYPE_PFN; - } else { - /*Handle frame buffer allocated in user space*/ - mutex_unlock(&bo->mutex); - page_nr = get_user_pages_fast((unsigned long)userptr, - (int)(bo->pgnr), 1, bo->pages); - mutex_lock(&bo->mutex); - bo->mem_type = HMM_BO_MEM_TYPE_USER; - } - - dev_dbg(atomisp_dev, "%s: %d %s pages were allocated as 0x%08x\n", - __func__, - bo->pgnr, - bo->mem_type == HMM_BO_MEM_TYPE_USER ? 
"user" : "pfn", page_nr); + /* Handle frame buffer allocated in user space */ + mutex_unlock(&bo->mutex); + page_nr = get_user_pages_fast((unsigned long)userptr, bo->pgnr, 1, bo->pages); + mutex_lock(&bo->mutex); /* can be written by caller, not forced */ if (page_nr != bo->pgnr) { -- GitLab From 6e6c4ae0f0ba295dbf6cbd48d93bec169d6ce431 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 21 Aug 2022 20:29:06 +0200 Subject: [PATCH 0556/2223] media: atomisp: Ensure that USERPTR pointers are page aligned The atomisp code needs USERPTR pointers to be page aligned, otherwise bad things (scribbling over other parts of the process' RAM) happen. Add a check to ensure this and exit VIDIOC_QBUF calls with unaligned pointers with -EINVAL. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/atomisp_ioctl.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index 7ecee39ef5a47..d0b5dacbb20aa 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -1345,6 +1345,12 @@ static int atomisp_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) * address and reprograme out page table properly */ if (buf->memory == V4L2_MEMORY_USERPTR) { + if (offset_in_page(buf->m.userptr)) { + dev_err(isp->dev, "Error userptr is not page aligned.\n"); + ret = -EINVAL; + goto error; + } + vb = pipe->capq.bufs[buf->index]; vm_mem = vb->priv; if (!vm_mem) { -- GitLab From a2ace25c3f0e8904abc2aadee554cc20c6c3bf6b Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 27 Aug 2022 15:54:27 +0200 Subject: [PATCH 0557/2223] media: atomisp: Fix device_caps reporting of the registered video-devs atomisp_subdev_register_entities() had V4L2_CAP_VIDEO_CAPTURE / V4L2_CAP_VIDEO_OUT swapped. 
Or-ing in V4L2_CAP_VIDEO_OUT for the nodes which allow capturing from the camera and or-ing in V4L2_CAP_VIDEO_CAPTURE for the file-injection node (mem2mem use of the ISP). Things happen to still work for the capture device-nodes because the "shared" caps also included V4L2_CAP_VIDEO_CAPTURE, so those shared nodes advertised V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUT. Fix things so that only the correct caps are advertised. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../media/atomisp/pci/atomisp_subdev.c | 24 ++++++------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.c b/drivers/staging/media/atomisp/pci/atomisp_subdev.c index 394fe69590333..6d533919d466f 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.c +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.c @@ -1314,16 +1314,12 @@ int atomisp_subdev_register_entities(struct atomisp_sub_device *asd, struct v4l2_device *vdev) { int ret; - u32 device_caps; /* * FIXME: check if all device caps are properly initialized. - * Should any of those use V4L2_CAP_META_OUTPUT? Probably yes. + * Should any of those use V4L2_CAP_META_CAPTURE? Probably yes. */ - device_caps = V4L2_CAP_VIDEO_CAPTURE | - V4L2_CAP_STREAMING; - /* Register the subdev and video node. 
*/ ret = v4l2_device_register_subdev(vdev, &asd->subdev); @@ -1331,39 +1327,34 @@ int atomisp_subdev_register_entities(struct atomisp_sub_device *asd, goto error; asd->video_out_preview.vdev.v4l2_dev = vdev; - asd->video_out_preview.vdev.device_caps = device_caps | - V4L2_CAP_VIDEO_OUTPUT; + asd->video_out_preview.vdev.device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_STREAMING; ret = video_register_device(&asd->video_out_preview.vdev, VFL_TYPE_VIDEO, -1); if (ret < 0) goto error; asd->video_out_capture.vdev.v4l2_dev = vdev; - asd->video_out_capture.vdev.device_caps = device_caps | - V4L2_CAP_VIDEO_OUTPUT; + asd->video_out_capture.vdev.device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_STREAMING; ret = video_register_device(&asd->video_out_capture.vdev, VFL_TYPE_VIDEO, -1); if (ret < 0) goto error; asd->video_out_vf.vdev.v4l2_dev = vdev; - asd->video_out_vf.vdev.device_caps = device_caps | - V4L2_CAP_VIDEO_OUTPUT; + asd->video_out_vf.vdev.device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_STREAMING; ret = video_register_device(&asd->video_out_vf.vdev, VFL_TYPE_VIDEO, -1); if (ret < 0) goto error; asd->video_out_video_capture.vdev.v4l2_dev = vdev; - asd->video_out_video_capture.vdev.device_caps = device_caps | - V4L2_CAP_VIDEO_OUTPUT; + asd->video_out_video_capture.vdev.device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_STREAMING; ret = video_register_device(&asd->video_out_video_capture.vdev, VFL_TYPE_VIDEO, -1); if (ret < 0) goto error; asd->video_acc.vdev.v4l2_dev = vdev; - asd->video_acc.vdev.device_caps = device_caps | - V4L2_CAP_VIDEO_OUTPUT; + asd->video_acc.vdev.device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_STREAMING; ret = video_register_device(&asd->video_acc.vdev, VFL_TYPE_VIDEO, -1); if (ret < 0) @@ -1377,8 +1368,7 @@ int atomisp_subdev_register_entities(struct atomisp_sub_device *asd, return 0; asd->video_in.vdev.v4l2_dev = vdev; - asd->video_in.vdev.device_caps = device_caps | - V4L2_CAP_VIDEO_CAPTURE; + asd->video_in.vdev.device_caps = V4L2_CAP_VIDEO_OUT | 
V4L2_CAP_STREAMING; ret = video_register_device(&asd->video_in.vdev, VFL_TYPE_VIDEO, -1); if (ret < 0) -- GitLab From 29b12ac7609c0c8f0bc4a6448d984b80c8957e99 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 27 Aug 2022 16:17:07 +0200 Subject: [PATCH 0558/2223] media: atomisp: Remove file-injection support The file-injection support of the atomisp driver has not been tested and is not necessary for camera support, remove it. Note the main reason for removing this is because it depends on the videobuf (version 1) outq and we want to remove or replace all videobuf usage in the driver. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/Makefile | 1 - .../staging/media/atomisp/pci/atomisp_file.c | 229 ------------------ .../staging/media/atomisp/pci/atomisp_file.h | 44 ---- .../staging/media/atomisp/pci/atomisp_fops.c | 3 +- .../media/atomisp/pci/atomisp_internal.h | 4 +- .../staging/media/atomisp/pci/atomisp_ioctl.c | 3 +- .../media/atomisp/pci/atomisp_subdev.c | 40 +-- .../media/atomisp/pci/atomisp_subdev.h | 1 - .../staging/media/atomisp/pci/atomisp_v4l2.c | 30 --- 9 files changed, 8 insertions(+), 347 deletions(-) delete mode 100644 drivers/staging/media/atomisp/pci/atomisp_file.c delete mode 100644 drivers/staging/media/atomisp/pci/atomisp_file.h diff --git a/drivers/staging/media/atomisp/Makefile b/drivers/staging/media/atomisp/Makefile index fb7b406f50bfb..532e12ed72e6e 100644 --- a/drivers/staging/media/atomisp/Makefile +++ b/drivers/staging/media/atomisp/Makefile @@ -17,7 +17,6 @@ atomisp-objs += \ pci/atomisp_compat_css20.o \ pci/atomisp_csi2.o \ pci/atomisp_drvfs.o \ - pci/atomisp_file.o \ pci/atomisp_fops.o \ pci/atomisp_ioctl.o \ pci/atomisp_subdev.o \ diff --git a/drivers/staging/media/atomisp/pci/atomisp_file.c b/drivers/staging/media/atomisp/pci/atomisp_file.c deleted file mode 100644 index 4570a9ab100b7..0000000000000 --- 
a/drivers/staging/media/atomisp/pci/atomisp_file.c +++ /dev/null @@ -1,229 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Support for Medifield PNW Camera Imaging ISP subsystem. - * - * Copyright (c) 2010 Intel Corporation. All Rights Reserved. - * - * Copyright (c) 2010 Silicon Hive www.siliconhive.com. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - */ - -#include -#include - -#include -#include - -#include "ia_css.h" - -#include "atomisp_cmd.h" -#include "atomisp_common.h" -#include "atomisp_file.h" -#include "atomisp_internal.h" -#include "atomisp_ioctl.h" - -static void file_work(struct work_struct *work) -{ - struct atomisp_file_device *file_dev = - container_of(work, struct atomisp_file_device, work); - struct atomisp_device *isp = file_dev->isp; - /* only support file injection on subdev0 */ - struct atomisp_sub_device *asd = &isp->asd[0]; - struct atomisp_video_pipe *out_pipe = &asd->video_in; - unsigned short *buf = videobuf_to_vmalloc(out_pipe->outq.bufs[0]); - struct v4l2_mbus_framefmt isp_sink_fmt; - - if (asd->streaming != ATOMISP_DEVICE_STREAMING_ENABLED) - return; - - dev_dbg(isp->dev, ">%s: ready to start streaming\n", __func__); - isp_sink_fmt = *atomisp_subdev_get_ffmt(&asd->subdev, NULL, - V4L2_SUBDEV_FORMAT_ACTIVE, - ATOMISP_SUBDEV_PAD_SINK); - - while (!ia_css_isp_has_started()) - usleep_range(1000, 1500); - - ia_css_stream_send_input_frame(asd->stream_env[ATOMISP_INPUT_STREAM_GENERAL].stream, - buf, isp_sink_fmt.width, - isp_sink_fmt.height); - dev_dbg(isp->dev, "<%s: streaming done\n", __func__); -} - -static int 
file_input_s_stream(struct v4l2_subdev *sd, int enable) -{ - struct atomisp_file_device *file_dev = v4l2_get_subdevdata(sd); - struct atomisp_device *isp = file_dev->isp; - /* only support file injection on subdev0 */ - struct atomisp_sub_device *asd = &isp->asd[0]; - - dev_dbg(isp->dev, "%s: enable %d\n", __func__, enable); - if (enable) { - if (asd->streaming != ATOMISP_DEVICE_STREAMING_ENABLED) - return 0; - - queue_work(file_dev->work_queue, &file_dev->work); - return 0; - } - cancel_work_sync(&file_dev->work); - return 0; -} - -static int file_input_get_fmt(struct v4l2_subdev *sd, - struct v4l2_subdev_state *sd_state, - struct v4l2_subdev_format *format) -{ - struct v4l2_mbus_framefmt *fmt = &format->format; - struct atomisp_file_device *file_dev = v4l2_get_subdevdata(sd); - struct atomisp_device *isp = file_dev->isp; - /* only support file injection on subdev0 */ - struct atomisp_sub_device *asd = &isp->asd[0]; - struct v4l2_mbus_framefmt *isp_sink_fmt; - - if (format->pad) - return -EINVAL; - isp_sink_fmt = atomisp_subdev_get_ffmt(&asd->subdev, NULL, - V4L2_SUBDEV_FORMAT_ACTIVE, - ATOMISP_SUBDEV_PAD_SINK); - - fmt->width = isp_sink_fmt->width; - fmt->height = isp_sink_fmt->height; - fmt->code = isp_sink_fmt->code; - - return 0; -} - -static int file_input_set_fmt(struct v4l2_subdev *sd, - struct v4l2_subdev_state *sd_state, - struct v4l2_subdev_format *format) -{ - struct v4l2_mbus_framefmt *fmt = &format->format; - - if (format->pad) - return -EINVAL; - file_input_get_fmt(sd, sd_state, format); - if (format->which == V4L2_SUBDEV_FORMAT_TRY) - sd_state->pads->try_fmt = *fmt; - return 0; -} - -static int file_input_log_status(struct v4l2_subdev *sd) -{ - /*to fake*/ - return 0; -} - -static int file_input_s_power(struct v4l2_subdev *sd, int on) -{ - /* to fake */ - return 0; -} - -static int file_input_enum_mbus_code(struct v4l2_subdev *sd, - struct v4l2_subdev_state *sd_state, - struct v4l2_subdev_mbus_code_enum *code) -{ - /*to fake*/ - return 0; -} - 
-static int file_input_enum_frame_size(struct v4l2_subdev *sd, - struct v4l2_subdev_state *sd_state, - struct v4l2_subdev_frame_size_enum *fse) -{ - /*to fake*/ - return 0; -} - -static int file_input_enum_frame_ival(struct v4l2_subdev *sd, - struct v4l2_subdev_state *sd_state, - struct v4l2_subdev_frame_interval_enum - *fie) -{ - /*to fake*/ - return 0; -} - -static const struct v4l2_subdev_video_ops file_input_video_ops = { - .s_stream = file_input_s_stream, -}; - -static const struct v4l2_subdev_core_ops file_input_core_ops = { - .log_status = file_input_log_status, - .s_power = file_input_s_power, -}; - -static const struct v4l2_subdev_pad_ops file_input_pad_ops = { - .enum_mbus_code = file_input_enum_mbus_code, - .enum_frame_size = file_input_enum_frame_size, - .enum_frame_interval = file_input_enum_frame_ival, - .get_fmt = file_input_get_fmt, - .set_fmt = file_input_set_fmt, -}; - -static const struct v4l2_subdev_ops file_input_ops = { - .core = &file_input_core_ops, - .video = &file_input_video_ops, - .pad = &file_input_pad_ops, -}; - -void -atomisp_file_input_unregister_entities(struct atomisp_file_device *file_dev) -{ - media_entity_cleanup(&file_dev->sd.entity); - v4l2_device_unregister_subdev(&file_dev->sd); -} - -int atomisp_file_input_register_entities(struct atomisp_file_device *file_dev, - struct v4l2_device *vdev) -{ - /* Register the subdev and video nodes. 
*/ - return v4l2_device_register_subdev(vdev, &file_dev->sd); -} - -void atomisp_file_input_cleanup(struct atomisp_device *isp) -{ - struct atomisp_file_device *file_dev = &isp->file_dev; - - if (file_dev->work_queue) { - destroy_workqueue(file_dev->work_queue); - file_dev->work_queue = NULL; - } -} - -int atomisp_file_input_init(struct atomisp_device *isp) -{ - struct atomisp_file_device *file_dev = &isp->file_dev; - struct v4l2_subdev *sd = &file_dev->sd; - struct media_pad *pads = file_dev->pads; - struct media_entity *me = &sd->entity; - - file_dev->isp = isp; - file_dev->work_queue = alloc_workqueue(isp->v4l2_dev.name, 0, 1); - if (!file_dev->work_queue) { - dev_err(isp->dev, "Failed to initialize file inject workq\n"); - return -ENOMEM; - } - - INIT_WORK(&file_dev->work, file_work); - - v4l2_subdev_init(sd, &file_input_ops); - sd->flags |= V4L2_SUBDEV_FL_HAS_DEVNODE; - strscpy(sd->name, "file_input_subdev", sizeof(sd->name)); - v4l2_set_subdevdata(sd, file_dev); - - pads[0].flags = MEDIA_PAD_FL_SINK; - me->function = MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN; - - return media_entity_pads_init(me, 1, pads); -} diff --git a/drivers/staging/media/atomisp/pci/atomisp_file.h b/drivers/staging/media/atomisp/pci/atomisp_file.h deleted file mode 100644 index f166a2aefff11..0000000000000 --- a/drivers/staging/media/atomisp/pci/atomisp_file.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Support for Medifield PNW Camera Imaging ISP subsystem. - * - * Copyright (c) 2010 Intel Corporation. All Rights Reserved. - * - * Copyright (c) 2010 Silicon Hive www.siliconhive.com. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - */ - -#ifndef __ATOMISP_FILE_H__ -#define __ATOMISP_FILE_H__ - -#include -#include - -struct atomisp_device; - -struct atomisp_file_device { - struct v4l2_subdev sd; - struct atomisp_device *isp; - struct media_pad pads[1]; - - struct workqueue_struct *work_queue; - struct work_struct work; -}; - -void atomisp_file_input_cleanup(struct atomisp_device *isp); -int atomisp_file_input_init(struct atomisp_device *isp); -void atomisp_file_input_unregister_entities( - struct atomisp_file_device *file_dev); -int atomisp_file_input_register_entities(struct atomisp_file_device *file_dev, - struct v4l2_device *vdev); -#endif /* __ATOMISP_FILE_H__ */ diff --git a/drivers/staging/media/atomisp/pci/atomisp_fops.c b/drivers/staging/media/atomisp/pci/atomisp_fops.c index 77150e4ae1447..9ff0bcc043072 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_fops.c +++ b/drivers/staging/media/atomisp/pci/atomisp_fops.c @@ -742,8 +742,7 @@ static unsigned int atomisp_subdev_users(struct atomisp_sub_device *asd) asd->video_out_vf.users + asd->video_out_capture.users + asd->video_out_video_capture.users + - asd->video_acc.users + - asd->video_in.users; + asd->video_acc.users; } unsigned int atomisp_dev_users(struct atomisp_device *isp) diff --git a/drivers/staging/media/atomisp/pci/atomisp_internal.h b/drivers/staging/media/atomisp/pci/atomisp_internal.h index f71ab1ee6e19c..ce1746e7ab9f4 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_internal.h +++ b/drivers/staging/media/atomisp/pci/atomisp_internal.h @@ -34,7 +34,6 @@ #include "sh_css_legacy.h" #include "atomisp_csi2.h" -#include "atomisp_file.h" #include "atomisp_subdev.h" #include "atomisp_tpg.h" #include "atomisp_compat.h" @@ -86,7 +85,7 @@ #define 
ATOM_ISP_POWER_DOWN 0 #define ATOM_ISP_POWER_UP 1 -#define ATOM_ISP_MAX_INPUTS 4 +#define ATOM_ISP_MAX_INPUTS 3 #define ATOMISP_SC_TYPE_SIZE 2 @@ -241,7 +240,6 @@ struct atomisp_device { struct atomisp_mipi_csi2_device csi2_port[ATOMISP_CAMERA_NR_PORTS]; struct atomisp_tpg_device tpg; - struct atomisp_file_device file_dev; /* Purpose of mutex is to protect and serialize use of isp data * structures and css API calls. */ diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index d0b5dacbb20aa..571a2df75238e 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -609,8 +609,7 @@ atomisp_subdev_streaming_count(struct atomisp_sub_device *asd) return asd->video_out_preview.capq.streaming + asd->video_out_capture.capq.streaming + asd->video_out_video_capture.capq.streaming - + asd->video_out_vf.capq.streaming - + asd->video_in.capq.streaming; + + asd->video_out_vf.capq.streaming; } unsigned int atomisp_streaming_count(struct atomisp_device *isp) diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.c b/drivers/staging/media/atomisp/pci/atomisp_subdev.c index 6d533919d466f..1509543924d29 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.c +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.c @@ -1126,9 +1126,6 @@ static int isp_subdev_init_entities(struct atomisp_sub_device *asd) if (ret < 0) return ret; - atomisp_init_subdev_pipe(asd, &asd->video_in, - V4L2_BUF_TYPE_VIDEO_OUTPUT); - atomisp_init_subdev_pipe(asd, &asd->video_out_preview, V4L2_BUF_TYPE_VIDEO_CAPTURE); @@ -1143,11 +1140,6 @@ static int isp_subdev_init_entities(struct atomisp_sub_device *asd) atomisp_init_acc_pipe(asd, &asd->video_acc); - ret = atomisp_video_init(&asd->video_in, "MEMORY", - ATOMISP_RUN_MODE_SDV); - if (ret < 0) - return ret; - ret = atomisp_video_init(&asd->video_out_capture, "CAPTURE", ATOMISP_RUN_MODE_STILL_CAPTURE); if (ret < 0) @@ -1226,7 
+1218,11 @@ int atomisp_create_pads_links(struct atomisp_device *isp) return ret; } } - for (i = 0; i < isp->input_cnt - 2; i++) { + for (i = 0; i < isp->input_cnt; i++) { + /* Don't create links for the test-pattern-generator */ + if (isp->inputs[i].type == TEST_PATTERN) + continue; + ret = media_create_pad_link(&isp->inputs[i].camera->entity, 0, &isp->csi2_port[isp->inputs[i]. port].subdev.entity, @@ -1262,17 +1258,6 @@ int atomisp_create_pads_links(struct atomisp_device *isp) entity, 0, 0); if (ret < 0) return ret; - /* - * file input only supported on subdev0 - * so do not create pad link for subdevs other then subdev0 - */ - if (asd->index) - return 0; - ret = media_create_pad_link(&asd->video_in.vdev.entity, - 0, &asd->subdev.entity, - ATOMISP_SUBDEV_PAD_SINK, 0); - if (ret < 0) - return ret; } return 0; } @@ -1302,7 +1287,6 @@ void atomisp_subdev_unregister_entities(struct atomisp_sub_device *asd) { atomisp_subdev_cleanup_entities(asd); v4l2_device_unregister_subdev(&asd->subdev); - atomisp_video_unregister(&asd->video_in); atomisp_video_unregister(&asd->video_out_preview); atomisp_video_unregister(&asd->video_out_vf); atomisp_video_unregister(&asd->video_out_capture); @@ -1360,20 +1344,6 @@ int atomisp_subdev_register_entities(struct atomisp_sub_device *asd, if (ret < 0) goto error; - /* - * file input only supported on subdev0 - * so do not create video node for subdevs other then subdev0 - */ - if (asd->index) - return 0; - - asd->video_in.vdev.v4l2_dev = vdev; - asd->video_in.vdev.device_caps = V4L2_CAP_VIDEO_OUT | V4L2_CAP_STREAMING; - ret = video_register_device(&asd->video_in.vdev, - VFL_TYPE_VIDEO, -1); - if (ret < 0) - goto error; - return 0; error: diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.h b/drivers/staging/media/atomisp/pci/atomisp_subdev.h index 798a93793a9a4..938d427bede89 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.h +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.h @@ -297,7 +297,6 @@ struct 
atomisp_sub_device { enum atomisp_subdev_input_entity input; unsigned int output; - struct atomisp_video_pipe video_in; struct atomisp_video_pipe video_out_capture; /* capture output */ struct atomisp_video_pipe video_out_vf; /* viewfinder output */ struct atomisp_video_pipe video_out_preview; /* preview output */ diff --git a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c index 643ba981601b6..5488a02200ed1 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c +++ b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c @@ -34,7 +34,6 @@ #include "atomisp_cmd.h" #include "atomisp_common.h" #include "atomisp_fops.h" -#include "atomisp_file.h" #include "atomisp_ioctl.h" #include "atomisp_internal.h" #include "atomisp-regs.h" @@ -1158,7 +1157,6 @@ static void atomisp_unregister_entities(struct atomisp_device *isp) for (i = 0; i < isp->num_of_streams; i++) atomisp_subdev_unregister_entities(&isp->asd[i]); atomisp_tpg_unregister_entities(&isp->tpg); - atomisp_file_input_unregister_entities(&isp->file_dev); for (i = 0; i < ATOMISP_CAMERA_NR_PORTS; i++) atomisp_mipi_csi2_unregister_entities(&isp->csi2_port[i]); @@ -1210,13 +1208,6 @@ static int atomisp_register_entities(struct atomisp_device *isp) goto csi_and_subdev_probe_failed; } - ret = - atomisp_file_input_register_entities(&isp->file_dev, &isp->v4l2_dev); - if (ret < 0) { - dev_err(isp->dev, "atomisp_file_input_register_entities\n"); - goto file_input_register_failed; - } - ret = atomisp_tpg_register_entities(&isp->tpg, &isp->v4l2_dev); if (ret < 0) { dev_err(isp->dev, "atomisp_tpg_register_entities\n"); @@ -1267,14 +1258,6 @@ static int atomisp_register_entities(struct atomisp_device *isp) } } - dev_dbg(isp->dev, - "FILE_INPUT enable, camera_cnt: %d\n", isp->input_cnt); - isp->inputs[isp->input_cnt].type = FILE_INPUT; - isp->inputs[isp->input_cnt].port = -1; - isp->inputs[isp->input_cnt].camera_caps = - atomisp_get_default_camera_caps(); - 
isp->inputs[isp->input_cnt++].camera = &isp->file_dev.sd; - if (isp->input_cnt < ATOM_ISP_MAX_INPUTS) { dev_dbg(isp->dev, "TPG detected, camera_cnt: %d\n", isp->input_cnt); @@ -1304,8 +1287,6 @@ wq_alloc_failed: subdev_register_failed: atomisp_tpg_unregister_entities(&isp->tpg); tpg_register_failed: - atomisp_file_input_unregister_entities(&isp->file_dev); -file_input_register_failed: for (i = 0; i < ATOMISP_CAMERA_NR_PORTS; i++) atomisp_mipi_csi2_unregister_entities(&isp->csi2_port[i]); csi_and_subdev_probe_failed: @@ -1326,13 +1307,6 @@ static int atomisp_initialize_modules(struct atomisp_device *isp) goto error_mipi_csi2; } - ret = atomisp_file_input_init(isp); - if (ret < 0) { - dev_err(isp->dev, - "file input device initialization failed\n"); - goto error_file_input; - } - ret = atomisp_tpg_init(isp); if (ret < 0) { dev_err(isp->dev, "tpg initialization failed\n"); @@ -1350,8 +1324,6 @@ static int atomisp_initialize_modules(struct atomisp_device *isp) error_isp_subdev: error_tpg: atomisp_tpg_cleanup(isp); -error_file_input: - atomisp_file_input_cleanup(isp); error_mipi_csi2: atomisp_mipi_csi2_cleanup(isp); return ret; @@ -1360,7 +1332,6 @@ error_mipi_csi2: static void atomisp_uninitialize_modules(struct atomisp_device *isp) { atomisp_tpg_cleanup(isp); - atomisp_file_input_cleanup(isp); atomisp_mipi_csi2_cleanup(isp); } @@ -1852,7 +1823,6 @@ static void atomisp_pci_remove(struct pci_dev *pdev) atomisp_unregister_entities(isp); destroy_workqueue(isp->wdt_work_queue); - atomisp_file_input_cleanup(isp); release_firmware(isp->firmware); } -- GitLab From 1ace82c7f9afee232c7e39f533fb40d636941090 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 27 Aug 2022 16:28:50 +0200 Subject: [PATCH 0559/2223] media: atomisp: Remove atomisp_file_fops and atomisp_file_ioctl_ops After the file-injection support removal, atomisp_video_pipe->type never is V4L2_BUF_TYPE_VIDEO_OUTPUT anymore, so the V4L2_BUF_TYPE_VIDEO_OUTPUT support path in atomisp_video_init() is never hit and 
this path is the only user of atomisp_file_fops and atomisp_file_ioctl_ops. Remove atomisp_file_fops and atomisp_file_ioctl_ops and all of the functions which are only referenced by these ops structs. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/atomisp_cmd.c | 84 ------------ .../staging/media/atomisp/pci/atomisp_cmd.h | 1 - .../staging/media/atomisp/pci/atomisp_fops.c | 20 --- .../staging/media/atomisp/pci/atomisp_ioctl.c | 122 ------------------ .../staging/media/atomisp/pci/atomisp_ioctl.h | 4 - .../staging/media/atomisp/pci/atomisp_v4l2.c | 6 - 6 files changed, 237 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.c b/drivers/staging/media/atomisp/pci/atomisp_cmd.c index db6465756e497..8313724f06b37 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.c +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.c @@ -4906,41 +4906,6 @@ int atomisp_try_fmt(struct video_device *vdev, struct v4l2_pix_format *f, return 0; } -static int -atomisp_try_fmt_file(struct atomisp_device *isp, struct v4l2_format *f) -{ - u32 width = f->fmt.pix.width; - u32 height = f->fmt.pix.height; - u32 pixelformat = f->fmt.pix.pixelformat; - enum v4l2_field field = f->fmt.pix.field; - u32 depth; - - if (!atomisp_get_format_bridge(pixelformat)) { - dev_err(isp->dev, "Wrong output pixelformat\n"); - return -EINVAL; - } - - depth = atomisp_get_pixel_depth(pixelformat); - - if (field == V4L2_FIELD_ANY) { - field = V4L2_FIELD_NONE; - } else if (field != V4L2_FIELD_NONE) { - dev_err(isp->dev, "Wrong output field\n"); - return -EINVAL; - } - - f->fmt.pix.field = field; - f->fmt.pix.width = clamp_t(u32, - rounddown(width, (u32)ATOM_ISP_STEP_WIDTH), - ATOM_ISP_MIN_WIDTH, ATOM_ISP_MAX_WIDTH); - f->fmt.pix.height = clamp_t(u32, rounddown(height, - (u32)ATOM_ISP_STEP_HEIGHT), - ATOM_ISP_MIN_HEIGHT, ATOM_ISP_MAX_HEIGHT); - f->fmt.pix.bytesperline = (width * depth) >> 3; - - return 0; -} - enum 
mipi_port_id __get_mipi_port(struct atomisp_device *isp, enum atomisp_camera_port port) { @@ -6078,55 +6043,6 @@ done: return 0; } -int atomisp_set_fmt_file(struct video_device *vdev, struct v4l2_format *f) -{ - struct atomisp_device *isp = video_get_drvdata(vdev); - struct atomisp_video_pipe *pipe = atomisp_to_video_pipe(vdev); - struct atomisp_sub_device *asd = pipe->asd; - struct v4l2_mbus_framefmt ffmt = {0}; - const struct atomisp_format_bridge *format_bridge; - struct v4l2_subdev_fh fh; - int ret; - - if (!asd) { - dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } - - v4l2_fh_init(&fh.vfh, vdev); - - dev_dbg(isp->dev, "setting fmt %ux%u 0x%x for file inject\n", - f->fmt.pix.width, f->fmt.pix.height, f->fmt.pix.pixelformat); - ret = atomisp_try_fmt_file(isp, f); - if (ret) { - dev_err(isp->dev, "atomisp_try_fmt_file err: %d\n", ret); - return ret; - } - - format_bridge = atomisp_get_format_bridge(f->fmt.pix.pixelformat); - if (!format_bridge) { - dev_dbg(isp->dev, "atomisp_get_format_bridge err! 
fmt:0x%x\n", - f->fmt.pix.pixelformat); - return -EINVAL; - } - - pipe->pix = f->fmt.pix; - atomisp_css_input_set_mode(asd, IA_CSS_INPUT_MODE_FIFO); - atomisp_css_input_configure_port(asd, - __get_mipi_port(isp, ATOMISP_CAMERA_PORT_PRIMARY), 2, 0xffff4, - 0, 0, 0, 0); - ffmt.width = f->fmt.pix.width; - ffmt.height = f->fmt.pix.height; - ffmt.code = format_bridge->mbus_code; - - atomisp_subdev_set_ffmt(&asd->subdev, fh.state, - V4L2_SUBDEV_FORMAT_ACTIVE, - ATOMISP_SUBDEV_PAD_SINK, &ffmt); - - return 0; -} - int atomisp_set_shading_table(struct atomisp_sub_device *asd, struct atomisp_shading_table *user_shading_table) { diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.h b/drivers/staging/media/atomisp/pci/atomisp_cmd.h index ebc729468f873..ed1ad53891042 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.h +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.h @@ -269,7 +269,6 @@ int atomisp_try_fmt(struct video_device *vdev, struct v4l2_pix_format *f, bool *res_overflow); int atomisp_set_fmt(struct video_device *vdev, struct v4l2_format *f); -int atomisp_set_fmt_file(struct video_device *vdev, struct v4l2_format *f); int atomisp_set_shading_table(struct atomisp_sub_device *asd, struct atomisp_shading_table *shading_table); diff --git a/drivers/staging/media/atomisp/pci/atomisp_fops.c b/drivers/staging/media/atomisp/pci/atomisp_fops.c index 9ff0bcc043072..08b62fc65c769 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_fops.c +++ b/drivers/staging/media/atomisp/pci/atomisp_fops.c @@ -1269,14 +1269,6 @@ error: return ret; } -static int atomisp_file_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct video_device *vdev = video_devdata(file); - struct atomisp_video_pipe *pipe = atomisp_to_video_pipe(vdev); - - return videobuf_mmap_mapper(&pipe->outq, vma); -} - static __poll_t atomisp_poll(struct file *file, struct poll_table_struct *pt) { @@ -1309,15 +1301,3 @@ const struct v4l2_file_operations atomisp_fops = { #endif .poll = 
atomisp_poll, }; - -const struct v4l2_file_operations atomisp_file_fops = { - .owner = THIS_MODULE, - .open = atomisp_open, - .release = atomisp_release, - .mmap = atomisp_file_mmap, - .unlocked_ioctl = video_ioctl2, -#ifdef CONFIG_COMPAT - /* .compat_ioctl32 = atomisp_compat_ioctl32, */ -#endif - .poll = atomisp_poll, -}; diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index 571a2df75238e..345970ca4fcbd 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -871,20 +871,6 @@ static int atomisp_enum_fmt_cap(struct file *file, void *fh, return -EINVAL; } -static int atomisp_g_fmt_file(struct file *file, void *fh, - struct v4l2_format *f) -{ - struct video_device *vdev = video_devdata(file); - struct atomisp_device *isp = video_get_drvdata(vdev); - struct atomisp_video_pipe *pipe = atomisp_to_video_pipe(vdev); - - rt_mutex_lock(&isp->mutex); - f->fmt.pix = pipe->pix; - rt_mutex_unlock(&isp->mutex); - - return 0; -} - static int atomisp_adjust_fmt(struct v4l2_format *f) { const struct atomisp_format_bridge *format_bridge; @@ -1018,19 +1004,6 @@ static int atomisp_s_fmt_cap(struct file *file, void *fh, return ret; } -static int atomisp_s_fmt_file(struct file *file, void *fh, - struct v4l2_format *f) -{ - struct video_device *vdev = video_devdata(file); - struct atomisp_device *isp = video_get_drvdata(vdev); - int ret; - - rt_mutex_lock(&isp->mutex); - ret = atomisp_set_fmt_file(vdev, f); - rt_mutex_unlock(&isp->mutex); - return ret; -} - /* * Free videobuffer buffer priv data */ @@ -1258,22 +1231,6 @@ int atomisp_reqbufs(struct file *file, void *fh, return ret; } -static int atomisp_reqbufs_file(struct file *file, void *fh, - struct v4l2_requestbuffers *req) -{ - struct video_device *vdev = video_devdata(file); - struct atomisp_video_pipe *pipe = atomisp_to_video_pipe(vdev); - - if (req->count == 0) { - mutex_lock(&pipe->outq.vb_lock); - 
atomisp_videobuf_free_queue(&pipe->outq); - mutex_unlock(&pipe->outq.vb_lock); - return 0; - } - - return videobuf_reqbufs(&pipe->outq, req); -} - /* application query the status of a buffer */ static int atomisp_querybuf(struct file *file, void *fh, struct v4l2_buffer *buf) @@ -1284,15 +1241,6 @@ static int atomisp_querybuf(struct file *file, void *fh, return videobuf_querybuf(&pipe->capq, buf); } -static int atomisp_querybuf_file(struct file *file, void *fh, - struct v4l2_buffer *buf) -{ - struct video_device *vdev = video_devdata(file); - struct atomisp_video_pipe *pipe = atomisp_to_video_pipe(vdev); - - return videobuf_querybuf(&pipe->outq, buf); -} - /* * Applications call the VIDIOC_QBUF ioctl to enqueue an empty (capturing) or * filled (output) buffer in the drivers incoming queue. @@ -1473,48 +1421,6 @@ error: return ret; } -static int atomisp_qbuf_file(struct file *file, void *fh, - struct v4l2_buffer *buf) -{ - struct video_device *vdev = video_devdata(file); - struct atomisp_device *isp = video_get_drvdata(vdev); - struct atomisp_video_pipe *pipe = atomisp_to_video_pipe(vdev); - int ret; - - rt_mutex_lock(&isp->mutex); - if (isp->isp_fatal_error) { - ret = -EIO; - goto error; - } - - if (!buf || buf->index >= VIDEO_MAX_FRAME || - !pipe->outq.bufs[buf->index]) { - dev_err(isp->dev, "Invalid index for qbuf.\n"); - ret = -EINVAL; - goto error; - } - - if (buf->memory != V4L2_MEMORY_MMAP) { - dev_err(isp->dev, "Unsupported memory method\n"); - ret = -EINVAL; - goto error; - } - - if (buf->type != V4L2_BUF_TYPE_VIDEO_OUTPUT) { - dev_err(isp->dev, "Unsupported buffer type\n"); - ret = -EINVAL; - goto error; - } - rt_mutex_unlock(&isp->mutex); - - return videobuf_qbuf(&pipe->outq, buf); - -error: - rt_mutex_unlock(&isp->mutex); - - return ret; -} - static int __get_frame_exp_id(struct atomisp_video_pipe *pipe, struct v4l2_buffer *buf) { @@ -2882,24 +2788,6 @@ out: return rval == -ENOIOCTLCMD ? 
0 : rval; } -static int atomisp_s_parm_file(struct file *file, void *fh, - struct v4l2_streamparm *parm) -{ - struct video_device *vdev = video_devdata(file); - struct atomisp_device *isp = video_get_drvdata(vdev); - - if (parm->type != V4L2_BUF_TYPE_VIDEO_OUTPUT) { - dev_err(isp->dev, "unsupported v4l2 buf type for output\n"); - return -EINVAL; - } - - rt_mutex_lock(&isp->mutex); - isp->sw_contex.file_input = true; - rt_mutex_unlock(&isp->mutex); - - return 0; -} - static long atomisp_vidioc_default(struct file *file, void *fh, bool valid_prio, unsigned int cmd, void *arg) { @@ -3230,13 +3118,3 @@ const struct v4l2_ioctl_ops atomisp_ioctl_ops = { .vidioc_s_parm = atomisp_s_parm, .vidioc_g_parm = atomisp_g_parm, }; - -const struct v4l2_ioctl_ops atomisp_file_ioctl_ops = { - .vidioc_querycap = atomisp_querycap, - .vidioc_g_fmt_vid_out = atomisp_g_fmt_file, - .vidioc_s_fmt_vid_out = atomisp_s_fmt_file, - .vidioc_s_parm = atomisp_s_parm_file, - .vidioc_reqbufs = atomisp_reqbufs_file, - .vidioc_querybuf = atomisp_querybuf_file, - .vidioc_qbuf = atomisp_qbuf_file, -}; diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.h b/drivers/staging/media/atomisp/pci/atomisp_ioctl.h index d85e0d697a4e7..382b78275240a 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.h +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.h @@ -49,12 +49,8 @@ enum ia_css_pipe_id atomisp_get_css_pipe_id(struct atomisp_sub_device void atomisp_videobuf_free_buf(struct videobuf_buffer *vb); -extern const struct v4l2_file_operations atomisp_file_fops; - extern const struct v4l2_ioctl_ops atomisp_ioctl_ops; -extern const struct v4l2_ioctl_ops atomisp_file_ioctl_ops; - unsigned int atomisp_streaming_count(struct atomisp_device *isp); /* compat_ioctl for 32bit userland app and 64bit kernel */ diff --git a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c index 5488a02200ed1..672b3b68c613b 100644 --- 
a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c +++ b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c @@ -442,12 +442,6 @@ int atomisp_video_init(struct atomisp_video_pipe *video, const char *name, video->vdev.fops = &atomisp_fops; video->vdev.ioctl_ops = &atomisp_ioctl_ops; break; - case V4L2_BUF_TYPE_VIDEO_OUTPUT: - direction = "input"; - video->pad.flags = MEDIA_PAD_FL_SOURCE; - video->vdev.fops = &atomisp_file_fops; - video->vdev.ioctl_ops = &atomisp_file_ioctl_ops; - break; default: return -EINVAL; } -- GitLab From 5e61114e3abf221b37f2e4ab7da35dae991cfad2 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 27 Aug 2022 16:39:10 +0200 Subject: [PATCH 0560/2223] media: atomisp: Remove the outq videobuf queue After the file-injection support removal the outq videobuf queue is no longer used, remove it. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/atomisp_fops.c | 62 ------------------- .../media/atomisp/pci/atomisp_subdev.c | 1 - .../media/atomisp/pci/atomisp_subdev.h | 9 ++- 3 files changed, 4 insertions(+), 68 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_fops.c b/drivers/staging/media/atomisp/pci/atomisp_fops.c index 08b62fc65c769..d6a7198a957c5 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_fops.c +++ b/drivers/staging/media/atomisp/pci/atomisp_fops.c @@ -593,47 +593,6 @@ static void atomisp_buf_release(struct videobuf_queue *vq, atomisp_videobuf_free_buf(vb); } -static int atomisp_buf_setup_output(struct videobuf_queue *vq, - unsigned int *count, unsigned int *size) -{ - struct atomisp_video_pipe *pipe = vq->priv_data; - - *size = pipe->pix.sizeimage; - - return 0; -} - -static int atomisp_buf_prepare_output(struct videobuf_queue *vq, - struct videobuf_buffer *vb, - enum v4l2_field field) -{ - struct atomisp_video_pipe *pipe = vq->priv_data; - - vb->size = pipe->pix.sizeimage; - vb->width = pipe->pix.width; - vb->height = pipe->pix.height; 
- vb->field = field; - vb->state = VIDEOBUF_PREPARED; - - return 0; -} - -static void atomisp_buf_queue_output(struct videobuf_queue *vq, - struct videobuf_buffer *vb) -{ - struct atomisp_video_pipe *pipe = vq->priv_data; - - list_add_tail(&vb->queue, &pipe->activeq_out); - vb->state = VIDEOBUF_QUEUED; -} - -static void atomisp_buf_release_output(struct videobuf_queue *vq, - struct videobuf_buffer *vb) -{ - videobuf_vmalloc_free(vb); - vb->state = VIDEOBUF_NEEDS_INIT; -} - static const struct videobuf_queue_ops videobuf_qops = { .buf_setup = atomisp_buf_setup, .buf_prepare = atomisp_buf_prepare, @@ -641,13 +600,6 @@ static const struct videobuf_queue_ops videobuf_qops = { .buf_release = atomisp_buf_release, }; -static const struct videobuf_queue_ops videobuf_qops_output = { - .buf_setup = atomisp_buf_setup_output, - .buf_prepare = atomisp_buf_prepare_output, - .buf_queue = atomisp_buf_queue_output, - .buf_release = atomisp_buf_release_output, -}; - static int atomisp_init_pipe(struct atomisp_video_pipe *pipe) { /* init locks */ @@ -660,15 +612,7 @@ static int atomisp_init_pipe(struct atomisp_video_pipe *pipe) sizeof(struct atomisp_buffer), pipe, NULL); /* ext_lock: NULL */ - videobuf_queue_vmalloc_init(&pipe->outq, &videobuf_qops_output, NULL, - &pipe->irq_lock, - V4L2_BUF_TYPE_VIDEO_OUTPUT, - V4L2_FIELD_NONE, - sizeof(struct atomisp_buffer), pipe, - NULL); /* ext_lock: NULL */ - INIT_LIST_HEAD(&pipe->activeq); - INIT_LIST_HEAD(&pipe->activeq_out); INIT_LIST_HEAD(&pipe->buffers_waiting_for_param); INIT_LIST_HEAD(&pipe->per_frame_params); memset(pipe->frame_request_config_id, 0, @@ -964,12 +908,6 @@ static int atomisp_release(struct file *file) goto done; } - if (pipe->outq.bufs[0]) { - mutex_lock(&pipe->outq.vb_lock); - videobuf_queue_cancel(&pipe->outq); - mutex_unlock(&pipe->outq.vb_lock); - } - /* * A little trick here: * file injection input resolution is recorded in the sink pad, diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.c 
b/drivers/staging/media/atomisp/pci/atomisp_subdev.c index 1509543924d29..e05aeb0ca86bc 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.c +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.c @@ -1066,7 +1066,6 @@ static void atomisp_init_subdev_pipe(struct atomisp_sub_device *asd, pipe->isp = asd->isp; spin_lock_init(&pipe->irq_lock); INIT_LIST_HEAD(&pipe->activeq); - INIT_LIST_HEAD(&pipe->activeq_out); INIT_LIST_HEAD(&pipe->buffers_waiting_for_param); INIT_LIST_HEAD(&pipe->per_frame_params); memset(pipe->frame_request_config_id, diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.h b/drivers/staging/media/atomisp/pci/atomisp_subdev.h index 938d427bede89..d89ae3274180d 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.h +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.h @@ -70,9 +70,7 @@ struct atomisp_video_pipe { enum v4l2_buf_type type; struct media_pad pad; struct videobuf_queue capq; - struct videobuf_queue outq; struct list_head activeq; - struct list_head activeq_out; /* * the buffers waiting for per-frame parameters, this is only valid * in per-frame setting mode. @@ -86,9 +84,10 @@ struct atomisp_video_pipe { unsigned int buffers_in_css; - /* irq_lock is used to protect video buffer state change operations and - * also to make activeq, activeq_out, capq and outq list - * operations atomic. */ + /* + * irq_lock is used to protect video buffer state change operations and + * also to make activeq and capq operations atomic. + */ spinlock_t irq_lock; unsigned int users; -- GitLab From 79adb947ef6d6ecb0337a8ad515459c5655300ca Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 27 Aug 2022 16:49:23 +0200 Subject: [PATCH 0561/2223] media: atomisp: Remove never set file_input flag After the file-injection support removal the file_input flag is always false. Remove the flag and replace any code checking it with the code-path for when it is false. 
Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/atomisp_cmd.c | 41 +++----------- .../staging/media/atomisp/pci/atomisp_fops.c | 12 +--- .../media/atomisp/pci/atomisp_internal.h | 2 - .../staging/media/atomisp/pci/atomisp_ioctl.c | 56 ++++++------------- 4 files changed, 28 insertions(+), 83 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.c b/drivers/staging/media/atomisp/pci/atomisp_cmd.c index 8313724f06b37..8e6c10f25318d 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.c +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.c @@ -1308,9 +1308,7 @@ static void __atomisp_css_recover(struct atomisp_device *isp, bool isp_timeout) bool depth_mode = false; int i, ret, depth_cnt = 0; - if (!isp->sw_contex.file_input) - atomisp_css_irq_enable(isp, - IA_CSS_IRQ_INFO_CSS_RECEIVER_SOF, false); + atomisp_css_irq_enable(isp, IA_CSS_IRQ_INFO_CSS_RECEIVER_SOF, false); BUG_ON(isp->num_of_streams > MAX_STREAM_NUM); @@ -1396,16 +1394,11 @@ static void __atomisp_css_recover(struct atomisp_device *isp, bool isp_timeout) atomisp_csi2_configure(asd); } - if (!isp->sw_contex.file_input) { - atomisp_css_irq_enable(isp, IA_CSS_IRQ_INFO_CSS_RECEIVER_SOF, - atomisp_css_valid_sof(isp)); + atomisp_css_irq_enable(isp, IA_CSS_IRQ_INFO_CSS_RECEIVER_SOF, + atomisp_css_valid_sof(isp)); - if (atomisp_freq_scaling(isp, ATOMISP_DFS_MODE_AUTO, true) < 0) - dev_dbg(isp->dev, "DFS auto failed while recovering!\n"); - } else { - if (atomisp_freq_scaling(isp, ATOMISP_DFS_MODE_MAX, true) < 0) - dev_dbg(isp->dev, "DFS max failed while recovering!\n"); - } + if (atomisp_freq_scaling(isp, ATOMISP_DFS_MODE_AUTO, true) < 0) + dev_dbg(isp->dev, "DFS auto failed while recovering!\n"); for (i = 0; i < isp->num_of_streams; i++) { struct atomisp_sub_device *asd; @@ -1610,10 +1603,7 @@ void atomisp_wdt_work(struct work_struct *work) if (asd->streaming != ATOMISP_DEVICE_STREAMING_ENABLED) continue; - 
atomisp_wdt_refresh(asd, - isp->sw_contex.file_input ? - ATOMISP_ISP_FILE_TIMEOUT_DURATION : - ATOMISP_ISP_TIMEOUT_DURATION); + atomisp_wdt_refresh(asd, ATOMISP_ISP_TIMEOUT_DURATION); } } @@ -1643,14 +1633,10 @@ void atomisp_css_flush(struct atomisp_device *isp) for (i = 0; i < isp->num_of_streams; i++) { struct atomisp_sub_device *asd = &isp->asd[i]; - if (asd->streaming != - ATOMISP_DEVICE_STREAMING_ENABLED) + if (asd->streaming != ATOMISP_DEVICE_STREAMING_ENABLED) continue; - atomisp_wdt_refresh(asd, - isp->sw_contex.file_input ? - ATOMISP_ISP_FILE_TIMEOUT_DURATION : - ATOMISP_ISP_TIMEOUT_DURATION); + atomisp_wdt_refresh(asd, ATOMISP_ISP_TIMEOUT_DURATION); } dev_dbg(isp->dev, "atomisp css flush done\n"); } @@ -1896,14 +1882,6 @@ irqreturn_t atomisp_isr_thread(int irq, void *isp_ptr) } out: rt_mutex_unlock(&isp->mutex); - for (i = 0; i < isp->num_of_streams; i++) { - asd = &isp->asd[i]; - if (asd->streaming == ATOMISP_DEVICE_STREAMING_ENABLED - && css_pipe_done[asd->index] - && isp->sw_contex.file_input) - v4l2_subdev_call(isp->inputs[asd->input_curr].camera, - video, s_stream, 1); - } dev_dbg(isp->dev, "<%s\n", __func__); return IRQ_HANDLED; @@ -5377,8 +5355,7 @@ static int atomisp_set_fmt_to_isp(struct video_device *vdev, ia_css_frame_free(asd->raw_output_frame); asd->raw_output_frame = NULL; - if (!asd->continuous_mode->val && - !asd->params.online_process && !isp->sw_contex.file_input && + if (!asd->continuous_mode->val && !asd->params.online_process && ia_css_frame_allocate_from_info(&asd->raw_output_frame, raw_output_info)) return -ENOMEM; diff --git a/drivers/staging/media/atomisp/pci/atomisp_fops.c b/drivers/staging/media/atomisp/pci/atomisp_fops.c index d6a7198a957c5..ab767b585011d 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_fops.c +++ b/drivers/staging/media/atomisp/pci/atomisp_fops.c @@ -628,7 +628,6 @@ static void atomisp_dev_init_struct(struct atomisp_device *isp) { unsigned int i; - isp->sw_contex.file_input = false; 
isp->need_gfx_throttle = true; isp->isp_fatal_error = false; isp->mipi_frame_size = 0; @@ -915,7 +914,7 @@ static int atomisp_release(struct file *file) * The sink pad setting can only be cleared when all device nodes * get released. */ - if (!isp->sw_contex.file_input && asd->fmt_auto->val) { + if (asd->fmt_auto->val) { struct v4l2_mbus_framefmt isp_sink_fmt = { 0 }; atomisp_subdev_set_ffmt(&asd->subdev, fh.state, @@ -926,15 +925,6 @@ subdev_uninit: if (atomisp_subdev_users(asd)) goto done; - /* clear the sink pad for file input */ - if (isp->sw_contex.file_input && asd->fmt_auto->val) { - struct v4l2_mbus_framefmt isp_sink_fmt = { 0 }; - - atomisp_subdev_set_ffmt(&asd->subdev, fh.state, - V4L2_SUBDEV_FORMAT_ACTIVE, - ATOMISP_SUBDEV_PAD_SINK, &isp_sink_fmt); - } - atomisp_css_free_stat_buffers(asd); atomisp_free_internal_buffers(asd); ret = v4l2_subdev_call(isp->inputs[asd->input_curr].camera, diff --git a/drivers/staging/media/atomisp/pci/atomisp_internal.h b/drivers/staging/media/atomisp/pci/atomisp_internal.h index ce1746e7ab9f4..1d2326a40227b 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_internal.h +++ b/drivers/staging/media/atomisp/pci/atomisp_internal.h @@ -91,7 +91,6 @@ #define ATOMISP_ISP_TIMEOUT_DURATION (2 * HZ) #define ATOMISP_EXT_ISP_TIMEOUT_DURATION (6 * HZ) -#define ATOMISP_ISP_FILE_TIMEOUT_DURATION (60 * HZ) #define ATOMISP_WDT_KEEP_CURRENT_DELAY 0 #define ATOMISP_ISP_MAX_TIMEOUT_COUNT 2 #define ATOMISP_CSS_STOP_TIMEOUT_US 200000 @@ -202,7 +201,6 @@ struct atomisp_regs { }; struct atomisp_sw_contex { - bool file_input; int power_state; int running_freq; }; diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index 345970ca4fcbd..7f89226c858a0 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -737,7 +737,7 @@ static int atomisp_s_input(struct file *file, void *fh, unsigned int input) ret = 
v4l2_subdev_call(motor, core, s_power, 1); } - if (!isp->sw_contex.file_input && motor) + if (motor) ret = v4l2_subdev_call(motor, core, init, 1); asd->input_curr = input; @@ -1841,8 +1841,6 @@ static int atomisp_streamon(struct file *file, void *fh, atomic_set(&asd->sof_count, -1); atomic_set(&asd->sequence, -1); atomic_set(&asd->sequence_temp, -1); - if (isp->sw_contex.file_input) - wdt_duration = ATOMISP_ISP_FILE_TIMEOUT_DURATION; asd->params.dis_proj_data_valid = false; asd->latest_preview_exp_id = 0; @@ -1865,26 +1863,21 @@ start_sensor: atomisp_setup_flash(asd); } - if (!isp->sw_contex.file_input) { - atomisp_css_irq_enable(isp, IA_CSS_IRQ_INFO_CSS_RECEIVER_SOF, - atomisp_css_valid_sof(isp)); - atomisp_csi2_configure(asd); - /* - * set freq to max when streaming count > 1 which indicate - * dual camera would run - */ - if (atomisp_streaming_count(isp) > 1) { - if (atomisp_freq_scaling(isp, - ATOMISP_DFS_MODE_MAX, false) < 0) - dev_dbg(isp->dev, "DFS max mode failed!\n"); - } else { - if (atomisp_freq_scaling(isp, - ATOMISP_DFS_MODE_AUTO, false) < 0) - dev_dbg(isp->dev, "DFS auto mode failed!\n"); - } - } else { - if (atomisp_freq_scaling(isp, ATOMISP_DFS_MODE_MAX, false) < 0) + atomisp_css_irq_enable(isp, IA_CSS_IRQ_INFO_CSS_RECEIVER_SOF, + atomisp_css_valid_sof(isp)); + atomisp_csi2_configure(asd); + /* + * set freq to max when streaming count > 1 which indicate + * dual camera would run + */ + if (atomisp_streaming_count(isp) > 1) { + if (atomisp_freq_scaling(isp, + ATOMISP_DFS_MODE_MAX, false) < 0) dev_dbg(isp->dev, "DFS max mode failed!\n"); + } else { + if (atomisp_freq_scaling(isp, + ATOMISP_DFS_MODE_AUTO, false) < 0) + dev_dbg(isp->dev, "DFS auto mode failed!\n"); } if (asd->depth_mode->val && atomisp_streaming_count(isp) == @@ -2047,15 +2040,6 @@ int __atomisp_streamoff(struct file *file, void *fh, enum v4l2_buf_type type) /* if other streams are running, should not disable watch dog */ rt_mutex_unlock(&isp->mutex); atomisp_wdt_stop(asd, true); - - /* 
- * must stop sending pixels into GP_FIFO before stop - * the pipeline. - */ - if (isp->sw_contex.file_input) - v4l2_subdev_call(isp->inputs[asd->input_curr].camera, - video, s_stream, 0); - rt_mutex_lock(&isp->mutex); } @@ -2072,10 +2056,7 @@ int __atomisp_streamoff(struct file *file, void *fh, enum v4l2_buf_type type) } atomisp_clear_css_buffer_counters(asd); - - if (!isp->sw_contex.file_input) - atomisp_css_irq_enable(isp, IA_CSS_IRQ_INFO_CSS_RECEIVER_SOF, - false); + atomisp_css_irq_enable(isp, IA_CSS_IRQ_INFO_CSS_RECEIVER_SOF, false); if (asd->delayed_init == ATOMISP_DELAYED_INIT_QUEUED) { cancel_work_sync(&asd->delayed_init_work); @@ -2128,9 +2109,8 @@ stopsensor: != atomisp_sensor_start_stream(asd)) return 0; - if (!isp->sw_contex.file_input) - ret = v4l2_subdev_call(isp->inputs[asd->input_curr].camera, - video, s_stream, 0); + ret = v4l2_subdev_call(isp->inputs[asd->input_curr].camera, + video, s_stream, 0); if (isp->flash) { asd->params.num_flash_frames = 0; -- GitLab From af69562a28faa50b7ffea97b8ceb8f6554a70e12 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 28 Aug 2022 21:00:20 +0200 Subject: [PATCH 0562/2223] media: atomisp: Remove the ACC device node The ACC /dev/video# device node uses a struct video_device embedded in an atomisp_acc_pipe struct instead of in an atomisp_video_pipe struct. Yet it uses the same file-ops and ioctl-ops even though it does not have a videobuf queue, which makes e.g. the mmap fop nonsense. Worse, the only file-ops / ioctls which differentiate between the 2 types and correctly do container_of on the right type are the open/release fops and the vidioc_default handler. The mmap and poll fops and *all* other ioctl handlers unconditionally do container_of on the passed in struct video_device blindly assuming they are dealing with the one embedded in the atomisp_video_pipe struct. This makes it trivial for userspace to cause all sorts of undefined behavior by calling mmap, poll or the other ioctls on the ACC device node! 
Presumably the use of the ACC device node was to allow making the special ioctls to load custom programs while the other /dev/video# nodes were already open, since the /dev/video# nodes can currently all be opened only once (which needs to be fixed). commit 4bbca788b6eb ("media: atomisp: remove private acceleration ioctls") has removed the custom ATOMISP_ACC_* ioctls, so there no longer is any reason to keep the ACC device node. As explained above its presence can easily cause the kernel to crash, so remove the ACC device node and the code for handling it. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/atomisp_cmd.c | 9 --- .../staging/media/atomisp/pci/atomisp_cmd.h | 1 - .../staging/media/atomisp/pci/atomisp_fops.c | 69 ++++--------------- .../staging/media/atomisp/pci/atomisp_ioctl.c | 9 +-- .../media/atomisp/pci/atomisp_subdev.c | 18 ----- .../media/atomisp/pci/atomisp_subdev.h | 9 --- .../staging/media/atomisp/pci/atomisp_v4l2.c | 18 ----- .../staging/media/atomisp/pci/atomisp_v4l2.h | 3 - 8 files changed, 16 insertions(+), 120 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.c b/drivers/staging/media/atomisp/pci/atomisp_cmd.c index 8e6c10f25318d..1c4748b7186ed 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.c +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.c @@ -98,15 +98,6 @@ struct atomisp_video_pipe *atomisp_to_video_pipe(struct video_device *dev) container_of(dev, struct atomisp_video_pipe, vdev); } -/* - * get struct atomisp_acc_pipe from v4l2 video_device - */ -struct atomisp_acc_pipe *atomisp_to_acc_pipe(struct video_device *dev) -{ - return (struct atomisp_acc_pipe *) - container_of(dev, struct atomisp_acc_pipe, vdev); -} - static unsigned short atomisp_get_sensor_fps(struct atomisp_sub_device *asd) { struct v4l2_subdev_frame_interval fi = { 0 }; diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.h 
b/drivers/staging/media/atomisp/pci/atomisp_cmd.h index ed1ad53891042..c4472516487ba 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.h +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.h @@ -54,7 +54,6 @@ void dump_sp_dmem(struct atomisp_device *isp, unsigned int addr, unsigned int size); struct camera_mipi_info *atomisp_to_sensor_mipi_info(struct v4l2_subdev *sd); struct atomisp_video_pipe *atomisp_to_video_pipe(struct video_device *dev); -struct atomisp_acc_pipe *atomisp_to_acc_pipe(struct video_device *dev); int atomisp_reset(struct atomisp_device *isp); void atomisp_flush_bufs_and_wakeup(struct atomisp_sub_device *asd); void atomisp_clear_css_buffer_counters(struct atomisp_sub_device *asd); diff --git a/drivers/staging/media/atomisp/pci/atomisp_fops.c b/drivers/staging/media/atomisp/pci/atomisp_fops.c index ab767b585011d..3fa3c28b1a80e 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_fops.c +++ b/drivers/staging/media/atomisp/pci/atomisp_fops.c @@ -684,8 +684,7 @@ static unsigned int atomisp_subdev_users(struct atomisp_sub_device *asd) return asd->video_out_preview.users + asd->video_out_vf.users + asd->video_out_capture.users + - asd->video_out_video_capture.users + - asd->video_acc.users; + asd->video_out_video_capture.users; } unsigned int atomisp_dev_users(struct atomisp_device *isp) @@ -702,10 +701,8 @@ static int atomisp_open(struct file *file) { struct video_device *vdev = video_devdata(file); struct atomisp_device *isp = video_get_drvdata(vdev); - struct atomisp_video_pipe *pipe = NULL; - struct atomisp_acc_pipe *acc_pipe = NULL; - struct atomisp_sub_device *asd; - bool acc_node = false; + struct atomisp_video_pipe *pipe = atomisp_to_video_pipe(vdev); + struct atomisp_sub_device *asd = pipe->asd; int ret; dev_dbg(isp->dev, "open device %s\n", vdev->name); @@ -736,14 +733,6 @@ static int atomisp_open(struct file *file) rt_mutex_lock(&isp->mutex); - acc_node = !strcmp(vdev->name, "ATOMISP ISP ACC"); - if (acc_node) { - acc_pipe = 
atomisp_to_acc_pipe(vdev); - asd = acc_pipe->asd; - } else { - pipe = atomisp_to_video_pipe(vdev); - asd = pipe->asd; - } asd->subdev.devnode = vdev; /* Deferred firmware loading case. */ if (isp->css_env.isp_css_fw.bytes == 0) { @@ -765,14 +754,6 @@ static int atomisp_open(struct file *file) isp->css_env.isp_css_fw.data = NULL; } - if (acc_node && acc_pipe->users) { - dev_dbg(isp->dev, "acc node already opened\n"); - rt_mutex_unlock(&isp->mutex); - return -EBUSY; - } else if (acc_node) { - goto dev_init; - } - if (!isp->input_cnt) { dev_err(isp->dev, "no camera attached\n"); ret = -EINVAL; @@ -792,7 +773,6 @@ static int atomisp_open(struct file *file) if (ret) goto error; -dev_init: if (atomisp_dev_users(isp)) { dev_dbg(isp->dev, "skip init isp in open\n"); goto init_subdev; @@ -827,16 +807,11 @@ init_subdev: atomisp_subdev_init_struct(asd); done: - - if (acc_node) - acc_pipe->users++; - else - pipe->users++; + pipe->users++; rt_mutex_unlock(&isp->mutex); /* Ensure that a mode is set */ - if (!acc_node) - v4l2_ctrl_s_ctrl(asd->run_mode, pipe->default_run_mode); + v4l2_ctrl_s_ctrl(asd->run_mode, pipe->default_run_mode); return 0; @@ -852,10 +827,8 @@ static int atomisp_release(struct file *file) { struct video_device *vdev = video_devdata(file); struct atomisp_device *isp = video_get_drvdata(vdev); - struct atomisp_video_pipe *pipe; - struct atomisp_acc_pipe *acc_pipe; - struct atomisp_sub_device *asd; - bool acc_node; + struct atomisp_video_pipe *pipe = atomisp_to_video_pipe(vdev); + struct atomisp_sub_device *asd = pipe->asd; struct v4l2_requestbuffers req; struct v4l2_subdev_fh fh; struct v4l2_rect clear_compose = {0}; @@ -871,19 +844,9 @@ static int atomisp_release(struct file *file) rt_mutex_lock(&isp->mutex); dev_dbg(isp->dev, "release device %s\n", vdev->name); - acc_node = !strcmp(vdev->name, "ATOMISP ISP ACC"); - if (acc_node) { - acc_pipe = atomisp_to_acc_pipe(vdev); - asd = acc_pipe->asd; - } else { - pipe = atomisp_to_video_pipe(vdev); - asd = 
pipe->asd; - } + asd->subdev.devnode = vdev; - if (acc_node) { - acc_pipe->users--; - goto subdev_uninit; - } + pipe->users--; if (pipe->capq.streaming) @@ -921,7 +884,7 @@ static int atomisp_release(struct file *file) V4L2_SUBDEV_FORMAT_ACTIVE, ATOMISP_SUBDEV_PAD_SINK, &isp_sink_fmt); } -subdev_uninit: + if (atomisp_subdev_users(asd)) goto done; @@ -956,13 +919,11 @@ subdev_uninit: dev_err(isp->dev, "Failed to power off device\n"); done: - if (!acc_node) { - atomisp_subdev_set_selection(&asd->subdev, fh.state, - V4L2_SUBDEV_FORMAT_ACTIVE, - atomisp_subdev_source_pad(vdev), - V4L2_SEL_TGT_COMPOSE, 0, - &clear_compose); - } + atomisp_subdev_set_selection(&asd->subdev, fh.state, + V4L2_SUBDEV_FORMAT_ACTIVE, + atomisp_subdev_source_pad(vdev), + V4L2_SEL_TGT_COMPOSE, 0, + &clear_compose); rt_mutex_unlock(&isp->mutex); mutex_unlock(&isp->streamoff_mutex); diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index 7f89226c858a0..bdbb9dbbceece 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -2773,17 +2773,10 @@ static long atomisp_vidioc_default(struct file *file, void *fh, { struct video_device *vdev = video_devdata(file); struct atomisp_device *isp = video_get_drvdata(vdev); - struct atomisp_sub_device *asd; + struct atomisp_sub_device *asd = atomisp_to_video_pipe(vdev)->asd; struct v4l2_subdev *motor; - bool acc_node; int err; - acc_node = !strcmp(vdev->name, "ATOMISP ISP ACC"); - if (acc_node) - asd = atomisp_to_acc_pipe(vdev)->asd; - else - asd = atomisp_to_video_pipe(vdev)->asd; - if (!IS_ISP2401) motor = isp->inputs[asd->input_curr].motor; else diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.c b/drivers/staging/media/atomisp/pci/atomisp_subdev.c index e05aeb0ca86bc..5e66d6a695568 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.c +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.c @@ -1075,13 +1075,6 
@@ static void atomisp_init_subdev_pipe(struct atomisp_sub_device *asd, sizeof(struct atomisp_css_params_with_list *)); } -static void atomisp_init_acc_pipe(struct atomisp_sub_device *asd, - struct atomisp_acc_pipe *pipe) -{ - pipe->asd = asd; - pipe->isp = asd->isp; -} - /* * isp_subdev_init_entities - Initialize V4L2 subdev and media entity * @asd: ISP CCDC module @@ -1137,8 +1130,6 @@ static int isp_subdev_init_entities(struct atomisp_sub_device *asd) atomisp_init_subdev_pipe(asd, &asd->video_out_video_capture, V4L2_BUF_TYPE_VIDEO_CAPTURE); - atomisp_init_acc_pipe(asd, &asd->video_acc); - ret = atomisp_video_init(&asd->video_out_capture, "CAPTURE", ATOMISP_RUN_MODE_STILL_CAPTURE); if (ret < 0) @@ -1159,8 +1150,6 @@ static int isp_subdev_init_entities(struct atomisp_sub_device *asd) if (ret < 0) return ret; - atomisp_acc_init(&asd->video_acc, "ACC"); - ret = v4l2_ctrl_handler_init(&asd->ctrl_handler, 1); if (ret) return ret; @@ -1290,7 +1279,6 @@ void atomisp_subdev_unregister_entities(struct atomisp_sub_device *asd) atomisp_video_unregister(&asd->video_out_vf); atomisp_video_unregister(&asd->video_out_capture); atomisp_video_unregister(&asd->video_out_video_capture); - atomisp_acc_unregister(&asd->video_acc); } int atomisp_subdev_register_entities(struct atomisp_sub_device *asd, @@ -1336,12 +1324,6 @@ int atomisp_subdev_register_entities(struct atomisp_sub_device *asd, VFL_TYPE_VIDEO, -1); if (ret < 0) goto error; - asd->video_acc.vdev.v4l2_dev = vdev; - asd->video_acc.vdev.device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_STREAMING; - ret = video_register_device(&asd->video_acc.vdev, - VFL_TYPE_VIDEO, -1); - if (ret < 0) - goto error; return 0; diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.h b/drivers/staging/media/atomisp/pci/atomisp_subdev.h index d89ae3274180d..e36e112c3b293 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.h +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.h @@ -119,14 +119,6 @@ struct atomisp_video_pipe { 
atomic_t wdt_count; }; -struct atomisp_acc_pipe { - struct video_device vdev; - unsigned int users; - bool running; - struct atomisp_sub_device *asd; - struct atomisp_device *isp; -}; - struct atomisp_pad_format { struct v4l2_mbus_framefmt fmt; struct v4l2_rect crop; @@ -299,7 +291,6 @@ struct atomisp_sub_device { struct atomisp_video_pipe video_out_capture; /* capture output */ struct atomisp_video_pipe video_out_vf; /* viewfinder output */ struct atomisp_video_pipe video_out_preview; /* preview output */ - struct atomisp_acc_pipe video_acc; /* video pipe main output */ struct atomisp_video_pipe video_out_video_capture; /* struct isp_subdev_params params; */ diff --git a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c index 672b3b68c613b..bb48c74c0c07e 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c +++ b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c @@ -460,18 +460,6 @@ int atomisp_video_init(struct atomisp_video_pipe *video, const char *name, return 0; } -void atomisp_acc_init(struct atomisp_acc_pipe *video, const char *name) -{ - video->vdev.fops = &atomisp_fops; - video->vdev.ioctl_ops = &atomisp_ioctl_ops; - - /* Initialize the video device. 
*/ - snprintf(video->vdev.name, sizeof(video->vdev.name), - "ATOMISP ISP %s", name); - video->vdev.release = video_device_release_empty; - video_set_drvdata(&video->vdev, video->isp); -} - void atomisp_video_unregister(struct atomisp_video_pipe *video) { if (video_is_registered(&video->vdev)) { @@ -480,12 +468,6 @@ void atomisp_video_unregister(struct atomisp_video_pipe *video) } } -void atomisp_acc_unregister(struct atomisp_acc_pipe *video) -{ - if (video_is_registered(&video->vdev)) - video_unregister_device(&video->vdev); -} - static int atomisp_save_iunit_reg(struct atomisp_device *isp) { struct pci_dev *pdev = to_pci_dev(isp->dev); diff --git a/drivers/staging/media/atomisp/pci/atomisp_v4l2.h b/drivers/staging/media/atomisp/pci/atomisp_v4l2.h index 72611b8286a4a..ccf1c0ac17b21 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_v4l2.h +++ b/drivers/staging/media/atomisp/pci/atomisp_v4l2.h @@ -22,16 +22,13 @@ #define __ATOMISP_V4L2_H__ struct atomisp_video_pipe; -struct atomisp_acc_pipe; struct v4l2_device; struct atomisp_device; struct firmware; int atomisp_video_init(struct atomisp_video_pipe *video, const char *name, unsigned int run_mode); -void atomisp_acc_init(struct atomisp_acc_pipe *video, const char *name); void atomisp_video_unregister(struct atomisp_video_pipe *video); -void atomisp_acc_unregister(struct atomisp_acc_pipe *video); const struct firmware *atomisp_load_firmware(struct atomisp_device *isp); int atomisp_csi_lane_config(struct atomisp_device *isp); -- GitLab From 5e13ff4cb8f1d8bbc7ff455e47ae347ef26d6867 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 28 Aug 2022 21:11:51 +0200 Subject: [PATCH 0563/2223] media: atomisp: Remove some further ATOMISP_ACC_* related dead code Remove some more code which is no longer referenced after the removal of the ATOMISP_ACC_* custom ioctls. 
Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../media/atomisp/include/linux/atomisp.h | 14 ------------ .../media/atomisp/pci/atomisp_compat.h | 3 --- .../media/atomisp/pci/atomisp_compat_css20.c | 18 --------------- .../media/atomisp/pci/atomisp_subdev.h | 22 ------------------- 4 files changed, 57 deletions(-) diff --git a/drivers/staging/media/atomisp/include/linux/atomisp.h b/drivers/staging/media/atomisp/include/linux/atomisp.h index f96f5adbd9de4..3f602b5aaff92 100644 --- a/drivers/staging/media/atomisp/include/linux/atomisp.h +++ b/drivers/staging/media/atomisp/include/linux/atomisp.h @@ -740,20 +740,6 @@ enum atomisp_frame_status { ATOMISP_FRAME_STATUS_FLASH_FAILED, }; -/* ISP memories, isp2400 */ -enum atomisp_acc_memory { - ATOMISP_ACC_MEMORY_PMEM0 = 0, - ATOMISP_ACC_MEMORY_DMEM0, - /* for backward compatibility */ - ATOMISP_ACC_MEMORY_DMEM = ATOMISP_ACC_MEMORY_DMEM0, - ATOMISP_ACC_MEMORY_VMEM0, - ATOMISP_ACC_MEMORY_VAMEM0, - ATOMISP_ACC_MEMORY_VAMEM1, - ATOMISP_ACC_MEMORY_VAMEM2, - ATOMISP_ACC_MEMORY_HMEM0, - ATOMISP_ACC_NR_MEMORY -}; - enum atomisp_ext_isp_id { EXT_ISP_CID_ISO = 0, EXT_ISP_CID_CAPTURE_HDR, diff --git a/drivers/staging/media/atomisp/pci/atomisp_compat.h b/drivers/staging/media/atomisp/pci/atomisp_compat.h index 3393ae6824f0a..54c57bbf4c4de 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_compat.h +++ b/drivers/staging/media/atomisp/pci/atomisp_compat.h @@ -442,9 +442,6 @@ int atomisp_css_get_dis_stat(struct atomisp_sub_device *asd, int atomisp_css_update_stream(struct atomisp_sub_device *asd); -struct atomisp_acc_fw; -int atomisp_css_set_acc_parameters(struct atomisp_acc_fw *acc_fw); - int atomisp_css_isr_thread(struct atomisp_device *isp, bool *frame_done_found, bool *css_pipe_done); diff --git a/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c b/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c index 5aa108a1724c6..ec47d84698ba3 100644 --- 
a/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c +++ b/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c @@ -3771,24 +3771,6 @@ void atomisp_css_set_cont_prev_start_time(struct atomisp_device *isp, return; } -/* Set the ACC binary arguments */ -int atomisp_css_set_acc_parameters(struct atomisp_acc_fw *acc_fw) -{ - unsigned int mem; - - for (mem = 0; mem < ATOMISP_ACC_NR_MEMORY; mem++) { - if (acc_fw->args[mem].length == 0) - continue; - - ia_css_isp_param_set_css_mem_init(&acc_fw->fw->mem_initializers, - IA_CSS_PARAM_CLASS_PARAM, mem, - acc_fw->args[mem].css_ptr, - acc_fw->args[mem].length); - } - - return 0; -} - static struct atomisp_sub_device *__get_atomisp_subdev( struct ia_css_pipe *css_pipe, struct atomisp_device *isp, diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.h b/drivers/staging/media/atomisp/pci/atomisp_subdev.h index e36e112c3b293..d1a9857e5d68c 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.h +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.h @@ -258,28 +258,6 @@ struct atomisp_css_params_with_list { struct list_head list; }; -struct atomisp_acc_fw { - struct ia_css_fw_info *fw; - unsigned int handle; - unsigned int flags; - unsigned int type; - struct { - size_t length; - unsigned long css_ptr; - } args[ATOMISP_ACC_NR_MEMORY]; - struct list_head list; -}; - -struct atomisp_map { - ia_css_ptr ptr; - size_t length; - struct list_head list; - /* FIXME: should keep book which maps are currently used - * by binaries and not allow releasing those - * which are in use. Implement by reference counting. 
- */ -}; - struct atomisp_sub_device { struct v4l2_subdev subdev; struct media_pad pads[ATOMISP_SUBDEV_PADS_NUM]; -- GitLab From 0d945e4d0a9c966129f4ba38f9e8297dcc271e03 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 28 Aug 2022 21:15:52 +0200 Subject: [PATCH 0564/2223] media: atomisp: Remove empty atomisp_css_set_cont_prev_start_time() function atomisp_css_set_cont_prev_start_time() is a no-op, remove it. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/atomisp_compat.h | 3 --- drivers/staging/media/atomisp/pci/atomisp_compat_css20.c | 8 -------- drivers/staging/media/atomisp/pci/atomisp_internal.h | 3 --- drivers/staging/media/atomisp/pci/atomisp_ioctl.c | 2 -- 4 files changed, 16 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_compat.h b/drivers/staging/media/atomisp/pci/atomisp_compat.h index 54c57bbf4c4de..af6ab8434b5ee 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_compat.h +++ b/drivers/staging/media/atomisp/pci/atomisp_compat.h @@ -434,9 +434,6 @@ void atomisp_css_get_morph_table(struct atomisp_sub_device *asd, void atomisp_css_morph_table_free(struct ia_css_morph_table *table); -void atomisp_css_set_cont_prev_start_time(struct atomisp_device *isp, - unsigned int overlap); - int atomisp_css_get_dis_stat(struct atomisp_sub_device *asd, struct atomisp_dis_statistics *stats); diff --git a/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c b/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c index ec47d84698ba3..cda0b5eba16db 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c +++ b/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c @@ -3763,14 +3763,6 @@ void atomisp_css_morph_table_free(struct ia_css_morph_table *table) ia_css_morph_table_free(table); } -void atomisp_css_set_cont_prev_start_time(struct atomisp_device *isp, - unsigned int overlap) -{ - /* CSS 2.0 doesn't support this API. 
*/ - dev_dbg(isp->dev, "set cont prev start time is not supported.\n"); - return; -} - static struct atomisp_sub_device *__get_atomisp_subdev( struct ia_css_pipe *css_pipe, struct atomisp_device *isp, diff --git a/drivers/staging/media/atomisp/pci/atomisp_internal.h b/drivers/staging/media/atomisp/pci/atomisp_internal.h index 1d2326a40227b..e299304c356b5 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_internal.h +++ b/drivers/staging/media/atomisp/pci/atomisp_internal.h @@ -105,9 +105,6 @@ #define ATOMISP_DELAYED_INIT_QUEUED 1 #define ATOMISP_DELAYED_INIT_DONE 2 -#define ATOMISP_CALC_CSS_PREV_OVERLAP(lines) \ - ((lines) * 38 / 100 & 0xfffffe) - /* * Define how fast CPU should be able to serve ISP interrupts. * The bigger the value, the higher risk that the ISP is not diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index bdbb9dbbceece..caeb38eadc489 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -1934,8 +1934,6 @@ start_delay_wq: reinit_completion(&asd->init_done); asd->delayed_init = ATOMISP_DELAYED_INIT_QUEUED; queue_work(asd->delayed_init_workq, &asd->delayed_init_work); - atomisp_css_set_cont_prev_start_time(isp, - ATOMISP_CALC_CSS_PREV_OVERLAP(sink->height)); } else { asd->delayed_init = ATOMISP_DELAYED_INIT_NOT_QUEUED; } -- GitLab From 3b423e1bed3dfef5b8260b6bd622253e4f974428 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 28 Aug 2022 21:41:50 +0200 Subject: [PATCH 0565/2223] media: atomisp: Split subdev and video-node registration into 2 steps Split subdev and video-node registration into 2 steps, this is a preparation step for moving video-node registration to the end of probe() so that the loading() mutex can be removed. 
Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/atomisp_subdev.c | 16 ++++++++-------- .../staging/media/atomisp/pci/atomisp_subdev.h | 6 ++++-- drivers/staging/media/atomisp/pci/atomisp_v4l2.c | 4 +++- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.c b/drivers/staging/media/atomisp/pci/atomisp_subdev.c index 5e66d6a695568..047e2e9d63d79 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.c +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.c @@ -1281,8 +1281,14 @@ void atomisp_subdev_unregister_entities(struct atomisp_sub_device *asd) atomisp_video_unregister(&asd->video_out_video_capture); } -int atomisp_subdev_register_entities(struct atomisp_sub_device *asd, - struct v4l2_device *vdev) +int atomisp_subdev_register_subdev(struct atomisp_sub_device *asd, + struct v4l2_device *vdev) +{ + return v4l2_device_register_subdev(vdev, &asd->subdev); +} + +int atomisp_subdev_register_video_nodes(struct atomisp_sub_device *asd, + struct v4l2_device *vdev) { int ret; @@ -1291,12 +1297,6 @@ int atomisp_subdev_register_entities(struct atomisp_sub_device *asd, * Should any of those use V4L2_CAP_META_CAPTURE? Probably yes. */ - /* Register the subdev and video node. 
*/ - - ret = v4l2_device_register_subdev(vdev, &asd->subdev); - if (ret < 0) - goto error; - asd->video_out_preview.vdev.v4l2_dev = vdev; asd->video_out_preview.vdev.device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_STREAMING; ret = video_register_device(&asd->video_out_preview.vdev, diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.h b/drivers/staging/media/atomisp/pci/atomisp_subdev.h index d1a9857e5d68c..d8b2dd00a7929 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.h +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.h @@ -417,8 +417,10 @@ int atomisp_update_run_mode(struct atomisp_sub_device *asd); void atomisp_subdev_cleanup_pending_events(struct atomisp_sub_device *asd); void atomisp_subdev_unregister_entities(struct atomisp_sub_device *asd); -int atomisp_subdev_register_entities(struct atomisp_sub_device *asd, - struct v4l2_device *vdev); +int atomisp_subdev_register_subdev(struct atomisp_sub_device *asd, + struct v4l2_device *vdev); +int atomisp_subdev_register_video_nodes(struct atomisp_sub_device *asd, + struct v4l2_device *vdev); int atomisp_subdev_init(struct atomisp_device *isp); void atomisp_subdev_cleanup(struct atomisp_device *isp); int atomisp_create_pads_links(struct atomisp_device *isp); diff --git a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c index bb48c74c0c07e..9a1eae1ba8c0f 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c +++ b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c @@ -1193,7 +1193,9 @@ static int atomisp_register_entities(struct atomisp_device *isp) for (i = 0; i < isp->num_of_streams; i++) { struct atomisp_sub_device *asd = &isp->asd[i]; - ret = atomisp_subdev_register_entities(asd, &isp->v4l2_dev); + ret = atomisp_subdev_register_subdev(asd, &isp->v4l2_dev); + if (ret == 0) + ret = atomisp_subdev_register_video_nodes(asd, &isp->v4l2_dev); if (ret < 0) { dev_err(isp->dev, "atomisp_subdev_register_entities fail\n"); -- GitLab From 
eb81065b9322d6493a152665b4f0974819899c66 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 29 Aug 2022 00:12:12 +0200 Subject: [PATCH 0566/2223] media: atomisp: Register /dev/* nodes at the end of atomisp_pci_probe() Register /dev/* nodes at the end of atomisp_pci_probe(), this is a prerequisite for dropping the loading mutex + ready flag kludge for delaying open() calls on the /dev/* nodes . Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/atomisp_v4l2.c | 40 +++++++++++++------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c index 9a1eae1ba8c0f..f819a6993e45b 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c +++ b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c @@ -1194,11 +1194,8 @@ static int atomisp_register_entities(struct atomisp_device *isp) struct atomisp_sub_device *asd = &isp->asd[i]; ret = atomisp_subdev_register_subdev(asd, &isp->v4l2_dev); - if (ret == 0) - ret = atomisp_subdev_register_video_nodes(asd, &isp->v4l2_dev); if (ret < 0) { - dev_err(isp->dev, - "atomisp_subdev_register_entities fail\n"); + dev_err(isp->dev, "atomisp_subdev_register_subdev fail\n"); for (; i > 0; i--) atomisp_subdev_unregister_entities( &isp->asd[i - 1]); @@ -1248,11 +1245,7 @@ static int atomisp_register_entities(struct atomisp_device *isp) dev_warn(isp->dev, "too many atomisp inputs, TPG ignored.\n"); } - ret = v4l2_device_register_subdev_nodes(&isp->v4l2_dev); - if (ret < 0) - goto link_failed; - - return media_device_register(&isp->media_dev); + return 0; link_failed: for (i = 0; i < isp->num_of_streams; i++) @@ -1275,6 +1268,27 @@ v4l2_device_failed: return ret; } +static int atomisp_register_device_nodes(struct atomisp_device *isp) +{ + int i, err; + + for (i = 0; i < isp->num_of_streams; i++) { + err = atomisp_subdev_register_video_nodes(&isp->asd[i], 
&isp->v4l2_dev); + if (err) + return err; + } + + err = atomisp_create_pads_links(isp); + if (err) + return err; + + err = v4l2_device_register_subdev_nodes(&isp->v4l2_dev); + if (err) + return err; + + return media_device_register(&isp->media_dev); +} + static int atomisp_initialize_modules(struct atomisp_device *isp) { int ret; @@ -1687,9 +1701,6 @@ static int atomisp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *i dev_err(&pdev->dev, "atomisp_register_entities failed (%d)\n", err); goto register_entities_fail; } - err = atomisp_create_pads_links(isp); - if (err < 0) - goto register_entities_fail; /* init atomisp wdts */ err = init_atomisp_wdts(isp); if (err != 0) @@ -1727,8 +1738,13 @@ static int atomisp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *i isp->firmware = NULL; isp->css_env.isp_css_fw.data = NULL; isp->ready = true; + rt_mutex_unlock(&isp->loading); + err = atomisp_register_device_nodes(isp); + if (err) + goto css_init_fail; + atomisp_drvfs_init(isp); return 0; -- GitLab From 5a93d0cacf2fe110848200554b2815b2578c679c Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 29 Aug 2022 12:30:08 +0200 Subject: [PATCH 0567/2223] media: atomisp: Remove loading mutex Now that the registering of the /dev/* video / subdev nodes has been moved to the end of atomisp_pci_probe() the workaround with the loading mutex to delay opens until init is done is no longer necessary. 
Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/atomisp_fops.c | 16 ---------------- .../staging/media/atomisp/pci/atomisp_internal.h | 7 ------- drivers/staging/media/atomisp/pci/atomisp_v4l2.c | 7 ------- 3 files changed, 30 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_fops.c b/drivers/staging/media/atomisp/pci/atomisp_fops.c index 3fa3c28b1a80e..6518e6d5c7b53 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_fops.c +++ b/drivers/staging/media/atomisp/pci/atomisp_fops.c @@ -707,29 +707,13 @@ static int atomisp_open(struct file *file) dev_dbg(isp->dev, "open device %s\n", vdev->name); - /* - * Ensure that if we are still loading we block. Once the loading - * is over we can proceed. We can't blindly hold the lock until - * that occurs as if the load fails we'll deadlock the unload - */ - rt_mutex_lock(&isp->loading); - /* - * FIXME: revisit this with a better check once the code structure - * is cleaned up a bit more - */ ret = v4l2_fh_open(file); if (ret) { dev_err(isp->dev, "%s: v4l2_fh_open() returned error %d\n", __func__, ret); - rt_mutex_unlock(&isp->loading); return ret; } - if (!isp->ready) { - rt_mutex_unlock(&isp->loading); - return -ENXIO; - } - rt_mutex_unlock(&isp->loading); rt_mutex_lock(&isp->mutex); diff --git a/drivers/staging/media/atomisp/pci/atomisp_internal.h b/drivers/staging/media/atomisp/pci/atomisp_internal.h index e299304c356b5..fc7bd877dae88 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_internal.h +++ b/drivers/staging/media/atomisp/pci/atomisp_internal.h @@ -239,13 +239,6 @@ struct atomisp_device { /* Purpose of mutex is to protect and serialize use of isp data * structures and css API calls. 
*/ struct rt_mutex mutex; - /* - * This mutex ensures that we don't allow an open to succeed while - * the initialization process is incomplete - */ - struct rt_mutex loading; - /* Set once the ISP is ready to allow opens */ - bool ready; /* * Serialise streamoff: mutex is dropped during streamoff to * cancel the watchdog queue. MUST be acquired BEFORE diff --git a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c index f819a6993e45b..4d73bf3d64217 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c +++ b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c @@ -1515,7 +1515,6 @@ static int atomisp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *i dev_dbg(&pdev->dev, "atomisp mmio base: %p\n", isp->base); rt_mutex_init(&isp->mutex); - rt_mutex_init(&isp->loading); mutex_init(&isp->streamoff_mutex); spin_lock_init(&isp->lock); @@ -1688,8 +1687,6 @@ static int atomisp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *i pci_write_config_dword(pdev, MRFLD_PCI_CSI_AFE_TRIM_CONTROL, csi_afe_trim); } - rt_mutex_lock(&isp->loading); - err = atomisp_initialize_modules(isp); if (err < 0) { dev_err(&pdev->dev, "atomisp_initialize_modules (%d)\n", err); @@ -1737,9 +1734,6 @@ static int atomisp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *i release_firmware(isp->firmware); isp->firmware = NULL; isp->css_env.isp_css_fw.data = NULL; - isp->ready = true; - - rt_mutex_unlock(&isp->loading); err = atomisp_register_device_nodes(isp); if (err) @@ -1760,7 +1754,6 @@ wdt_work_queue_fail: register_entities_fail: atomisp_uninitialize_modules(isp); initialize_modules_fail: - rt_mutex_unlock(&isp->loading); cpu_latency_qos_remove_request(&isp->pm_qos); atomisp_msi_irq_uninit(isp); pci_free_irq_vectors(pdev); -- GitLab From 5b9853ad1329be49343a608d574eb232ff1273d0 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 29 Aug 2022 12:35:43 +0200 Subject: [PATCH 0568/2223] media: atomisp: Fix v4l2_fh 
resource leak on open errors When atomisp_open() fails then it must call v4l2_fh_release() to undo the results of v4l2_fh_open(). Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/atomisp_fops.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/media/atomisp/pci/atomisp_fops.c b/drivers/staging/media/atomisp/pci/atomisp_fops.c index 6518e6d5c7b53..61571f3fb0604 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_fops.c +++ b/drivers/staging/media/atomisp/pci/atomisp_fops.c @@ -804,6 +804,7 @@ css_error: pm_runtime_put(vdev->v4l2_dev->dev); error: rt_mutex_unlock(&isp->mutex); + v4l2_fh_release(file); return ret; } -- GitLab From ba6856aab1a7edebf1a27c0b6bffaa8a6ea80de7 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 29 Aug 2022 12:37:11 +0200 Subject: [PATCH 0569/2223] media: atomisp: Simplify v4l2_fh_open() error handling v4l2_fh_open() can only fail with -ENOMEM and as a generic rule drivers do not log their own errors for -ENOMEM since the kernel will already have complained loudly about this before the -ENOMEM return. Remove the unnecessary error logging from atomisp_open(). 
Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/atomisp_fops.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_fops.c b/drivers/staging/media/atomisp/pci/atomisp_fops.c index 61571f3fb0604..fa37defa5eccf 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_fops.c +++ b/drivers/staging/media/atomisp/pci/atomisp_fops.c @@ -708,12 +708,8 @@ static int atomisp_open(struct file *file) dev_dbg(isp->dev, "open device %s\n", vdev->name); ret = v4l2_fh_open(file); - if (ret) { - dev_err(isp->dev, - "%s: v4l2_fh_open() returned error %d\n", - __func__, ret); + if (ret) return ret; - } rt_mutex_lock(&isp->mutex); -- GitLab From 3ad290194bb06979367622e47357462836c1d3b4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 1 Sep 2022 07:20:09 +0200 Subject: [PATCH 0570/2223] media: atomisp: prevent integer overflow in sh_css_set_black_frame() The "height" and "width" values come from the user so the "height * width" multiplication can overflow. 
Link: https://lore.kernel.org/r/YxBBCRnm3mmvaiuR@kili Fixes: a49d25364dfb ("staging/atomisp: Add support for the Intel IPU v2") Signed-off-by: Dan Carpenter Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/sh_css_params.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/sh_css_params.c b/drivers/staging/media/atomisp/pci/sh_css_params.c index 0e7c38b2bfe32..67915d76a87f2 100644 --- a/drivers/staging/media/atomisp/pci/sh_css_params.c +++ b/drivers/staging/media/atomisp/pci/sh_css_params.c @@ -950,8 +950,8 @@ sh_css_set_black_frame(struct ia_css_stream *stream, params->fpn_config.data = NULL; } if (!params->fpn_config.data) { - params->fpn_config.data = kvmalloc(height * width * - sizeof(short), GFP_KERNEL); + params->fpn_config.data = kvmalloc(array3_size(height, width, sizeof(short)), + GFP_KERNEL); if (!params->fpn_config.data) { IA_CSS_ERROR("out of memory"); IA_CSS_LEAVE_ERR_PRIVATE(-ENOMEM); -- GitLab From f10fc1790d5ecdaa0aabab9b61be6b162e83386c Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 2 Sep 2022 12:26:51 +0200 Subject: [PATCH 0571/2223] media: atomisp: Use a normal mutex for the main lock There is no reason for atomisp to use a rt_mutex instead of a normal mutex, so switch over to a normal mutex. All the changes in this patch are just s/rt_mutex/mutex/. This is a preparation patch for switching the ioctl locking over to using the video_dev.lock member so that the v4l2-core takes care of the locking. 
Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/atomisp_cmd.c | 12 +- .../staging/media/atomisp/pci/atomisp_fops.c | 28 ++--- .../media/atomisp/pci/atomisp_internal.h | 2 +- .../staging/media/atomisp/pci/atomisp_ioctl.c | 106 +++++++++--------- .../staging/media/atomisp/pci/atomisp_v4l2.c | 2 +- 5 files changed, 75 insertions(+), 75 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.c b/drivers/staging/media/atomisp/pci/atomisp_cmd.c index 1c4748b7186ed..97ef02e4e7a6f 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.c +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.c @@ -1446,10 +1446,10 @@ void atomisp_wdt_work(struct work_struct *work) unsigned int pipe_wdt_cnt[MAX_STREAM_NUM][4] = { {0} }; bool css_recover = true; - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); if (!atomisp_streaming_count(isp)) { atomic_set(&isp->wdt_work_queued, 0); - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return; } @@ -1581,7 +1581,7 @@ void atomisp_wdt_work(struct work_struct *work) isp->isp_fatal_error = true; atomic_set(&isp->wdt_work_queued, 0); - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return; } } @@ -1601,7 +1601,7 @@ void atomisp_wdt_work(struct work_struct *work) dev_err(isp->dev, "timeout recovery handling done\n"); atomic_set(&isp->wdt_work_queued, 0); - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); } void atomisp_css_flush(struct atomisp_device *isp) @@ -1861,7 +1861,7 @@ irqreturn_t atomisp_isr_thread(int irq, void *isp_ptr) * For CSS2.0: we change the way to not dequeue all the event at one * time, instead, dequue one and process one, then another */ - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); if (atomisp_css_isr_thread(isp, frame_done_found, css_pipe_done)) goto out; @@ -1872,7 +1872,7 @@ irqreturn_t atomisp_isr_thread(int irq, void *isp_ptr) atomisp_setup_flash(asd); } out: - rt_mutex_unlock(&isp->mutex); + 
mutex_unlock(&isp->mutex); dev_dbg(isp->dev, "<%s\n", __func__); return IRQ_HANDLED; diff --git a/drivers/staging/media/atomisp/pci/atomisp_fops.c b/drivers/staging/media/atomisp/pci/atomisp_fops.c index fa37defa5eccf..57587d739c4b9 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_fops.c +++ b/drivers/staging/media/atomisp/pci/atomisp_fops.c @@ -711,7 +711,7 @@ static int atomisp_open(struct file *file) if (ret) return ret; - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); asd->subdev.devnode = vdev; /* Deferred firmware loading case. */ @@ -745,7 +745,7 @@ static int atomisp_open(struct file *file) */ if (pipe->users) { dev_dbg(isp->dev, "video node already opened\n"); - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return -EBUSY; } @@ -788,7 +788,7 @@ init_subdev: done: pipe->users++; - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); /* Ensure that a mode is set */ v4l2_ctrl_s_ctrl(asd->run_mode, pipe->default_run_mode); @@ -799,7 +799,7 @@ css_error: atomisp_css_uninit(isp); pm_runtime_put(vdev->v4l2_dev->dev); error: - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); v4l2_fh_release(file); return ret; } @@ -822,7 +822,7 @@ static int atomisp_release(struct file *file) return -EBADF; mutex_lock(&isp->streamoff_mutex); - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); dev_dbg(isp->dev, "release device %s\n", vdev->name); @@ -905,7 +905,7 @@ done: atomisp_subdev_source_pad(vdev), V4L2_SEL_TGT_COMPOSE, 0, &clear_compose); - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); mutex_unlock(&isp->streamoff_mutex); return v4l2_fh_release(file); @@ -1063,7 +1063,7 @@ static int atomisp_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & (VM_WRITE | VM_READ))) return -EACCES; - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); if (!(vma->vm_flags & VM_SHARED)) { /* Map private buffer. 
@@ -1074,7 +1074,7 @@ static int atomisp_mmap(struct file *file, struct vm_area_struct *vma) */ vma->vm_flags |= VM_SHARED; ret = hmm_mmap(vma, vma->vm_pgoff << PAGE_SHIFT); - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return ret; } @@ -1117,7 +1117,7 @@ static int atomisp_mmap(struct file *file, struct vm_area_struct *vma) } raw_virt_addr->data_bytes = origin_size; vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return 0; } @@ -1129,12 +1129,12 @@ static int atomisp_mmap(struct file *file, struct vm_area_struct *vma) ret = -EINVAL; goto error; } - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return atomisp_videobuf_mmap_mapper(&pipe->capq, vma); error: - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return ret; } @@ -1146,12 +1146,12 @@ static __poll_t atomisp_poll(struct file *file, struct atomisp_device *isp = video_get_drvdata(vdev); struct atomisp_video_pipe *pipe = atomisp_to_video_pipe(vdev); - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); if (pipe->capq.streaming != 1) { - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return EPOLLERR; } - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return videobuf_poll_stream(file, &pipe->capq, pt); } diff --git a/drivers/staging/media/atomisp/pci/atomisp_internal.h b/drivers/staging/media/atomisp/pci/atomisp_internal.h index fc7bd877dae88..759575cbd3561 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_internal.h +++ b/drivers/staging/media/atomisp/pci/atomisp_internal.h @@ -238,7 +238,7 @@ struct atomisp_device { /* Purpose of mutex is to protect and serialize use of isp data * structures and css API calls. */ - struct rt_mutex mutex; + struct mutex mutex; /* * Serialise streamoff: mutex is dropped during streamoff to * cancel the watchdog queue. 
MUST be acquired BEFORE diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index caeb38eadc489..4016ac4fffe06 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -638,9 +638,9 @@ static int atomisp_g_input(struct file *file, void *fh, unsigned int *input) return -EINVAL; } - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); *input = asd->input_curr; - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return 0; } @@ -663,7 +663,7 @@ static int atomisp_s_input(struct file *file, void *fh, unsigned int input) return -EINVAL; } - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); if (input >= ATOM_ISP_MAX_INPUTS || input >= isp->input_cnt) { dev_dbg(isp->dev, "input_cnt: %d\n", isp->input_cnt); ret = -EINVAL; @@ -743,12 +743,12 @@ static int atomisp_s_input(struct file *file, void *fh, unsigned int input) asd->input_curr = input; /* mark this camera is used by the current stream */ isp->inputs[input].asd = asd; - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return 0; error: - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return ret; } @@ -831,7 +831,7 @@ static int atomisp_enum_fmt_cap(struct file *file, void *fh, return -EINVAL; } - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); rval = v4l2_subdev_call(camera, pad, enum_mbus_code, NULL, &code); if (rval == -ENOIOCTLCMD) { @@ -839,7 +839,7 @@ static int atomisp_enum_fmt_cap(struct file *file, void *fh, "enum_mbus_code pad op not supported by %s. 
Please fix your sensor driver!\n", camera->name); } - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); if (rval) return rval; @@ -952,9 +952,9 @@ static int atomisp_try_fmt_cap(struct file *file, void *fh, f->fmt.pix.width += pad_w; f->fmt.pix.height += pad_h; - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); ret = atomisp_try_fmt(vdev, &f->fmt.pix, NULL); - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); if (ret) return ret; @@ -969,9 +969,9 @@ static int atomisp_g_fmt_cap(struct file *file, void *fh, struct atomisp_device *isp = video_get_drvdata(vdev); struct atomisp_video_pipe *pipe; - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); pipe = atomisp_to_video_pipe(vdev); - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); f->fmt.pix = pipe->pix; @@ -993,14 +993,14 @@ static int atomisp_s_fmt_cap(struct file *file, void *fh, struct atomisp_device *isp = video_get_drvdata(vdev); int ret; - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); if (isp->isp_fatal_error) { ret = -EIO; - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return ret; } ret = atomisp_set_fmt(vdev, f); - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return ret; } @@ -1224,9 +1224,9 @@ int atomisp_reqbufs(struct file *file, void *fh, struct atomisp_device *isp = video_get_drvdata(vdev); int ret; - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); ret = __atomisp_reqbufs(file, fh, req); - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return ret; } @@ -1267,7 +1267,7 @@ static int atomisp_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) return -EINVAL; } - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); if (isp->isp_fatal_error) { ret = -EIO; goto error; @@ -1366,10 +1366,10 @@ done: pipe->frame_params[buf->index] = NULL; - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); ret = videobuf_qbuf(&pipe->capq, buf); - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); if (ret) goto 
error; @@ -1409,7 +1409,7 @@ done: asd->pending_capture_request++; dev_dbg(isp->dev, "Add one pending capture request.\n"); } - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); dev_dbg(isp->dev, "qbuf buffer %d (%s) for asd%d\n", buf->index, vdev->name, asd->index); @@ -1417,7 +1417,7 @@ done: return ret; error: - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return ret; } @@ -1455,21 +1455,21 @@ static int atomisp_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) return -EINVAL; } - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); if (isp->isp_fatal_error) { - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return -EIO; } if (asd->streaming == ATOMISP_DEVICE_STREAMING_STOPPING) { - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); dev_err(isp->dev, "%s: reject, as ISP at stopping.\n", __func__); return -EIO; } - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); ret = videobuf_dqbuf(&pipe->capq, buf, file->f_flags & O_NONBLOCK); if (ret) { @@ -1477,7 +1477,7 @@ static int atomisp_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) dev_dbg(isp->dev, "<%s: %d\n", __func__, ret); return ret; } - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); buf->bytesused = pipe->pix.sizeimage; buf->reserved = asd->frame_status[buf->index]; @@ -1491,7 +1491,7 @@ static int atomisp_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) if (!(buf->flags & V4L2_BUF_FLAG_ERROR)) buf->reserved |= __get_frame_exp_id(pipe, buf) << 16; buf->reserved2 = pipe->frame_config_id[buf->index]; - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); dev_dbg(isp->dev, "dqbuf buffer %d (%s) for asd%d with exp_id %d, isp_config_id %d\n", @@ -1720,7 +1720,7 @@ static int atomisp_streamon(struct file *file, void *fh, return -EINVAL; } - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); if (isp->isp_fatal_error) { ret = -EIO; goto out; @@ -1774,11 +1774,11 @@ static int atomisp_streamon(struct file *file, 
void *fh, if (asd->delayed_init == ATOMISP_DELAYED_INIT_QUEUED) { flush_work(&asd->delayed_init_work); - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); if (wait_for_completion_interruptible( &asd->init_done) != 0) return -ERESTARTSYS; - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); } /* handle per_frame_setting parameter and buffers */ @@ -1938,7 +1938,7 @@ start_delay_wq: asd->delayed_init = ATOMISP_DELAYED_INIT_NOT_QUEUED; } out: - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return ret; } @@ -2036,9 +2036,9 @@ int __atomisp_streamoff(struct file *file, void *fh, enum v4l2_buf_type type) if (first_streamoff) { /* if other streams are running, should not disable watch dog */ - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); atomisp_wdt_stop(asd, true); - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); } spin_lock_irqsave(&isp->lock, flags); @@ -2188,9 +2188,9 @@ static int atomisp_streamoff(struct file *file, void *fh, int rval; mutex_lock(&isp->streamoff_mutex); - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); rval = __atomisp_streamoff(file, fh, type); - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); mutex_unlock(&isp->streamoff_mutex); return rval; @@ -2225,7 +2225,7 @@ static int atomisp_g_ctrl(struct file *file, void *fh, if (ret) return ret; - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); switch (control->id) { case V4L2_CID_IRIS_ABSOLUTE: @@ -2248,7 +2248,7 @@ static int atomisp_g_ctrl(struct file *file, void *fh, case V4L2_CID_TEST_PATTERN_COLOR_GR: case V4L2_CID_TEST_PATTERN_COLOR_GB: case V4L2_CID_TEST_PATTERN_COLOR_B: - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return v4l2_g_ctrl(isp->inputs[asd->input_curr].camera-> ctrl_handler, control); case V4L2_CID_COLORFX: @@ -2277,7 +2277,7 @@ static int atomisp_g_ctrl(struct file *file, void *fh, break; } - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return ret; } @@ -2310,7 +2310,7 @@ static int 
atomisp_s_ctrl(struct file *file, void *fh, if (ret) return ret; - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); switch (control->id) { case V4L2_CID_AUTO_N_PRESET_WHITE_BALANCE: case V4L2_CID_EXPOSURE: @@ -2331,7 +2331,7 @@ static int atomisp_s_ctrl(struct file *file, void *fh, case V4L2_CID_TEST_PATTERN_COLOR_GR: case V4L2_CID_TEST_PATTERN_COLOR_GB: case V4L2_CID_TEST_PATTERN_COLOR_B: - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return v4l2_s_ctrl(NULL, isp->inputs[asd->input_curr].camera-> ctrl_handler, control); @@ -2363,7 +2363,7 @@ static int atomisp_s_ctrl(struct file *file, void *fh, ret = -EINVAL; break; } - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return ret; } @@ -2488,9 +2488,9 @@ static int atomisp_camera_g_ext_ctrls(struct file *file, void *fh, &ctrl); break; case V4L2_CID_ZOOM_ABSOLUTE: - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); ret = atomisp_digital_zoom(asd, 0, &ctrl.value); - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); break; case V4L2_CID_G_SKIP_FRAMES: ret = v4l2_subdev_call( @@ -2603,7 +2603,7 @@ static int atomisp_camera_s_ext_ctrls(struct file *file, void *fh, case V4L2_CID_FLASH_STROBE: case V4L2_CID_FLASH_MODE: case V4L2_CID_FLASH_STATUS_REGISTER: - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); if (isp->flash) { ret = v4l2_s_ctrl(NULL, isp->flash->ctrl_handler, @@ -2618,12 +2618,12 @@ static int atomisp_camera_s_ext_ctrls(struct file *file, void *fh, asd->params.num_flash_frames = 0; } } - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); break; case V4L2_CID_ZOOM_ABSOLUTE: - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); ret = atomisp_digital_zoom(asd, 1, &ctrl.value); - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); break; default: ctr = v4l2_ctrl_find(&asd->ctrl_handler, ctrl.id); @@ -2691,9 +2691,9 @@ static int atomisp_g_parm(struct file *file, void *fh, return -EINVAL; } - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); 
parm->parm.capture.capturemode = asd->run_mode->val; - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return 0; } @@ -2719,7 +2719,7 @@ static int atomisp_s_parm(struct file *file, void *fh, return -EINVAL; } - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); asd->high_speed_mode = false; switch (parm->parm.capture.capturemode) { @@ -2761,7 +2761,7 @@ static int atomisp_s_parm(struct file *file, void *fh, rval = v4l2_ctrl_s_ctrl(asd->run_mode, mode); out: - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); return rval == -ENOIOCTLCMD ? 0 : rval; } @@ -2795,7 +2795,7 @@ static long atomisp_vidioc_default(struct file *file, void *fh, /* we do not need take isp->mutex for these IOCTLs */ break; default: - rt_mutex_lock(&isp->mutex); + mutex_lock(&isp->mutex); break; } switch (cmd) { @@ -3057,7 +3057,7 @@ static long atomisp_vidioc_default(struct file *file, void *fh, case ATOMISP_IOC_G_UPDATE_EXPOSURE: break; default: - rt_mutex_unlock(&isp->mutex); + mutex_unlock(&isp->mutex); break; } return err; diff --git a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c index 4d73bf3d64217..aa38e0d33b5b8 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c +++ b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c @@ -1514,7 +1514,7 @@ static int atomisp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *i dev_dbg(&pdev->dev, "atomisp mmio base: %p\n", isp->base); - rt_mutex_init(&isp->mutex); + mutex_init(&isp->mutex); mutex_init(&isp->streamoff_mutex); spin_lock_init(&isp->lock); -- GitLab From d33a6d321078b66b9dd0a87413827ed178dd1779 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 2 Sep 2022 13:04:33 +0200 Subject: [PATCH 0572/2223] media: atomisp: Remove unused lock member from struct atomisp_sub_device The spin-lock embedded in struct atomisp_sub_device is not used anywhere, remove it. 
Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/atomisp_subdev.c | 1 - drivers/staging/media/atomisp/pci/atomisp_subdev.h | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.c b/drivers/staging/media/atomisp/pci/atomisp_subdev.c index 047e2e9d63d79..4a43677015098 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.c +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.c @@ -1356,7 +1356,6 @@ int atomisp_subdev_init(struct atomisp_device *isp) return -ENOMEM; for (i = 0; i < isp->num_of_streams; i++) { asd = &isp->asd[i]; - spin_lock_init(&asd->lock); asd->isp = isp; isp_subdev_init_params(asd); asd->index = i; diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.h b/drivers/staging/media/atomisp/pci/atomisp_subdev.h index d8b2dd00a7929..eaf7678804075 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.h +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.h @@ -272,7 +272,6 @@ struct atomisp_sub_device { /* video pipe main output */ struct atomisp_video_pipe video_out_video_capture; /* struct isp_subdev_params params; */ - spinlock_t lock; struct atomisp_device *isp; struct v4l2_ctrl_handler ctrl_handler; struct v4l2_ctrl *fmt_auto; -- GitLab From 2468083f799eb9eef7b03f48ebb9673ad5655f88 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 2 Sep 2022 13:39:51 +0200 Subject: [PATCH 0573/2223] media: atomisp: Fix locking around asd->streaming read/write For reading / writing the asd->streaming enum the following rules should be followed: 1. Writers of streaming must hold both isp->mutex and isp->lock. 2. Readers of streaming need to hold only one of the two locks. Not all writers were properly taking both locks; this fixes this.
In the case of the readers, many readers depend on their caller to hold isp->mutex; add asserts for this. And in the case of atomisp_css_get_dis_stat() it is called with isp->mutex held, so there is no need to take the spinlock just for reading the streaming value. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/atomisp_cmd.c | 32 +++++++++++++++++-- .../media/atomisp/pci/atomisp_compat_css20.c | 10 +++--- .../staging/media/atomisp/pci/atomisp_fops.c | 3 ++ .../media/atomisp/pci/atomisp_internal.h | 2 +- .../staging/media/atomisp/pci/atomisp_ioctl.c | 4 +++ .../media/atomisp/pci/atomisp_subdev.c | 8 ++++- .../media/atomisp/pci/atomisp_subdev.h | 6 +++- 7 files changed, 55 insertions(+), 10 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.c b/drivers/staging/media/atomisp/pci/atomisp_cmd.c index 97ef02e4e7a6f..c7f825e38921d 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.c +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.c @@ -899,6 +899,8 @@ void atomisp_buf_done(struct atomisp_sub_device *asd, int error, struct v4l2_control ctrl; bool reset_wdt_timer = false; + lockdep_assert_held(&isp->mutex); + if ( buf_type != IA_CSS_BUFFER_TYPE_METADATA && buf_type != IA_CSS_BUFFER_TYPE_3A_STATISTICS && @@ -1298,6 +1300,9 @@ static void __atomisp_css_recover(struct atomisp_device *isp, bool isp_timeout) bool stream_restart[MAX_STREAM_NUM] = {0}; bool depth_mode = false; int i, ret, depth_cnt = 0; + unsigned long flags; + + lockdep_assert_held(&isp->mutex); atomisp_css_irq_enable(isp, IA_CSS_IRQ_INFO_CSS_RECEIVER_SOF, false); @@ -1320,7 +1325,9 @@ static void __atomisp_css_recover(struct atomisp_device *isp, bool isp_timeout) stream_restart[asd->index] = true; + spin_lock_irqsave(&isp->lock, flags); asd->streaming = ATOMISP_DEVICE_STREAMING_STOPPING; + spin_unlock_irqrestore(&isp->lock, flags); /* stream off sensor */ ret = v4l2_subdev_call( @@ -1335,7 +1342,9 @@
static void __atomisp_css_recover(struct atomisp_device *isp, bool isp_timeout) css_pipe_id = atomisp_get_css_pipe_id(asd); atomisp_css_stop(asd, css_pipe_id, true); + spin_lock_irqsave(&isp->lock, flags); asd->streaming = ATOMISP_DEVICE_STREAMING_DISABLED; + spin_unlock_irqrestore(&isp->lock, flags); asd->preview_exp_id = 1; asd->postview_exp_id = 1; @@ -1376,11 +1385,14 @@ static void __atomisp_css_recover(struct atomisp_device *isp, bool isp_timeout) IA_CSS_INPUT_MODE_BUFFERED_SENSOR); css_pipe_id = atomisp_get_css_pipe_id(asd); - if (atomisp_css_start(asd, css_pipe_id, true)) + if (atomisp_css_start(asd, css_pipe_id, true)) { dev_warn(isp->dev, "start SP failed, so do not set streaming to be enable!\n"); - else + } else { + spin_lock_irqsave(&isp->lock, flags); asd->streaming = ATOMISP_DEVICE_STREAMING_ENABLED; + spin_unlock_irqrestore(&isp->lock, flags); + } atomisp_csi2_configure(asd); } @@ -1608,6 +1620,8 @@ void atomisp_css_flush(struct atomisp_device *isp) { int i; + lockdep_assert_held(&isp->mutex); + if (!atomisp_streaming_count(isp)) return; @@ -4046,6 +4060,8 @@ void atomisp_handle_parameter_and_buffer(struct atomisp_video_pipe *pipe) unsigned long irqflags; bool need_to_enqueue_buffer = false; + lockdep_assert_held(&asd->isp->mutex); + if (!asd) { dev_err(pipe->isp->dev, "%s(): asd is NULL, device is %s\n", __func__, pipe->vdev.name); @@ -4139,6 +4155,8 @@ int atomisp_set_parameters(struct video_device *vdev, struct atomisp_css_params *css_param = &asd->params.css_param; int ret; + lockdep_assert_held(&asd->isp->mutex); + if (!asd) { dev_err(pipe->isp->dev, "%s(): asd is NULL, device is %s\n", __func__, vdev->name); @@ -5537,6 +5555,8 @@ int atomisp_set_fmt(struct video_device *vdev, struct v4l2_format *f) struct v4l2_subdev_fh fh; int ret; + lockdep_assert_held(&isp->mutex); + if (!asd) { dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", __func__, vdev->name); @@ -6159,6 +6179,8 @@ int atomisp_offline_capture_configure(struct atomisp_sub_device 
*asd, { struct v4l2_ctrl *c; + lockdep_assert_held(&asd->isp->mutex); + /* * In case of M10MO ZSL capture case, we need to issue a separate * capture request to M10MO which will output captured jpeg image @@ -6433,6 +6455,8 @@ int atomisp_exp_id_capture(struct atomisp_sub_device *asd, int *exp_id) int value = *exp_id; int ret; + lockdep_assert_held(&isp->mutex); + ret = __is_raw_buffer_locked(asd, value); if (ret) { dev_err(isp->dev, "%s exp_id %d invalid %d.\n", __func__, value, ret); @@ -6454,6 +6478,8 @@ int atomisp_exp_id_unlock(struct atomisp_sub_device *asd, int *exp_id) int value = *exp_id; int ret; + lockdep_assert_held(&isp->mutex); + ret = __clear_raw_buffer_bitmap(asd, value); if (ret) { dev_err(isp->dev, "%s exp_id %d invalid %d.\n", __func__, value, ret); @@ -6489,6 +6515,8 @@ int atomisp_inject_a_fake_event(struct atomisp_sub_device *asd, int *event) if (!event || asd->streaming != ATOMISP_DEVICE_STREAMING_ENABLED) return -EINVAL; + lockdep_assert_held(&asd->isp->mutex); + dev_dbg(asd->isp->dev, "%s: trying to inject a fake event 0x%x\n", __func__, *event); diff --git a/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c b/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c index cda0b5eba16db..15ef31b0c601d 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c +++ b/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c @@ -3626,6 +3626,8 @@ int atomisp_css_get_dis_stat(struct atomisp_sub_device *asd, struct atomisp_dis_buf *dis_buf; unsigned long flags; + lockdep_assert_held(&isp->mutex); + if (!asd->params.dvs_stat->hor_prod.odd_real || !asd->params.dvs_stat->hor_prod.odd_imag || !asd->params.dvs_stat->hor_prod.even_real || @@ -3637,12 +3639,8 @@ int atomisp_css_get_dis_stat(struct atomisp_sub_device *asd, return -EINVAL; /* isp needs to be streaming to get DIS statistics */ - spin_lock_irqsave(&isp->lock, flags); - if (asd->streaming != ATOMISP_DEVICE_STREAMING_ENABLED) { - spin_unlock_irqrestore(&isp->lock, flags); + 
if (asd->streaming != ATOMISP_DEVICE_STREAMING_ENABLED) return -EINVAL; - } - spin_unlock_irqrestore(&isp->lock, flags); if (atomisp_compare_dvs_grid(asd, &stats->dvs2_stat.grid_info) != 0) /* If the grid info in the argument differs from the current @@ -3801,6 +3799,8 @@ int atomisp_css_isr_thread(struct atomisp_device *isp, bool reset_wdt_timer[MAX_STREAM_NUM] = {false}; int i; + lockdep_assert_held(&isp->mutex); + while (!ia_css_dequeue_psys_event(&current_event.event)) { if (current_event.event.type == IA_CSS_EVENT_TYPE_FW_ASSERT) { diff --git a/drivers/staging/media/atomisp/pci/atomisp_fops.c b/drivers/staging/media/atomisp/pci/atomisp_fops.c index 57587d739c4b9..e1b213ba46865 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_fops.c +++ b/drivers/staging/media/atomisp/pci/atomisp_fops.c @@ -813,6 +813,7 @@ static int atomisp_release(struct file *file) struct v4l2_requestbuffers req; struct v4l2_subdev_fh fh; struct v4l2_rect clear_compose = {0}; + unsigned long flags; int ret = 0; v4l2_fh_init(&fh.vfh, vdev); @@ -878,7 +879,9 @@ static int atomisp_release(struct file *file) /* clear the asd field to show this camera is not used */ isp->inputs[asd->input_curr].asd = NULL; + spin_lock_irqsave(&isp->lock, flags); asd->streaming = ATOMISP_DEVICE_STREAMING_DISABLED; + spin_unlock_irqrestore(&isp->lock, flags); if (atomisp_dev_users(isp)) goto done; diff --git a/drivers/staging/media/atomisp/pci/atomisp_internal.h b/drivers/staging/media/atomisp/pci/atomisp_internal.h index 759575cbd3561..b2c362ef71994 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_internal.h +++ b/drivers/staging/media/atomisp/pci/atomisp_internal.h @@ -266,7 +266,7 @@ struct atomisp_device { atomic_t wdt_work_queued; - spinlock_t lock; /* Just for streaming below */ + spinlock_t lock; /* Protects asd[i].streaming */ bool need_gfx_throttle; diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index 4016ac4fffe06..21af5feca386b
100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -1837,7 +1837,9 @@ static int atomisp_streamon(struct file *file, void *fh, if (ret) goto out; + spin_lock_irqsave(&isp->lock, irqflags); asd->streaming = ATOMISP_DEVICE_STREAMING_ENABLED; + spin_unlock_irqrestore(&isp->lock, irqflags); atomic_set(&asd->sof_count, -1); atomic_set(&asd->sequence, -1); atomic_set(&asd->sequence_temp, -1); @@ -1910,7 +1912,9 @@ start_sensor: ret = v4l2_subdev_call(isp->inputs[asd->input_curr].camera, video, s_stream, 1); if (ret) { + spin_lock_irqsave(&isp->lock, irqflags); asd->streaming = ATOMISP_DEVICE_STREAMING_DISABLED; + spin_unlock_irqrestore(&isp->lock, irqflags); ret = -EINVAL; goto out; } diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.c b/drivers/staging/media/atomisp/pci/atomisp_subdev.c index 4a43677015098..88bf693f4c50a 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.c +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.c @@ -874,12 +874,18 @@ static int s_ctrl(struct v4l2_ctrl *ctrl) { struct atomisp_sub_device *asd = container_of( ctrl->handler, struct atomisp_sub_device, ctrl_handler); + unsigned int streaming; + unsigned long flags; switch (ctrl->id) { case V4L2_CID_RUN_MODE: return __atomisp_update_run_mode(asd); case V4L2_CID_DEPTH_MODE: - if (asd->streaming != ATOMISP_DEVICE_STREAMING_DISABLED) { + /* Use spinlock instead of mutex to avoid possible locking issues */ + spin_lock_irqsave(&asd->isp->lock, flags); + streaming = asd->streaming; + spin_unlock_irqrestore(&asd->isp->lock, flags); + if (streaming != ATOMISP_DEVICE_STREAMING_DISABLED) { dev_err(asd->isp->dev, "ISP is streaming, it is not supported to change the depth mode\n"); return -EINVAL; diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.h b/drivers/staging/media/atomisp/pci/atomisp_subdev.h index eaf7678804075..b44f060b0bb52 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.h 
+++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.h @@ -330,7 +330,11 @@ struct atomisp_sub_device { atomic_t sequence; /* Sequence value that is assigned to buffer. */ atomic_t sequence_temp; - unsigned int streaming; /* Hold both mutex and lock to change this */ + /* + * Writers of streaming must hold both isp->mutex and isp->lock. + * Readers of streaming need to hold only one of the two locks. + */ + unsigned int streaming; bool stream_prepared; /* whether css stream is created */ /* subdev index: will be used to show which subdev is holding the -- GitLab From 0d51573df3e0d944a644dbe90cdb06afefe77cc4 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 3 Sep 2022 12:41:15 +0200 Subject: [PATCH 0574/2223] media: atomisp: Remove asd == NULL checks from ioctl handling At probe time isp_subdev_init_entities() sets pipe->asd to a non NULL value for all four (preview/vf/capture/capture_video) pipes by calling atomisp_init_subdev_pipe() for all 4 pipes. So it can never be NULL. Remove the redundant NULL checks. 
Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/atomisp_cmd.c | 18 ---- .../staging/media/atomisp/pci/atomisp_ioctl.c | 89 ------------------- 2 files changed, 107 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.c b/drivers/staging/media/atomisp/pci/atomisp_cmd.c index c7f825e38921d..0870789004159 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.c +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.c @@ -1755,12 +1755,6 @@ void atomisp_wdt_refresh(struct atomisp_sub_device *asd, unsigned int delay) /* ISP2401 */ void atomisp_wdt_stop_pipe(struct atomisp_video_pipe *pipe, bool sync) { - if (!pipe->asd) { - dev_err(pipe->isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, pipe->vdev.name); - return; - } - if (!atomisp_is_wdt_running(pipe)) return; @@ -5557,12 +5551,6 @@ int atomisp_set_fmt(struct video_device *vdev, struct v4l2_format *f) lockdep_assert_held(&isp->mutex); - if (!asd) { - dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } - if (source_pad >= ATOMISP_SUBDEV_PADS_NUM) return -EINVAL; @@ -6587,12 +6575,6 @@ int atomisp_get_invalid_frame_num(struct video_device *vdev, struct ia_css_pipe_info p_info; int ret; - if (!asd) { - dev_err(pipe->isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } - if (asd->isp->inputs[asd->input_curr].camera_caps-> sensor[asd->sensor_curr].stream_num > 1) { /* External ISP */ diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index 21af5feca386b..9c7022be3a06d 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -632,12 +632,6 @@ static int atomisp_g_input(struct file *file, void *fh, unsigned int *input) struct atomisp_device *isp = video_get_drvdata(vdev); struct atomisp_sub_device 
*asd = atomisp_to_video_pipe(vdev)->asd; - if (!asd) { - dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } - mutex_lock(&isp->mutex); *input = asd->input_curr; mutex_unlock(&isp->mutex); @@ -657,12 +651,6 @@ static int atomisp_s_input(struct file *file, void *fh, unsigned int input) struct v4l2_subdev *motor; int ret; - if (!asd) { - dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } - mutex_lock(&isp->mutex); if (input >= ATOM_ISP_MAX_INPUTS || input >= isp->input_cnt) { dev_dbg(isp->dev, "input_cnt: %d\n", isp->input_cnt); @@ -818,12 +806,6 @@ static int atomisp_enum_fmt_cap(struct file *file, void *fh, unsigned int i, fi = 0; int rval; - if (!asd) { - dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } - camera = isp->inputs[asd->input_curr].camera; if(!camera) { dev_err(isp->dev, "%s(): camera is NULL, device is %s\n", @@ -1152,11 +1134,6 @@ int __atomisp_reqbufs(struct file *file, void *fh, u16 stream_id; int ret = 0, i = 0; - if (!asd) { - dev_err(pipe->isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } stream_id = atomisp_source_pad_to_stream_id(asd, source_pad); if (req->count == 0) { @@ -1261,12 +1238,6 @@ static int atomisp_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) u32 pgnr; int ret = 0; - if (!asd) { - dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } - mutex_lock(&isp->mutex); if (isp->isp_fatal_error) { ret = -EIO; @@ -1449,12 +1420,6 @@ static int atomisp_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) struct atomisp_device *isp = video_get_drvdata(vdev); int ret = 0; - if (!asd) { - dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } - mutex_lock(&isp->mutex); if (isp->isp_fatal_error) { @@ -1706,12 +1671,6 @@ static int 
atomisp_streamon(struct file *file, void *fh, int ret = 0; unsigned long irqflags; - if (!asd) { - dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } - dev_dbg(isp->dev, "Start stream on pad %d for asd%d\n", atomisp_subdev_source_pad(vdev), asd->index); @@ -1963,12 +1922,6 @@ int __atomisp_streamoff(struct file *file, void *fh, enum v4l2_buf_type type) unsigned long flags; bool first_streamoff = false; - if (!asd) { - dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } - dev_dbg(isp->dev, "Stop stream on pad %d for asd%d\n", atomisp_subdev_source_pad(vdev), asd->index); @@ -2213,12 +2166,6 @@ static int atomisp_g_ctrl(struct file *file, void *fh, struct atomisp_device *isp = video_get_drvdata(vdev); int i, ret = -EINVAL; - if (!asd) { - dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } - for (i = 0; i < ctrls_num; i++) { if (ci_v4l2_controls[i].id == control->id) { ret = 0; @@ -2298,12 +2245,6 @@ static int atomisp_s_ctrl(struct file *file, void *fh, struct atomisp_device *isp = video_get_drvdata(vdev); int i, ret = -EINVAL; - if (!asd) { - dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } - for (i = 0; i < ctrls_num; i++) { if (ci_v4l2_controls[i].id == control->id) { ret = 0; @@ -2385,12 +2326,6 @@ static int atomisp_queryctl(struct file *file, void *fh, struct atomisp_sub_device *asd = atomisp_to_video_pipe(vdev)->asd; struct atomisp_device *isp = video_get_drvdata(vdev); - if (!asd) { - dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } - switch (qc->id) { case V4L2_CID_FOCUS_ABSOLUTE: case V4L2_CID_FOCUS_RELATIVE: @@ -2436,12 +2371,6 @@ static int atomisp_camera_g_ext_ctrls(struct file *file, void *fh, int i; int ret = 0; - if (!asd) { - dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, 
vdev->name); - return -EINVAL; - } - if (!IS_ISP2401) motor = isp->inputs[asd->input_curr].motor; else @@ -2553,12 +2482,6 @@ static int atomisp_camera_s_ext_ctrls(struct file *file, void *fh, int i; int ret = 0; - if (!asd) { - dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } - if (!IS_ISP2401) motor = isp->inputs[asd->input_curr].motor; else @@ -2684,12 +2607,6 @@ static int atomisp_g_parm(struct file *file, void *fh, struct atomisp_sub_device *asd = atomisp_to_video_pipe(vdev)->asd; struct atomisp_device *isp = video_get_drvdata(vdev); - if (!asd) { - dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } - if (parm->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) { dev_err(isp->dev, "unsupported v4l2 buf type\n"); return -EINVAL; @@ -2712,12 +2629,6 @@ static int atomisp_s_parm(struct file *file, void *fh, int rval; int fps; - if (!asd) { - dev_err(isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, vdev->name); - return -EINVAL; - } - if (parm->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) { dev_err(isp->dev, "unsupported v4l2 buf type\n"); return -EINVAL; -- GitLab From f315c1acba84fe17cd92a05e6c1c1bf26ee6bd43 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 3 Sep 2022 14:49:39 +0200 Subject: [PATCH 0575/2223] media: atomisp: Add atomisp_pipe_check() helper Several of the ioctl handlers do the same checks (isp->isp_fatal_error and asd->streaming errors); add an atomisp_pipe_check() helper for this. Note this changes the vidioc_s_fmt_vid_cap and vidioc_s_input handlers to now reject calls made while asd->streaming==STOPPING. This fixes a possible race where one thread can make these ioctls while vidioc_streamoff is running from another thread and it has temporarily released isp->mutex to kill the watchdog timers / work. 
Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/atomisp_cmd.c | 9 +- .../staging/media/atomisp/pci/atomisp_ioctl.c | 89 +++++++++---------- .../staging/media/atomisp/pci/atomisp_ioctl.h | 2 + 3 files changed, 48 insertions(+), 52 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.c b/drivers/staging/media/atomisp/pci/atomisp_cmd.c index 0870789004159..7945852ecd133 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.c +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.c @@ -5549,16 +5549,13 @@ int atomisp_set_fmt(struct video_device *vdev, struct v4l2_format *f) struct v4l2_subdev_fh fh; int ret; - lockdep_assert_held(&isp->mutex); + ret = atomisp_pipe_check(pipe, true); + if (ret) + return ret; if (source_pad >= ATOMISP_SUBDEV_PADS_NUM) return -EINVAL; - if (asd->streaming == ATOMISP_DEVICE_STREAMING_ENABLED) { - dev_warn(isp->dev, "ISP does not support set format while at streaming!\n"); - return -EBUSY; - } - dev_dbg(isp->dev, "setting resolution %ux%u on pad %u for asd%d, bytesperline %u\n", f->fmt.pix.width, f->fmt.pix.height, source_pad, diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index 9c7022be3a06d..9b50f637c46af 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -535,6 +535,32 @@ atomisp_get_format_bridge_from_mbus(u32 mbus_code) return NULL; } +int atomisp_pipe_check(struct atomisp_video_pipe *pipe, bool settings_change) +{ + lockdep_assert_held(&pipe->isp->mutex); + + if (pipe->isp->isp_fatal_error) + return -EIO; + + switch (pipe->asd->streaming) { + case ATOMISP_DEVICE_STREAMING_DISABLED: + break; + case ATOMISP_DEVICE_STREAMING_ENABLED: + if (settings_change) { + dev_err(pipe->isp->dev, "Set fmt/input IOCTL while streaming\n"); + return -EBUSY; + } + break; + case ATOMISP_DEVICE_STREAMING_STOPPING: + 
dev_err(pipe->isp->dev, "IOCTL issued while stopping\n"); + return -EBUSY; + default: + return -EINVAL; + } + + return 0; +} + /* * v4l2 ioctls * return ISP capabilities @@ -646,12 +672,18 @@ static int atomisp_s_input(struct file *file, void *fh, unsigned int input) { struct video_device *vdev = video_devdata(file); struct atomisp_device *isp = video_get_drvdata(vdev); - struct atomisp_sub_device *asd = atomisp_to_video_pipe(vdev)->asd; + struct atomisp_video_pipe *pipe = atomisp_to_video_pipe(vdev); + struct atomisp_sub_device *asd = pipe->asd; struct v4l2_subdev *camera = NULL; struct v4l2_subdev *motor; int ret; mutex_lock(&isp->mutex); + + ret = atomisp_pipe_check(pipe, true); + if (ret) + goto error; + if (input >= ATOM_ISP_MAX_INPUTS || input >= isp->input_cnt) { dev_dbg(isp->dev, "input_cnt: %d\n", isp->input_cnt); ret = -EINVAL; @@ -678,13 +710,6 @@ static int atomisp_s_input(struct file *file, void *fh, unsigned int input) goto error; } - if (atomisp_subdev_streaming_count(asd)) { - dev_err(isp->dev, - "ISP is still streaming, stop first\n"); - ret = -EINVAL; - goto error; - } - /* power off the current owned sensor, as it is not used this time */ if (isp->inputs[asd->input_curr].asd == asd && asd->input_curr != input) { @@ -976,11 +1001,6 @@ static int atomisp_s_fmt_cap(struct file *file, void *fh, int ret; mutex_lock(&isp->mutex); - if (isp->isp_fatal_error) { - ret = -EIO; - mutex_unlock(&isp->mutex); - return ret; - } ret = atomisp_set_fmt(vdev, f); mutex_unlock(&isp->mutex); return ret; @@ -1236,20 +1256,13 @@ static int atomisp_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) struct ia_css_frame *handle = NULL; u32 length; u32 pgnr; - int ret = 0; + int ret; mutex_lock(&isp->mutex); - if (isp->isp_fatal_error) { - ret = -EIO; - goto error; - } - if (asd->streaming == ATOMISP_DEVICE_STREAMING_STOPPING) { - dev_err(isp->dev, "%s: reject, as ISP at stopping.\n", - __func__); - ret = -EIO; + ret = atomisp_pipe_check(pipe, false); + if (ret) 
goto error; - } if (!buf || buf->index >= VIDEO_MAX_FRAME || !pipe->capq.bufs[buf->index]) { @@ -1418,23 +1431,13 @@ static int atomisp_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) struct atomisp_video_pipe *pipe = atomisp_to_video_pipe(vdev); struct atomisp_sub_device *asd = pipe->asd; struct atomisp_device *isp = video_get_drvdata(vdev); - int ret = 0; + int ret; mutex_lock(&isp->mutex); - - if (isp->isp_fatal_error) { - mutex_unlock(&isp->mutex); - return -EIO; - } - - if (asd->streaming == ATOMISP_DEVICE_STREAMING_STOPPING) { - mutex_unlock(&isp->mutex); - dev_err(isp->dev, "%s: reject, as ISP at stopping.\n", - __func__); - return -EIO; - } - + ret = atomisp_pipe_check(pipe, false); mutex_unlock(&isp->mutex); + if (ret) + return ret; ret = videobuf_dqbuf(&pipe->capq, buf, file->f_flags & O_NONBLOCK); if (ret) { @@ -1668,8 +1671,8 @@ static int atomisp_streamon(struct file *file, void *fh, enum ia_css_pipe_id css_pipe_id; unsigned int sensor_start_stream; unsigned int wdt_duration = ATOMISP_ISP_TIMEOUT_DURATION; - int ret = 0; unsigned long irqflags; + int ret; dev_dbg(isp->dev, "Start stream on pad %d for asd%d\n", atomisp_subdev_source_pad(vdev), asd->index); @@ -1680,15 +1683,9 @@ static int atomisp_streamon(struct file *file, void *fh, } mutex_lock(&isp->mutex); - if (isp->isp_fatal_error) { - ret = -EIO; - goto out; - } - - if (asd->streaming == ATOMISP_DEVICE_STREAMING_STOPPING) { - ret = -EBUSY; + ret = atomisp_pipe_check(pipe, false); + if (ret) goto out; - } if (pipe->capq.streaming) goto out; diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.h b/drivers/staging/media/atomisp/pci/atomisp_ioctl.h index 382b78275240a..61a6148a6ad50 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.h +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.h @@ -34,6 +34,8 @@ atomisp_format_bridge *atomisp_get_format_bridge(unsigned int pixelformat); const struct atomisp_format_bridge *atomisp_get_format_bridge_from_mbus(u32 mbus_code); 
+int atomisp_pipe_check(struct atomisp_video_pipe *pipe, bool streaming_ok); + int atomisp_alloc_css_stat_bufs(struct atomisp_sub_device *asd, uint16_t stream_id); -- GitLab From 93d3fb35aa10a8417f923871c7c58c9ad81d7a08 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 3 Sep 2022 23:26:27 +0200 Subject: [PATCH 0576/2223] media: atomisp: Remove watchdog timer The watchdog timer code to recover from the ISP getting stuck has several major issues: 1. There is no way to do fault injection and normally the ISP does not get stuck, so it is impossible to test it. 2. It in essence just stops all streams, resets the ISP and then brings everything back up. Userspace can easily do this itself by using a timeout on dqbuf and then closing (which causes a poweroff) + re-opening the device. Doing this in userspace (if it ever turns out to be necessary) greatly simplifies the kernel code and in general will be a more robust solution. Even just a quick look at the code finds several more issues: 3. The need to sync-cancel the timers + work on streamoff requires isp->mutex to be dropped halfway during the ioctl, opening all sorts of races. 4. The atomisp code supports setting up 2 pipelines, streaming from two sensors at the same time. But there is only a single wdt_work and stopping one of the 2 streams will cancel the timers + work, stopping the wdt even though the other stream might still be running. 5. In atomisp_css_flush() the sync cancel is done while keeping isp->mutex locked, causing a deadlock when racing with wdt_work which also takes isp->mutex. 6. Even though the watchdog is purely a software/driver thing which just checks that new frames keep coming in, there are 2 completely different implementations for the ISP2400/ISP2401 which is not necessary at all. So all in all I believe that it is better to just remove the current watchdog implementation. 
Fixing all the issues with the current implementation will be so much work, that if it turns out that we do need something like this then doing a clean re-implementation from scratch will be better anyway. wdt_work was also (ab)used to reset the ISP after the firmware signalled an fw-assert error through the irq; add a new assert_recovery_work to replace this. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/atomisp_cmd.c | 361 +----------------- .../staging/media/atomisp/pci/atomisp_cmd.h | 3 +- .../media/atomisp/pci/atomisp_compat_css20.c | 43 +-- .../media/atomisp/pci/atomisp_internal.h | 24 +- .../staging/media/atomisp/pci/atomisp_ioctl.c | 60 --- .../media/atomisp/pci/atomisp_subdev.h | 14 - .../staging/media/atomisp/pci/atomisp_v4l2.c | 43 +-- 7 files changed, 9 insertions(+), 539 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.c b/drivers/staging/media/atomisp/pci/atomisp_cmd.c index 7945852ecd133..4b459c4c6d76c 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.c +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.c @@ -897,7 +897,6 @@ void atomisp_buf_done(struct atomisp_sub_device *asd, int error, enum atomisp_metadata_type md_type; struct atomisp_device *isp = asd->isp; struct v4l2_control ctrl; - bool reset_wdt_timer = false; lockdep_assert_held(&isp->mutex); @@ -1006,9 +1005,6 @@ void atomisp_buf_done(struct atomisp_sub_device *asd, int error, break; case IA_CSS_BUFFER_TYPE_VF_OUTPUT_FRAME: case IA_CSS_BUFFER_TYPE_SEC_VF_OUTPUT_FRAME: - if (IS_ISP2401) - reset_wdt_timer = true; - pipe->buffers_in_css--; frame = buffer.css_buffer.data.frame; if (!frame) { @@ -1061,9 +1057,6 @@ void atomisp_buf_done(struct atomisp_sub_device *asd, int error, break; case IA_CSS_BUFFER_TYPE_OUTPUT_FRAME: case IA_CSS_BUFFER_TYPE_SEC_OUTPUT_FRAME: - if (IS_ISP2401) - reset_wdt_timer = true; - pipe->buffers_in_css--; frame = buffer.css_buffer.data.frame; if (!frame) 
{ @@ -1231,8 +1224,6 @@ void atomisp_buf_done(struct atomisp_sub_device *asd, int error, */ wake_up(&vb->done); } - if (IS_ISP2401) - atomic_set(&pipe->wdt_count, 0); /* * Requeue should only be done for 3a and dis buffers. @@ -1249,19 +1240,6 @@ void atomisp_buf_done(struct atomisp_sub_device *asd, int error, } if (!error && q_buffers) atomisp_qbuffers_to_css(asd); - - if (IS_ISP2401) { - /* If there are no buffers queued then - * delete wdt timer. */ - if (asd->streaming != ATOMISP_DEVICE_STREAMING_ENABLED) - return; - if (!atomisp_buffers_queued_pipe(pipe)) - atomisp_wdt_stop_pipe(pipe, false); - else if (reset_wdt_timer) - /* SOF irq should not reset wdt timer. */ - atomisp_wdt_refresh_pipe(pipe, - ATOMISP_WDT_KEEP_CURRENT_DELAY); - } } void atomisp_delayed_init_work(struct work_struct *work) @@ -1450,350 +1428,32 @@ static void __atomisp_css_recover(struct atomisp_device *isp, bool isp_timeout) } } -void atomisp_wdt_work(struct work_struct *work) +void atomisp_assert_recovery_work(struct work_struct *work) { struct atomisp_device *isp = container_of(work, struct atomisp_device, - wdt_work); - int i; - unsigned int pipe_wdt_cnt[MAX_STREAM_NUM][4] = { {0} }; - bool css_recover = true; + assert_recovery_work); mutex_lock(&isp->mutex); - if (!atomisp_streaming_count(isp)) { - atomic_set(&isp->wdt_work_queued, 0); - mutex_unlock(&isp->mutex); - return; - } - - if (!IS_ISP2401) { - dev_err(isp->dev, "timeout %d of %d\n", - atomic_read(&isp->wdt_count) + 1, - ATOMISP_ISP_MAX_TIMEOUT_COUNT); - } else { - for (i = 0; i < isp->num_of_streams; i++) { - struct atomisp_sub_device *asd = &isp->asd[i]; - - pipe_wdt_cnt[i][0] += - atomic_read(&asd->video_out_capture.wdt_count); - pipe_wdt_cnt[i][1] += - atomic_read(&asd->video_out_vf.wdt_count); - pipe_wdt_cnt[i][2] += - atomic_read(&asd->video_out_preview.wdt_count); - pipe_wdt_cnt[i][3] += - atomic_read(&asd->video_out_video_capture.wdt_count); - css_recover = - (pipe_wdt_cnt[i][0] <= ATOMISP_ISP_MAX_TIMEOUT_COUNT && - 
pipe_wdt_cnt[i][1] <= ATOMISP_ISP_MAX_TIMEOUT_COUNT && - pipe_wdt_cnt[i][2] <= ATOMISP_ISP_MAX_TIMEOUT_COUNT && - pipe_wdt_cnt[i][3] <= ATOMISP_ISP_MAX_TIMEOUT_COUNT) - ? true : false; - dev_err(isp->dev, - "pipe on asd%d timeout cnt: (%d, %d, %d, %d) of %d, recover = %d\n", - asd->index, pipe_wdt_cnt[i][0], pipe_wdt_cnt[i][1], - pipe_wdt_cnt[i][2], pipe_wdt_cnt[i][3], - ATOMISP_ISP_MAX_TIMEOUT_COUNT, css_recover); - } - } - - if (css_recover) { - ia_css_debug_dump_sp_sw_debug_info(); - ia_css_debug_dump_debug_info(__func__); - for (i = 0; i < isp->num_of_streams; i++) { - struct atomisp_sub_device *asd = &isp->asd[i]; - - if (asd->streaming != ATOMISP_DEVICE_STREAMING_ENABLED) - continue; - dev_err(isp->dev, "%s, vdev %s buffers in css: %d\n", - __func__, - asd->video_out_capture.vdev.name, - asd->video_out_capture. - buffers_in_css); - dev_err(isp->dev, - "%s, vdev %s buffers in css: %d\n", - __func__, - asd->video_out_vf.vdev.name, - asd->video_out_vf. - buffers_in_css); - dev_err(isp->dev, - "%s, vdev %s buffers in css: %d\n", - __func__, - asd->video_out_preview.vdev.name, - asd->video_out_preview. - buffers_in_css); - dev_err(isp->dev, - "%s, vdev %s buffers in css: %d\n", - __func__, - asd->video_out_video_capture.vdev.name, - asd->video_out_video_capture. 
- buffers_in_css); - dev_err(isp->dev, - "%s, s3a buffers in css preview pipe:%d\n", - __func__, - asd->s3a_bufs_in_css[IA_CSS_PIPE_ID_PREVIEW]); - dev_err(isp->dev, - "%s, s3a buffers in css capture pipe:%d\n", - __func__, - asd->s3a_bufs_in_css[IA_CSS_PIPE_ID_CAPTURE]); - dev_err(isp->dev, - "%s, s3a buffers in css video pipe:%d\n", - __func__, - asd->s3a_bufs_in_css[IA_CSS_PIPE_ID_VIDEO]); - dev_err(isp->dev, - "%s, dis buffers in css: %d\n", - __func__, asd->dis_bufs_in_css); - dev_err(isp->dev, - "%s, metadata buffers in css preview pipe:%d\n", - __func__, - asd->metadata_bufs_in_css - [ATOMISP_INPUT_STREAM_GENERAL] - [IA_CSS_PIPE_ID_PREVIEW]); - dev_err(isp->dev, - "%s, metadata buffers in css capture pipe:%d\n", - __func__, - asd->metadata_bufs_in_css - [ATOMISP_INPUT_STREAM_GENERAL] - [IA_CSS_PIPE_ID_CAPTURE]); - dev_err(isp->dev, - "%s, metadata buffers in css video pipe:%d\n", - __func__, - asd->metadata_bufs_in_css - [ATOMISP_INPUT_STREAM_GENERAL] - [IA_CSS_PIPE_ID_VIDEO]); - if (asd->enable_raw_buffer_lock->val) { - unsigned int j; - - dev_err(isp->dev, "%s, raw_buffer_locked_count %d\n", - __func__, asd->raw_buffer_locked_count); - for (j = 0; j <= ATOMISP_MAX_EXP_ID / 32; j++) - dev_err(isp->dev, "%s, raw_buffer_bitmap[%d]: 0x%x\n", - __func__, j, - asd->raw_buffer_bitmap[j]); - } - } - /*sh_css_dump_sp_state();*/ - /*sh_css_dump_isp_state();*/ - } else { - for (i = 0; i < isp->num_of_streams; i++) { - struct atomisp_sub_device *asd = &isp->asd[i]; - - if (asd->streaming == - ATOMISP_DEVICE_STREAMING_ENABLED) { - atomisp_clear_css_buffer_counters(asd); - atomisp_flush_bufs_and_wakeup(asd); - complete(&asd->init_done); - } - if (IS_ISP2401) - atomisp_wdt_stop(asd, false); - } - - if (!IS_ISP2401) { - atomic_set(&isp->wdt_count, 0); - } else { - isp->isp_fatal_error = true; - atomic_set(&isp->wdt_work_queued, 0); - - mutex_unlock(&isp->mutex); - return; - } - } - - __atomisp_css_recover(isp, true); - if (IS_ISP2401) { - for (i = 0; i < 
isp->num_of_streams; i++) { - struct atomisp_sub_device *asd = &isp->asd[i]; - - if (asd->streaming != ATOMISP_DEVICE_STREAMING_ENABLED) - continue; - - atomisp_wdt_refresh(asd, ATOMISP_ISP_TIMEOUT_DURATION); - } - } - - dev_err(isp->dev, "timeout recovery handling done\n"); - atomic_set(&isp->wdt_work_queued, 0); + if (atomisp_streaming_count(isp)) + __atomisp_css_recover(isp, true); mutex_unlock(&isp->mutex); } void atomisp_css_flush(struct atomisp_device *isp) { - int i; - lockdep_assert_held(&isp->mutex); if (!atomisp_streaming_count(isp)) return; - /* Disable wdt */ - for (i = 0; i < isp->num_of_streams; i++) { - struct atomisp_sub_device *asd = &isp->asd[i]; - - atomisp_wdt_stop(asd, true); - } - /* Start recover */ __atomisp_css_recover(isp, false); - /* Restore wdt */ - for (i = 0; i < isp->num_of_streams; i++) { - struct atomisp_sub_device *asd = &isp->asd[i]; - - if (asd->streaming != ATOMISP_DEVICE_STREAMING_ENABLED) - continue; - atomisp_wdt_refresh(asd, ATOMISP_ISP_TIMEOUT_DURATION); - } dev_dbg(isp->dev, "atomisp css flush done\n"); } -void atomisp_wdt(struct timer_list *t) -{ - struct atomisp_sub_device *asd; - struct atomisp_device *isp; - - if (!IS_ISP2401) { - asd = from_timer(asd, t, wdt); - isp = asd->isp; - } else { - struct atomisp_video_pipe *pipe = from_timer(pipe, t, wdt); - - asd = pipe->asd; - isp = asd->isp; - - atomic_inc(&pipe->wdt_count); - dev_warn(isp->dev, - "[WARNING]asd %d pipe %s ISP timeout %d!\n", - asd->index, pipe->vdev.name, - atomic_read(&pipe->wdt_count)); - } - - if (atomic_read(&isp->wdt_work_queued)) { - dev_dbg(isp->dev, "ISP watchdog was put into workqueue\n"); - return; - } - atomic_set(&isp->wdt_work_queued, 1); - queue_work(isp->wdt_work_queue, &isp->wdt_work); -} - -/* ISP2400 */ -void atomisp_wdt_start(struct atomisp_sub_device *asd) -{ - atomisp_wdt_refresh(asd, ATOMISP_ISP_TIMEOUT_DURATION); -} - -/* ISP2401 */ -void atomisp_wdt_refresh_pipe(struct atomisp_video_pipe *pipe, - unsigned int delay) -{ - unsigned 
long next; - - if (!pipe->asd) { - dev_err(pipe->isp->dev, "%s(): asd is NULL, device is %s\n", - __func__, pipe->vdev.name); - return; - } - - if (delay != ATOMISP_WDT_KEEP_CURRENT_DELAY) - pipe->wdt_duration = delay; - - next = jiffies + pipe->wdt_duration; - - /* Override next if it has been pushed beyon the "next" time */ - if (atomisp_is_wdt_running(pipe) && time_after(pipe->wdt_expires, next)) - next = pipe->wdt_expires; - - pipe->wdt_expires = next; - - if (atomisp_is_wdt_running(pipe)) - dev_dbg(pipe->asd->isp->dev, "WDT will hit after %d ms (%s)\n", - ((int)(next - jiffies) * 1000 / HZ), pipe->vdev.name); - else - dev_dbg(pipe->asd->isp->dev, "WDT starts with %d ms period (%s)\n", - ((int)(next - jiffies) * 1000 / HZ), pipe->vdev.name); - - mod_timer(&pipe->wdt, next); -} - -void atomisp_wdt_refresh(struct atomisp_sub_device *asd, unsigned int delay) -{ - if (!IS_ISP2401) { - unsigned long next; - - if (delay != ATOMISP_WDT_KEEP_CURRENT_DELAY) - asd->wdt_duration = delay; - - next = jiffies + asd->wdt_duration; - - /* Override next if it has been pushed beyon the "next" time */ - if (atomisp_is_wdt_running(asd) && time_after(asd->wdt_expires, next)) - next = asd->wdt_expires; - - asd->wdt_expires = next; - - if (atomisp_is_wdt_running(asd)) - dev_dbg(asd->isp->dev, "WDT will hit after %d ms\n", - ((int)(next - jiffies) * 1000 / HZ)); - else - dev_dbg(asd->isp->dev, "WDT starts with %d ms period\n", - ((int)(next - jiffies) * 1000 / HZ)); - - mod_timer(&asd->wdt, next); - atomic_set(&asd->isp->wdt_count, 0); - } else { - dev_dbg(asd->isp->dev, "WDT refresh all:\n"); - if (atomisp_is_wdt_running(&asd->video_out_capture)) - atomisp_wdt_refresh_pipe(&asd->video_out_capture, delay); - if (atomisp_is_wdt_running(&asd->video_out_preview)) - atomisp_wdt_refresh_pipe(&asd->video_out_preview, delay); - if (atomisp_is_wdt_running(&asd->video_out_vf)) - atomisp_wdt_refresh_pipe(&asd->video_out_vf, delay); - if (atomisp_is_wdt_running(&asd->video_out_video_capture)) - 
atomisp_wdt_refresh_pipe(&asd->video_out_video_capture, delay); - } -} - -/* ISP2401 */ -void atomisp_wdt_stop_pipe(struct atomisp_video_pipe *pipe, bool sync) -{ - if (!atomisp_is_wdt_running(pipe)) - return; - - dev_dbg(pipe->asd->isp->dev, - "WDT stop asd %d (%s)\n", pipe->asd->index, pipe->vdev.name); - - if (sync) { - del_timer_sync(&pipe->wdt); - cancel_work_sync(&pipe->asd->isp->wdt_work); - } else { - del_timer(&pipe->wdt); - } -} - -/* ISP 2401 */ -void atomisp_wdt_start_pipe(struct atomisp_video_pipe *pipe) -{ - atomisp_wdt_refresh_pipe(pipe, ATOMISP_ISP_TIMEOUT_DURATION); -} - -void atomisp_wdt_stop(struct atomisp_sub_device *asd, bool sync) -{ - dev_dbg(asd->isp->dev, "WDT stop:\n"); - - if (!IS_ISP2401) { - if (sync) { - del_timer_sync(&asd->wdt); - cancel_work_sync(&asd->isp->wdt_work); - } else { - del_timer(&asd->wdt); - } - } else { - atomisp_wdt_stop_pipe(&asd->video_out_capture, sync); - atomisp_wdt_stop_pipe(&asd->video_out_preview, sync); - atomisp_wdt_stop_pipe(&asd->video_out_vf, sync); - atomisp_wdt_stop_pipe(&asd->video_out_video_capture, sync); - } -} - void atomisp_setup_flash(struct atomisp_sub_device *asd) { struct atomisp_device *isp = asd->isp; @@ -4122,19 +3782,6 @@ void atomisp_handle_parameter_and_buffer(struct atomisp_video_pipe *pipe) return; atomisp_qbuffers_to_css(asd); - - if (!IS_ISP2401) { - if (!atomisp_is_wdt_running(asd) && atomisp_buffers_queued(asd)) - atomisp_wdt_start(asd); - } else { - if (atomisp_buffers_queued_pipe(pipe)) { - if (!atomisp_is_wdt_running(pipe)) - atomisp_wdt_start_pipe(pipe); - else - atomisp_wdt_refresh_pipe(pipe, - ATOMISP_WDT_KEEP_CURRENT_DELAY); - } - } } /* diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.h b/drivers/staging/media/atomisp/pci/atomisp_cmd.h index c4472516487ba..5ab7d6aca7fad 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.h +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.h @@ -65,8 +65,7 @@ bool atomisp_buffers_queued_pipe(struct atomisp_video_pipe 
*pipe); /* Interrupt functions */ void atomisp_msi_irq_init(struct atomisp_device *isp); void atomisp_msi_irq_uninit(struct atomisp_device *isp); -void atomisp_wdt_work(struct work_struct *work); -void atomisp_wdt(struct timer_list *t); +void atomisp_assert_recovery_work(struct work_struct *work); void atomisp_setup_flash(struct atomisp_sub_device *asd); irqreturn_t atomisp_isr(int irq, void *dev); irqreturn_t atomisp_isr_thread(int irq, void *isp_ptr); diff --git a/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c b/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c index 15ef31b0c601d..0154ebf2cba5c 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c +++ b/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c @@ -3796,8 +3796,6 @@ int atomisp_css_isr_thread(struct atomisp_device *isp, enum atomisp_input_stream_id stream_id = 0; struct atomisp_css_event current_event; struct atomisp_sub_device *asd; - bool reset_wdt_timer[MAX_STREAM_NUM] = {false}; - int i; lockdep_assert_held(&isp->mutex); @@ -3813,14 +3811,8 @@ int atomisp_css_isr_thread(struct atomisp_device *isp, __func__, current_event.event.fw_assert_module_id, current_event.event.fw_assert_line_no); - for (i = 0; i < isp->num_of_streams; i++) - atomisp_wdt_stop(&isp->asd[i], 0); - - if (!IS_ISP2401) - atomisp_wdt(&isp->asd[0].wdt); - else - queue_work(isp->wdt_work_queue, &isp->wdt_work); + queue_work(system_long_wq, &isp->assert_recovery_work); return -EINVAL; } else if (current_event.event.type == IA_CSS_EVENT_TYPE_FW_WARNING) { dev_warn(isp->dev, "%s: ISP reports warning, code is %d, exp_id %d\n", @@ -3849,20 +3841,12 @@ int atomisp_css_isr_thread(struct atomisp_device *isp, frame_done_found[asd->index] = true; atomisp_buf_done(asd, 0, IA_CSS_BUFFER_TYPE_OUTPUT_FRAME, current_event.pipe, true, stream_id); - - if (!IS_ISP2401) - reset_wdt_timer[asd->index] = true; /* ISP running */ - break; case IA_CSS_EVENT_TYPE_SECOND_OUTPUT_FRAME_DONE: dev_dbg(isp->dev, "event: Second 
output frame done"); frame_done_found[asd->index] = true; atomisp_buf_done(asd, 0, IA_CSS_BUFFER_TYPE_SEC_OUTPUT_FRAME, current_event.pipe, true, stream_id); - - if (!IS_ISP2401) - reset_wdt_timer[asd->index] = true; /* ISP running */ - break; case IA_CSS_EVENT_TYPE_3A_STATISTICS_DONE: dev_dbg(isp->dev, "event: 3A stats frame done"); @@ -3883,19 +3867,12 @@ int atomisp_css_isr_thread(struct atomisp_device *isp, atomisp_buf_done(asd, 0, IA_CSS_BUFFER_TYPE_VF_OUTPUT_FRAME, current_event.pipe, true, stream_id); - - if (!IS_ISP2401) - reset_wdt_timer[asd->index] = true; /* ISP running */ - break; case IA_CSS_EVENT_TYPE_SECOND_VF_OUTPUT_FRAME_DONE: dev_dbg(isp->dev, "event: second VF output frame done"); atomisp_buf_done(asd, 0, IA_CSS_BUFFER_TYPE_SEC_VF_OUTPUT_FRAME, current_event.pipe, true, stream_id); - if (!IS_ISP2401) - reset_wdt_timer[asd->index] = true; /* ISP running */ - break; case IA_CSS_EVENT_TYPE_DIS_STATISTICS_DONE: dev_dbg(isp->dev, "event: dis stats frame done"); @@ -3918,24 +3895,6 @@ int atomisp_css_isr_thread(struct atomisp_device *isp, } } - if (IS_ISP2401) - return 0; - - /* ISP2400: If there are no buffers queued then delete wdt timer. */ - for (i = 0; i < isp->num_of_streams; i++) { - asd = &isp->asd[i]; - if (!asd) - continue; - if (asd->streaming != ATOMISP_DEVICE_STREAMING_ENABLED) - continue; - if (!atomisp_buffers_queued(asd)) - atomisp_wdt_stop(asd, false); - else if (reset_wdt_timer[i]) - /* SOF irq should not reset wdt timer. 
*/ - atomisp_wdt_refresh(asd, - ATOMISP_WDT_KEEP_CURRENT_DELAY); - } - return 0; } diff --git a/drivers/staging/media/atomisp/pci/atomisp_internal.h b/drivers/staging/media/atomisp/pci/atomisp_internal.h index b2c362ef71994..2279d45e7d7ae 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_internal.h +++ b/drivers/staging/media/atomisp/pci/atomisp_internal.h @@ -258,13 +258,7 @@ struct atomisp_device { /* isp timeout status flag */ bool isp_timeout; bool isp_fatal_error; - struct workqueue_struct *wdt_work_queue; - struct work_struct wdt_work; - - /* ISP2400 */ - atomic_t wdt_count; - - atomic_t wdt_work_queued; + struct work_struct assert_recovery_work; spinlock_t lock; /* Protects asd[i].streaming */ @@ -282,20 +276,4 @@ struct atomisp_device { extern struct device *atomisp_dev; -#define atomisp_is_wdt_running(a) timer_pending(&(a)->wdt) - -/* ISP2401 */ -void atomisp_wdt_refresh_pipe(struct atomisp_video_pipe *pipe, - unsigned int delay); -void atomisp_wdt_refresh(struct atomisp_sub_device *asd, unsigned int delay); - -/* ISP2400 */ -void atomisp_wdt_start(struct atomisp_sub_device *asd); - -/* ISP2401 */ -void atomisp_wdt_start_pipe(struct atomisp_video_pipe *pipe); -void atomisp_wdt_stop_pipe(struct atomisp_video_pipe *pipe, bool sync); - -void atomisp_wdt_stop(struct atomisp_sub_device *asd, bool sync); - #endif /* __ATOMISP_INTERNAL_H__ */ diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index 9b50f637c46af..daecdcdeb27c7 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -1363,15 +1363,6 @@ done: atomisp_handle_parameter_and_buffer(pipe); } else { atomisp_qbuffers_to_css(asd); - - if (!IS_ISP2401) { - if (!atomisp_is_wdt_running(asd) && atomisp_buffers_queued(asd)) - atomisp_wdt_start(asd); - } else { - if (!atomisp_is_wdt_running(pipe) && - atomisp_buffers_queued_pipe(pipe)) - atomisp_wdt_start_pipe(pipe); - } } } @@ 
-1594,33 +1585,6 @@ int atomisp_stream_on_master_slave_sensor(struct atomisp_device *isp, return 0; } -/* FIXME! ISP2400 */ -static void __wdt_on_master_slave_sensor(struct atomisp_device *isp, - unsigned int wdt_duration) -{ - if (atomisp_buffers_queued(&isp->asd[0])) - atomisp_wdt_refresh(&isp->asd[0], wdt_duration); - if (atomisp_buffers_queued(&isp->asd[1])) - atomisp_wdt_refresh(&isp->asd[1], wdt_duration); -} - -/* FIXME! ISP2401 */ -static void __wdt_on_master_slave_sensor_pipe(struct atomisp_video_pipe *pipe, - unsigned int wdt_duration, - bool enable) -{ - static struct atomisp_video_pipe *pipe0; - - if (enable) { - if (atomisp_buffers_queued_pipe(pipe0)) - atomisp_wdt_refresh_pipe(pipe0, wdt_duration); - if (atomisp_buffers_queued_pipe(pipe)) - atomisp_wdt_refresh_pipe(pipe, wdt_duration); - } else { - pipe0 = pipe; - } -} - static void atomisp_pause_buffer_event(struct atomisp_device *isp) { struct v4l2_event event = {0}; @@ -1670,7 +1634,6 @@ static int atomisp_streamon(struct file *file, void *fh, struct pci_dev *pdev = to_pci_dev(isp->dev); enum ia_css_pipe_id css_pipe_id; unsigned int sensor_start_stream; - unsigned int wdt_duration = ATOMISP_ISP_TIMEOUT_DURATION; unsigned long irqflags; int ret; @@ -1845,15 +1808,9 @@ start_sensor: dev_err(isp->dev, "master slave sensor stream on failed!\n"); goto out; } - if (!IS_ISP2401) - __wdt_on_master_slave_sensor(isp, wdt_duration); - else - __wdt_on_master_slave_sensor_pipe(pipe, wdt_duration, true); goto start_delay_wq; } else if (asd->depth_mode->val && (atomisp_streaming_count(isp) < ATOMISP_DEPTH_SENSOR_STREAMON_COUNT)) { - if (IS_ISP2401) - __wdt_on_master_slave_sensor_pipe(pipe, wdt_duration, false); goto start_delay_wq; } @@ -1875,14 +1832,6 @@ start_sensor: goto out; } - if (!IS_ISP2401) { - if (atomisp_buffers_queued(asd)) - atomisp_wdt_refresh(asd, wdt_duration); - } else { - if (atomisp_buffers_queued_pipe(pipe)) - atomisp_wdt_refresh_pipe(pipe, wdt_duration); - } - start_delay_wq: if 
(asd->continuous_mode->val) { struct v4l2_mbus_framefmt *sink; @@ -1986,16 +1935,7 @@ int __atomisp_streamoff(struct file *file, void *fh, enum v4l2_buf_type type) asd->streaming = ATOMISP_DEVICE_STREAMING_STOPPING; first_streamoff = true; } - spin_unlock_irqrestore(&isp->lock, flags); - - if (first_streamoff) { - /* if other streams are running, should not disable watch dog */ - mutex_unlock(&isp->mutex); - atomisp_wdt_stop(asd, true); - mutex_lock(&isp->mutex); - } - spin_lock_irqsave(&isp->lock, flags); if (atomisp_subdev_streaming_count(asd) == 1) asd->streaming = ATOMISP_DEVICE_STREAMING_DISABLED; spin_unlock_irqrestore(&isp->lock, flags); diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.h b/drivers/staging/media/atomisp/pci/atomisp_subdev.h index b44f060b0bb52..43e6a1d1e4109 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.h +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.h @@ -108,15 +108,6 @@ struct atomisp_video_pipe { */ unsigned int frame_request_config_id[VIDEO_MAX_FRAME]; struct atomisp_css_params_with_list *frame_params[VIDEO_MAX_FRAME]; - - /* - * move wdt from asd struct to create wdt for each pipe - */ - /* ISP2401 */ - struct timer_list wdt; - unsigned int wdt_duration; /* in jiffies */ - unsigned long wdt_expires; - atomic_t wdt_count; }; struct atomisp_pad_format { @@ -360,11 +351,6 @@ struct atomisp_sub_device { int raw_buffer_locked_count; spinlock_t raw_buffer_bitmap_lock; - /* ISP 2400 */ - struct timer_list wdt; - unsigned int wdt_duration; /* in jiffies */ - unsigned long wdt_expires; - /* ISP2401 */ bool re_trigger_capture; diff --git a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c index aa38e0d33b5b8..d55e8d32a286e 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c +++ b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c @@ -1433,39 +1433,6 @@ static bool is_valid_device(struct pci_dev *pdev, const struct pci_device_id *id return true; } 
-static int init_atomisp_wdts(struct atomisp_device *isp) -{ - int i, err; - - atomic_set(&isp->wdt_work_queued, 0); - isp->wdt_work_queue = alloc_workqueue(isp->v4l2_dev.name, 0, 1); - if (!isp->wdt_work_queue) { - dev_err(isp->dev, "Failed to initialize wdt work queue\n"); - err = -ENOMEM; - goto alloc_fail; - } - INIT_WORK(&isp->wdt_work, atomisp_wdt_work); - - for (i = 0; i < isp->num_of_streams; i++) { - struct atomisp_sub_device *asd = &isp->asd[i]; - - if (!IS_ISP2401) { - timer_setup(&asd->wdt, atomisp_wdt, 0); - } else { - timer_setup(&asd->video_out_capture.wdt, - atomisp_wdt, 0); - timer_setup(&asd->video_out_preview.wdt, - atomisp_wdt, 0); - timer_setup(&asd->video_out_vf.wdt, atomisp_wdt, 0); - timer_setup(&asd->video_out_video_capture.wdt, - atomisp_wdt, 0); - } - } - return 0; -alloc_fail: - return err; -} - #define ATOM_ISP_PCI_BAR 0 static int atomisp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) @@ -1698,10 +1665,8 @@ static int atomisp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *i dev_err(&pdev->dev, "atomisp_register_entities failed (%d)\n", err); goto register_entities_fail; } - /* init atomisp wdts */ - err = init_atomisp_wdts(isp); - if (err != 0) - goto wdt_work_queue_fail; + + INIT_WORK(&isp->assert_recovery_work, atomisp_assert_recovery_work); /* save the iunit context only once after all the values are init'ed. 
*/ atomisp_save_iunit_reg(isp); @@ -1748,8 +1713,6 @@ css_init_fail: request_irq_fail: hmm_cleanup(); pm_runtime_get_noresume(&pdev->dev); - destroy_workqueue(isp->wdt_work_queue); -wdt_work_queue_fail: atomisp_unregister_entities(isp); register_entities_fail: atomisp_uninitialize_modules(isp); @@ -1809,8 +1772,6 @@ static void atomisp_pci_remove(struct pci_dev *pdev) atomisp_msi_irq_uninit(isp); atomisp_unregister_entities(isp); - destroy_workqueue(isp->wdt_work_queue); - release_firmware(isp->firmware); } -- GitLab From 0ecc5236d1cb0015c00981d253bb8edee36770bb Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 3 Sep 2022 23:36:01 +0200 Subject: [PATCH 0577/2223] media: atomisp: Move atomisp_streaming_count() check into __atomisp_css_recover() Both callers of __atomisp_css_recover() check atomisp_streaming_count() first, move the check into __atomisp_css_recover(). And __atomisp_css_recover() already calls lockdep_assert_held(&isp->mutex), so drop that from atomisp_css_flush(). Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/atomisp_cmd.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.c b/drivers/staging/media/atomisp/pci/atomisp_cmd.c index 4b459c4c6d76c..a96a4658e113c 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.c +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.c @@ -1282,6 +1282,9 @@ static void __atomisp_css_recover(struct atomisp_device *isp, bool isp_timeout) lockdep_assert_held(&isp->mutex); + if (!atomisp_streaming_count(isp)) + return; + atomisp_css_irq_enable(isp, IA_CSS_IRQ_INFO_CSS_RECEIVER_SOF, false); BUG_ON(isp->num_of_streams > MAX_STREAM_NUM); @@ -1434,20 +1437,12 @@ void atomisp_assert_recovery_work(struct work_struct *work) assert_recovery_work); mutex_lock(&isp->mutex); - - if (atomisp_streaming_count(isp)) - __atomisp_css_recover(isp, true); - + 
__atomisp_css_recover(isp, true); mutex_unlock(&isp->mutex); } void atomisp_css_flush(struct atomisp_device *isp) { - lockdep_assert_held(&isp->mutex); - - if (!atomisp_streaming_count(isp)) - return; - /* Start recover */ __atomisp_css_recover(isp, false); -- GitLab From 1636369bcfa253a91a8ac7327469c6642df018fc Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 3 Sep 2022 23:50:42 +0200 Subject: [PATCH 0578/2223] media: atomisp: Rework asd->streaming state update in __atomisp_streamoff() During the first __atomisp_streamoff() call on an asd with only one pipe streaming asd->streaming would get set twice: asd->streaming = ATOMISP_DEVICE_STREAMING_STOPPING; asd->streaming = ATOMISP_DEVICE_STREAMING_DISABLED; Rework the code a bit so that it gets set to the correct value right away instead of doing this in 2 steps. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/atomisp_ioctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index daecdcdeb27c7..8991575849512 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -1930,14 +1930,14 @@ int __atomisp_streamoff(struct file *file, void *fh, enum v4l2_buf_type type) if (!pipe->capq.streaming) return 0; - spin_lock_irqsave(&isp->lock, flags); - if (asd->streaming == ATOMISP_DEVICE_STREAMING_ENABLED) { - asd->streaming = ATOMISP_DEVICE_STREAMING_STOPPING; + if (asd->streaming == ATOMISP_DEVICE_STREAMING_ENABLED) first_streamoff = true; - } + spin_lock_irqsave(&isp->lock, flags); if (atomisp_subdev_streaming_count(asd) == 1) asd->streaming = ATOMISP_DEVICE_STREAMING_DISABLED; + else + asd->streaming = ATOMISP_DEVICE_STREAMING_STOPPING; spin_unlock_irqrestore(&isp->lock, flags); if (!first_streamoff) { -- GitLab From 
b88e0ee7b081930e0e2f65a77696695c44a4a4b2 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 3 Sep 2022 23:42:53 +0200 Subject: [PATCH 0579/2223] media: atomisp: Drop streamoff_mutex Now that __atomisp_streamoff() no longer drops isp->mutex to cancel the watchdog timer, the streamoff_mutex is no longer necessary to avoid multiple streamoffs racing with each other. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/atomisp_fops.c | 2 -- drivers/staging/media/atomisp/pci/atomisp_internal.h | 6 ------ drivers/staging/media/atomisp/pci/atomisp_ioctl.c | 3 --- drivers/staging/media/atomisp/pci/atomisp_v4l2.c | 1 - 4 files changed, 12 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_fops.c b/drivers/staging/media/atomisp/pci/atomisp_fops.c index e1b213ba46865..531bbd6d7ee03 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_fops.c +++ b/drivers/staging/media/atomisp/pci/atomisp_fops.c @@ -822,7 +822,6 @@ static int atomisp_release(struct file *file) if (!isp) return -EBADF; - mutex_lock(&isp->streamoff_mutex); mutex_lock(&isp->mutex); dev_dbg(isp->dev, "release device %s\n", vdev->name); @@ -909,7 +908,6 @@ done: V4L2_SEL_TGT_COMPOSE, 0, &clear_compose); mutex_unlock(&isp->mutex); - mutex_unlock(&isp->streamoff_mutex); return v4l2_fh_release(file); } diff --git a/drivers/staging/media/atomisp/pci/atomisp_internal.h b/drivers/staging/media/atomisp/pci/atomisp_internal.h index 2279d45e7d7ae..f3ef840c640a3 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_internal.h +++ b/drivers/staging/media/atomisp/pci/atomisp_internal.h @@ -239,12 +239,6 @@ struct atomisp_device { /* Purpose of mutex is to protect and serialize use of isp data * structures and css API calls. */ struct mutex mutex; - /* - * Serialise streamoff: mutex is dropped during streamoff to - * cancel the watchdog queue. MUST be acquired BEFORE - * "mutex". 
- */ - struct mutex streamoff_mutex; unsigned int input_cnt; struct atomisp_input_subdev inputs[ATOM_ISP_MAX_INPUTS]; diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index 8991575849512..6d84a7e9cb570 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -1872,7 +1872,6 @@ int __atomisp_streamoff(struct file *file, void *fh, enum v4l2_buf_type type) atomisp_subdev_source_pad(vdev), asd->index); lockdep_assert_held(&isp->mutex); - lockdep_assert_held(&isp->streamoff_mutex); if (type != V4L2_BUF_TYPE_VIDEO_CAPTURE) { dev_dbg(isp->dev, "unsupported v4l2 buf type\n"); @@ -2081,11 +2080,9 @@ static int atomisp_streamoff(struct file *file, void *fh, struct atomisp_device *isp = video_get_drvdata(vdev); int rval; - mutex_lock(&isp->streamoff_mutex); mutex_lock(&isp->mutex); rval = __atomisp_streamoff(file, fh, type); mutex_unlock(&isp->mutex); - mutex_unlock(&isp->streamoff_mutex); return rval; } diff --git a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c index d55e8d32a286e..4ab91858d3088 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c +++ b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c @@ -1482,7 +1482,6 @@ static int atomisp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *i dev_dbg(&pdev->dev, "atomisp mmio base: %p\n", isp->base); mutex_init(&isp->mutex); - mutex_init(&isp->streamoff_mutex); spin_lock_init(&isp->lock); /* This is not a true PCI device on SoC, so the delay is not needed. 
*/ -- GitLab From cf223056fb29772208e8612e631eb0ef9e2c1a3a Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 2 Sep 2022 23:56:48 +0200 Subject: [PATCH 0580/2223] media: atomisp: Use video_dev.lock for ioctl locking Set video_dev.lock to point to isp->mutex so that the core does the locking surrounding ioctls for us and drop all the now no longer necessary (and conflicting) locking from the ioctl handling code. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/atomisp_ioctl.c | 203 ++++-------------- .../staging/media/atomisp/pci/atomisp_v4l2.c | 1 + 2 files changed, 42 insertions(+), 162 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index 6d84a7e9cb570..42d8d12675538 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -655,13 +655,9 @@ unsigned int atomisp_streaming_count(struct atomisp_device *isp) static int atomisp_g_input(struct file *file, void *fh, unsigned int *input) { struct video_device *vdev = video_devdata(file); - struct atomisp_device *isp = video_get_drvdata(vdev); struct atomisp_sub_device *asd = atomisp_to_video_pipe(vdev)->asd; - mutex_lock(&isp->mutex); *input = asd->input_curr; - mutex_unlock(&isp->mutex); - return 0; } @@ -678,16 +674,13 @@ static int atomisp_s_input(struct file *file, void *fh, unsigned int input) struct v4l2_subdev *motor; int ret; - mutex_lock(&isp->mutex); - ret = atomisp_pipe_check(pipe, true); if (ret) - goto error; + return ret; if (input >= ATOM_ISP_MAX_INPUTS || input >= isp->input_cnt) { dev_dbg(isp->dev, "input_cnt: %d\n", isp->input_cnt); - ret = -EINVAL; - goto error; + return -EINVAL; } /* @@ -699,15 +692,13 @@ static int atomisp_s_input(struct file *file, void *fh, unsigned int input) dev_err(isp->dev, "%s, camera is already used by stream: %d\n", __func__,
isp->inputs[input].asd->index); - ret = -EBUSY; - goto error; + return -EBUSY; } camera = isp->inputs[input].camera; if (!camera) { dev_err(isp->dev, "%s, no camera\n", __func__); - ret = -EINVAL; - goto error; + return -EINVAL; } /* power off the current owned sensor, as it is not used this time */ @@ -726,7 +717,7 @@ static int atomisp_s_input(struct file *file, void *fh, unsigned int input) ret = v4l2_subdev_call(isp->inputs[input].camera, core, s_power, 1); if (ret) { dev_err(isp->dev, "Failed to power-on sensor\n"); - goto error; + return ret; } /* * Some sensor driver resets the run mode during power-on, thus force @@ -739,7 +730,7 @@ static int atomisp_s_input(struct file *file, void *fh, unsigned int input) 0, isp->inputs[input].sensor_index, 0); if (ret && (ret != -ENOIOCTLCMD)) { dev_err(isp->dev, "Failed to select sensor\n"); - goto error; + return ret; } if (!IS_ISP2401) { @@ -756,14 +747,8 @@ static int atomisp_s_input(struct file *file, void *fh, unsigned int input) asd->input_curr = input; /* mark this camera is used by the current stream */ isp->inputs[input].asd = asd; - mutex_unlock(&isp->mutex); return 0; - -error: - mutex_unlock(&isp->mutex); - - return ret; } static int atomisp_enum_framesizes(struct file *file, void *priv, @@ -838,15 +823,12 @@ static int atomisp_enum_fmt_cap(struct file *file, void *fh, return -EINVAL; } - mutex_lock(&isp->mutex); - rval = v4l2_subdev_call(camera, pad, enum_mbus_code, NULL, &code); if (rval == -ENOIOCTLCMD) { dev_warn(isp->dev, "enum_mbus_code pad op not supported by %s. 
Please fix your sensor driver!\n", camera->name); } - mutex_unlock(&isp->mutex); if (rval) return rval; @@ -949,7 +931,6 @@ static int atomisp_try_fmt_cap(struct file *file, void *fh, struct v4l2_format *f) { struct video_device *vdev = video_devdata(file); - struct atomisp_device *isp = video_get_drvdata(vdev); int ret; /* @@ -959,10 +940,7 @@ static int atomisp_try_fmt_cap(struct file *file, void *fh, f->fmt.pix.width += pad_w; f->fmt.pix.height += pad_h; - mutex_lock(&isp->mutex); ret = atomisp_try_fmt(vdev, &f->fmt.pix, NULL); - mutex_unlock(&isp->mutex); - if (ret) return ret; @@ -973,12 +951,9 @@ static int atomisp_g_fmt_cap(struct file *file, void *fh, struct v4l2_format *f) { struct video_device *vdev = video_devdata(file); - struct atomisp_device *isp = video_get_drvdata(vdev); struct atomisp_video_pipe *pipe; - mutex_lock(&isp->mutex); pipe = atomisp_to_video_pipe(vdev); - mutex_unlock(&isp->mutex); f->fmt.pix = pipe->pix; @@ -997,13 +972,8 @@ static int atomisp_s_fmt_cap(struct file *file, void *fh, struct v4l2_format *f) { struct video_device *vdev = video_devdata(file); - struct atomisp_device *isp = video_get_drvdata(vdev); - int ret; - mutex_lock(&isp->mutex); - ret = atomisp_set_fmt(vdev, f); - mutex_unlock(&isp->mutex); - return ret; + return atomisp_set_fmt(vdev, f); } /* @@ -1217,15 +1187,7 @@ error: int atomisp_reqbufs(struct file *file, void *fh, struct v4l2_requestbuffers *req) { - struct video_device *vdev = video_devdata(file); - struct atomisp_device *isp = video_get_drvdata(vdev); - int ret; - - mutex_lock(&isp->mutex); - ret = __atomisp_reqbufs(file, fh, req); - mutex_unlock(&isp->mutex); - - return ret; + return __atomisp_reqbufs(file, fh, req); } /* application query the status of a buffer */ @@ -1258,17 +1220,14 @@ static int atomisp_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) u32 pgnr; int ret; - mutex_lock(&isp->mutex); - ret = atomisp_pipe_check(pipe, false); if (ret) - goto error; + return ret; if (!buf || buf->index 
>= VIDEO_MAX_FRAME || !pipe->capq.bufs[buf->index]) { dev_err(isp->dev, "Invalid index for qbuf.\n"); - ret = -EINVAL; - goto error; + return -EINVAL; } /* @@ -1278,16 +1237,13 @@ static int atomisp_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) if (buf->memory == V4L2_MEMORY_USERPTR) { if (offset_in_page(buf->m.userptr)) { dev_err(isp->dev, "Error userptr is not page aligned.\n"); - ret = -EINVAL; - goto error; + return -EINVAL; } vb = pipe->capq.bufs[buf->index]; vm_mem = vb->priv; - if (!vm_mem) { - ret = -EINVAL; - goto error; - } + if (!vm_mem) + return -EINVAL; length = vb->bsize; pgnr = (length + (PAGE_SIZE - 1)) >> PAGE_SHIFT; @@ -1296,17 +1252,15 @@ static int atomisp_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) goto done; if (atomisp_get_css_frame_info(asd, - atomisp_subdev_source_pad(vdev), &frame_info)) { - ret = -EIO; - goto error; - } + atomisp_subdev_source_pad(vdev), &frame_info)) + return -EIO; ret = ia_css_frame_map(&handle, &frame_info, (void __user *)buf->m.userptr, pgnr); if (ret) { dev_err(isp->dev, "Failed to map user buffer\n"); - goto error; + return ret; } if (vm_mem->vaddr) { @@ -1351,11 +1305,10 @@ done: pipe->frame_params[buf->index] = NULL; mutex_unlock(&isp->mutex); - ret = videobuf_qbuf(&pipe->capq, buf); mutex_lock(&isp->mutex); if (ret) - goto error; + return ret; /* TODO: do this better, not best way to queue to css */ if (asd->streaming == ATOMISP_DEVICE_STREAMING_ENABLED) { @@ -1384,16 +1337,11 @@ done: asd->pending_capture_request++; dev_dbg(isp->dev, "Add one pending capture request.\n"); } - mutex_unlock(&isp->mutex); dev_dbg(isp->dev, "qbuf buffer %d (%s) for asd%d\n", buf->index, vdev->name, asd->index); - return ret; - -error: - mutex_unlock(&isp->mutex); - return ret; + return 0; } static int __get_frame_exp_id(struct atomisp_video_pipe *pipe, @@ -1424,19 +1372,19 @@ static int atomisp_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) struct atomisp_device *isp = 
video_get_drvdata(vdev); int ret; - mutex_lock(&isp->mutex); ret = atomisp_pipe_check(pipe, false); - mutex_unlock(&isp->mutex); if (ret) return ret; + mutex_unlock(&isp->mutex); ret = videobuf_dqbuf(&pipe->capq, buf, file->f_flags & O_NONBLOCK); + mutex_lock(&isp->mutex); if (ret) { if (ret != -EAGAIN) dev_dbg(isp->dev, "<%s: %d\n", __func__, ret); return ret; } - mutex_lock(&isp->mutex); + buf->bytesused = pipe->pix.sizeimage; buf->reserved = asd->frame_status[buf->index]; @@ -1450,7 +1398,6 @@ static int atomisp_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) if (!(buf->flags & V4L2_BUF_FLAG_ERROR)) buf->reserved |= __get_frame_exp_id(pipe, buf) << 16; buf->reserved2 = pipe->frame_config_id[buf->index]; - mutex_unlock(&isp->mutex); dev_dbg(isp->dev, "dqbuf buffer %d (%s) for asd%d with exp_id %d, isp_config_id %d\n", @@ -1645,13 +1592,12 @@ static int atomisp_streamon(struct file *file, void *fh, return -EINVAL; } - mutex_lock(&isp->mutex); ret = atomisp_pipe_check(pipe, false); if (ret) - goto out; + return ret; if (pipe->capq.streaming) - goto out; + return 0; /* Input system HW workaround */ atomisp_dma_burst_len_cfg(asd); @@ -1666,14 +1612,13 @@ static int atomisp_streamon(struct file *file, void *fh, if (list_empty(&pipe->capq.stream)) { spin_unlock_irqrestore(&pipe->irq_lock, irqflags); dev_dbg(isp->dev, "no buffer in the queue\n"); - ret = -EINVAL; - goto out; + return -EINVAL; } spin_unlock_irqrestore(&pipe->irq_lock, irqflags); ret = videobuf_streamon(&pipe->capq); if (ret) - goto out; + return ret; /* Reset pending capture request count. 
*/ asd->pending_capture_request = 0; @@ -1694,10 +1639,10 @@ static int atomisp_streamon(struct file *file, void *fh, if (asd->delayed_init == ATOMISP_DELAYED_INIT_QUEUED) { flush_work(&asd->delayed_init_work); mutex_unlock(&isp->mutex); - if (wait_for_completion_interruptible( - &asd->init_done) != 0) - return -ERESTARTSYS; + ret = wait_for_completion_interruptible(&asd->init_done); mutex_lock(&isp->mutex); + if (ret != 0) + return -ERESTARTSYS; } /* handle per_frame_setting parameter and buffers */ @@ -1719,16 +1664,15 @@ static int atomisp_streamon(struct file *file, void *fh, asd->params.offline_parm.num_captures, asd->params.offline_parm.skip_frames, asd->params.offline_parm.offset); - if (ret) { - ret = -EINVAL; - goto out; - } + if (ret) + return -EINVAL; + if (asd->depth_mode->val) atomisp_pause_buffer_event(isp); } } atomisp_qbuffers_to_css(asd); - goto out; + return 0; } if (asd->streaming == ATOMISP_DEVICE_STREAMING_ENABLED) { @@ -1754,7 +1698,7 @@ static int atomisp_streamon(struct file *file, void *fh, ret = atomisp_css_start(asd, css_pipe_id, false); if (ret) - goto out; + return ret; spin_lock_irqsave(&isp->lock, irqflags); asd->streaming = ATOMISP_DEVICE_STREAMING_ENABLED; @@ -1775,7 +1719,7 @@ static int atomisp_streamon(struct file *file, void *fh, /* Only start sensor when the last streaming instance started */ if (atomisp_subdev_streaming_count(asd) < sensor_start_stream) - goto out; + return 0; start_sensor: if (isp->flash) { @@ -1806,7 +1750,7 @@ start_sensor: ret = atomisp_stream_on_master_slave_sensor(isp, false); if (ret) { dev_err(isp->dev, "master slave sensor stream on failed!\n"); - goto out; + return ret; } goto start_delay_wq; } else if (asd->depth_mode->val && (atomisp_streaming_count(isp) < @@ -1828,8 +1772,7 @@ start_sensor: spin_lock_irqsave(&isp->lock, irqflags); asd->streaming = ATOMISP_DEVICE_STREAMING_DISABLED; spin_unlock_irqrestore(&isp->lock, irqflags); - ret = -EINVAL; - goto out; + return -EINVAL; } start_delay_wq: @@ 
-1846,9 +1789,8 @@ start_delay_wq: } else { asd->delayed_init = ATOMISP_DELAYED_INIT_NOT_QUEUED; } -out: - mutex_unlock(&isp->mutex); - return ret; + + return 0; } int __atomisp_streamoff(struct file *file, void *fh, enum v4l2_buf_type type) @@ -2076,15 +2018,7 @@ stopsensor: static int atomisp_streamoff(struct file *file, void *fh, enum v4l2_buf_type type) { - struct video_device *vdev = video_devdata(file); - struct atomisp_device *isp = video_get_drvdata(vdev); - int rval; - - mutex_lock(&isp->mutex); - rval = __atomisp_streamoff(file, fh, type); - mutex_unlock(&isp->mutex); - - return rval; + return __atomisp_streamoff(file, fh, type); } /* @@ -2110,8 +2044,6 @@ static int atomisp_g_ctrl(struct file *file, void *fh, if (ret) return ret; - mutex_lock(&isp->mutex); - switch (control->id) { case V4L2_CID_IRIS_ABSOLUTE: case V4L2_CID_EXPOSURE_ABSOLUTE: @@ -2133,7 +2065,6 @@ static int atomisp_g_ctrl(struct file *file, void *fh, case V4L2_CID_TEST_PATTERN_COLOR_GR: case V4L2_CID_TEST_PATTERN_COLOR_GB: case V4L2_CID_TEST_PATTERN_COLOR_B: - mutex_unlock(&isp->mutex); return v4l2_g_ctrl(isp->inputs[asd->input_curr].camera-> ctrl_handler, control); case V4L2_CID_COLORFX: @@ -2162,7 +2093,6 @@ static int atomisp_g_ctrl(struct file *file, void *fh, break; } - mutex_unlock(&isp->mutex); return ret; } @@ -2189,7 +2119,6 @@ static int atomisp_s_ctrl(struct file *file, void *fh, if (ret) return ret; - mutex_lock(&isp->mutex); switch (control->id) { case V4L2_CID_AUTO_N_PRESET_WHITE_BALANCE: case V4L2_CID_EXPOSURE: @@ -2210,7 +2139,6 @@ static int atomisp_s_ctrl(struct file *file, void *fh, case V4L2_CID_TEST_PATTERN_COLOR_GR: case V4L2_CID_TEST_PATTERN_COLOR_GB: case V4L2_CID_TEST_PATTERN_COLOR_B: - mutex_unlock(&isp->mutex); return v4l2_s_ctrl(NULL, isp->inputs[asd->input_curr].camera-> ctrl_handler, control); @@ -2242,7 +2170,6 @@ static int atomisp_s_ctrl(struct file *file, void *fh, ret = -EINVAL; break; } - mutex_unlock(&isp->mutex); return ret; } @@ -2355,9 +2282,7 @@ 
static int atomisp_camera_g_ext_ctrls(struct file *file, void *fh, &ctrl); break; case V4L2_CID_ZOOM_ABSOLUTE: - mutex_lock(&isp->mutex); ret = atomisp_digital_zoom(asd, 0, &ctrl.value); - mutex_unlock(&isp->mutex); break; case V4L2_CID_G_SKIP_FRAMES: ret = v4l2_subdev_call( @@ -2464,7 +2389,6 @@ static int atomisp_camera_s_ext_ctrls(struct file *file, void *fh, case V4L2_CID_FLASH_STROBE: case V4L2_CID_FLASH_MODE: case V4L2_CID_FLASH_STATUS_REGISTER: - mutex_lock(&isp->mutex); if (isp->flash) { ret = v4l2_s_ctrl(NULL, isp->flash->ctrl_handler, @@ -2479,12 +2403,9 @@ static int atomisp_camera_s_ext_ctrls(struct file *file, void *fh, asd->params.num_flash_frames = 0; } } - mutex_unlock(&isp->mutex); break; case V4L2_CID_ZOOM_ABSOLUTE: - mutex_lock(&isp->mutex); ret = atomisp_digital_zoom(asd, 1, &ctrl.value); - mutex_unlock(&isp->mutex); break; default: ctr = v4l2_ctrl_find(&asd->ctrl_handler, ctrl.id); @@ -2546,9 +2467,7 @@ static int atomisp_g_parm(struct file *file, void *fh, return -EINVAL; } - mutex_lock(&isp->mutex); parm->parm.capture.capturemode = asd->run_mode->val; - mutex_unlock(&isp->mutex); return 0; } @@ -2568,8 +2487,6 @@ static int atomisp_s_parm(struct file *file, void *fh, return -EINVAL; } - mutex_lock(&isp->mutex); - asd->high_speed_mode = false; switch (parm->parm.capture.capturemode) { case CI_MODE_NONE: { @@ -2588,7 +2505,7 @@ static int atomisp_s_parm(struct file *file, void *fh, asd->high_speed_mode = true; } - goto out; + return rval == -ENOIOCTLCMD ? 0 : rval; } case CI_MODE_VIDEO: mode = ATOMISP_RUN_MODE_VIDEO; @@ -2603,15 +2520,11 @@ static int atomisp_s_parm(struct file *file, void *fh, mode = ATOMISP_RUN_MODE_PREVIEW; break; default: - rval = -EINVAL; - goto out; + return -EINVAL; } rval = v4l2_ctrl_s_ctrl(asd->run_mode, mode); -out: - mutex_unlock(&isp->mutex); - return rval == -ENOIOCTLCMD ? 
0 : rval; } @@ -2629,24 +2542,6 @@ static long atomisp_vidioc_default(struct file *file, void *fh, else motor = isp->motor; - switch (cmd) { - case ATOMISP_IOC_G_MOTOR_PRIV_INT_DATA: - case ATOMISP_IOC_S_EXPOSURE: - case ATOMISP_IOC_G_SENSOR_CALIBRATION_GROUP: - case ATOMISP_IOC_G_SENSOR_PRIV_INT_DATA: - case ATOMISP_IOC_EXT_ISP_CTRL: - case ATOMISP_IOC_G_SENSOR_AE_BRACKETING_INFO: - case ATOMISP_IOC_S_SENSOR_AE_BRACKETING_MODE: - case ATOMISP_IOC_G_SENSOR_AE_BRACKETING_MODE: - case ATOMISP_IOC_S_SENSOR_AE_BRACKETING_LUT: - case ATOMISP_IOC_S_SENSOR_EE_CONFIG: - case ATOMISP_IOC_G_UPDATE_EXPOSURE: - /* we do not need take isp->mutex for these IOCTLs */ - break; - default: - mutex_lock(&isp->mutex); - break; - } switch (cmd) { case ATOMISP_IOC_S_SENSOR_RUNMODE: if (IS_ISP2401) @@ -2893,22 +2788,6 @@ static long atomisp_vidioc_default(struct file *file, void *fh, break; } - switch (cmd) { - case ATOMISP_IOC_G_MOTOR_PRIV_INT_DATA: - case ATOMISP_IOC_S_EXPOSURE: - case ATOMISP_IOC_G_SENSOR_CALIBRATION_GROUP: - case ATOMISP_IOC_G_SENSOR_PRIV_INT_DATA: - case ATOMISP_IOC_EXT_ISP_CTRL: - case ATOMISP_IOC_G_SENSOR_AE_BRACKETING_INFO: - case ATOMISP_IOC_S_SENSOR_AE_BRACKETING_MODE: - case ATOMISP_IOC_G_SENSOR_AE_BRACKETING_MODE: - case ATOMISP_IOC_S_SENSOR_AE_BRACKETING_LUT: - case ATOMISP_IOC_G_UPDATE_EXPOSURE: - break; - default: - mutex_unlock(&isp->mutex); - break; - } return err; } diff --git a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c index 4ab91858d3088..026ff3ca5c04e 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c +++ b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c @@ -441,6 +441,7 @@ int atomisp_video_init(struct atomisp_video_pipe *video, const char *name, video->pad.flags = MEDIA_PAD_FL_SINK; video->vdev.fops = &atomisp_fops; video->vdev.ioctl_ops = &atomisp_ioctl_ops; + video->vdev.lock = &video->isp->mutex; break; default: return -EINVAL; -- GitLab From 
d7306735e972a06cb788b8886bd0c070a245171d Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 4 Sep 2022 10:47:23 +0200 Subject: [PATCH 0581/2223] media: atomisp: Remove a couple of not useful function wrappers The __atomisp_reqbufs(), __atomisp_streamoff() are 1:1 wrappers for the non __ prefixed functions now, drop these wrappers. The atomisp_s_fmt_cap() wrapper is almost a 1:1 wrapper for atomisp_set_fmt() adjust the latter to have the right function prototype and drop the wrapper. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/atomisp_cmd.c | 3 ++- .../staging/media/atomisp/pci/atomisp_cmd.h | 2 +- .../staging/media/atomisp/pci/atomisp_fops.c | 10 +++---- .../staging/media/atomisp/pci/atomisp_ioctl.c | 27 +++---------------- .../staging/media/atomisp/pci/atomisp_ioctl.h | 8 ++---- 5 files changed, 12 insertions(+), 38 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.c b/drivers/staging/media/atomisp/pci/atomisp_cmd.c index a96a4658e113c..20962d4c2b76d 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.c +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.c @@ -5167,8 +5167,9 @@ static int atomisp_set_fmt_to_snr(struct video_device *vdev, return css_input_resolution_changed(asd, ffmt); } -int atomisp_set_fmt(struct video_device *vdev, struct v4l2_format *f) +int atomisp_set_fmt(struct file *file, void *unused, struct v4l2_format *f) { + struct video_device *vdev = video_devdata(file); struct atomisp_device *isp = video_get_drvdata(vdev); struct atomisp_video_pipe *pipe = atomisp_to_video_pipe(vdev); struct atomisp_sub_device *asd = pipe->asd; diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.h b/drivers/staging/media/atomisp/pci/atomisp_cmd.h index 5ab7d6aca7fad..cfc970b531f0c 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.h +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.h @@ -266,7 +266,7 @@ int 
atomisp_get_sensor_mode_data(struct atomisp_sub_device *asd, int atomisp_try_fmt(struct video_device *vdev, struct v4l2_pix_format *f, bool *res_overflow); -int atomisp_set_fmt(struct video_device *vdev, struct v4l2_format *f); +int atomisp_set_fmt(struct file *file, void *fh, struct v4l2_format *f); int atomisp_set_shading_table(struct atomisp_sub_device *asd, struct atomisp_shading_table *shading_table); diff --git a/drivers/staging/media/atomisp/pci/atomisp_fops.c b/drivers/staging/media/atomisp/pci/atomisp_fops.c index 531bbd6d7ee03..047e1180e35f4 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_fops.c +++ b/drivers/staging/media/atomisp/pci/atomisp_fops.c @@ -836,18 +836,16 @@ static int atomisp_release(struct file *file) __func__); if (pipe->capq.streaming && - __atomisp_streamoff(file, NULL, V4L2_BUF_TYPE_VIDEO_CAPTURE)) { - dev_err(isp->dev, - "atomisp_streamoff failed on release, driver bug"); + atomisp_streamoff(file, NULL, V4L2_BUF_TYPE_VIDEO_CAPTURE)) { + dev_err(isp->dev, "atomisp_streamoff failed on release, driver bug"); goto done; } if (pipe->users) goto done; - if (__atomisp_reqbufs(file, NULL, &req)) { - dev_err(isp->dev, - "atomisp_reqbufs failed on release, driver bug"); + if (atomisp_reqbufs(file, NULL, &req)) { + dev_err(isp->dev, "atomisp_reqbufs failed on release, driver bug"); goto done; } diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index 42d8d12675538..ed3ec603a713d 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -968,14 +968,6 @@ static int atomisp_g_fmt_cap(struct file *file, void *fh, return atomisp_try_fmt_cap(file, fh, f); } -static int atomisp_s_fmt_cap(struct file *file, void *fh, - struct v4l2_format *f) -{ - struct video_device *vdev = video_devdata(file); - - return atomisp_set_fmt(vdev, f); -} - /* * Free videobuffer buffer priv data */ @@ -1111,8 +1103,7 @@ error: /* * Initiate 
Memory Mapping or User Pointer I/O */ -int __atomisp_reqbufs(struct file *file, void *fh, - struct v4l2_requestbuffers *req) +int atomisp_reqbufs(struct file *file, void *fh, struct v4l2_requestbuffers *req) { struct video_device *vdev = video_devdata(file); struct atomisp_video_pipe *pipe = atomisp_to_video_pipe(vdev); @@ -1184,12 +1175,6 @@ error: return -ENOMEM; } -int atomisp_reqbufs(struct file *file, void *fh, - struct v4l2_requestbuffers *req) -{ - return __atomisp_reqbufs(file, fh, req); -} - /* application query the status of a buffer */ static int atomisp_querybuf(struct file *file, void *fh, struct v4l2_buffer *buf) @@ -1793,7 +1778,7 @@ start_delay_wq: return 0; } -int __atomisp_streamoff(struct file *file, void *fh, enum v4l2_buf_type type) +int atomisp_streamoff(struct file *file, void *fh, enum v4l2_buf_type type) { struct video_device *vdev = video_devdata(file); struct atomisp_device *isp = video_get_drvdata(vdev); @@ -2015,12 +2000,6 @@ stopsensor: return ret; } -static int atomisp_streamoff(struct file *file, void *fh, - enum v4l2_buf_type type) -{ - return __atomisp_streamoff(file, fh, type); -} - /* * To get the current value of a control. 
* applications initialize the id field of a struct v4l2_control and @@ -2806,7 +2785,7 @@ const struct v4l2_ioctl_ops atomisp_ioctl_ops = { .vidioc_enum_fmt_vid_cap = atomisp_enum_fmt_cap, .vidioc_try_fmt_vid_cap = atomisp_try_fmt_cap, .vidioc_g_fmt_vid_cap = atomisp_g_fmt_cap, - .vidioc_s_fmt_vid_cap = atomisp_s_fmt_cap, + .vidioc_s_fmt_vid_cap = atomisp_set_fmt, .vidioc_reqbufs = atomisp_reqbufs, .vidioc_querybuf = atomisp_querybuf, .vidioc_qbuf = atomisp_qbuf, diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.h b/drivers/staging/media/atomisp/pci/atomisp_ioctl.h index 61a6148a6ad50..c660f631d371a 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.h +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.h @@ -39,12 +39,8 @@ int atomisp_pipe_check(struct atomisp_video_pipe *pipe, bool streaming_ok); int atomisp_alloc_css_stat_bufs(struct atomisp_sub_device *asd, uint16_t stream_id); -int __atomisp_streamoff(struct file *file, void *fh, enum v4l2_buf_type type); -int __atomisp_reqbufs(struct file *file, void *fh, - struct v4l2_requestbuffers *req); - -int atomisp_reqbufs(struct file *file, void *fh, - struct v4l2_requestbuffers *req); +int atomisp_streamoff(struct file *file, void *fh, enum v4l2_buf_type type); +int atomisp_reqbufs(struct file *file, void *fh, struct v4l2_requestbuffers *req); enum ia_css_pipe_id atomisp_get_css_pipe_id(struct atomisp_sub_device *asd); -- GitLab From 405dac898124da8c30474b4b720405915dcf209f Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 4 Sep 2022 10:56:47 +0200 Subject: [PATCH 0582/2223] media: atomisp: Drop unnecessary first_streamoff check Drop an unnecessary first_streamoff check from atomisp_streamoff(), above the check there is a: if (!first_streamoff) goto stop_sensor; Code block which will jump over the code with the test, so the test is only executed when first_streamoff is true and therefore the test is not necessary.
Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/atomisp_ioctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index ed3ec603a713d..77c0d55ab409c 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -1880,10 +1880,10 @@ int atomisp_streamoff(struct file *file, void *fh, enum v4l2_buf_type type) cancel_work_sync(&asd->delayed_init_work); asd->delayed_init = ATOMISP_DELAYED_INIT_NOT_QUEUED; } - if (first_streamoff) { - css_pipe_id = atomisp_get_css_pipe_id(asd); - atomisp_css_stop(asd, css_pipe_id, false); - } + + css_pipe_id = atomisp_get_css_pipe_id(asd); + atomisp_css_stop(asd, css_pipe_id, false); + /* cancel work queue*/ if (asd->video_out_capture.users) { capture_pipe = &asd->video_out_capture; -- GitLab From e208848cb10e203681d6b07b96acd55d3378ede0 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 4 Sep 2022 14:59:08 +0200 Subject: [PATCH 0583/2223] media: atomisp: Make atomisp_set_raw_buffer_bitmap() static atomisp_set_raw_buffer_bitmap() is only used in atomisp_cmd.c, make it static. Unfortunately this still requires a forward declaration (the function cannot be moved easily). Still this will at least make it obvious to anyone reading the code that the function is not used elsewhere. 
Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/atomisp_cmd.c | 4 +++- drivers/staging/media/atomisp/pci/atomisp_cmd.h | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.c b/drivers/staging/media/atomisp/pci/atomisp_cmd.c index 20962d4c2b76d..8cc8ee64fb44c 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.c +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.c @@ -80,6 +80,8 @@ union host { } ptr; }; +static int atomisp_set_raw_buffer_bitmap(struct atomisp_sub_device *asd, int exp_id); + /* * get sensor:dis71430/ov2720 related info from v4l2_subdev->priv data field. * subdev->priv is set in mrst.c @@ -5993,7 +5995,7 @@ void atomisp_init_raw_buffer_bitmap(struct atomisp_sub_device *asd) spin_unlock_irqrestore(&asd->raw_buffer_bitmap_lock, flags); } -int atomisp_set_raw_buffer_bitmap(struct atomisp_sub_device *asd, int exp_id) +static int atomisp_set_raw_buffer_bitmap(struct atomisp_sub_device *asd, int exp_id) { int *bitmap, bit; unsigned long flags; diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.h b/drivers/staging/media/atomisp/pci/atomisp_cmd.h index cfc970b531f0c..ba3433a635959 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.h +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.h @@ -321,8 +321,6 @@ void atomisp_flush_params_queue(struct atomisp_video_pipe *asd); int atomisp_exp_id_unlock(struct atomisp_sub_device *asd, int *exp_id); int atomisp_exp_id_capture(struct atomisp_sub_device *asd, int *exp_id); -/* Function to update Raw Buffer bitmap */ -int atomisp_set_raw_buffer_bitmap(struct atomisp_sub_device *asd, int exp_id); void atomisp_init_raw_buffer_bitmap(struct atomisp_sub_device *asd); /* Function to enable/disable zoom for capture pipe */ -- GitLab From e226e9a492a32d0789b27bedc1d0a4644fe8c118 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 4 Sep 2022 15:04:36 +0200 
Subject: [PATCH 0584/2223] media: atomisp: Remove unused atomisp_css_get_dis_statistics() Remove the unused atomisp_css_get_dis_statistics() function. This seems to be a leftover variant / older version of atomisp_css_get_dis_stats() which is actually used. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/atomisp_compat.h | 4 ---- .../media/atomisp/pci/atomisp_compat_css20.c | 14 -------------- 2 files changed, 18 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_compat.h b/drivers/staging/media/atomisp/pci/atomisp_compat.h index af6ab8434b5ee..a6d85d0f9ae5f 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_compat.h +++ b/drivers/staging/media/atomisp/pci/atomisp_compat.h @@ -129,10 +129,6 @@ int atomisp_alloc_metadata_output_buf(struct atomisp_sub_device *asd); void atomisp_free_metadata_output_buf(struct atomisp_sub_device *asd); -void atomisp_css_get_dis_statistics(struct atomisp_sub_device *asd, - struct atomisp_css_buffer *isp_css_buffer, - struct ia_css_isp_dvs_statistics_map *dvs_map); - void atomisp_css_temp_pipe_to_pipe_id(struct atomisp_sub_device *asd, struct atomisp_css_event *current_event); diff --git a/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c b/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c index 0154ebf2cba5c..64dd63ddc29c7 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c +++ b/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c @@ -1574,20 +1574,6 @@ void atomisp_free_metadata_output_buf(struct atomisp_sub_device *asd) } } -void atomisp_css_get_dis_statistics(struct atomisp_sub_device *asd, - struct atomisp_css_buffer *isp_css_buffer, - struct ia_css_isp_dvs_statistics_map *dvs_map) -{ - if (asd->params.dvs_stat) { - if (dvs_map) - ia_css_translate_dvs2_statistics( - asd->params.dvs_stat, dvs_map); - else - ia_css_get_dvs2_statistics(asd->params.dvs_stat, - 
isp_css_buffer->css_buffer.data.stats_dvs); - } -} - void atomisp_css_temp_pipe_to_pipe_id(struct atomisp_sub_device *asd, struct atomisp_css_event *current_event) { -- GitLab From cf2e0516f040fd576b35436b316c0923c45fb468 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 4 Sep 2022 18:54:36 +0200 Subject: [PATCH 0585/2223] media: atomisp: Remove const/fixed camera_caps The code checks a camera_caps struct in various places, but this always points to the same const camera_caps struct. Remove the checks, keeping the code paths which would be taken with the fixed camera caps struct still in place and remove the camera_caps struct itself. Note this completely removes atomisp_pause_buffer_event() because that only ever does something if camera_caps.sensors[0].is_slave is true and that never is true. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../atomisp/include/linux/atomisp_platform.h | 18 --- .../staging/media/atomisp/pci/atomisp_cmd.c | 111 +----------------- .../staging/media/atomisp/pci/atomisp_fops.c | 44 ------- .../media/atomisp/pci/atomisp_gmin_platform.c | 18 --- .../media/atomisp/pci/atomisp_internal.h | 5 +- .../staging/media/atomisp/pci/atomisp_ioctl.c | 81 ++----------- .../media/atomisp/pci/atomisp_subdev.h | 3 - .../staging/media/atomisp/pci/atomisp_v4l2.c | 18 --- 8 files changed, 17 insertions(+), 281 deletions(-) diff --git a/drivers/staging/media/atomisp/include/linux/atomisp_platform.h b/drivers/staging/media/atomisp/include/linux/atomisp_platform.h index 8c65733e0255a..0253661d43320 100644 --- a/drivers/staging/media/atomisp/include/linux/atomisp_platform.h +++ b/drivers/staging/media/atomisp/include/linux/atomisp_platform.h @@ -141,23 +141,6 @@ struct atomisp_platform_data { struct intel_v4l2_subdev_table *subdevs; }; -/* Describe the capacities of one single sensor. */ -struct atomisp_sensor_caps { - /* The number of streams this sensor can output. 
*/ - int stream_num; - bool is_slave; -}; - -/* Describe the capacities of sensors connected to one camera port. */ -struct atomisp_camera_caps { - /* The number of sensors connected to this camera port. */ - int sensor_num; - /* The capacities of each sensor. */ - struct atomisp_sensor_caps sensor[MAX_SENSORS_PER_PORT]; - /* Define whether stream control is required for multiple streams. */ - bool multi_stream_ctrl; -}; - /* * Sensor of external ISP can send multiple steams with different mipi data * type in the same virtual channel. This information needs to come from the @@ -235,7 +218,6 @@ struct camera_mipi_info { }; const struct atomisp_platform_data *atomisp_get_platform_data(void); -const struct atomisp_camera_caps *atomisp_get_default_camera_caps(void); /* API from old platform_camera.h, new CPUID implementation */ #define __IS_SOC(x) (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && \ diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.c b/drivers/staging/media/atomisp/pci/atomisp_cmd.c index 8cc8ee64fb44c..b01cacb8d2a81 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.c +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.c @@ -770,24 +770,6 @@ static struct atomisp_video_pipe *__atomisp_get_pipe( enum ia_css_pipe_id css_pipe_id, enum ia_css_buffer_type buf_type) { - struct atomisp_device *isp = asd->isp; - - if (css_pipe_id == IA_CSS_PIPE_ID_COPY && - isp->inputs[asd->input_curr].camera_caps-> - sensor[asd->sensor_curr].stream_num > 1) { - switch (stream_id) { - case ATOMISP_INPUT_STREAM_PREVIEW: - return &asd->video_out_preview; - case ATOMISP_INPUT_STREAM_POSTVIEW: - return &asd->video_out_vf; - case ATOMISP_INPUT_STREAM_VIDEO: - return &asd->video_out_video_capture; - case ATOMISP_INPUT_STREAM_CAPTURE: - default: - return &asd->video_out_capture; - } - } - /* video is same in online as in continuouscapture mode */ if (asd->vfpp->val == ATOMISP_VFPP_DISABLE_LOWLAT) { /* @@ -5051,12 +5033,7 @@ static void atomisp_check_copy_mode(struct 
atomisp_sub_device *asd, src = atomisp_subdev_get_ffmt(&asd->subdev, NULL, V4L2_SUBDEV_FORMAT_ACTIVE, source_pad); - if ((sink->code == src->code && - sink->width == f->width && - sink->height == f->height) || - ((asd->isp->inputs[asd->input_curr].type == SOC_CAMERA) && - (asd->isp->inputs[asd->input_curr].camera_caps-> - sensor[asd->sensor_curr].stream_num > 1))) + if (sink->code == src->code && sink->width == f->width && sink->height == f->height) asd->copy_mode = true; else asd->copy_mode = false; @@ -5282,58 +5259,7 @@ int atomisp_set_fmt(struct file *file, void *unused, struct v4l2_format *f) f->fmt.pix.height = r.height; } - if (source_pad == ATOMISP_SUBDEV_PAD_SOURCE_PREVIEW && - (asd->isp->inputs[asd->input_curr].type == SOC_CAMERA) && - (asd->isp->inputs[asd->input_curr].camera_caps-> - sensor[asd->sensor_curr].stream_num > 1)) { - /* For M10MO outputing YUV preview images. */ - u16 video_index = - atomisp_source_pad_to_stream_id(asd, - ATOMISP_SUBDEV_PAD_SOURCE_VIDEO); - - ret = atomisp_css_copy_get_output_frame_info(asd, - video_index, &output_info); - if (ret) { - dev_err(isp->dev, - "copy_get_output_frame_info ret %i", ret); - return -EINVAL; - } - if (!asd->yuvpp_mode) { - /* - * If viewfinder was configured into copy_mode, - * we switch to using yuvpp pipe instead. 
- */ - asd->yuvpp_mode = true; - ret = atomisp_css_copy_configure_output( - asd, video_index, 0, 0, 0, 0); - if (ret) { - dev_err(isp->dev, - "failed to disable copy pipe"); - return -EINVAL; - } - ret = atomisp_css_yuvpp_configure_output( - asd, video_index, - output_info.res.width, - output_info.res.height, - output_info.padded_width, - output_info.format); - if (ret) { - dev_err(isp->dev, - "failed to set up yuvpp pipe\n"); - return -EINVAL; - } - atomisp_css_video_enable_online(asd, false); - atomisp_css_preview_enable_online(asd, - ATOMISP_INPUT_STREAM_GENERAL, false); - } - atomisp_css_yuvpp_configure_viewfinder(asd, video_index, - f->fmt.pix.width, f->fmt.pix.height, - format_bridge->planar ? f->fmt.pix.bytesperline - : f->fmt.pix.bytesperline * 8 - / format_bridge->depth, format_bridge->sh_fmt); - atomisp_css_yuvpp_get_viewfinder_frame_info( - asd, video_index, &output_info); - } else if (source_pad == ATOMISP_SUBDEV_PAD_SOURCE_PREVIEW) { + if (source_pad == ATOMISP_SUBDEV_PAD_SOURCE_PREVIEW) { atomisp_css_video_configure_viewfinder(asd, f->fmt.pix.width, f->fmt.pix.height, format_bridge->planar ? 
f->fmt.pix.bytesperline @@ -5918,31 +5844,7 @@ int atomisp_flash_enable(struct atomisp_sub_device *asd, int num_frames) int atomisp_source_pad_to_stream_id(struct atomisp_sub_device *asd, uint16_t source_pad) { - int stream_id; - struct atomisp_device *isp = asd->isp; - - if (isp->inputs[asd->input_curr].camera_caps-> - sensor[asd->sensor_curr].stream_num == 1) - return ATOMISP_INPUT_STREAM_GENERAL; - - switch (source_pad) { - case ATOMISP_SUBDEV_PAD_SOURCE_CAPTURE: - stream_id = ATOMISP_INPUT_STREAM_CAPTURE; - break; - case ATOMISP_SUBDEV_PAD_SOURCE_VF: - stream_id = ATOMISP_INPUT_STREAM_POSTVIEW; - break; - case ATOMISP_SUBDEV_PAD_SOURCE_PREVIEW: - stream_id = ATOMISP_INPUT_STREAM_PREVIEW; - break; - case ATOMISP_SUBDEV_PAD_SOURCE_VIDEO: - stream_id = ATOMISP_INPUT_STREAM_VIDEO; - break; - default: - stream_id = ATOMISP_INPUT_STREAM_GENERAL; - } - - return stream_id; + return ATOMISP_INPUT_STREAM_GENERAL; } bool atomisp_is_vf_pipe(struct atomisp_video_pipe *pipe) @@ -6217,13 +6119,6 @@ int atomisp_get_invalid_frame_num(struct video_device *vdev, struct ia_css_pipe_info p_info; int ret; - if (asd->isp->inputs[asd->input_curr].camera_caps-> - sensor[asd->sensor_curr].stream_num > 1) { - /* External ISP */ - *invalid_frame_num = 0; - return 0; - } - pipe_id = atomisp_get_pipe_id(pipe); if (!asd->stream_env[ATOMISP_INPUT_STREAM_GENERAL].pipes[pipe_id]) { dev_warn(asd->isp->dev, diff --git a/drivers/staging/media/atomisp/pci/atomisp_fops.c b/drivers/staging/media/atomisp/pci/atomisp_fops.c index 047e1180e35f4..84a84e0cdeef7 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_fops.c +++ b/drivers/staging/media/atomisp/pci/atomisp_fops.c @@ -369,45 +369,6 @@ static int atomisp_get_css_buf_type(struct atomisp_sub_device *asd, return IA_CSS_BUFFER_TYPE_VF_OUTPUT_FRAME; } -static int atomisp_qbuffers_to_css_for_all_pipes(struct atomisp_sub_device *asd) -{ - enum ia_css_buffer_type buf_type; - enum ia_css_pipe_id css_capture_pipe_id = IA_CSS_PIPE_ID_COPY; - enum 
ia_css_pipe_id css_preview_pipe_id = IA_CSS_PIPE_ID_COPY; - enum ia_css_pipe_id css_video_pipe_id = IA_CSS_PIPE_ID_COPY; - enum atomisp_input_stream_id input_stream_id; - struct atomisp_video_pipe *capture_pipe; - struct atomisp_video_pipe *preview_pipe; - struct atomisp_video_pipe *video_pipe; - - capture_pipe = &asd->video_out_capture; - preview_pipe = &asd->video_out_preview; - video_pipe = &asd->video_out_video_capture; - - buf_type = atomisp_get_css_buf_type( - asd, css_preview_pipe_id, - atomisp_subdev_source_pad(&preview_pipe->vdev)); - input_stream_id = ATOMISP_INPUT_STREAM_PREVIEW; - atomisp_q_video_buffers_to_css(asd, preview_pipe, - input_stream_id, - buf_type, css_preview_pipe_id); - - buf_type = atomisp_get_css_buf_type(asd, css_capture_pipe_id, - atomisp_subdev_source_pad(&capture_pipe->vdev)); - input_stream_id = ATOMISP_INPUT_STREAM_GENERAL; - atomisp_q_video_buffers_to_css(asd, capture_pipe, - input_stream_id, - buf_type, css_capture_pipe_id); - - buf_type = atomisp_get_css_buf_type(asd, css_video_pipe_id, - atomisp_subdev_source_pad(&video_pipe->vdev)); - input_stream_id = ATOMISP_INPUT_STREAM_VIDEO; - atomisp_q_video_buffers_to_css(asd, video_pipe, - input_stream_id, - buf_type, css_video_pipe_id); - return 0; -} - /* queue all available buffers to css */ int atomisp_qbuffers_to_css(struct atomisp_sub_device *asd) { @@ -423,11 +384,6 @@ int atomisp_qbuffers_to_css(struct atomisp_sub_device *asd) bool raw_mode = atomisp_is_mbuscode_raw( asd->fmt[asd->capture_pad].fmt.code); - if (asd->isp->inputs[asd->input_curr].camera_caps-> - sensor[asd->sensor_curr].stream_num == 2 && - !asd->yuvpp_mode) - return atomisp_qbuffers_to_css_for_all_pipes(asd); - if (asd->vfpp->val == ATOMISP_VFPP_DISABLE_SCALER) { video_pipe = &asd->video_out_video_capture; css_video_pipe_id = IA_CSS_PIPE_ID_VIDEO; diff --git a/drivers/staging/media/atomisp/pci/atomisp_gmin_platform.c b/drivers/staging/media/atomisp/pci/atomisp_gmin_platform.c index f7fc5137199cf..254e8c97f71f7 
100644 --- a/drivers/staging/media/atomisp/pci/atomisp_gmin_platform.c +++ b/drivers/staging/media/atomisp/pci/atomisp_gmin_platform.c @@ -134,24 +134,6 @@ static DEFINE_MUTEX(vcm_lock); static struct gmin_subdev *find_gmin_subdev(struct v4l2_subdev *subdev); -/* - * Legacy/stub behavior copied from upstream platform_camera.c. The - * atomisp driver relies on these values being non-NULL in a few - * places, even though they are hard-coded in all current - * implementations. - */ -const struct atomisp_camera_caps *atomisp_get_default_camera_caps(void) -{ - static const struct atomisp_camera_caps caps = { - .sensor_num = 1, - .sensor = { - { .stream_num = 1, }, - }, - }; - return &caps; -} -EXPORT_SYMBOL_GPL(atomisp_get_default_camera_caps); - const struct atomisp_platform_data *atomisp_get_platform_data(void) { return &pdata; diff --git a/drivers/staging/media/atomisp/pci/atomisp_internal.h b/drivers/staging/media/atomisp/pci/atomisp_internal.h index f3ef840c640a3..d9d158cdf09e9 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_internal.h +++ b/drivers/staging/media/atomisp/pci/atomisp_internal.h @@ -127,9 +127,7 @@ * Moorefield/Baytrail platform.
*/ #define ATOMISP_SOC_CAMERA(asd) \ - (asd->isp->inputs[asd->input_curr].type == SOC_CAMERA \ - && asd->isp->inputs[asd->input_curr].camera_caps-> \ - sensor[asd->sensor_curr].stream_num == 1) + (asd->isp->inputs[asd->input_curr].type == SOC_CAMERA) #define ATOMISP_USE_YUVPP(asd) \ (ATOMISP_SOC_CAMERA(asd) && ATOMISP_CSS_SUPPORT_YUVPP && \ @@ -162,7 +160,6 @@ struct atomisp_input_subdev { */ struct atomisp_sub_device *asd; - const struct atomisp_camera_caps *camera_caps; int sensor_index; }; diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index 77c0d55ab409c..cbbb25d3e5fe3 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -1431,16 +1431,6 @@ enum ia_css_pipe_id atomisp_get_css_pipe_id(struct atomisp_sub_device *asd) static unsigned int atomisp_sensor_start_stream(struct atomisp_sub_device *asd) { - struct atomisp_device *isp = asd->isp; - - if (isp->inputs[asd->input_curr].camera_caps-> - sensor[asd->sensor_curr].stream_num > 1) { - if (asd->high_speed_mode) - return 1; - else - return 2; - } - if (asd->vfpp->val != ATOMISP_VFPP_ENABLE || asd->copy_mode) return 1; @@ -1459,31 +1449,15 @@ static unsigned int atomisp_sensor_start_stream(struct atomisp_sub_device *asd) int atomisp_stream_on_master_slave_sensor(struct atomisp_device *isp, bool isp_timeout) { - unsigned int master = -1, slave = -1, delay_slave = 0; - int i, ret; - - /* - * ISP only support 2 streams now so ignore multiple master/slave - * case to reduce the delay between 2 stream_on calls. 
- */ - for (i = 0; i < isp->num_of_streams; i++) { - int sensor_index = isp->asd[i].input_curr; - - if (isp->inputs[sensor_index].camera_caps-> - sensor[isp->asd[i].sensor_curr].is_slave) - slave = sensor_index; - else - master = sensor_index; - } + unsigned int master, slave, delay_slave = 0; + int ret; - if (master == -1 || slave == -1) { - master = ATOMISP_DEPTH_DEFAULT_MASTER_SENSOR; - slave = ATOMISP_DEPTH_DEFAULT_SLAVE_SENSOR; - dev_warn(isp->dev, - "depth mode use default master=%s.slave=%s.\n", - isp->inputs[master].camera->name, - isp->inputs[slave].camera->name); - } + master = ATOMISP_DEPTH_DEFAULT_MASTER_SENSOR; + slave = ATOMISP_DEPTH_DEFAULT_SLAVE_SENSOR; + dev_warn(isp->dev, + "depth mode use default master=%s.slave=%s.\n", + isp->inputs[master].camera->name, + isp->inputs[slave].camera->name); ret = v4l2_subdev_call(isp->inputs[master].camera, core, ioctl, ATOMISP_IOC_G_DEPTH_SYNC_COMP, @@ -1517,24 +1491,6 @@ int atomisp_stream_on_master_slave_sensor(struct atomisp_device *isp, return 0; } -static void atomisp_pause_buffer_event(struct atomisp_device *isp) -{ - struct v4l2_event event = {0}; - int i; - - event.type = V4L2_EVENT_ATOMISP_PAUSE_BUFFER; - - for (i = 0; i < isp->num_of_streams; i++) { - int sensor_index = isp->asd[i].input_curr; - - if (isp->inputs[sensor_index].camera_caps-> - sensor[isp->asd[i].sensor_curr].is_slave) { - v4l2_event_queue(isp->asd[i].subdev.devnode, &event); - break; - } - } -} - /* Input system HW workaround */ /* Input system address translation corrupts burst during */ /* invalidate. SW workaround for this is to set burst length */ @@ -1608,8 +1564,7 @@ static int atomisp_streamon(struct file *file, void *fh, /* Reset pending capture request count. 
*/ asd->pending_capture_request = 0; - if ((atomisp_subdev_streaming_count(asd) > sensor_start_stream) && - (!isp->inputs[asd->input_curr].camera_caps->multi_stream_ctrl)) { + if (atomisp_subdev_streaming_count(asd) > sensor_start_stream) { /* trigger still capture */ if (asd->continuous_mode->val && atomisp_subdev_source_pad(vdev) @@ -1651,9 +1606,6 @@ static int atomisp_streamon(struct file *file, void *fh, asd->params.offline_parm.offset); if (ret) return -EINVAL; - - if (asd->depth_mode->val) - atomisp_pause_buffer_event(isp); } } atomisp_qbuffers_to_css(asd); @@ -1809,17 +1761,10 @@ int atomisp_streamoff(struct file *file, void *fh, enum v4l2_buf_type type) * do only videobuf_streamoff for capture & vf pipes in * case of continuous capture */ - if ((asd->continuous_mode->val || - isp->inputs[asd->input_curr].camera_caps->multi_stream_ctrl) && - atomisp_subdev_source_pad(vdev) != - ATOMISP_SUBDEV_PAD_SOURCE_PREVIEW && - atomisp_subdev_source_pad(vdev) != - ATOMISP_SUBDEV_PAD_SOURCE_VIDEO) { - if (isp->inputs[asd->input_curr].camera_caps->multi_stream_ctrl) { - v4l2_subdev_call(isp->inputs[asd->input_curr].camera, - video, s_stream, 0); - } else if (atomisp_subdev_source_pad(vdev) - == ATOMISP_SUBDEV_PAD_SOURCE_CAPTURE) { + if (asd->continuous_mode->val && + atomisp_subdev_source_pad(vdev) != ATOMISP_SUBDEV_PAD_SOURCE_PREVIEW && + atomisp_subdev_source_pad(vdev) != ATOMISP_SUBDEV_PAD_SOURCE_VIDEO) { + if (atomisp_subdev_source_pad(vdev) == ATOMISP_SUBDEV_PAD_SOURCE_CAPTURE) { /* stop continuous still capture if needed */ if (asd->params.offline_parm.num_captures == -1) atomisp_css_offline_capture_configure(asd, diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.h b/drivers/staging/media/atomisp/pci/atomisp_subdev.h index 43e6a1d1e4109..a1f4da35235d6 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.h +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.h @@ -313,9 +313,6 @@ struct atomisp_sub_device { /* This field specifies which camera 
(v4l2 input) is selected. */ int input_curr; - /* This field specifies which sensor is being selected when there - are multiple sensors connected to the same MIPI port. */ - int sensor_curr; atomic_t sof_count; atomic_t sequence; /* Sequence value that is assigned to buffer. */ diff --git a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c index 026ff3ca5c04e..d5bb9906ca6f2 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c +++ b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c @@ -1007,7 +1007,6 @@ static int atomisp_subdev_probe(struct atomisp_device *isp) &subdevs->v4l2_subdev.board_info; struct i2c_adapter *adapter = i2c_get_adapter(subdevs->v4l2_subdev.i2c_adapter_id); - int sensor_num, i; dev_info(isp->dev, "Probing Subdev %s\n", board_info->type); @@ -1066,22 +1065,7 @@ static int atomisp_subdev_probe(struct atomisp_device *isp) * pixel_format. */ isp->inputs[isp->input_cnt].frame_size.pixel_format = 0; - isp->inputs[isp->input_cnt].camera_caps = - atomisp_get_default_camera_caps(); - sensor_num = isp->inputs[isp->input_cnt] - .camera_caps->sensor_num; isp->input_cnt++; - for (i = 1; i < sensor_num; i++) { - if (isp->input_cnt >= ATOM_ISP_MAX_INPUTS) { - dev_warn(isp->dev, - "atomisp inputs out of range\n"); - break; - } - isp->inputs[isp->input_cnt] = - isp->inputs[isp->input_cnt - 1]; - isp->inputs[isp->input_cnt].sensor_index = i; - isp->input_cnt++; - } break; case CAMERA_MOTOR: if (isp->motor) { @@ -1239,8 +1223,6 @@ static int atomisp_register_entities(struct atomisp_device *isp) "TPG detected, camera_cnt: %d\n", isp->input_cnt); isp->inputs[isp->input_cnt].type = TEST_PATTERN; isp->inputs[isp->input_cnt].port = -1; - isp->inputs[isp->input_cnt].camera_caps = - atomisp_get_default_camera_caps(); isp->inputs[isp->input_cnt++].camera = &isp->tpg.sd; } else { dev_warn(isp->dev, "too many atomisp inputs, TPG ignored.\n"); -- GitLab From 1e32f6ea43c44c256a3b63f60223d30d46f9d4b1 Mon Sep 17 00:00:00 
2001 From: Hans de Goede Date: Fri, 9 Sep 2022 23:46:39 +0200 Subject: [PATCH 0586/2223] media: atomisp: Remove atomisp_source_pad_to_stream_id() atomisp_source_pad_to_stream_id() returns ATOMISP_INPUT_STREAM_GENERAL unconditionally now. Drop it and directly use ATOMISP_INPUT_STREAM_GENERAL in its callers. Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../staging/media/atomisp/pci/atomisp_cmd.c | 40 +++++++------------ .../staging/media/atomisp/pci/atomisp_cmd.h | 2 - .../media/atomisp/pci/atomisp_compat_css20.c | 7 ++-- .../staging/media/atomisp/pci/atomisp_ioctl.c | 5 +-- .../media/atomisp/pci/atomisp_subdev.c | 29 +++++--------- 5 files changed, 29 insertions(+), 54 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.c b/drivers/staging/media/atomisp/pci/atomisp_cmd.c index b01cacb8d2a81..c72d0e3446710 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.c +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.c @@ -1938,7 +1938,6 @@ static void atomisp_update_grid_info(struct atomisp_sub_device *asd, { struct atomisp_device *isp = asd->isp; int err; - u16 stream_id = atomisp_source_pad_to_stream_id(asd, source_pad); if (atomisp_css_get_grid_info(asd, pipe_id, source_pad)) return; @@ -1947,7 +1946,7 @@ static void atomisp_update_grid_info(struct atomisp_sub_device *asd, the grid size. 
*/ atomisp_css_free_stat_buffers(asd); - err = atomisp_alloc_css_stat_bufs(asd, stream_id); + err = atomisp_alloc_css_stat_bufs(asd, ATOMISP_INPUT_STREAM_GENERAL); if (err) { dev_err(isp->dev, "stat_buf allocate error\n"); goto err; @@ -4431,8 +4430,6 @@ int atomisp_try_fmt(struct video_device *vdev, struct v4l2_pix_format *f, const struct atomisp_format_bridge *fmt; struct atomisp_input_stream_info *stream_info = (struct atomisp_input_stream_info *)snr_mbus_fmt->reserved; - u16 stream_index; - int source_pad = atomisp_subdev_source_pad(vdev); int ret; if (!asd) { @@ -4444,7 +4441,6 @@ int atomisp_try_fmt(struct video_device *vdev, struct v4l2_pix_format *f, if (!isp->inputs[asd->input_curr].camera) return -EINVAL; - stream_index = atomisp_source_pad_to_stream_id(asd, source_pad); fmt = atomisp_get_format_bridge(f->pixelformat); if (!fmt) { dev_err(isp->dev, "unsupported pixelformat!\n"); @@ -4458,7 +4454,7 @@ int atomisp_try_fmt(struct video_device *vdev, struct v4l2_pix_format *f, snr_mbus_fmt->width = f->width; snr_mbus_fmt->height = f->height; - __atomisp_init_stream_info(stream_index, stream_info); + __atomisp_init_stream_info(ATOMISP_INPUT_STREAM_GENERAL, stream_info); dev_dbg(isp->dev, "try_mbus_fmt: asking for %ux%u\n", snr_mbus_fmt->width, snr_mbus_fmt->height); @@ -4743,7 +4739,6 @@ static int atomisp_set_fmt_to_isp(struct video_device *vdev, int (*configure_pp_input)(struct atomisp_sub_device *asd, unsigned int width, unsigned int height) = configure_pp_input_nop; - u16 stream_index; const struct atomisp_in_fmt_conv *fc; int ret, i; @@ -4752,7 +4747,6 @@ static int atomisp_set_fmt_to_isp(struct video_device *vdev, __func__, vdev->name); return -EINVAL; } - stream_index = atomisp_source_pad_to_stream_id(asd, source_pad); v4l2_fh_init(&fh.vfh, vdev); @@ -4772,7 +4766,7 @@ static int atomisp_set_fmt_to_isp(struct video_device *vdev, dev_err(isp->dev, "mipi_info is NULL\n"); return -EINVAL; } - if (atomisp_set_sensor_mipi_to_isp(asd, stream_index, + if 
(atomisp_set_sensor_mipi_to_isp(asd, ATOMISP_INPUT_STREAM_GENERAL, mipi_info)) return -EINVAL; fc = atomisp_find_in_fmt_conv_by_atomisp_in_fmt( @@ -4856,7 +4850,7 @@ static int atomisp_set_fmt_to_isp(struct video_device *vdev, /* ISP2401 new input system need to use copy pipe */ if (asd->copy_mode) { pipe_id = IA_CSS_PIPE_ID_COPY; - atomisp_css_capture_enable_online(asd, stream_index, false); + atomisp_css_capture_enable_online(asd, ATOMISP_INPUT_STREAM_GENERAL, false); } else if (asd->vfpp->val == ATOMISP_VFPP_DISABLE_SCALER) { /* video same in continuouscapture and online modes */ configure_output = atomisp_css_video_configure_output; @@ -4888,7 +4882,9 @@ static int atomisp_set_fmt_to_isp(struct video_device *vdev, pipe_id = IA_CSS_PIPE_ID_CAPTURE; atomisp_update_capture_mode(asd); - atomisp_css_capture_enable_online(asd, stream_index, false); + atomisp_css_capture_enable_online(asd, + ATOMISP_INPUT_STREAM_GENERAL, + false); } } } else if (source_pad == ATOMISP_SUBDEV_PAD_SOURCE_PREVIEW) { @@ -4913,7 +4909,7 @@ static int atomisp_set_fmt_to_isp(struct video_device *vdev, if (!asd->continuous_mode->val) /* in case of ANR, force capture pipe to offline mode */ - atomisp_css_capture_enable_online(asd, stream_index, + atomisp_css_capture_enable_online(asd, ATOMISP_INPUT_STREAM_GENERAL, asd->params.low_light ? false : asd->params.online_process); @@ -4944,7 +4940,7 @@ static int atomisp_set_fmt_to_isp(struct video_device *vdev, pipe_id = IA_CSS_PIPE_ID_YUVPP; if (asd->copy_mode) - ret = atomisp_css_copy_configure_output(asd, stream_index, + ret = atomisp_css_copy_configure_output(asd, ATOMISP_INPUT_STREAM_GENERAL, pix->width, pix->height, format->planar ? 
pix->bytesperline : pix->bytesperline * 8 / format->depth, @@ -4968,8 +4964,9 @@ static int atomisp_set_fmt_to_isp(struct video_device *vdev, return -EINVAL; } if (asd->copy_mode) - ret = atomisp_css_copy_get_output_frame_info(asd, stream_index, - output_info); + ret = atomisp_css_copy_get_output_frame_info(asd, + ATOMISP_INPUT_STREAM_GENERAL, + output_info); else ret = get_frame_info(asd, output_info); if (ret) { @@ -5061,7 +5058,6 @@ static int atomisp_set_fmt_to_snr(struct video_device *vdev, struct atomisp_device *isp; struct atomisp_input_stream_info *stream_info = (struct atomisp_input_stream_info *)ffmt->reserved; - u16 stream_index = ATOMISP_INPUT_STREAM_GENERAL; int source_pad = atomisp_subdev_source_pad(vdev); struct v4l2_subdev_fh fh; int ret; @@ -5076,8 +5072,6 @@ static int atomisp_set_fmt_to_snr(struct video_device *vdev, v4l2_fh_init(&fh.vfh, vdev); - stream_index = atomisp_source_pad_to_stream_id(asd, source_pad); - format = atomisp_get_format_bridge(pixelformat); if (!format) return -EINVAL; @@ -5090,7 +5084,7 @@ static int atomisp_set_fmt_to_snr(struct video_device *vdev, ffmt->width, ffmt->height, padding_w, padding_h, dvs_env_w, dvs_env_h); - __atomisp_init_stream_info(stream_index, stream_info); + __atomisp_init_stream_info(ATOMISP_INPUT_STREAM_GENERAL, stream_info); req_ffmt = ffmt; @@ -5122,7 +5116,7 @@ static int atomisp_set_fmt_to_snr(struct video_device *vdev, if (ret) return ret; - __atomisp_update_stream_env(asd, stream_index, stream_info); + __atomisp_update_stream_env(asd, ATOMISP_INPUT_STREAM_GENERAL, stream_info); dev_dbg(isp->dev, "sensor width: %d, height: %d\n", ffmt->width, ffmt->height); @@ -5841,12 +5835,6 @@ int atomisp_flash_enable(struct atomisp_sub_device *asd, int num_frames) return 0; } -int atomisp_source_pad_to_stream_id(struct atomisp_sub_device *asd, - uint16_t source_pad) -{ - return ATOMISP_INPUT_STREAM_GENERAL; -} - bool atomisp_is_vf_pipe(struct atomisp_video_pipe *pipe) { struct atomisp_sub_device *asd = 
pipe->asd; diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.h b/drivers/staging/media/atomisp/pci/atomisp_cmd.h index ba3433a635959..c9f92f1326b61 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_cmd.h +++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.h @@ -297,8 +297,6 @@ void atomisp_buf_done(struct atomisp_sub_device *asd, int error, bool q_buffers, enum atomisp_input_stream_id stream_id); void atomisp_css_flush(struct atomisp_device *isp); -int atomisp_source_pad_to_stream_id(struct atomisp_sub_device *asd, - uint16_t source_pad); /* Events. Only one event has to be exported for now. */ void atomisp_eof_event(struct atomisp_sub_device *asd, uint8_t exp_id); diff --git a/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c b/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c index 64dd63ddc29c7..fdc05548d9723 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c +++ b/drivers/staging/media/atomisp/pci/atomisp_compat_css20.c @@ -1427,7 +1427,6 @@ int atomisp_css_get_grid_info(struct atomisp_sub_device *asd, struct ia_css_pipe_info p_info; struct ia_css_grid_info old_info; struct atomisp_device *isp = asd->isp; - int stream_index = atomisp_source_pad_to_stream_id(asd, source_pad); int md_width = asd->stream_env[ATOMISP_INPUT_STREAM_GENERAL]. 
stream_config.metadata_config.resolution.width; @@ -1435,7 +1434,7 @@ int atomisp_css_get_grid_info(struct atomisp_sub_device *asd, memset(&old_info, 0, sizeof(struct ia_css_grid_info)); if (ia_css_pipe_get_info( - asd->stream_env[stream_index].pipes[pipe_id], + asd->stream_env[ATOMISP_INPUT_STREAM_GENERAL].pipes[pipe_id], &p_info) != 0) { dev_err(isp->dev, "ia_css_pipe_get_info failed\n"); return -EINVAL; @@ -2680,11 +2679,11 @@ int atomisp_get_css_frame_info(struct atomisp_sub_device *asd, struct atomisp_device *isp = asd->isp; if (ATOMISP_SOC_CAMERA(asd)) { - stream_index = atomisp_source_pad_to_stream_id(asd, source_pad); + stream_index = ATOMISP_INPUT_STREAM_GENERAL; } else { stream_index = (pipe_index == IA_CSS_PIPE_ID_YUVPP) ? ATOMISP_INPUT_STREAM_VIDEO : - atomisp_source_pad_to_stream_id(asd, source_pad); + ATOMISP_INPUT_STREAM_GENERAL; } if (0 != ia_css_pipe_get_info(asd->stream_env[stream_index] diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index cbbb25d3e5fe3..aefa7c07242ab 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -1112,11 +1112,8 @@ int atomisp_reqbufs(struct file *file, void *fh, struct v4l2_requestbuffers *req struct ia_css_frame *frame; struct videobuf_vmalloc_memory *vm_mem; u16 source_pad = atomisp_subdev_source_pad(vdev); - u16 stream_id; int ret = 0, i = 0; - stream_id = atomisp_source_pad_to_stream_id(asd, source_pad); - if (req->count == 0) { mutex_lock(&pipe->capq.vb_lock); if (!list_empty(&pipe->capq.stream)) @@ -1137,7 +1134,7 @@ int atomisp_reqbufs(struct file *file, void *fh, struct v4l2_requestbuffers *req if (ret) return ret; - atomisp_alloc_css_stat_bufs(asd, stream_id); + atomisp_alloc_css_stat_bufs(asd, ATOMISP_INPUT_STREAM_GENERAL); /* * for user pointer type, buffers are not really allocated here, diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.c 
b/drivers/staging/media/atomisp/pci/atomisp_subdev.c index 88bf693f4c50a..847dfee6ad78e 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.c +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.c @@ -373,16 +373,12 @@ int atomisp_subdev_set_selection(struct v4l2_subdev *sd, struct atomisp_sub_device *isp_sd = v4l2_get_subdevdata(sd); struct atomisp_device *isp = isp_sd->isp; struct v4l2_mbus_framefmt *ffmt[ATOMISP_SUBDEV_PADS_NUM]; - u16 vdev_pad = atomisp_subdev_source_pad(sd->devnode); struct v4l2_rect *crop[ATOMISP_SUBDEV_PADS_NUM], *comp[ATOMISP_SUBDEV_PADS_NUM]; - enum atomisp_input_stream_id stream_id; unsigned int i; unsigned int padding_w = pad_w; unsigned int padding_h = pad_h; - stream_id = atomisp_source_pad_to_stream_id(isp_sd, vdev_pad); - isp_get_fmt_rect(sd, sd_state, which, ffmt, crop, comp); dev_dbg(isp->dev, @@ -478,9 +474,10 @@ int atomisp_subdev_set_selection(struct v4l2_subdev *sd, dvs_w = dvs_h = 0; } atomisp_css_video_set_dis_envelope(isp_sd, dvs_w, dvs_h); - atomisp_css_input_set_effective_resolution(isp_sd, stream_id, - crop[pad]->width, crop[pad]->height); - + atomisp_css_input_set_effective_resolution(isp_sd, + ATOMISP_INPUT_STREAM_GENERAL, + crop[pad]->width, + crop[pad]->height); break; } case ATOMISP_SUBDEV_PAD_SOURCE_CAPTURE: @@ -523,14 +520,14 @@ int atomisp_subdev_set_selection(struct v4l2_subdev *sd, if (r->width * crop[ATOMISP_SUBDEV_PAD_SINK]->height < crop[ATOMISP_SUBDEV_PAD_SINK]->width * r->height) atomisp_css_input_set_effective_resolution(isp_sd, - stream_id, + ATOMISP_INPUT_STREAM_GENERAL, rounddown(crop[ATOMISP_SUBDEV_PAD_SINK]-> height * r->width / r->height, ATOM_ISP_STEP_WIDTH), crop[ATOMISP_SUBDEV_PAD_SINK]->height); else atomisp_css_input_set_effective_resolution(isp_sd, - stream_id, + ATOMISP_INPUT_STREAM_GENERAL, crop[ATOMISP_SUBDEV_PAD_SINK]->width, rounddown(crop[ATOMISP_SUBDEV_PAD_SINK]-> width * r->height / r->width, @@ -620,16 +617,12 @@ void atomisp_subdev_set_ffmt(struct v4l2_subdev *sd, struct 
atomisp_device *isp = isp_sd->isp; struct v4l2_mbus_framefmt *__ffmt = atomisp_subdev_get_ffmt(sd, sd_state, which, pad); - u16 vdev_pad = atomisp_subdev_source_pad(sd->devnode); - enum atomisp_input_stream_id stream_id; dev_dbg(isp->dev, "ffmt: pad %s w %d h %d code 0x%8.8x which %s\n", atomisp_pad_str(pad), ffmt->width, ffmt->height, ffmt->code, which == V4L2_SUBDEV_FORMAT_TRY ? "V4L2_SUBDEV_FORMAT_TRY" : "V4L2_SUBDEV_FORMAT_ACTIVE"); - stream_id = atomisp_source_pad_to_stream_id(isp_sd, vdev_pad); - switch (pad) { case ATOMISP_SUBDEV_PAD_SINK: { const struct atomisp_in_fmt_conv *fc = @@ -649,15 +642,15 @@ void atomisp_subdev_set_ffmt(struct v4l2_subdev *sd, if (which == V4L2_SUBDEV_FORMAT_ACTIVE) { atomisp_css_input_set_resolution(isp_sd, - stream_id, ffmt); + ATOMISP_INPUT_STREAM_GENERAL, ffmt); atomisp_css_input_set_binning_factor(isp_sd, - stream_id, + ATOMISP_INPUT_STREAM_GENERAL, atomisp_get_sensor_bin_factor(isp_sd)); - atomisp_css_input_set_bayer_order(isp_sd, stream_id, + atomisp_css_input_set_bayer_order(isp_sd, ATOMISP_INPUT_STREAM_GENERAL, fc->bayer_order); - atomisp_css_input_set_format(isp_sd, stream_id, + atomisp_css_input_set_format(isp_sd, ATOMISP_INPUT_STREAM_GENERAL, fc->atomisp_in_fmt); - atomisp_css_set_default_isys_config(isp_sd, stream_id, + atomisp_css_set_default_isys_config(isp_sd, ATOMISP_INPUT_STREAM_GENERAL, ffmt); } -- GitLab From df383edffd2e3af1d0f4df48f248efcb49c58f79 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Sat, 30 Jul 2022 18:20:27 +0200 Subject: [PATCH 0587/2223] media: atomisp_gmin_platform: Unexport and split camera_sensor_csi() The camera_sensor_csi() is not used outside the module, hence make it static. While at it, split it to _alloc() and _free() to clearly show the idea behind the last parameter @flag that is passed to gmin_csi_cfg(). 
Link: https://lore.kernel.org/r/20220730162027.1011-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Sakari Ailus Tested-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- .../include/linux/atomisp_gmin_platform.h | 2 - .../media/atomisp/pci/atomisp_gmin_platform.c | 68 ++++++++++--------- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/drivers/staging/media/atomisp/include/linux/atomisp_gmin_platform.h b/drivers/staging/media/atomisp/include/linux/atomisp_gmin_platform.h index 58e0ea5355a3b..5463d11d4295e 100644 --- a/drivers/staging/media/atomisp/include/linux/atomisp_gmin_platform.h +++ b/drivers/staging/media/atomisp/include/linux/atomisp_gmin_platform.h @@ -26,8 +26,6 @@ struct v4l2_subdev *atomisp_gmin_find_subdev(struct i2c_adapter *adapter, int atomisp_gmin_remove_subdev(struct v4l2_subdev *sd); int gmin_get_var_int(struct device *dev, bool is_gmin, const char *var, int def); -int camera_sensor_csi(struct v4l2_subdev *sd, u32 port, - u32 lanes, u32 format, u32 bayer_order, int flag); struct camera_sensor_platform_data * gmin_camera_platform_data( struct v4l2_subdev *subdev, diff --git a/drivers/staging/media/atomisp/pci/atomisp_gmin_platform.c b/drivers/staging/media/atomisp/pci/atomisp_gmin_platform.c index 254e8c97f71f7..3d41fab661cf0 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_gmin_platform.c +++ b/drivers/staging/media/atomisp/pci/atomisp_gmin_platform.c @@ -1048,6 +1048,38 @@ static int gmin_flisclk_ctrl(struct v4l2_subdev *subdev, int on) return ret; } +static int camera_sensor_csi_alloc(struct v4l2_subdev *sd, u32 port, u32 lanes, + u32 format, u32 bayer_order) +{ + struct i2c_client *client = v4l2_get_subdevdata(sd); + struct camera_mipi_info *csi; + + csi = kzalloc(sizeof(*csi), GFP_KERNEL); + if (!csi) + return -ENOMEM; + + csi->port = port; + csi->num_lanes = lanes; + csi->input_format = format; + csi->raw_bayer_order = bayer_order; + 
v4l2_set_subdev_hostdata(sd, csi); + csi->metadata_format = ATOMISP_INPUT_FORMAT_EMBEDDED; + csi->metadata_effective_width = NULL; + dev_info(&client->dev, + "camera pdata: port: %d lanes: %d order: %8.8x\n", + port, lanes, bayer_order); + + return 0; +} + +static void camera_sensor_csi_free(struct v4l2_subdev *sd) +{ + struct camera_mipi_info *csi; + + csi = v4l2_get_subdev_hostdata(sd); + kfree(csi); +} + static int gmin_csi_cfg(struct v4l2_subdev *sd, int flag) { struct i2c_client *client = v4l2_get_subdevdata(sd); @@ -1056,8 +1088,11 @@ static int gmin_csi_cfg(struct v4l2_subdev *sd, int flag) if (!client || !gs) return -ENODEV; - return camera_sensor_csi(sd, gs->csi_port, gs->csi_lanes, - gs->csi_fmt, gs->csi_bayer, flag); + if (flag) + return camera_sensor_csi_alloc(sd, gs->csi_port, gs->csi_lanes, + gs->csi_fmt, gs->csi_bayer); + camera_sensor_csi_free(sd); + return 0; } static struct camera_vcm_control *gmin_get_vcm_ctrl(struct v4l2_subdev *subdev, @@ -1340,35 +1375,6 @@ int gmin_get_var_int(struct device *dev, bool is_gmin, const char *var, int def) } EXPORT_SYMBOL_GPL(gmin_get_var_int); -int camera_sensor_csi(struct v4l2_subdev *sd, u32 port, - u32 lanes, u32 format, u32 bayer_order, int flag) -{ - struct i2c_client *client = v4l2_get_subdevdata(sd); - struct camera_mipi_info *csi = NULL; - - if (flag) { - csi = kzalloc(sizeof(*csi), GFP_KERNEL); - if (!csi) - return -ENOMEM; - csi->port = port; - csi->num_lanes = lanes; - csi->input_format = format; - csi->raw_bayer_order = bayer_order; - v4l2_set_subdev_hostdata(sd, (void *)csi); - csi->metadata_format = ATOMISP_INPUT_FORMAT_EMBEDDED; - csi->metadata_effective_width = NULL; - dev_info(&client->dev, - "camera pdata: port: %d lanes: %d order: %8.8x\n", - port, lanes, bayer_order); - } else { - csi = v4l2_get_subdev_hostdata(sd); - kfree(csi); - } - - return 0; -} -EXPORT_SYMBOL_GPL(camera_sensor_csi); - /* PCI quirk: The BYT ISP advertises PCI runtime PM but it doesn't * work. 
Disable so the kernel framework doesn't hang the device * trying. The driver itself does direct calls to the PUNIT to manage -- GitLab From a90bc000770c3a745fd26d62c89b1b20ebc0e145 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sat, 24 Sep 2022 11:10:22 +0200 Subject: [PATCH 0588/2223] media: atomisp: don't store an unused sink data on a var Fixes this Werror breakage: drivers/staging/media/atomisp/pci/atomisp_ioctl.c: In function 'atomisp_streamon': drivers/staging/media/atomisp/pci/atomisp_ioctl.c:1714:44: error: variable 'sink' set but not used [-Werror=unused-but-set-variable] 1714 | struct v4l2_mbus_framefmt *sink; | ^~~~ Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/atomisp/pci/atomisp_ioctl.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c index aefa7c07242ab..0ddb0ed42dd95 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_ioctl.c +++ b/drivers/staging/media/atomisp/pci/atomisp_ioctl.c @@ -1711,11 +1711,9 @@ start_sensor: start_delay_wq: if (asd->continuous_mode->val) { - struct v4l2_mbus_framefmt *sink; - - sink = atomisp_subdev_get_ffmt(&asd->subdev, NULL, - V4L2_SUBDEV_FORMAT_ACTIVE, - ATOMISP_SUBDEV_PAD_SINK); + atomisp_subdev_get_ffmt(&asd->subdev, NULL, + V4L2_SUBDEV_FORMAT_ACTIVE, + ATOMISP_SUBDEV_PAD_SINK); reinit_completion(&asd->init_done); asd->delayed_init = ATOMISP_DELAYED_INIT_QUEUED; -- GitLab From 466c1e6d05003707e8baa16668e7bc287d875d5e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 23 Aug 2022 09:42:01 +0200 Subject: [PATCH 0589/2223] media: sunxi: Fix some error handling path of sun8i_a83t_mipi_csi2_probe() Release some resources in the error handling path of the probe and of sun8i_a83t_mipi_csi2_resources_setup(), as already done in the remove function. 
Fixes: 576d196c522b ("media: sunxi: Add support for the A83T MIPI CSI-2 controller") Signed-off-by: Christophe JAILLET Acked-by: Paul Kocialkowski Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../sun8i_a83t_mipi_csi2.c | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/media/platform/sunxi/sun8i-a83t-mipi-csi2/sun8i_a83t_mipi_csi2.c b/drivers/media/platform/sunxi/sun8i-a83t-mipi-csi2/sun8i_a83t_mipi_csi2.c index d052ee77ef0aa..b032ec13a683a 100644 --- a/drivers/media/platform/sunxi/sun8i-a83t-mipi-csi2/sun8i_a83t_mipi_csi2.c +++ b/drivers/media/platform/sunxi/sun8i-a83t-mipi-csi2/sun8i_a83t_mipi_csi2.c @@ -719,13 +719,15 @@ sun8i_a83t_mipi_csi2_resources_setup(struct sun8i_a83t_mipi_csi2_device *csi2_de csi2_dev->clock_mipi = devm_clk_get(dev, "mipi"); if (IS_ERR(csi2_dev->clock_mipi)) { dev_err(dev, "failed to acquire mipi clock\n"); - return PTR_ERR(csi2_dev->clock_mipi); + ret = PTR_ERR(csi2_dev->clock_mipi); + goto error_clock_rate_exclusive; } csi2_dev->clock_misc = devm_clk_get(dev, "misc"); if (IS_ERR(csi2_dev->clock_misc)) { dev_err(dev, "failed to acquire misc clock\n"); - return PTR_ERR(csi2_dev->clock_misc); + ret = PTR_ERR(csi2_dev->clock_misc); + goto error_clock_rate_exclusive; } /* Reset */ @@ -733,7 +735,8 @@ sun8i_a83t_mipi_csi2_resources_setup(struct sun8i_a83t_mipi_csi2_device *csi2_de csi2_dev->reset = devm_reset_control_get_shared(dev, NULL); if (IS_ERR(csi2_dev->reset)) { dev_err(dev, "failed to get reset controller\n"); - return PTR_ERR(csi2_dev->reset); + ret = PTR_ERR(csi2_dev->reset); + goto error_clock_rate_exclusive; } /* D-PHY */ @@ -741,7 +744,7 @@ sun8i_a83t_mipi_csi2_resources_setup(struct sun8i_a83t_mipi_csi2_device *csi2_de ret = sun8i_a83t_dphy_register(csi2_dev); if (ret) { dev_err(dev, "failed to initialize MIPI D-PHY\n"); - return ret; + goto error_clock_rate_exclusive; } /* Runtime PM */ @@ -749,6 +752,11 @@ sun8i_a83t_mipi_csi2_resources_setup(struct 
sun8i_a83t_mipi_csi2_device *csi2_de pm_runtime_enable(dev); return 0; + +error_clock_rate_exclusive: + clk_rate_exclusive_put(csi2_dev->clock_mod); + + return ret; } static void @@ -778,9 +786,14 @@ static int sun8i_a83t_mipi_csi2_probe(struct platform_device *platform_dev) ret = sun8i_a83t_mipi_csi2_bridge_setup(csi2_dev); if (ret) - return ret; + goto error_resources; return 0; + +error_resources: + sun8i_a83t_mipi_csi2_resources_cleanup(csi2_dev); + + return ret; } static int sun8i_a83t_mipi_csi2_remove(struct platform_device *platform_dev) -- GitLab From 51e1440d309a74a3e4e252019a00f9d0df329945 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 23 Aug 2022 09:42:11 +0200 Subject: [PATCH 0590/2223] media: sunxi: Fix some error handling path of sun6i_mipi_csi2_probe() Release some resources in the error handling path of the probe and of sun6i_mipi_csi2_resources_setup(), as already done in the remove function. Fixes: af54b4f4c17f ("media: sunxi: Add support for the A31 MIPI CSI-2 controller") Signed-off-by: Christophe JAILLET Acked-by: Paul Kocialkowski Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../sunxi/sun6i-mipi-csi2/sun6i_mipi_csi2.c | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/media/platform/sunxi/sun6i-mipi-csi2/sun6i_mipi_csi2.c b/drivers/media/platform/sunxi/sun6i-mipi-csi2/sun6i_mipi_csi2.c index a4e3f9a6b2ff2..30d6c0c5161f4 100644 --- a/drivers/media/platform/sunxi/sun6i-mipi-csi2/sun6i_mipi_csi2.c +++ b/drivers/media/platform/sunxi/sun6i-mipi-csi2/sun6i_mipi_csi2.c @@ -661,7 +661,8 @@ sun6i_mipi_csi2_resources_setup(struct sun6i_mipi_csi2_device *csi2_dev, csi2_dev->reset = devm_reset_control_get_shared(dev, NULL); if (IS_ERR(csi2_dev->reset)) { dev_err(dev, "failed to get reset controller\n"); - return PTR_ERR(csi2_dev->reset); + ret = PTR_ERR(csi2_dev->reset); + goto error_clock_rate_exclusive; } /* D-PHY */ @@ -669,13 +670,14 @@ 
sun6i_mipi_csi2_resources_setup(struct sun6i_mipi_csi2_device *csi2_dev, csi2_dev->dphy = devm_phy_get(dev, "dphy"); if (IS_ERR(csi2_dev->dphy)) { dev_err(dev, "failed to get MIPI D-PHY\n"); - return PTR_ERR(csi2_dev->dphy); + ret = PTR_ERR(csi2_dev->dphy); + goto error_clock_rate_exclusive; } ret = phy_init(csi2_dev->dphy); if (ret) { dev_err(dev, "failed to initialize MIPI D-PHY\n"); - return ret; + goto error_clock_rate_exclusive; } /* Runtime PM */ @@ -683,6 +685,11 @@ sun6i_mipi_csi2_resources_setup(struct sun6i_mipi_csi2_device *csi2_dev, pm_runtime_enable(dev); return 0; + +error_clock_rate_exclusive: + clk_rate_exclusive_put(csi2_dev->clock_mod); + + return ret; } static void @@ -712,9 +719,14 @@ static int sun6i_mipi_csi2_probe(struct platform_device *platform_dev) ret = sun6i_mipi_csi2_bridge_setup(csi2_dev); if (ret) - return ret; + goto error_resources; return 0; + +error_resources: + sun6i_mipi_csi2_resources_cleanup(csi2_dev); + + return ret; } static int sun6i_mipi_csi2_remove(struct platform_device *platform_dev) -- GitLab From 39dfd52d0f481de42a435f9fb79c98b376c68c39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Gonz=C3=A1lez=20Cabanelas?= Date: Mon, 7 Feb 2022 15:51:41 +0100 Subject: [PATCH 0591/2223] media: cx88: add IR remote support for NotOnlyTV LV3H MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PCI hybrid card NotOnlyTV LV3H has a built-in IR receiver connected via I2C bus, currently not supported. This receiver is probably present in more Geniatech cards. It has no capability for repeating when a key is held down. Add support for this built-in IR receiver. Use the existing Total Media In Hand_02 remote keytable (Geniatech Mygica X8507) which matches exactly the LV3H remote. 
Signed-off-by: Daniel González Cabanelas Signed-off-by: Marek Kidawski Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/i2c/ir-kbd-i2c.c | 47 +++++++++++++++++++++++++++++ drivers/media/pci/cx88/cx88-input.c | 2 +- drivers/media/pci/cx88/cx88-video.c | 1 + include/media/i2c/ir-kbd-i2c.h | 1 + 4 files changed, 50 insertions(+), 1 deletion(-) diff --git a/drivers/media/i2c/ir-kbd-i2c.c b/drivers/media/i2c/ir-kbd-i2c.c index 56674173524fd..a229e2d69ed6e 100644 --- a/drivers/media/i2c/ir-kbd-i2c.c +++ b/drivers/media/i2c/ir-kbd-i2c.c @@ -238,6 +238,43 @@ static int get_key_knc1(struct IR_i2c *ir, enum rc_proto *protocol, return 1; } +static int get_key_geniatech(struct IR_i2c *ir, enum rc_proto *protocol, + u32 *scancode, u8 *toggle) +{ + int i, rc; + unsigned char b; + + /* poll IR chip */ + for (i = 0; i < 4; i++) { + rc = i2c_master_recv(ir->c, &b, 1); + if (rc == 1) + break; + msleep(20); + } + if (rc != 1) { + dev_dbg(&ir->rc->dev, "read error\n"); + if (rc < 0) + return rc; + return -EIO; + } + + /* don't repeat the key */ + if (ir->old == b) + return 0; + ir->old = b; + + /* decode to RC5 */ + b &= 0x7f; + b = (b - 1) / 2; + + dev_dbg(&ir->rc->dev, "key %02x\n", b); + + *protocol = RC_PROTO_RC5; + *scancode = b; + *toggle = ir->old >> 7; + return 1; +} + static int get_key_avermedia_cardbus(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { @@ -766,6 +803,13 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) rc_proto = RC_PROTO_BIT_OTHER; ir_codes = RC_MAP_EMPTY; break; + case 0x33: + name = "Geniatech"; + ir->get_key = get_key_geniatech; + rc_proto = RC_PROTO_BIT_RC5; + ir_codes = RC_MAP_TOTAL_MEDIA_IN_HAND_02; + ir->old = 0xfc; + break; case 0x6b: name = "FusionHDTV"; ir->get_key = get_key_fusionhdtv; @@ -825,6 +869,9 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) case IR_KBD_GET_KEY_KNC1: ir->get_key = get_key_knc1; break; + case 
IR_KBD_GET_KEY_GENIATECH: + ir->get_key = get_key_geniatech; + break; case IR_KBD_GET_KEY_FUSIONHDTV: ir->get_key = get_key_fusionhdtv; break; diff --git a/drivers/media/pci/cx88/cx88-input.c b/drivers/media/pci/cx88/cx88-input.c index ce0ef0b8186f5..a04a1d33fadb1 100644 --- a/drivers/media/pci/cx88/cx88-input.c +++ b/drivers/media/pci/cx88/cx88-input.c @@ -586,7 +586,7 @@ void cx88_i2c_init_ir(struct cx88_core *core) { struct i2c_board_info info; static const unsigned short default_addr_list[] = { - 0x18, 0x6b, 0x71, + 0x18, 0x33, 0x6b, 0x71, I2C_CLIENT_END }; static const unsigned short pvr2000_addr_list[] = { diff --git a/drivers/media/pci/cx88/cx88-video.c b/drivers/media/pci/cx88/cx88-video.c index b509c2a03852b..c0ef03ed74f98 100644 --- a/drivers/media/pci/cx88/cx88-video.c +++ b/drivers/media/pci/cx88/cx88-video.c @@ -1388,6 +1388,7 @@ static int cx8800_initdev(struct pci_dev *pci_dev, } fallthrough; case CX88_BOARD_DVICO_FUSIONHDTV_5_PCI_NANO: + case CX88_BOARD_NOTONLYTV_LV3H: request_module("ir-kbd-i2c"); } diff --git a/include/media/i2c/ir-kbd-i2c.h b/include/media/i2c/ir-kbd-i2c.h index 9f47d6a48cff3..0b58f8b9e7a4f 100644 --- a/include/media/i2c/ir-kbd-i2c.h +++ b/include/media/i2c/ir-kbd-i2c.h @@ -35,6 +35,7 @@ enum ir_kbd_get_key_fn { IR_KBD_GET_KEY_PIXELVIEW, IR_KBD_GET_KEY_HAUP, IR_KBD_GET_KEY_KNC1, + IR_KBD_GET_KEY_GENIATECH, IR_KBD_GET_KEY_FUSIONHDTV, IR_KBD_GET_KEY_HAUP_XVR, IR_KBD_GET_KEY_AVERMEDIA_CARDBUS, -- GitLab From 06a2da340f762addc5935bf851d95b14d4692db2 Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Tue, 26 Jul 2022 04:14:54 +0200 Subject: [PATCH 0592/2223] media: venus: dec: Handle the case where find_format fails Debugging the decoder on msm8916 I noticed the vdec probe was crashing if the fmt pointer was NULL. A similar fix from Colin Ian King found by Coverity was implemented for the encoder. Implement the same fix on the decoder. 
Fixes: 7472c1c69138 ("[media] media: venus: vdec: add video decoder files") Cc: stable@vger.kernel.org # v4.13+ Signed-off-by: Bryan O'Donoghue Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/qcom/venus/vdec.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/platform/qcom/venus/vdec.c b/drivers/media/platform/qcom/venus/vdec.c index ac0bb45d07f4b..4ceaba37e2e57 100644 --- a/drivers/media/platform/qcom/venus/vdec.c +++ b/drivers/media/platform/qcom/venus/vdec.c @@ -183,6 +183,8 @@ vdec_try_fmt_common(struct venus_inst *inst, struct v4l2_format *f) else return NULL; fmt = find_format(inst, pixmp->pixelformat, f->type); + if (!fmt) + return NULL; } pixmp->width = clamp(pixmp->width, frame_width_min(inst), -- GitLab From 7f77fa9f378c528edb38dbf23ff1273c81429d49 Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Tue, 26 Jul 2022 04:14:55 +0200 Subject: [PATCH 0593/2223] media: venus: Fix NV12 decoder buffer discovery on HFI_VERSION_1XX HFI_VERSION_1XX uses HFI_BUFFER_OUTPUT not HFI_BUFFER_OUTPUT2 for decoder buffers. venus_helper_check_format() places a constraint on an output buffer to be of type HFI_BUFFER_OUTPUT2. HFI_1XX uses HFI_BUFFER_OUTPUT though. Switching to the logic used in venus_helper_get_out_fmts() first checking for HFI_BUFFER_OUTPUT and then HFI_BUFFER_OUTPUT2 resolves on HFI_1XX. 
db410c before: root@linaro-alip:~# v4l2-ctl -d /dev/video0 --list-formats ioctl: VIDIOC_ENUM_FMT Type: Video Capture Multiplanar [0]: 'MPG4' (MPEG-4 Part 2 ES, compressed) [1]: 'H263' (H.263, compressed) [2]: 'H264' (H.264, compressed) [3]: 'VP80' (VP8, compressed) root@linaro-alip:~# v4l2-ctl -d /dev/video1 --list-formats ioctl: VIDIOC_ENUM_FMT Type: Video Capture Multiplanar db410c after: root@linaro-alip:~# v4l2-ctl -d /dev/video0 --list-formats ioctl: VIDIOC_ENUM_FMT Type: Video Capture Multiplanar [0]: 'MPG4' (MPEG-4 Part 2 ES, compressed) [1]: 'H263' (H.263, compressed) [2]: 'H264' (H.264, compressed) [3]: 'VP80' (VP8, compressed) root@linaro-alip:~# v4l2-ctl -d /dev/video1 --list-formats ioctl: VIDIOC_ENUM_FMT Type: Video Capture Multiplanar [0]: 'NV12' (Y/CbCr 4:2:0) Validated playback with ffplay on db410c with h264 and vp8 decoding. Fixes: 9593126dae3e ("media: venus: Add a handling of QC08C compressed format") Cc: stable@vger.kernel.org # v5.19 Signed-off-by: Bryan O'Donoghue Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/qcom/venus/helpers.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/media/platform/qcom/venus/helpers.c b/drivers/media/platform/qcom/venus/helpers.c index 60de4200375dd..ab6a29ffc81e2 100644 --- a/drivers/media/platform/qcom/venus/helpers.c +++ b/drivers/media/platform/qcom/venus/helpers.c @@ -1800,7 +1800,7 @@ bool venus_helper_check_format(struct venus_inst *inst, u32 v4l2_pixfmt) struct venus_core *core = inst->core; u32 fmt = to_hfi_raw_fmt(v4l2_pixfmt); struct hfi_plat_caps *caps; - u32 buftype; + bool found; if (!fmt) return false; @@ -1809,12 +1809,13 @@ bool venus_helper_check_format(struct venus_inst *inst, u32 v4l2_pixfmt) if (!caps) return false; - if (inst->session_type == VIDC_SESSION_TYPE_DEC) - buftype = HFI_BUFFER_OUTPUT2; - else - buftype = HFI_BUFFER_OUTPUT; + found = find_fmt_from_caps(caps, HFI_BUFFER_OUTPUT, fmt); + if 
(found) + goto done; - return find_fmt_from_caps(caps, buftype, fmt); + found = find_fmt_from_caps(caps, HFI_BUFFER_OUTPUT2, fmt); +done: + return found; } EXPORT_SYMBOL_GPL(venus_helper_check_format); -- GitLab From 014a6b274bfe051fadf8ec7e99ac5eb95653e248 Mon Sep 17 00:00:00 2001 From: Stanimir Varbanov Date: Mon, 8 Aug 2022 11:28:28 +0200 Subject: [PATCH 0594/2223] media: venus : Add default values for the control V4l2 encoder compliance expecting default values of colorimetry for the control. Signed-off-by: Vikash Garodia Signed-off-by: Viswanath Boma Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/qcom/venus/venc_ctrls.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/media/platform/qcom/venus/venc_ctrls.c b/drivers/media/platform/qcom/venus/venc_ctrls.c index ed44e5800759a..52bbc544e9cb4 100644 --- a/drivers/media/platform/qcom/venus/venc_ctrls.c +++ b/drivers/media/platform/qcom/venus/venc_ctrls.c @@ -355,6 +355,10 @@ static const struct v4l2_ctrl_ops venc_ctrl_ops = { int venc_ctrl_init(struct venus_inst *inst) { int ret; + struct v4l2_ctrl_hdr10_mastering_display p_hdr10_mastering = { + { 34000, 13250, 7500 }, + { 16000, 34500, 3000 }, 15635, 16450, 10000000, 500, + }; ret = v4l2_ctrl_handler_init(&inst->ctrl_handler, 58); if (ret) @@ -583,7 +587,7 @@ int venc_ctrl_init(struct venus_inst *inst) v4l2_ctrl_new_std_compound(&inst->ctrl_handler, &venc_ctrl_ops, V4L2_CID_COLORIMETRY_HDR10_MASTERING_DISPLAY, - v4l2_ctrl_ptr_create(NULL)); + v4l2_ctrl_ptr_create((void *)&p_hdr10_mastering)); v4l2_ctrl_new_std_menu(&inst->ctrl_handler, &venc_ctrl_ops, V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE, -- GitLab From 096573e4c0c7eb21d5e22a9411aa2ba65ef96d96 Mon Sep 17 00:00:00 2001 From: Viswanath Boma Date: Mon, 8 Aug 2022 11:28:29 +0200 Subject: [PATCH 0595/2223] media: venus : Addition of control support - V4L2_CID_MIN_BUFFERS_FOR_OUTPUT V4l2 encoder compliance expecting minimum buffers 
support for the application to allocate buffers as per the control support values. Signed-off-by: Viswanath Boma Signed-off-by: Vikash Garodia Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab --- .../media/platform/qcom/venus/venc_ctrls.c | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/media/platform/qcom/venus/venc_ctrls.c b/drivers/media/platform/qcom/venus/venc_ctrls.c index 52bbc544e9cb4..27779c79bf9d8 100644 --- a/drivers/media/platform/qcom/venus/venc_ctrls.c +++ b/drivers/media/platform/qcom/venus/venc_ctrls.c @@ -8,6 +8,7 @@ #include "core.h" #include "venc.h" +#include "helpers.h" #define BITRATE_MIN 32000 #define BITRATE_MAX 160000000 @@ -348,8 +349,29 @@ static int venc_op_s_ctrl(struct v4l2_ctrl *ctrl) return 0; } +static int venc_op_g_volatile_ctrl(struct v4l2_ctrl *ctrl) +{ + struct venus_inst *inst = ctrl_to_inst(ctrl); + struct hfi_buffer_requirements bufreq; + enum hfi_version ver = inst->core->res->hfi_version; + int ret; + + switch (ctrl->id) { + case V4L2_CID_MIN_BUFFERS_FOR_OUTPUT: + ret = venus_helper_get_bufreq(inst, HFI_BUFFER_INPUT, &bufreq); + if (!ret) + ctrl->val = HFI_BUFREQ_COUNT_MIN(&bufreq, ver); + break; + default: + return -EINVAL; + } + + return 0; +} + static const struct v4l2_ctrl_ops venc_ctrl_ops = { .s_ctrl = venc_op_s_ctrl, + .g_volatile_ctrl = venc_op_g_volatile_ctrl, }; int venc_ctrl_init(struct venus_inst *inst) @@ -360,7 +382,7 @@ int venc_ctrl_init(struct venus_inst *inst) { 16000, 34500, 3000 }, 15635, 16450, 10000000, 500, }; - ret = v4l2_ctrl_handler_init(&inst->ctrl_handler, 58); + ret = v4l2_ctrl_handler_init(&inst->ctrl_handler, 59); if (ret) return ret; @@ -440,6 +462,9 @@ int venc_ctrl_init(struct venus_inst *inst) V4L2_MPEG_VIDEO_VP8_PROFILE_3, 0, V4L2_MPEG_VIDEO_VP8_PROFILE_0); + v4l2_ctrl_new_std(&inst->ctrl_handler, &venc_ctrl_ops, + V4L2_CID_MIN_BUFFERS_FOR_OUTPUT, 4, 11, 1, 4); + v4l2_ctrl_new_std(&inst->ctrl_handler, &venc_ctrl_ops, 
V4L2_CID_MPEG_VIDEO_BITRATE, BITRATE_MIN, BITRATE_MAX, BITRATE_STEP, BITRATE_DEFAULT); -- GitLab From c260bf4bd3a3f40b480a1eb1ea57ce26263c2f85 Mon Sep 17 00:00:00 2001 From: Stanimir Varbanov Date: Mon, 8 Aug 2022 11:28:30 +0200 Subject: [PATCH 0596/2223] media: venus : CAPTURE Plane width/height alignment with OUT plane. V4L2 encoder compliance set-format test cases were failing because the CAPTURE plane width/height was not aligned to the OUT plane. Signed-off-by: Viswanath Boma Signed-off-by: Vikash Garodia Signed-off-by: Dikshita Agarwal Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/qcom/venus/venc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/media/platform/qcom/venus/venc.c b/drivers/media/platform/qcom/venus/venc.c index 86918aea1d241..037cbfccf3363 100644 --- a/drivers/media/platform/qcom/venus/venc.c +++ b/drivers/media/platform/qcom/venus/venc.c @@ -192,10 +192,8 @@ venc_try_fmt_common(struct venus_inst *inst, struct v4l2_format *f) pixmp->height = clamp(pixmp->height, frame_height_min(inst), frame_height_max(inst)); - if (f->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE) { - pixmp->width = ALIGN(pixmp->width, 128); - pixmp->height = ALIGN(pixmp->height, 32); - } + pixmp->width = ALIGN(pixmp->width, 128); + pixmp->height = ALIGN(pixmp->height, 32); pixmp->width = ALIGN(pixmp->width, 2); pixmp->height = ALIGN(pixmp->height, 2); -- GitLab From 70b2a5463dcdc18cd94d41f6dc170aa29cfcb922 Mon Sep 17 00:00:00 2001 From: Viswanath Boma Date: Mon, 8 Aug 2022 11:28:31 +0200 Subject: [PATCH 0597/2223] media: venus : Addition of EOS Event support for Encoder V4l2 encoder compliance expecting End of stream Event registration support for Encoder.
Signed-off-by: Viswanath Boma Signed-off-by: Vikash Garodia Signed-off-by: Dikshita Agarwal Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/qcom/venus/venc.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/media/platform/qcom/venus/venc.c b/drivers/media/platform/qcom/venus/venc.c index 037cbfccf3363..0c464ef89a221 100644 --- a/drivers/media/platform/qcom/venus/venc.c +++ b/drivers/media/platform/qcom/venus/venc.c @@ -507,6 +507,19 @@ static int venc_enum_frameintervals(struct file *file, void *fh, return 0; } +static int venc_subscribe_event(struct v4l2_fh *fh, + const struct v4l2_event_subscription *sub) +{ + switch (sub->type) { + case V4L2_EVENT_EOS: + return v4l2_event_subscribe(fh, sub, 2, NULL); + case V4L2_EVENT_CTRL: + return v4l2_ctrl_subscribe_event(fh, sub); + default: + return -EINVAL; + } +} + static const struct v4l2_ioctl_ops venc_ioctl_ops = { .vidioc_querycap = venc_querycap, .vidioc_enum_fmt_vid_cap = venc_enum_fmt, @@ -532,7 +545,7 @@ static const struct v4l2_ioctl_ops venc_ioctl_ops = { .vidioc_g_parm = venc_g_parm, .vidioc_enum_framesizes = venc_enum_framesizes, .vidioc_enum_frameintervals = venc_enum_frameintervals, - .vidioc_subscribe_event = v4l2_ctrl_subscribe_event, + .vidioc_subscribe_event = venc_subscribe_event, .vidioc_unsubscribe_event = v4l2_event_unsubscribe, }; -- GitLab From 2f2d6fe83d0346923f0247e15dd51f3257e65edd Mon Sep 17 00:00:00 2001 From: Dikshita Agarwal Date: Mon, 8 Aug 2022 11:28:32 +0200 Subject: [PATCH 0598/2223] media: venus : Addition of support for VIDIOC_TRY_ENCODER_CMD v4l2 compliance expecting support for vidioc_try_encoder_cmd . 
error details : test VIDIOC_(TRY_)ENCODER_CMD: FAIL Signed-off-by: Viswanath Boma Signed-off-by: Dikshita Agarwal Signed-off-by: Vikash Garodia Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/qcom/venus/venc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/platform/qcom/venus/venc.c b/drivers/media/platform/qcom/venus/venc.c index 0c464ef89a221..167ee8ba8fc43 100644 --- a/drivers/media/platform/qcom/venus/venc.c +++ b/drivers/media/platform/qcom/venus/venc.c @@ -547,6 +547,7 @@ static const struct v4l2_ioctl_ops venc_ioctl_ops = { .vidioc_enum_frameintervals = venc_enum_frameintervals, .vidioc_subscribe_event = venc_subscribe_event, .vidioc_unsubscribe_event = v4l2_event_unsubscribe, + .vidioc_try_encoder_cmd = v4l2_m2m_ioctl_try_encoder_cmd, }; static int venc_pm_get(struct venus_inst *inst) -- GitLab From 2d5dbc7ff664ba320b1b4ed622c0b4c0d3d5b472 Mon Sep 17 00:00:00 2001 From: Viswanath Boma Date: Mon, 8 Aug 2022 11:28:33 +0200 Subject: [PATCH 0599/2223] media: venus : Remove the capture plane settings for venc_g_parm/venc_s_parm v4l2 compliance expects settings for the out buffer only; the same values will be propagated to the capture buffer setting by the h/w encoder. Settings on the capture plane are optional, required only if offline encoding is supported.
error details : fail: v4l2-test-formats.cpp(1350): !ret Signed-off-by: Viswanath Boma Signed-off-by: Vikash Garodia Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/qcom/venus/venc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/platform/qcom/venus/venc.c b/drivers/media/platform/qcom/venus/venc.c index 167ee8ba8fc43..cc08a3c8cd393 100644 --- a/drivers/media/platform/qcom/venus/venc.c +++ b/drivers/media/platform/qcom/venus/venc.c @@ -390,7 +390,7 @@ static int venc_s_parm(struct file *file, void *fh, struct v4l2_streamparm *a) struct v4l2_fract *timeperframe = &out->timeperframe; u64 us_per_frame, fps; - if (a->type != V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE && + if (a->type != V4L2_BUF_TYPE_VIDEO_OUTPUT && a->type != V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE) return -EINVAL; @@ -422,7 +422,7 @@ static int venc_g_parm(struct file *file, void *fh, struct v4l2_streamparm *a) { struct venus_inst *inst = to_inst(file); - if (a->type != V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE && + if (a->type != V4L2_BUF_TYPE_VIDEO_OUTPUT && a->type != V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE) return -EINVAL; -- GitLab From 1d600444f7f811bd554c18195fa7200a9254a24f Mon Sep 17 00:00:00 2001 From: Vikash Garodia Date: Mon, 8 Aug 2022 11:28:34 +0200 Subject: [PATCH 0600/2223] media: venus : Allow MIN/MAX settings for the v4l2 encoder controls defined range. Control MIN/MAX range defined as 0 to 1, as MIN value setting enabled for V4L2_CID_MPEG_VIDEO_H264_8X8_TRANSFORM. 
error details: fail: v4l2-test-controls.cpp(516): invalid maximum range check Signed-off-by: Viswanath Boma Signed-off-by: Vikash Garodia Signed-off-by: Dikshita Agarwal Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/qcom/venus/venc_ctrls.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/media/platform/qcom/venus/venc_ctrls.c b/drivers/media/platform/qcom/venus/venc_ctrls.c index 27779c79bf9d8..cfcacdb797db7 100644 --- a/drivers/media/platform/qcom/venus/venc_ctrls.c +++ b/drivers/media/platform/qcom/venus/venc_ctrls.c @@ -337,8 +337,6 @@ static int venc_op_s_ctrl(struct v4l2_ctrl *ctrl) * if we disable 8x8 transform for HP. */ - if (ctrl->val == 0) - return -EINVAL; ctr->h264_8x8_transform = ctrl->val; break; -- GitLab From 61a70c9702da10296cebe4ab7d654da4bcb893b5 Mon Sep 17 00:00:00 2001 From: Stanimir Varbanov Date: Fri, 2 Sep 2022 12:00:31 +0200 Subject: [PATCH 0601/2223] media: venus: venc_ctrls: Add default value for CLL info Add default value for CLL info when creating compound control. 
Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/qcom/venus/venc_ctrls.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/platform/qcom/venus/venc_ctrls.c b/drivers/media/platform/qcom/venus/venc_ctrls.c index cfcacdb797db7..7468e43800a94 100644 --- a/drivers/media/platform/qcom/venus/venc_ctrls.c +++ b/drivers/media/platform/qcom/venus/venc_ctrls.c @@ -379,6 +379,7 @@ int venc_ctrl_init(struct venus_inst *inst) { 34000, 13250, 7500 }, { 16000, 34500, 3000 }, 15635, 16450, 10000000, 500, }; + struct v4l2_ctrl_hdr10_cll_info p_hdr10_cll = { 1000, 400 }; ret = v4l2_ctrl_handler_init(&inst->ctrl_handler, 59); if (ret) @@ -606,7 +607,7 @@ int venc_ctrl_init(struct venus_inst *inst) v4l2_ctrl_new_std_compound(&inst->ctrl_handler, &venc_ctrl_ops, V4L2_CID_COLORIMETRY_HDR10_CLL_INFO, - v4l2_ctrl_ptr_create(NULL)); + v4l2_ctrl_ptr_create(&p_hdr10_cll)); v4l2_ctrl_new_std_compound(&inst->ctrl_handler, &venc_ctrl_ops, V4L2_CID_COLORIMETRY_HDR10_MASTERING_DISPLAY, -- GitLab From f5218c71098dcf51772b6509184c02ce1cef37b8 Mon Sep 17 00:00:00 2001 From: Stanimir Varbanov Date: Fri, 2 Sep 2022 12:01:58 +0200 Subject: [PATCH 0602/2223] media: venus: venc: Set HDR10 PQ SEI property only for MAIN10 profile The HDR10 PQ SEI should be set only when the codec is HEVC and the profile is MAIN10, otherwise some artefacts could be produced on the encoded bitstream. 
Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/qcom/venus/venc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/platform/qcom/venus/venc.c b/drivers/media/platform/qcom/venus/venc.c index cc08a3c8cd393..cdb12546c4fa6 100644 --- a/drivers/media/platform/qcom/venus/venc.c +++ b/drivers/media/platform/qcom/venus/venc.c @@ -698,7 +698,8 @@ static int venc_set_properties(struct venus_inst *inst) return ret; } - if (inst->fmt_cap->pixfmt == V4L2_PIX_FMT_HEVC) { + if (inst->fmt_cap->pixfmt == V4L2_PIX_FMT_HEVC && + ctr->profile.hevc == V4L2_MPEG_VIDEO_HEVC_PROFILE_MAIN_10) { struct hfi_hdr10_pq_sei hdr10; unsigned int c; -- GitLab From 265f2fc52f5873ae3fa653ce1f2dbc47afef6344 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Tue, 30 Aug 2022 10:37:53 +0200 Subject: [PATCH 0603/2223] media: venus: hfi: Remove the unneeded result variable Return the value venus_hfi_create() directly instead of storing it in another redundant variable. 
Signed-off-by: ye xingchen Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/qcom/venus/hfi.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/media/platform/qcom/venus/hfi.c b/drivers/media/platform/qcom/venus/hfi.c index 1968f09ad177a..e00aedb41d168 100644 --- a/drivers/media/platform/qcom/venus/hfi.c +++ b/drivers/media/platform/qcom/venus/hfi.c @@ -569,8 +569,6 @@ irqreturn_t hfi_isr(int irq, void *dev) int hfi_create(struct venus_core *core, const struct hfi_core_ops *ops) { - int ret; - if (!ops) return -EINVAL; @@ -579,9 +577,8 @@ int hfi_create(struct venus_core *core, const struct hfi_core_ops *ops) core->state = CORE_UNINIT; init_completion(&core->done); pkt_set_version(core->res->hfi_version); - ret = venus_hfi_create(core); - return ret; + return venus_hfi_create(core); } void hfi_destroy(struct venus_core *core) -- GitLab From d0734dab5480325fd67548aa51ee126570d48574 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Thu, 25 Aug 2022 12:20:29 +0200 Subject: [PATCH 0604/2223] media: sun6i-mipi-csi2: Add a Kconfig dependency on RESET_CONTROLLER The driver relies on the reset controller API to work, so add RESET_CONTROLLER as one of its Kconfig dependencies. It also selects PHY_SUN6I_MIPI_DPHY, which depends on RESET_CONTROLLER. 
Fixes: af54b4f4c17f ("media: sunxi: Add support for the A31 MIPI CSI-2 controller") Signed-off-by: Paul Kocialkowski Reported-by: kernel test robot Acked-by: Jernej Skrabec Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/sunxi/sun6i-mipi-csi2/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/sunxi/sun6i-mipi-csi2/Kconfig b/drivers/media/platform/sunxi/sun6i-mipi-csi2/Kconfig index 4d072abdfb705..08852f63692b6 100644 --- a/drivers/media/platform/sunxi/sun6i-mipi-csi2/Kconfig +++ b/drivers/media/platform/sunxi/sun6i-mipi-csi2/Kconfig @@ -3,7 +3,7 @@ config VIDEO_SUN6I_MIPI_CSI2 tristate "Allwinner A31 MIPI CSI-2 Controller Driver" depends on V4L_PLATFORM_DRIVERS && VIDEO_DEV depends on ARCH_SUNXI || COMPILE_TEST - depends on PM && COMMON_CLK + depends on PM && COMMON_CLK && RESET_CONTROLLER depends on PHY_SUN6I_MIPI_DPHY select MEDIA_CONTROLLER select VIDEO_V4L2_SUBDEV_API -- GitLab From 398c479234894c3d3347d83869760db3c406c269 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Thu, 25 Aug 2022 12:20:30 +0200 Subject: [PATCH 0605/2223] media: sun8i-a83t-mipi-csi2: Add a Kconfig dependency on RESET_CONTROLLER The driver relies on the reset controller API to work, so add RESET_CONTROLLER as one of its Kconfig dependencies. 
Fixes: 576d196c522b ("media: sunxi: Add support for the A83T MIPI CSI-2 controller") Signed-off-by: Paul Kocialkowski Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/sunxi/sun8i-a83t-mipi-csi2/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/sunxi/sun8i-a83t-mipi-csi2/Kconfig b/drivers/media/platform/sunxi/sun8i-a83t-mipi-csi2/Kconfig index 789d58ee12ea9..47a8c0fb7eb9f 100644 --- a/drivers/media/platform/sunxi/sun8i-a83t-mipi-csi2/Kconfig +++ b/drivers/media/platform/sunxi/sun8i-a83t-mipi-csi2/Kconfig @@ -3,7 +3,7 @@ config VIDEO_SUN8I_A83T_MIPI_CSI2 tristate "Allwinner A83T MIPI CSI-2 Controller and D-PHY Driver" depends on V4L_PLATFORM_DRIVERS && VIDEO_DEV depends on ARCH_SUNXI || COMPILE_TEST - depends on PM && COMMON_CLK + depends on PM && COMMON_CLK && RESET_CONTROLLER select MEDIA_CONTROLLER select VIDEO_V4L2_SUBDEV_API select V4L2_FWNODE -- GitLab From 6a720df702db764e2b3bbdaaa217e9d344efcfb2 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Thu, 25 Aug 2022 12:20:31 +0200 Subject: [PATCH 0606/2223] media: sun6i-csi: Add a Kconfig dependency on RESET_CONTROLLER The driver relies on the reset controller API to work, so add RESET_CONTROLLER as one of its Kconfig dependencies. 
Fixes: 5cc7522d8965 ("media: sun6i: Add support for Allwinner CSI V3s") Signed-off-by: Paul Kocialkowski Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/sunxi/sun6i-csi/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/sunxi/sun6i-csi/Kconfig b/drivers/media/platform/sunxi/sun6i-csi/Kconfig index 0345901617d41..e5b6991ce7f04 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/Kconfig +++ b/drivers/media/platform/sunxi/sun6i-csi/Kconfig @@ -2,7 +2,7 @@ config VIDEO_SUN6I_CSI tristate "Allwinner V3s Camera Sensor Interface driver" depends on V4L_PLATFORM_DRIVERS - depends on VIDEO_DEV && COMMON_CLK && HAS_DMA + depends on VIDEO_DEV && COMMON_CLK && RESET_CONTROLLER && HAS_DMA depends on ARCH_SUNXI || COMPILE_TEST select MEDIA_CONTROLLER select VIDEO_V4L2_SUBDEV_API -- GitLab From 140a9b57d3a306ca77a92e903facbdc4a31ccd51 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Thu, 25 Aug 2022 12:20:32 +0200 Subject: [PATCH 0607/2223] media: sun4i-csi: Add a Kconfig dependency on RESET_CONTROLLER The driver relies on the reset controller API to work, so add RESET_CONTROLLER as one of its Kconfig dependencies. 
Fixes: 577bbf23b758 ("media: sunxi: Add A10 CSI driver") Signed-off-by: Paul Kocialkowski Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/sunxi/sun4i-csi/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/sunxi/sun4i-csi/Kconfig b/drivers/media/platform/sunxi/sun4i-csi/Kconfig index 7960e6836f415..60610c04d6a76 100644 --- a/drivers/media/platform/sunxi/sun4i-csi/Kconfig +++ b/drivers/media/platform/sunxi/sun4i-csi/Kconfig @@ -3,7 +3,7 @@ config VIDEO_SUN4I_CSI tristate "Allwinner A10 CMOS Sensor Interface Support" depends on V4L_PLATFORM_DRIVERS - depends on VIDEO_DEV && COMMON_CLK && HAS_DMA + depends on VIDEO_DEV && COMMON_CLK && RESET_CONTROLLER && HAS_DMA depends on ARCH_SUNXI || COMPILE_TEST select MEDIA_CONTROLLER select VIDEO_V4L2_SUBDEV_API -- GitLab From c2a46b19f0340e6647168f4ceac4e5e4cb9197d8 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Thu, 25 Aug 2022 12:20:33 +0200 Subject: [PATCH 0608/2223] media: sun8i-di: Add a Kconfig dependency on RESET_CONTROLLER The driver relies on the reset controller API to work, so add RESET_CONTROLLER as one of its Kconfig dependencies. 
Fixes: a4260ea49547 ("media: sun4i: Add H3 deinterlace driver") Signed-off-by: Paul Kocialkowski Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/sunxi/sun8i-di/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/sunxi/sun8i-di/Kconfig b/drivers/media/platform/sunxi/sun8i-di/Kconfig index ff71e06ee2dfe..f688396913b79 100644 --- a/drivers/media/platform/sunxi/sun8i-di/Kconfig +++ b/drivers/media/platform/sunxi/sun8i-di/Kconfig @@ -4,7 +4,7 @@ config VIDEO_SUN8I_DEINTERLACE depends on V4L_MEM2MEM_DRIVERS depends on VIDEO_DEV depends on ARCH_SUNXI || COMPILE_TEST - depends on COMMON_CLK && OF + depends on COMMON_CLK && RESET_CONTROLLER && OF depends on PM select VIDEOBUF2_DMA_CONTIG select V4L2_MEM2MEM_DEV -- GitLab From b9273150b8b7f8b02ac961463057191d243f953d Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Thu, 25 Aug 2022 12:20:34 +0200 Subject: [PATCH 0609/2223] media: sun8i-rotate: Add a Kconfig dependency on RESET_CONTROLLER The driver relies on the reset controller API to work, so add RESET_CONTROLLER as one of its Kconfig dependencies. 
Fixes: d77182ada3d4 ("media: sun8i: Add Allwinner A83T Rotate driver") Signed-off-by: Paul Kocialkowski Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/sunxi/sun8i-rotate/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/sunxi/sun8i-rotate/Kconfig b/drivers/media/platform/sunxi/sun8i-rotate/Kconfig index cfba29072d752..ee2c1f248c646 100644 --- a/drivers/media/platform/sunxi/sun8i-rotate/Kconfig +++ b/drivers/media/platform/sunxi/sun8i-rotate/Kconfig @@ -5,7 +5,7 @@ config VIDEO_SUN8I_ROTATE depends on V4L_MEM2MEM_DRIVERS depends on VIDEO_DEV depends on ARCH_SUNXI || COMPILE_TEST - depends on COMMON_CLK && OF + depends on COMMON_CLK && RESET_CONTROLLER && OF depends on PM select VIDEOBUF2_DMA_CONTIG select V4L2_MEM2MEM_DEV -- GitLab From 26686b0da9f3fd042578c1093862c853f8e4ff1b Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Thu, 25 Aug 2022 12:20:35 +0200 Subject: [PATCH 0610/2223] media: cedrus: Add a Kconfig dependency on RESET_CONTROLLER The driver relies on the reset controller API to work, so add RESET_CONTROLLER as one of its Kconfig dependencies. 
Fixes: 50e761516f2b ("media: platform: Add Cedrus VPU decoder driver") Signed-off-by: Paul Kocialkowski Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/sunxi/cedrus/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/media/sunxi/cedrus/Kconfig b/drivers/staging/media/sunxi/cedrus/Kconfig index 21c13f9b6e333..621944f9907a6 100644 --- a/drivers/staging/media/sunxi/cedrus/Kconfig +++ b/drivers/staging/media/sunxi/cedrus/Kconfig @@ -2,6 +2,7 @@ config VIDEO_SUNXI_CEDRUS tristate "Allwinner Cedrus VPU driver" depends on VIDEO_DEV + depends on RESET_CONTROLLER depends on HAS_DMA depends on OF select MEDIA_CONTROLLER -- GitLab From 6f4d0849be9beefc457a8ec95818ff36309d273a Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Fri, 26 Aug 2022 20:31:58 +0200 Subject: [PATCH 0611/2223] media: sun6i-csi: Define and use driver name and (reworked) description Add proper defines for driver name and description instead of MODULE_NAME and hardcoding (cosmetics). Also rework the description while at it to mention the hardware generation that the driver supports and remove the video capture mentions since it applies to the whole media device. 
Signed-off-by: Paul Kocialkowski Reviewed-by: Maxime Ripard Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c | 14 ++++++-------- drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h | 3 +++ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c index a971587dbbd1d..5ca05f348021a 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c @@ -27,8 +27,6 @@ #include "sun6i_csi.h" #include "sun6i_csi_reg.h" -#define MODULE_NAME "sun6i-csi" - struct sun6i_csi_dev { struct sun6i_csi csi; struct device *dev; @@ -730,7 +728,7 @@ static int sun6i_csi_v4l2_init(struct sun6i_csi *csi) int ret; csi->media_dev.dev = csi->dev; - strscpy(csi->media_dev.model, "Allwinner Video Capture Device", + strscpy(csi->media_dev.model, SUN6I_CSI_DESCRIPTION, sizeof(csi->media_dev.model)); csi->media_dev.hw_revision = 0; @@ -753,7 +751,7 @@ static int sun6i_csi_v4l2_init(struct sun6i_csi *csi) goto free_ctrl; } - ret = sun6i_video_init(&csi->video, csi, "sun6i-csi"); + ret = sun6i_video_init(&csi->video, csi, SUN6I_CSI_NAME); if (ret) goto unreg_v4l2; @@ -868,8 +866,8 @@ static int sun6i_csi_resource_request(struct sun6i_csi_dev *sdev, if (irq < 0) return -ENXIO; - ret = devm_request_irq(&pdev->dev, irq, sun6i_csi_isr, 0, MODULE_NAME, - sdev); + ret = devm_request_irq(&pdev->dev, irq, sun6i_csi_isr, 0, + SUN6I_CSI_NAME, sdev); if (ret) { dev_err(&pdev->dev, "Cannot request csi IRQ\n"); return ret; @@ -922,12 +920,12 @@ static struct platform_driver sun6i_csi_platform_driver = { .probe = sun6i_csi_probe, .remove = sun6i_csi_remove, .driver = { - .name = MODULE_NAME, + .name = SUN6I_CSI_NAME, .of_match_table = of_match_ptr(sun6i_csi_of_match), }, }; module_platform_driver(sun6i_csi_platform_driver); -MODULE_DESCRIPTION("Allwinner V3s Camera Sensor Interface 
driver"); +MODULE_DESCRIPTION("Allwinner A31 Camera Sensor Interface driver"); MODULE_AUTHOR("Yong Deng "); MODULE_LICENSE("GPL"); diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h index 3a38d107ae3ff..e04f3c3fa27b7 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h @@ -14,6 +14,9 @@ #include "sun6i_video.h" +#define SUN6I_CSI_NAME "sun6i-csi" +#define SUN6I_CSI_DESCRIPTION "Allwinner A31 CSI Device" + struct sun6i_csi; /** -- GitLab From 0b11253f36e8d6697e3eea81a9a87a704edfcf65 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Wed, 7 Sep 2022 12:00:38 +0200 Subject: [PATCH 0612/2223] media: sun6i-csi: Refactor main driver data structures Merge contents of structs sun6i_csi and sun6i_csi_dev into a main sun6i_csi_device structure holding a sun6i_csi_v4l2 struct for things related to v4l2, as well as the already-existing sun6i_csi_video and sun6i_csi_config which are left unchanged. This mostly simplifies accessing stuff by having a single main structure accessible to every part of the code instead of a private definition. Also solve some kerneldoc warnings by describing return codes while at it. No functional change is intended in this commit, variables are just moved around (cosmetics). 
Signed-off-by: Paul Kocialkowski Reviewed-by: Maxime Ripard Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../platform/sunxi/sun6i-csi/sun6i_csi.c | 346 +++++++++--------- .../platform/sunxi/sun6i-csi/sun6i_csi.h | 50 ++- .../platform/sunxi/sun6i-csi/sun6i_video.c | 52 +-- .../platform/sunxi/sun6i-csi/sun6i_video.h | 8 +- 4 files changed, 229 insertions(+), 227 deletions(-) diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c index 5ca05f348021a..0e2b4d38e81cf 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c @@ -27,37 +27,20 @@ #include "sun6i_csi.h" #include "sun6i_csi_reg.h" -struct sun6i_csi_dev { - struct sun6i_csi csi; - struct device *dev; - - struct regmap *regmap; - struct clk *clk_mod; - struct clk *clk_ram; - struct reset_control *rstc_bus; - - int planar_offset[3]; -}; - -static inline struct sun6i_csi_dev *sun6i_csi_to_dev(struct sun6i_csi *csi) -{ - return container_of(csi, struct sun6i_csi_dev, csi); -} - /* TODO add 10&12 bit YUV, RGB support */ -bool sun6i_csi_is_format_supported(struct sun6i_csi *csi, +bool sun6i_csi_is_format_supported(struct sun6i_csi_device *csi_dev, u32 pixformat, u32 mbus_code) { - struct sun6i_csi_dev *sdev = sun6i_csi_to_dev(csi); + struct sun6i_csi_v4l2 *v4l2 = &csi_dev->v4l2; /* * Some video receivers have the ability to be compatible with * 8bit and 16bit bus width. * Identify the media bus format from device tree. 
*/ - if ((sdev->csi.v4l2_ep.bus_type == V4L2_MBUS_PARALLEL - || sdev->csi.v4l2_ep.bus_type == V4L2_MBUS_BT656) - && sdev->csi.v4l2_ep.bus.parallel.bus_width == 16) { + if ((v4l2->v4l2_ep.bus_type == V4L2_MBUS_PARALLEL + || v4l2->v4l2_ep.bus_type == V4L2_MBUS_BT656) + && v4l2->v4l2_ep.bus.parallel.bus_width == 16) { switch (pixformat) { case V4L2_PIX_FMT_NV12_16L16: case V4L2_PIX_FMT_NV12: @@ -74,13 +57,14 @@ bool sun6i_csi_is_format_supported(struct sun6i_csi *csi, case MEDIA_BUS_FMT_YVYU8_1X16: return true; default: - dev_dbg(sdev->dev, "Unsupported mbus code: 0x%x\n", + dev_dbg(csi_dev->dev, + "Unsupported mbus code: 0x%x\n", mbus_code); break; } break; default: - dev_dbg(sdev->dev, "Unsupported pixformat: 0x%x\n", + dev_dbg(csi_dev->dev, "Unsupported pixformat: 0x%x\n", pixformat); break; } @@ -137,7 +121,7 @@ bool sun6i_csi_is_format_supported(struct sun6i_csi *csi, case MEDIA_BUS_FMT_YVYU8_2X8: return true; default: - dev_dbg(sdev->dev, "Unsupported mbus code: 0x%x\n", + dev_dbg(csi_dev->dev, "Unsupported mbus code: 0x%x\n", mbus_code); break; } @@ -152,50 +136,50 @@ bool sun6i_csi_is_format_supported(struct sun6i_csi *csi, return (mbus_code == MEDIA_BUS_FMT_JPEG_1X8); default: - dev_dbg(sdev->dev, "Unsupported pixformat: 0x%x\n", pixformat); + dev_dbg(csi_dev->dev, "Unsupported pixformat: 0x%x\n", + pixformat); break; } return false; } -int sun6i_csi_set_power(struct sun6i_csi *csi, bool enable) +int sun6i_csi_set_power(struct sun6i_csi_device *csi_dev, bool enable) { - struct sun6i_csi_dev *sdev = sun6i_csi_to_dev(csi); - struct device *dev = sdev->dev; - struct regmap *regmap = sdev->regmap; + struct device *dev = csi_dev->dev; + struct regmap *regmap = csi_dev->regmap; int ret; if (!enable) { regmap_update_bits(regmap, CSI_EN_REG, CSI_EN_CSI_EN, 0); - clk_disable_unprepare(sdev->clk_ram); + clk_disable_unprepare(csi_dev->clk_ram); if (of_device_is_compatible(dev->of_node, "allwinner,sun50i-a64-csi")) - clk_rate_exclusive_put(sdev->clk_mod); - 
clk_disable_unprepare(sdev->clk_mod); - reset_control_assert(sdev->rstc_bus); + clk_rate_exclusive_put(csi_dev->clk_mod); + clk_disable_unprepare(csi_dev->clk_mod); + reset_control_assert(csi_dev->reset); return 0; } - ret = clk_prepare_enable(sdev->clk_mod); + ret = clk_prepare_enable(csi_dev->clk_mod); if (ret) { - dev_err(sdev->dev, "Enable csi clk err %d\n", ret); + dev_err(csi_dev->dev, "Enable csi clk err %d\n", ret); return ret; } if (of_device_is_compatible(dev->of_node, "allwinner,sun50i-a64-csi")) - clk_set_rate_exclusive(sdev->clk_mod, 300000000); + clk_set_rate_exclusive(csi_dev->clk_mod, 300000000); - ret = clk_prepare_enable(sdev->clk_ram); + ret = clk_prepare_enable(csi_dev->clk_ram); if (ret) { - dev_err(sdev->dev, "Enable clk_dram_csi clk err %d\n", ret); + dev_err(csi_dev->dev, "Enable clk_dram_csi clk err %d\n", ret); goto clk_mod_disable; } - ret = reset_control_deassert(sdev->rstc_bus); + ret = reset_control_deassert(csi_dev->reset); if (ret) { - dev_err(sdev->dev, "reset err %d\n", ret); + dev_err(csi_dev->dev, "reset err %d\n", ret); goto clk_ram_disable; } @@ -204,15 +188,15 @@ int sun6i_csi_set_power(struct sun6i_csi *csi, bool enable) return 0; clk_ram_disable: - clk_disable_unprepare(sdev->clk_ram); + clk_disable_unprepare(csi_dev->clk_ram); clk_mod_disable: if (of_device_is_compatible(dev->of_node, "allwinner,sun50i-a64-csi")) - clk_rate_exclusive_put(sdev->clk_mod); - clk_disable_unprepare(sdev->clk_mod); + clk_rate_exclusive_put(csi_dev->clk_mod); + clk_disable_unprepare(csi_dev->clk_mod); return ret; } -static enum csi_input_fmt get_csi_input_format(struct sun6i_csi_dev *sdev, +static enum csi_input_fmt get_csi_input_format(struct sun6i_csi_device *csi_dev, u32 mbus_code, u32 pixformat) { /* non-YUV */ @@ -230,12 +214,13 @@ static enum csi_input_fmt get_csi_input_format(struct sun6i_csi_dev *sdev, } /* not support YUV420 input format yet */ - dev_dbg(sdev->dev, "Select YUV422 as default input format of CSI.\n"); + 
dev_dbg(csi_dev->dev, "Select YUV422 as default input format of CSI.\n"); return CSI_INPUT_FORMAT_YUV422; } -static enum csi_output_fmt get_csi_output_format(struct sun6i_csi_dev *sdev, - u32 pixformat, u32 field) +static enum csi_output_fmt +get_csi_output_format(struct sun6i_csi_device *csi_dev, u32 pixformat, + u32 field) { bool buf_interlaced = false; @@ -294,14 +279,14 @@ static enum csi_output_fmt get_csi_output_format(struct sun6i_csi_dev *sdev, return buf_interlaced ? CSI_FRAME_RAW_8 : CSI_FIELD_RAW_8; default: - dev_warn(sdev->dev, "Unsupported pixformat: 0x%x\n", pixformat); + dev_warn(csi_dev->dev, "Unsupported pixformat: 0x%x\n", pixformat); break; } return CSI_FIELD_RAW_8; } -static enum csi_input_seq get_csi_input_seq(struct sun6i_csi_dev *sdev, +static enum csi_input_seq get_csi_input_seq(struct sun6i_csi_device *csi_dev, u32 mbus_code, u32 pixformat) { /* Input sequence does not apply to non-YUV formats */ @@ -328,7 +313,7 @@ static enum csi_input_seq get_csi_input_seq(struct sun6i_csi_dev *sdev, case MEDIA_BUS_FMT_YVYU8_2X8: return CSI_INPUT_SEQ_YVYU; default: - dev_warn(sdev->dev, "Unsupported mbus code: 0x%x\n", + dev_warn(csi_dev->dev, "Unsupported mbus code: 0x%x\n", mbus_code); break; } @@ -350,7 +335,7 @@ static enum csi_input_seq get_csi_input_seq(struct sun6i_csi_dev *sdev, case MEDIA_BUS_FMT_YVYU8_2X8: return CSI_INPUT_SEQ_YUYV; default: - dev_warn(sdev->dev, "Unsupported mbus code: 0x%x\n", + dev_warn(csi_dev->dev, "Unsupported mbus code: 0x%x\n", mbus_code); break; } @@ -360,7 +345,7 @@ static enum csi_input_seq get_csi_input_seq(struct sun6i_csi_dev *sdev, return CSI_INPUT_SEQ_YUYV; default: - dev_warn(sdev->dev, "Unsupported pixformat: 0x%x, defaulting to YUYV\n", + dev_warn(csi_dev->dev, "Unsupported pixformat: 0x%x, defaulting to YUYV\n", pixformat); break; } @@ -368,23 +353,23 @@ static enum csi_input_seq get_csi_input_seq(struct sun6i_csi_dev *sdev, return CSI_INPUT_SEQ_YUYV; } -static void sun6i_csi_setup_bus(struct sun6i_csi_dev 
*sdev) +static void sun6i_csi_setup_bus(struct sun6i_csi_device *csi_dev) { - struct v4l2_fwnode_endpoint *endpoint = &sdev->csi.v4l2_ep; - struct sun6i_csi *csi = &sdev->csi; + struct v4l2_fwnode_endpoint *endpoint = &csi_dev->v4l2.v4l2_ep; + struct sun6i_csi_config *config = &csi_dev->config; unsigned char bus_width; u32 flags; u32 cfg; bool input_interlaced = false; - if (csi->config.field == V4L2_FIELD_INTERLACED - || csi->config.field == V4L2_FIELD_INTERLACED_TB - || csi->config.field == V4L2_FIELD_INTERLACED_BT) + if (config->field == V4L2_FIELD_INTERLACED + || config->field == V4L2_FIELD_INTERLACED_TB + || config->field == V4L2_FIELD_INTERLACED_BT) input_interlaced = true; bus_width = endpoint->bus.parallel.bus_width; - regmap_read(sdev->regmap, CSI_IF_CFG_REG, &cfg); + regmap_read(csi_dev->regmap, CSI_IF_CFG_REG, &cfg); cfg &= ~(CSI_IF_CFG_CSI_IF_MASK | CSI_IF_CFG_MIPI_IF_MASK | CSI_IF_CFG_IF_DATA_WIDTH_MASK | @@ -432,7 +417,7 @@ static void sun6i_csi_setup_bus(struct sun6i_csi_dev *sdev) cfg |= CSI_IF_CFG_CLK_POL_FALLING_EDGE; break; default: - dev_warn(sdev->dev, "Unsupported bus type: %d\n", + dev_warn(csi_dev->dev, "Unsupported bus type: %d\n", endpoint->bus_type); break; } @@ -450,54 +435,54 @@ static void sun6i_csi_setup_bus(struct sun6i_csi_dev *sdev) case 16: /* No need to configure DATA_WIDTH for 16bit */ break; default: - dev_warn(sdev->dev, "Unsupported bus width: %u\n", bus_width); + dev_warn(csi_dev->dev, "Unsupported bus width: %u\n", bus_width); break; } - regmap_write(sdev->regmap, CSI_IF_CFG_REG, cfg); + regmap_write(csi_dev->regmap, CSI_IF_CFG_REG, cfg); } -static void sun6i_csi_set_format(struct sun6i_csi_dev *sdev) +static void sun6i_csi_set_format(struct sun6i_csi_device *csi_dev) { - struct sun6i_csi *csi = &sdev->csi; + struct sun6i_csi_config *config = &csi_dev->config; u32 cfg; u32 val; - regmap_read(sdev->regmap, CSI_CH_CFG_REG, &cfg); + regmap_read(csi_dev->regmap, CSI_CH_CFG_REG, &cfg); cfg &= ~(CSI_CH_CFG_INPUT_FMT_MASK | 
CSI_CH_CFG_OUTPUT_FMT_MASK | CSI_CH_CFG_VFLIP_EN | CSI_CH_CFG_HFLIP_EN | CSI_CH_CFG_FIELD_SEL_MASK | CSI_CH_CFG_INPUT_SEQ_MASK); - val = get_csi_input_format(sdev, csi->config.code, - csi->config.pixelformat); + val = get_csi_input_format(csi_dev, config->code, + config->pixelformat); cfg |= CSI_CH_CFG_INPUT_FMT(val); - val = get_csi_output_format(sdev, csi->config.pixelformat, - csi->config.field); + val = get_csi_output_format(csi_dev, config->pixelformat, + config->field); cfg |= CSI_CH_CFG_OUTPUT_FMT(val); - val = get_csi_input_seq(sdev, csi->config.code, - csi->config.pixelformat); + val = get_csi_input_seq(csi_dev, config->code, + config->pixelformat); cfg |= CSI_CH_CFG_INPUT_SEQ(val); - if (csi->config.field == V4L2_FIELD_TOP) + if (config->field == V4L2_FIELD_TOP) cfg |= CSI_CH_CFG_FIELD_SEL_FIELD0; - else if (csi->config.field == V4L2_FIELD_BOTTOM) + else if (config->field == V4L2_FIELD_BOTTOM) cfg |= CSI_CH_CFG_FIELD_SEL_FIELD1; else cfg |= CSI_CH_CFG_FIELD_SEL_BOTH; - regmap_write(sdev->regmap, CSI_CH_CFG_REG, cfg); + regmap_write(csi_dev->regmap, CSI_CH_CFG_REG, cfg); } -static void sun6i_csi_set_window(struct sun6i_csi_dev *sdev) +static void sun6i_csi_set_window(struct sun6i_csi_device *csi_dev) { - struct sun6i_csi_config *config = &sdev->csi.config; + struct sun6i_csi_config *config = &csi_dev->config; u32 bytesperline_y; u32 bytesperline_c; - int *planar_offset = sdev->planar_offset; + int *planar_offset = csi_dev->planar_offset; u32 width = config->width; u32 height = config->height; u32 hor_len = width; @@ -507,7 +492,7 @@ static void sun6i_csi_set_window(struct sun6i_csi_dev *sdev) case V4L2_PIX_FMT_YVYU: case V4L2_PIX_FMT_UYVY: case V4L2_PIX_FMT_VYUY: - dev_dbg(sdev->dev, + dev_dbg(csi_dev->dev, "Horizontal length should be 2 times of width for packed YUV formats!\n"); hor_len = width * 2; break; @@ -515,10 +500,10 @@ static void sun6i_csi_set_window(struct sun6i_csi_dev *sdev) break; } - regmap_write(sdev->regmap, CSI_CH_HSIZE_REG, + 
regmap_write(csi_dev->regmap, CSI_CH_HSIZE_REG, CSI_CH_HSIZE_HOR_LEN(hor_len) | CSI_CH_HSIZE_HOR_START(0)); - regmap_write(sdev->regmap, CSI_CH_VSIZE_REG, + regmap_write(csi_dev->regmap, CSI_CH_VSIZE_REG, CSI_CH_VSIZE_VER_LEN(height) | CSI_CH_VSIZE_VER_START(0)); @@ -550,7 +535,7 @@ static void sun6i_csi_set_window(struct sun6i_csi_dev *sdev) bytesperline_c * height; break; default: /* raw */ - dev_dbg(sdev->dev, + dev_dbg(csi_dev->dev, "Calculating pixelformat(0x%x)'s bytesperline as a packed format\n", config->pixelformat); bytesperline_y = (sun6i_csi_get_bpp(config->pixelformat) * @@ -561,46 +546,42 @@ static void sun6i_csi_set_window(struct sun6i_csi_dev *sdev) break; } - regmap_write(sdev->regmap, CSI_CH_BUF_LEN_REG, + regmap_write(csi_dev->regmap, CSI_CH_BUF_LEN_REG, CSI_CH_BUF_LEN_BUF_LEN_C(bytesperline_c) | CSI_CH_BUF_LEN_BUF_LEN_Y(bytesperline_y)); } -int sun6i_csi_update_config(struct sun6i_csi *csi, +int sun6i_csi_update_config(struct sun6i_csi_device *csi_dev, struct sun6i_csi_config *config) { - struct sun6i_csi_dev *sdev = sun6i_csi_to_dev(csi); - if (!config) return -EINVAL; - memcpy(&csi->config, config, sizeof(csi->config)); + memcpy(&csi_dev->config, config, sizeof(csi_dev->config)); - sun6i_csi_setup_bus(sdev); - sun6i_csi_set_format(sdev); - sun6i_csi_set_window(sdev); + sun6i_csi_setup_bus(csi_dev); + sun6i_csi_set_format(csi_dev); + sun6i_csi_set_window(csi_dev); return 0; } -void sun6i_csi_update_buf_addr(struct sun6i_csi *csi, dma_addr_t addr) +void sun6i_csi_update_buf_addr(struct sun6i_csi_device *csi_dev, + dma_addr_t addr) { - struct sun6i_csi_dev *sdev = sun6i_csi_to_dev(csi); - - regmap_write(sdev->regmap, CSI_CH_F0_BUFA_REG, - (addr + sdev->planar_offset[0]) >> 2); - if (sdev->planar_offset[1] != -1) - regmap_write(sdev->regmap, CSI_CH_F1_BUFA_REG, - (addr + sdev->planar_offset[1]) >> 2); - if (sdev->planar_offset[2] != -1) - regmap_write(sdev->regmap, CSI_CH_F2_BUFA_REG, - (addr + sdev->planar_offset[2]) >> 2); + 
regmap_write(csi_dev->regmap, CSI_CH_F0_BUFA_REG, + (addr + csi_dev->planar_offset[0]) >> 2); + if (csi_dev->planar_offset[1] != -1) + regmap_write(csi_dev->regmap, CSI_CH_F1_BUFA_REG, + (addr + csi_dev->planar_offset[1]) >> 2); + if (csi_dev->planar_offset[2] != -1) + regmap_write(csi_dev->regmap, CSI_CH_F2_BUFA_REG, + (addr + csi_dev->planar_offset[2]) >> 2); } -void sun6i_csi_set_stream(struct sun6i_csi *csi, bool enable) +void sun6i_csi_set_stream(struct sun6i_csi_device *csi_dev, bool enable) { - struct sun6i_csi_dev *sdev = sun6i_csi_to_dev(csi); - struct regmap *regmap = sdev->regmap; + struct regmap *regmap = csi_dev->regmap; if (!enable) { regmap_update_bits(regmap, CSI_CAP_REG, CSI_CAP_CH0_VCAP_ON, 0); @@ -624,7 +605,7 @@ void sun6i_csi_set_stream(struct sun6i_csi *csi, bool enable) /* ----------------------------------------------------------------------------- * Media Controller and V4L2 */ -static int sun6i_csi_link_entity(struct sun6i_csi *csi, +static int sun6i_csi_link_entity(struct sun6i_csi_device *csi_dev, struct media_entity *entity, struct fwnode_handle *fwnode) { @@ -635,24 +616,25 @@ static int sun6i_csi_link_entity(struct sun6i_csi *csi, ret = media_entity_get_fwnode_pad(entity, fwnode, MEDIA_PAD_FL_SOURCE); if (ret < 0) { - dev_err(csi->dev, "%s: no source pad in external entity %s\n", - __func__, entity->name); + dev_err(csi_dev->dev, + "%s: no source pad in external entity %s\n", __func__, + entity->name); return -EINVAL; } src_pad_index = ret; - sink = &csi->video.vdev.entity; - sink_pad = &csi->video.pad; + sink = &csi_dev->video.vdev.entity; + sink_pad = &csi_dev->video.pad; - dev_dbg(csi->dev, "creating %s:%u -> %s:%u link\n", + dev_dbg(csi_dev->dev, "creating %s:%u -> %s:%u link\n", entity->name, src_pad_index, sink->name, sink_pad->index); ret = media_create_pad_link(entity, src_pad_index, sink, sink_pad->index, MEDIA_LNK_FL_ENABLED | MEDIA_LNK_FL_IMMUTABLE); if (ret < 0) { - dev_err(csi->dev, "failed to create %s:%u -> %s:%u 
link\n", + dev_err(csi_dev->dev, "failed to create %s:%u -> %s:%u link\n", entity->name, src_pad_index, sink->name, sink_pad->index); return ret; @@ -663,27 +645,29 @@ static int sun6i_csi_link_entity(struct sun6i_csi *csi, static int sun6i_subdev_notify_complete(struct v4l2_async_notifier *notifier) { - struct sun6i_csi *csi = container_of(notifier, struct sun6i_csi, - notifier); - struct v4l2_device *v4l2_dev = &csi->v4l2_dev; + struct sun6i_csi_device *csi_dev = + container_of(notifier, struct sun6i_csi_device, + v4l2.notifier); + struct sun6i_csi_v4l2 *v4l2 = &csi_dev->v4l2; + struct v4l2_device *v4l2_dev = &v4l2->v4l2_dev; struct v4l2_subdev *sd; int ret; - dev_dbg(csi->dev, "notify complete, all subdevs registered\n"); + dev_dbg(csi_dev->dev, "notify complete, all subdevs registered\n"); sd = list_first_entry(&v4l2_dev->subdevs, struct v4l2_subdev, list); if (!sd) return -EINVAL; - ret = sun6i_csi_link_entity(csi, &sd->entity, sd->fwnode); + ret = sun6i_csi_link_entity(csi_dev, &sd->entity, sd->fwnode); if (ret < 0) return ret; - ret = v4l2_device_register_subdev_nodes(&csi->v4l2_dev); + ret = v4l2_device_register_subdev_nodes(v4l2_dev); if (ret < 0) return ret; - return media_device_register(&csi->media_dev); + return media_device_register(&v4l2->media_dev); } static const struct v4l2_async_notifier_operations sun6i_csi_async_ops = { @@ -694,7 +678,7 @@ static int sun6i_csi_fwnode_parse(struct device *dev, struct v4l2_fwnode_endpoint *vep, struct v4l2_async_subdev *asd) { - struct sun6i_csi *csi = dev_get_drvdata(dev); + struct sun6i_csi_device *csi_dev = dev_get_drvdata(dev); if (vep->base.port || vep->base.id) { dev_warn(dev, "Only support a single port with one endpoint\n"); @@ -704,7 +688,7 @@ static int sun6i_csi_fwnode_parse(struct device *dev, switch (vep->bus_type) { case V4L2_MBUS_PARALLEL: case V4L2_MBUS_BT656: - csi->v4l2_ep = *vep; + csi_dev->v4l2.v4l2_ep = *vep; return 0; default: dev_err(dev, "Unsupported media bus type\n"); @@ -712,76 +696,79 
@@ static int sun6i_csi_fwnode_parse(struct device *dev, } } -static void sun6i_csi_v4l2_cleanup(struct sun6i_csi *csi) +static void sun6i_csi_v4l2_cleanup(struct sun6i_csi_device *csi_dev) { - media_device_unregister(&csi->media_dev); - v4l2_async_nf_unregister(&csi->notifier); - v4l2_async_nf_cleanup(&csi->notifier); - sun6i_video_cleanup(&csi->video); - v4l2_device_unregister(&csi->v4l2_dev); - v4l2_ctrl_handler_free(&csi->ctrl_handler); - media_device_cleanup(&csi->media_dev); + struct sun6i_csi_v4l2 *v4l2 = &csi_dev->v4l2; + + media_device_unregister(&v4l2->media_dev); + v4l2_async_nf_unregister(&v4l2->notifier); + v4l2_async_nf_cleanup(&v4l2->notifier); + sun6i_video_cleanup(&csi_dev->video); + v4l2_device_unregister(&v4l2->v4l2_dev); + v4l2_ctrl_handler_free(&v4l2->ctrl_handler); + media_device_cleanup(&v4l2->media_dev); } -static int sun6i_csi_v4l2_init(struct sun6i_csi *csi) +static int sun6i_csi_v4l2_init(struct sun6i_csi_device *csi_dev) { + struct sun6i_csi_v4l2 *v4l2 = &csi_dev->v4l2; int ret; - csi->media_dev.dev = csi->dev; - strscpy(csi->media_dev.model, SUN6I_CSI_DESCRIPTION, - sizeof(csi->media_dev.model)); - csi->media_dev.hw_revision = 0; + v4l2->media_dev.dev = csi_dev->dev; + strscpy(v4l2->media_dev.model, SUN6I_CSI_DESCRIPTION, + sizeof(v4l2->media_dev.model)); + v4l2->media_dev.hw_revision = 0; - media_device_init(&csi->media_dev); - v4l2_async_nf_init(&csi->notifier); + media_device_init(&v4l2->media_dev); + v4l2_async_nf_init(&v4l2->notifier); - ret = v4l2_ctrl_handler_init(&csi->ctrl_handler, 0); + ret = v4l2_ctrl_handler_init(&v4l2->ctrl_handler, 0); if (ret) { - dev_err(csi->dev, "V4L2 controls handler init failed (%d)\n", + dev_err(csi_dev->dev, "V4L2 controls handler init failed (%d)\n", ret); goto clean_media; } - csi->v4l2_dev.mdev = &csi->media_dev; - csi->v4l2_dev.ctrl_handler = &csi->ctrl_handler; - ret = v4l2_device_register(csi->dev, &csi->v4l2_dev); + v4l2->v4l2_dev.mdev = &v4l2->media_dev; + v4l2->v4l2_dev.ctrl_handler = 
&v4l2->ctrl_handler; + ret = v4l2_device_register(csi_dev->dev, &v4l2->v4l2_dev); if (ret) { - dev_err(csi->dev, "V4L2 device registration failed (%d)\n", + dev_err(csi_dev->dev, "V4L2 device registration failed (%d)\n", ret); goto free_ctrl; } - ret = sun6i_video_init(&csi->video, csi, SUN6I_CSI_NAME); + ret = sun6i_video_init(&csi_dev->video, csi_dev, SUN6I_CSI_NAME); if (ret) goto unreg_v4l2; - ret = v4l2_async_nf_parse_fwnode_endpoints(csi->dev, - &csi->notifier, + ret = v4l2_async_nf_parse_fwnode_endpoints(csi_dev->dev, + &v4l2->notifier, sizeof(struct v4l2_async_subdev), sun6i_csi_fwnode_parse); if (ret) goto clean_video; - csi->notifier.ops = &sun6i_csi_async_ops; + v4l2->notifier.ops = &sun6i_csi_async_ops; - ret = v4l2_async_nf_register(&csi->v4l2_dev, &csi->notifier); + ret = v4l2_async_nf_register(&v4l2->v4l2_dev, &v4l2->notifier); if (ret) { - dev_err(csi->dev, "notifier registration failed\n"); + dev_err(csi_dev->dev, "notifier registration failed\n"); goto clean_video; } return 0; clean_video: - sun6i_video_cleanup(&csi->video); + sun6i_video_cleanup(&csi_dev->video); unreg_v4l2: - v4l2_device_unregister(&csi->v4l2_dev); + v4l2_device_unregister(&v4l2->v4l2_dev); free_ctrl: - v4l2_ctrl_handler_free(&csi->ctrl_handler); + v4l2_ctrl_handler_free(&v4l2->ctrl_handler); clean_media: - v4l2_async_nf_cleanup(&csi->notifier); - media_device_cleanup(&csi->media_dev); + v4l2_async_nf_cleanup(&v4l2->notifier); + media_device_cleanup(&v4l2->media_dev); return ret; } @@ -791,8 +778,8 @@ clean_media: */ static irqreturn_t sun6i_csi_isr(int irq, void *dev_id) { - struct sun6i_csi_dev *sdev = (struct sun6i_csi_dev *)dev_id; - struct regmap *regmap = sdev->regmap; + struct sun6i_csi_device *csi_dev = (struct sun6i_csi_device *)dev_id; + struct regmap *regmap = csi_dev->regmap; u32 status; regmap_read(regmap, CSI_CH_INT_STA_REG, &status); @@ -812,7 +799,7 @@ static irqreturn_t sun6i_csi_isr(int irq, void *dev_id) } if (status & CSI_CH_INT_STA_FD_PD) - 
sun6i_video_frame_done(&sdev->csi.video); + sun6i_video_frame_done(&csi_dev->video); regmap_write(regmap, CSI_CH_INT_STA_REG, status); @@ -826,7 +813,7 @@ static const struct regmap_config sun6i_csi_regmap_config = { .max_register = 0x9c, }; -static int sun6i_csi_resource_request(struct sun6i_csi_dev *sdev, +static int sun6i_csi_resource_request(struct sun6i_csi_device *csi_dev, struct platform_device *pdev) { void __iomem *io_base; @@ -837,29 +824,29 @@ static int sun6i_csi_resource_request(struct sun6i_csi_dev *sdev, if (IS_ERR(io_base)) return PTR_ERR(io_base); - sdev->regmap = devm_regmap_init_mmio_clk(&pdev->dev, "bus", io_base, - &sun6i_csi_regmap_config); - if (IS_ERR(sdev->regmap)) { + csi_dev->regmap = devm_regmap_init_mmio_clk(&pdev->dev, "bus", io_base, + &sun6i_csi_regmap_config); + if (IS_ERR(csi_dev->regmap)) { dev_err(&pdev->dev, "Failed to init register map\n"); - return PTR_ERR(sdev->regmap); + return PTR_ERR(csi_dev->regmap); } - sdev->clk_mod = devm_clk_get(&pdev->dev, "mod"); - if (IS_ERR(sdev->clk_mod)) { + csi_dev->clk_mod = devm_clk_get(&pdev->dev, "mod"); + if (IS_ERR(csi_dev->clk_mod)) { dev_err(&pdev->dev, "Unable to acquire csi clock\n"); - return PTR_ERR(sdev->clk_mod); + return PTR_ERR(csi_dev->clk_mod); } - sdev->clk_ram = devm_clk_get(&pdev->dev, "ram"); - if (IS_ERR(sdev->clk_ram)) { + csi_dev->clk_ram = devm_clk_get(&pdev->dev, "ram"); + if (IS_ERR(csi_dev->clk_ram)) { dev_err(&pdev->dev, "Unable to acquire dram-csi clock\n"); - return PTR_ERR(sdev->clk_ram); + return PTR_ERR(csi_dev->clk_ram); } - sdev->rstc_bus = devm_reset_control_get_shared(&pdev->dev, NULL); - if (IS_ERR(sdev->rstc_bus)) { + csi_dev->reset = devm_reset_control_get_shared(&pdev->dev, NULL); + if (IS_ERR(csi_dev->reset)) { dev_err(&pdev->dev, "Cannot get reset controller\n"); - return PTR_ERR(sdev->rstc_bus); + return PTR_ERR(csi_dev->reset); } irq = platform_get_irq(pdev, 0); @@ -867,7 +854,7 @@ static int sun6i_csi_resource_request(struct sun6i_csi_dev *sdev, 
return -ENXIO; ret = devm_request_irq(&pdev->dev, irq, sun6i_csi_isr, 0, - SUN6I_CSI_NAME, sdev); + SUN6I_CSI_NAME, csi_dev); if (ret) { dev_err(&pdev->dev, "Cannot request csi IRQ\n"); return ret; @@ -878,30 +865,29 @@ static int sun6i_csi_resource_request(struct sun6i_csi_dev *sdev, static int sun6i_csi_probe(struct platform_device *pdev) { - struct sun6i_csi_dev *sdev; + struct sun6i_csi_device *csi_dev; int ret; - sdev = devm_kzalloc(&pdev->dev, sizeof(*sdev), GFP_KERNEL); - if (!sdev) + csi_dev = devm_kzalloc(&pdev->dev, sizeof(*csi_dev), GFP_KERNEL); + if (!csi_dev) return -ENOMEM; - sdev->dev = &pdev->dev; + csi_dev->dev = &pdev->dev; - ret = sun6i_csi_resource_request(sdev, pdev); + ret = sun6i_csi_resource_request(csi_dev, pdev); if (ret) return ret; - platform_set_drvdata(pdev, sdev); + platform_set_drvdata(pdev, csi_dev); - sdev->csi.dev = &pdev->dev; - return sun6i_csi_v4l2_init(&sdev->csi); + return sun6i_csi_v4l2_init(csi_dev); } static int sun6i_csi_remove(struct platform_device *pdev) { - struct sun6i_csi_dev *sdev = platform_get_drvdata(pdev); + struct sun6i_csi_device *csi_dev = platform_get_drvdata(pdev); - sun6i_csi_v4l2_cleanup(&sdev->csi); + sun6i_csi_v4l2_cleanup(csi_dev); return 0; } diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h index e04f3c3fa27b7..e151f983dbc64 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h @@ -17,8 +17,6 @@ #define SUN6I_CSI_NAME "sun6i-csi" #define SUN6I_CSI_DESCRIPTION "Allwinner A31 CSI Device" -struct sun6i_csi; - /** * struct sun6i_csi_config - configs for sun6i csi * @pixelformat: v4l2 pixel format (V4L2_PIX_FMT_*) @@ -35,59 +33,75 @@ struct sun6i_csi_config { u32 height; }; -struct sun6i_csi { - struct device *dev; - struct v4l2_ctrl_handler ctrl_handler; +struct sun6i_csi_v4l2 { struct v4l2_device v4l2_dev; + struct v4l2_ctrl_handler ctrl_handler; struct media_device 
media_dev; struct v4l2_async_notifier notifier; - /* video port settings */ struct v4l2_fwnode_endpoint v4l2_ep; +}; - struct sun6i_csi_config config; +struct sun6i_csi_device { + struct device *dev; + struct sun6i_csi_config config; + struct sun6i_csi_v4l2 v4l2; struct sun6i_video video; + + struct regmap *regmap; + struct clk *clk_mod; + struct clk *clk_ram; + struct reset_control *reset; + + int planar_offset[3]; }; /** * sun6i_csi_is_format_supported() - check if the format supported by csi - * @csi: pointer to the csi + * @csi_dev: pointer to the csi device * @pixformat: v4l2 pixel format (V4L2_PIX_FMT_*) * @mbus_code: media bus format code (MEDIA_BUS_FMT_*) + * + * Return: true if format is supported, false otherwise. */ -bool sun6i_csi_is_format_supported(struct sun6i_csi *csi, u32 pixformat, - u32 mbus_code); +bool sun6i_csi_is_format_supported(struct sun6i_csi_device *csi_dev, + u32 pixformat, u32 mbus_code); /** * sun6i_csi_set_power() - power on/off the csi - * @csi: pointer to the csi + * @csi_dev: pointer to the csi device * @enable: on/off + * + * Return: 0 if successful, error code otherwise. */ -int sun6i_csi_set_power(struct sun6i_csi *csi, bool enable); +int sun6i_csi_set_power(struct sun6i_csi_device *csi_dev, bool enable); /** * sun6i_csi_update_config() - update the csi register settings - * @csi: pointer to the csi + * @csi_dev: pointer to the csi device * @config: see struct sun6i_csi_config + * + * Return: 0 if successful, error code otherwise. 
*/ -int sun6i_csi_update_config(struct sun6i_csi *csi, +int sun6i_csi_update_config(struct sun6i_csi_device *csi_dev, struct sun6i_csi_config *config); /** * sun6i_csi_update_buf_addr() - update the csi frame buffer address - * @csi: pointer to the csi + * @csi_dev: pointer to the csi device * @addr: frame buffer's physical address */ -void sun6i_csi_update_buf_addr(struct sun6i_csi *csi, dma_addr_t addr); +void sun6i_csi_update_buf_addr(struct sun6i_csi_device *csi_dev, + dma_addr_t addr); /** * sun6i_csi_set_stream() - start/stop csi streaming - * @csi: pointer to the csi + * @csi_dev: pointer to the csi device * @enable: start/stop */ -void sun6i_csi_set_stream(struct sun6i_csi *csi, bool enable); +void sun6i_csi_set_stream(struct sun6i_csi_device *csi_dev, bool enable); /* get bpp form v4l2 pixformat */ static inline int sun6i_csi_get_bpp(unsigned int pixformat) diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c index da4b7f9557a10..1bfe7b3abc91f 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c @@ -162,7 +162,7 @@ static int sun6i_video_start_streaming(struct vb2_queue *vq, unsigned int count) config.width = video->fmt.fmt.pix.width; config.height = video->fmt.fmt.pix.height; - ret = sun6i_csi_update_config(video->csi, &config); + ret = sun6i_csi_update_config(video->csi_dev, &config); if (ret < 0) goto stop_media_pipeline; @@ -171,9 +171,9 @@ static int sun6i_video_start_streaming(struct vb2_queue *vq, unsigned int count) buf = list_first_entry(&video->dma_queue, struct sun6i_csi_buffer, list); buf->queued_to_csi = true; - sun6i_csi_update_buf_addr(video->csi, buf->dma_addr); + sun6i_csi_update_buf_addr(video->csi_dev, buf->dma_addr); - sun6i_csi_set_stream(video->csi, true); + sun6i_csi_set_stream(video->csi_dev, true); /* * CSI will lookup the next dma buffer for next frame before the @@ -194,7 +194,7 @@ static int 
sun6i_video_start_streaming(struct vb2_queue *vq, unsigned int count) */ next_buf = list_next_entry(buf, list); next_buf->queued_to_csi = true; - sun6i_csi_update_buf_addr(video->csi, next_buf->dma_addr); + sun6i_csi_update_buf_addr(video->csi_dev, next_buf->dma_addr); spin_unlock_irqrestore(&video->dma_queue_lock, flags); @@ -205,7 +205,7 @@ static int sun6i_video_start_streaming(struct vb2_queue *vq, unsigned int count) return 0; stop_csi_stream: - sun6i_csi_set_stream(video->csi, false); + sun6i_csi_set_stream(video->csi_dev, false); stop_media_pipeline: video_device_pipeline_stop(&video->vdev); clear_dma_queue: @@ -229,7 +229,7 @@ static void sun6i_video_stop_streaming(struct vb2_queue *vq) if (subdev) v4l2_subdev_call(subdev, video, s_stream, 0); - sun6i_csi_set_stream(video->csi, false); + sun6i_csi_set_stream(video->csi_dev, false); video_device_pipeline_stop(&video->vdev); @@ -266,7 +266,7 @@ void sun6i_video_frame_done(struct sun6i_video *video) buf = list_first_entry(&video->dma_queue, struct sun6i_csi_buffer, list); if (list_is_last(&buf->list, &video->dma_queue)) { - dev_dbg(video->csi->dev, "Frame dropped!\n"); + dev_dbg(video->csi_dev->dev, "Frame dropped!\n"); goto unlock; } @@ -278,8 +278,8 @@ void sun6i_video_frame_done(struct sun6i_video *video) */ if (!next_buf->queued_to_csi) { next_buf->queued_to_csi = true; - sun6i_csi_update_buf_addr(video->csi, next_buf->dma_addr); - dev_dbg(video->csi->dev, "Frame dropped!\n"); + sun6i_csi_update_buf_addr(video->csi_dev, next_buf->dma_addr); + dev_dbg(video->csi_dev->dev, "Frame dropped!\n"); goto unlock; } @@ -293,9 +293,9 @@ void sun6i_video_frame_done(struct sun6i_video *video) if (!list_is_last(&next_buf->list, &video->dma_queue)) { next_buf = list_next_entry(next_buf, list); next_buf->queued_to_csi = true; - sun6i_csi_update_buf_addr(video->csi, next_buf->dma_addr); + sun6i_csi_update_buf_addr(video->csi_dev, next_buf->dma_addr); } else { - dev_dbg(video->csi->dev, "Next frame will be dropped!\n"); + 
dev_dbg(video->csi_dev->dev, "Next frame will be dropped!\n"); } unlock: @@ -321,7 +321,7 @@ static int vidioc_querycap(struct file *file, void *priv, strscpy(cap->driver, "sun6i-video", sizeof(cap->driver)); strscpy(cap->card, video->vdev.name, sizeof(cap->card)); snprintf(cap->bus_info, sizeof(cap->bus_info), "platform:%s", - video->csi->dev->of_node->name); + video->csi_dev->dev->of_node->name); return 0; } @@ -488,7 +488,7 @@ static int sun6i_video_open(struct file *file) if (!v4l2_fh_is_singular_file(file)) goto unlock; - ret = sun6i_csi_set_power(video->csi, true); + ret = sun6i_csi_set_power(video->csi_dev, true); if (ret < 0) goto fh_release; @@ -516,7 +516,7 @@ static int sun6i_video_close(struct file *file) v4l2_pipeline_pm_put(&video->vdev.entity); if (last_fh) - sun6i_csi_set_power(video->csi, false); + sun6i_csi_set_power(video->csi_dev, false); mutex_unlock(&video->lock); @@ -561,7 +561,7 @@ static int sun6i_video_link_validate(struct media_link *link) video->mbus_code = 0; if (!media_pad_remote_pad_first(link->sink->entity->pads)) { - dev_info(video->csi->dev, + dev_info(video->csi_dev->dev, "video node %s pad not connected\n", vdev->name); return -ENOLINK; } @@ -570,10 +570,10 @@ static int sun6i_video_link_validate(struct media_link *link) if (ret < 0) return ret; - if (!sun6i_csi_is_format_supported(video->csi, + if (!sun6i_csi_is_format_supported(video->csi_dev, video->fmt.fmt.pix.pixelformat, source_fmt.format.code)) { - dev_err(video->csi->dev, + dev_err(video->csi_dev->dev, "Unsupported pixformat: 0x%x with mbus code: 0x%x!\n", video->fmt.fmt.pix.pixelformat, source_fmt.format.code); @@ -582,7 +582,7 @@ static int sun6i_video_link_validate(struct media_link *link) if (source_fmt.format.width != video->fmt.fmt.pix.width || source_fmt.format.height != video->fmt.fmt.pix.height) { - dev_err(video->csi->dev, + dev_err(video->csi_dev->dev, "Wrong width or height %ux%u (%ux%u expected)\n", video->fmt.fmt.pix.width, video->fmt.fmt.pix.height, 
source_fmt.format.width, source_fmt.format.height); @@ -598,15 +598,16 @@ static const struct media_entity_operations sun6i_video_media_ops = { .link_validate = sun6i_video_link_validate }; -int sun6i_video_init(struct sun6i_video *video, struct sun6i_csi *csi, - const char *name) +int sun6i_video_init(struct sun6i_video *video, + struct sun6i_csi_device *csi_dev, const char *name) { + struct sun6i_csi_v4l2 *v4l2 = &csi_dev->v4l2; struct video_device *vdev = &video->vdev; struct vb2_queue *vidq = &video->vb2_vidq; struct v4l2_format fmt = { 0 }; int ret; - video->csi = csi; + video->csi_dev = csi_dev; /* Initialize the media entity... */ video->pad.flags = MEDIA_PAD_FL_SINK | MEDIA_PAD_FL_MUST_CONNECT; @@ -641,11 +642,12 @@ int sun6i_video_init(struct sun6i_video *video, struct sun6i_csi *csi, vidq->lock = &video->lock; /* Make sure non-dropped frame */ vidq->min_buffers_needed = 3; - vidq->dev = csi->dev; + vidq->dev = csi_dev->dev; ret = vb2_queue_init(vidq); if (ret) { - v4l2_err(&csi->v4l2_dev, "vb2_queue_init failed: %d\n", ret); + v4l2_err(&v4l2->v4l2_dev, "vb2_queue_init failed: %d\n", + ret); goto clean_entity; } @@ -656,7 +658,7 @@ int sun6i_video_init(struct sun6i_video *video, struct sun6i_csi *csi, vdev->ioctl_ops = &sun6i_video_ioctl_ops; vdev->vfl_type = VFL_TYPE_VIDEO; vdev->vfl_dir = VFL_DIR_RX; - vdev->v4l2_dev = &csi->v4l2_dev; + vdev->v4l2_dev = &v4l2->v4l2_dev; vdev->queue = vidq; vdev->lock = &video->lock; vdev->device_caps = V4L2_CAP_STREAMING | V4L2_CAP_VIDEO_CAPTURE; @@ -664,7 +666,7 @@ int sun6i_video_init(struct sun6i_video *video, struct sun6i_csi *csi, ret = video_register_device(vdev, VFL_TYPE_VIDEO, -1); if (ret < 0) { - v4l2_err(&csi->v4l2_dev, + v4l2_err(&v4l2->v4l2_dev, "video_register_device failed: %d\n", ret); goto clean_entity; } diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.h b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.h index b9cd919c24ac3..30e37ee0d07f2 100644 --- 
a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.h +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.h @@ -11,12 +11,12 @@ #include #include -struct sun6i_csi; +struct sun6i_csi_device; struct sun6i_video { + struct sun6i_csi_device *csi_dev; struct video_device vdev; struct media_pad pad; - struct sun6i_csi *csi; struct mutex lock; @@ -29,8 +29,8 @@ struct sun6i_video { u32 mbus_code; }; -int sun6i_video_init(struct sun6i_video *video, struct sun6i_csi *csi, - const char *name); +int sun6i_video_init(struct sun6i_video *video, + struct sun6i_csi_device *csi_dev, const char *name); void sun6i_video_cleanup(struct sun6i_video *video); void sun6i_video_frame_done(struct sun6i_video *video); -- GitLab From 43e80196625cdb3133ec6fef4ac06e6a967094fd Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Fri, 26 Aug 2022 20:32:00 +0200 Subject: [PATCH 0613/2223] media: sun6i-csi: Tidy up platform code Various renames, variables lowering and other cosmetic changes in the platform-support code. No functional change intended. 
Signed-off-by: Paul Kocialkowski Reviewed-by: Maxime Ripard Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../platform/sunxi/sun6i-csi/sun6i_csi.c | 98 ++++++++++--------- .../platform/sunxi/sun6i-csi/sun6i_csi.h | 4 +- 2 files changed, 56 insertions(+), 46 deletions(-) diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c index 0e2b4d38e81cf..514f97d67c1cc 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c @@ -153,25 +153,25 @@ int sun6i_csi_set_power(struct sun6i_csi_device *csi_dev, bool enable) if (!enable) { regmap_update_bits(regmap, CSI_EN_REG, CSI_EN_CSI_EN, 0); - clk_disable_unprepare(csi_dev->clk_ram); + clk_disable_unprepare(csi_dev->clock_ram); if (of_device_is_compatible(dev->of_node, "allwinner,sun50i-a64-csi")) - clk_rate_exclusive_put(csi_dev->clk_mod); - clk_disable_unprepare(csi_dev->clk_mod); + clk_rate_exclusive_put(csi_dev->clock_mod); + clk_disable_unprepare(csi_dev->clock_mod); reset_control_assert(csi_dev->reset); return 0; } - ret = clk_prepare_enable(csi_dev->clk_mod); + ret = clk_prepare_enable(csi_dev->clock_mod); if (ret) { dev_err(csi_dev->dev, "Enable csi clk err %d\n", ret); return ret; } if (of_device_is_compatible(dev->of_node, "allwinner,sun50i-a64-csi")) - clk_set_rate_exclusive(csi_dev->clk_mod, 300000000); + clk_set_rate_exclusive(csi_dev->clock_mod, 300000000); - ret = clk_prepare_enable(csi_dev->clk_ram); + ret = clk_prepare_enable(csi_dev->clock_ram); if (ret) { dev_err(csi_dev->dev, "Enable clk_dram_csi clk err %d\n", ret); goto clk_mod_disable; @@ -188,11 +188,11 @@ int sun6i_csi_set_power(struct sun6i_csi_device *csi_dev, bool enable) return 0; clk_ram_disable: - clk_disable_unprepare(csi_dev->clk_ram); + clk_disable_unprepare(csi_dev->clock_ram); clk_mod_disable: if (of_device_is_compatible(dev->of_node, "allwinner,sun50i-a64-csi")) - 
clk_rate_exclusive_put(csi_dev->clk_mod); - clk_disable_unprepare(csi_dev->clk_mod); + clk_rate_exclusive_put(csi_dev->clock_mod); + clk_disable_unprepare(csi_dev->clock_mod); return ret; } @@ -773,12 +773,11 @@ clean_media: return ret; } -/* ----------------------------------------------------------------------------- - * Resources and IRQ - */ -static irqreturn_t sun6i_csi_isr(int irq, void *dev_id) +/* Platform */ + +static irqreturn_t sun6i_csi_interrupt(int irq, void *private) { - struct sun6i_csi_device *csi_dev = (struct sun6i_csi_device *)dev_id; + struct sun6i_csi_device *csi_dev = private; struct regmap *regmap = csi_dev->regmap; u32 status; @@ -813,73 +812,82 @@ static const struct regmap_config sun6i_csi_regmap_config = { .max_register = 0x9c, }; -static int sun6i_csi_resource_request(struct sun6i_csi_device *csi_dev, - struct platform_device *pdev) +static int sun6i_csi_resources_setup(struct sun6i_csi_device *csi_dev, + struct platform_device *platform_dev) { + struct device *dev = csi_dev->dev; void __iomem *io_base; int ret; int irq; - io_base = devm_platform_ioremap_resource(pdev, 0); + /* Registers */ + + io_base = devm_platform_ioremap_resource(platform_dev, 0); if (IS_ERR(io_base)) return PTR_ERR(io_base); - csi_dev->regmap = devm_regmap_init_mmio_clk(&pdev->dev, "bus", io_base, + csi_dev->regmap = devm_regmap_init_mmio_clk(dev, "bus", io_base, &sun6i_csi_regmap_config); if (IS_ERR(csi_dev->regmap)) { - dev_err(&pdev->dev, "Failed to init register map\n"); + dev_err(dev, "failed to init register map\n"); return PTR_ERR(csi_dev->regmap); } - csi_dev->clk_mod = devm_clk_get(&pdev->dev, "mod"); - if (IS_ERR(csi_dev->clk_mod)) { - dev_err(&pdev->dev, "Unable to acquire csi clock\n"); - return PTR_ERR(csi_dev->clk_mod); + /* Clocks */ + + csi_dev->clock_mod = devm_clk_get(dev, "mod"); + if (IS_ERR(csi_dev->clock_mod)) { + dev_err(dev, "failed to acquire module clock\n"); + return PTR_ERR(csi_dev->clock_mod); } - csi_dev->clk_ram = 
devm_clk_get(&pdev->dev, "ram"); - if (IS_ERR(csi_dev->clk_ram)) { - dev_err(&pdev->dev, "Unable to acquire dram-csi clock\n"); - return PTR_ERR(csi_dev->clk_ram); + csi_dev->clock_ram = devm_clk_get(dev, "ram"); + if (IS_ERR(csi_dev->clock_ram)) { + dev_err(dev, "failed to acquire ram clock\n"); + return PTR_ERR(csi_dev->clock_ram); } - csi_dev->reset = devm_reset_control_get_shared(&pdev->dev, NULL); + /* Reset */ + + csi_dev->reset = devm_reset_control_get_shared(dev, NULL); if (IS_ERR(csi_dev->reset)) { - dev_err(&pdev->dev, "Cannot get reset controller\n"); + dev_err(dev, "failed to acquire reset\n"); return PTR_ERR(csi_dev->reset); } - irq = platform_get_irq(pdev, 0); + /* Interrupt */ + + irq = platform_get_irq(platform_dev, 0); if (irq < 0) return -ENXIO; - ret = devm_request_irq(&pdev->dev, irq, sun6i_csi_isr, 0, - SUN6I_CSI_NAME, csi_dev); + ret = devm_request_irq(dev, irq, sun6i_csi_interrupt, 0, SUN6I_CSI_NAME, + csi_dev); if (ret) { - dev_err(&pdev->dev, "Cannot request csi IRQ\n"); + dev_err(dev, "failed to request interrupt\n"); return ret; } return 0; } -static int sun6i_csi_probe(struct platform_device *pdev) +static int sun6i_csi_probe(struct platform_device *platform_dev) { struct sun6i_csi_device *csi_dev; + struct device *dev = &platform_dev->dev; int ret; - csi_dev = devm_kzalloc(&pdev->dev, sizeof(*csi_dev), GFP_KERNEL); + csi_dev = devm_kzalloc(dev, sizeof(*csi_dev), GFP_KERNEL); if (!csi_dev) return -ENOMEM; - csi_dev->dev = &pdev->dev; + csi_dev->dev = &platform_dev->dev; + platform_set_drvdata(platform_dev, csi_dev); - ret = sun6i_csi_resource_request(csi_dev, pdev); + ret = sun6i_csi_resources_setup(csi_dev, platform_dev); if (ret) return ret; - platform_set_drvdata(pdev, csi_dev); - return sun6i_csi_v4l2_init(csi_dev); } @@ -900,16 +908,18 @@ static const struct of_device_id sun6i_csi_of_match[] = { { .compatible = "allwinner,sun50i-a64-csi", }, {}, }; + MODULE_DEVICE_TABLE(of, sun6i_csi_of_match); static struct platform_driver 
sun6i_csi_platform_driver = { - .probe = sun6i_csi_probe, - .remove = sun6i_csi_remove, - .driver = { - .name = SUN6I_CSI_NAME, - .of_match_table = of_match_ptr(sun6i_csi_of_match), + .probe = sun6i_csi_probe, + .remove = sun6i_csi_remove, + .driver = { + .name = SUN6I_CSI_NAME, + .of_match_table = of_match_ptr(sun6i_csi_of_match), }, }; + module_platform_driver(sun6i_csi_platform_driver); MODULE_DESCRIPTION("Allwinner A31 Camera Sensor Interface driver"); diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h index e151f983dbc64..937ca0fe4ee6e 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h @@ -51,8 +51,8 @@ struct sun6i_csi_device { struct sun6i_video video; struct regmap *regmap; - struct clk *clk_mod; - struct clk *clk_ram; + struct clk *clock_mod; + struct clk *clock_ram; struct reset_control *reset; int planar_offset[3]; -- GitLab From 740b5b3d156b6a9cb3b3d3ee2427acca0a890607 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Fri, 26 Aug 2022 20:32:01 +0200 Subject: [PATCH 0614/2223] media: sun6i-csi: Always set exclusive module clock rate In some situations the default rate of the module clock is not the required one for operation (for example when reconfiguring the clock tree to use a different parent). As a result, always set the correct rate for the clock (and take care of cleanup). 
Signed-off-by: Paul Kocialkowski Reviewed-by: Jernej Skrabec Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../platform/sunxi/sun6i-csi/sun6i_csi.c | 54 ++++++++++++++----- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c index 514f97d67c1cc..89a15cd779ac4 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c @@ -154,9 +154,6 @@ int sun6i_csi_set_power(struct sun6i_csi_device *csi_dev, bool enable) regmap_update_bits(regmap, CSI_EN_REG, CSI_EN_CSI_EN, 0); clk_disable_unprepare(csi_dev->clock_ram); - if (of_device_is_compatible(dev->of_node, - "allwinner,sun50i-a64-csi")) - clk_rate_exclusive_put(csi_dev->clock_mod); clk_disable_unprepare(csi_dev->clock_mod); reset_control_assert(csi_dev->reset); return 0; @@ -168,9 +165,6 @@ int sun6i_csi_set_power(struct sun6i_csi_device *csi_dev, bool enable) return ret; } - if (of_device_is_compatible(dev->of_node, "allwinner,sun50i-a64-csi")) - clk_set_rate_exclusive(csi_dev->clock_mod, 300000000); - ret = clk_prepare_enable(csi_dev->clock_ram); if (ret) { dev_err(csi_dev->dev, "Enable clk_dram_csi clk err %d\n", ret); @@ -190,8 +184,6 @@ int sun6i_csi_set_power(struct sun6i_csi_device *csi_dev, bool enable) clk_ram_disable: clk_disable_unprepare(csi_dev->clock_ram); clk_mod_disable: - if (of_device_is_compatible(dev->of_node, "allwinner,sun50i-a64-csi")) - clk_rate_exclusive_put(csi_dev->clock_mod); clk_disable_unprepare(csi_dev->clock_mod); return ret; } @@ -816,6 +808,7 @@ static int sun6i_csi_resources_setup(struct sun6i_csi_device *csi_dev, struct platform_device *platform_dev) { struct device *dev = csi_dev->dev; + unsigned long clock_mod_rate; void __iomem *io_base; int ret; int irq; @@ -847,28 +840,53 @@ static int sun6i_csi_resources_setup(struct sun6i_csi_device *csi_dev, return PTR_ERR(csi_dev->clock_ram); } + if 
(of_device_is_compatible(dev->of_node, "allwinner,sun50i-a64-csi")) + clock_mod_rate = 300000000; + else + clock_mod_rate = 297000000; + + ret = clk_set_rate_exclusive(csi_dev->clock_mod, clock_mod_rate); + if (ret) { + dev_err(dev, "failed to set mod clock rate\n"); + return ret; + } + /* Reset */ csi_dev->reset = devm_reset_control_get_shared(dev, NULL); if (IS_ERR(csi_dev->reset)) { dev_err(dev, "failed to acquire reset\n"); - return PTR_ERR(csi_dev->reset); + ret = PTR_ERR(csi_dev->reset); + goto error_clock_rate_exclusive; } /* Interrupt */ irq = platform_get_irq(platform_dev, 0); - if (irq < 0) - return -ENXIO; + if (irq < 0) { + dev_err(dev, "failed to get interrupt\n"); + ret = -ENXIO; + goto error_clock_rate_exclusive; + } ret = devm_request_irq(dev, irq, sun6i_csi_interrupt, 0, SUN6I_CSI_NAME, csi_dev); if (ret) { dev_err(dev, "failed to request interrupt\n"); - return ret; + goto error_clock_rate_exclusive; } return 0; + +error_clock_rate_exclusive: + clk_rate_exclusive_put(csi_dev->clock_mod); + + return ret; +} + +static void sun6i_csi_resources_cleanup(struct sun6i_csi_device *csi_dev) +{ + clk_rate_exclusive_put(csi_dev->clock_mod); } static int sun6i_csi_probe(struct platform_device *platform_dev) @@ -888,7 +906,16 @@ static int sun6i_csi_probe(struct platform_device *platform_dev) if (ret) return ret; - return sun6i_csi_v4l2_init(csi_dev); + ret = sun6i_csi_v4l2_init(csi_dev); + if (ret) + goto error_resources; + + return 0; + +error_resources: + sun6i_csi_resources_cleanup(csi_dev); + + return ret; } static int sun6i_csi_remove(struct platform_device *pdev) @@ -896,6 +923,7 @@ static int sun6i_csi_remove(struct platform_device *pdev) struct sun6i_csi_device *csi_dev = platform_get_drvdata(pdev); sun6i_csi_v4l2_cleanup(csi_dev); + sun6i_csi_resources_cleanup(csi_dev); return 0; } -- GitLab From bc67ec9e1348d94ead4e4704ca79e7a4bc97fac3 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Fri, 26 Aug 2022 20:32:02 +0200 Subject: [PATCH 0615/2223] 
media: sun6i-csi: Define and use variant to get module clock rate Introduce a proper variant structure with the module clock rate instead of hardcoding it with a manual check on the compatible. Signed-off-by: Paul Kocialkowski Reviewed-by: Jernej Skrabec Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../platform/sunxi/sun6i-csi/sun6i_csi.c | 47 ++++++++++++++----- .../platform/sunxi/sun6i-csi/sun6i_csi.h | 4 ++ 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c index 89a15cd779ac4..800851f4e18c3 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c @@ -808,11 +808,15 @@ static int sun6i_csi_resources_setup(struct sun6i_csi_device *csi_dev, struct platform_device *platform_dev) { struct device *dev = csi_dev->dev; - unsigned long clock_mod_rate; + const struct sun6i_csi_variant *variant; void __iomem *io_base; int ret; int irq; + variant = of_device_get_match_data(dev); + if (!variant) + return -EINVAL; + /* Registers */ io_base = devm_platform_ioremap_resource(platform_dev, 0); @@ -840,12 +844,8 @@ static int sun6i_csi_resources_setup(struct sun6i_csi_device *csi_dev, return PTR_ERR(csi_dev->clock_ram); } - if (of_device_is_compatible(dev->of_node, "allwinner,sun50i-a64-csi")) - clock_mod_rate = 300000000; - else - clock_mod_rate = 297000000; - - ret = clk_set_rate_exclusive(csi_dev->clock_mod, clock_mod_rate); + ret = clk_set_rate_exclusive(csi_dev->clock_mod, + variant->clock_mod_rate); if (ret) { dev_err(dev, "failed to set mod clock rate\n"); return ret; @@ -928,12 +928,35 @@ static int sun6i_csi_remove(struct platform_device *pdev) return 0; } +static const struct sun6i_csi_variant sun6i_a31_csi_variant = { + .clock_mod_rate = 297000000, +}; + +static const struct sun6i_csi_variant sun50i_a64_csi_variant = { + .clock_mod_rate = 300000000, +}; + static const 
struct of_device_id sun6i_csi_of_match[] = { - { .compatible = "allwinner,sun6i-a31-csi", }, - { .compatible = "allwinner,sun8i-a83t-csi", }, - { .compatible = "allwinner,sun8i-h3-csi", }, - { .compatible = "allwinner,sun8i-v3s-csi", }, - { .compatible = "allwinner,sun50i-a64-csi", }, + { + .compatible = "allwinner,sun6i-a31-csi", + .data = &sun6i_a31_csi_variant, + }, + { + .compatible = "allwinner,sun8i-a83t-csi", + .data = &sun6i_a31_csi_variant, + }, + { + .compatible = "allwinner,sun8i-h3-csi", + .data = &sun6i_a31_csi_variant, + }, + { + .compatible = "allwinner,sun8i-v3s-csi", + .data = &sun6i_a31_csi_variant, + }, + { + .compatible = "allwinner,sun50i-a64-csi", + .data = &sun50i_a64_csi_variant, + }, {}, }; diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h index 937ca0fe4ee6e..e301d80362cfc 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h @@ -58,6 +58,10 @@ struct sun6i_csi_device { int planar_offset[3]; }; +struct sun6i_csi_variant { + unsigned long clock_mod_rate; +}; + /** * sun6i_csi_is_format_supported() - check if the format supported by csi * @csi_dev: pointer to the csi device -- GitLab From 69b80659a728b99b868518b975d00f25ea70ec21 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Fri, 26 Aug 2022 20:32:03 +0200 Subject: [PATCH 0616/2223] media: sun6i-csi: Use runtime pm for clocks and reset Wrap the clock and reset preparation into runtime pm functions for better organization of the code. Also fix the clock and reset enable order to first deassert reset, as recommended in Allwinner literature. Make the driver depend on PM while at it since runtime pm is mandatory for the driver to work. 
Signed-off-by: Paul Kocialkowski Reviewed-by: Jernej Skrabec Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../media/platform/sunxi/sun6i-csi/Kconfig | 2 +- .../platform/sunxi/sun6i-csi/sun6i_csi.c | 84 +++++++++++++------ 2 files changed, 60 insertions(+), 26 deletions(-) diff --git a/drivers/media/platform/sunxi/sun6i-csi/Kconfig b/drivers/media/platform/sunxi/sun6i-csi/Kconfig index e5b6991ce7f04..a472f46648af2 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/Kconfig +++ b/drivers/media/platform/sunxi/sun6i-csi/Kconfig @@ -2,7 +2,7 @@ config VIDEO_SUN6I_CSI tristate "Allwinner V3s Camera Sensor Interface driver" depends on V4L_PLATFORM_DRIVERS - depends on VIDEO_DEV && COMMON_CLK && RESET_CONTROLLER && HAS_DMA + depends on VIDEO_DEV && COMMON_CLK && RESET_CONTROLLER && HAS_DMA && PM depends on ARCH_SUNXI || COMPILE_TEST select MEDIA_CONTROLLER select VIDEO_V4L2_SUBDEV_API diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c index 800851f4e18c3..31374d45eb9fd 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c @@ -152,40 +152,18 @@ int sun6i_csi_set_power(struct sun6i_csi_device *csi_dev, bool enable) if (!enable) { regmap_update_bits(regmap, CSI_EN_REG, CSI_EN_CSI_EN, 0); + pm_runtime_put(dev); - clk_disable_unprepare(csi_dev->clock_ram); - clk_disable_unprepare(csi_dev->clock_mod); - reset_control_assert(csi_dev->reset); return 0; } - ret = clk_prepare_enable(csi_dev->clock_mod); - if (ret) { - dev_err(csi_dev->dev, "Enable csi clk err %d\n", ret); + ret = pm_runtime_resume_and_get(dev); + if (ret < 0) return ret; - } - - ret = clk_prepare_enable(csi_dev->clock_ram); - if (ret) { - dev_err(csi_dev->dev, "Enable clk_dram_csi clk err %d\n", ret); - goto clk_mod_disable; - } - - ret = reset_control_deassert(csi_dev->reset); - if (ret) { - dev_err(csi_dev->dev, "reset err %d\n", ret); - goto clk_ram_disable; 
- } regmap_update_bits(regmap, CSI_EN_REG, CSI_EN_CSI_EN, CSI_EN_CSI_EN); return 0; - -clk_ram_disable: - clk_disable_unprepare(csi_dev->clock_ram); -clk_mod_disable: - clk_disable_unprepare(csi_dev->clock_mod); - return ret; } static enum csi_input_fmt get_csi_input_format(struct sun6i_csi_device *csi_dev, @@ -797,6 +775,56 @@ static irqreturn_t sun6i_csi_interrupt(int irq, void *private) return IRQ_HANDLED; } +static int sun6i_csi_suspend(struct device *dev) +{ + struct sun6i_csi_device *csi_dev = dev_get_drvdata(dev); + + reset_control_assert(csi_dev->reset); + clk_disable_unprepare(csi_dev->clock_ram); + clk_disable_unprepare(csi_dev->clock_mod); + + return 0; +} + +static int sun6i_csi_resume(struct device *dev) +{ + struct sun6i_csi_device *csi_dev = dev_get_drvdata(dev); + int ret; + + ret = reset_control_deassert(csi_dev->reset); + if (ret) { + dev_err(dev, "failed to deassert reset\n"); + return ret; + } + + ret = clk_prepare_enable(csi_dev->clock_mod); + if (ret) { + dev_err(dev, "failed to enable module clock\n"); + goto error_reset; + } + + ret = clk_prepare_enable(csi_dev->clock_ram); + if (ret) { + dev_err(dev, "failed to enable ram clock\n"); + goto error_clock_mod; + } + + return 0; + +error_clock_mod: + clk_disable_unprepare(csi_dev->clock_mod); + +error_reset: + reset_control_assert(csi_dev->reset); + + return ret; +} + +static const struct dev_pm_ops sun6i_csi_pm_ops = { + .runtime_suspend = sun6i_csi_suspend, + .runtime_resume = sun6i_csi_resume, +}; + static const struct regmap_config sun6i_csi_regmap_config = { .reg_bits = 32, .reg_stride = 4, @@ -876,6 +904,10 @@ static int sun6i_csi_resources_setup(struct sun6i_csi_device *csi_dev, goto error_clock_rate_exclusive; } + /* Runtime PM */ + + pm_runtime_enable(dev); + return 0; error_clock_rate_exclusive: @@ -886,6 +918,7 @@ error_clock_rate_exclusive: static void sun6i_csi_resources_cleanup(struct sun6i_csi_device *csi_dev) { + pm_runtime_disable(csi_dev->dev); 
clk_rate_exclusive_put(csi_dev->clock_mod); } @@ -968,6 +1001,7 @@ static struct platform_driver sun6i_csi_platform_driver = { .driver = { .name = SUN6I_CSI_NAME, .of_match_table = of_match_ptr(sun6i_csi_of_match), + .pm = &sun6i_csi_pm_ops, }, }; -- GitLab From bffb52248930d76e98fcc5deecda5bc9b3043ef5 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Fri, 26 Aug 2022 20:32:04 +0200 Subject: [PATCH 0617/2223] media: sun6i-csi: Tidy up Kconfig Update the option title and help, group related options together, add dependency on VIDEO_DEV since the driver uses it and update the description. Signed-off-by: Paul Kocialkowski Acked-by: Jernej Skrabec Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/sunxi/sun6i-csi/Kconfig | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/media/platform/sunxi/sun6i-csi/Kconfig b/drivers/media/platform/sunxi/sun6i-csi/Kconfig index a472f46648af2..886006f6a48a1 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/Kconfig +++ b/drivers/media/platform/sunxi/sun6i-csi/Kconfig @@ -1,13 +1,15 @@ # SPDX-License-Identifier: GPL-2.0-only config VIDEO_SUN6I_CSI - tristate "Allwinner V3s Camera Sensor Interface driver" - depends on V4L_PLATFORM_DRIVERS - depends on VIDEO_DEV && COMMON_CLK && RESET_CONTROLLER && HAS_DMA && PM + tristate "Allwinner A31 Camera Sensor Interface (CSI) Driver" + depends on V4L_PLATFORM_DRIVERS && VIDEO_DEV depends on ARCH_SUNXI || COMPILE_TEST + depends on PM && COMMON_CLK && RESET_CONTROLLER && HAS_DMA select MEDIA_CONTROLLER select VIDEO_V4L2_SUBDEV_API select VIDEOBUF2_DMA_CONTIG - select REGMAP_MMIO select V4L2_FWNODE + select REGMAP_MMIO help - Support for the Allwinner Camera Sensor Interface Controller on V3s. + Support for the Allwinner A31 Camera Sensor Interface (CSI) + controller, also found on other platforms such as the A83T, H3, + V3/V3s or A64. 
-- GitLab From cad7f35c1ab50bad92479e64ae57628b61da8b1b Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Fri, 26 Aug 2022 20:32:05 +0200 Subject: [PATCH 0618/2223] media: sun6i-csi: Tidy up v4l2 code Various cosmetic improvements to the v4l2 registration code, with renames, lowerings, etc. The cleanup function is moved down after setup. No functional change intended. Signed-off-by: Paul Kocialkowski Reviewed-by: Maxime Ripard Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../platform/sunxi/sun6i-csi/sun6i_csi.c | 113 ++++++++++-------- 1 file changed, 66 insertions(+), 47 deletions(-) diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c index 31374d45eb9fd..98c9c887c5433 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c @@ -27,6 +27,8 @@ #include "sun6i_csi.h" #include "sun6i_csi_reg.h" +/* Helpers */ + /* TODO add 10&12 bit YUV, RGB support */ bool sun6i_csi_is_format_supported(struct sun6i_csi_device *csi_dev, u32 pixformat, u32 mbus_code) @@ -572,9 +574,8 @@ void sun6i_csi_set_stream(struct sun6i_csi_device *csi_dev, bool enable) CSI_CAP_CH0_VCAP_ON); } -/* ----------------------------------------------------------------------------- - * Media Controller and V4L2 - */ +/* V4L2 */ + static int sun6i_csi_link_entity(struct sun6i_csi_device *csi_dev, struct media_entity *entity, struct fwnode_handle *fwnode) @@ -666,83 +667,101 @@ static int sun6i_csi_fwnode_parse(struct device *dev, } } -static void sun6i_csi_v4l2_cleanup(struct sun6i_csi_device *csi_dev) +static int sun6i_csi_v4l2_setup(struct sun6i_csi_device *csi_dev) { struct sun6i_csi_v4l2 *v4l2 = &csi_dev->v4l2; + struct media_device *media_dev = &v4l2->media_dev; + struct v4l2_device *v4l2_dev = &v4l2->v4l2_dev; + struct v4l2_async_notifier *notifier = &v4l2->notifier; + struct device *dev = csi_dev->dev; + int ret; - 
media_device_unregister(&v4l2->media_dev); - v4l2_async_nf_unregister(&v4l2->notifier); - v4l2_async_nf_cleanup(&v4l2->notifier); - sun6i_video_cleanup(&csi_dev->video); - v4l2_device_unregister(&v4l2->v4l2_dev); - v4l2_ctrl_handler_free(&v4l2->ctrl_handler); - media_device_cleanup(&v4l2->media_dev); -} + /* Media Device */ -static int sun6i_csi_v4l2_init(struct sun6i_csi_device *csi_dev) -{ - struct sun6i_csi_v4l2 *v4l2 = &csi_dev->v4l2; - int ret; + strscpy(media_dev->model, SUN6I_CSI_DESCRIPTION, + sizeof(media_dev->model)); + media_dev->hw_revision = 0; + media_dev->dev = dev; - v4l2->media_dev.dev = csi_dev->dev; - strscpy(v4l2->media_dev.model, SUN6I_CSI_DESCRIPTION, - sizeof(v4l2->media_dev.model)); - v4l2->media_dev.hw_revision = 0; + media_device_init(media_dev); - media_device_init(&v4l2->media_dev); - v4l2_async_nf_init(&v4l2->notifier); + /* V4L2 Control Handler */ ret = v4l2_ctrl_handler_init(&v4l2->ctrl_handler, 0); if (ret) { - dev_err(csi_dev->dev, "V4L2 controls handler init failed (%d)\n", - ret); - goto clean_media; + dev_err(dev, "failed to init v4l2 control handler: %d\n", ret); + goto error_media; } - v4l2->v4l2_dev.mdev = &v4l2->media_dev; - v4l2->v4l2_dev.ctrl_handler = &v4l2->ctrl_handler; - ret = v4l2_device_register(csi_dev->dev, &v4l2->v4l2_dev); + /* V4L2 Device */ + + v4l2_dev->mdev = media_dev; + v4l2_dev->ctrl_handler = &v4l2->ctrl_handler; + + ret = v4l2_device_register(dev, v4l2_dev); if (ret) { - dev_err(csi_dev->dev, "V4L2 device registration failed (%d)\n", - ret); - goto free_ctrl; + dev_err(dev, "failed to register v4l2 device: %d\n", ret); + goto error_v4l2_ctrl; } + /* Video */ + ret = sun6i_video_init(&csi_dev->video, csi_dev, SUN6I_CSI_NAME); if (ret) - goto unreg_v4l2; + goto error_v4l2_device; - ret = v4l2_async_nf_parse_fwnode_endpoints(csi_dev->dev, - &v4l2->notifier, + /* V4L2 Async */ + + v4l2_async_nf_init(notifier); + notifier->ops = &sun6i_csi_async_ops; + + ret = v4l2_async_nf_parse_fwnode_endpoints(dev, 
notifier, sizeof(struct v4l2_async_subdev), sun6i_csi_fwnode_parse); if (ret) - goto clean_video; - - v4l2->notifier.ops = &sun6i_csi_async_ops; + goto error_video; - ret = v4l2_async_nf_register(&v4l2->v4l2_dev, &v4l2->notifier); + ret = v4l2_async_nf_register(v4l2_dev, notifier); if (ret) { - dev_err(csi_dev->dev, "notifier registration failed\n"); - goto clean_video; + dev_err(dev, "failed to register v4l2 async notifier: %d\n", + ret); + goto error_v4l2_async_notifier; } return 0; -clean_video: +error_v4l2_async_notifier: + v4l2_async_nf_cleanup(notifier); + +error_video: sun6i_video_cleanup(&csi_dev->video); -unreg_v4l2: + +error_v4l2_device: v4l2_device_unregister(&v4l2->v4l2_dev); -free_ctrl: + +error_v4l2_ctrl: v4l2_ctrl_handler_free(&v4l2->ctrl_handler); -clean_media: - v4l2_async_nf_cleanup(&v4l2->notifier); - media_device_cleanup(&v4l2->media_dev); + +error_media: + media_device_cleanup(media_dev); return ret; } +static void sun6i_csi_v4l2_cleanup(struct sun6i_csi_device *csi_dev) +{ + struct sun6i_csi_v4l2 *v4l2 = &csi_dev->v4l2; + + media_device_unregister(&v4l2->media_dev); + v4l2_async_nf_unregister(&v4l2->notifier); + v4l2_async_nf_cleanup(&v4l2->notifier); + sun6i_video_cleanup(&csi_dev->video); + v4l2_device_unregister(&v4l2->v4l2_dev); + v4l2_ctrl_handler_free(&v4l2->ctrl_handler); + media_device_cleanup(&v4l2->media_dev); +} + /* Platform */ static irqreturn_t sun6i_csi_interrupt(int irq, void *private) @@ -939,7 +958,7 @@ static int sun6i_csi_probe(struct platform_device *platform_dev) if (ret) return ret; - ret = sun6i_csi_v4l2_init(csi_dev); + ret = sun6i_csi_v4l2_setup(csi_dev); if (ret) goto error_resources; -- GitLab From ab2e8d5d67fb86ff74ccb3cad9b57988fb1adfcd Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Fri, 26 Aug 2022 20:32:06 +0200 Subject: [PATCH 0619/2223] media: sun6i-csi: Tidy up video code Some code cleanups, renames, variable lowerings and moving things around for better organization. No functional change intended. 
Signed-off-by: Paul Kocialkowski Reviewed-by: Maxime Ripard Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../platform/sunxi/sun6i-csi/sun6i_csi.c | 4 +- .../platform/sunxi/sun6i-csi/sun6i_video.c | 509 ++++++++++-------- .../platform/sunxi/sun6i-csi/sun6i_video.h | 18 +- 3 files changed, 285 insertions(+), 246 deletions(-) diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c index 98c9c887c5433..b4f90b065a0c1 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c @@ -595,7 +595,7 @@ static int sun6i_csi_link_entity(struct sun6i_csi_device *csi_dev, src_pad_index = ret; - sink = &csi_dev->video.vdev.entity; + sink = &csi_dev->video.video_dev.entity; sink_pad = &csi_dev->video.pad; dev_dbg(csi_dev->dev, "creating %s:%u -> %s:%u link\n", @@ -706,7 +706,7 @@ static int sun6i_csi_v4l2_setup(struct sun6i_csi_device *csi_dev) /* Video */ - ret = sun6i_video_init(&csi_dev->video, csi_dev, SUN6I_CSI_NAME); + ret = sun6i_video_setup(&csi_dev->video, csi_dev); if (ret) goto error_v4l2_device; diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c index 1bfe7b3abc91f..4710902447ac4 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c @@ -24,14 +24,34 @@ #define MAX_HEIGHT (4800) struct sun6i_csi_buffer { - struct vb2_v4l2_buffer vb; + struct vb2_v4l2_buffer v4l2_buffer; struct list_head list; dma_addr_t dma_addr; bool queued_to_csi; }; -static const u32 supported_pixformats[] = { +/* Helpers */ + +static struct v4l2_subdev * +sun6i_video_remote_subdev(struct sun6i_video *video, u32 *pad) +{ + struct media_pad *remote; + + remote = media_pad_remote_pad_first(&video->pad); + + if (!remote || !is_media_entity_v4l2_subdev(remote->entity)) + return NULL; + + if (pad) + *pad = remote->index; + + 
return media_entity_to_v4l2_subdev(remote->entity); +} + +/* Format */ + +static const u32 sun6i_video_formats[] = { V4L2_PIX_FMT_SBGGR8, V4L2_PIX_FMT_SGBRG8, V4L2_PIX_FMT_SGRBG8, @@ -61,77 +81,80 @@ static const u32 supported_pixformats[] = { V4L2_PIX_FMT_JPEG, }; -static bool is_pixformat_valid(unsigned int pixformat) +static bool sun6i_video_format_check(u32 format) { unsigned int i; - for (i = 0; i < ARRAY_SIZE(supported_pixformats); i++) - if (supported_pixformats[i] == pixformat) + for (i = 0; i < ARRAY_SIZE(sun6i_video_formats); i++) + if (sun6i_video_formats[i] == format) return true; return false; } -static struct v4l2_subdev * -sun6i_video_remote_subdev(struct sun6i_video *video, u32 *pad) -{ - struct media_pad *remote; - - remote = media_pad_remote_pad_first(&video->pad); +/* Queue */ - if (!remote || !is_media_entity_v4l2_subdev(remote->entity)) - return NULL; - - if (pad) - *pad = remote->index; - - return media_entity_to_v4l2_subdev(remote->entity); -} - -static int sun6i_video_queue_setup(struct vb2_queue *vq, - unsigned int *nbuffers, - unsigned int *nplanes, +static int sun6i_video_queue_setup(struct vb2_queue *queue, + unsigned int *buffers_count, + unsigned int *planes_count, unsigned int sizes[], struct device *alloc_devs[]) { - struct sun6i_video *video = vb2_get_drv_priv(vq); - unsigned int size = video->fmt.fmt.pix.sizeimage; + struct sun6i_video *video = vb2_get_drv_priv(queue); + unsigned int size = video->format.fmt.pix.sizeimage; - if (*nplanes) + if (*planes_count) return sizes[0] < size ? 
-EINVAL : 0; - *nplanes = 1; + *planes_count = 1; sizes[0] = size; return 0; } -static int sun6i_video_buffer_prepare(struct vb2_buffer *vb) +static int sun6i_video_buffer_prepare(struct vb2_buffer *buffer) { - struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); - struct sun6i_csi_buffer *buf = - container_of(vbuf, struct sun6i_csi_buffer, vb); - struct sun6i_video *video = vb2_get_drv_priv(vb->vb2_queue); - unsigned long size = video->fmt.fmt.pix.sizeimage; - - if (vb2_plane_size(vb, 0) < size) { - v4l2_err(video->vdev.v4l2_dev, "buffer too small (%lu < %lu)\n", - vb2_plane_size(vb, 0), size); + struct sun6i_video *video = vb2_get_drv_priv(buffer->vb2_queue); + struct sun6i_csi_device *csi_dev = video->csi_dev; + struct v4l2_device *v4l2_dev = &csi_dev->v4l2.v4l2_dev; + struct vb2_v4l2_buffer *v4l2_buffer = to_vb2_v4l2_buffer(buffer); + struct sun6i_csi_buffer *csi_buffer = + container_of(v4l2_buffer, struct sun6i_csi_buffer, v4l2_buffer); + unsigned long size = video->format.fmt.pix.sizeimage; + + if (vb2_plane_size(buffer, 0) < size) { + v4l2_err(v4l2_dev, "buffer too small (%lu < %lu)\n", + vb2_plane_size(buffer, 0), size); return -EINVAL; } - vb2_set_plane_payload(vb, 0, size); - - buf->dma_addr = vb2_dma_contig_plane_dma_addr(vb, 0); + vb2_set_plane_payload(buffer, 0, size); - vbuf->field = video->fmt.fmt.pix.field; + csi_buffer->dma_addr = vb2_dma_contig_plane_dma_addr(buffer, 0); + v4l2_buffer->field = video->format.fmt.pix.field; return 0; } -static int sun6i_video_start_streaming(struct vb2_queue *vq, unsigned int count) +static void sun6i_video_buffer_queue(struct vb2_buffer *buffer) +{ + struct sun6i_video *video = vb2_get_drv_priv(buffer->vb2_queue); + struct vb2_v4l2_buffer *v4l2_buffer = to_vb2_v4l2_buffer(buffer); + struct sun6i_csi_buffer *csi_buffer = + container_of(v4l2_buffer, struct sun6i_csi_buffer, v4l2_buffer); + unsigned long flags; + + spin_lock_irqsave(&video->dma_queue_lock, flags); + csi_buffer->queued_to_csi = false; + 
list_add_tail(&csi_buffer->list, &video->dma_queue); + spin_unlock_irqrestore(&video->dma_queue_lock, flags); +} + +static int sun6i_video_start_streaming(struct vb2_queue *queue, + unsigned int count) { - struct sun6i_video *video = vb2_get_drv_priv(vq); + struct sun6i_video *video = vb2_get_drv_priv(queue); + struct video_device *video_dev = &video->video_dev; struct sun6i_csi_buffer *buf; struct sun6i_csi_buffer *next_buf; struct sun6i_csi_config config; @@ -141,30 +164,30 @@ static int sun6i_video_start_streaming(struct vb2_queue *vq, unsigned int count) video->sequence = 0; - ret = video_device_pipeline_alloc_start(&video->vdev); + ret = video_device_pipeline_alloc_start(video_dev); if (ret < 0) - goto clear_dma_queue; + goto error_dma_queue_flush; if (video->mbus_code == 0) { ret = -EINVAL; - goto stop_media_pipeline; + goto error_media_pipeline; } subdev = sun6i_video_remote_subdev(video, NULL); if (!subdev) { ret = -EINVAL; - goto stop_media_pipeline; + goto error_media_pipeline; } - config.pixelformat = video->fmt.fmt.pix.pixelformat; + config.pixelformat = video->format.fmt.pix.pixelformat; config.code = video->mbus_code; - config.field = video->fmt.fmt.pix.field; - config.width = video->fmt.fmt.pix.width; - config.height = video->fmt.fmt.pix.height; + config.field = video->format.fmt.pix.field; + config.width = video->format.fmt.pix.width; + config.height = video->format.fmt.pix.height; ret = sun6i_csi_update_config(video->csi_dev, &config); if (ret < 0) - goto stop_media_pipeline; + goto error_media_pipeline; spin_lock_irqsave(&video->dma_queue_lock, flags); @@ -200,27 +223,30 @@ static int sun6i_video_start_streaming(struct vb2_queue *vq, unsigned int count) ret = v4l2_subdev_call(subdev, video, s_stream, 1); if (ret && ret != -ENOIOCTLCMD) - goto stop_csi_stream; + goto error_stream; return 0; -stop_csi_stream: +error_stream: sun6i_csi_set_stream(video->csi_dev, false); -stop_media_pipeline: - video_device_pipeline_stop(&video->vdev); 
-clear_dma_queue: + +error_media_pipeline: + video_device_pipeline_stop(video_dev); + +error_dma_queue_flush: spin_lock_irqsave(&video->dma_queue_lock, flags); list_for_each_entry(buf, &video->dma_queue, list) - vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_QUEUED); + vb2_buffer_done(&buf->v4l2_buffer.vb2_buf, + VB2_BUF_STATE_QUEUED); INIT_LIST_HEAD(&video->dma_queue); spin_unlock_irqrestore(&video->dma_queue_lock, flags); return ret; } -static void sun6i_video_stop_streaming(struct vb2_queue *vq) +static void sun6i_video_stop_streaming(struct vb2_queue *queue) { - struct sun6i_video *video = vb2_get_drv_priv(vq); + struct sun6i_video *video = vb2_get_drv_priv(queue); struct v4l2_subdev *subdev; unsigned long flags; struct sun6i_csi_buffer *buf; @@ -231,35 +257,21 @@ static void sun6i_video_stop_streaming(struct vb2_queue *vq) sun6i_csi_set_stream(video->csi_dev, false); - video_device_pipeline_stop(&video->vdev); + video_device_pipeline_stop(&video->video_dev); /* Release all active buffers */ spin_lock_irqsave(&video->dma_queue_lock, flags); list_for_each_entry(buf, &video->dma_queue, list) - vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); + vb2_buffer_done(&buf->v4l2_buffer.vb2_buf, VB2_BUF_STATE_ERROR); INIT_LIST_HEAD(&video->dma_queue); spin_unlock_irqrestore(&video->dma_queue_lock, flags); } -static void sun6i_video_buffer_queue(struct vb2_buffer *vb) -{ - struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); - struct sun6i_csi_buffer *buf = - container_of(vbuf, struct sun6i_csi_buffer, vb); - struct sun6i_video *video = vb2_get_drv_priv(vb->vb2_queue); - unsigned long flags; - - spin_lock_irqsave(&video->dma_queue_lock, flags); - buf->queued_to_csi = false; - list_add_tail(&buf->list, &video->dma_queue); - spin_unlock_irqrestore(&video->dma_queue_lock, flags); -} - void sun6i_video_frame_done(struct sun6i_video *video) { struct sun6i_csi_buffer *buf; struct sun6i_csi_buffer *next_buf; - struct vb2_v4l2_buffer *vbuf; + struct vb2_v4l2_buffer 
*v4l2_buffer; spin_lock(&video->dma_queue_lock); @@ -267,7 +279,7 @@ void sun6i_video_frame_done(struct sun6i_video *video) struct sun6i_csi_buffer, list); if (list_is_last(&buf->list, &video->dma_queue)) { dev_dbg(video->csi_dev->dev, "Frame dropped!\n"); - goto unlock; + goto complete; } next_buf = list_next_entry(buf, list); @@ -280,14 +292,14 @@ void sun6i_video_frame_done(struct sun6i_video *video) next_buf->queued_to_csi = true; sun6i_csi_update_buf_addr(video->csi_dev, next_buf->dma_addr); dev_dbg(video->csi_dev->dev, "Frame dropped!\n"); - goto unlock; + goto complete; } list_del(&buf->list); - vbuf = &buf->vb; - vbuf->vb2_buf.timestamp = ktime_get_ns(); - vbuf->sequence = video->sequence; - vb2_buffer_done(&vbuf->vb2_buf, VB2_BUF_STATE_DONE); + v4l2_buffer = &buf->v4l2_buffer; + v4l2_buffer->vb2_buf.timestamp = ktime_get_ns(); + v4l2_buffer->sequence = video->sequence; + vb2_buffer_done(&v4l2_buffer->vb2_buf, VB2_BUF_STATE_DONE); /* Prepare buffer for next frame but one. */ if (!list_is_last(&next_buf->list, &video->dma_queue)) { @@ -298,165 +310,173 @@ void sun6i_video_frame_done(struct sun6i_video *video) dev_dbg(video->csi_dev->dev, "Next frame will be dropped!\n"); } -unlock: +complete: video->sequence++; spin_unlock(&video->dma_queue_lock); } -static const struct vb2_ops sun6i_csi_vb2_ops = { +static const struct vb2_ops sun6i_video_queue_ops = { .queue_setup = sun6i_video_queue_setup, - .wait_prepare = vb2_ops_wait_prepare, - .wait_finish = vb2_ops_wait_finish, .buf_prepare = sun6i_video_buffer_prepare, + .buf_queue = sun6i_video_buffer_queue, .start_streaming = sun6i_video_start_streaming, .stop_streaming = sun6i_video_stop_streaming, - .buf_queue = sun6i_video_buffer_queue, + .wait_prepare = vb2_ops_wait_prepare, + .wait_finish = vb2_ops_wait_finish, }; -static int vidioc_querycap(struct file *file, void *priv, - struct v4l2_capability *cap) +/* V4L2 Device */ + +static int sun6i_video_querycap(struct file *file, void *private, + struct 
v4l2_capability *capability) { struct sun6i_video *video = video_drvdata(file); + struct sun6i_csi_device *csi_dev = video->csi_dev; + struct video_device *video_dev = &video->video_dev; - strscpy(cap->driver, "sun6i-video", sizeof(cap->driver)); - strscpy(cap->card, video->vdev.name, sizeof(cap->card)); - snprintf(cap->bus_info, sizeof(cap->bus_info), "platform:%s", - video->csi_dev->dev->of_node->name); + strscpy(capability->driver, SUN6I_CSI_NAME, sizeof(capability->driver)); + strscpy(capability->card, video_dev->name, sizeof(capability->card)); + snprintf(capability->bus_info, sizeof(capability->bus_info), + "platform:%s", dev_name(csi_dev->dev)); return 0; } -static int vidioc_enum_fmt_vid_cap(struct file *file, void *priv, - struct v4l2_fmtdesc *f) +static int sun6i_video_enum_fmt(struct file *file, void *private, + struct v4l2_fmtdesc *fmtdesc) { - u32 index = f->index; + u32 index = fmtdesc->index; - if (index >= ARRAY_SIZE(supported_pixformats)) + if (index >= ARRAY_SIZE(sun6i_video_formats)) return -EINVAL; - f->pixelformat = supported_pixformats[index]; + fmtdesc->pixelformat = sun6i_video_formats[index]; return 0; } -static int vidioc_g_fmt_vid_cap(struct file *file, void *priv, - struct v4l2_format *fmt) +static int sun6i_video_g_fmt(struct file *file, void *private, + struct v4l2_format *format) { struct sun6i_video *video = video_drvdata(file); - *fmt = video->fmt; + *format = video->format; return 0; } -static int sun6i_video_try_fmt(struct sun6i_video *video, - struct v4l2_format *f) +static int sun6i_video_format_try(struct sun6i_video *video, + struct v4l2_format *format) { - struct v4l2_pix_format *pixfmt = &f->fmt.pix; + struct v4l2_pix_format *pix_format = &format->fmt.pix; int bpp; - if (!is_pixformat_valid(pixfmt->pixelformat)) - pixfmt->pixelformat = supported_pixformats[0]; + if (!sun6i_video_format_check(pix_format->pixelformat)) + pix_format->pixelformat = sun6i_video_formats[0]; - v4l_bound_align_image(&pixfmt->width, MIN_WIDTH, 
MAX_WIDTH, 1, - &pixfmt->height, MIN_HEIGHT, MAX_WIDTH, 1, 1); + v4l_bound_align_image(&pix_format->width, MIN_WIDTH, MAX_WIDTH, 1, + &pix_format->height, MIN_HEIGHT, MAX_WIDTH, 1, 1); - bpp = sun6i_csi_get_bpp(pixfmt->pixelformat); - pixfmt->bytesperline = (pixfmt->width * bpp) >> 3; - pixfmt->sizeimage = pixfmt->bytesperline * pixfmt->height; + bpp = sun6i_csi_get_bpp(pix_format->pixelformat); + pix_format->bytesperline = (pix_format->width * bpp) >> 3; + pix_format->sizeimage = pix_format->bytesperline * pix_format->height; - if (pixfmt->field == V4L2_FIELD_ANY) - pixfmt->field = V4L2_FIELD_NONE; + if (pix_format->field == V4L2_FIELD_ANY) + pix_format->field = V4L2_FIELD_NONE; - if (pixfmt->pixelformat == V4L2_PIX_FMT_JPEG) - pixfmt->colorspace = V4L2_COLORSPACE_JPEG; + if (pix_format->pixelformat == V4L2_PIX_FMT_JPEG) + pix_format->colorspace = V4L2_COLORSPACE_JPEG; else - pixfmt->colorspace = V4L2_COLORSPACE_SRGB; + pix_format->colorspace = V4L2_COLORSPACE_SRGB; - pixfmt->ycbcr_enc = V4L2_YCBCR_ENC_DEFAULT; - pixfmt->quantization = V4L2_QUANTIZATION_DEFAULT; - pixfmt->xfer_func = V4L2_XFER_FUNC_DEFAULT; + pix_format->ycbcr_enc = V4L2_YCBCR_ENC_DEFAULT; + pix_format->quantization = V4L2_QUANTIZATION_DEFAULT; + pix_format->xfer_func = V4L2_XFER_FUNC_DEFAULT; return 0; } -static int sun6i_video_set_fmt(struct sun6i_video *video, struct v4l2_format *f) +static int sun6i_video_format_set(struct sun6i_video *video, + struct v4l2_format *format) { int ret; - ret = sun6i_video_try_fmt(video, f); + ret = sun6i_video_format_try(video, format); if (ret) return ret; - video->fmt = *f; + video->format = *format; return 0; } -static int vidioc_s_fmt_vid_cap(struct file *file, void *priv, - struct v4l2_format *f) +static int sun6i_video_s_fmt(struct file *file, void *private, + struct v4l2_format *format) { struct sun6i_video *video = video_drvdata(file); - if (vb2_is_busy(&video->vb2_vidq)) + if (vb2_is_busy(&video->queue)) return -EBUSY; - return sun6i_video_set_fmt(video, 
f); + return sun6i_video_format_set(video, format); } -static int vidioc_try_fmt_vid_cap(struct file *file, void *priv, - struct v4l2_format *f) +static int sun6i_video_try_fmt(struct file *file, void *private, + struct v4l2_format *format) { struct sun6i_video *video = video_drvdata(file); - return sun6i_video_try_fmt(video, f); + return sun6i_video_format_try(video, format); } -static int vidioc_enum_input(struct file *file, void *fh, - struct v4l2_input *inp) +static int sun6i_video_enum_input(struct file *file, void *private, + struct v4l2_input *input) { - if (inp->index != 0) + if (input->index != 0) return -EINVAL; - strscpy(inp->name, "camera", sizeof(inp->name)); - inp->type = V4L2_INPUT_TYPE_CAMERA; + input->type = V4L2_INPUT_TYPE_CAMERA; + strscpy(input->name, "Camera", sizeof(input->name)); return 0; } -static int vidioc_g_input(struct file *file, void *fh, unsigned int *i) +static int sun6i_video_g_input(struct file *file, void *private, + unsigned int *index) { - *i = 0; + *index = 0; return 0; } -static int vidioc_s_input(struct file *file, void *fh, unsigned int i) +static int sun6i_video_s_input(struct file *file, void *private, + unsigned int index) { - if (i != 0) + if (index != 0) return -EINVAL; return 0; } static const struct v4l2_ioctl_ops sun6i_video_ioctl_ops = { - .vidioc_querycap = vidioc_querycap, - .vidioc_enum_fmt_vid_cap = vidioc_enum_fmt_vid_cap, - .vidioc_g_fmt_vid_cap = vidioc_g_fmt_vid_cap, - .vidioc_s_fmt_vid_cap = vidioc_s_fmt_vid_cap, - .vidioc_try_fmt_vid_cap = vidioc_try_fmt_vid_cap, + .vidioc_querycap = sun6i_video_querycap, + + .vidioc_enum_fmt_vid_cap = sun6i_video_enum_fmt, + .vidioc_g_fmt_vid_cap = sun6i_video_g_fmt, + .vidioc_s_fmt_vid_cap = sun6i_video_s_fmt, + .vidioc_try_fmt_vid_cap = sun6i_video_try_fmt, - .vidioc_enum_input = vidioc_enum_input, - .vidioc_s_input = vidioc_s_input, - .vidioc_g_input = vidioc_g_input, + .vidioc_enum_input = sun6i_video_enum_input, + .vidioc_g_input = sun6i_video_g_input, + 
.vidioc_s_input = sun6i_video_s_input, + .vidioc_create_bufs = vb2_ioctl_create_bufs, + .vidioc_prepare_buf = vb2_ioctl_prepare_buf, .vidioc_reqbufs = vb2_ioctl_reqbufs, .vidioc_querybuf = vb2_ioctl_querybuf, - .vidioc_qbuf = vb2_ioctl_qbuf, .vidioc_expbuf = vb2_ioctl_expbuf, + .vidioc_qbuf = vb2_ioctl_qbuf, .vidioc_dqbuf = vb2_ioctl_dqbuf, - .vidioc_create_bufs = vb2_ioctl_create_bufs, - .vidioc_prepare_buf = vb2_ioctl_prepare_buf, .vidioc_streamon = vb2_ioctl_streamon, .vidioc_streamoff = vb2_ioctl_streamoff, @@ -465,9 +485,8 @@ static const struct v4l2_ioctl_ops sun6i_video_ioctl_ops = { .vidioc_unsubscribe_event = v4l2_event_unsubscribe, }; -/* ----------------------------------------------------------------------------- - * V4L2 file operations - */ +/* V4L2 File */ + static int sun6i_video_open(struct file *file) { struct sun6i_video *video = video_drvdata(file); @@ -478,44 +497,46 @@ static int sun6i_video_open(struct file *file) ret = v4l2_fh_open(file); if (ret < 0) - goto unlock; + goto error_lock; - ret = v4l2_pipeline_pm_get(&video->vdev.entity); + ret = v4l2_pipeline_pm_get(&video->video_dev.entity); if (ret < 0) - goto fh_release; - - /* check if already powered */ - if (!v4l2_fh_is_singular_file(file)) - goto unlock; + goto error_v4l2_fh; - ret = sun6i_csi_set_power(video->csi_dev, true); - if (ret < 0) - goto fh_release; + /* Power on at first open. 
*/ + if (v4l2_fh_is_singular_file(file)) { + ret = sun6i_csi_set_power(video->csi_dev, true); + if (ret < 0) + goto error_v4l2_fh; + } mutex_unlock(&video->lock); + return 0; -fh_release: +error_v4l2_fh: v4l2_fh_release(file); -unlock: + +error_lock: mutex_unlock(&video->lock); + return ret; } static int sun6i_video_close(struct file *file) { struct sun6i_video *video = video_drvdata(file); - bool last_fh; + bool last_close; mutex_lock(&video->lock); - last_fh = v4l2_fh_is_singular_file(file); + last_close = v4l2_fh_is_singular_file(file); _vb2_fop_release(file, NULL); + v4l2_pipeline_pm_put(&video->video_dev.entity); - v4l2_pipeline_pm_put(&video->vdev.entity); - - if (last_fh) + /* Power off at last close. */ + if (last_close) sun6i_csi_set_power(video->csi_dev, false); mutex_unlock(&video->lock); @@ -532,9 +553,8 @@ static const struct v4l2_file_operations sun6i_video_fops = { .poll = vb2_fop_poll }; -/* ----------------------------------------------------------------------------- - * Media Operations - */ +/* Media Entity */ + static int sun6i_video_link_validate_get_format(struct media_pad *pad, struct v4l2_subdev_format *fmt) { @@ -571,20 +591,20 @@ static int sun6i_video_link_validate(struct media_link *link) return ret; if (!sun6i_csi_is_format_supported(video->csi_dev, - video->fmt.fmt.pix.pixelformat, + video->format.fmt.pix.pixelformat, source_fmt.format.code)) { dev_err(video->csi_dev->dev, "Unsupported pixformat: 0x%x with mbus code: 0x%x!\n", - video->fmt.fmt.pix.pixelformat, + video->format.fmt.pix.pixelformat, source_fmt.format.code); return -EPIPE; } - if (source_fmt.format.width != video->fmt.fmt.pix.width || - source_fmt.format.height != video->fmt.fmt.pix.height) { + if (source_fmt.format.width != video->format.fmt.pix.width || + source_fmt.format.height != video->format.fmt.pix.height) { dev_err(video->csi_dev->dev, "Wrong width or height %ux%u (%ux%u expected)\n", - video->fmt.fmt.pix.width, video->fmt.fmt.pix.height, + 
video->format.fmt.pix.width, video->format.fmt.pix.height, source_fmt.format.width, source_fmt.format.height); return -EPIPE; } @@ -598,90 +618,109 @@ static const struct media_entity_operations sun6i_video_media_ops = { .link_validate = sun6i_video_link_validate }; -int sun6i_video_init(struct sun6i_video *video, - struct sun6i_csi_device *csi_dev, const char *name) +/* Video */ + +int sun6i_video_setup(struct sun6i_video *video, + struct sun6i_csi_device *csi_dev) { - struct sun6i_csi_v4l2 *v4l2 = &csi_dev->v4l2; - struct video_device *vdev = &video->vdev; - struct vb2_queue *vidq = &video->vb2_vidq; - struct v4l2_format fmt = { 0 }; + struct v4l2_device *v4l2_dev = &csi_dev->v4l2.v4l2_dev; + struct video_device *video_dev = &video->video_dev; + struct vb2_queue *queue = &video->queue; + struct media_pad *pad = &video->pad; + struct v4l2_format format = { 0 }; + struct v4l2_pix_format *pix_format = &format.fmt.pix; int ret; video->csi_dev = csi_dev; - /* Initialize the media entity... 
*/ - video->pad.flags = MEDIA_PAD_FL_SINK | MEDIA_PAD_FL_MUST_CONNECT; - vdev->entity.ops = &sun6i_video_media_ops; - ret = media_entity_pads_init(&vdev->entity, 1, &video->pad); + /* Media Entity */ + + video_dev->entity.ops = &sun6i_video_media_ops; + + /* Media Pad */ + + pad->flags = MEDIA_PAD_FL_SINK | MEDIA_PAD_FL_MUST_CONNECT; + + ret = media_entity_pads_init(&video_dev->entity, 1, pad); if (ret < 0) return ret; - mutex_init(&video->lock); + /* DMA queue */ INIT_LIST_HEAD(&video->dma_queue); spin_lock_init(&video->dma_queue_lock); video->sequence = 0; - /* Setup default format */ - fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - fmt.fmt.pix.pixelformat = supported_pixformats[0]; - fmt.fmt.pix.width = 1280; - fmt.fmt.pix.height = 720; - fmt.fmt.pix.field = V4L2_FIELD_NONE; - sun6i_video_set_fmt(video, &fmt); - - /* Initialize videobuf2 queue */ - vidq->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - vidq->io_modes = VB2_MMAP | VB2_DMABUF; - vidq->drv_priv = video; - vidq->buf_struct_size = sizeof(struct sun6i_csi_buffer); - vidq->ops = &sun6i_csi_vb2_ops; - vidq->mem_ops = &vb2_dma_contig_memops; - vidq->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; - vidq->lock = &video->lock; - /* Make sure non-dropped frame */ - vidq->min_buffers_needed = 3; - vidq->dev = csi_dev->dev; - - ret = vb2_queue_init(vidq); + /* Queue */ + + mutex_init(&video->lock); + + queue->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + queue->io_modes = VB2_MMAP | VB2_DMABUF; + queue->buf_struct_size = sizeof(struct sun6i_csi_buffer); + queue->ops = &sun6i_video_queue_ops; + queue->mem_ops = &vb2_dma_contig_memops; + queue->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; + queue->lock = &video->lock; + queue->dev = csi_dev->dev; + queue->drv_priv = video; + + /* Make sure non-dropped frame. 
*/ + queue->min_buffers_needed = 3; + + ret = vb2_queue_init(queue); if (ret) { - v4l2_err(&v4l2->v4l2_dev, "vb2_queue_init failed: %d\n", - ret); - goto clean_entity; + v4l2_err(v4l2_dev, "failed to initialize vb2 queue: %d\n", ret); + goto error_media_entity; } - /* Register video device */ - strscpy(vdev->name, name, sizeof(vdev->name)); - vdev->release = video_device_release_empty; - vdev->fops = &sun6i_video_fops; - vdev->ioctl_ops = &sun6i_video_ioctl_ops; - vdev->vfl_type = VFL_TYPE_VIDEO; - vdev->vfl_dir = VFL_DIR_RX; - vdev->v4l2_dev = &v4l2->v4l2_dev; - vdev->queue = vidq; - vdev->lock = &video->lock; - vdev->device_caps = V4L2_CAP_STREAMING | V4L2_CAP_VIDEO_CAPTURE; - video_set_drvdata(vdev, video); - - ret = video_register_device(vdev, VFL_TYPE_VIDEO, -1); + /* V4L2 Format */ + + format.type = queue->type; + pix_format->pixelformat = sun6i_video_formats[0]; + pix_format->width = 1280; + pix_format->height = 720; + pix_format->field = V4L2_FIELD_NONE; + + sun6i_video_format_set(video, &format); + + /* Video Device */ + + strscpy(video_dev->name, SUN6I_CSI_NAME, sizeof(video_dev->name)); + video_dev->device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_STREAMING; + video_dev->vfl_dir = VFL_DIR_RX; + video_dev->release = video_device_release_empty; + video_dev->fops = &sun6i_video_fops; + video_dev->ioctl_ops = &sun6i_video_ioctl_ops; + video_dev->v4l2_dev = v4l2_dev; + video_dev->queue = queue; + video_dev->lock = &video->lock; + + video_set_drvdata(video_dev, video); + + ret = video_register_device(video_dev, VFL_TYPE_VIDEO, -1); if (ret < 0) { - v4l2_err(&v4l2->v4l2_dev, - "video_register_device failed: %d\n", ret); - goto clean_entity; + v4l2_err(v4l2_dev, "failed to register video device: %d\n", + ret); + goto error_media_entity; } return 0; -clean_entity: - media_entity_cleanup(&video->vdev.entity); +error_media_entity: + media_entity_cleanup(&video_dev->entity); + mutex_destroy(&video->lock); + return ret; } void sun6i_video_cleanup(struct sun6i_video 
*video) { - vb2_video_unregister_device(&video->vdev); - media_entity_cleanup(&video->vdev.entity); + struct video_device *video_dev = &video->video_dev; + + vb2_video_unregister_device(video_dev); + media_entity_cleanup(&video_dev->entity); mutex_destroy(&video->lock); } diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.h b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.h index 30e37ee0d07f2..7864f062d05bc 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.h +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.h @@ -15,22 +15,22 @@ struct sun6i_csi_device; struct sun6i_video { struct sun6i_csi_device *csi_dev; - struct video_device vdev; - struct media_pad pad; - struct mutex lock; + struct video_device video_dev; + struct vb2_queue queue; + struct mutex lock; /* Queue lock. */ + struct media_pad pad; - struct vb2_queue vb2_vidq; - spinlock_t dma_queue_lock; struct list_head dma_queue; + spinlock_t dma_queue_lock; /* DMA queue lock. */ - unsigned int sequence; - struct v4l2_format fmt; + struct v4l2_format format; u32 mbus_code; + unsigned int sequence; }; -int sun6i_video_init(struct sun6i_video *video, - struct sun6i_csi_device *csi_dev, const char *name); +int sun6i_video_setup(struct sun6i_video *video, + struct sun6i_csi_device *csi_dev); void sun6i_video_cleanup(struct sun6i_video *video); void sun6i_video_frame_done(struct sun6i_video *video); -- GitLab From 92cc51588225d82a4e3d2358e47367ac52b1661a Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Fri, 26 Aug 2022 20:32:07 +0200 Subject: [PATCH 0620/2223] media: sun6i-csi: Pass and store csi device directly in video code The video structure is part of the main csi device structure, so pass pointers to that top-level structure directly. This makes it easier to navigate and access other elements. No functional change intended. 
Signed-off-by: Paul Kocialkowski Reviewed-by: Maxime Ripard Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../platform/sunxi/sun6i-csi/sun6i_csi.c | 8 +- .../platform/sunxi/sun6i-csi/sun6i_video.c | 91 ++++++++++--------- .../platform/sunxi/sun6i-csi/sun6i_video.h | 9 +- 3 files changed, 57 insertions(+), 51 deletions(-) diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c index b4f90b065a0c1..a55347b7a6d64 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c @@ -706,7 +706,7 @@ static int sun6i_csi_v4l2_setup(struct sun6i_csi_device *csi_dev) /* Video */ - ret = sun6i_video_setup(&csi_dev->video, csi_dev); + ret = sun6i_video_setup(csi_dev); if (ret) goto error_v4l2_device; @@ -735,7 +735,7 @@ error_v4l2_async_notifier: v4l2_async_nf_cleanup(notifier); error_video: - sun6i_video_cleanup(&csi_dev->video); + sun6i_video_cleanup(csi_dev); error_v4l2_device: v4l2_device_unregister(&v4l2->v4l2_dev); @@ -756,7 +756,7 @@ static void sun6i_csi_v4l2_cleanup(struct sun6i_csi_device *csi_dev) media_device_unregister(&v4l2->media_dev); v4l2_async_nf_unregister(&v4l2->notifier); v4l2_async_nf_cleanup(&v4l2->notifier); - sun6i_video_cleanup(&csi_dev->video); + sun6i_video_cleanup(csi_dev); v4l2_device_unregister(&v4l2->v4l2_dev); v4l2_ctrl_handler_free(&v4l2->ctrl_handler); media_device_cleanup(&v4l2->media_dev); @@ -787,7 +787,7 @@ static irqreturn_t sun6i_csi_interrupt(int irq, void *private) } if (status & CSI_CH_INT_STA_FD_PD) - sun6i_video_frame_done(&csi_dev->video); + sun6i_video_frame_done(csi_dev); regmap_write(regmap, CSI_CH_INT_STA_REG, status); diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c index 4710902447ac4..50e5136e2281d 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c +++ 
b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c @@ -100,7 +100,8 @@ static int sun6i_video_queue_setup(struct vb2_queue *queue, unsigned int sizes[], struct device *alloc_devs[]) { - struct sun6i_video *video = vb2_get_drv_priv(queue); + struct sun6i_csi_device *csi_dev = vb2_get_drv_priv(queue); + struct sun6i_video *video = &csi_dev->video; unsigned int size = video->format.fmt.pix.sizeimage; if (*planes_count) @@ -114,8 +115,8 @@ static int sun6i_video_queue_setup(struct vb2_queue *queue, static int sun6i_video_buffer_prepare(struct vb2_buffer *buffer) { - struct sun6i_video *video = vb2_get_drv_priv(buffer->vb2_queue); - struct sun6i_csi_device *csi_dev = video->csi_dev; + struct sun6i_csi_device *csi_dev = vb2_get_drv_priv(buffer->vb2_queue); + struct sun6i_video *video = &csi_dev->video; struct v4l2_device *v4l2_dev = &csi_dev->v4l2.v4l2_dev; struct vb2_v4l2_buffer *v4l2_buffer = to_vb2_v4l2_buffer(buffer); struct sun6i_csi_buffer *csi_buffer = @@ -138,7 +139,8 @@ static int sun6i_video_buffer_prepare(struct vb2_buffer *buffer) static void sun6i_video_buffer_queue(struct vb2_buffer *buffer) { - struct sun6i_video *video = vb2_get_drv_priv(buffer->vb2_queue); + struct sun6i_csi_device *csi_dev = vb2_get_drv_priv(buffer->vb2_queue); + struct sun6i_video *video = &csi_dev->video; struct vb2_v4l2_buffer *v4l2_buffer = to_vb2_v4l2_buffer(buffer); struct sun6i_csi_buffer *csi_buffer = container_of(v4l2_buffer, struct sun6i_csi_buffer, v4l2_buffer); @@ -153,7 +155,8 @@ static void sun6i_video_buffer_queue(struct vb2_buffer *buffer) static int sun6i_video_start_streaming(struct vb2_queue *queue, unsigned int count) { - struct sun6i_video *video = vb2_get_drv_priv(queue); + struct sun6i_csi_device *csi_dev = vb2_get_drv_priv(queue); + struct sun6i_video *video = &csi_dev->video; struct video_device *video_dev = &video->video_dev; struct sun6i_csi_buffer *buf; struct sun6i_csi_buffer *next_buf; @@ -185,7 +188,7 @@ static int sun6i_video_start_streaming(struct 
vb2_queue *queue, config.width = video->format.fmt.pix.width; config.height = video->format.fmt.pix.height; - ret = sun6i_csi_update_config(video->csi_dev, &config); + ret = sun6i_csi_update_config(csi_dev, &config); if (ret < 0) goto error_media_pipeline; @@ -194,9 +197,9 @@ static int sun6i_video_start_streaming(struct vb2_queue *queue, buf = list_first_entry(&video->dma_queue, struct sun6i_csi_buffer, list); buf->queued_to_csi = true; - sun6i_csi_update_buf_addr(video->csi_dev, buf->dma_addr); + sun6i_csi_update_buf_addr(csi_dev, buf->dma_addr); - sun6i_csi_set_stream(video->csi_dev, true); + sun6i_csi_set_stream(csi_dev, true); /* * CSI will lookup the next dma buffer for next frame before the @@ -217,7 +220,7 @@ static int sun6i_video_start_streaming(struct vb2_queue *queue, */ next_buf = list_next_entry(buf, list); next_buf->queued_to_csi = true; - sun6i_csi_update_buf_addr(video->csi_dev, next_buf->dma_addr); + sun6i_csi_update_buf_addr(csi_dev, next_buf->dma_addr); spin_unlock_irqrestore(&video->dma_queue_lock, flags); @@ -228,7 +231,7 @@ static int sun6i_video_start_streaming(struct vb2_queue *queue, return 0; error_stream: - sun6i_csi_set_stream(video->csi_dev, false); + sun6i_csi_set_stream(csi_dev, false); error_media_pipeline: video_device_pipeline_stop(video_dev); @@ -246,7 +249,8 @@ error_dma_queue_flush: static void sun6i_video_stop_streaming(struct vb2_queue *queue) { - struct sun6i_video *video = vb2_get_drv_priv(queue); + struct sun6i_csi_device *csi_dev = vb2_get_drv_priv(queue); + struct sun6i_video *video = &csi_dev->video; struct v4l2_subdev *subdev; unsigned long flags; struct sun6i_csi_buffer *buf; @@ -255,7 +259,7 @@ static void sun6i_video_stop_streaming(struct vb2_queue *queue) if (subdev) v4l2_subdev_call(subdev, video, s_stream, 0); - sun6i_csi_set_stream(video->csi_dev, false); + sun6i_csi_set_stream(csi_dev, false); video_device_pipeline_stop(&video->video_dev); @@ -267,8 +271,9 @@ static void sun6i_video_stop_streaming(struct 
vb2_queue *queue) spin_unlock_irqrestore(&video->dma_queue_lock, flags); } -void sun6i_video_frame_done(struct sun6i_video *video) +void sun6i_video_frame_done(struct sun6i_csi_device *csi_dev) { + struct sun6i_video *video = &csi_dev->video; struct sun6i_csi_buffer *buf; struct sun6i_csi_buffer *next_buf; struct vb2_v4l2_buffer *v4l2_buffer; @@ -278,7 +283,7 @@ void sun6i_video_frame_done(struct sun6i_video *video) buf = list_first_entry(&video->dma_queue, struct sun6i_csi_buffer, list); if (list_is_last(&buf->list, &video->dma_queue)) { - dev_dbg(video->csi_dev->dev, "Frame dropped!\n"); + dev_dbg(csi_dev->dev, "Frame dropped!\n"); goto complete; } @@ -290,8 +295,8 @@ void sun6i_video_frame_done(struct sun6i_video *video) */ if (!next_buf->queued_to_csi) { next_buf->queued_to_csi = true; - sun6i_csi_update_buf_addr(video->csi_dev, next_buf->dma_addr); - dev_dbg(video->csi_dev->dev, "Frame dropped!\n"); + sun6i_csi_update_buf_addr(csi_dev, next_buf->dma_addr); + dev_dbg(csi_dev->dev, "Frame dropped!\n"); goto complete; } @@ -305,9 +310,9 @@ void sun6i_video_frame_done(struct sun6i_video *video) if (!list_is_last(&next_buf->list, &video->dma_queue)) { next_buf = list_next_entry(next_buf, list); next_buf->queued_to_csi = true; - sun6i_csi_update_buf_addr(video->csi_dev, next_buf->dma_addr); + sun6i_csi_update_buf_addr(csi_dev, next_buf->dma_addr); } else { - dev_dbg(video->csi_dev->dev, "Next frame will be dropped!\n"); + dev_dbg(csi_dev->dev, "Next frame will be dropped!\n"); } complete: @@ -330,9 +335,8 @@ static const struct vb2_ops sun6i_video_queue_ops = { static int sun6i_video_querycap(struct file *file, void *private, struct v4l2_capability *capability) { - struct sun6i_video *video = video_drvdata(file); - struct sun6i_csi_device *csi_dev = video->csi_dev; - struct video_device *video_dev = &video->video_dev; + struct sun6i_csi_device *csi_dev = video_drvdata(file); + struct video_device *video_dev = &csi_dev->video.video_dev; strscpy(capability->driver, 
SUN6I_CSI_NAME, sizeof(capability->driver)); strscpy(capability->card, video_dev->name, sizeof(capability->card)); @@ -358,7 +362,8 @@ static int sun6i_video_enum_fmt(struct file *file, void *private, static int sun6i_video_g_fmt(struct file *file, void *private, struct v4l2_format *format) { - struct sun6i_video *video = video_drvdata(file); + struct sun6i_csi_device *csi_dev = video_drvdata(file); + struct sun6i_video *video = &csi_dev->video; *format = video->format; @@ -413,7 +418,8 @@ static int sun6i_video_format_set(struct sun6i_video *video, static int sun6i_video_s_fmt(struct file *file, void *private, struct v4l2_format *format) { - struct sun6i_video *video = video_drvdata(file); + struct sun6i_csi_device *csi_dev = video_drvdata(file); + struct sun6i_video *video = &csi_dev->video; if (vb2_is_busy(&video->queue)) return -EBUSY; @@ -424,7 +430,8 @@ static int sun6i_video_s_fmt(struct file *file, void *private, static int sun6i_video_try_fmt(struct file *file, void *private, struct v4l2_format *format) { - struct sun6i_video *video = video_drvdata(file); + struct sun6i_csi_device *csi_dev = video_drvdata(file); + struct sun6i_video *video = &csi_dev->video; return sun6i_video_format_try(video, format); } @@ -489,7 +496,8 @@ static const struct v4l2_ioctl_ops sun6i_video_ioctl_ops = { static int sun6i_video_open(struct file *file) { - struct sun6i_video *video = video_drvdata(file); + struct sun6i_csi_device *csi_dev = video_drvdata(file); + struct sun6i_video *video = &csi_dev->video; int ret = 0; if (mutex_lock_interruptible(&video->lock)) @@ -505,7 +513,7 @@ static int sun6i_video_open(struct file *file) /* Power on at first open. 
*/ if (v4l2_fh_is_singular_file(file)) { - ret = sun6i_csi_set_power(video->csi_dev, true); + ret = sun6i_csi_set_power(csi_dev, true); if (ret < 0) goto error_v4l2_fh; } @@ -525,7 +533,8 @@ error_lock: static int sun6i_video_close(struct file *file) { - struct sun6i_video *video = video_drvdata(file); + struct sun6i_csi_device *csi_dev = video_drvdata(file); + struct sun6i_video *video = &csi_dev->video; bool last_close; mutex_lock(&video->lock); @@ -537,7 +546,7 @@ static int sun6i_video_close(struct file *file) /* Power off at last close. */ if (last_close) - sun6i_csi_set_power(video->csi_dev, false); + sun6i_csi_set_power(csi_dev, false); mutex_unlock(&video->lock); @@ -574,15 +583,16 @@ static int sun6i_video_link_validate(struct media_link *link) { struct video_device *vdev = container_of(link->sink->entity, struct video_device, entity); - struct sun6i_video *video = video_get_drvdata(vdev); + struct sun6i_csi_device *csi_dev = video_get_drvdata(vdev); + struct sun6i_video *video = &csi_dev->video; struct v4l2_subdev_format source_fmt; int ret; video->mbus_code = 0; if (!media_pad_remote_pad_first(link->sink->entity->pads)) { - dev_info(video->csi_dev->dev, - "video node %s pad not connected\n", vdev->name); + dev_info(csi_dev->dev, "video node %s pad not connected\n", + vdev->name); return -ENOLINK; } @@ -590,10 +600,10 @@ static int sun6i_video_link_validate(struct media_link *link) if (ret < 0) return ret; - if (!sun6i_csi_is_format_supported(video->csi_dev, + if (!sun6i_csi_is_format_supported(csi_dev, video->format.fmt.pix.pixelformat, source_fmt.format.code)) { - dev_err(video->csi_dev->dev, + dev_err(csi_dev->dev, "Unsupported pixformat: 0x%x with mbus code: 0x%x!\n", video->format.fmt.pix.pixelformat, source_fmt.format.code); @@ -602,7 +612,7 @@ static int sun6i_video_link_validate(struct media_link *link) if (source_fmt.format.width != video->format.fmt.pix.width || source_fmt.format.height != video->format.fmt.pix.height) { - 
dev_err(video->csi_dev->dev, + dev_err(csi_dev->dev, "Wrong width or height %ux%u (%ux%u expected)\n", video->format.fmt.pix.width, video->format.fmt.pix.height, source_fmt.format.width, source_fmt.format.height); @@ -620,9 +630,9 @@ static const struct media_entity_operations sun6i_video_media_ops = { /* Video */ -int sun6i_video_setup(struct sun6i_video *video, - struct sun6i_csi_device *csi_dev) +int sun6i_video_setup(struct sun6i_csi_device *csi_dev) { + struct sun6i_video *video = &csi_dev->video; struct v4l2_device *v4l2_dev = &csi_dev->v4l2.v4l2_dev; struct video_device *video_dev = &video->video_dev; struct vb2_queue *queue = &video->queue; @@ -631,8 +641,6 @@ int sun6i_video_setup(struct sun6i_video *video, struct v4l2_pix_format *pix_format = &format.fmt.pix; int ret; - video->csi_dev = csi_dev; - /* Media Entity */ video_dev->entity.ops = &sun6i_video_media_ops; @@ -664,7 +672,7 @@ int sun6i_video_setup(struct sun6i_video *video, queue->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; queue->lock = &video->lock; queue->dev = csi_dev->dev; - queue->drv_priv = video; + queue->drv_priv = csi_dev; /* Make sure non-dropped frame. 
*/ queue->min_buffers_needed = 3; @@ -697,7 +705,7 @@ int sun6i_video_setup(struct sun6i_video *video, video_dev->queue = queue; video_dev->lock = &video->lock; - video_set_drvdata(video_dev, video); + video_set_drvdata(video_dev, csi_dev); ret = video_register_device(video_dev, VFL_TYPE_VIDEO, -1); if (ret < 0) { @@ -716,8 +724,9 @@ error_media_entity: return ret; } -void sun6i_video_cleanup(struct sun6i_video *video) +void sun6i_video_cleanup(struct sun6i_csi_device *csi_dev) { + struct sun6i_video *video = &csi_dev->video; struct video_device *video_dev = &video->video_dev; vb2_video_unregister_device(video_dev); diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.h b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.h index 7864f062d05bc..a917d2da6debb 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.h +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.h @@ -14,8 +14,6 @@ struct sun6i_csi_device; struct sun6i_video { - struct sun6i_csi_device *csi_dev; - struct video_device video_dev; struct vb2_queue queue; struct mutex lock; /* Queue lock. */ @@ -29,10 +27,9 @@ struct sun6i_video { unsigned int sequence; }; -int sun6i_video_setup(struct sun6i_video *video, - struct sun6i_csi_device *csi_dev); -void sun6i_video_cleanup(struct sun6i_video *video); +int sun6i_video_setup(struct sun6i_csi_device *csi_dev); +void sun6i_video_cleanup(struct sun6i_csi_device *csi_dev); -void sun6i_video_frame_done(struct sun6i_video *video); +void sun6i_video_frame_done(struct sun6i_csi_device *csi_dev); #endif /* __SUN6I_VIDEO_H__ */ -- GitLab From f9a6e5651efff2b3461ef8256067437a39b54e9e Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Fri, 26 Aug 2022 20:32:08 +0200 Subject: [PATCH 0621/2223] media: sun6i-csi: Register the media device after creation There is no particular need to register the media device in the subdev notify complete callback. Register it in the v4l2 code instead where it's more in-context. 
Signed-off-by: Paul Kocialkowski Acked-by: Jernej Skrabec Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c index a55347b7a6d64..e3d60b647cb20 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c @@ -638,7 +638,7 @@ static int sun6i_subdev_notify_complete(struct v4l2_async_notifier *notifier) if (ret < 0) return ret; - return media_device_register(&v4l2->media_dev); + return 0; } static const struct v4l2_async_notifier_operations sun6i_csi_async_ops = { @@ -685,6 +685,12 @@ static int sun6i_csi_v4l2_setup(struct sun6i_csi_device *csi_dev) media_device_init(media_dev); + ret = media_device_register(media_dev); + if (ret) { + dev_err(dev, "failed to register media device: %d\n", ret); + goto error_media; + } + /* V4L2 Control Handler */ ret = v4l2_ctrl_handler_init(&v4l2->ctrl_handler, 0); @@ -744,6 +750,7 @@ error_v4l2_ctrl: v4l2_ctrl_handler_free(&v4l2->ctrl_handler); error_media: + media_device_unregister(media_dev); media_device_cleanup(media_dev); return ret; -- GitLab From daafbb94adc543641cd37b04dbe911244b8bf0f6 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Fri, 26 Aug 2022 20:32:09 +0200 Subject: [PATCH 0622/2223] media: sun6i-csi: Remove controls handler from the driver The driver does not expose controls directly and thus does not need a controls handler for its own use. Controls attached to subdevs used to be exposed that way, however this can easily lead to issues when multiple subdevs attached to the same v4l2 device expose the same controls. Subdev controls should be set through each individual subdev node instead.
Signed-off-by: Paul Kocialkowski Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../media/platform/sunxi/sun6i-csi/sun6i_csi.c | 15 +-------------- .../media/platform/sunxi/sun6i-csi/sun6i_csi.h | 2 -- .../media/platform/sunxi/sun6i-csi/sun6i_video.c | 4 ---- 3 files changed, 1 insertion(+), 20 deletions(-) diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c index e3d60b647cb20..d74eaa3132d60 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c @@ -691,23 +691,14 @@ static int sun6i_csi_v4l2_setup(struct sun6i_csi_device *csi_dev) goto error_media; } - /* V4L2 Control Handler */ - - ret = v4l2_ctrl_handler_init(&v4l2->ctrl_handler, 0); - if (ret) { - dev_err(dev, "failed to init v4l2 control handler: %d\n", ret); - goto error_media; - } - /* V4L2 Device */ v4l2_dev->mdev = media_dev; - v4l2_dev->ctrl_handler = &v4l2->ctrl_handler; ret = v4l2_device_register(dev, v4l2_dev); if (ret) { dev_err(dev, "failed to register v4l2 device: %d\n", ret); - goto error_v4l2_ctrl; + goto error_media; } /* Video */ @@ -746,9 +737,6 @@ error_video: error_v4l2_device: v4l2_device_unregister(&v4l2->v4l2_dev); -error_v4l2_ctrl: - v4l2_ctrl_handler_free(&v4l2->ctrl_handler); - error_media: media_device_unregister(media_dev); media_device_cleanup(media_dev); @@ -765,7 +753,6 @@ static void sun6i_csi_v4l2_cleanup(struct sun6i_csi_device *csi_dev) v4l2_async_nf_cleanup(&v4l2->notifier); sun6i_video_cleanup(csi_dev); v4l2_device_unregister(&v4l2->v4l2_dev); - v4l2_ctrl_handler_free(&v4l2->ctrl_handler); media_device_cleanup(&v4l2->media_dev); } diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h index e301d80362cfc..4bb4c3d1be072 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h @@ -8,7 +8,6 @@ #ifndef 
__SUN6I_CSI_H__ #define __SUN6I_CSI_H__ -#include #include #include @@ -35,7 +34,6 @@ struct sun6i_csi_config { struct sun6i_csi_v4l2 { struct v4l2_device v4l2_dev; - struct v4l2_ctrl_handler ctrl_handler; struct media_device media_dev; struct v4l2_async_notifier notifier; diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c index 50e5136e2281d..d75f762abb786 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c @@ -486,10 +486,6 @@ static const struct v4l2_ioctl_ops sun6i_video_ioctl_ops = { .vidioc_dqbuf = vb2_ioctl_dqbuf, .vidioc_streamon = vb2_ioctl_streamon, .vidioc_streamoff = vb2_ioctl_streamoff, - - .vidioc_log_status = v4l2_ctrl_log_status, - .vidioc_subscribe_event = v4l2_ctrl_subscribe_event, - .vidioc_unsubscribe_event = v4l2_event_unsubscribe, }; /* V4L2 File */ -- GitLab From b11d91321cacf638cbfbbb424f9267d4146872a9 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Fri, 26 Aug 2022 20:32:10 +0200 Subject: [PATCH 0623/2223] media: sun6i-csi: Add media ops with link notify callback In order to keep the power use count fields balanced when link changes happen between v4l2_pipeline_pm_get/set calls (in open/close), the link_notify media operation callback needs to be registered. 
Signed-off-by: Paul Kocialkowski Acked-by: Jernej Skrabec Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c index d74eaa3132d60..8b99c17e8403f 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "sun6i_csi.h" #include "sun6i_csi_reg.h" @@ -574,6 +575,12 @@ void sun6i_csi_set_stream(struct sun6i_csi_device *csi_dev, bool enable) CSI_CAP_CH0_VCAP_ON); } +/* Media */ + +static const struct media_device_ops sun6i_csi_media_ops = { + .link_notify = v4l2_pipeline_link_notify, +}; + /* V4L2 */ static int sun6i_csi_link_entity(struct sun6i_csi_device *csi_dev, @@ -681,6 +688,7 @@ static int sun6i_csi_v4l2_setup(struct sun6i_csi_device *csi_dev) strscpy(media_dev->model, SUN6I_CSI_DESCRIPTION, sizeof(media_dev->model)); media_dev->hw_revision = 0; + media_dev->ops = &sun6i_csi_media_ops; media_dev->dev = dev; media_device_init(media_dev); -- GitLab From 7d13cf0e0023000768876c4e65c89e7714e9071f Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Fri, 26 Aug 2022 20:32:11 +0200 Subject: [PATCH 0624/2223] media: sun6i-csi: Introduce and use video helper functions Introduce some helpers for buffer and general video configuration. 
Signed-off-by: Paul Kocialkowski Reviewed-by: Jernej Skrabec Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../platform/sunxi/sun6i-csi/sun6i_video.c | 46 +++++++++++-------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c index d75f762abb786..dd3748d337bef 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c @@ -92,6 +92,29 @@ static bool sun6i_video_format_check(u32 format) return false; } +/* Video */ + +static void sun6i_video_buffer_configure(struct sun6i_csi_device *csi_dev, + struct sun6i_csi_buffer *csi_buffer) +{ + csi_buffer->queued_to_csi = true; + sun6i_csi_update_buf_addr(csi_dev, csi_buffer->dma_addr); +} + +static void sun6i_video_configure(struct sun6i_csi_device *csi_dev) +{ + struct sun6i_video *video = &csi_dev->video; + struct sun6i_csi_config config = { 0 }; + + config.pixelformat = video->format.fmt.pix.pixelformat; + config.code = video->mbus_code; + config.field = video->format.fmt.pix.field; + config.width = video->format.fmt.pix.width; + config.height = video->format.fmt.pix.height; + + sun6i_csi_update_config(csi_dev, &config); +} + /* Queue */ static int sun6i_video_queue_setup(struct vb2_queue *queue, @@ -160,7 +183,6 @@ static int sun6i_video_start_streaming(struct vb2_queue *queue, struct video_device *video_dev = &video->video_dev; struct sun6i_csi_buffer *buf; struct sun6i_csi_buffer *next_buf; - struct sun6i_csi_config config; struct v4l2_subdev *subdev; unsigned long flags; int ret; @@ -182,22 +204,13 @@ static int sun6i_video_start_streaming(struct vb2_queue *queue, goto error_media_pipeline; } - config.pixelformat = video->format.fmt.pix.pixelformat; - config.code = video->mbus_code; - config.field = video->format.fmt.pix.field; - config.width = video->format.fmt.pix.width; - config.height = 
video->format.fmt.pix.height; - - ret = sun6i_csi_update_config(csi_dev, &config); - if (ret < 0) - goto error_media_pipeline; + sun6i_video_configure(csi_dev); spin_lock_irqsave(&video->dma_queue_lock, flags); buf = list_first_entry(&video->dma_queue, struct sun6i_csi_buffer, list); - buf->queued_to_csi = true; - sun6i_csi_update_buf_addr(csi_dev, buf->dma_addr); + sun6i_video_buffer_configure(csi_dev, buf); sun6i_csi_set_stream(csi_dev, true); @@ -219,8 +232,7 @@ static int sun6i_video_start_streaming(struct vb2_queue *queue, * would also drop frame when lacking of queued buffer. */ next_buf = list_next_entry(buf, list); - next_buf->queued_to_csi = true; - sun6i_csi_update_buf_addr(csi_dev, next_buf->dma_addr); + sun6i_video_buffer_configure(csi_dev, next_buf); spin_unlock_irqrestore(&video->dma_queue_lock, flags); @@ -294,8 +306,7 @@ void sun6i_video_frame_done(struct sun6i_csi_device *csi_dev) * for next ISR call. */ if (!next_buf->queued_to_csi) { - next_buf->queued_to_csi = true; - sun6i_csi_update_buf_addr(csi_dev, next_buf->dma_addr); + sun6i_video_buffer_configure(csi_dev, next_buf); dev_dbg(csi_dev->dev, "Frame dropped!\n"); goto complete; } @@ -309,8 +320,7 @@ void sun6i_video_frame_done(struct sun6i_csi_device *csi_dev) /* Prepare buffer for next frame but one. */ if (!list_is_last(&next_buf->list, &video->dma_queue)) { next_buf = list_next_entry(next_buf, list); - next_buf->queued_to_csi = true; - sun6i_csi_update_buf_addr(csi_dev, next_buf->dma_addr); + sun6i_video_buffer_configure(csi_dev, next_buf); } else { dev_dbg(csi_dev->dev, "Next frame will be dropped!\n"); } -- GitLab From 70a7ce22e9229f216c638426a854efd5bd127b24 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Fri, 26 Aug 2022 20:32:12 +0200 Subject: [PATCH 0625/2223] media: sun6i-csi: Move csi buffer definition to main header file The buffer structure is a top-level definition, put it in the main header to keep things tidy. No functional change intended. 
Signed-off-by: Paul Kocialkowski Reviewed-by: Maxime Ripard Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h | 9 +++++++++ drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c | 8 -------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h index 4bb4c3d1be072..bab705678280c 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.h @@ -10,12 +10,21 @@ #include #include +#include #include "sun6i_video.h" #define SUN6I_CSI_NAME "sun6i-csi" #define SUN6I_CSI_DESCRIPTION "Allwinner A31 CSI Device" +struct sun6i_csi_buffer { + struct vb2_v4l2_buffer v4l2_buffer; + struct list_head list; + + dma_addr_t dma_addr; + bool queued_to_csi; +}; + /** * struct sun6i_csi_config - configs for sun6i csi * @pixelformat: v4l2 pixel format (V4L2_PIX_FMT_*) diff --git a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c index dd3748d337bef..791583d23a656 100644 --- a/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c +++ b/drivers/media/platform/sunxi/sun6i-csi/sun6i_video.c @@ -23,14 +23,6 @@ #define MAX_WIDTH (4800) #define MAX_HEIGHT (4800) -struct sun6i_csi_buffer { - struct vb2_v4l2_buffer v4l2_buffer; - struct list_head list; - - dma_addr_t dma_addr; - bool queued_to_csi; -}; - /* Helpers */ static struct v4l2_subdev * -- GitLab From 6dc0a438f91d5ece823261204248670995504139 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sat, 24 Sep 2022 22:35:30 -0700 Subject: [PATCH 0626/2223] Input: twl4030-vibra - drop legacy, non DT boot support Legacy or non DT boot is no longer possible on systems where the twl4030/5030 is used. Drop the support for handling legacy pdata.
Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20220616153323.29464-1-peter.ujfalusi@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/twl4030-vibra.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/input/misc/twl4030-vibra.c b/drivers/input/misc/twl4030-vibra.c index e0ff616fb857e..5619996da86fc 100644 --- a/drivers/input/misc/twl4030-vibra.c +++ b/drivers/input/misc/twl4030-vibra.c @@ -163,14 +163,10 @@ static int __maybe_unused twl4030_vibra_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(twl4030_vibra_pm_ops, twl4030_vibra_suspend, twl4030_vibra_resume); -static bool twl4030_vibra_check_coexist(struct twl4030_vibra_data *pdata, - struct device_node *parent) +static bool twl4030_vibra_check_coexist(struct device_node *parent) { struct device_node *node; - if (pdata && pdata->coexist) - return true; - node = of_get_child_by_name(parent, "codec"); if (node) { of_node_put(node); @@ -182,13 +178,12 @@ static bool twl4030_vibra_check_coexist(struct twl4030_vibra_data *pdata, static int twl4030_vibra_probe(struct platform_device *pdev) { - struct twl4030_vibra_data *pdata = dev_get_platdata(&pdev->dev); struct device_node *twl4030_core_node = pdev->dev.parent->of_node; struct vibra_info *info; int ret; - if (!pdata && !twl4030_core_node) { - dev_dbg(&pdev->dev, "platform_data not available\n"); + if (!twl4030_core_node) { + dev_dbg(&pdev->dev, "twl4030 OF node is missing\n"); return -EINVAL; } @@ -197,7 +192,7 @@ static int twl4030_vibra_probe(struct platform_device *pdev) return -ENOMEM; info->dev = &pdev->dev; - info->coexist = twl4030_vibra_check_coexist(pdata, twl4030_core_node); + info->coexist = twl4030_vibra_check_coexist(twl4030_core_node); INIT_WORK(&info->play_work, vibra_play_work); info->input_dev = devm_input_allocate_device(&pdev->dev); -- GitLab From 4160f9680d7f8bb0f4e4e114869146a694347b89 Mon Sep 17 00:00:00 2001 From: Luca Weiss Date: Sat, 17 Sep 2022 17:57:00 +0200 Subject: 
[PATCH 0627/2223] dt-bindings: input: qcom,pm8xxx-vib: convert to yaml Convert the PM8xxx PMIC Vibrator bindings to dt-schema. Signed-off-by: Luca Weiss Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220917155705.2284-1-luca@z3ntu.xyz Signed-off-by: Dmitry Torokhov --- .../bindings/input/qcom,pm8xxx-vib.txt | 23 ----------- .../bindings/input/qcom,pm8xxx-vib.yaml | 38 +++++++++++++++++++ 2 files changed, 38 insertions(+), 23 deletions(-) delete mode 100644 Documentation/devicetree/bindings/input/qcom,pm8xxx-vib.txt create mode 100644 Documentation/devicetree/bindings/input/qcom,pm8xxx-vib.yaml diff --git a/Documentation/devicetree/bindings/input/qcom,pm8xxx-vib.txt b/Documentation/devicetree/bindings/input/qcom,pm8xxx-vib.txt deleted file mode 100644 index 64bb990075c31..0000000000000 --- a/Documentation/devicetree/bindings/input/qcom,pm8xxx-vib.txt +++ /dev/null @@ -1,23 +0,0 @@ -Qualcomm PM8xxx PMIC Vibrator - -PROPERTIES - -- compatible: - Usage: required - Value type: - Definition: must be one of: - "qcom,pm8058-vib" - "qcom,pm8916-vib" - "qcom,pm8921-vib" - -- reg: - Usage: required - Value type: - Definition: address of vibration control register - -EXAMPLE - - vibrator@4a { - compatible = "qcom,pm8058-vib"; - reg = <0x4a>; - }; diff --git a/Documentation/devicetree/bindings/input/qcom,pm8xxx-vib.yaml b/Documentation/devicetree/bindings/input/qcom,pm8xxx-vib.yaml new file mode 100644 index 0000000000000..c8832cd0d7da2 --- /dev/null +++ b/Documentation/devicetree/bindings/input/qcom,pm8xxx-vib.yaml @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/qcom,pm8xxx-vib.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm PM8xxx PMIC Vibrator + +maintainers: + - Bjorn Andersson + +properties: + compatible: + enum: + - qcom,pm8058-vib + - qcom,pm8916-vib + - qcom,pm8921-vib + + reg: + maxItems: 1 + +required: + - compatible + - reg + 
+additionalProperties: false + +examples: + - | + pmic { + #address-cells = <1>; + #size-cells = <0>; + + vibrator@4a { + compatible = "qcom,pm8058-vib"; + reg = <0x4a>; + }; + }; -- GitLab From 5db8a0d31cab2798f693a360628dcafaee1ecce9 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Sun, 25 Sep 2022 00:19:11 -0700 Subject: [PATCH 0628/2223] Input: joydev - fix comment typo The double `from' is duplicated in the comment, remove one. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20220804120800.60415-1-wangborong@cdjrlc.com Signed-off-by: Dmitry Torokhov --- drivers/input/joydev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c index b45ddb4570028..5824bca02e5a0 100644 --- a/drivers/input/joydev.c +++ b/drivers/input/joydev.c @@ -746,7 +746,7 @@ static void joydev_cleanup(struct joydev *joydev) } /* - * These codes are copied from from hid-ids.h, unfortunately there is no common + * These codes are copied from hid-ids.h, unfortunately there is no common * usb_ids/bt_ids.h header. */ #define USB_VENDOR_ID_SONY 0x054c -- GitLab From c3b6eed31f441129aee1cd8e59fd20ba2842f3c9 Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Tue, 6 Sep 2022 11:24:35 +0800 Subject: [PATCH 0629/2223] cifs: misc: fix spelling typo in comment Fix spelling typo in comment. 
Reported-by: k2ci Signed-off-by: Jiangshan Yi Signed-off-by: Steve French --- fs/cifs/misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 87f60f7367315..c6679398fff9f 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -824,7 +824,7 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) free_dentry_path(page); } -/* parses DFS refferal V3 structure +/* parses DFS referral V3 structure * caller is responsible for freeing target_nodes * returns: * - on success - 0 -- GitLab From d7752a6c60c2de889425e27912e3fa96ba5626b2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 19 Sep 2022 23:08:03 -0500 Subject: [PATCH 0630/2223] MAINTAINERS: Add Tom Talpey as cifs.ko reviewer He has been actively reviewing and submitting patches, especially for smbdirect (RDMA) so add him as a reviewer for cifs.ko Acked-by: Tom Talpey Signed-off-by: Steve French --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f5ca4aefd184c..77ce0efb84c54 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5139,6 +5139,7 @@ M: Steve French R: Paulo Alcantara (DFS, global name space) R: Ronnie Sahlberg (directory leases, sparse files) R: Shyam Prasad N (multichannel) +R: Tom Talpey (RDMA, smbdirect) L: linux-cifs@vger.kernel.org L: samba-technical@lists.samba.org (moderated for non-subscribers) S: Supported -- GitLab From 09a1f9a168ae1f69f701689429871793174417d2 Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya Date: Fri, 16 Sep 2022 20:57:05 -0300 Subject: [PATCH 0631/2223] cifs: return correct error in ->calc_signature() If an error happens while getting the key or session in the ->calc_signature implementations, 0 (success) is returned. Fix it by returning a proper error code. Since it seems to be highly unlikely to happen wrap the rc check in unlikely() too. 
Reviewed-by: Ronnie Sahlberg Fixes: 32811d242ff6 ("cifs: Start using per session key for smb2/3 for signature generation") Signed-off-by: Enzo Matsumiya Signed-off-by: Steve French --- fs/cifs/smb2transport.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 1a5fc3314dbf5..4640fc4a8b133 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -225,9 +225,9 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, struct smb_rqst drqst; ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId)); - if (!ses) { + if (unlikely(!ses)) { cifs_server_dbg(VFS, "%s: Could not find session\n", __func__); - return 0; + return -ENOENT; } memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE); @@ -557,8 +557,10 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, u8 key[SMB3_SIGN_KEY_SIZE]; rc = smb2_get_sign_key(le64_to_cpu(shdr->SessionId), server, key); - if (rc) - return 0; + if (unlikely(rc)) { + cifs_server_dbg(VFS, "%s: Could not get signing key\n", __func__); + return rc; + } if (allocate_crypto) { rc = cifs_alloc_hash("cmac(aes)", &hash, &sdesc); -- GitLab From bb44c31cdcac107344dd2fcc3bd0504a53575c51 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 20 Sep 2022 14:32:02 +1000 Subject: [PATCH 0632/2223] cifs: destage dirty pages before re-reading them for cache=none This is the opposite case of kernel bugzilla 216301. If we mmap a file using cache=none and then proceed to update the mmapped area these updates are not reflected in a later pread() of that part of the file. To fix this we must first destage any dirty pages in the range before we allow the pread() to proceed. 
Cc: stable@vger.kernel.org Reviewed-by: Paulo Alcantara (SUSE) Reviewed-by: Enzo Matsumiya Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6f38b134a3468..7d756721e1a68 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4271,6 +4271,15 @@ static ssize_t __cifs_readv( len = ctx->len; } + if (direct) { + rc = filemap_write_and_wait_range(file->f_inode->i_mapping, + offset, offset + len - 1); + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return -EAGAIN; + } + } + /* grab a lock here due to read response handlers can access ctx */ mutex_lock(&ctx->aio_mutex); -- GitLab From 096560dd13251e351176aef54b7aee91c99920a3 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 23 Sep 2022 14:54:47 +0800 Subject: [PATCH 0633/2223] KVM: arm64: vgic: Remove duplicate check in update_affinity_collection() The 'coll' parameter to update_affinity_collection() is never NULL, so comparing it with 'ite->collection' is enough to cover both the NULL case and the "another collection" case. Remove the duplicate check in update_affinity_collection(). 
Signed-off-by: Gavin Shan [maz: repainted commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220923065447.323445-1-gshan@redhat.com --- arch/arm64/kvm/vgic/vgic-its.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 9d3299a702423..24d7778d1ce63 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -406,7 +406,7 @@ static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its, struct its_ite *ite; for_each_lpi_its(device, ite, its) { - if (!ite->collection || coll != ite->collection) + if (ite->collection != coll) continue; update_affinity_ite(kvm, ite); -- GitLab From b2a4d007c347b4cb4c60f7512733c3f8300a129c Mon Sep 17 00:00:00 2001 From: Elliot Berman Date: Tue, 20 Sep 2022 12:06:58 -0700 Subject: [PATCH 0634/2223] KVM: arm64: Ignore kvm-arm.mode if !is_hyp_mode_available() Ignore kvm-arm.mode if !is_hyp_mode_available(). Specifically, we want to avoid switching kvm_mode to KVM_MODE_PROTECTED if hypervisor mode is not available. This prevents "Protected KVM" cpu capability being reported when Linux is booting in EL1 and would not have KVM enabled. Reasonably though, we should warn if the command line is requesting a KVM mode at all if KVM isn't actually available. Allow "kvm-arm.mode=none" to skip the warning since this would disable KVM anyway. 
Signed-off-by: Elliot Berman Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220920190658.2880184-1-quic_eberman@quicinc.com --- arch/arm64/kvm/arm.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 2ff0ef62abadc..c7fb2ad8be9f5 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2270,6 +2270,16 @@ static int __init early_kvm_mode_cfg(char *arg) if (!arg) return -EINVAL; + if (strcmp(arg, "none") == 0) { + kvm_mode = KVM_MODE_NONE; + return 0; + } + + if (!is_hyp_mode_available()) { + pr_warn_once("KVM is not available. Ignoring kvm-arm.mode\n"); + return 0; + } + if (strcmp(arg, "protected") == 0) { if (!is_kernel_in_hyp_mode()) kvm_mode = KVM_MODE_PROTECTED; @@ -2284,11 +2294,6 @@ static int __init early_kvm_mode_cfg(char *arg) return 0; } - if (strcmp(arg, "none") == 0) { - kvm_mode = KVM_MODE_NONE; - return 0; - } - return -EINVAL; } early_param("kvm-arm.mode", early_kvm_mode_cfg); -- GitLab From ddc9589d7921d7af4cc8fa6e0477d83fd95adef5 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 23 Sep 2022 12:47:34 -0700 Subject: [PATCH 0635/2223] Input: lm8333 - add missing linux/input.h include We are going to clean up matrix_keymap.h from unnecessary includes, so the driver needs to include API that it uses directly. 
Also let's sort includes alphabetically and drop unneeded irq.h Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220923194738.927408-1-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/lm8333.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/input/keyboard/lm8333.c b/drivers/input/keyboard/lm8333.c index 9dac22c141252..3052cd6dedacd 100644 --- a/drivers/input/keyboard/lm8333.c +++ b/drivers/input/keyboard/lm8333.c @@ -4,13 +4,13 @@ * Copyright (C) 2012 Wolfram Sang, Pengutronix */ -#include -#include -#include #include -#include +#include #include #include +#include +#include +#include #define LM8333_FIFO_READ 0x20 #define LM8333_DEBOUNCE 0x22 -- GitLab From d25a9d8f8d314e65a229cc828433ce3cc9cfbd4e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 23 Sep 2022 12:47:35 -0700 Subject: [PATCH 0636/2223] Input: st-keyscan - add missing linux/input.h and linux/of.h includes We are going to clean up matrix_keymap.h from unnecessary includes, so the driver needs to include API that it uses directly. Also let's sort includes alphabetically. 
Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220923194738.927408-2-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/st-keyscan.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/input/keyboard/st-keyscan.c b/drivers/input/keyboard/st-keyscan.c index a045d61165acc..a62bb8fff88c6 100644 --- a/drivers/input/keyboard/st-keyscan.c +++ b/drivers/input/keyboard/st-keyscan.c @@ -8,12 +8,14 @@ * Based on sh_keysc.c, copyright 2008 Magnus Damm */ -#include -#include -#include #include -#include +#include #include +#include +#include +#include +#include +#include #define ST_KEYSCAN_MAXKEYS 16 -- GitLab From 81a7cba79d0015fd50eb99ea6f682efce6005d05 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 23 Sep 2022 12:47:36 -0700 Subject: [PATCH 0637/2223] Input: mt6779-keypad - add missing linux/input.h include We are going to clean up matrix_keymap.h from unnecessary includes, so the driver needs to include API that it uses directly. Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220923194738.927408-3-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/mt6779-keypad.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/keyboard/mt6779-keypad.c b/drivers/input/keyboard/mt6779-keypad.c index a05e70af1fd03..19f69d167fbd8 100644 --- a/drivers/input/keyboard/mt6779-keypad.c +++ b/drivers/input/keyboard/mt6779-keypad.c @@ -5,6 +5,7 @@ */ #include #include +#include #include #include #include -- GitLab From 4e9cded6192800a7aed8df7290896e0956b54782 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 23 Sep 2022 12:47:37 -0700 Subject: [PATCH 0638/2223] Input: imx_keypad - add missing linux/input.h include We are going to clean up matrix_keymap.h from unnecessary includes, so the driver needs to include API that it uses directly. 
Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220923194738.927408-4-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/imx_keypad.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/keyboard/imx_keypad.c b/drivers/input/keyboard/imx_keypad.c index ae93038485716..e15a93619e827 100644 --- a/drivers/input/keyboard/imx_keypad.c +++ b/drivers/input/keyboard/imx_keypad.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include -- GitLab From da7a0123ed77dacf6c7bd2c4748bcd39d6bd1b82 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 23 Sep 2022 12:47:38 -0700 Subject: [PATCH 0639/2223] Input: ep93xx_keypad - add missing linux/input.h include We are going to clean up matrix_keymap.h from unnecessary includes, so the driver needs to include API that it uses directly. Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220923194738.927408-5-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/ep93xx_keypad.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/keyboard/ep93xx_keypad.c b/drivers/input/keyboard/ep93xx_keypad.c index 7a3b0664ab4f4..f5bf7524722a7 100644 --- a/drivers/input/keyboard/ep93xx_keypad.c +++ b/drivers/input/keyboard/ep93xx_keypad.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include -- GitLab From 4d1632151bde847230a0bd2318806380d309655f Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 26 Sep 2022 23:16:37 +0200 Subject: [PATCH 0640/2223] leds: pca963: fix misleading indentation I'm getting warnings: /tmp/next/build/drivers/leds/leds-pca963x.c: In function 'pca963x_register_leds': /tmp/next/build/drivers/leds/leds-pca963x.c:355:3: error: this 'if' clause does not guard... 
+[-Werror=misleading-indentation] 355 | if (hw_blink) | ^~ /tmp/next/build/drivers/leds/leds-pca963x.c:357:4: note: ...this statement, but the latter is +misleadingly indented as if it were guarded by the 'if' 357 | led->blinking = false; | ^~~ cc1: all warnings being treated as errors Fix the indentation to make them go away. Signed-off-by: Pavel Machek --- drivers/leds/leds-pca963x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/leds/leds-pca963x.c b/drivers/leds/leds-pca963x.c index d8d866bcda19f..a7e052c1db531 100644 --- a/drivers/leds/leds-pca963x.c +++ b/drivers/leds/leds-pca963x.c @@ -354,7 +354,7 @@ static int pca963x_register_leds(struct i2c_client *client, led->led_cdev.brightness_set_blocking = pca963x_led_set; if (hw_blink) led->led_cdev.blink_set = pca963x_blink_set; - led->blinking = false; + led->blinking = false; init_data.fwnode = child; /* for backwards compatibility */ -- GitLab From 3b0e81a1cdc9afbddb0543d08e38edb4e33c4baf Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:49 +0000 Subject: [PATCH 0641/2223] mmap: change zeroing of maple tree in __vma_adjust() Only write to the maple tree if we are not inserting or the insert isn't going to overwrite the area to clear. This avoids spanning writes and node coalescing when unnecessary. The change requires a custom search for the linked list addition to find the correct VMA for the prev link. Link: https://lkml.kernel.org/r/20220906194824.2110408-19-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mmap.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index f60d83c7f2337..52a774e70e5b8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -567,11 +567,11 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, * mm's list and the mm tree. It has already been inserted into the interval tree. */ static void __insert_vm_struct(struct mm_struct *mm, struct ma_state *mas, - struct vm_area_struct *vma) + struct vm_area_struct *vma, unsigned long location) { struct vm_area_struct *prev; - mas_set(mas, vma->vm_start); + mas_set(mas, location); prev = mas_prev(mas, 0); vma_mas_store(vma, mas); __vma_link_list(mm, vma, prev); @@ -601,6 +601,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, int remove_next = 0; MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *exporter = NULL, *importer = NULL; + unsigned long ll_prev = vma->vm_start; /* linked list prev. 
*/ if (next && !insert) { if (end >= next->vm_end) { @@ -728,15 +729,27 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (start != vma->vm_start) { - if (vma->vm_start < start) + if ((vma->vm_start < start) && + (!insert || (insert->vm_end != start))) { vma_mas_szero(&mas, vma->vm_start, start); - vma_changed = true; + VM_WARN_ON(insert && insert->vm_start > vma->vm_start); + } else { + vma_changed = true; + } vma->vm_start = start; } if (end != vma->vm_end) { - if (vma->vm_end > end) - vma_mas_szero(&mas, end, vma->vm_end); - vma_changed = true; + if (vma->vm_end > end) { + if (!insert || (insert->vm_start != end)) { + vma_mas_szero(&mas, end, vma->vm_end); + VM_WARN_ON(insert && + insert->vm_end < vma->vm_end); + } else if (insert->vm_start == end) { + ll_prev = vma->vm_end; + } + } else { + vma_changed = true; + } vma->vm_end = end; if (!next) mm->highest_vm_end = vm_end_gap(vma); @@ -783,7 +796,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - __insert_vm_struct(mm, &mas, insert); + __insert_vm_struct(mm, &mas, insert, ll_prev); } if (anon_vma) { @@ -870,6 +883,7 @@ again: if (insert && file) uprobe_mmap(insert); + mas_destroy(&mas); validate_mm(mm); return 0; } -- GitLab From 7ccf089b262be2c7f2cdc6c08412a1939a812102 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:49 +0000 Subject: [PATCH 0642/2223] xen: use vma_lookup() in privcmd_ioctl_mmap() vma_lookup() walks the VMA tree for a specific value, find_vma() will search the tree after walking to a specific value. It is more efficient to only walk to the requested value since privcmd_ioctl_mmap() will exit the loop if vm_start != msg->va. Link: https://lkml.kernel.org/r/20220906194824.2110408-20-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/xen/privcmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index e88e8f6f0a334..fae50a24630bd 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -282,7 +282,7 @@ static long privcmd_ioctl_mmap(struct file *file, void __user *udata) struct page, lru); struct privcmd_mmap_entry *msg = page_address(page); - vma = find_vma(mm, msg->va); + vma = vma_lookup(mm, msg->va); rc = -EINVAL; if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) -- GitLab From dc8635b25e87232f62276c02899b9d21dd0793c2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:49 +0000 Subject: [PATCH 0643/2223] mm: optimize find_exact_vma() to use vma_lookup() Use vma_lookup() to walk the tree to the start value requested. If the vma at the start does not match, then the answer is NULL and there is no need to look at the next vma the way that find_vma() would. Link: https://lkml.kernel.org/r/20220906194824.2110408-21-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index dfce1aaa7a648..a80083091f53a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2850,7 +2850,7 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) { - struct vm_area_struct *vma = find_vma(mm, vm_start); + struct vm_area_struct *vma = vma_lookup(mm, vm_start); if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) vma = NULL; -- GitLab From 94d815b2798bad12d0aec912add7665e69a48400 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:50 +0000 Subject: [PATCH 0644/2223] mm/khugepaged: optimize collapse_pte_mapped_thp() by using vma_lookup() vma_lookup() will walk the vma tree once and not continue to look for the next vma. Since the exact vma is checked below, this is a more optimal way of searching. Link: https://lkml.kernel.org/r/20220906194824.2110408-22-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/khugepaged.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index dc09cfe76e1f2..9ff3d39b286f8 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1389,7 +1389,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) { unsigned long haddr = addr & HPAGE_PMD_MASK; - struct vm_area_struct *vma = find_vma(mm, haddr); + struct vm_area_struct *vma = vma_lookup(mm, haddr); struct page *hpage; pte_t *start_pte, *pte; pmd_t *pmd; -- GitLab From 2e7ce7d354f2fae4c9becb8af799cbedf4f71665 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:50 +0000 Subject: [PATCH 0645/2223] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap() Avoid allocating a new VMA when a vma modification can occur. When a brk() can expand or contract a VMA, then the single store operation will only modify one index of the maple tree instead of causing a node to split or coalesce. This avoids unnecessary allocations/frees of maple tree nodes and VMAs. Move some limit & flag verifications out of the do_brk_flags() function to use only relevant checks in the code path of brk() and vm_brk_flags(). Set the vma to check if it can expand in vm_brk_flags() if extra criteria are met. Drop userfaultfd from do_brk_flags() path and only use it in vm_brk_flags() path since that is the only place a munmap will happen. Add a wrapper for munmap for the brk case called do_brk_munmap(). Link: https://lkml.kernel.org/r/20220906194824.2110408-23-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mmap.c | 237 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 177 insertions(+), 60 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 52a774e70e5b8..0baa2ca5b0bf0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -147,17 +147,40 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, - struct list_head *uf); +/* + * check_brk_limits() - Use platform specific check of range & verify mlock + * limits. + * @addr: The address to check + * @len: The size of increase. + * + * Return: 0 on success. + */ +static int check_brk_limits(unsigned long addr, unsigned long len) +{ + unsigned long mapped_addr; + + mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (IS_ERR_VALUE(mapped_addr)) + return mapped_addr; + + return mlock_future_check(current->mm, current->mm->def_flags, len); +} +static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long newbrk, unsigned long oldbrk, + struct list_head *uf); +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma, + unsigned long addr, unsigned long request, + unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; - struct vm_area_struct *next; + struct vm_area_struct *brkvma, *next = NULL; unsigned long min_brk; bool populate; bool downgraded = false; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, 0, 0); if (mmap_write_lock_killable(mm)) return -EINTR; @@ -199,35 +222,52 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Always allow shrinking brk. - * __do_munmap() may downgrade mmap_lock to read. 
+ * do_brk_munmap() may downgrade mmap_lock to read. */ if (brk <= mm->brk) { int ret; + /* Search one past newbrk */ + mas_set(&mas, newbrk); + brkvma = mas_find(&mas, oldbrk); + BUG_ON(brkvma == NULL); + if (brkvma->vm_start >= oldbrk) + goto out; /* mapping intersects with an existing non-brk vma. */ /* - * mm->brk must to be protected by write mmap_lock so update it - * before downgrading mmap_lock. When __do_munmap() fails, - * mm->brk will be restored from origbrk. + * mm->brk must be protected by write mmap_lock. + * do_brk_munmap() may downgrade the lock, so update it + * before calling do_brk_munmap(). */ mm->brk = brk; - ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true); - if (ret < 0) { - mm->brk = origbrk; - goto out; - } else if (ret == 1) { + mas.last = oldbrk - 1; + ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf); + if (ret == 1) { downgraded = true; - } - goto success; + goto success; + } else if (!ret) + goto success; + + mm->brk = origbrk; + goto out; } - /* Check against existing mmap mappings. */ - next = find_vma(mm, oldbrk); + if (check_brk_limits(oldbrk, newbrk - oldbrk)) + goto out; + + /* + * Only check if the next VMA is within the stack_guard_gap of the + * expansion area + */ + mas_set(&mas, oldbrk); + next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; + brkvma = mas_prev(&mas, mm->start_brk); /* Ok, looks good - let it rip. */ - if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) + if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; + mm->brk = brk; success: @@ -2762,38 +2802,55 @@ out: } /* - * this is really a simplified "do_mmap". it only handles - * anonymous maps. eventually we may be able to do some - * brk-specific accounting here. + * brk_munmap() - Unmap a parital vma. + * @mas: The maple tree state. 
+ * @vma: The vma to be modified + * @newbrk: the start of the address to unmap + * @oldbrk: The end of the address to unmap + * @uf: The userfaultfd list_head + * + * Returns: 1 on success. + * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if + * possible. */ -static int do_brk_flags(unsigned long addr, unsigned long len, - unsigned long flags, struct list_head *uf) +static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long newbrk, unsigned long oldbrk, + struct list_head *uf) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev; - pgoff_t pgoff = addr >> PAGE_SHIFT; - int error; - unsigned long mapped_addr; - validate_mm_mt(mm); - - /* Until we need other flags, refuse anything except VM_EXEC. */ - if ((flags & (~VM_EXEC)) != 0) - return -EINVAL; - flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - - mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (IS_ERR_VALUE(mapped_addr)) - return mapped_addr; + struct mm_struct *mm = vma->vm_mm; + int ret; - error = mlock_future_check(mm, mm->def_flags, len); - if (error) - return error; + arch_unmap(mm, newbrk, oldbrk); + ret = __do_munmap(mm, newbrk, oldbrk - newbrk, uf, true); + validate_mm_mt(mm); + return ret; +} - /* Clear old maps, set up prev and uf */ - if (munmap_vma_range(mm, addr, len, &prev, uf)) - return -ENOMEM; +/* + * do_brk_flags() - Increase the brk vma if the flags match. + * @mas: The maple tree state. + * @addr: The start address + * @len: The length of the increase + * @vma: The vma, + * @flags: The VMA Flags + * + * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags + * do not match then create a new anonymous VMA. Eventually we may be able to + * do some brk-specific accounting here. 
+ */ +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long addr, unsigned long len, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *prev = NULL; - /* Check against address space limits *after* clearing old maps... */ + validate_mm_mt(mm); + /* + * Check against address space limits by the changed size + * Note: This happens *after* clearing old mappings in some code paths. + */ + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; @@ -2803,30 +2860,54 @@ static int do_brk_flags(unsigned long addr, unsigned long len, if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) return -ENOMEM; - /* Can we just expand an old private anonymous mapping? */ - vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); - if (vma) - goto out; - /* - * create a vma struct for an anonymous mapping + * Expand the existing vma if possible; Note that singular lists do not + * occur after forking, so the expand will only happen on new VMAs. 
*/ - vma = vm_area_alloc(mm); - if (!vma) { - vm_unacct_memory(len >> PAGE_SHIFT); - return -ENOMEM; + if (vma && + (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) && + ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) { + mas->index = vma->vm_start; + mas->last = addr + len - 1; + vma_adjust_trans_huge(vma, addr, addr + len, 0); + if (vma->anon_vma) { + anon_vma_lock_write(vma->anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + } + vma->vm_end = addr + len; + vma->vm_flags |= VM_SOFTDIRTY; + if (mas_store_gfp(mas, vma, GFP_KERNEL)) + goto mas_expand_failed; + + if (vma->anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(vma->anon_vma); + } + khugepaged_enter_vma(vma, flags); + goto out; } + prev = vma; + + /* create a vma struct for an anonymous mapping */ + vma = vm_area_alloc(mm); + if (!vma) + goto vma_alloc_fail; vma_set_anonymous(vma); vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_pgoff = pgoff; + vma->vm_pgoff = addr >> PAGE_SHIFT; vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); - if (vma_link(mm, vma, prev)) - goto no_vma_link; + mas_set_range(mas, vma->vm_start, addr + len - 1); + if (mas_store_gfp(mas, vma, GFP_KERNEL)) + goto mas_store_fail; + + if (!prev) + prev = mas_prev(mas, 0); + __vma_link_list(mm, vma, prev); + mm->map_count++; out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; @@ -2837,18 +2918,29 @@ out: validate_mm_mt(mm); return 0; -no_vma_link: +mas_store_fail: vm_area_free(vma); +vma_alloc_fail: + vm_unacct_memory(len >> PAGE_SHIFT); + return -ENOMEM; + +mas_expand_failed: + if (vma->anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(vma->anon_vma); + } return -ENOMEM; } int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; unsigned long len; int ret; bool populate; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, addr, addr); 
len = PAGE_ALIGN(request); if (len < request) @@ -2859,13 +2951,38 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (mmap_write_lock_killable(mm)) return -EINTR; - ret = do_brk_flags(addr, len, flags, &uf); + /* Until we need other flags, refuse anything except VM_EXEC. */ + if ((flags & (~VM_EXEC)) != 0) + return -EINVAL; + + ret = check_brk_limits(addr, len); + if (ret) + goto limits_failed; + + if (find_vma_intersection(mm, addr, addr + len)) + ret = do_munmap(mm, addr, len, &uf); + + if (ret) + goto munmap_failed; + + vma = mas_prev(&mas, 0); + if (!vma || vma->vm_end != addr || vma_policy(vma) || + !can_vma_merge_after(vma, flags, NULL, NULL, + addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) + vma = NULL; + + ret = do_brk_flags(&mas, vma, addr, len, flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); return ret; + +munmap_failed: +limits_failed: + mmap_write_unlock(mm); + return ret; } EXPORT_SYMBOL(vm_brk_flags); -- GitLab From abdba2dda0c477ca708a939b02f9b2e74666ed2d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:50 +0000 Subject: [PATCH 0646/2223] mm: use maple tree operations for find_vma_intersection() Move find_vma_intersection() to mmap.c and change implementation to maple tree. When searching for a vma within a range, it is easier to use the maple tree interface. Exported find_vma_intersection() for kvm module. Link: https://lkml.kernel.org/r/20220906194824.2110408-24-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 ++++------------------ mm/mmap.c | 29 +++++++++++++++++++++++++++++ mm/nommu.c | 11 +++++++++++ 3 files changed, 44 insertions(+), 18 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a80083091f53a..06a6b8db75b7c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2778,26 +2778,12 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); -/** - * find_vma_intersection() - Look up the first VMA which intersects the interval - * @mm: The process address space. - * @start_addr: The inclusive start user address. - * @end_addr: The exclusive end user address. - * - * Returns: The first VMA within the provided range, %NULL otherwise. Assumes - * start_addr < end_addr. +/* + * Look up the first VMA which intersects the interval [start_addr, end_addr) + * NULL if none. Assume start_addr < end_addr. */ -static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, - unsigned long start_addr, - unsigned long end_addr) -{ - struct vm_area_struct *vma = find_vma(mm, start_addr); - - if (vma && end_addr <= vma->vm_start) - vma = NULL; - return vma; -} + unsigned long start_addr, unsigned long end_addr); /** * vma_lookup() - Find a VMA at a specific address diff --git a/mm/mmap.c b/mm/mmap.c index 0baa2ca5b0bf0..699af34c35730 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2061,6 +2061,35 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); +/** + * find_vma_intersection() - Look up the first VMA which intersects the interval + * @mm: The process address space. 
+ * @start_addr: The inclusive start user address. + * @end_addr: The exclusive end user address. + * + * Returns: The first VMA within the provided range, %NULL otherwise. Assumes + * start_addr < end_addr. + */ +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) +{ + struct vm_area_struct *vma; + unsigned long index = start_addr; + + mmap_assert_locked(mm); + /* Check the cache first. */ + vma = vmacache_find(mm, start_addr); + if (likely(vma)) + return vma; + + vma = mt_find(&mm->mm_mt, &index, end_addr - 1); + if (vma) + vmacache_update(start_addr, vma); + return vma; +} +EXPORT_SYMBOL(find_vma_intersection); + /** * find_vma() - Find the VMA for a given address, or the next VMA. * @mm: The mm_struct to check diff --git a/mm/nommu.c b/mm/nommu.c index 321c7e6718a89..2702790d05d3c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -642,6 +642,17 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) vm_area_free(vma); } +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) +{ + unsigned long index = start_addr; + + mmap_assert_locked(mm); + return mt_find(&mm->mm_mt, &index, end_addr - 1); +} +EXPORT_SYMBOL(find_vma_intersection); + /* * look up the first VMA in which addr resides, NULL if none * - should be called with mm->mmap_lock at least held readlocked -- GitLab From 4dd1b84140c1b87a89d69a683bebbbdaeb620e39 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:51 +0000 Subject: [PATCH 0647/2223] mm/mmap: use advanced maple tree API for mmap_region() Changing mmap_region() to use the maple tree state and the advanced maple tree interface allows for a lot less tree walking. This change removes the last caller of munmap_vma_range(), so drop this unused function. 
Add vma_expand() to expand a VMA if possible by doing the necessary hugepage check, uprobe_munmap of files, dcache flush, modifications then undoing the detaches, etc. Link: https://lkml.kernel.org/r/20220906194824.2110408-25-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mmap.c | 251 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 203 insertions(+), 48 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 699af34c35730..7a1adc916957d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -470,28 +470,6 @@ static inline struct vm_area_struct *__vma_next(struct mm_struct *mm, return vma->vm_next; } -/* - * munmap_vma_range() - munmap VMAs that overlap a range. - * @mm: The mm struct - * @start: The start of the range. - * @len: The length of the range. - * @pprev: pointer to the pointer that will be set to previous vm_area_struct - * - * Find all the vm_area_struct that overlap from @start to - * @end and munmap them. Set @pprev to the previous vm_area_struct. - * - * Returns: -ENOMEM on munmap failure or 0 on success. 
- */ -static inline int -munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len, - struct vm_area_struct **pprev, struct list_head *uf) -{ - while (range_has_overlap(mm, start, start + len, pprev)) - if (do_munmap(mm, start, len, uf)) - return -ENOMEM; - return 0; -} - static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { @@ -618,6 +596,129 @@ static void __insert_vm_struct(struct mm_struct *mm, struct ma_state *mas, mm->map_count++; } +/* + * vma_expand - Expand an existing VMA + * + * @mas: The maple state + * @vma: The vma to expand + * @start: The start of the vma + * @end: The exclusive end of the vma + * @pgoff: The page offset of vma + * @next: The current of next vma. + * + * Expand @vma to @start and @end. Can expand off the start and end. Will + * expand over @next if it's different from @vma and @end == @next->vm_end. + * Checking if the @vma can expand and merge with @next needs to be handled by + * the caller. + * + * Returns: 0 on success + */ +inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next) +{ + struct mm_struct *mm = vma->vm_mm; + struct address_space *mapping = NULL; + struct rb_root_cached *root = NULL; + struct anon_vma *anon_vma = vma->anon_vma; + struct file *file = vma->vm_file; + bool remove_next = false; + + if (next && (vma != next) && (end == next->vm_end)) { + remove_next = true; + if (next->anon_vma && !vma->anon_vma) { + int error; + + anon_vma = next->anon_vma; + vma->anon_vma = anon_vma; + error = anon_vma_clone(vma, next); + if (error) + return error; + } + } + + /* Not merging but overwriting any part of next is not handled. 
*/ + VM_BUG_ON(next && !remove_next && next != vma && end > next->vm_start); + /* Only handles expanding */ + VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); + + if (mas_preallocate(mas, vma, GFP_KERNEL)) + goto nomem; + + vma_adjust_trans_huge(vma, start, end, 0); + + if (file) { + mapping = file->f_mapping; + root = &mapping->i_mmap; + uprobe_munmap(vma, vma->vm_start, vma->vm_end); + i_mmap_lock_write(mapping); + } + + if (anon_vma) { + anon_vma_lock_write(anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + } + + if (file) { + flush_dcache_mmap_lock(mapping); + vma_interval_tree_remove(vma, root); + } + + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + /* Note: mas must be pointing to the expanding VMA */ + vma_mas_store(vma, mas); + + if (file) { + vma_interval_tree_insert(vma, root); + flush_dcache_mmap_unlock(mapping); + } + + /* Expanding over the next vma */ + if (remove_next) { + /* Remove from mm linked list - also updates highest_vm_end */ + __vma_unlink_list(mm, next); + + /* Kill the cache */ + vmacache_invalidate(mm); + + if (file) + __remove_shared_vm_struct(next, file, mapping); + + } else if (!next) { + mm->highest_vm_end = vm_end_gap(vma); + } + + if (anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(anon_vma); + } + + if (file) { + i_mmap_unlock_write(mapping); + uprobe_mmap(vma); + } + + if (remove_next) { + if (file) { + uprobe_munmap(next, next->vm_start, next->vm_end); + fput(file); + } + if (next->anon_vma) + anon_vma_merge(vma, next); + mm->map_count--; + mpol_put(vma_policy(next)); + vm_area_free(next); + } + + validate_mm(mm); + return 0; + +nomem: + return -ENOMEM; +} + /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. 
@@ -1630,9 +1731,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct list_head *uf) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev, *merge; - int error; + struct vm_area_struct *vma = NULL; + struct vm_area_struct *next, *prev, *merge; + pgoff_t pglen = len >> PAGE_SHIFT; unsigned long charged = 0; + unsigned long end = addr + len; + unsigned long merge_start = addr, merge_end = end; + pgoff_t vm_pgoff; + int error; + MA_STATE(mas, &mm->mm_mt, addr, end - 1); /* Check against address space limit. */ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { @@ -1642,16 +1749,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * MAP_FIXED may remove pages of mappings that intersects with * requested mapping. Account for the pages it would unmap. */ - nr_pages = count_vma_pages_range(mm, addr, addr + len); + nr_pages = count_vma_pages_range(mm, addr, end); if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages)) return -ENOMEM; } - /* Clear old maps, set up prev and uf */ - if (munmap_vma_range(mm, addr, len, &prev, uf)) + /* Unmap any existing mapping in the area */ + if (do_munmap(mm, addr, len, uf)) return -ENOMEM; + /* * Private writable mapping: check memory availability */ @@ -1662,14 +1770,43 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vm_flags |= VM_ACCOUNT; } - /* - * Can we just expand an old mapping? 
- */ - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); - if (vma) - goto out; + next = mas_next(&mas, ULONG_MAX); + prev = mas_prev(&mas, 0); + if (vm_flags & VM_SPECIAL) + goto cannot_expand; + + /* Attempt to expand an old mapping */ + /* Check next */ + if (next && next->vm_start == end && !vma_policy(next) && + can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, + NULL_VM_UFFD_CTX, NULL)) { + merge_end = next->vm_end; + vma = next; + vm_pgoff = next->vm_pgoff - pglen; + } + + /* Check prev */ + if (prev && prev->vm_end == addr && !vma_policy(prev) && + (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, + pgoff, vma->vm_userfaultfd_ctx, NULL) : + can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, + NULL_VM_UFFD_CTX, NULL))) { + merge_start = prev->vm_start; + vma = prev; + vm_pgoff = prev->vm_pgoff; + } + + + /* Actually expand, if possible */ + if (vma && + !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) { + khugepaged_enter_vma(vma, vm_flags); + goto expanded; + } + mas.index = addr; + mas.last = end - 1; +cannot_expand: /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but @@ -1682,7 +1819,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } vma->vm_start = addr; - vma->vm_end = addr + len; + vma->vm_end = end; vma->vm_flags = vm_flags; vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; @@ -1703,28 +1840,32 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * * Answer: Yes, several device drivers can do it in their * f_op->mmap method. -DaveM - * Bug: If addr is changed, prev, rb_link, rb_parent should - * be updated for vma_link() */ WARN_ON_ONCE(addr != vma->vm_start); addr = vma->vm_start; + mas_reset(&mas); - /* If vm_flags changed after call_mmap(), we should try merge vma again - * as we may succeed this time. 
+ /* + * If vm_flags changed after call_mmap(), we should try merge + * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && prev)) { merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (merge) { - /* ->mmap() can change vma->vm_file and fput the original file. So - * fput the vma->vm_file here or we would add an extra fput for file - * and cause general protection fault ultimately. + /* + * ->mmap() can change vma->vm_file and fput + * the original file. So fput the vma->vm_file + * here or we would add an extra fput for file + * and cause general protection fault + * ultimately. */ fput(vma->vm_file); vm_area_free(vma); vma = merge; /* Update vm_flags to pick up the change. */ + addr = vma->vm_start; vm_flags = vma->vm_flags; goto unmap_writable; } @@ -1748,7 +1889,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto free_vma; } - if (vma_link(mm, vma, prev)) { + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { error = -ENOMEM; if (file) goto unmap_and_free_vma; @@ -1756,6 +1897,22 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto free_vma; } + if (vma->vm_file) + i_mmap_lock_write(vma->vm_file->f_mapping); + + vma_mas_store(vma, &mas); + __vma_link_list(mm, vma, prev); + mm->map_count++; + if (vma->vm_file) { + if (vma->vm_flags & VM_SHARED) + mapping_allow_writable(vma->vm_file->f_mapping); + + flush_dcache_mmap_lock(vma->vm_file->f_mapping); + vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); + flush_dcache_mmap_unlock(vma->vm_file->f_mapping); + i_mmap_unlock_write(vma->vm_file->f_mapping); + } + /* * vma_merge() calls khugepaged_enter_vma() either, the below * call covers the non-merge case. 
@@ -1767,7 +1924,7 @@ unmap_writable: if (file && vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); file = vma->vm_file; -out: +expanded: perf_event_mmap(vma); vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); @@ -1794,6 +1951,7 @@ out: vma_set_page_prot(vma); + validate_mm(mm); return addr; unmap_and_free_vma: @@ -1809,6 +1967,7 @@ free_vma: unacct_error: if (charged) vm_unacct_memory(charged); + validate_mm(mm); return error; } @@ -2632,10 +2791,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, prev = vma->vm_prev; /* we have start < vma->vm_end */ - /* if it doesn't overlap, we have nothing.. */ - if (vma->vm_start >= end) - return 0; - /* * If we need to split any vma, do it now to save pain later. * -- GitLab From 7964cf8caa4dfa42c4149f3833d3878713cda3dc Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:51 +0000 Subject: [PATCH 0648/2223] mm: remove vmacache By using the maple tree and the maple tree state, the vmacache is no longer beneficial and is complicating the VMA code. Remove the vmacache to reduce the work in keeping it up to date and code complexity. Link: https://lkml.kernel.org/r/20220906194824.2110408-26-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/exec.c | 3 - fs/proc/task_mmu.c | 1 - include/linux/mm_types.h | 1 - include/linux/mm_types_task.h | 12 ---- include/linux/sched.h | 1 - include/linux/vm_event_item.h | 4 -- include/linux/vmacache.h | 28 -------- include/linux/vmstat.h | 6 -- kernel/debug/debug_core.c | 12 ---- kernel/fork.c | 5 -- lib/Kconfig.debug | 8 --- mm/Makefile | 2 +- mm/debug.c | 4 +- mm/mmap.c | 31 +-------- mm/nommu.c | 37 ++--------- mm/vmacache.c | 117 ---------------------------------- mm/vmstat.c | 4 -- 17 files changed, 9 insertions(+), 267 deletions(-) delete mode 100644 include/linux/vmacache.h delete mode 100644 mm/vmacache.c diff --git a/fs/exec.c b/fs/exec.c index 507a317d54db6..2b919b30dc97b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -1027,8 +1026,6 @@ static int exec_mmap(struct mm_struct *mm) activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); - tsk->mm->vmacache_seqnum = 0; - vmacache_flush(tsk); task_unlock(tsk); lru_gen_use_mm(mm); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index db2f3a2946a07..9f70bc1c27661 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include #include #include #include diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ac747273c4d65..4541b74b1bdb9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -475,7 +475,6 @@ struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ struct maple_tree mm_mt; - u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, 
unsigned long len, diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index c1bc6731125cb..0bb4b6da99939 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -24,18 +24,6 @@ IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) -/* - * The per task VMA cache array: - */ -#define VMACACHE_BITS 2 -#define VMACACHE_SIZE (1U << VMACACHE_BITS) -#define VMACACHE_MASK (VMACACHE_SIZE - 1) - -struct vmacache { - u64 seqnum; - struct vm_area_struct *vmas[VMACACHE_SIZE]; -}; - /* * When updating this, please also update struct resident_page_types[] in * kernel/fork.c diff --git a/include/linux/sched.h b/include/linux/sched.h index a2dcfb91df032..fbac3c19fe354 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -861,7 +861,6 @@ struct task_struct { struct mm_struct *active_mm; /* Per-thread vma caching: */ - struct vmacache vmacache; #ifdef SPLIT_RSS_COUNTING struct task_rss_stat rss_stat; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index f3fc36cd2276a..3518dba1e02f4 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -129,10 +129,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, NR_TLB_LOCAL_FLUSH_ALL, NR_TLB_LOCAL_FLUSH_ONE, #endif /* CONFIG_DEBUG_TLBFLUSH */ -#ifdef CONFIG_DEBUG_VM_VMACACHE - VMACACHE_FIND_CALLS, - VMACACHE_FIND_HITS, -#endif #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h deleted file mode 100644 index 6fce268a4588e..0000000000000 --- a/include/linux/vmacache.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_VMACACHE_H -#define __LINUX_VMACACHE_H - -#include -#include - -static inline void vmacache_flush(struct task_struct *tsk) -{ - memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas)); -} - -extern void vmacache_update(unsigned long addr, struct vm_area_struct 
*newvma); -extern struct vm_area_struct *vmacache_find(struct mm_struct *mm, - unsigned long addr); - -#ifndef CONFIG_MMU -extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, - unsigned long start, - unsigned long end); -#endif - -static inline void vmacache_invalidate(struct mm_struct *mm) -{ - mm->vmacache_seqnum++; -} - -#endif /* __LINUX_VMACACHE_H */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index bfe38869498d7..19cf5b6892ceb 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -125,12 +125,6 @@ static inline void vm_events_fold_cpu(int cpu) #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) #endif -#ifdef CONFIG_DEBUG_VM_VMACACHE -#define count_vm_vmacache_event(x) count_vm_event(x) -#else -#define count_vm_vmacache_event(x) do {} while (0) -#endif - #define __count_zid_vm_events(item, zid, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 7beceb447211d..d5e9ccde3ab8e 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -283,17 +282,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) if (!CACHE_FLUSH_IS_SAFE) return; - if (current->mm) { - int i; - - for (i = 0; i < VMACACHE_SIZE; i++) { - if (!current->vmacache.vmas[i]) - continue; - flush_cache_range(current->vmacache.vmas[i], - addr, addr + BREAK_INSTR_SIZE); - } - } - /* Force flush instruction cache if it was outside the mm */ flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } diff --git a/kernel/fork.c b/kernel/fork.c index 5f81c009bb202..430f63cd7a371 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -1128,7 +1127,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->mmap = NULL; mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); 
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); - mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); seqcount_init(&mm->write_protect_seq); @@ -1585,9 +1583,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) if (!oldmm) return 0; - /* initialize the new vmacache entries */ - vmacache_flush(tsk); - if (clone_flags & CLONE_VM) { mmget(oldmm); mm = oldmm; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2becf60995e18..6d1544d9201e4 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -812,14 +812,6 @@ config DEBUG_VM If unsure, say N. -config DEBUG_VM_VMACACHE - bool "Debug VMA caching" - depends on DEBUG_VM - help - Enable this to turn on VMA caching debug information. Doing so - can cause significant overhead, so only enable it in non-production - environments. - config DEBUG_VM_MAPLE_TREE bool "Debug VM maple trees" depends on DEBUG_VM diff --git a/mm/Makefile b/mm/Makefile index 488f604e77e07..a731d1decbb12 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -52,7 +52,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o percpu.o slab_common.o \ - compaction.o vmacache.o \ + compaction.o \ interval_tree.o list_lru.o workingset.o \ debug.o gup.o mmap_lock.o $(mmu-y) diff --git a/mm/debug.c b/mm/debug.c index bef329bf28f01..2d625ca0e3269 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -155,7 +155,7 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { - pr_emerg("mm %px mmap %px seqnum %llu task_size %lu\n" + pr_emerg("mm %px mmap %px task_size %lu\n" #ifdef CONFIG_MMU "get_unmapped_area %px\n" #endif @@ -183,7 +183,7 @@ void dump_mm(const struct mm_struct *mm) "tlb_flush_pending %d\n" "def_flags: %#lx(%pGv)\n", - mm, mm->mmap, (long long) mm->vmacache_seqnum, mm->task_size, + mm, mm->mmap, mm->task_size, #ifdef CONFIG_MMU mm->get_unmapped_area, #endif diff --git a/mm/mmap.c 
b/mm/mmap.c index 7a1adc916957d..7872642e8993d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -680,9 +679,6 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, /* Remove from mm linked list - also updates highest_vm_end */ __vma_unlink_list(mm, next); - /* Kill the cache */ - vmacache_invalidate(mm); - if (file) __remove_shared_vm_struct(next, file, mapping); @@ -923,8 +919,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, __vma_unlink_list(mm, next); if (remove_next == 2) __vma_unlink_list(mm, next_next); - /* Kill the cache */ - vmacache_invalidate(mm); if (file) { __remove_shared_vm_struct(next, file, mapping); @@ -2233,19 +2227,10 @@ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr) { - struct vm_area_struct *vma; unsigned long index = start_addr; mmap_assert_locked(mm); - /* Check the cache first. */ - vma = vmacache_find(mm, start_addr); - if (likely(vma)) - return vma; - - vma = mt_find(&mm->mm_mt, &index, end_addr - 1); - if (vma) - vmacache_update(start_addr, vma); - return vma; + return mt_find(&mm->mm_mt, &index, end_addr - 1); } EXPORT_SYMBOL(find_vma_intersection); @@ -2259,19 +2244,10 @@ EXPORT_SYMBOL(find_vma_intersection); */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma; unsigned long index = addr; mmap_assert_locked(mm); - /* Check the cache first. */ - vma = vmacache_find(mm, addr); - if (likely(vma)) - return vma; - - vma = mt_find(&mm->mm_mt, &index, ULONG_MAX); - if (vma) - vmacache_update(addr, vma); - return vma; + return mt_find(&mm->mm_mt, &index, ULONG_MAX); } EXPORT_SYMBOL(find_vma); @@ -2660,9 +2636,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct ma_state *mas, mm->highest_vm_end = prev ? 
vm_end_gap(prev) : 0; tail_vma->vm_next = NULL; - /* Kill the cache */ - vmacache_invalidate(mm); - /* * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or * VM_GROWSUP VMA. Such VMAs can change their size under diff --git a/mm/nommu.c b/mm/nommu.c index 2702790d05d3c..265a444a2cc27 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -598,23 +597,12 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) */ static void delete_vma_from_mm(struct vm_area_struct *vma) { - int i; - struct address_space *mapping; - struct mm_struct *mm = vma->vm_mm; - struct task_struct *curr = current; MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); - mm->map_count--; - for (i = 0; i < VMACACHE_SIZE; i++) { - /* if the vma is cached, invalidate the entire cache */ - if (curr->vmacache.vmas[i] == vma) { - vmacache_invalidate(mm); - break; - } - } - + vma->vm_mm->map_count--; /* remove the VMA from the mapping */ if (vma->vm_file) { + struct address_space *mapping; mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); @@ -626,7 +614,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) /* remove from the MM's tree and list */ vma_mas_remove(vma, &mas); - __vma_unlink_list(mm, vma); + __vma_unlink_list(vma->vm_mm, vma); } /* @@ -659,20 +647,9 @@ EXPORT_SYMBOL(find_vma_intersection); */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma; MA_STATE(mas, &mm->mm_mt, addr, addr); - /* check the cache first */ - vma = vmacache_find(mm, addr); - if (likely(vma)) - return vma; - - vma = mas_walk(&mas); - - if (vma) - vmacache_update(addr, vma); - - return vma; + return mas_walk(&mas); } EXPORT_SYMBOL(find_vma); @@ -706,11 +683,6 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, unsigned long end = addr + len; MA_STATE(mas, &mm->mm_mt, addr, addr); - /* check the cache first */ - vma = 
vmacache_find_exact(mm, addr, end); - if (vma) - return vma; - vma = mas_walk(&mas); if (!vma) return NULL; @@ -719,7 +691,6 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, if (vma->vm_end != end) return NULL; - vmacache_update(addr, vma); return vma; } diff --git a/mm/vmacache.c b/mm/vmacache.c deleted file mode 100644 index 01a6e6688ec1f..0000000000000 --- a/mm/vmacache.c +++ /dev/null @@ -1,117 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2014 Davidlohr Bueso. - */ -#include -#include -#include -#include - -/* - * Hash based on the pmd of addr if configured with MMU, which provides a good - * hit rate for workloads with spatial locality. Otherwise, use pages. - */ -#ifdef CONFIG_MMU -#define VMACACHE_SHIFT PMD_SHIFT -#else -#define VMACACHE_SHIFT PAGE_SHIFT -#endif -#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK) - -/* - * This task may be accessing a foreign mm via (for example) - * get_user_pages()->find_vma(). The vmacache is task-local and this - * task's vmacache pertains to a different mm (ie, its own). There is - * nothing we can do here. - * - * Also handle the case where a kernel thread has adopted this mm via - * kthread_use_mm(). That kernel thread's vmacache is not applicable to this mm. - */ -static inline bool vmacache_valid_mm(struct mm_struct *mm) -{ - return current->mm == mm && !(current->flags & PF_KTHREAD); -} - -void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) -{ - if (vmacache_valid_mm(newvma->vm_mm)) - current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma; -} - -static bool vmacache_valid(struct mm_struct *mm) -{ - struct task_struct *curr; - - if (!vmacache_valid_mm(mm)) - return false; - - curr = current; - if (mm->vmacache_seqnum != curr->vmacache.seqnum) { - /* - * First attempt will always be invalid, initialize - * the new cache for this task here. 
- */ - curr->vmacache.seqnum = mm->vmacache_seqnum; - vmacache_flush(curr); - return false; - } - return true; -} - -struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) -{ - int idx = VMACACHE_HASH(addr); - int i; - - count_vm_vmacache_event(VMACACHE_FIND_CALLS); - - if (!vmacache_valid(mm)) - return NULL; - - for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[idx]; - - if (vma) { -#ifdef CONFIG_DEBUG_VM_VMACACHE - if (WARN_ON_ONCE(vma->vm_mm != mm)) - break; -#endif - if (vma->vm_start <= addr && vma->vm_end > addr) { - count_vm_vmacache_event(VMACACHE_FIND_HITS); - return vma; - } - } - if (++idx == VMACACHE_SIZE) - idx = 0; - } - - return NULL; -} - -#ifndef CONFIG_MMU -struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, - unsigned long start, - unsigned long end) -{ - int idx = VMACACHE_HASH(start); - int i; - - count_vm_vmacache_event(VMACACHE_FIND_CALLS); - - if (!vmacache_valid(mm)) - return NULL; - - for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[idx]; - - if (vma && vma->vm_start == start && vma->vm_end == end) { - count_vm_vmacache_event(VMACACHE_FIND_HITS); - return vma; - } - if (++idx == VMACACHE_SIZE) - idx = 0; - } - - return NULL; -} -#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 779f1ea6e8ea6..bd8040f25c272 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1389,10 +1389,6 @@ const char * const vmstat_text[] = { "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ -#ifdef CONFIG_DEBUG_VM_VMACACHE - "vmacache_find_calls", - "vmacache_find_hits", -#endif #ifdef CONFIG_SWAP "swap_ra", "swap_ra_hit", -- GitLab From d7c62295570f012e1d386ae6ed472b36baf037ad Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:51 +0000 Subject: [PATCH 0649/2223] mm: convert vma_lookup() to use mtree_load() Unlike the rbtree, the Maple Tree will return a NULL if there's nothing at a particular address. 
Since the previous commit dropped the vmacache, it is now possible to consult the tree directly. Link: https://lkml.kernel.org/r/20220906194824.2110408-27-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 06a6b8db75b7c..49a58807719b1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2795,12 +2795,7 @@ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = find_vma(mm, addr); - - if (vma && addr < vma->vm_start) - vma = NULL; - - return vma; + return mtree_load(&mm->mm_mt, addr); } static inline unsigned long vm_start_gap(struct vm_area_struct *vma) -- GitLab From e99668a56430a25a871113bcd3989ed20eae1cfc Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:52 +0000 Subject: [PATCH 0650/2223] mm/mmap: move mmap_region() below do_munmap() Relocation of code for the next commit. There should be no changes here. Link: https://lkml.kernel.org/r/20220906194824.2110408-28-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mmap.c | 490 +++++++++++++++++++++++++++--------------------------- 1 file changed, 245 insertions(+), 245 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 7872642e8993d..8c9e526994be4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1720,251 +1720,6 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } -unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = NULL; - struct vm_area_struct *next, *prev, *merge; - pgoff_t pglen = len >> PAGE_SHIFT; - unsigned long charged = 0; - unsigned long end = addr + len; - unsigned long merge_start = addr, merge_end = end; - pgoff_t vm_pgoff; - int error; - MA_STATE(mas, &mm->mm_mt, addr, end - 1); - - /* Check against address space limit. */ - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { - unsigned long nr_pages; - - /* - * MAP_FIXED may remove pages of mappings that intersects with - * requested mapping. Account for the pages it would unmap. 
- */ - nr_pages = count_vma_pages_range(mm, addr, end); - - if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) - return -ENOMEM; - } - - /* Unmap any existing mapping in the area */ - if (do_munmap(mm, addr, len, uf)) - return -ENOMEM; - - /* - * Private writable mapping: check memory availability - */ - if (accountable_mapping(file, vm_flags)) { - charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) - return -ENOMEM; - vm_flags |= VM_ACCOUNT; - } - - next = mas_next(&mas, ULONG_MAX); - prev = mas_prev(&mas, 0); - if (vm_flags & VM_SPECIAL) - goto cannot_expand; - - /* Attempt to expand an old mapping */ - /* Check next */ - if (next && next->vm_start == end && !vma_policy(next) && - can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, - NULL_VM_UFFD_CTX, NULL)) { - merge_end = next->vm_end; - vma = next; - vm_pgoff = next->vm_pgoff - pglen; - } - - /* Check prev */ - if (prev && prev->vm_end == addr && !vma_policy(prev) && - (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, - pgoff, vma->vm_userfaultfd_ctx, NULL) : - can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, - NULL_VM_UFFD_CTX, NULL))) { - merge_start = prev->vm_start; - vma = prev; - vm_pgoff = prev->vm_pgoff; - } - - - /* Actually expand, if possible */ - if (vma && - !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) { - khugepaged_enter_vma(vma, vm_flags); - goto expanded; - } - - mas.index = addr; - mas.last = end - 1; -cannot_expand: - /* - * Determine the object being mapped and call the appropriate - * specific mapper. the address has already been validated, but - * not unmapped, but the maps are removed from the list. 
- */ - vma = vm_area_alloc(mm); - if (!vma) { - error = -ENOMEM; - goto unacct_error; - } - - vma->vm_start = addr; - vma->vm_end = end; - vma->vm_flags = vm_flags; - vma->vm_page_prot = vm_get_page_prot(vm_flags); - vma->vm_pgoff = pgoff; - - if (file) { - if (vm_flags & VM_SHARED) { - error = mapping_map_writable(file->f_mapping); - if (error) - goto free_vma; - } - - vma->vm_file = get_file(file); - error = call_mmap(file, vma); - if (error) - goto unmap_and_free_vma; - - /* Can addr have changed?? - * - * Answer: Yes, several device drivers can do it in their - * f_op->mmap method. -DaveM - */ - WARN_ON_ONCE(addr != vma->vm_start); - - addr = vma->vm_start; - mas_reset(&mas); - - /* - * If vm_flags changed after call_mmap(), we should try merge - * vma again as we may succeed this time. - */ - if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); - if (merge) { - /* - * ->mmap() can change vma->vm_file and fput - * the original file. So fput the vma->vm_file - * here or we would add an extra fput for file - * and cause general protection fault - * ultimately. - */ - fput(vma->vm_file); - vm_area_free(vma); - vma = merge; - /* Update vm_flags to pick up the change. 
*/ - addr = vma->vm_start; - vm_flags = vma->vm_flags; - goto unmap_writable; - } - } - - vm_flags = vma->vm_flags; - } else if (vm_flags & VM_SHARED) { - error = shmem_zero_setup(vma); - if (error) - goto free_vma; - } else { - vma_set_anonymous(vma); - } - - /* Allow architectures to sanity-check the vm_flags */ - if (!arch_validate_flags(vma->vm_flags)) { - error = -EINVAL; - if (file) - goto unmap_and_free_vma; - else - goto free_vma; - } - - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { - error = -ENOMEM; - if (file) - goto unmap_and_free_vma; - else - goto free_vma; - } - - if (vma->vm_file) - i_mmap_lock_write(vma->vm_file->f_mapping); - - vma_mas_store(vma, &mas); - __vma_link_list(mm, vma, prev); - mm->map_count++; - if (vma->vm_file) { - if (vma->vm_flags & VM_SHARED) - mapping_allow_writable(vma->vm_file->f_mapping); - - flush_dcache_mmap_lock(vma->vm_file->f_mapping); - vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); - flush_dcache_mmap_unlock(vma->vm_file->f_mapping); - i_mmap_unlock_write(vma->vm_file->f_mapping); - } - - /* - * vma_merge() calls khugepaged_enter_vma() either, the below - * call covers the non-merge case. - */ - khugepaged_enter_vma(vma, vma->vm_flags); - - /* Once vma denies write, undo our temporary denial count */ -unmap_writable: - if (file && vm_flags & VM_SHARED) - mapping_unmap_writable(file->f_mapping); - file = vma->vm_file; -expanded: - perf_event_mmap(vma); - - vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); - if (vm_flags & VM_LOCKED) { - if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current->mm)) - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; - else - mm->locked_vm += (len >> PAGE_SHIFT); - } - - if (file) - uprobe_mmap(vma); - - /* - * New (or expanded) vma always get soft dirty status. 
- * Otherwise user-space soft-dirty page tracker won't - * be able to distinguish situation when vma area unmapped, - * then new mapped in-place (which must be aimed as - * a completely new data area). - */ - vma->vm_flags |= VM_SOFTDIRTY; - - vma_set_page_prot(vma); - - validate_mm(mm); - return addr; - -unmap_and_free_vma: - fput(vma->vm_file); - vma->vm_file = NULL; - - /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); - if (vm_flags & VM_SHARED) - mapping_unmap_writable(file->f_mapping); -free_vma: - vm_area_free(vma); -unacct_error: - if (charged) - vm_unacct_memory(charged); - validate_mm(mm); - return error; -} - /** * unmapped_area() - Find an area between the low_limit and the high_limit with * the correct alignment and offset, all from @info. Note: current->mm is used @@ -2840,6 +2595,251 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return __do_munmap(mm, start, len, uf, false); } +unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + struct vm_area_struct *next, *prev, *merge; + pgoff_t pglen = len >> PAGE_SHIFT; + unsigned long charged = 0; + unsigned long end = addr + len; + unsigned long merge_start = addr, merge_end = end; + pgoff_t vm_pgoff; + int error; + MA_STATE(mas, &mm->mm_mt, addr, end - 1); + + /* Check against address space limit. */ + if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { + unsigned long nr_pages; + + /* + * MAP_FIXED may remove pages of mappings that intersects with + * requested mapping. Account for the pages it would unmap. 
+ */ + nr_pages = count_vma_pages_range(mm, addr, end); + + if (!may_expand_vm(mm, vm_flags, + (len >> PAGE_SHIFT) - nr_pages)) + return -ENOMEM; + } + + /* Unmap any existing mapping in the area */ + if (do_munmap(mm, addr, len, uf)) + return -ENOMEM; + + /* + * Private writable mapping: check memory availability + */ + if (accountable_mapping(file, vm_flags)) { + charged = len >> PAGE_SHIFT; + if (security_vm_enough_memory_mm(mm, charged)) + return -ENOMEM; + vm_flags |= VM_ACCOUNT; + } + + next = mas_next(&mas, ULONG_MAX); + prev = mas_prev(&mas, 0); + if (vm_flags & VM_SPECIAL) + goto cannot_expand; + + /* Attempt to expand an old mapping */ + /* Check next */ + if (next && next->vm_start == end && !vma_policy(next) && + can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, + NULL_VM_UFFD_CTX, NULL)) { + merge_end = next->vm_end; + vma = next; + vm_pgoff = next->vm_pgoff - pglen; + } + + /* Check prev */ + if (prev && prev->vm_end == addr && !vma_policy(prev) && + (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, + pgoff, vma->vm_userfaultfd_ctx, NULL) : + can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, + NULL_VM_UFFD_CTX, NULL))) { + merge_start = prev->vm_start; + vma = prev; + vm_pgoff = prev->vm_pgoff; + } + + + /* Actually expand, if possible */ + if (vma && + !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) { + khugepaged_enter_vma(vma, vm_flags); + goto expanded; + } + + mas.index = addr; + mas.last = end - 1; +cannot_expand: + /* + * Determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. 
+ */ + vma = vm_area_alloc(mm); + if (!vma) { + error = -ENOMEM; + goto unacct_error; + } + + vma->vm_start = addr; + vma->vm_end = end; + vma->vm_flags = vm_flags; + vma->vm_page_prot = vm_get_page_prot(vm_flags); + vma->vm_pgoff = pgoff; + + if (file) { + if (vm_flags & VM_SHARED) { + error = mapping_map_writable(file->f_mapping); + if (error) + goto free_vma; + } + + vma->vm_file = get_file(file); + error = call_mmap(file, vma); + if (error) + goto unmap_and_free_vma; + + /* Can addr have changed?? + * + * Answer: Yes, several device drivers can do it in their + * f_op->mmap method. -DaveM + */ + WARN_ON_ONCE(addr != vma->vm_start); + + addr = vma->vm_start; + mas_reset(&mas); + + /* + * If vm_flags changed after call_mmap(), we should try merge + * vma again as we may succeed this time. + */ + if (unlikely(vm_flags != vma->vm_flags && prev)) { + merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, + NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); + if (merge) { + /* + * ->mmap() can change vma->vm_file and fput + * the original file. So fput the vma->vm_file + * here or we would add an extra fput for file + * and cause general protection fault + * ultimately. + */ + fput(vma->vm_file); + vm_area_free(vma); + vma = merge; + /* Update vm_flags to pick up the change. 
*/ + addr = vma->vm_start; + vm_flags = vma->vm_flags; + goto unmap_writable; + } + } + + vm_flags = vma->vm_flags; + } else if (vm_flags & VM_SHARED) { + error = shmem_zero_setup(vma); + if (error) + goto free_vma; + } else { + vma_set_anonymous(vma); + } + + /* Allow architectures to sanity-check the vm_flags */ + if (!arch_validate_flags(vma->vm_flags)) { + error = -EINVAL; + if (file) + goto unmap_and_free_vma; + else + goto free_vma; + } + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + error = -ENOMEM; + if (file) + goto unmap_and_free_vma; + else + goto free_vma; + } + + if (vma->vm_file) + i_mmap_lock_write(vma->vm_file->f_mapping); + + vma_mas_store(vma, &mas); + __vma_link_list(mm, vma, prev); + mm->map_count++; + if (vma->vm_file) { + if (vma->vm_flags & VM_SHARED) + mapping_allow_writable(vma->vm_file->f_mapping); + + flush_dcache_mmap_lock(vma->vm_file->f_mapping); + vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); + flush_dcache_mmap_unlock(vma->vm_file->f_mapping); + i_mmap_unlock_write(vma->vm_file->f_mapping); + } + + /* + * vma_merge() calls khugepaged_enter_vma() either, the below + * call covers the non-merge case. + */ + khugepaged_enter_vma(vma, vma->vm_flags); + + /* Once vma denies write, undo our temporary denial count */ +unmap_writable: + if (file && vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); + file = vma->vm_file; +expanded: + perf_event_mmap(vma); + + vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); + if (vm_flags & VM_LOCKED) { + if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current->mm)) + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + else + mm->locked_vm += (len >> PAGE_SHIFT); + } + + if (file) + uprobe_mmap(vma); + + /* + * New (or expanded) vma always get soft dirty status. 
+ * Otherwise user-space soft-dirty page tracker won't + * be able to distinguish situation when vma area unmapped, + * then new mapped in-place (which must be aimed as + * a completely new data area). + */ + vma->vm_flags |= VM_SOFTDIRTY; + + vma_set_page_prot(vma); + + validate_mm(mm); + return addr; + +unmap_and_free_vma: + fput(vma->vm_file); + vma->vm_file = NULL; + + /* Undo any partial mapping done by a device driver. */ + unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); + if (vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); +free_vma: + vm_area_free(vma); +unacct_error: + if (charged) + vm_unacct_memory(charged); + validate_mm(mm); + return error; +} + static int __vm_munmap(unsigned long start, size_t len, bool downgrade) { int ret; -- GitLab From 11f9a21ab65542189372b7d64bb2d2937dfdc9dc Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:52 +0000 Subject: [PATCH 0651/2223] mm/mmap: reorganize munmap to use maple states Remove __do_munmap() in favour of do_munmap(), do_mas_munmap(), and do_mas_align_munmap(). do_munmap() is a wrapper to create a maple state for any callers that have not been converted to the maple tree. do_mas_munmap() takes a maple state to munmap a range. This is just a small function which checks for error conditions and aligns the end of the range. do_mas_align_munmap() uses the aligned range to munmap a range. do_mas_align_munmap() starts with the first VMA in the range, then finds the last VMA in the range. Both start and end are split if necessary. Then the VMAs are removed from the linked list and the mm mlock count is updated at the same time. This is followed by a single tree operation that overwrites the area with a NULL. Finally, the detached list is unmapped and freed. By reorganizing the munmap calls as outlined, it is now possible to avoid extra work of aligning pre-aligned callers which are known to be safe, and to avoid extra VMA lookups or tree walks for modifications.
detach_vmas_to_be_unmapped() is no longer used, so drop this code. vm_brk_flags() can just call the do_mas_munmap() as it checks for intersecting VMAs directly. Link: https://lkml.kernel.org/r/20220906194824.2110408-29-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +- mm/mmap.c | 228 ++++++++++++++++++++++++++++----------------- mm/mremap.c | 17 ++-- 3 files changed, 158 insertions(+), 92 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 49a58807719b1..579449d6c23ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2710,8 +2710,9 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr, extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); -extern int __do_munmap(struct mm_struct *, unsigned long, size_t, - struct list_head *uf, bool downgrade); +extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); diff --git a/mm/mmap.c b/mm/mmap.c index 8c9e526994be4..6e587f4e3a7da 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2362,47 +2362,6 @@ static void unmap_region(struct mm_struct *mm, tlb_finish_mmu(&tlb); } -/* - * Create a list of vma's touched by the unmap, removing them from the mm's - * vma list as we go.. 
- */ -static bool -detach_vmas_to_be_unmapped(struct mm_struct *mm, struct ma_state *mas, - struct vm_area_struct *vma, struct vm_area_struct *prev, - unsigned long end) -{ - struct vm_area_struct **insertion_point; - struct vm_area_struct *tail_vma = NULL; - - insertion_point = (prev ? &prev->vm_next : &mm->mmap); - vma->vm_prev = NULL; - vma_mas_szero(mas, vma->vm_start, end); - do { - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm -= vma_pages(vma); - mm->map_count--; - tail_vma = vma; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); - *insertion_point = vma; - if (vma) - vma->vm_prev = prev; - else - mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; - tail_vma->vm_next = NULL; - - /* - * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or - * VM_GROWSUP VMA. Such VMAs can change their size under - * down_read(mmap_lock) and collide with the VMA we are about to unmap. - */ - if (vma && (vma->vm_flags & VM_GROWSDOWN)) - return false; - if (prev && (prev->vm_flags & VM_GROWSUP)) - return false; - return true; -} - /* * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. @@ -2485,40 +2444,51 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } -/* Munmap is split into 2 main parts -- this part which finds - * what needs doing, and the areas themselves, which do the - * work. This now handles partial unmappings. 
- * Jeremy Fitzhardinge - */ -int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, - struct list_head *uf, bool downgrade) +static inline int +unlock_range(struct vm_area_struct *start, struct vm_area_struct **tail, + unsigned long limit) { - unsigned long end; - struct vm_area_struct *vma, *prev, *last; - int error = -ENOMEM; - MA_STATE(mas, &mm->mm_mt, 0, 0); + struct mm_struct *mm = start->vm_mm; + struct vm_area_struct *tmp = start; + int count = 0; - if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) - return -EINVAL; + while (tmp && tmp->vm_start < limit) { + *tail = tmp; + count++; + if (tmp->vm_flags & VM_LOCKED) + mm->locked_vm -= vma_pages(tmp); - len = PAGE_ALIGN(len); - end = start + len; - if (len == 0) - return -EINVAL; + tmp = tmp->vm_next; + } - /* arch_unmap() might do unmaps itself. */ - arch_unmap(mm, start, end); + return count; +} - /* Find the first overlapping VMA where start < vma->vm_end */ - vma = find_vma_intersection(mm, start, end); - if (!vma) - return 0; +/* + * do_mas_align_munmap() - munmap the aligned region from @start to @end. + * @mas: The maple_state, ideally set up to alter the correct tree location. + * @vma: The starting vm_area_struct + * @mm: The mm_struct + * @start: The aligned start address to munmap. + * @end: The aligned end address to munmap. + * @uf: The userfaultfd list_head + * @downgrade: Set to true to attempt a write downgrade of the mmap_sem + * + * If @downgrade is true, check return code for potential release of the lock. 
+ */ +static int +do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, + struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf, bool downgrade) +{ + struct vm_area_struct *prev, *last; + int error = -ENOMEM; + /* we have start < vma->vm_end */ - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(mas, vma, GFP_KERNEL)) return -ENOMEM; - prev = vma->vm_prev; - /* we have start < vma->vm_end */ + mas->last = end - 1; /* * If we need to split any vma, do it now to save pain later. * @@ -2539,17 +2509,31 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, error = __split_vma(mm, vma, start, 0); if (error) goto split_failed; + prev = vma; + vma = __vma_next(mm, prev); + mas->index = start; + mas_reset(mas); + } else { + prev = vma->vm_prev; } + if (vma->vm_end >= end) + last = vma; + else + last = find_vma_intersection(mm, end - 1, end); + /* Does it split the last one? */ - last = find_vma(mm, end); - if (last && end > last->vm_start) { + if (last && end < last->vm_end) { error = __split_vma(mm, last, end, 1); + if (error) goto split_failed; + + if (vma == last) + vma = __vma_next(mm, prev); + mas_reset(mas); } - vma = __vma_next(mm, prev); if (unlikely(uf)) { /* @@ -2562,16 +2546,46 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, * failure that it's not worth optimizing it for. */ error = userfaultfd_unmap_prep(vma, start, end, uf); + if (error) goto userfaultfd_error; } - /* Detach vmas from rbtree */ - if (!detach_vmas_to_be_unmapped(mm, &mas, vma, prev, end)) - downgrade = false; + /* + * unlock any mlock()ed ranges before detaching vmas, count the number + * of VMAs to be dropped, and return the tail entry of the affected + * area. 
+ */ + mm->map_count -= unlock_range(vma, &last, end); + /* Drop removed area from the tree */ + mas_store_prealloc(mas, NULL); - if (downgrade) - mmap_write_downgrade(mm); + /* Detach vmas from the MM linked list */ + vma->vm_prev = NULL; + if (prev) + prev->vm_next = last->vm_next; + else + mm->mmap = last->vm_next; + + if (last->vm_next) { + last->vm_next->vm_prev = prev; + last->vm_next = NULL; + } else + mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; + + /* + * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or + * VM_GROWSUP VMA. Such VMAs can change their size under + * down_read(mmap_lock) and collide with the VMA we are about to unmap. + */ + if (downgrade) { + if (last && (last->vm_flags & VM_GROWSDOWN)) + downgrade = false; + else if (prev && (prev->vm_flags & VM_GROWSUP)) + downgrade = false; + else + mmap_write_downgrade(mm); + } unmap_region(mm, vma, prev, start, end); @@ -2585,14 +2599,63 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, map_count_exceeded: split_failed: userfaultfd_error: - mas_destroy(&mas); + mas_destroy(mas); return error; } +/* + * do_mas_munmap() - munmap a given range. + * @mas: The maple state + * @mm: The mm_struct + * @start: The start address to munmap + * @len: The length of the range to munmap + * @uf: The userfaultfd list_head + * @downgrade: set to true if the user wants to attempt to write_downgrade the + * mmap_sem + * + * This function takes a @mas that is either pointing to the previous VMA or set + * to MA_START and sets it up to remove the mapping(s). The @len will be + * aligned and any arch_unmap work will be preformed. + * + * Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise. 
+ */ +int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool downgrade) +{ + unsigned long end; + struct vm_area_struct *vma; + + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) + return -EINVAL; + + end = start + PAGE_ALIGN(len); + if (end == start) + return -EINVAL; + + /* arch_unmap() might do unmaps itself. */ + arch_unmap(mm, start, end); + + /* Find the first overlapping VMA */ + vma = mas_find(mas, end - 1); + if (!vma) + return 0; + + return do_mas_align_munmap(mas, vma, mm, start, end, uf, downgrade); +} + +/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. + * @mm: The mm_struct + * @start: The start address to munmap + * @len: The length to be munmapped. + * @uf: The userfaultfd list_head + */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { - return __do_munmap(mm, start, len, uf, false); + MA_STATE(mas, &mm->mm_mt, start, start); + + return do_mas_munmap(&mas, mm, start, len, uf, false); } unsigned long mmap_region(struct file *file, unsigned long addr, @@ -2626,7 +2689,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Unmap any existing mapping in the area */ - if (do_munmap(mm, addr, len, uf)) + if (do_mas_munmap(&mas, mm, addr, len, uf, false)) return -ENOMEM; /* @@ -2845,11 +2908,12 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade) int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, start, start); if (mmap_write_lock_killable(mm)) return -EINTR; - ret = __do_munmap(mm, start, len, &uf, downgrade); + ret = do_mas_munmap(&mas, mm, start, len, &uf, downgrade); /* * Returning 1 indicates mmap_lock is downgraded. 
* But 1 is not legal return value of vm_munmap() and munmap(), reset @@ -2978,7 +3042,7 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, int ret; arch_unmap(mm, newbrk, oldbrk); - ret = __do_munmap(mm, newbrk, oldbrk - newbrk, uf, true); + ret = do_mas_munmap(mas, mm, newbrk, oldbrk-newbrk, uf, true); validate_mm_mt(mm); return ret; } @@ -3116,9 +3180,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (ret) goto limits_failed; - if (find_vma_intersection(mm, addr, addr + len)) - ret = do_munmap(mm, addr, len, &uf); - + ret = do_mas_munmap(&mas, mm, addr, len, &uf, 0); if (ret) goto munmap_failed; diff --git a/mm/mremap.c b/mm/mremap.c index b522cd0259a0f..e0fba90042466 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -975,20 +975,23 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. - * __do_munmap does all the needed commit accounting, and + * do_mas_munmap does all the needed commit accounting, and * downgrades mmap_lock to read if so directed. */ if (old_len >= new_len) { int retval; + MA_STATE(mas, &mm->mm_mt, addr + new_len, addr + new_len); - retval = __do_munmap(mm, addr+new_len, old_len - new_len, - &uf_unmap, true); - if (retval < 0 && old_len != new_len) { - ret = retval; - goto out; + retval = do_mas_munmap(&mas, mm, addr + new_len, + old_len - new_len, &uf_unmap, true); /* Returning 1 indicates mmap_lock is downgraded to read. */ - } else if (retval == 1) + if (retval == 1) { downgraded = true; + } else if (retval < 0 && old_len != new_len) { + ret = retval; + goto out; + } + ret = addr; goto out; } -- GitLab From 67e7c16764c3cbf84a57d441fba3474217ac08d6 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Tue, 6 Sep 2022 19:48:52 +0000 Subject: [PATCH 0652/2223] mm/mmap: change do_brk_munmap() to use do_mas_align_munmap() do_brk_munmap() has already aligned the address and has a maple tree state to be used. Use the new do_mas_align_munmap() to avoid unnecessary alignment and error checks. Link: https://lkml.kernel.org/r/20220906194824.2110408-30-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 6e587f4e3a7da..8b7e9d5afd38c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3042,7 +3042,7 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, int ret; arch_unmap(mm, newbrk, oldbrk); - ret = do_mas_munmap(mas, mm, newbrk, oldbrk-newbrk, uf, true); + ret = do_mas_align_munmap(mas, vma, mm, newbrk, oldbrk, uf, true); validate_mm_mt(mm); return ret; } -- GitLab From de2b84d24b87172913754bca6db85d5c5998213b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:53 +0000 Subject: [PATCH 0653/2223] arm64: remove mmap linked list from vdso Use the VMA iterator instead. Link: https://lkml.kernel.org/r/20220906194824.2110408-31-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/kernel/vdso.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index a61fc4f989b37..a8388af62b99e 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -136,10 +136,11 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long size = vma->vm_end - vma->vm_start; if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm)) -- GitLab From ef770d180ebae967b19a3964bc1cc026f3082f9a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:53 +0000 Subject: [PATCH 0654/2223] arm64: Change elfcore for_each_mte_vma() to use VMA iterator Rework for_each_mte_vma() to use a VMA iterator instead of an explicit linked-list. Link: https://lkml.kernel.org/r/20220906194824.2110408-32-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20220218023650.672072-1-Liam.Howlett@oracle.com Signed-off-by: Will Deacon Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/arm64/kernel/elfcore.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c index 98d67444a5b61..27ef7ad3ffd2e 100644 --- a/arch/arm64/kernel/elfcore.c +++ b/arch/arm64/kernel/elfcore.c @@ -8,9 +8,9 @@ #include #include -#define for_each_mte_vma(tsk, vma) \ +#define for_each_mte_vma(vmi, vma) \ if (system_supports_mte()) \ - for (vma = tsk->mm->mmap; vma; vma = vma->vm_next) \ + for_each_vma(vmi, vma) \ if (vma->vm_flags & VM_MTE) static unsigned long mte_vma_tag_dump_size(struct vm_area_struct *vma) @@ -81,8 +81,9 @@ Elf_Half elf_core_extra_phdrs(void) { struct vm_area_struct *vma; int vma_count = 0; + VMA_ITERATOR(vmi, current->mm, 0); - for_each_mte_vma(current, vma) + for_each_mte_vma(vmi, vma) vma_count++; return vma_count; @@ -91,8 +92,9 @@ Elf_Half elf_core_extra_phdrs(void) int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, current->mm, 0); - for_each_mte_vma(current, vma) { + for_each_mte_vma(vmi, vma) { struct elf_phdr phdr; phdr.p_type = PT_AARCH64_MEMTAG_MTE; @@ -116,8 +118,9 @@ size_t elf_core_extra_data_size(void) { struct vm_area_struct *vma; size_t data_size = 0; + VMA_ITERATOR(vmi, current->mm, 0); - for_each_mte_vma(current, vma) + for_each_mte_vma(vmi, vma) data_size += mte_vma_tag_dump_size(vma); return data_size; @@ -126,8 +129,9 @@ size_t elf_core_extra_data_size(void) int elf_core_write_extra_data(struct coredump_params *cprm) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, current->mm, 0); - for_each_mte_vma(current, vma) { + 
for_each_mte_vma(vmi, vma) { if (vma->vm_flags & VM_DONTDUMP) continue; -- GitLab From 70fa203165d96ae03abb83cf60d30c44e6b81a12 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:53 +0000 Subject: [PATCH 0655/2223] parisc: remove mmap linked list from cache handling Use the VMA iterator instead. Link: https://lkml.kernel.org/r/20220906194824.2110408-33-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/parisc/kernel/cache.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 3feb7694e0ca4..1d3b8bc8a6233 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -657,15 +657,20 @@ static inline unsigned long mm_total_size(struct mm_struct *mm) { struct vm_area_struct *vma; unsigned long usize = 0; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma && usize < parisc_cache_flush_threshold; vma = vma->vm_next) + for_each_vma(vmi, vma) { + if (usize >= parisc_cache_flush_threshold) + break; usize += vma->vm_end - vma->vm_start; + } return usize; } void flush_cache_mm(struct mm_struct *mm) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); /* * Flushing the whole cache on each cpu takes forever on @@ -685,7 +690,7 @@ void flush_cache_mm(struct mm_struct *mm) } /* Flush mm */ - for (vma = mm->mmap; vma; vma = vma->vm_next) + for_each_vma(vmi, vma) flush_cache_pages(vma, vma->vm_start, vma->vm_end); } -- GitLab From 405e669172e20be9a42cecf8be0fbed089fab045 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:53 +0000 Subject: [PATCH 0656/2223] powerpc: remove mmap linked list walks Use the VMA iterator instead. 
Link: https://lkml.kernel.org/r/20220906194824.2110408-34-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/kernel/vdso.c | 6 +++--- arch/powerpc/mm/book3s32/tlb.c | 11 ++++++----- arch/powerpc/mm/book3s64/subpage_prot.c | 13 ++----------- 3 files changed, 11 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 0da287544054f..94a8fa5017c35 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -113,18 +113,18 @@ struct vdso_data *arch_get_vdso_data(void *vvar_page) int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; mmap_read_lock(mm); - - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long size = vma->vm_end - vma->vm_start; if (vma_is_special_mapping(vma, &vvar_spec)) zap_page_range(vma, vma->vm_start, size); } - mmap_read_unlock(mm); + return 0; } diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c index 19f0ef950d773..9ad6b56bfec96 100644 --- a/arch/powerpc/mm/book3s32/tlb.c +++ b/arch/powerpc/mm/book3s32/tlb.c @@ -81,14 +81,15 @@ EXPORT_SYMBOL(hash__flush_range); void hash__flush_tlb_mm(struct mm_struct *mm) { struct vm_area_struct *mp; + VMA_ITERATOR(vmi, mm, 0); /* - * It is safe to go down the mm's list of vmas when called - * from dup_mmap, holding mmap_lock. It would also be safe from - * unmap_region or exit_mmap, but not from vmtruncate on SMP - - * but it seems dup_mmap is the only SMP case which gets here. + * It is safe to iterate the vmas when called from dup_mmap, + * holding mmap_lock. 
It would also be safe from unmap_region + * or exit_mmap, but not from vmtruncate on SMP - but it seems + * dup_mmap is the only SMP case which gets here. */ - for (mp = mm->mmap; mp != NULL; mp = mp->vm_next) + for_each_vma(vmi, mp) hash__flush_range(mp->vm_mm, mp->vm_start, mp->vm_end); } EXPORT_SYMBOL(hash__flush_tlb_mm); diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 60c6ea16a972a..d73b3b4176e81 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -149,24 +149,15 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, unsigned long len) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, addr); /* * We don't try too hard, we just mark all the vma in that range * VM_NOHUGEPAGE and split them. */ - vma = find_vma(mm, addr); - /* - * If the range is in unmapped range, just return - */ - if (vma && ((addr + len) <= vma->vm_start)) - return; - - while (vma) { - if (vma->vm_start >= (addr + len)) - break; + for_each_vma_range(vmi, vma, addr + len) { vma->vm_flags |= VM_NOHUGEPAGE; walk_page_vma(vma, &subpage_walk_ops, NULL); - vma = vma->vm_next; } } #else -- GitLab From e7b6b990e524f60994da70cf5a22159b1e88ce57 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:54 +0000 Subject: [PATCH 0657/2223] s390: remove vma linked list walks Use the VMA iterator instead. Link: https://lkml.kernel.org/r/20220906194824.2110408-35-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/s390/kernel/vdso.c | 3 ++- arch/s390/mm/gmap.c | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 5075cde77b292..535099f2736da 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -69,10 +69,11 @@ static struct page *find_timens_vvar_page(struct vm_area_struct *vma) int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long size = vma->vm_end - vma->vm_start; if (!vma_is_special_mapping(vma, &vvar_mapping)) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 62758cb5872f1..02d15c8dc92e9 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2515,8 +2515,9 @@ static const struct mm_walk_ops thp_split_walk_ops = { static inline void thp_split_mm(struct mm_struct *mm) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { + for_each_vma(vmi, vma) { vma->vm_flags &= ~VM_HUGEPAGE; vma->vm_flags |= VM_NOHUGEPAGE; walk_page_vma(vma, &thp_split_walk_ops, NULL); @@ -2584,8 +2585,9 @@ int gmap_mark_unmergeable(void) struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int ret; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, MADV_UNMERGEABLE, &vma->vm_flags); if (ret) -- GitLab From a3884621163b7d7fab89b44461b2a48a29c5cc9a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:54 +0000 Subject: [PATCH 0658/2223] x86: 
remove vma linked list walks Use the VMA iterator instead. Link: https://lkml.kernel.org/r/20220906194824.2110408-36-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/entry/vdso/vma.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 1000d457c3321..6292b960037b7 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -127,17 +127,17 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long size = vma->vm_end - vma->vm_start; if (vma_is_special_mapping(vma, &vvar_mapping)) zap_page_range(vma, vma->vm_start, size); } - mmap_read_unlock(mm); + return 0; } #else @@ -354,6 +354,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); /* @@ -363,7 +364,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) * We could search vma near context.vdso, but it's a slowpath, * so let's explicitly check all VMAs to be completely sure. 
*/ - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (vma_is_special_mapping(vma, &vdso_mapping) || vma_is_special_mapping(vma, &vvar_mapping)) { mmap_write_unlock(mm); -- GitLab From 49c40fb4b826c90036f04abf583bb4cb5ba3d203 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:55 +0000 Subject: [PATCH 0659/2223] xtensa: remove vma linked list walks Use the VMA iterator instead. Since the VMA can no longer be NULL inside the loop, handle the out-of-memory case after the loop. This means a slightly longer run time in the failure case (-ENOMEM) - it will run to the end of the VMAs before erroring instead of in the middle of the loop. Link: https://lkml.kernel.org/r/20220906194824.2110408-37-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/xtensa/kernel/syscall.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/xtensa/kernel/syscall.c b/arch/xtensa/kernel/syscall.c index 201356faa7e6e..b3c2450d6f239 100644 --- a/arch/xtensa/kernel/syscall.c +++ b/arch/xtensa/kernel/syscall.c @@ -58,6 +58,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct vm_area_struct *vmm; + struct vma_iterator vmi; if (flags & MAP_FIXED) { /* We do not accept a shared mapping if it would violate @@ -79,15 +80,20 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, else addr = PAGE_ALIGN(addr); - for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) { - /* At this point: (!vmm || addr < vmm->vm_end).
*/ - if (TASK_SIZE - len < addr) - return -ENOMEM; - if (!vmm || addr + len <= vm_start_gap(vmm)) - return addr; + vma_iter_init(&vmi, current->mm, addr); + for_each_vma(vmi, vmm) { + /* At this point: (addr < vmm->vm_end). */ + if (addr + len <= vm_start_gap(vmm)) + break; + addr = vmm->vm_end; if (flags & MAP_SHARED) addr = COLOUR_ALIGN(addr, pgoff); } + + if (TASK_SIZE - len < addr) + return -ENOMEM; + + return addr; } #endif -- GitLab From d9fa0e37cdd47cfb71f5fa7a599ef5b9e32c55ed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:55 +0000 Subject: [PATCH 0660/2223] cxl: remove vma linked list walk Use the VMA iterator instead. This requires a little restructuring of the surrounding code to hoist the mm to the caller. That turns cxl_prefault_one() into a trivial function, so call cxl_fault_segment() directly. Link: https://lkml.kernel.org/r/20220906194824.2110408-38-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/misc/cxl/fault.c | 45 ++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index 60c829113299b..2c64f55cf01f8 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -280,22 +280,6 @@ void cxl_handle_fault(struct work_struct *fault_work) mmput(mm); } -static void cxl_prefault_one(struct cxl_context *ctx, u64 ea) -{ - struct mm_struct *mm; - - mm = get_mem_context(ctx); - if (mm == NULL) { - pr_devel("cxl_prefault_one unable to get mm %i\n", - pid_nr(ctx->pid)); - return; - } - - cxl_fault_segment(ctx, mm, ea); - - mmput(mm); -} - static u64 next_segment(u64 ea, u64 vsid) { if (vsid & SLB_VSID_B_1T) @@ -306,23 +290,16 @@ static u64 next_segment(u64 ea, u64 vsid) return ea + 1; } -static void cxl_prefault_vma(struct cxl_context *ctx) +static void cxl_prefault_vma(struct cxl_context *ctx, struct mm_struct *mm) { u64 ea, last_esid = 0; struct copro_slb slb; + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int rc; - struct mm_struct *mm; - - mm = get_mem_context(ctx); - if (mm == NULL) { - pr_devel("cxl_prefault_vm unable to get mm %i\n", - pid_nr(ctx->pid)); - return; - } mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { for (ea = vma->vm_start; ea < vma->vm_end; ea = next_segment(ea, slb.vsid)) { rc = copro_calculate_slb(mm, ea, &slb); @@ -337,20 +314,28 @@ static void cxl_prefault_vma(struct cxl_context *ctx) } } mmap_read_unlock(mm); - - mmput(mm); } void cxl_prefault(struct cxl_context *ctx, u64 wed) { + struct mm_struct *mm = get_mem_context(ctx); + + if (mm == NULL) { + pr_devel("cxl_prefault unable to get mm %i\n", + pid_nr(ctx->pid)); + return; + } + switch 
(ctx->afu->prefault_mode) { case CXL_PREFAULT_WED: - cxl_prefault_one(ctx, wed); + cxl_fault_segment(ctx, mm, wed); break; case CXL_PREFAULT_ALL: - cxl_prefault_vma(ctx); + cxl_prefault_vma(ctx, mm); break; default: break; } + + mmput(mm); } -- GitLab From df724cedcfd7ce6638f40903144902a3e29fcec7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:55 +0000 Subject: [PATCH 0661/2223] optee: remove vma linked list walk Use the VMA iterator instead. Change the calling convention of __check_mem_type() to pass in the mm instead of the first vma in the range. Link: https://lkml.kernel.org/r/20220906194824.2110408-39-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/tee/optee/call.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/tee/optee/call.c b/drivers/tee/optee/call.c index 28f87cd8b3ede..290b1bb0e9cd7 100644 --- a/drivers/tee/optee/call.c +++ b/drivers/tee/optee/call.c @@ -492,15 +492,18 @@ static bool is_normal_memory(pgprot_t p) #endif } -static int __check_mem_type(struct vm_area_struct *vma, unsigned long end) +static int __check_mem_type(struct mm_struct *mm, unsigned long start, + unsigned long end) { - while (vma && is_normal_memory(vma->vm_page_prot)) { - if (vma->vm_end >= end) - return 0; - vma = vma->vm_next; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, start); + + for_each_vma_range(vmi, vma, end) { + if (!is_normal_memory(vma->vm_page_prot)) + return -EINVAL; } - return -EINVAL; + return 0; } int optee_check_mem_type(unsigned long start, size_t num_pages) @@ -516,8 +519,7 @@ int optee_check_mem_type(unsigned long start, size_t num_pages) return 0; mmap_read_lock(mm); - rc = 
__check_mem_type(find_vma(mm, start), - start + num_pages * PAGE_SIZE); + rc = __check_mem_type(mm, start, start + num_pages * PAGE_SIZE); mmap_read_unlock(mm); return rc; -- GitLab From cbd43755ad15687cf8c925793a0b6c60c6181615 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:56 +0000 Subject: [PATCH 0662/2223] um: remove vma linked list walk Use the VMA iterator instead. Link: https://lkml.kernel.org/r/20220906194824.2110408-40-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/um/kernel/tlb.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index bc38f79ca3a38..ad449173a1a1c 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -584,21 +584,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, void flush_tlb_mm(struct mm_struct *mm) { - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - while (vma != NULL) { + for_each_vma(vmi, vma) fix_range(mm, vma->vm_start, vma->vm_end, 0); - vma = vma->vm_next; - } } void force_flush_all(void) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - while (vma != NULL) { + for_each_vma(vmi, vma) fix_range(mm, vma->vm_start, vma->vm_end, 1); - vma = vma->vm_next; - } } -- GitLab From 182ea1d71750ff9a41e7f8225c842246a4375983 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:56 +0000 Subject: [PATCH 0663/2223] coredump: remove vma linked list walk Use the Maple Tree iterator instead. This is too complicated for the VMA iterator to handle, so let's open-code it for now. 
If this turns out to be a common pattern, we can migrate it to common code. Link: https://lkml.kernel.org/r/20220906194824.2110408-41-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/coredump.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 9f4aae2021093..35f2af85b9bce 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1072,30 +1072,20 @@ whole: return vma->vm_end - vma->vm_start; } -static struct vm_area_struct *first_vma(struct task_struct *tsk, - struct vm_area_struct *gate_vma) -{ - struct vm_area_struct *ret = tsk->mm->mmap; - - if (ret) - return ret; - return gate_vma; -} - /* * Helper function for iterating across a vma list. It ensures that the caller * will visit `gate_vma' prior to terminating the search. 
*/ -static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, +static struct vm_area_struct *coredump_next_vma(struct ma_state *mas, + struct vm_area_struct *vma, struct vm_area_struct *gate_vma) { - struct vm_area_struct *ret; - - ret = this_vma->vm_next; - if (ret) - return ret; - if (this_vma == gate_vma) + if (gate_vma && (vma == gate_vma)) return NULL; + + vma = mas_next(mas, ULONG_MAX); + if (vma) + return vma; return gate_vma; } @@ -1119,9 +1109,10 @@ static void free_vma_snapshot(struct coredump_params *cprm) */ static bool dump_vma_snapshot(struct coredump_params *cprm) { - struct vm_area_struct *vma, *gate_vma; + struct vm_area_struct *gate_vma, *vma = NULL; struct mm_struct *mm = current->mm; - int i; + MA_STATE(mas, &mm->mm_mt, 0, 0); + int i = 0; /* * Once the stack expansion code is fixed to not change VMA bounds @@ -1141,8 +1132,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) return false; } - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma), i++) { + while ((vma = coredump_next_vma(&mas, vma, gate_vma)) != NULL) { struct core_vma_metadata *m = cprm->vma_meta + i; m->start = vma->vm_start; @@ -1150,10 +1140,10 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) m->flags = vma->vm_flags; m->dump_size = vma_dump_size(vma, cprm->mm_flags); m->pgoff = vma->vm_pgoff; - m->file = vma->vm_file; if (m->file) get_file(m->file); + i++; } mmap_write_unlock(mm); -- GitLab From 19066e58682ec156aac8d6cf94b79ab2f122a556 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:56 +0000 Subject: [PATCH 0664/2223] exec: use VMA iterator instead of linked list Remove a use of the vm_next list by doing the initial lookup with the VMA iterator and then using it to find the next entry. Link: https://lkml.kernel.org/r/20220906194824.2110408-42-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/exec.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 2b919b30dc97b..afe55d0c3bcfb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -683,6 +683,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) unsigned long length = old_end - old_start; unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; + VMA_ITERATOR(vmi, mm, new_start); + struct vm_area_struct *next; struct mmu_gather tlb; BUG_ON(new_start > new_end); @@ -691,7 +693,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * ensure there are no vmas between where we want to go * and where we are */ - if (vma != find_vma(mm, new_start)) + if (vma != vma_next(&vmi)) return -EFAULT; /* @@ -710,12 +712,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) lru_add_drain(); tlb_gather_mmu(&tlb, mm); + next = vma_next(&vmi); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. */ free_pgd_range(&tlb, new_end, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + next ? next->vm_start : USER_PGTABLES_CEILING); } else { /* * otherwise, clean from old_start; this is done to not touch @@ -724,7 +727,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * for the others its just a little faster. */ free_pgd_range(&tlb, old_start, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + next ? next->vm_start : USER_PGTABLES_CEILING); } tlb_finish_mmu(&tlb); -- GitLab From 5f14b9246e8944243c70253b28830de619800d31 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Tue, 6 Sep 2022 19:48:56 +0000 Subject: [PATCH 0665/2223] fs/proc/base: use the vma iterators in place of linked list Use the vma iterator instead of a for loop across the linked list. The linked list of vmas will be removed in this patch set. Link: https://lkml.kernel.org/r/20220906194824.2110408-43-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/proc/base.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 93f7e3d971e4b..12885a75913f5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2350,6 +2350,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) GENRADIX(struct map_files_info) fa; struct map_files_info *p; int ret; + struct vma_iterator vmi; genradix_init(&fa); @@ -2388,7 +2389,9 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) * routine might require mmap_lock taken in might_fault(). */ - for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + pos = 2; + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (++pos <= ctx->pos) -- GitLab From c4c84f06285e48f80e9843d0775ad92714ffc35a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:57 +0000 Subject: [PATCH 0666/2223] fs/proc/task_mmu: stop using linked list and highest_vm_end Remove references to mm_struct linked list and highest_vm_end for when they are removed Link: https://lkml.kernel.org/r/20220906194824.2110408-44-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R.
Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/proc/internal.h | 2 +- fs/proc/task_mmu.c | 73 ++++++++++++++++++++++++++-------------------- 2 files changed, 42 insertions(+), 33 deletions(-) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 06a80f78433d8..f03000764ce52 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -285,7 +285,7 @@ struct proc_maps_private { struct task_struct *task; struct mm_struct *mm; #ifdef CONFIG_MMU - struct vm_area_struct *tail_vma; + struct vma_iterator iter; #endif #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9f70bc1c27661..8b4f3073f8f55 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -123,12 +123,26 @@ static void release_task_mempolicy(struct proc_maps_private *priv) } #endif +static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, + loff_t *ppos) +{ + struct vm_area_struct *vma = vma_next(&priv->iter); + + if (vma) { + *ppos = vma->vm_start; + } else { + *ppos = -2UL; + vma = get_gate_vma(priv->mm); + } + + return vma; +} + static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; unsigned long last_addr = *ppos; struct mm_struct *mm; - struct vm_area_struct *vma; /* See m_next(). Zero at the start or after lseek. 
*/ if (last_addr == -1UL) @@ -152,31 +166,21 @@ static void *m_start(struct seq_file *m, loff_t *ppos) return ERR_PTR(-EINTR); } + vma_iter_init(&priv->iter, mm, last_addr); hold_task_mempolicy(priv); - priv->tail_vma = get_gate_vma(mm); - - vma = find_vma(mm, last_addr); - if (vma) - return vma; + if (last_addr == -2UL) + return get_gate_vma(mm); - return priv->tail_vma; + return proc_get_vma(priv, ppos); } static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { - struct proc_maps_private *priv = m->private; - struct vm_area_struct *next, *vma = v; - - if (vma == priv->tail_vma) - next = NULL; - else if (vma->vm_next) - next = vma->vm_next; - else - next = priv->tail_vma; - - *ppos = next ? next->vm_start : -1UL; - - return next; + if (*ppos == -2UL) { + *ppos = -1UL; + return NULL; + } + return proc_get_vma(m->private, ppos); } static void m_stop(struct seq_file *m, void *v) @@ -876,16 +880,16 @@ static int show_smaps_rollup(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct mem_size_stats mss; - struct mm_struct *mm; + struct mm_struct *mm = priv->mm; struct vm_area_struct *vma; - unsigned long last_vma_end = 0; + unsigned long vma_start = 0, last_vma_end = 0; int ret = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); priv->task = get_proc_task(priv->inode); if (!priv->task) return -ESRCH; - mm = priv->mm; if (!mm || !mmget_not_zero(mm)) { ret = -ESRCH; goto out_put_task; @@ -898,8 +902,13 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_mm; hold_task_mempolicy(priv); + vma = mas_find(&mas, 0); + + if (unlikely(!vma)) + goto empty_set; - for (vma = priv->mm->mmap; vma;) { + vma_start = vma->vm_start; + do { smap_gather_stats(vma, &mss, 0); last_vma_end = vma->vm_end; @@ -908,6 +917,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * access it for write request. 
*/ if (mmap_lock_is_contended(mm)) { + mas_pause(&mas); mmap_read_unlock(mm); ret = mmap_read_lock_killable(mm); if (ret) { @@ -951,7 +961,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * contains last_vma_end. * Iterate VMA' from last_vma_end. */ - vma = find_vma(mm, last_vma_end - 1); + vma = mas_find(&mas, ULONG_MAX); /* Case 3 above */ if (!vma) break; @@ -965,11 +975,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v) smap_gather_stats(vma, &mss, last_vma_end); } /* Case 2 above */ - vma = vma->vm_next; - } + } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); - show_vma_header_prefix(m, priv->mm->mmap->vm_start, - last_vma_end, 0, 0, 0, 0); +empty_set: + show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); seq_pad(m, ' '); seq_puts(m, "[rollup]\n"); @@ -1262,6 +1271,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + MA_STATE(mas, &mm->mm_mt, 0, 0); struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, @@ -1281,7 +1291,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } if (type == CLEAR_REFS_SOFT_DIRTY) { - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; vma->vm_flags &= ~VM_SOFTDIRTY; @@ -1293,8 +1303,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, 0, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } - walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops, - &cp); + walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); if (type == CLEAR_REFS_SOFT_DIRTY) { mmu_notifier_invalidate_range_end(&range); flush_tlb_mm(mm); -- GitLab From 69dbe6daf1041e32e003f966d71f70f20c63af53 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Tue, 6 Sep 2022 19:48:57 +0000 Subject: [PATCH 0667/2223] userfaultfd: use maple tree iterator to iterate VMAs Don't use the mm_struct linked list or the vma->vm_next in prep for removal. Link: https://lkml.kernel.org/r/20220906194824.2110408-45-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 62 ++++++++++++++++++++++++----------- include/linux/userfaultfd_k.h | 7 ++-- mm/mmap.c | 2 +- 3 files changed, 46 insertions(+), 25 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 4de91ba9e85e3..091d95ddf9a0c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -611,14 +611,16 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, if (release_new_ctx) { struct vm_area_struct *vma; struct mm_struct *mm = release_new_ctx->mm; + VMA_ITERATOR(vmi, mm, 0); /* the various vma->vm_userfaultfd_ctx still points to it */ mmap_write_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) + for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; vma->vm_flags &= ~__VM_UFFD_FLAGS; } + } mmap_write_unlock(mm); userfaultfd_ctx_put(release_new_ctx); @@ -799,11 +801,13 @@ static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, return false; } -int userfaultfd_unmap_prep(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *unmaps) +int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *unmaps) { - for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { + VMA_ITERATOR(vmi, mm, start); + struct vm_area_struct *vma; + + for_each_vma_range(vmi, vma, end) { struct userfaultfd_unmap_ctx *unmap_ctx; 
struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; @@ -853,6 +857,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; + MA_STATE(mas, &mm->mm_mt, 0, 0); WRITE_ONCE(ctx->released, true); @@ -869,7 +874,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) */ mmap_write_lock(mm); prev = NULL; - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ !!(vma->vm_flags & __VM_UFFD_FLAGS)); @@ -883,10 +888,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); - if (prev) + if (prev) { + mas_pause(&mas); vma = prev; - else + } else { prev = vma; + } + vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } @@ -1268,6 +1276,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool found; bool basic_ioctls; unsigned long start, end, vma_end; + MA_STATE(mas, &mm->mm_mt, 0, 0); user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1310,7 +1319,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - vma = find_vma_prev(mm, start, &prev); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) goto out_unlock; @@ -1335,7 +1345,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, */ found = false; basic_ioctls = false; - for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1395,8 +1405,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, } BUG_ON(!found); - if (vma->vm_start < start) - prev = vma; + mas_set(&mas, start); + prev = mas_prev(&mas, 0); + if (prev != vma) + 
mas_next(&mas, ULONG_MAX); ret = 0; do { @@ -1426,6 +1438,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ((struct vm_userfaultfd_ctx){ ctx }), anon_vma_name(vma)); if (prev) { + /* vma_merge() invalidated the mas */ + mas_pause(&mas); vma = prev; goto next; } @@ -1433,11 +1447,15 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ret = split_vma(mm, vma, start, 1); if (ret) break; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } if (vma->vm_end > end) { ret = split_vma(mm, vma, end, 0); if (ret) break; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } next: /* @@ -1454,8 +1472,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); + vma = mas_next(&mas, end - 1); + } while (vma); out_unlock: mmap_write_unlock(mm); mmput(mm); @@ -1499,6 +1517,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; + MA_STATE(mas, &mm->mm_mt, 0, 0); ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1517,7 +1536,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - vma = find_vma_prev(mm, start, &prev); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) goto out_unlock; @@ -1542,7 +1562,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, */ found = false; ret = -EINVAL; - for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1562,8 +1582,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, } BUG_ON(!found); - if (vma->vm_start < start) - prev = vma; + mas_set(&mas, start); + prev = mas_prev(&mas, 0); + if (prev != vma) + mas_next(&mas, ULONG_MAX); ret = 0; do 
{ @@ -1632,8 +1654,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); + vma = mas_next(&mas, end - 1); + } while (vma); out_unlock: mmap_write_unlock(mm); mmput(mm); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e1b8a915e9e9f..f07e6998bb68e 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -175,9 +175,8 @@ extern bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); -extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *uf); +extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf); extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); @@ -258,7 +257,7 @@ static inline bool userfaultfd_remove(struct vm_area_struct *vma, return true; } -static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, +static inline int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf) { diff --git a/mm/mmap.c b/mm/mmap.c index 8b7e9d5afd38c..aabd4f986ccfe 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2545,7 +2545,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, * split, despite we could. This is unlikely enough * failure that it's not worth optimizing it for. */ - error = userfaultfd_unmap_prep(vma, start, end, uf); + error = userfaultfd_unmap_prep(mm, start, end, uf); if (error) goto userfaultfd_error; -- GitLab From 01293a62bae2fa55c09cebf5a771eab7219171c3 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Tue, 6 Sep 2022 19:48:58 +0000 Subject: [PATCH 0668/2223] ipc/shm: use VMA iterator instead of linked list The VMA iterator is faster than the linked list, and it can be walked even when VMAs are being removed from the address space, so there's no need to keep track of 'next'. Link: https://lkml.kernel.org/r/20220906194824.2110408-46-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- ipc/shm.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index b3048ebd5c315..7d86f058fb861 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1721,7 +1721,7 @@ long ksys_shmdt(char __user *shmaddr) #ifdef CONFIG_MMU loff_t size = 0; struct file *file; - struct vm_area_struct *next; + VMA_ITERATOR(vmi, mm, addr); #endif if (addr & ~PAGE_MASK) @@ -1751,12 +1751,9 @@ long ksys_shmdt(char __user *shmaddr) * match the usual checks anyway. So assume all vma's are * above the starting address given. */ - vma = find_vma(mm, addr); #ifdef CONFIG_MMU - while (vma) { - next = vma->vm_next; - + for_each_vma(vmi, vma) { /* * Check if the starting address would match, i.e. it's * a fragment created by mprotect() and/or munmap(), or it @@ -1774,6 +1771,7 @@ long ksys_shmdt(char __user *shmaddr) file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); + mas_pause(&vmi.mas); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -1781,10 +1779,9 @@ long ksys_shmdt(char __user *shmaddr) * searching for matching vma's.
*/ retval = 0; - vma = next; + vma = vma_next(&vmi); break; } - vma = next; } /* @@ -1794,17 +1791,19 @@ long ksys_shmdt(char __user *shmaddr) */ size = PAGE_ALIGN(size); while (vma && (loff_t)(vma->vm_end - addr) <= size) { - next = vma->vm_next; - /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && - (vma->vm_file == file)) + (vma->vm_file == file)) { do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); - vma = next; + mas_pause(&vmi.mas); + } + + vma = vma_next(&vmi); } #else /* CONFIG_MMU */ + vma = vma_lookup(mm, addr); /* under NOMMU conditions, the exact address to be destroyed must be * given */ -- GitLab From 160c820023bbfe7c478ed3041cc50604d664f047 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:58 +0000 Subject: [PATCH 0669/2223] acct: use VMA iterator instead of linked list The VMA iterator is faster than the linked list. Link: https://lkml.kernel.org/r/20220906194824.2110408-47-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/acct.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index 13706356ec54d..62200d799b9b0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -555,15 +555,14 @@ void acct_collect(long exitcode, int group_dead) unsigned long vsize = 0; if (group_dead && current->mm) { + struct mm_struct *mm = current->mm; + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - mmap_read_lock(current->mm); - vma = current->mm->mmap; - while (vma) { + mmap_read_lock(mm); + for_each_vma(vmi, vma) vsize += vma->vm_end - vma->vm_start; - vma = vma->vm_next; - } - mmap_read_unlock(current->mm); + mmap_read_unlock(mm); } spin_lock_irq(&current->sighand->siglock); -- GitLab From fcb72a585aaa4caced555e98f8444e6162912cb7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:58 +0000 Subject: [PATCH 0670/2223] perf: use VMA iterator The VMA iterator is faster than the linked list and removing the linked list will shrink the vm_area_struct. Link: https://lkml.kernel.org/r/20220906194824.2110408-48-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R.
Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/events/core.c | 3 ++- kernel/events/uprobes.c | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 2621fd24ad260..101c5912c3fce 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10229,8 +10229,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2eaa327f8158d..401bc2d24ce06 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -349,9 +349,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe, static struct vm_area_struct * find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *tmp; - for (tmp = mm->mmap; tmp; tmp = tmp->vm_next) + for_each_vma(vmi, tmp) if (valid_ref_ctr_vma(uprobe, tmp)) return tmp; @@ -1231,11 +1232,12 @@ int uprobe_apply(struct inode *inode, loff_t offset, static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int err = 0; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1983,9 +1985,10 @@ bool uprobe_deny_signal(void) static void mmf_recalc_uprobes(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!valid_vma(vma, false)) continue; /* -- GitLab From 0cd4d02c32123afc25647f1d7123bc13b51ac56b Mon Sep 17 
00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:59 +0000 Subject: [PATCH 0671/2223] sched: use maple tree iterator to walk VMAs The linked list is slower than walking the VMAs using the maple tree. We can't use the VMA iterator here because it doesn't support moving to an earlier position. Link: https://lkml.kernel.org/r/20220906194824.2110408-49-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0e3e08a093d49..ff49f28391ea3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2930,6 +2930,7 @@ static void task_numa_work(struct callback_head *work) struct task_struct *p = current; struct mm_struct *mm = p->mm; u64 runtime = p->se.sum_exec_runtime; + MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; @@ -2986,13 +2987,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; - vma = find_vma(mm, start); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) { reset_ptenuma_scan(p); start = 0; - vma = mm->mmap; + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); } - for (; vma; vma = vma->vm_next) { + + for (; vma; vma = mas_find(&mas, ULONG_MAX)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; -- GitLab From fa5e587679f034530e8c14bc1c466490053b2ff2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:59 +0000 Subject: [PATCH 0672/2223] fork: use VMA iterator The VMA iterator is faster than the linked list 
and removing the linked list will shrink the vm_area_struct. Link: https://lkml.kernel.org/r/20220906194824.2110408-50-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/fork.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 430f63cd7a371..49e4ab6f52088 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1301,13 +1301,16 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) /* Forbid mm->exe_file change if old file still mapped. */ old_exe_file = get_mm_exe_file(mm); if (old_exe_file) { + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (path_equal(&vma->vm_file->f_path, - &old_exe_file->f_path)) + &old_exe_file->f_path)) { ret = -EBUSY; + break; + } } mmap_read_unlock(mm); fput(old_exe_file); -- GitLab From becc8cdb6cb28d9fd3ecf890d1d6e59118a6a53d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:59 +0000 Subject: [PATCH 0673/2223] bpf: remove VMA linked list Use vma_next() and remove reference to the start of the linked list Link: https://lkml.kernel.org/r/20220906194824.2110408-51-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/bpf/task_iter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8c921799def49..1c8debd42dc9f 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -299,8 +299,8 @@ struct bpf_iter_seq_task_vma_info { }; enum bpf_task_vma_iter_find_op { - task_vma_iter_first_vma, /* use mm->mmap */ - task_vma_iter_next_vma, /* use curr_vma->vm_next */ + task_vma_iter_first_vma, /* use find_vma() with addr 0 */ + task_vma_iter_next_vma, /* use vma_next() with curr_vma */ task_vma_iter_find_vma, /* use find_vma() to find next vma */ }; @@ -400,10 +400,10 @@ again: switch (op) { case task_vma_iter_first_vma: - curr_vma = curr_task->mm->mmap; + curr_vma = find_vma(curr_task->mm, 0); break; case task_vma_iter_next_vma: - curr_vma = curr_vma->vm_next; + curr_vma = find_vma(curr_task->mm, curr_vma->vm_end); break; case task_vma_iter_find_vma: /* We dropped mmap_lock so it is necessary to use find_vma @@ -417,7 +417,7 @@ again: if (curr_vma && curr_vma->vm_start == info->prev_vm_start && curr_vma->vm_end == info->prev_vm_end) - curr_vma = curr_vma->vm_next; + curr_vma = find_vma(curr_task->mm, curr_vma->vm_end); break; } if (!curr_vma) { -- GitLab From c4d1a92d0d3ada8a4073b8af8eff462d689d64c5 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:00 +0000 Subject: [PATCH 0674/2223] mm/gup: use maple tree navigation instead of linked list Use find_vma_intersection() to locate the VMAs in __mm_populate() instead of using find_vma() and the linked list. Link: https://lkml.kernel.org/r/20220906194824.2110408-52-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/gup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index d4f706dc245f6..6e49fe5da5133 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1674,10 +1674,11 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) if (!locked) { locked = 1; mmap_read_lock(mm); - vma = find_vma(mm, nstart); + vma = find_vma_intersection(mm, nstart, end); } else if (nstart >= vma->vm_end) - vma = vma->vm_next; - if (!vma || vma->vm_start >= end) + vma = find_vma_intersection(mm, vma->vm_end, end); + + if (!vma) break; /* * Set [nstart; nend) to intersection of desired address -- GitLab From 685405020b9f24ec979d41e6c27207be97c000cf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:49:00 +0000 Subject: [PATCH 0675/2223] mm/khugepaged: stop using vma linked list Use vma iterator & find_vma() instead of vma linked list. Link: https://lkml.kernel.org/r/20220906194824.2110408-53-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/huge_memory.c | 4 ++-- mm/khugepaged.c | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 534d30cff9d75..63b4d8ff4b556 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2341,11 +2341,11 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, split_huge_pmd_if_needed(vma, end); /* - * If we're also updating the vma->vm_next->vm_start, + * If we're also updating the next vma vm_start, * check if we need to split it. */ if (adjust_next > 0) { - struct vm_area_struct *next = vma->vm_next; + struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); unsigned long nstart = next->vm_start; nstart += adjust_next; split_huge_pmd_if_needed(next, nstart); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9ff3d39b286f8..7c13d65aeb14e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2050,6 +2050,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { + struct vma_iterator vmi; struct mm_slot *mm_slot; struct mm_struct *mm; struct vm_area_struct *vma; @@ -2078,11 +2079,13 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, vma = NULL; if (unlikely(!mmap_read_trylock(mm))) goto breakouterloop_mmap_lock; - if (likely(!hpage_collapse_test_exit(mm))) - vma = find_vma(mm, khugepaged_scan.address); progress++; - for (; vma; vma = vma->vm_next) { + if (unlikely(hpage_collapse_test_exit(mm))) + goto breakouterloop; + + vma_iter_init(&vmi, mm, khugepaged_scan.address); + for_each_vma(vmi, vma) { unsigned long hstart, hend; cond_resched(); -- GitLab From a5f18ba0727656bd1fe3bcdb0d563f81790f9a04 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 
6 Sep 2022 19:49:01 +0000 Subject: [PATCH 0676/2223] mm/ksm: use vma iterators instead of vma linked list Remove the use of the linked list for eventual removal. Link: https://lkml.kernel.org/r/20220906194824.2110408-54-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/ksm.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index fd6d03cb04638..533ede86b4b9f 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -981,11 +981,13 @@ static int unmerge_and_remove_all_rmap_items(void) struct mm_slot, mm_list); spin_unlock(&ksm_mmlist_lock); - for (mm_slot = ksm_scan.mm_slot; - mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { + for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; + mm_slot = ksm_scan.mm_slot) { + VMA_ITERATOR(vmi, mm_slot->mm, 0); + mm = mm_slot->mm; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (ksm_test_exit(mm)) break; if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) @@ -2243,6 +2245,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) struct mm_slot *slot; struct vm_area_struct *vma; struct rmap_item *rmap_item; + struct vma_iterator vmi; int nid; if (list_empty(&ksm_mm_head.mm_list)) @@ -2301,13 +2304,13 @@ next_mm: } mm = slot->mm; + vma_iter_init(&vmi, mm, ksm_scan.address); + mmap_read_lock(mm); if (ksm_test_exit(mm)) - vma = NULL; - else - vma = find_vma(mm, ksm_scan.address); + goto no_vmas; - for (; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE)) continue; if (ksm_scan.address < vma->vm_start) @@ -2345,6 +2348,7 @@ next_mm: } if (ksm_test_exit(mm)) { +no_vmas: ksm_scan.address = 0; ksm_scan.rmap_list = 
&slot->rmap_list; } -- GitLab From 3547481831acd99d6f9c3b2cef16f269e6eaad9c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:01 +0000 Subject: [PATCH 0677/2223] mm/madvise: use vma_find() instead of vma linked list madvise_walk_vmas() no longer uses linked list. Link: https://lkml.kernel.org/r/20220906194824.2110408-55-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/madvise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index 4f86eb7f554d6..a3fc4cd32ed36 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1245,7 +1245,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, if (start >= end) break; if (prev) - vma = prev->vm_next; + vma = find_vma(mm, prev->vm_end); else /* madvise_remove dropped mmap_lock */ vma = find_vma(mm, start); } -- GitLab From ba0aff8ea6ff0ba4dacfc896facadf3d91c8cd8a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:01 +0000 Subject: [PATCH 0678/2223] mm/memcontrol: stop using mm->highest_vm_end Pass through ULONG_MAX instead. Link: https://lkml.kernel.org/r/20220906194824.2110408-56-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/memcontrol.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 392b1fd1e8c48..e804056422db0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5879,7 +5879,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; mmap_read_lock(mm); - walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); + walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL); mmap_read_unlock(mm); precharge = mc.precharge; @@ -6177,9 +6177,7 @@ retry: * When we have consumed all precharges and failed in doing * additional charge, the page walk just aborts. */ - walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, - NULL); - + walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL); mmap_read_unlock(mc.mm); atomic_dec(&mc.from->moving_account); } -- GitLab From 66850be55e8e5f371db2c091751a932a656c5f4d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:02 +0000 Subject: [PATCH 0679/2223] mm/mempolicy: use vma iterator & maple state instead of vma linked list Reworked the way mbind_range() finds the first VMA to reuse the maple state and limit the number of tree walks needed. Note, this drops the VM_BUG_ON(!vma) call, which would catch a start address higher than the last VMA. The code was written in a way that allowed no VMA updates to occur and still return success. There should be no functional change to this scenario with the new code. Link: https://lkml.kernel.org/r/20220906194824.2110408-57-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mempolicy.c | 56 ++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a88fd94e18d6f..143e2eaaa6ec5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -381,9 +381,10 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) + for_each_vma(vmi, vma) mpol_rebind_policy(vma->vm_policy, new); mmap_write_unlock(mm); } @@ -654,7 +655,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->vma; + struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; @@ -669,9 +670,10 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, /* hole at head side of range */ return -EFAULT; } + next = find_vma(vma->vm_mm, vma->vm_end); if (!(flags & MPOL_MF_DISCONTIG_OK) && ((vma->vm_end < qp->end) && - (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start))) + (!next || vma->vm_end < next->vm_start))) /* hole at middle or tail of range */ return -EFAULT; @@ -785,26 +787,24 @@ static int vma_replace_policy(struct vm_area_struct *vma, static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) { + MA_STATE(mas, &mm->mm_mt, start - 1, start - 1); struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; pgoff_t pgoff; - 
unsigned long vmstart; - unsigned long vmend; - - vma = find_vma(mm, start); - VM_BUG_ON(!vma); - prev = vma->vm_prev; - if (start > vma->vm_start) - prev = vma; + prev = mas_find_rev(&mas, 0); + if (prev && (start < prev->vm_end)) + vma = prev; + else + vma = mas_next(&mas, end - 1); - for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) { - vmstart = max(start, vma->vm_start); - vmend = min(end, vma->vm_end); + for (; vma; vma = mas_next(&mas, end - 1)) { + unsigned long vmstart = max(start, vma->vm_start); + unsigned long vmend = min(end, vma->vm_end); if (mpol_equal(vma_policy(vma), new_pol)) - continue; + goto next; pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); @@ -813,6 +813,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (prev) { + /* vma_merge() invalidated the mas */ + mas_pause(&mas); vma = prev; goto replace; } @@ -820,19 +822,25 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, err = split_vma(vma->vm_mm, vma, vmstart, 1); if (err) goto out; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } if (vma->vm_end != vmend) { err = split_vma(vma->vm_mm, vma, vmend, 0); if (err) goto out; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } - replace: +replace: err = vma_replace_policy(vma, new_pol); if (err) goto out; +next: + prev = vma; } - out: +out: return err; } @@ -1049,6 +1057,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) { nodemask_t nmask; + struct vm_area_struct *vma; LIST_HEAD(pagelist); int err = 0; struct migration_target_control mtc = { @@ -1064,8 +1073,9 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, * need migration. Between passing in the full user address * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. 
*/ + vma = find_vma(mm, 0); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); - queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { @@ -1195,14 +1205,13 @@ static struct page *new_page(struct page *page, unsigned long start) struct folio *dst, *src = page_folio(page); struct vm_area_struct *vma; unsigned long address; + VMA_ITERATOR(vmi, current->mm, start); gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL; - vma = find_vma(current->mm, start); - while (vma) { + for_each_vma(vmi, vma) { address = page_address_in_vma(page, vma); if (address != -EFAULT) break; - vma = vma->vm_next; } if (folio_test_hugetlb(src)) @@ -1480,6 +1489,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le unsigned long vmend; unsigned long end; int err = -ENOENT; + VMA_ITERATOR(vmi, mm, start); start = untagged_addr(start); if (start & ~PAGE_MASK) @@ -1505,9 +1515,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le if (end == start) return 0; mmap_write_lock(mm); - vma = find_vma(mm, start); - for (; vma && vma->vm_start < end; vma = vma->vm_next) { - + for_each_vma_range(vmi, vma, end) { vmstart = max(start, vma->vm_start); vmend = min(end, vma->vm_end); new = mpol_dup(vma_policy(vma)); -- GitLab From 33108b05f39b78137c38c677b7a2d0fb7defed14 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:49:02 +0000 Subject: [PATCH 0680/2223] mm/mlock: use vma iterator and maple state instead of vma linked list Handle overflow checking in count_mm_mlocked_page_nr() differently. Link: https://lkml.kernel.org/r/20220906194824.2110408-58-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mlock.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index b14e929084cca..43d19a1f28eb3 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -471,6 +471,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, unsigned long nstart, end, tmp; struct vm_area_struct *vma, *prev; int error; + MA_STATE(mas, &current->mm->mm_mt, start, start); VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); @@ -479,13 +480,14 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, return -EINVAL; if (end == start) return 0; - vma = find_vma(current->mm, start); - if (!vma || vma->vm_start > start) + vma = mas_walk(&mas); + if (!vma) return -ENOMEM; - prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; + else + prev = mas_prev(&mas, 0); for (nstart = start ; ; ) { vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; @@ -505,7 +507,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, if (nstart >= end) break; - vma = prev->vm_next; + vma = find_vma(prev->vm_mm, prev->vm_end); if (!vma || vma->vm_start != nstart) { error = -ENOMEM; break; @@ -526,24 +528,23 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, { struct vm_area_struct *vma; unsigned long count = 0; + unsigned long end; + VMA_ITERATOR(vmi, mm, start); if (mm == NULL) mm = current->mm; - vma = find_vma(mm, start); - if (vma == NULL) - return 0; - - for (; vma ; vma = vma->vm_next) { - if (start >= vma->vm_end) - continue; - if (start + len <= vma->vm_start) - break; + /* Don't overflow past ULONG_MAX */ + if (unlikely(ULONG_MAX - len < start)) + end = ULONG_MAX; + else + end = start + len; + for_each_vma_range(vmi, vma, end) { if (vma->vm_flags &
VM_LOCKED) { if (start > vma->vm_start) count -= (start - vma->vm_start); - if (start + len < vma->vm_end) { - count += start + len - vma->vm_start; + if (end < vma->vm_end) { + count += end - vma->vm_start; break; } count += vma->vm_end - vma->vm_start; @@ -659,6 +660,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) */ static int apply_mlockall_flags(int flags) { + MA_STATE(mas, &current->mm->mm_mt, 0, 0); struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; @@ -679,7 +681,7 @@ static int apply_mlockall_flags(int flags) to_add |= VM_LOCKONFAULT; } - for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { vm_flags_t newflags; newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; @@ -687,6 +689,7 @@ static int apply_mlockall_flags(int flags) /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); + mas_pause(&mas); cond_resched(); } out: -- GitLab From 70821e0b89dd477109c42a92d571f6dc6f6aa956 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:02 +0000 Subject: [PATCH 0681/2223] mm/mprotect: use maple tree navigation instead of VMA linked list Switch to navigating the VMA list with the maple tree operators in preparation for removing the linked list. Link: https://lkml.kernel.org/r/20220906194824.2110408-59-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mprotect.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 55ed4a889990f..461dcbd4f21a6 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -676,6 +676,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); struct mmu_gather tlb; + MA_STATE(mas, &current->mm->mm_mt, 0, 0); start = untagged_addr(start); @@ -707,7 +708,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey)) goto out; - vma = find_vma(current->mm, start); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); error = -ENOMEM; if (!vma) goto out; @@ -733,7 +735,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (start > vma->vm_start) prev = vma; else - prev = vma->vm_prev; + prev = mas_prev(&mas, 0); tlb_gather_mmu(&tlb, current->mm); for (nstart = start ; ; ) { @@ -796,7 +798,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (nstart >= end) break; - vma = prev->vm_next; + vma = find_vma(current->mm, prev->vm_end); if (!vma || vma->vm_start != nstart) { error = -ENOMEM; break; -- GitLab From 396a44cc58910317b03dd73a93ab4fe6b76df658 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:03 +0000 Subject: [PATCH 0682/2223] mm/mremap: use vma_find_intersection() instead of vma linked list Using the vma_find_intersection() call allows for cleaner code and removes linked list users in preparation of the linked list removal. Also remove one user of the linked list at the same time in favour of find_vma().
Link: https://lkml.kernel.org/r/20220906194824.2110408-60-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mremap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index e0fba90042466..8644ff278f029 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -716,7 +716,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (excess) { vma->vm_flags |= VM_ACCOUNT; if (split) - vma->vm_next->vm_flags |= VM_ACCOUNT; + find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT; } return new_addr; @@ -866,9 +866,10 @@ out: static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) { unsigned long end = vma->vm_end + delta; + if (end < vma->vm_end) /* overflow */ return 0; - if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */ + if (find_vma_intersection(vma->vm_mm, vma->vm_end, end)) return 0; if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, 0, MAP_FIXED) & ~PAGE_MASK) -- GitLab From 4267d1fd7825454ed41ebf53af62e7cedd779f83 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:03 +0000 Subject: [PATCH 0683/2223] mm/msync: use vma_find() instead of vma linked list Remove a single use of the vma linked list in preparation for the removal of the linked list. Uses find_vma() to get the next element. Link: https://lkml.kernel.org/r/20220906194824.2110408-61-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/msync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/msync.c b/mm/msync.c index 137d1c104f3e9..ac4c9bfea2e7f 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -104,7 +104,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) error = 0; goto out_unlock; } - vma = vma->vm_next; + vma = find_vma(mm, vma->vm_end); } } out_unlock: -- GitLab From e1c2c775d448be0503a3ac90681d86980919bad0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:03 +0000 Subject: [PATCH 0684/2223] mm/oom_kill: use vma iterators instead of vma linked list Use vma iterator in preparation of removing the linked list. Link: https://lkml.kernel.org/r/20220906194824.2110408-62-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/oom_kill.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3c6cf9e3cd66e..3996301450e8d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -513,6 +513,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) { struct vm_area_struct *vma; bool ret = true; + VMA_ITERATOR(vmi, mm, 0); /* * Tell all users of get_user/copy_from_user etc... 
that the content @@ -522,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) */ set_bit(MMF_UNSTABLE, &mm->flags); - for (vma = mm->mmap ; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) continue; -- GitLab From 9ec08f30f86d70b8891c25642df7d1f16647fde4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:49:04 +0000 Subject: [PATCH 0685/2223] mm/pagewalk: use vma_find() instead of vma linked list walk_page_range() no longer uses the one vma linked list reference. Link: https://lkml.kernel.org/r/20220906194824.2110408-63-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/pagewalk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 908ec1577f401..131b2b335b2cd 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -456,7 +456,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } else { /* inside vma */ walk.vma = vma; next = min(end, vma->vm_end); - vma = vma->vm_next; + vma = find_vma(mm, vma->vm_end); err = walk_page_test(start, next, &walk); if (err > 0) { -- GitLab From 208c09db6d88f4442fb755d20cfb237a37a49f48 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:04 +0000 Subject: [PATCH 0686/2223] mm/swapfile: use vma iterator instead of vma linked list unuse_mm() no longer needs to reference the linked list. Link: https://lkml.kernel.org/r/20220906194824.2110408-64-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/swapfile.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 263b19e693cfa..469d9af86be2f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1994,14 +1994,16 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) { struct vm_area_struct *vma; int ret = 0; + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (vma->anon_vma) { ret = unuse_vma(vma, type); if (ret) break; } + cond_resched(); } mmap_read_unlock(mm); -- GitLab From f683b9d613193362ceb954c216f663a43c027302 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:49:04 +0000 Subject: [PATCH 0687/2223] i915: use the VMA iterator Replace the linked list in probe_range() with the VMA iterator. Link: https://lkml.kernel.org/r/20220906194824.2110408-65-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index 8423df021b713..d4398948f0162 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -426,12 +426,11 @@ static const struct drm_i915_gem_object_ops i915_gem_userptr_ops = { static int probe_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { - const unsigned long end = addr + len; + VMA_ITERATOR(vmi, mm, addr); struct vm_area_struct *vma; - int ret = -EFAULT; mmap_read_lock(mm); - for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) { + for_each_vma_range(vmi, vma, addr + len) { /* Check for holes, note that we also update the addr below */ if (vma->vm_start > addr) break; @@ -439,16 +438,13 @@ probe_range(struct mm_struct *mm, unsigned long addr, unsigned long len) if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) break; - if (vma->vm_end >= end) { - ret = 0; - break; - } - addr = vma->vm_end; } mmap_read_unlock(mm); - return ret; + if (vma) + return -EFAULT; + return 0; } /* -- GitLab From 8220543df1489ef96c3d4e8b0b3b03c340e3943e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:49:05 +0000 Subject: [PATCH 0688/2223] nommu: remove uses of VMA linked list Use the maple tree or VMA iterator instead. This is faster and will allow us to shrink the VMA. Link: https://lkml.kernel.org/r/20220906194824.2110408-66-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/nommu.c | 146 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 109 insertions(+), 37 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index 265a444a2cc27..269df51e92266 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -557,26 +557,14 @@ void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) mas_store_prealloc(mas, NULL); } -/* - * add a VMA into a process's mm_struct in the appropriate place in the list - * and tree and add to the address space's page tree also if not an anonymous - * page - * - should be called with mm->mmap_lock held writelocked - */ -static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) +static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) { - struct address_space *mapping; - struct vm_area_struct *prev; - MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); - - BUG_ON(!vma->vm_region); - mm->map_count++; vma->vm_mm = mm; /* add the VMA to the mapping */ if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); flush_dcache_mmap_lock(mapping); @@ -584,21 +572,52 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } +} - prev = mas_prev(&mas, 0); - mas_reset(&mas); +/* + * mas_add_vma_to_mm() - Maple state variant of add_vma_to_mm(). + * @mas: The maple state with preallocations.
+ * @mm: The mm_struct + * @vma: The vma to add + * + */ +static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, + struct vm_area_struct *vma) +{ + struct vm_area_struct *prev; + + BUG_ON(!vma->vm_region); + + setup_vma_to_mm(vma, mm); + + prev = mas_prev(mas, 0); + mas_reset(mas); /* add the VMA to the tree */ - vma_mas_store(vma, &mas); + vma_mas_store(vma, mas); __vma_link_list(mm, vma, prev); } /* - * delete a VMA from its owning mm_struct and address space + * add a VMA into a process's mm_struct in the appropriate place in the list + * and tree and add to the address space's page tree also if not an anonymous + * page + * - should be called with mm->mmap_lock held writelocked */ -static void delete_vma_from_mm(struct vm_area_struct *vma) +static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + return -ENOMEM; + } + mas_add_vma_to_mm(&mas, mm, vma); + return 0; +} +static void cleanup_vma_from_mm(struct vm_area_struct *vma) +{ vma->vm_mm->map_count--; /* remove the VMA from the mapping */ if (vma->vm_file) { @@ -611,10 +630,25 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } +} +/* + * delete a VMA from its owning mm_struct and address space + */ +static int delete_vma_from_mm(struct vm_area_struct *vma) +{ + MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + return -ENOMEM; + } + cleanup_vma_from_mm(vma); /* remove from the MM's tree and list */ vma_mas_remove(vma, &mas); __vma_unlink_list(vma->vm_mm, vma); + return 0; } /* @@ -1024,6 +1058,7 @@ unsigned long do_mmap(struct file *file, vm_flags_t 
vm_flags; unsigned long capabilities, result; int ret; + MA_STATE(mas, &current->mm->mm_mt, 0, 0); *populate = 0; @@ -1042,6 +1077,7 @@ unsigned long do_mmap(struct file *file, * now know into VMA flags */ vm_flags = determine_vm_flags(file, prot, flags, capabilities); + /* we're going to need to record the mapping */ region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); if (!region) @@ -1051,6 +1087,9 @@ unsigned long do_mmap(struct file *file, if (!vma) goto error_getting_vma; + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + goto error_maple_preallocate; + region->vm_usage = 1; region->vm_flags = vm_flags; region->vm_pgoff = pgoff; @@ -1191,7 +1230,7 @@ unsigned long do_mmap(struct file *file, current->mm->total_vm += len >> PAGE_SHIFT; share: - add_vma_to_mm(current->mm, vma); + mas_add_vma_to_mm(&mas, current->mm, vma); /* we flush the region from the icache only when the first executable * mapping of it is made */ @@ -1217,6 +1256,7 @@ error: sharing_violation: up_write(&nommu_region_sem); + mas_destroy(&mas); pr_warn("Attempt to share mismatched mappings\n"); ret = -EINVAL; goto error; @@ -1233,6 +1273,14 @@ error_getting_region: len, current->pid); show_free_areas(0, NULL); return -ENOMEM; + +error_maple_preallocate: + kmem_cache_free(vm_region_jar, region); + vm_area_free(vma); + pr_warn("Allocation of vma tree for process %d failed\n", current->pid); + show_free_areas(0, NULL); + return -ENOMEM; + } unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, @@ -1298,6 +1346,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *new; struct vm_region *region; unsigned long npages; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ @@ -1312,9 +1361,13 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return -ENOMEM; new = vm_area_dup(vma); - if (!new) { - kmem_cache_free(vm_region_jar,
region); - return -ENOMEM; + if (!new) + goto err_vma_dup; + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + goto err_mas_preallocate; } /* most fields are the same, copy all, and then fixup */ @@ -1333,7 +1386,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); - delete_vma_from_mm(vma); down_write(&nommu_region_sem); delete_nommu_region(vma->vm_region); if (new_below) { @@ -1346,9 +1398,19 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, add_nommu_region(vma->vm_region); add_nommu_region(new->vm_region); up_write(&nommu_region_sem); - add_vma_to_mm(mm, vma); - add_vma_to_mm(mm, new); + + setup_vma_to_mm(vma, mm); + setup_vma_to_mm(new, mm); + mas_set_range(&mas, vma->vm_start, vma->vm_end - 1); + mas_store(&mas, vma); + vma_mas_store(new, &mas); return 0; + +err_mas_preallocate: + vm_area_free(new); +err_vma_dup: + kmem_cache_free(vm_region_jar, region); + return -ENOMEM; } /* @@ -1363,12 +1425,14 @@ static int shrink_vma(struct mm_struct *mm, /* adjust the VMA's pointers, which may reposition it in the MM's tree * and list */ - delete_vma_from_mm(vma); + if (delete_vma_from_mm(vma)) + return -ENOMEM; if (from > vma->vm_start) vma->vm_end = from; else vma->vm_start = to; - add_vma_to_mm(mm, vma); + if (add_vma_to_mm(mm, vma)) + return -ENOMEM; /* cut the backing region down to size */ region = vma->vm_region; @@ -1396,9 +1460,10 @@ static int shrink_vma(struct mm_struct *mm, */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { + MA_STATE(mas, &mm->mm_mt, start, start); struct vm_area_struct *vma; unsigned long end; - int ret; + int ret = 0; len = PAGE_ALIGN(len); if (len == 0) @@ -1407,7 +1472,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list end = start + len; /* find the first potentially overlapping VMA */ - 
vma = find_vma(mm, start); + vma = mas_find(&mas, end - 1); if (!vma) { static int limit; if (limit < 5) { @@ -1426,7 +1491,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list return -EINVAL; if (end == vma->vm_end) goto erase_whole_vma; - vma = vma->vm_next; + vma = mas_next(&mas, end - 1); } while (vma); return -EINVAL; } else { @@ -1448,9 +1513,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list } erase_whole_vma: - delete_vma_from_mm(vma); + if (delete_vma_from_mm(vma)) + ret = -ENOMEM; delete_vma(mm, vma); - return 0; + return ret; } int vm_munmap(unsigned long addr, size_t len) @@ -1475,6 +1541,7 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) */ void exit_mmap(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; if (!mm) @@ -1482,13 +1549,18 @@ void exit_mmap(struct mm_struct *mm) mm->total_vm = 0; - while ((vma = mm->mmap)) { - mm->mmap = vma->vm_next; - delete_vma_from_mm(vma); + /* + * Lock the mm to avoid assert complaining even though this is the only + * user of the mm + */ + mmap_write_lock(mm); + for_each_vma(vmi, vma) { + cleanup_vma_from_mm(vma); delete_vma(mm, vma); cond_resched(); } __mt_destroy(&mm->mm_mt); + mmap_write_unlock(mm); } int vm_brk(unsigned long addr, unsigned long len) -- GitLab From 9b580a1d60de0e1c8957e63859aa989196952e63 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:05 +0000 Subject: [PATCH 0689/2223] riscv: use vma iterator for vdso Remove the linked list use in favour of the vma iterator. Link: https://lkml.kernel.org/r/20220906194824.2110408-67-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/riscv/kernel/vdso.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index 69b05b6c181b6..692e7ae3dcb80 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -114,11 +114,12 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); struct __vdso_info *vdso_info = mm->context.vdso_info; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long size = vma->vm_end - vma->vm_start; if (vma_is_special_mapping(vma, vdso_info->dm)) -- GitLab From 78ba531ff3ec2a444001853d8636ff39ed11ca28 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:05 +0000 Subject: [PATCH 0690/2223] mm/vmscan: use vma iterator instead of vm_next Use the vma iterator in get_next_vma() instead of the linked list. [yuzhao@google.com: mm/vmscan: use the proper VMA iterator] Link: https://lkml.kernel.org/r/Yx+QGOgHg1Wk8tGK@google.com Link: https://lkml.kernel.org/r/20220906194824.2110408-68-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Yu Zhao Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/vmscan.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 809df16c7c0df..3ba9423b141de 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3774,23 +3774,17 @@ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk { unsigned long start = round_up(*vm_end, size); unsigned long end = (start | ~mask) + 1; + VMA_ITERATOR(vmi, args->mm, start); VM_WARN_ON_ONCE(mask & size); VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); - while (args->vma) { - if (start >= args->vma->vm_end) { - args->vma = args->vma->vm_next; - continue; - } - + for_each_vma(vmi, args->vma) { if (end && end <= args->vma->vm_start) return false; - if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) { - args->vma = args->vma->vm_next; + if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) continue; - } *vm_start = max(start, args->vma->vm_start); *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; -- GitLab From 763ecb035029f500d7e6dc99acd1ad299b7726a1 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:06 +0000 Subject: [PATCH 0691/2223] mm: remove the vma linked list Replace any vm_next use with vma_find(). Update free_pgtables(), unmap_vmas(), and zap_page_range() to use the maple tree. Use the new free_pgtables() and unmap_vmas() in do_mas_align_munmap(). At the same time, alter the loop to be more compact. Now that free_pgtables() and unmap_vmas() take a maple tree as an argument, rearrange do_mas_align_munmap() to use the new tree to hold the vmas to remove. Remove __vma_link_list() and __vma_unlink_list() as they are exclusively used to update the linked list. 
Drop linked list update from __insert_vm_struct(). Rework validation of tree as it was depending on the linked list. [yang.lee@linux.alibaba.com: fix one kernel-doc comment] Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=1949 Link: https://lkml.kernel.org/r/20220824021918.94116-1-yang.lee@linux.alibaba.com Link: https://lkml.kernel.org/r/20220906194824.2110408-69-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Yang Li Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +- include/linux/mm_types.h | 4 - kernel/fork.c | 19 +- mm/debug.c | 14 +- mm/internal.h | 8 +- mm/memory.c | 34 ++- mm/mmap.c | 469 ++++++++++++++++----------------------- mm/nommu.c | 6 - mm/util.c | 40 ---- 9 files changed, 225 insertions(+), 374 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 579449d6c23ba..37384a84f71aa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1857,8 +1857,9 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); -void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, - unsigned long start, unsigned long end); +void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long start, + unsigned long end); struct mmu_notifier_range; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4541b74b1bdb9..5e32211cb5a9d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -408,8 +408,6 @@ struct vm_area_struct { unsigned long vm_end; /* The first byte after our end address within vm_mm. 
*/ - /* linked list of VM areas per task, sorted by address */ - struct vm_area_struct *vm_next, *vm_prev; struct mm_struct *vm_mm; /* The address space we belong to. */ /* @@ -473,7 +471,6 @@ struct vm_area_struct { struct kioctx_table; struct mm_struct { struct { - struct vm_area_struct *mmap; /* list of VMAs */ struct maple_tree mm_mt; #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, @@ -488,7 +485,6 @@ struct mm_struct { unsigned long mmap_compat_legacy_base; #endif unsigned long task_size; /* size of task vm space */ - unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; #ifdef CONFIG_MEMBARRIER diff --git a/kernel/fork.c b/kernel/fork.c index 49e4ab6f52088..50460330306a8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -474,7 +474,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) */ *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); - new->vm_next = new->vm_prev = NULL; dup_anon_vma_name(orig, new); } return new; @@ -579,7 +578,7 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) static __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; + struct vm_area_struct *mpnt, *tmp; int retval; unsigned long charge = 0; LIST_HEAD(uf); @@ -606,18 +605,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; - pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); if (retval) goto out; khugepaged_fork(mm, oldmm); - retval = mas_expected_entries(&mas, oldmm->map_count); - if (retval) - goto out; - - prev = NULL; - retval = mas_expected_entries(&mas, oldmm->map_count); if (retval) goto out; @@ -689,14 +681,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (is_vm_hugetlb_page(tmp)) reset_vma_resv_huge_pages(tmp); - /* - * Link in the new vma and copy the page table entries. 
- */ - *pprev = tmp; - pprev = &tmp->vm_next; - tmp->vm_prev = prev; - prev = tmp; - /* Link the vma into the MT */ mas.index = tmp->vm_start; mas.last = tmp->vm_end - 1; @@ -1124,7 +1108,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm) static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { - mm->mmap = NULL; mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); atomic_set(&mm->mm_users, 1); diff --git a/mm/debug.c b/mm/debug.c index 2d625ca0e3269..0fd15ba70d163 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -139,13 +139,11 @@ EXPORT_SYMBOL(dump_page); void dump_vma(const struct vm_area_struct *vma) { - pr_emerg("vma %px start %px end %px\n" - "next %px prev %px mm %px\n" + pr_emerg("vma %px start %px end %px mm %px\n" "prot %lx anon_vma %px vm_ops %px\n" "pgoff %lx file %px private_data %px\n" "flags: %#lx(%pGv)\n", - vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, - vma->vm_prev, vma->vm_mm, + vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm, (unsigned long)pgprot_val(vma->vm_page_prot), vma->anon_vma, vma->vm_ops, vma->vm_pgoff, vma->vm_file, vma->vm_private_data, @@ -155,11 +153,11 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { - pr_emerg("mm %px mmap %px task_size %lu\n" + pr_emerg("mm %px task_size %lu\n" #ifdef CONFIG_MMU "get_unmapped_area %px\n" #endif - "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" + "mmap_base %lu mmap_legacy_base %lu\n" "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n" @@ -183,11 +181,11 @@ void dump_mm(const struct mm_struct *mm) "tlb_flush_pending %d\n" "def_flags: %#lx(%pGv)\n", - mm, mm->mmap, mm->task_size, + mm, mm->task_size, #ifdef CONFIG_MMU mm->get_unmapped_area, #endif - mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, + 
mm->mmap_base, mm->mmap_legacy_base, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), mm_pgtables_bytes(mm), diff --git a/mm/internal.h b/mm/internal.h index cf134d58fd6d2..0f106a3982e73 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -85,8 +85,9 @@ bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); -void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, - unsigned long floor, unsigned long ceiling); +void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long floor, + unsigned long ceiling); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); struct zap_details; @@ -480,9 +481,6 @@ static inline bool is_data_mapping(vm_flags_t flags) } /* mm/util.c */ -void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev); -void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); struct anon_vma *folio_anon_vma(struct folio *folio); #ifdef CONFIG_MMU diff --git a/mm/memory.c b/mm/memory.c index cb955c0b77382..e49faa0a1f9a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -392,12 +392,21 @@ void free_pgd_range(struct mmu_gather *tlb, } while (pgd++, addr = next, addr != end); } -void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, - unsigned long floor, unsigned long ceiling) +void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *vma, unsigned long floor, + unsigned long ceiling) { - while (vma) { - struct vm_area_struct *next = vma->vm_next; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); + + do { unsigned long addr = vma->vm_start; + struct vm_area_struct *next; + + /* + * Note: USER_PGTABLES_CEILING may be passed as ceiling and may + * be 0. This will underflow and is okay. 
+ */ + next = mas_find(&mas, ceiling - 1); /* * Hide vma from rmap and truncate_pagecache before freeing @@ -416,7 +425,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; - next = vma->vm_next; + next = mas_find(&mas, ceiling - 1); unlink_anon_vmas(vma); unlink_file_vma(vma); } @@ -424,7 +433,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, floor, next ? next->vm_start : ceiling); } vma = next; - } + } while (vma); } void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) @@ -1688,6 +1697,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather + * @mt: the maple tree * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping @@ -1703,7 +1713,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -void unmap_vmas(struct mmu_gather *tlb, +void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { @@ -1713,12 +1723,14 @@ void unmap_vmas(struct mmu_gather *tlb, /* Careful - we need to zap private pages too! 
*/ .even_cows = true, }; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); - for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) + do { unmap_single_vma(tlb, vma, start_addr, end_addr, &details); + } while ((vma = mas_find(&mas, end_addr - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); } @@ -1733,8 +1745,11 @@ void unmap_vmas(struct mmu_gather *tlb, void zap_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long size) { + struct maple_tree *mt = &vma->vm_mm->mm_mt; + unsigned long end = start + size; struct mmu_notifier_range range; struct mmu_gather tlb; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, @@ -1742,8 +1757,9 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); - for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next) + do { unmap_single_vma(&tlb, vma, start, range.end, NULL); + } while ((vma = mas_find(&mas, end - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } diff --git a/mm/mmap.c b/mm/mmap.c index aabd4f986ccfe..4441f7ed197a9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -75,9 +75,10 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); -static void unmap_region(struct mm_struct *mm, +static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, - unsigned long start, unsigned long end); + struct vm_area_struct *next, unsigned long start, + unsigned long end); static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { @@ -130,12 
+131,10 @@ void unlink_file_vma(struct vm_area_struct *vma) } /* - * Close a vm structure and free it, returning the next. + * Close a vm structure and free it. */ -static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) +static void remove_vma(struct vm_area_struct *vma) { - struct vm_area_struct *next = vma->vm_next; - might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); @@ -143,7 +142,6 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) fput(vma->vm_file); mpol_put(vma_policy(vma)); vm_area_free(vma); - return next; } /* @@ -168,8 +166,7 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, unsigned long newbrk, unsigned long oldbrk, struct list_head *uf); static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma, - unsigned long addr, unsigned long request, - unsigned long flags); + unsigned long addr, unsigned long request, unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; @@ -238,7 +235,6 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * before calling do_brk_munmap(). 
*/ mm->brk = brk; - mas.last = oldbrk - 1; ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf); if (ret == 1) { downgraded = true; @@ -293,44 +289,21 @@ extern void mt_dump(const struct maple_tree *mt); static void validate_mm_mt(struct mm_struct *mm) { struct maple_tree *mt = &mm->mm_mt; - struct vm_area_struct *vma_mt, *vma = mm->mmap; + struct vm_area_struct *vma_mt; MA_STATE(mas, mt, 0, 0); mt_validate(&mm->mm_mt); mas_for_each(&mas, vma_mt, ULONG_MAX) { - if (xa_is_zero(vma_mt)) - continue; - - if (!vma) - break; - - if ((vma != vma_mt) || - (vma->vm_start != vma_mt->vm_start) || - (vma->vm_end != vma_mt->vm_end) || - (vma->vm_start != mas.index) || - (vma->vm_end - 1 != mas.last)) { + if ((vma_mt->vm_start != mas.index) || + (vma_mt->vm_end - 1 != mas.last)) { pr_emerg("issue in %s\n", current->comm); dump_stack(); dump_vma(vma_mt); - pr_emerg("and vm_next\n"); - dump_vma(vma->vm_next); pr_emerg("mt piv: %p %lu - %lu\n", vma_mt, mas.index, mas.last); pr_emerg("mt vma: %p %lu - %lu\n", vma_mt, vma_mt->vm_start, vma_mt->vm_end); - if (vma->vm_prev) { - pr_emerg("ll prev: %p %lu - %lu\n", - vma->vm_prev, vma->vm_prev->vm_start, - vma->vm_prev->vm_end); - } - pr_emerg("ll vma: %p %lu - %lu\n", vma, - vma->vm_start, vma->vm_end); - if (vma->vm_next) { - pr_emerg("ll next: %p %lu - %lu\n", - vma->vm_next, vma->vm_next->vm_start, - vma->vm_next->vm_end); - } mt_dump(mas.tree); if (vma_mt->vm_end != mas.last + 1) { @@ -347,23 +320,19 @@ static void validate_mm_mt(struct mm_struct *mm) } VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm); } - VM_BUG_ON(vma != vma_mt); - vma = vma->vm_next; - } - VM_BUG_ON(vma); } static void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; - unsigned long highest_address = 0; - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma; + MA_STATE(mas, &mm->mm_mt, 0, 0); validate_mm_mt(mm); - while (vma) { + mas_for_each(&mas, vma, ULONG_MAX) { #ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; 
struct anon_vma_chain *avc; @@ -375,18 +344,10 @@ static void validate_mm(struct mm_struct *mm) anon_vma_unlock_read(anon_vma); } #endif - - highest_address = vm_end_gap(vma); - vma = vma->vm_next; i++; } if (i != mm->map_count) { - pr_emerg("map_count %d vm_next %d\n", mm->map_count, i); - bug = 1; - } - if (highest_address != mm->highest_vm_end) { - pr_emerg("mm->highest_vm_end %lx, found %lx\n", - mm->highest_vm_end, highest_address); + pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); @@ -446,29 +407,13 @@ bool range_has_overlap(struct mm_struct *mm, unsigned long start, struct vm_area_struct *existing; MA_STATE(mas, &mm->mm_mt, start, start); + rcu_read_lock(); existing = mas_find(&mas, end - 1); *pprev = mas_prev(&mas, 0); + rcu_read_unlock(); return existing ? true : false; } -/* - * __vma_next() - Get the next VMA. - * @mm: The mm_struct. - * @vma: The current vma. - * - * If @vma is NULL, return the first vma in the mm. - * - * Returns: The next VMA after @vma. 
- */ -static inline struct vm_area_struct *__vma_next(struct mm_struct *mm, - struct vm_area_struct *vma) -{ - if (!vma) - return mm->mmap; - - return vma->vm_next; -} - static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { @@ -553,8 +498,7 @@ static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, mas_store_prealloc(mas, NULL); } -static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev) +static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { MA_STATE(mas, &mm->mm_mt, 0, 0); struct address_space *mapping = NULL; @@ -568,7 +512,6 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, } vma_mas_store(vma, &mas); - __vma_link_list(mm, vma, prev); __vma_link_file(vma); if (mapping) @@ -579,22 +522,6 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } -/* - * Helper for vma_adjust() in the split_vma insert case: insert a vma into the - * mm's list and the mm tree. It has already been inserted into the interval tree. 
- */ -static void __insert_vm_struct(struct mm_struct *mm, struct ma_state *mas, - struct vm_area_struct *vma, unsigned long location) -{ - struct vm_area_struct *prev; - - mas_set(mas, location); - prev = mas_prev(mas, 0); - vma_mas_store(vma, mas); - __vma_link_list(mm, vma, prev); - mm->map_count++; -} - /* * vma_expand - Expand an existing VMA * @@ -675,15 +602,8 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, } /* Expanding over the next vma */ - if (remove_next) { - /* Remove from mm linked list - also updates highest_vm_end */ - __vma_unlink_list(mm, next); - - if (file) - __remove_shared_vm_struct(next, file, mapping); - - } else if (!next) { - mm->highest_vm_end = vm_end_gap(vma); + if (remove_next && file) { + __remove_shared_vm_struct(next, file, mapping); } if (anon_vma) { @@ -738,7 +658,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, int remove_next = 0; MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *exporter = NULL, *importer = NULL; - unsigned long ll_prev = vma->vm_start; /* linked list prev. */ if (next && !insert) { if (end >= next->vm_end) { @@ -773,7 +692,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, next_next = find_vma(mm, next->vm_end); VM_WARN_ON(remove_next == 2 && - end != next->vm_next->vm_end); + end != next_next->vm_end); } exporter = next; @@ -784,7 +703,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * next, if the vma overlaps with it. 
*/ if (remove_next == 2 && !next->anon_vma) - exporter = next->vm_next; + exporter = next_next; } else if (end > next->vm_start) { /* @@ -879,17 +798,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (vma->vm_end > end) { if (!insert || (insert->vm_start != end)) { vma_mas_szero(&mas, end, vma->vm_end); + mas_reset(&mas); VM_WARN_ON(insert && insert->vm_end < vma->vm_end); - } else if (insert->vm_start == end) { - ll_prev = vma->vm_end; } } else { vma_changed = true; } vma->vm_end = end; - if (!next) - mm->highest_vm_end = vm_end_gap(vma); } if (vma_changed) @@ -909,29 +825,19 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, flush_dcache_mmap_unlock(mapping); } - if (remove_next) { - /* - * vma_merge has merged next into vma, and needs - * us to remove next before dropping the locks. - * Since we have expanded over this vma, the maple tree will - * have overwritten by storing the value - */ - __vma_unlink_list(mm, next); + if (remove_next && file) { + __remove_shared_vm_struct(next, file, mapping); if (remove_next == 2) - __vma_unlink_list(mm, next_next); - - if (file) { - __remove_shared_vm_struct(next, file, mapping); - if (remove_next == 2) - __remove_shared_vm_struct(next_next, file, mapping); - } + __remove_shared_vm_struct(next_next, file, mapping); } else if (insert) { /* * split_vma has split insert from vma, and needs * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - __insert_vm_struct(mm, &mas, insert, ll_prev); + mas_reset(&mas); + vma_mas_store(insert, &mas); + mm->map_count++; } if (anon_vma) { @@ -965,54 +871,12 @@ again: /* * In mprotect's case 6 (see comments on vma_merge), - * we must remove another next too. It would clutter - * up the code too much to do both in one go. + * we must remove next_next too. 
*/ - if (remove_next != 3) { - /* - * If "next" was removed and vma->vm_end was - * expanded (up) over it, in turn - * "next->vm_prev->vm_end" changed and the - * "vma->vm_next" gap must be updated. - */ - next = next_next; - } else { - /* - * For the scope of the comment "next" and - * "vma" considered pre-swap(): if "vma" was - * removed, next->vm_start was expanded (down) - * over it and the "next" gap must be updated. - * Because of the swap() the post-swap() "vma" - * actually points to pre-swap() "next" - * (post-swap() "next" as opposed is now a - * dangling pointer). - */ - next = vma; - } if (remove_next == 2) { remove_next = 1; + next = next_next; goto again; - } else if (!next) { - /* - * If remove_next == 2 we obviously can't - * reach this path. - * - * If remove_next == 3 we can't reach this - * path because pre-swap() next is always not - * NULL. pre-swap() "next" is not being - * removed and its next->vm_end is not altered - * (and furthermore "end" already matches - * next->vm_end in remove_next == 3). - * - * We reach this only in the remove_next == 1 - * case if the "next" vma that was removed was - * the highest vma of the mm. However in such - * case next->vm_end == "end" and the extended - * "vma" has vma->vm_end == next->vm_end so - * mm->highest_vm_end doesn't need any update - * in remove_next == 1 case. - */ - VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma)); } } if (insert && file) @@ -1020,6 +884,7 @@ again: mas_destroy(&mas); validate_mm(mm); + return 0; } @@ -1179,10 +1044,10 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (vm_flags & VM_SPECIAL) return NULL; - next = __vma_next(mm, prev); + next = find_vma(mm, prev ? 
prev->vm_end : 0); area = next; if (area && area->vm_end == end) /* cases 6, 7, 8 */ - next = next->vm_next; + next = find_vma(mm, next->vm_end); /* verify some invariant that must be enforced by the caller */ VM_WARN_ON(prev && addr <= prev->vm_start); @@ -1316,18 +1181,24 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_ */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { + MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end); struct anon_vma *anon_vma = NULL; + struct vm_area_struct *prev, *next; /* Try next first. */ - if (vma->vm_next) { - anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next); + next = mas_walk(&mas); + if (next) { + anon_vma = reusable_anon_vma(next, vma, next); if (anon_vma) return anon_vma; } + prev = mas_prev(&mas, 0); + VM_BUG_ON_VMA(prev != vma, vma); + prev = mas_prev(&mas, 0); /* Try prev next. */ - if (vma->vm_prev) - anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma); + if (prev) + anon_vma = reusable_anon_vma(prev, prev, vma); /* * We might reach here with anon_vma == NULL if we can't find @@ -2101,8 +1972,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (gap_addr < address || gap_addr > TASK_SIZE) gap_addr = TASK_SIZE; - next = vma->vm_next; - if (next && next->vm_start < gap_addr && vma_is_accessible(next)) { + next = find_vma_intersection(mm, vma->vm_end, gap_addr); + if (next && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ @@ -2153,8 +2024,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Overwrite old entry in mtree. 
*/ vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); - if (!vma->vm_next) - mm->highest_vm_end = vm_end_gap(vma); spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); @@ -2174,16 +2043,16 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); struct vm_area_struct *prev; int error = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); address &= PAGE_MASK; if (address < mmap_min_addr) return -EPERM; /* Enforce stack_guard_gap */ - prev = vma->vm_prev; + prev = mas_prev(&mas, 0); /* Check that both stack segments have the same anon_vma? */ if (prev && !(prev->vm_flags & VM_GROWSDOWN) && vma_is_accessible(prev)) { @@ -2318,25 +2187,26 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL_GPL(find_extend_vma); /* - * Ok - we have the memory areas we should free on the vma list, - * so release them, and do the vma updates. + * Ok - we have the memory areas we should free on a maple tree so release them, + * and do the vma updates. * * Called with the mm semaphore held. */ -static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) +static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) { unsigned long nr_accounted = 0; + struct vm_area_struct *vma; /* Update high watermark before we lower total_vm */ update_hiwater_vm(mm); - do { + mas_for_each(mas, vma, ULONG_MAX) { long nrpages = vma_pages(vma); if (vma->vm_flags & VM_ACCOUNT) nr_accounted += nrpages; vm_stat_account(mm, vma->vm_flags, -nrpages); - vma = remove_vma(vma); - } while (vma); + remove_vma(vma); + } vm_unacct_memory(nr_accounted); validate_mm(mm); } @@ -2346,18 +2216,18 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) * * Called with the mm semaphore held. 
*/ -static void unmap_region(struct mm_struct *mm, +static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, + struct vm_area_struct *next, unsigned long start, unsigned long end) { - struct vm_area_struct *next = __vma_next(mm, prev); struct mmu_gather tlb; lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); - unmap_vmas(&tlb, vma, start, end); - free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, + unmap_vmas(&tlb, mt, vma, start, end); + free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); } @@ -2444,24 +2314,17 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } -static inline int -unlock_range(struct vm_area_struct *start, struct vm_area_struct **tail, - unsigned long limit) +static inline int munmap_sidetree(struct vm_area_struct *vma, + struct ma_state *mas_detach) { - struct mm_struct *mm = start->vm_mm; - struct vm_area_struct *tmp = start; - int count = 0; - - while (tmp && tmp->vm_start < limit) { - *tail = tmp; - count++; - if (tmp->vm_flags & VM_LOCKED) - mm->locked_vm -= vma_pages(tmp); + mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1); + if (mas_store_gfp(mas_detach, vma, GFP_KERNEL)) + return -ENOMEM; - tmp = tmp->vm_next; - } + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm -= vma_pages(vma); - return count; + return 0; } /* @@ -2481,9 +2344,13 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool downgrade) { - struct vm_area_struct *prev, *last; + struct vm_area_struct *prev, *next = NULL; + struct maple_tree mt_detach; + int count = 0; int error = -ENOMEM; - /* we have start < vma->vm_end */ + MA_STATE(mas_detach, &mt_detach, 0, 0); + mt_init_flags(&mt_detach, 
MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt_detach, &mm->mmap_lock); if (mas_preallocate(mas, vma, GFP_KERNEL)) return -ENOMEM; @@ -2496,6 +2363,8 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, * unmapped vm_area_struct will remain in use: so lower split_vma * places tmp vma above, and higher split_vma places tmp vma below. */ + + /* Does it split the first one? */ if (start > vma->vm_start) { /* @@ -2506,35 +2375,60 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) goto map_count_exceeded; + /* + * mas_pause() is not needed since mas->index needs to be set + * differently than vma->vm_end anyways. + */ error = __split_vma(mm, vma, start, 0); if (error) - goto split_failed; + goto start_split_failed; - prev = vma; - vma = __vma_next(mm, prev); - mas->index = start; - mas_reset(mas); - } else { - prev = vma->vm_prev; + mas_set(mas, start); + vma = mas_walk(mas); } - if (vma->vm_end >= end) - last = vma; - else - last = find_vma_intersection(mm, end - 1, end); + prev = mas_prev(mas, 0); + if (unlikely((!prev))) + mas_set(mas, start); + + /* + * Detach a range of VMAs from the mm. Using next as a temp variable as + * it is always overwritten. + */ + mas_for_each(mas, next, end - 1) { + /* Does it split the end? */ + if (next->vm_end > end) { + struct vm_area_struct *split; + + error = __split_vma(mm, next, end, 1); + if (error) + goto end_split_failed; - /* Does it split the last one? 
*/ - if (last && end < last->vm_end) { - error = __split_vma(mm, last, end, 1); + mas_set(mas, end); + split = mas_prev(mas, 0); + error = munmap_sidetree(split, &mas_detach); + if (error) + goto munmap_sidetree_failed; + count++; + if (vma == next) + vma = split; + break; + } + error = munmap_sidetree(next, &mas_detach); if (error) - goto split_failed; + goto munmap_sidetree_failed; - if (vma == last) - vma = __vma_next(mm, prev); - mas_reset(mas); + count++; +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE + BUG_ON(next->vm_start < start); + BUG_ON(next->vm_start > end); +#endif } + if (!next) + next = mas_next(mas, ULONG_MAX); + if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas @@ -2551,35 +2445,36 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, goto userfaultfd_error; } - /* - * unlock any mlock()ed ranges before detaching vmas, count the number - * of VMAs to be dropped, and return the tail entry of the affected - * area. - */ - mm->map_count -= unlock_range(vma, &last, end); - /* Drop removed area from the tree */ + /* Point of no return */ + mas_set_range(mas, start, end - 1); +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + /* Make sure no VMAs are about to be lost. */ + { + MA_STATE(test, &mt_detach, start, end - 1); + struct vm_area_struct *vma_mas, *vma_test; + int test_count = 0; + + rcu_read_lock(); + vma_test = mas_find(&test, end - 1); + mas_for_each(mas, vma_mas, end - 1) { + BUG_ON(vma_mas != vma_test); + test_count++; + vma_test = mas_next(&test, end - 1); + } + rcu_read_unlock(); + BUG_ON(count != test_count); + mas_set_range(mas, start, end - 1); + } +#endif mas_store_prealloc(mas, NULL); - - /* Detach vmas from the MM linked list */ - vma->vm_prev = NULL; - if (prev) - prev->vm_next = last->vm_next; - else - mm->mmap = last->vm_next; - - if (last->vm_next) { - last->vm_next->vm_prev = prev; - last->vm_next = NULL; - } else - mm->highest_vm_end = prev ? 
vm_end_gap(prev) : 0; - + mm->map_count -= count; /* * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or * VM_GROWSUP VMA. Such VMAs can change their size under * down_read(mmap_lock) and collide with the VMA we are about to unmap. */ if (downgrade) { - if (last && (last->vm_flags & VM_GROWSDOWN)) + if (next && (next->vm_flags & VM_GROWSDOWN)) downgrade = false; else if (prev && (prev->vm_flags & VM_GROWSUP)) downgrade = false; @@ -2587,18 +2482,22 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, mmap_write_downgrade(mm); } - unmap_region(mm, vma, prev, start, end); - - /* Fix up all other VM information */ - remove_vma_list(mm, vma); + unmap_region(mm, &mt_detach, vma, prev, next, start, end); + /* Statistics and freeing VMAs */ + mas_set(&mas_detach, start); + remove_mt(mm, &mas_detach); + __mt_destroy(&mt_detach); validate_mm(mm); return downgrade ? 1 : 0; -map_count_exceeded: -split_failed: userfaultfd_error: +munmap_sidetree_failed: +end_split_failed: + __mt_destroy(&mt_detach); +start_split_failed: +map_count_exceeded: mas_destroy(mas); return error; } @@ -2833,7 +2732,6 @@ cannot_expand: i_mmap_lock_write(vma->vm_file->f_mapping); vma_mas_store(vma, &mas); - __vma_link_list(mm, vma, prev); mm->map_count++; if (vma->vm_file) { if (vma->vm_flags & VM_SHARED) @@ -2891,7 +2789,7 @@ unmap_and_free_vma: vma->vm_file = NULL; /* Undo any partial mapping done by a device driver. 
*/ - unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); + unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end); if (vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); free_vma: @@ -2979,11 +2877,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, goto out; if (start + size > vma->vm_end) { - struct vm_area_struct *next; + VMA_ITERATOR(vmi, mm, vma->vm_end); + struct vm_area_struct *next, *prev = vma; - for (next = vma->vm_next; next; next = next->vm_next) { + for_each_vma_range(vmi, next, start + size) { /* hole between vmas ? */ - if (next->vm_start != next->vm_prev->vm_end) + if (next->vm_start != prev->vm_end) goto out; if (next->vm_file != vma->vm_file) @@ -2992,8 +2891,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (next->vm_flags != vma->vm_flags) goto out; - if (start + size <= next->vm_end) - break; + prev = next; } if (!next) @@ -3060,11 +2958,9 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, * do some brk-specific accounting here. 
*/ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, - unsigned long addr, unsigned long len, - unsigned long flags) + unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *prev = NULL; validate_mm_mt(mm); /* @@ -3107,7 +3003,6 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, khugepaged_enter_vma(vma, flags); goto out; } - prev = vma; /* create a vma struct for an anonymous mapping */ vma = vm_area_alloc(mm); @@ -3124,10 +3019,6 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, if (mas_store_gfp(mas, vma, GFP_KERNEL)) goto mas_store_fail; - if (!prev) - prev = mas_prev(mas, 0); - - __vma_link_list(mm, vma, prev); mm->map_count++; out: perf_event_mmap(vma); @@ -3136,7 +3027,7 @@ out: if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vma->vm_flags |= VM_SOFTDIRTY; - validate_mm_mt(mm); + validate_mm(mm); return 0; mas_store_fail: @@ -3217,6 +3108,8 @@ void exit_mmap(struct mm_struct *mm) struct mmu_gather tlb; struct vm_area_struct *vma; unsigned long nr_accounted = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + int count = 0; /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); @@ -3241,7 +3134,7 @@ void exit_mmap(struct mm_struct *mm) mmap_write_lock(mm); arch_exit_mmap(mm); - vma = mm->mmap; + vma = mas_find(&mas, ULONG_MAX); if (!vma) { /* Can happen if dup_mmap() received an OOM */ mmap_write_unlock(mm); @@ -3252,22 +3145,29 @@ void exit_mmap(struct mm_struct *mm) flush_cache_mm(mm); tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? 
but nobody should be looking */ - /* Use -1 here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, vma, 0, -1); - free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); + /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ + unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX); + free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, + USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); - /* Walk the list again, actually closing and freeing it. */ - while (vma) { + /* + * Walk the list again, actually closing and freeing it, with preemption + * enabled, without holding any MM locks besides the unreachable + * mmap_write_lock. + */ + do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); - vma = remove_vma(vma); + remove_vma(vma); + count++; cond_resched(); - } + } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + + BUG_ON(count != mm->map_count); trace_exit_mmap(mm); __mt_destroy(&mm->mm_mt); - mm->mmap = NULL; mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } @@ -3306,7 +3206,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - if (vma_link(mm, vma, prev)) { + if (vma_link(mm, vma)) { vm_unacct_memory(charged); return -ENOMEM; } @@ -3338,7 +3238,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } - if (range_has_overlap(mm, addr, addr + len, &prev)) + new_vma = find_vma_prev(mm, addr, &prev); + if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, @@ -3381,7 +3282,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); - if (vma_link(mm, new_vma, prev)) + if (vma_link(mm, new_vma)) goto out_vma_link; *need_rmap_locks = false; } @@ -3686,12 +3587,13 @@ int mm_take_all_locks(struct 
mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; + MA_STATE(mas, &mm->mm_mt, 0, 0); mmap_assert_write_locked(mm); mutex_lock(&mm_all_locks_mutex); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && @@ -3699,7 +3601,8 @@ int mm_take_all_locks(struct mm_struct *mm) vm_lock_mapping(mm, vma->vm_file->f_mapping); } - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_set(&mas, 0); + mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && @@ -3707,7 +3610,8 @@ int mm_take_all_locks(struct mm_struct *mm) vm_lock_mapping(mm, vma->vm_file->f_mapping); } - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_set(&mas, 0); + mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) @@ -3766,11 +3670,12 @@ void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; + MA_STATE(mas, &mm->mm_mt, 0, 0); mmap_assert_write_locked(mm); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_unlock_anon_vma(avc->anon_vma); diff --git a/mm/nommu.c b/mm/nommu.c index 269df51e92266..214c70e1d0594 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -584,17 +584,12 @@ static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *prev; - BUG_ON(!vma->vm_region); setup_vma_to_mm(vma, mm); - prev = mas_prev(mas, 0); - mas_reset(mas); /* add the VMA to the tree */ vma_mas_store(vma, mas); - __vma_link_list(mm, vma, prev); } /* @@ -647,7 +642,6 @@ static int delete_vma_from_mm(struct 
vm_area_struct *vma) /* remove from the MM's tree and list */ vma_mas_remove(vma, &mas); - __vma_unlink_list(vma->vm_mm, vma); return 0; } diff --git a/mm/util.c b/mm/util.c index 10effe256dfa0..5cd3f7910f2ce 100644 --- a/mm/util.c +++ b/mm/util.c @@ -272,46 +272,6 @@ void *memdup_user_nul(const void __user *src, size_t len) } EXPORT_SYMBOL(memdup_user_nul); -void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev) -{ - struct vm_area_struct *next; - - vma->vm_prev = prev; - if (prev) { - next = prev->vm_next; - prev->vm_next = vma; - } else { - next = mm->mmap; - mm->mmap = vma; - } - vma->vm_next = next; - if (next) - next->vm_prev = vma; - else - mm->highest_vm_end = vm_end_gap(vma); -} - -void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma) -{ - struct vm_area_struct *prev, *next; - - next = vma->vm_next; - prev = vma->vm_prev; - if (prev) - prev->vm_next = next; - else - mm->mmap = next; - if (next) { - next->vm_prev = prev; - } else { - if (prev) - mm->highest_vm_end = vm_end_gap(prev); - else - mm->highest_vm_end = 0; - } -} - /* Check if the vma is being used as a stack by this task */ int vma_is_stack_for_current(struct vm_area_struct *vma) { -- GitLab From d0601a500c35856f9c134126b2423c9cfc86c701 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:06 +0000 Subject: [PATCH 0692/2223] mm/mmap: drop range_has_overlap() function Since there is no longer a linked list, the range_has_overlap() function is identical to the find_vma_intersection() function. Link: https://lkml.kernel.org/r/20220906194824.2110408-70-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mmap.c | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 4441f7ed197a9..5070af64b99d8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -390,30 +390,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); } -/* - * range_has_overlap() - Check the @start - @end range for overlapping VMAs and - * sets up a pointer to the previous VMA - * @mm: the mm struct - * @start: the start address of the range - * @end: the end address of the range - * @pprev: the pointer to the pointer of the previous VMA - * - * Returns: True if there is an overlapping VMA, false otherwise - */ -static inline -bool range_has_overlap(struct mm_struct *mm, unsigned long start, - unsigned long end, struct vm_area_struct **pprev) -{ - struct vm_area_struct *existing; - - MA_STATE(mas, &mm->mm_mt, start, start); - rcu_read_lock(); - existing = mas_find(&mas, end - 1); - *pprev = mas_prev(&mas, 0); - rcu_read_unlock(); - return existing ? true : false; -} - static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { @@ -3178,11 +3154,10 @@ void exit_mmap(struct mm_struct *mm) */ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *prev; unsigned long charged = vma_pages(vma); - if (range_has_overlap(mm, vma->vm_start, vma->vm_end, &prev)) + if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && -- GitLab From c154124fe925a451e471233aa7d1ab9a91f0a5ad Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Tue, 6 Sep 2022 19:49:06 +0000 Subject: [PATCH 0693/2223] mm/mmap.c: pass in mapping to __vma_link_file() __vma_link_file() resolves the mapping from the file, if there is one. Pass through the mapping and check the vm_file externally since most places already have the required information and check of vm_file. Link: https://lkml.kernel.org/r/20220906194824.2110408-71-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mmap.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 5070af64b99d8..fbe8b52a90a3a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -407,21 +407,15 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm, return nr_pages; } -static void __vma_link_file(struct vm_area_struct *vma) +static void __vma_link_file(struct vm_area_struct *vma, + struct address_space *mapping) { - struct file *file; - - file = vma->vm_file; - if (file) { - struct address_space *mapping = file->f_mapping; - - if (vma->vm_flags & VM_SHARED) - mapping_allow_writable(mapping); + if (vma->vm_flags & VM_SHARED) + mapping_allow_writable(mapping); - flush_dcache_mmap_lock(mapping); - vma_interval_tree_insert(vma, &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - } + flush_dcache_mmap_lock(mapping); + vma_interval_tree_insert(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); } /* @@ -488,10 +482,11 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) } vma_mas_store(vma, &mas); - __vma_link_file(vma); - if (mapping) + if (mapping) { + __vma_link_file(vma, mapping); i_mmap_unlock_write(mapping); + } mm->map_count++; validate_mm(mm); @@ -730,14 +725,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start, uprobe_munmap(next, next->vm_start, next->vm_end); i_mmap_lock_write(mapping); - if (insert) { + if (insert && insert->vm_file) { /* * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. */ - __vma_link_file(insert); + __vma_link_file(insert, insert->vm_file->f_mapping); } } -- GitLab From 66071896cdfe096fcd4aef55a5efbd5216fa15de Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Wed, 15 Jun 2022 17:40:58 +0000 Subject: [PATCH 0694/2223] mm/mlock: drop dead code in count_mm_mlocked_page_nr() The check for mm being null has never been needed since the only caller has always passed in current->mm. Remove the check from count_mm_mlocked_page_nr(). Link: https://lkml.kernel.org/r/20220615174050.738523-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Suggested-by: Lukas Bulwahn Signed-off-by: Andrew Morton --- mm/mlock.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index 43d19a1f28eb3..7032f6dd0ce19 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -531,14 +531,12 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, unsigned long end; VMA_ITERATOR(vmi, mm, start); - if (mm == NULL) - mm = current->mm; - /* Don't overflow past ULONG_MAX */ if (unlikely(ULONG_MAX - len < start)) end = ULONG_MAX; else end = start + len; + for_each_vma_range(vmi, vma, end) { if (vma->vm_flags & VM_LOCKED) { if (start > vma->vm_start) -- GitLab From bf3980c85212fc71512d27a46f5aab66f46ca284 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 31 May 2022 15:30:59 -0700 Subject: [PATCH 0695/2223] mm: drop oom code from exit_mmap The primary reason to invoke the oom reaper from the exit_mmap path used to be a prevention of an excessive oom killing if the oom victim exit races with the oom reaper (see [1] for more details). 
The invocation has moved around since then because of the interaction with the munlock logic but the underlying reason has remained the same (see [2]). Munlock code is no longer a problem since [3] and there shouldn't be any blocking operation before the memory is unmapped by exit_mmap so the oom reaper invocation can be dropped. The unmapping part can be done with the non-exclusive mmap_sem and the exclusive one is only required when page tables are freed. Remove the oom_reaper from exit_mmap which will make the code easier to read. This is really unlikely to make any observable difference although some microbenchmarks could benefit from one less branch that needs to be evaluated even though it almost never is true. [1] 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") [2] 27ae357fa82b ("mm, oom: fix concurrent munlock and oom reaper unmap, v3") [3] a213e5cf71cb ("mm/munlock: delete munlock_vma_pages_all(), allow oomreap") [akpm@linux-foundation.org: restore Suren's mmap_read_lock() optimization] Link: https://lkml.kernel.org/r/20220531223100.510392-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: Christian Brauner (Microsoft) Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Jann Horn Cc: Johannes Weiner Cc: John Hubbard Cc: "Kirill A . 
Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Minchan Kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/oom.h | 2 -- mm/mmap.c | 30 +++++++++++------------------- mm/oom_kill.c | 2 +- 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 02d1e7bbd8cd5..6cdde62b078b5 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -106,8 +106,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) return 0; } -bool __oom_reap_task_mm(struct mm_struct *mm); - long oom_badness(struct task_struct *p, unsigned long totalpages); diff --git a/mm/mmap.c b/mm/mmap.c index fbe8b52a90a3a..be111bbe80754 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3085,30 +3085,13 @@ void exit_mmap(struct mm_struct *mm) /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); - if (unlikely(mm_is_oom_victim(mm))) { - /* - * Manually reap the mm to free as much memory as possible. - * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard - * this mm from further consideration. Taking mm->mmap_lock for - * write after setting MMF_OOM_SKIP will guarantee that the oom - * reaper will not run on this mm again after mmap_lock is - * dropped. - * - * Nothing can be holding mm->mmap_lock here and the above call - * to mmu_notifier_release(mm) ensures mmu notifier callbacks in - * __oom_reap_task_mm() will not block. - */ - (void)__oom_reap_task_mm(mm); - set_bit(MMF_OOM_SKIP, &mm->flags); - } - - mmap_write_lock(mm); + mmap_read_lock(mm); arch_exit_mmap(mm); vma = mas_find(&mas, ULONG_MAX); if (!vma) { /* Can happen if dup_mmap() received an OOM */ - mmap_write_unlock(mm); + mmap_read_unlock(mm); return; } @@ -3118,6 +3101,15 @@ void exit_mmap(struct mm_struct *mm) /* update_hiwater_rss(mm) here? 
but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX); + mmap_read_unlock(mm); + + /* + * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper + * because the memory has been already freed. Do not bother checking + * mm_is_oom_victim because setting a bit unconditionally is cheaper. + */ + set_bit(MMF_OOM_SKIP, &mm->flags); + mmap_write_lock(mm); free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3996301450e8d..decb21474c6c5 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -509,7 +509,7 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); -bool __oom_reap_task_mm(struct mm_struct *mm) +static bool __oom_reap_task_mm(struct mm_struct *mm) { struct vm_area_struct *vma; bool ret = true; -- GitLab From b3541d912a84dc40cabb516f2deeac9ae6fa30da Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 31 May 2022 15:31:00 -0700 Subject: [PATCH 0696/2223] mm: delete unused MMF_OOM_VICTIM flag With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is now unused and can be removed. [akpm@linux-foundation.org: remove comment about now-removed mm_is_oom_victim()] Link: https://lkml.kernel.org/r/20220531223100.510392-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Cc: David Rientjes Cc: Matthew Wilcox Cc: Johannes Weiner Cc: Roman Gushchin Cc: Minchan Kim Cc: "Kirill A . 
Shutemov" Cc: Andrea Arcangeli Cc: Christian Brauner (Microsoft) Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: David Hildenbrand Cc: Jann Horn Cc: Shakeel Butt Cc: Peter Xu Cc: John Hubbard Cc: Shuah Khan Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/oom.h | 9 --------- include/linux/sched/coredump.h | 7 +++---- mm/mmap.c | 3 +-- mm/oom_kill.c | 4 +--- 4 files changed, 5 insertions(+), 18 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 6cdde62b078b5..7d0c9c48a0c54 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -77,15 +77,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) return tsk->signal->oom_mm; } -/* - * Use this helper if tsk->mm != mm and the victim mm needs a special - * handling. This is guaranteed to stay true after once set. - */ -static inline bool mm_is_oom_victim(struct mm_struct *mm) -{ - return test_bit(MMF_OOM_VICTIM, &mm->flags); -} - /* * Checks whether a page fault on the given mm is still reliable. * This is no longer true if the oom reaper started to reap the diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 4d0a5be28b70f..8270ad7ae14c2 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -71,9 +71,8 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_OOM_VICTIM 25 /* mm is the oom victim */ -#define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ -#define MMF_MULTIPROCESS 27 /* mm is shared between processes */ +#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ +#define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* * MMF_HAS_PINNED: Whether this mm has pinned any pages. 
This can be either * replaced in the future by mm.pinned_vm when it becomes stable, or grow into @@ -81,7 +80,7 @@ static inline int get_dumpable(struct mm_struct *mm) * pinned pages were unpinned later on, we'll still keep this bit set for the * lifecycle of this mm, just for simplicity. */ -#define MMF_HAS_PINNED 28 /* FOLL_PIN has run, never cleared */ +#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ diff --git a/mm/mmap.c b/mm/mmap.c index be111bbe80754..2a62d589d3c27 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3105,8 +3105,7 @@ void exit_mmap(struct mm_struct *mm) /* * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper - * because the memory has been already freed. Do not bother checking - * mm_is_oom_victim because setting a bit unconditionally is cheaper. + * because the memory has been already freed. */ set_bit(MMF_OOM_SKIP, &mm->flags); mmap_write_lock(mm); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index decb21474c6c5..35ec75cdfee21 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -765,10 +765,8 @@ static void mark_oom_victim(struct task_struct *tsk) return; /* oom_mm is bound to the signal struct life time. */ - if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) { + if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) mmgrab(tsk->signal->oom_mm); - set_bit(MMF_OOM_VICTIM, &mm->flags); - } /* * Make sure that the task is woken up from uninterruptible sleep -- GitLab From eef199440df950942b3c7ef2e2de507fd6ced031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Mat=C4=9Bna?= Date: Fri, 3 Jun 2022 16:57:18 +0200 Subject: [PATCH 0697/2223] mm: refactor of vma_merge() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Refactor of vma_merge and new merge call", v4. 
I am currently working on my master's thesis trying to increase the number of merges of VMAs currently failing because of page offset incompatibility and difference in their anon_vmas. The following refactor and added merge call included in this series are just two smaller upgrades I created along the way. This patch (of 2): Refactor vma_merge() to make it shorter and more understandable. The main change is the elimination of code duplication in the case of the merge next check. This is done by first doing checks and caching the results before executing the merge itself. The variable 'area' is divided into 'mid' and 'res' as previously it was used for two purposes, as the middle VMA between prev and next and also as the result of the merge itself. Exit paths are also unified. Link: https://lkml.kernel.org/r/20220603145719.1012094-1-matenajakub@gmail.com Link: https://lkml.kernel.org/r/20220603145719.1012094-2-matenajakub@gmail.com Signed-off-by: Jakub Matěna Reviewed-by: Vlastimil Babka Cc: Michal Hocko Cc: Mel Gorman Cc: Matthew Wilcox Cc: Liam Howlett Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Cc: Rik van Riel Cc: Steven Rostedt Cc: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton --- mm/mmap.c | 87 +++++++++++++++++++++++-------------------------------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 2a62d589d3c27..6e447544f07dd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1005,8 +1005,10 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; - struct vm_area_struct *area, *next; - int err; + struct vm_area_struct *mid, *next, *res; + int err = -1; + bool merge_prev = false; + bool merge_next = false; /* * We later require that vma->vm_flags == vm_flags, @@ -1016,75 +1018,60 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, return NULL; next = find_vma(mm, prev ? 
prev->vm_end : 0); - area = next; - if (area && area->vm_end == end) /* cases 6, 7, 8 */ + mid = next; + if (next && next->vm_end == end) /* cases 6, 7, 8 */ next = find_vma(mm, next->vm_end); /* verify some invariant that must be enforced by the caller */ VM_WARN_ON(prev && addr <= prev->vm_start); - VM_WARN_ON(area && end > area->vm_end); + VM_WARN_ON(mid && end > mid->vm_end); VM_WARN_ON(addr >= end); - /* - * Can it merge with the predecessor? - */ + /* Can we merge the predecessor? */ if (prev && prev->vm_end == addr && mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, vm_userfaultfd_ctx, anon_name)) { - /* - * OK, it can. Can we now merge in the successor as well? - */ - if (next && end == next->vm_start && - mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, - anon_vma, file, - pgoff+pglen, - vm_userfaultfd_ctx, anon_name) && - is_mergeable_anon_vma(prev->anon_vma, - next->anon_vma, NULL)) { - /* cases 1, 6 */ - err = __vma_adjust(prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, NULL, - prev); - } else /* cases 2, 5, 7 */ - err = __vma_adjust(prev, prev->vm_start, - end, prev->vm_pgoff, NULL, prev); - if (err) - return NULL; - khugepaged_enter_vma(prev, vm_flags); - return prev; + merge_prev = true; } - - /* - * Can this new request be merged in front of next? - */ + /* Can we merge the successor? */ if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, vm_userfaultfd_ctx, anon_name)) { + merge_next = true; + } + /* Can we merge both the predecessor and the successor? 
*/ + if (merge_prev && merge_next && + is_mergeable_anon_vma(prev->anon_vma, + next->anon_vma, NULL)) { /* cases 1, 6 */ + err = __vma_adjust(prev, prev->vm_start, + next->vm_end, prev->vm_pgoff, NULL, + prev); + res = prev; + } else if (merge_prev) { /* cases 2, 5, 7 */ + err = __vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL, prev); + res = prev; + } else if (merge_next) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(prev, prev->vm_start, - addr, prev->vm_pgoff, NULL, next); - else { /* cases 3, 8 */ - err = __vma_adjust(area, addr, next->vm_end, - next->vm_pgoff - pglen, NULL, next); - /* - * In case 3 area is already equal to next and - * this is a noop, but in case 8 "area" has - * been removed and next was expanded over it. - */ - area = next; - } - if (err) - return NULL; - khugepaged_enter_vma(area, vm_flags); - return area; + addr, prev->vm_pgoff, NULL, next); + else /* cases 3, 8 */ + err = __vma_adjust(mid, addr, next->vm_end, + next->vm_pgoff - pglen, NULL, next); + res = next; } - return NULL; + /* + * Cannot merge with predecessor or successor or error in __vma_adjust? + */ + if (err) + return NULL; + khugepaged_enter_vma(res, vm_flags); + return res; } /* -- GitLab From ca3d76b0aa808a06997297d123b66d17b81e5285 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Mat=C4=9Bna?= Date: Fri, 3 Jun 2022 16:57:19 +0200 Subject: [PATCH 0698/2223] mm: add merging after mremap resize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When mremap call results in expansion, it might be possible to merge the VMA with the next VMA which might become adjacent. This patch adds vma_merge call after the expansion is done to try and merge. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20220603145719.1012094-3-matenajakub@gmail.com Signed-off-by: Jakub Matěna Reviewed-by: Vlastimil Babka Cc: Hugh Dickins Cc: "Kirill A . 
Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Peter Zijlstra (Intel) Cc: Rik van Riel Cc: Steven Rostedt Signed-off-by: Andrew Morton --- mm/mremap.c | 19 ++++++++- tools/testing/selftests/vm/mremap_test.c | 49 +++++++++++++++++++++++- 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 8644ff278f029..e465ffe279bb0 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -23,6 +24,7 @@ #include #include #include +#include #include #include @@ -1012,6 +1014,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* can we just expand the current mapping? */ if (vma_expandable(vma, new_len - old_len)) { long pages = (new_len - old_len) >> PAGE_SHIFT; + unsigned long extension_start = addr + old_len; + unsigned long extension_end = addr + new_len; + pgoff_t extension_pgoff = vma->vm_pgoff + (old_len >> PAGE_SHIFT); if (vma->vm_flags & VM_ACCOUNT) { if (security_vm_enough_memory_mm(mm, pages)) { @@ -1020,8 +1025,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } } - if (vma_adjust(vma, vma->vm_start, addr + new_len, - vma->vm_pgoff, NULL)) { + /* + * Function vma_merge() is called on the extension we are adding to + * the already existing vma, vma_merge() will merge this extension with + * the already existing vma (expand operation itself) and possibly also + * with the next vma if it becomes adjacent to the expanded vma and + * otherwise compatible. 
+ */ + vma = vma_merge(mm, vma, extension_start, extension_end, + vma->vm_flags, vma->anon_vma, vma->vm_file, + extension_pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + if (!vma) { vm_unacct_memory(pages); ret = -ENOMEM; goto out; diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c index db0270127aeb0..9496346973d44 100644 --- a/tools/testing/selftests/vm/mremap_test.c +++ b/tools/testing/selftests/vm/mremap_test.c @@ -118,6 +118,50 @@ static unsigned long long get_mmap_min_addr(void) return addr; } +/* + * This test validates that merge is called when expanding a mapping. + * Mapping containing three pages is created, middle page is unmapped + * and then the mapping containing the first page is expanded so that + * it fills the created hole. The two parts should merge creating + * single mapping with three pages. + */ +static void mremap_expand_merge(unsigned long page_size) +{ + char *test_name = "mremap expand merge"; + FILE *fp; + char *line = NULL; + size_t len = 0; + bool success = false; + char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + munmap(start + page_size, page_size); + mremap(start, page_size, 2 * page_size, 0); + + fp = fopen("/proc/self/maps", "r"); + if (fp == NULL) { + ksft_test_result_fail("%s\n", test_name); + return; + } + + while (getline(&line, &len, fp) != -1) { + char *first = strtok(line, "- "); + void *first_val = (void *)strtol(first, NULL, 16); + char *second = strtok(NULL, "- "); + void *second_val = (void *) strtol(second, NULL, 16); + + if (first_val == start && second_val == start + 3 * page_size) { + success = true; + break; + } + } + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); + fclose(fp); +} + /* * Returns the start address of the mapping on success, else returns * NULL on failure. 
@@ -336,6 +380,7 @@ int main(int argc, char **argv) int i, run_perf_tests; unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; unsigned int pattern_seed; + int num_expand_tests = 1; struct test test_cases[MAX_TEST]; struct test perf_test_cases[MAX_PERF_TEST]; int page_size; @@ -407,12 +452,14 @@ int main(int argc, char **argv) (threshold_mb * _1MB >= _1GB); ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ? - ARRAY_SIZE(perf_test_cases) : 0)); + ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests); for (i = 0; i < ARRAY_SIZE(test_cases); i++) run_mremap_test_case(test_cases[i], &failures, threshold_mb, pattern_seed); + mremap_expand_merge(page_size); + if (run_perf_tests) { ksft_print_msg("\n%s\n", "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); -- GitLab From f7091ed64ec8311b0c35865875f8c3e04e5ea532 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Tue, 23 Aug 2022 21:58:41 +0800 Subject: [PATCH 0699/2223] mm: fix the handling Non-LRU pages returned by follow_page The handling Non-LRU pages returned by follow_page() jumps directly, it doesn't call put_page() to handle the reference count, since 'FOLL_GET' flag for follow_page() has get_page() called. Fix the zone device page check by handling the page reference count correctly before returning. And as David reviewed, "device pages are never PageKsm pages". Drop this zone device page check for break_ksm(). Since the zone device page can't be a transparent huge page, so drop the redundant zone device page check for split_huge_pages_pid(). 
(by Miaohe) Link: https://lkml.kernel.org/r/20220823135841.934465-3-haiyue.wang@intel.com Fixes: 3218f8712d6b ("mm: handling Non-LRU pages returned by vm_normal_pages") Signed-off-by: Haiyue Wang Reviewed-by: "Huang, Ying" Reviewed-by: Felix Kuehling Reviewed-by: Alistair Popple Reviewed-by: Miaohe Lin Acked-by: David Hildenbrand Cc: Alex Sierra Cc: Gerald Schaefer Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- mm/ksm.c | 12 +++++++++--- mm/migrate.c | 19 ++++++++++++------- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 63b4d8ff4b556..135acf87d24d8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3001,7 +3001,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, /* FOLL_DUMP to ignore special (like zero) pages */ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); - if (IS_ERR_OR_NULL(page) || is_zone_device_page(page)) + if (IS_ERR_OR_NULL(page)) continue; if (!is_transparent_hugepage(page)) diff --git a/mm/ksm.c b/mm/ksm.c index 533ede86b4b9f..1fafd531f669a 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -475,7 +475,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) cond_resched(); page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); - if (IS_ERR_OR_NULL(page) || is_zone_device_page(page)) + if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) ret = handle_mm_fault(vma, addr, @@ -560,12 +560,15 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) goto out; page = follow_page(vma, addr, FOLL_GET); - if (IS_ERR_OR_NULL(page) || is_zone_device_page(page)) + if (IS_ERR_OR_NULL(page)) goto out; + if (is_zone_device_page(page)) + goto out_putpage; if (PageAnon(page)) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } else { +out_putpage: put_page(page); out: page = NULL; @@ -2322,11 +2325,13 @@ next_mm: if (ksm_test_exit(mm)) break; *page = follow_page(vma, ksm_scan.address, 
FOLL_GET); - if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) { + if (IS_ERR_OR_NULL(*page)) { ksm_scan.address += PAGE_SIZE; cond_resched(); continue; } + if (is_zone_device_page(*page)) + goto next_page; if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); @@ -2341,6 +2346,7 @@ next_mm: mmap_read_unlock(mm); return rmap_item; } +next_page: put_page(*page); ksm_scan.address += PAGE_SIZE; cond_resched(); diff --git a/mm/migrate.c b/mm/migrate.c index d74573c36573b..eb594b0db8060 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1691,9 +1691,12 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, goto out; err = -ENOENT; - if (!page || is_zone_device_page(page)) + if (!page) goto out; + if (is_zone_device_page(page)) + goto out_putpage; + err = 0; if (page_to_nid(page) == node) goto out_putpage; @@ -1891,13 +1894,15 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (IS_ERR(page)) goto set_status; - if (page && !is_zone_device_page(page)) { + err = -ENOENT; + if (!page) + goto set_status; + + if (!is_zone_device_page(page)) err = page_to_nid(page); - if (foll_flags & FOLL_GET) - put_page(page); - } else { - err = -ENOENT; - } + + if (foll_flags & FOLL_GET) + put_page(page); set_status: *status = err; -- GitLab From 474098edac262ae26bfab1c48445877075a31cbd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Aug 2022 18:46:57 +0200 Subject: [PATCH 0700/2223] mm/gup: replace FOLL_NUMA by gup_can_follow_protnone() Patch series "mm: minor cleanups around NUMA hinting". Working on some GUP cleanups (e.g., getting rid of some FOLL_ flags) and preparing for other GUP changes (getting rid of FOLL_FORCE|FOLL_WRITE for for taking a R/O longterm pin), this is something I can easily send out independently. Get rid of FOLL_NUMA, allow FOLL_FORCE access to PROT_NONE mapped pages in GUP-fast, and fixup some documentation around NUMA hinting. 
This patch (of 3): No need for a special flag that is not even properly documented to be internal-only. Let's just factor this check out and get rid of this flag. The separate function has the nice benefit that we can centralize comments. Link: https://lkml.kernel.org/r/20220825164659.89824-2-david@redhat.com Link: https://lkml.kernel.org/r/20220825164659.89824-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox Cc: Mel Gorman Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 +++++++++++++++- mm/gup.c | 12 ++---------- mm/huge_memory.c | 2 +- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 37384a84f71aa..eb25cae06c551 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2933,7 +2933,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * and return without waiting upon it */ #define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ @@ -3054,6 +3053,21 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page) return !PageAnonExclusive(page); } +/* + * Indicates whether GUP can follow a PROT_NONE mapped page, or whether + * a (NUMA hinting) fault is required. + */ +static inline bool gup_can_follow_protnone(unsigned int flags) +{ + /* + * FOLL_FORCE has to be able to make progress even if the VMA is + * inaccessible. Further, FOLL_FORCE access usually does not represent + * application behaviour and we should avoid triggering NUMA hinting + * faults. 
+ */ + return flags & FOLL_FORCE; +} + typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); diff --git a/mm/gup.c b/mm/gup.c index 6e49fe5da5133..f06770e035495 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -561,7 +561,7 @@ retry: migration_entry_wait(mm, pmd, address); goto retry; } - if ((flags & FOLL_NUMA) && pte_protnone(pte)) + if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) goto no_page; page = vm_normal_page(vma, address, pte); @@ -714,7 +714,7 @@ retry: if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); - if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) + if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags)) return no_page_table(vma, flags); retry_locked: @@ -1160,14 +1160,6 @@ static long __get_user_pages(struct mm_struct *mm, VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); - /* - * If FOLL_FORCE is set then do not force a full fault as the hinting - * fault information is unrelated to the reference behaviour of a task - * using the address space - */ - if (!(gup_flags & FOLL_FORCE)) - gup_flags |= FOLL_NUMA; - do { struct page *page; unsigned int foll_flags = gup_flags; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 135acf87d24d8..84bf1d5f6b7e8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1447,7 +1447,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, return ERR_PTR(-EFAULT); /* Full NUMA hinting faults to serialise migration in fault paths */ - if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) + if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags)) return NULL; if (!pmd_write(*pmd) && gup_must_unshare(flags, page)) -- GitLab From 0cf459866a91c741eb14a92f3633638d1dd9db59 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Aug 2022 18:46:58 +0200 Subject: [PATCH 0701/2223] mm/gup: use 
gup_can_follow_protnone() also in GUP-fast There seems to be no reason why FOLL_FORCE during GUP-fast would have to fallback to the slow path when stumbling over a PROT_NONE mapped page. We only have to trigger hinting faults in case FOLL_FORCE is not set, and any kind of fault handling naturally happens from the slow path -- where NUMA hinting accounting/handling would be performed. Note that the comment regarding THP migration is outdated: commit 2b4847e73004 ("mm: numa: serialise parallel get_user_page against THP migration") described that this was required for THP due to lack of PMD migration entries. Nowadays, we do have proper PMD migration entries in place -- see set_pmd_migration_entry(), which does a proper pmdp_invalidate() when placing the migration entry. So let's just reuse gup_can_follow_protnone() here to make it consistent and drop the somewhat outdated comments. Link: https://lkml.kernel.org/r/20220825164659.89824-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox Cc: Mel Gorman Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/gup.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index f06770e035495..ce00a4c40da8a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2420,11 +2420,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, struct page *page; struct folio *folio; - /* - * Similar to the PMD case below, NUMA hinting must take slow - * path using the pte_protnone check. 
- */ - if (pte_protnone(pte)) + if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) goto pte_unmap; if (!pte_access_permitted(pte, flags & FOLL_WRITE)) @@ -2808,12 +2804,8 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || pmd_devmap(pmd))) { - /* - * NUMA hinting faults need to be handled in the GUP - * slowpath for accounting purposes and so that they - * can be serialised against THP migration. - */ - if (pmd_protnone(pmd)) + if (pmd_protnone(pmd) && + !gup_can_follow_protnone(flags)) return 0; if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, -- GitLab From 7014887a01587d8c50871d5985cd572ca08b29c0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Aug 2022 18:46:59 +0200 Subject: [PATCH 0702/2223] mm: fixup documentation regarding pte_numa() and PROT_NUMA pte_numa() no longer exists -- replaced by pte_protnone() -- and PROT_NUMA probably never existed: MM_CP_PROT_NUMA also ends up using PROT_NONE. Let's fixup the doc. Link: https://lkml.kernel.org/r/20220825164659.89824-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox Cc: Mel Gorman Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5e32211cb5a9d..26573ba485f37 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -614,22 +614,22 @@ struct mm_struct { #endif #ifdef CONFIG_NUMA_BALANCING /* - * numa_next_scan is the next time that the PTEs will be marked - * pte_numa. NUMA hinting faults will gather statistics and - * migrate pages to new nodes if necessary. + * numa_next_scan is the next time that PTEs will be remapped + * PROT_NONE to trigger NUMA hinting faults; such faults gather + * statistics and migrate pages to new nodes if necessary. 
*/ unsigned long numa_next_scan; - /* Restart point for scanning and setting pte_numa */ + /* Restart point for scanning and remapping PTEs. */ unsigned long numa_scan_offset; - /* numa_scan_seq prevents two threads setting pte_numa */ + /* numa_scan_seq prevents two threads remapping PTEs. */ int numa_scan_seq; #endif /* * An operation with batched TLB flushing is going on. Anything * that can move process memory needs to flush the TLB when - * moving a PROT_NONE or PROT_NUMA mapped page. + * moving a PROT_NONE mapped page. */ atomic_t tlb_flush_pending; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -- GitLab From 974f4367dd315acc15ad4a6453f8304aea60dfbd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 23 Aug 2022 11:22:30 +0200 Subject: [PATCH 0703/2223] mm: reduce noise in show_mem for lowmem allocations While discussing early DMA pool pre-allocation failure with Christoph [1] I have realized that the allocation failure warning is rather noisy for constrained allocations like GFP_DMA{32}. Those zones are usually not populated on all nodes very often as their memory ranges are constrained. This is an attempt to reduce the ballast that doesn't provide any relevant information for those allocation failures investigation. Please note that I have only compile tested it (in my default config setup) and I am throwing it mostly to see what people think about it. 
[1] http://lkml.kernel.org/r/20220817060647.1032426-1-hch@lst.de [mhocko@suse.com: update] Link: https://lkml.kernel.org/r/Yw29bmJTIkKogTiW@dhcp22.suse.cz [mhocko@suse.com: fix build] [akpm@linux-foundation.org: fix it for mapletree] [akpm@linux-foundation.org: update it for Michal's update] [mhocko@suse.com: fix arch/powerpc/xmon/xmon.c] Link: https://lkml.kernel.org/r/Ywh3C4dKB9B93jIy@dhcp22.suse.cz [akpm@linux-foundation.org: fix arch/sparc/kernel/setup_32.c] Link: https://lkml.kernel.org/r/YwScVmVofIZkopkF@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Mel Gorman Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++++-- lib/show_mem.c | 4 ++-- mm/oom_kill.c | 2 +- mm/page_alloc.c | 21 +++++++++++++++++++-- 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index eb25cae06c551..e56dd8f7eae19 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1838,7 +1838,11 @@ extern void pagefault_out_of_memory(void); */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ -extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); +extern void __show_free_areas(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); +static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodemask) +{ + __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1); +} #ifdef CONFIG_MMU extern bool can_do_mlock(void); @@ -2578,7 +2582,12 @@ extern void calculate_min_free_kbytes(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); -extern void show_mem(unsigned int flags, nodemask_t *nodemask); + +extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); +static inline void show_mem(unsigned int flags, nodemask_t *nodemask) +{ + __show_mem(flags, nodemask, MAX_NR_ZONES - 1); +} extern long 
si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); diff --git a/lib/show_mem.c b/lib/show_mem.c index 1c26c14ffbb9b..0d7585cde2a69 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -8,13 +8,13 @@ #include #include -void show_mem(unsigned int filter, nodemask_t *nodemask) +void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { pg_data_t *pgdat; unsigned long total = 0, reserved = 0, highmem = 0; printk("Mem-Info:\n"); - show_free_areas(filter, nodemask); + __show_free_areas(filter, nodemask, max_zone_idx); for_each_online_pgdat(pgdat) { int zoneid; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 35ec75cdfee21..1276e49b31b0a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -461,7 +461,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) if (is_memcg_oom(oc)) mem_cgroup_print_oom_meminfo(oc->memcg); else { - show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); + __show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask)); if (should_dump_unreclaim_slab()) dump_unreclaimable_slab(); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 262896bd1a903..44f3c93643161 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4322,7 +4322,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; - show_mem(filter, nodemask); + __show_mem(filter, nodemask, gfp_zone(gfp_mask)); } void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 
@@ -6050,6 +6050,15 @@ static void show_migration_types(unsigned char type) printk(KERN_CONT "(%s) ", tmp); } +static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx) +{ + int zone_idx; + for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++) + if (zone_managed_pages(pgdat->node_zones + zone_idx)) + return true; + return false; +} + /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the @@ -6059,7 +6068,7 @@ static void show_migration_types(unsigned char type) * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. */ -void show_free_areas(unsigned int filter, nodemask_t *nodemask) +void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { unsigned long free_pcp = 0; int cpu, nid; @@ -6067,6 +6076,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) pg_data_t *pgdat; for_each_populated_zone(zone) { + if (zone_idx(zone) > max_zone_idx) + continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; @@ -6104,6 +6115,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) for_each_online_pgdat(pgdat) { if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) continue; + if (!node_has_managed_zones(pgdat, max_zone_idx)) + continue; printk("Node %d" " active_anon:%lukB" @@ -6160,6 +6173,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) for_each_populated_zone(zone) { int i; + if (zone_idx(zone) > max_zone_idx) + continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; @@ -6221,6 +6236,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; + if (zone_idx(zone) > max_zone_idx) + continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; show_node(zone); -- GitLab From e6ad640bc404eb298dd1880113131768ddf5c6a8 Mon 
Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 26 Aug 2022 23:06:42 +0000 Subject: [PATCH 0704/2223] mm: deduplicate cacheline padding code There are three users (mmzone.h, memcontrol.h, page_counter.h) using similar code for forcing cacheline padding between fields of different structures. Dedup that code. Link: https://lkml.kernel.org/r/20220826230642.566725-1-shakeelb@google.com Signed-off-by: Shakeel Butt Suggested-by: Feng Tang Reviewed-by: Feng Tang Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/cache.h | 13 +++++++++++++ include/linux/memcontrol.h | 13 ++----------- include/linux/mmzone.h | 24 +++++------------------- include/linux/page_counter.h | 13 ++----------- 4 files changed, 22 insertions(+), 41 deletions(-) diff --git a/include/linux/cache.h b/include/linux/cache.h index d742c57eaee59..5da1bbd96154b 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -85,4 +85,17 @@ #define cache_line_size() L1_CACHE_BYTES #endif +/* + * Helper to add padding within a struct to ensure data fall into separate + * cachelines. + */ +#if defined(CONFIG_SMP) +struct cacheline_padding { + char x[0]; +} ____cacheline_internodealigned_in_smp; +#define CACHELINE_PADDING(name) struct cacheline_padding name +#else +#define CACHELINE_PADDING(name) +#endif + #endif /* __LINUX_CACHE_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 344022f102c2c..60545e4a1c034 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -185,15 +185,6 @@ struct mem_cgroup_thresholds { struct mem_cgroup_threshold_ary *spare; }; -#if defined(CONFIG_SMP) -struct memcg_padding { - char x[0]; -} ____cacheline_internodealigned_in_smp; -#define MEMCG_PADDING(name) struct memcg_padding name -#else -#define MEMCG_PADDING(name) -#endif - /* * Remember four most recent foreign writebacks with dirty pages in this * cgroup. 
Inode sharing is expected to be uncommon and, even if we miss @@ -304,7 +295,7 @@ struct mem_cgroup { spinlock_t move_lock; unsigned long move_lock_flags; - MEMCG_PADDING(_pad1_); + CACHELINE_PADDING(_pad1_); /* memory.stat */ struct memcg_vmstats vmstats; @@ -326,7 +317,7 @@ struct mem_cgroup { struct list_head objcg_list; #endif - MEMCG_PADDING(_pad2_); + CACHELINE_PADDING(_pad2_); /* * set > 0 if pages under this cgroup are moving to other cgroup. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e335a492c2ebf..c69c081568227 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -121,20 +121,6 @@ static inline bool free_area_empty(struct free_area *area, int migratetype) struct pglist_data; -/* - * Add a wild amount of padding here to ensure data fall into separate - * cachelines. There are very few zone structures in the machine, so space - * consumption is not a concern here. - */ -#if defined(CONFIG_SMP) -struct zone_padding { - char x[0]; -} ____cacheline_internodealigned_in_smp; -#define ZONE_PADDING(name) struct zone_padding name; -#else -#define ZONE_PADDING(name) -#endif - #ifdef CONFIG_NUMA enum numa_stat_item { NUMA_HIT, /* allocated in intended node */ @@ -837,7 +823,7 @@ struct zone { int initialized; /* Write-intensive fields used from the page allocator */ - ZONE_PADDING(_pad1_) + CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ struct free_area free_area[MAX_ORDER]; @@ -849,7 +835,7 @@ struct zone { spinlock_t lock; /* Write-intensive fields used by compaction and vmstats. 
*/ - ZONE_PADDING(_pad2_) + CACHELINE_PADDING(_pad2_); /* * When free pages are below this point, additional steps are taken @@ -886,7 +872,7 @@ struct zone { bool contiguous; - ZONE_PADDING(_pad3_) + CACHELINE_PADDING(_pad3_); /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; @@ -1196,7 +1182,7 @@ typedef struct pglist_data { #endif /* CONFIG_NUMA */ /* Write-intensive fields used by page reclaim */ - ZONE_PADDING(_pad1_) + CACHELINE_PADDING(_pad1_); #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* @@ -1241,7 +1227,7 @@ typedef struct pglist_data { struct lru_gen_mm_walk mm_walk; #endif - ZONE_PADDING(_pad2_) + CACHELINE_PADDING(_pad2_); /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 78a1c934e4163..c141ea9a95ef8 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -7,22 +7,13 @@ #include #include -#if defined(CONFIG_SMP) -struct pc_padding { - char x[0]; -} ____cacheline_internodealigned_in_smp; -#define PC_PADDING(name) struct pc_padding name -#else -#define PC_PADDING(name) -#endif - struct page_counter { /* * Make sure 'usage' does not share cacheline with any other field. The * memcg->memory.usage is a hot member of struct mem_cgroup. */ atomic_long_t usage; - PC_PADDING(_pad1_); + CACHELINE_PADDING(_pad1_); /* effective memory.min and memory.min usage tracking */ unsigned long emin; @@ -38,7 +29,7 @@ struct page_counter { unsigned long failcnt; /* Keep all the read most fields in a separete cacheline. 
*/ - PC_PADDING(_pad2_); + CACHELINE_PADDING(_pad2_); unsigned long min; unsigned long low; -- GitLab From cb4df4cae4f2bd8cf7a32eff81178fce31600f7c Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 30 Aug 2022 14:38:38 +0000 Subject: [PATCH 0705/2223] ksm: count allocated ksm rmap_items for each process Patch series "ksm: count allocated rmap_items and update documentation", v5. KSM can save memory by merging identical pages, but also can consume additional memory, because it needs to generate rmap_items to save each scanned page's brief rmap information. To determine how beneficial the ksm-policy (like madvise), they are using brings, so we add a new interface /proc//ksm_stat for each process The value "ksm_rmap_items" in it indicates the total allocated ksm rmap_items of this process. The detailed description can be seen in the following patches' commit message. This patch (of 2): KSM can save memory by merging identical pages, but also can consume additional memory, because it needs to generate rmap_items to save each scanned page's brief rmap information. Some of these pages may be merged, but some may not be abled to be merged after being checked several times, which are unprofitable memory consumed. The information about whether KSM save memory or consume memory in system-wide range can be determined by the comprehensive calculation of pages_sharing, pages_shared, pages_unshared and pages_volatile. A simple approximate calculation: profit =~ pages_sharing * sizeof(page) - (all_rmap_items) * sizeof(rmap_item); where all_rmap_items equals to the sum of pages_sharing, pages_shared, pages_unshared and pages_volatile. But we cannot calculate this kind of ksm profit inner single-process wide because the information of ksm rmap_item's number of a process is lacked. For user applications, if this kind of information could be obtained, it helps upper users know how beneficial the ksm-policy (like madvise) they are using brings, and then optimize their app code. 
For example, if an application madvises 1000 pages as MERGEABLE but only a few pages are really merged, then it's not cost-efficient. So we add a new interface /proc/<pid>/ksm_stat for each process, in which only the value of ksm_rmap_items is shown for now; more values can be added in the future. Similarly, we can calculate the ksm profit approximately for a single process by: profit =~ ksm_merging_pages * sizeof(page) - ksm_rmap_items * sizeof(rmap_item); where ksm_merging_pages is shown at /proc/<pid>/ksm_merging_pages, and ksm_rmap_items is shown in /proc/<pid>/ksm_stat. Link: https://lkml.kernel.org/r/20220830143731.299702-1-xu.xin16@zte.com.cn Link: https://lkml.kernel.org/r/20220830143838.299758-1-xu.xin16@zte.com.cn Signed-off-by: xu xin Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Signed-off-by: CGEL ZTE Cc: Alexey Dobriyan Cc: Bagas Sanjaya Cc: Hugh Dickins Cc: Izik Eidus Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- fs/proc/base.c | 15 +++++++++++++++ include/linux/mm_types.h | 5 +++++ mm/ksm.c | 2 ++ 3 files changed, 22 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index 12885a75913f5..ca3e836377e83 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3199,6 +3199,19 @@ static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace * return 0; } +static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm; + + mm = get_task_mm(task); + if (mm) { + seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + mmput(mm); + } + + return 0; +} #endif /* CONFIG_KSM */ #ifdef CONFIG_STACKLEAK_METRICS @@ -3334,6 +3347,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), + ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; @@ -3671,6 +3685,7 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR,
proc_pid_ksm_merging_pages), + ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 26573ba485f37..8f30f262431c9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -654,6 +654,11 @@ struct mm_struct { * merging. */ unsigned long ksm_merging_pages; + /* + * Represent how many pages are checked for ksm merging + * including merged and not merged. + */ + unsigned long ksm_rmap_items; #endif #ifdef CONFIG_LRU_GEN struct { diff --git a/mm/ksm.c b/mm/ksm.c index 1fafd531f669a..0cd2f4b623345 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -387,6 +387,7 @@ static inline struct rmap_item *alloc_rmap_item(void) static inline void free_rmap_item(struct rmap_item *rmap_item) { ksm_rmap_items--; + rmap_item->mm->ksm_rmap_items--; rmap_item->mm = NULL; /* debug safety */ kmem_cache_free(rmap_item_cache, rmap_item); } @@ -2235,6 +2236,7 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, if (rmap_item) { /* It has already been zeroed */ rmap_item->mm = mm_slot->mm; + rmap_item->mm->ksm_rmap_items++; rmap_item->address = addr; rmap_item->rmap_list = *rmap_list; *rmap_list = rmap_item; -- GitLab From 21b7bdb504ae6b0a795c8d63818611ce02b532c1 Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 30 Aug 2022 14:40:03 +0000 Subject: [PATCH 0706/2223] ksm: add profit monitoring documentation Add the description of KSM profit and how to determine it separately in system-wide range and inner a single process. 
Link: https://lkml.kernel.org/r/20220830144003.299870-1-xu.xin16@zte.com.cn Signed-off-by: xu xin Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Reviewed-by: Bagas Sanjaya Cc: Alexey Dobriyan Cc: Hugh Dickins Cc: Izik Eidus Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/ksm.rst | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst index b244f0202a036..fb6ba2002a4b2 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -184,6 +184,42 @@ The maximum possible ``pages_sharing/pages_shared`` ratio is limited by the ``max_page_sharing`` tunable. To increase the ratio ``max_page_sharing`` must be increased accordingly. +Monitoring KSM profit +===================== + +KSM can save memory by merging identical pages, but also can consume +additional memory, because it needs to generate a number of rmap_items to +save each scanned page's brief rmap information. Some of these pages may +be merged, but some may not be able to be merged after being checked +several times, which is unprofitable memory consumption. + +1) How to determine whether KSM saves memory or consumes memory in system-wide + range? Here is a simple approximate calculation for reference:: + + general_profit =~ pages_sharing * sizeof(page) - (all_rmap_items) * + sizeof(rmap_item); + + where all_rmap_items can be easily obtained by summing ``pages_sharing``, + ``pages_shared``, ``pages_unshared`` and ``pages_volatile``. + +2) The KSM profit within a single process can be similarly obtained by the + following approximate calculation:: + + process_profit =~ ksm_merging_pages * sizeof(page) - + ksm_rmap_items * sizeof(rmap_item). + + where ksm_merging_pages is shown under the directory ``/proc/<pid>/``, + and ksm_rmap_items is shown in ``/proc/<pid>/ksm_stat``.
+ +From the perspective of an application, a high ratio of ``ksm_rmap_items`` to +``ksm_merging_pages`` means a bad madvise-applied policy, so developers or +administrators have to rethink how to change madvise policy. Giving an example +for reference, a page's size is usually 4K, and the rmap_item's size is +32B on 32-bit CPU architecture and 64B on 64-bit CPU architecture, respectively. +So if the ``ksm_rmap_items/ksm_merging_pages`` ratio exceeds 64 on 64-bit CPU +or exceeds 128 on 32-bit CPU, then the app's madvise policy should be dropped, +because the ksm profit is approximately zero or negative. + Monitoring KSM events ===================== -- GitLab From 24613f7c7f2dbd0b47ecdf9928600379e606dfda Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 26 Sep 2022 16:48:21 -0500 Subject: [PATCH 0707/2223] Input: applespi - replace zero-length array with DECLARE_FLEX_ARRAY() helper Zero-length arrays are deprecated and we are moving towards adopting C99 flexible-array members, instead. So, replace zero-length array declarations in anonymous unions with the new DECLARE_FLEX_ARRAY() helper macro. This helper allows for flexible-array members in unions. Link: https://github.com/KSPP/linux/issues/193 Link: https://github.com/KSPP/linux/issues/219 Link: https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html Signed-off-by: Gustavo A. R.
Silva Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/YzIeJeqU73G+UI8g@work Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/applespi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/keyboard/applespi.c b/drivers/input/keyboard/applespi.c index fab5473ae5dac..91a9810f69807 100644 --- a/drivers/input/keyboard/applespi.c +++ b/drivers/input/keyboard/applespi.c @@ -311,7 +311,7 @@ struct message { struct command_protocol_mt_init init_mt_command; struct command_protocol_capsl capsl_command; struct command_protocol_bl bl_command; - u8 data[0]; + DECLARE_FLEX_ARRAY(u8, data); }; }; -- GitLab From 8a9b7ef74369f08a8bde2a45168056f1cad9fb2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Sat, 24 Sep 2022 11:24:02 +0200 Subject: [PATCH 0708/2223] PCI: Add standard PCI Config Address macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lot of PCI and PCIe controllers are using standard Config Address for PCI Configuration Mechanism #1 (as defined in PCI Local Bus Specification) or its extended version. So introduce new macros PCI_CONF1_ADDRESS() and PCI_CONF1_EXT_ADDRESS() in include file drivers/pci/pci.h which can be suitable for PCI and PCIe controllers which uses this type of access to PCI config space. Link: https://lore.kernel.org/r/20220924092404.31776-2-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- drivers/pci/pci.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 785f31086313a..88bd771071033 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -774,4 +774,49 @@ static inline pci_power_t mid_pci_get_power_state(struct pci_dev *pdev) } #endif +/* + * Config Address for PCI Configuration Mechanism #1 + * + * See PCI Local Bus Specification, Revision 3.0, + * Section 3.2.2.3.2, Figure 3-2, p. 50. 
+ */ + +#define PCI_CONF1_BUS_SHIFT 16 /* Bus number */ +#define PCI_CONF1_DEV_SHIFT 11 /* Device number */ +#define PCI_CONF1_FUNC_SHIFT 8 /* Function number */ + +#define PCI_CONF1_BUS_MASK 0xff +#define PCI_CONF1_DEV_MASK 0x1f +#define PCI_CONF1_FUNC_MASK 0x7 +#define PCI_CONF1_REG_MASK 0xfc /* Limit aligned offset to a maximum of 256B */ + +#define PCI_CONF1_ENABLE BIT(31) +#define PCI_CONF1_BUS(x) (((x) & PCI_CONF1_BUS_MASK) << PCI_CONF1_BUS_SHIFT) +#define PCI_CONF1_DEV(x) (((x) & PCI_CONF1_DEV_MASK) << PCI_CONF1_DEV_SHIFT) +#define PCI_CONF1_FUNC(x) (((x) & PCI_CONF1_FUNC_MASK) << PCI_CONF1_FUNC_SHIFT) +#define PCI_CONF1_REG(x) ((x) & PCI_CONF1_REG_MASK) + +#define PCI_CONF1_ADDRESS(bus, dev, func, reg) \ + (PCI_CONF1_ENABLE | \ + PCI_CONF1_BUS(bus) | \ + PCI_CONF1_DEV(dev) | \ + PCI_CONF1_FUNC(func) | \ + PCI_CONF1_REG(reg)) + +/* + * Extension of PCI Config Address for accessing extended PCIe registers + * + * No standardized specification, but used on lot of non-ECAM-compliant ARM SoCs + * or on AMD Barcelona and new CPUs. Reserved bits [27:24] of PCI Config Address + * are used for specifying additional 4 high bits of PCI Express register. + */ + +#define PCI_CONF1_EXT_REG_SHIFT 16 +#define PCI_CONF1_EXT_REG_MASK 0xf00 +#define PCI_CONF1_EXT_REG(x) (((x) & PCI_CONF1_EXT_REG_MASK) << PCI_CONF1_EXT_REG_SHIFT) + +#define PCI_CONF1_EXT_ADDRESS(bus, dev, func, reg) \ + (PCI_CONF1_ADDRESS(bus, dev, func, reg) | \ + PCI_CONF1_EXT_REG(reg)) + #endif /* DRIVERS_PCI_H */ -- GitLab From f75a27dc6c07cbf371572cf0539c3b60e7d50c1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Sat, 24 Sep 2022 11:24:03 +0200 Subject: [PATCH 0709/2223] PCI: ftpci100: Use PCI_CONF1_ADDRESS() macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify pci-ftpci100.c driver code and use new PCI_CONF1_ADDRESS() macro for accessing PCI config space. 
Link: https://lore.kernel.org/r/20220924092404.31776-3-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/pci-ftpci100.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/drivers/pci/controller/pci-ftpci100.c b/drivers/pci/controller/pci-ftpci100.c index 88980a44461df..0cfd9d5a497c9 100644 --- a/drivers/pci/controller/pci-ftpci100.c +++ b/drivers/pci/controller/pci-ftpci100.c @@ -103,13 +103,6 @@ #define FARADAY_PCI_DMA_MEM2_BASE 0x00000000 #define FARADAY_PCI_DMA_MEM3_BASE 0x00000000 -/* Defines for PCI configuration command register */ -#define PCI_CONF_ENABLE BIT(31) -#define PCI_CONF_WHERE(r) ((r) & 0xFC) -#define PCI_CONF_BUS(b) (((b) & 0xFF) << 16) -#define PCI_CONF_DEVICE(d) (((d) & 0x1F) << 11) -#define PCI_CONF_FUNCTION(f) (((f) & 0x07) << 8) - /** * struct faraday_pci_variant - encodes IP block differences * @cascaded_irq: this host has cascaded IRQs from an interrupt controller @@ -190,11 +183,8 @@ static int faraday_raw_pci_read_config(struct faraday_pci *p, int bus_number, unsigned int fn, int config, int size, u32 *value) { - writel(PCI_CONF_BUS(bus_number) | - PCI_CONF_DEVICE(PCI_SLOT(fn)) | - PCI_CONF_FUNCTION(PCI_FUNC(fn)) | - PCI_CONF_WHERE(config) | - PCI_CONF_ENABLE, + writel(PCI_CONF1_ADDRESS(bus_number, PCI_SLOT(fn), + PCI_FUNC(fn), config), p->base + FTPCI_CONFIG); *value = readl(p->base + FTPCI_DATA); @@ -225,11 +215,8 @@ static int faraday_raw_pci_write_config(struct faraday_pci *p, int bus_number, { int ret = PCIBIOS_SUCCESSFUL; - writel(PCI_CONF_BUS(bus_number) | - PCI_CONF_DEVICE(PCI_SLOT(fn)) | - PCI_CONF_FUNCTION(PCI_FUNC(fn)) | - PCI_CONF_WHERE(config) | - PCI_CONF_ENABLE, + writel(PCI_CONF1_ADDRESS(bus_number, PCI_SLOT(fn), + PCI_FUNC(fn), config), p->base + FTPCI_CONFIG); switch (size) { -- GitLab From 2301a3e1a5664cf8380d2b8ef051005dc90bc881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Sat, 24 Sep 2022 11:24:04 +0200 
Subject: [PATCH 0710/2223] PCI: mt7621: Use PCI_CONF1_EXT_ADDRESS() macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify pcie-mt7621.c driver code and use new PCI_CONF1_EXT_ADDRESS() macro for accessing PCIe config space. Link: https://lore.kernel.org/r/20220924092404.31776-4-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Acked-by: Sergio Paracuellos --- drivers/pci/controller/pcie-mt7621.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/pci/controller/pcie-mt7621.c b/drivers/pci/controller/pcie-mt7621.c index 33eb37a2225c1..4bd1abf26008f 100644 --- a/drivers/pci/controller/pcie-mt7621.c +++ b/drivers/pci/controller/pcie-mt7621.c @@ -30,6 +30,8 @@ #include #include +#include "../pci.h" + /* MediaTek-specific configuration registers */ #define PCIE_FTS_NUM 0x70c #define PCIE_FTS_NUM_MASK GENMASK(15, 8) @@ -120,19 +122,12 @@ static inline void pcie_port_write(struct mt7621_pcie_port *port, writel_relaxed(val, port->base + reg); } -static inline u32 mt7621_pcie_get_cfgaddr(unsigned int bus, unsigned int slot, - unsigned int func, unsigned int where) -{ - return (((where & 0xf00) >> 8) << 24) | (bus << 16) | (slot << 11) | - (func << 8) | (where & 0xfc) | 0x80000000; -} - static void __iomem *mt7621_pcie_map_bus(struct pci_bus *bus, unsigned int devfn, int where) { struct mt7621_pcie *pcie = bus->sysdata; - u32 address = mt7621_pcie_get_cfgaddr(bus->number, PCI_SLOT(devfn), - PCI_FUNC(devfn), where); + u32 address = PCI_CONF1_EXT_ADDRESS(bus->number, PCI_SLOT(devfn), + PCI_FUNC(devfn), where); writel_relaxed(address, pcie->base + RALINK_PCI_CONFIG_ADDR); @@ -147,7 +142,7 @@ static struct pci_ops mt7621_pcie_ops = { static u32 read_config(struct mt7621_pcie *pcie, unsigned int dev, u32 reg) { - u32 address = mt7621_pcie_get_cfgaddr(0, dev, 0, reg); + u32 address = PCI_CONF1_EXT_ADDRESS(0, dev, 0, reg); pcie_write(pcie, address, 
RALINK_PCI_CONFIG_ADDR); return pcie_read(pcie, RALINK_PCI_CONFIG_DATA); @@ -156,7 +151,7 @@ static u32 read_config(struct mt7621_pcie *pcie, unsigned int dev, u32 reg) static void write_config(struct mt7621_pcie *pcie, unsigned int dev, u32 reg, u32 val) { - u32 address = mt7621_pcie_get_cfgaddr(0, dev, 0, reg); + u32 address = PCI_CONF1_EXT_ADDRESS(0, dev, 0, reg); pcie_write(pcie, address, RALINK_PCI_CONFIG_ADDR); pcie_write(pcie, val, RALINK_PCI_CONFIG_DATA); -- GitLab From 1abbe04a1b55200d0e3e93b2c15058c15126a225 Mon Sep 17 00:00:00 2001 From: Krishna chaitanya chundru Date: Thu, 8 Sep 2022 14:16:16 +0530 Subject: [PATCH 0711/2223] dt-bindings: pci: QCOM Add missing sc7280 aggre0, aggre1 clocks Add missing aggre0 and aggre1 clocks. Link: https://lore.kernel.org/r/1662626776-19636-3-git-send-email-quic_krichai@quicinc.com Signed-off-by: Krishna chaitanya chundru Signed-off-by: Lorenzo Pieralisi Reviewed-by: Krzysztof Kozlowski --- Documentation/devicetree/bindings/pci/qcom,pcie.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie.yaml index 7d29e2a45183e..dd84f1487bedb 100644 --- a/Documentation/devicetree/bindings/pci/qcom,pcie.yaml +++ b/Documentation/devicetree/bindings/pci/qcom,pcie.yaml @@ -54,11 +54,11 @@ properties: # Platform constraints are described later. 
clocks: minItems: 3 - maxItems: 12 + maxItems: 13 clock-names: minItems: 3 - maxItems: 12 + maxItems: 13 resets: minItems: 1 @@ -424,8 +424,8 @@ allOf: then: properties: clocks: - minItems: 11 - maxItems: 11 + minItems: 13 + maxItems: 13 clock-names: items: - const: pipe # PIPE clock @@ -439,6 +439,8 @@ allOf: - const: slave_q2a # Slave Q2A clock - const: tbu # PCIe TBU clock - const: ddrss_sf_tbu # PCIe SF TBU clock + - const: aggre0 # Aggre NoC PCIe CENTER SF AXI clock + - const: aggre1 # Aggre NoC PCIe1 AXI clock resets: maxItems: 1 reset-names: -- GitLab From ca5f21b2574903a7430fcb3590e534d92b2fa816 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 21:06:10 -0300 Subject: [PATCH 0712/2223] vfio: Follow a strict lifetime for struct iommu_group The iommu_group comes from the struct device that a driver has been bound to and then created a struct vfio_device against. To keep the iommu layer sane we want to have a simple rule that only an attached driver should be using the iommu API. Particularly only an attached driver should hold ownership. In VFIO's case since it uses the group APIs and it shares between different drivers it is a bit more complicated, but the principle still holds. Solve this by waiting for all users of the vfio_group to stop before allowing vfio_unregister_group_dev() to complete. This is done with a new completion to know when the users go away and an additional refcount to keep track of how many device drivers are sharing the vfio group. The last driver to be unregistered will clean up the group. This solves crashes in the S390 iommu driver that come because VFIO ends up racing releasing ownership (which attaches the default iommu_domain to the device) with the removal of that same device from the iommu driver. This is a side case that iommu drivers should not have to cope with. 
iommu driver failed to attach the default/blocking domain WARNING: CPU: 0 PID: 5082 at drivers/iommu/iommu.c:1961 iommu_detach_group+0x6c/0x80 Modules linked in: macvtap macvlan tap vfio_pci vfio_pci_core irqbypass vfio_virqfd kvm nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink mlx5_ib sunrpc ib_uverbs ism smc uvdevice ib_core s390_trng eadm_sch tape_3590 tape tape_class vfio_ccw mdev vfio_iommu_type1 vfio zcrypt_cex4 sch_fq_codel configfs ghash_s390 prng chacha_s390 libchacha aes_s390 mlx5_core des_s390 libdes sha3_512_s390 nvme sha3_256_s390 sha512_s390 sha256_s390 sha1_s390 sha_common nvme_core zfcp scsi_transport_fc pkey zcrypt rng_core autofs4 CPU: 0 PID: 5082 Comm: qemu-system-s39 Tainted: G W 6.0.0-rc3 #5 Hardware name: IBM 3931 A01 782 (LPAR) Krnl PSW : 0704c00180000000 000000095bb10d28 (iommu_detach_group+0x70/0x80) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3 Krnl GPRS: 0000000000000001 0000000900000027 0000000000000039 000000095c97ffe0 00000000fffeffff 00000009fc290000 00000000af1fda50 00000000af590b58 00000000af1fdaf0 0000000135c7a320 0000000135e52258 0000000135e52200 00000000a29e8000 00000000af590b40 000000095bb10d24 0000038004b13c98 Krnl Code: 000000095bb10d18: c020003d56fc larl %r2,000000095c2bbb10 000000095bb10d1e: c0e50019d901 brasl %r14,000000095be4bf20 #000000095bb10d24: af000000 mc 0,0 >000000095bb10d28: b904002a lgr %r2,%r10 000000095bb10d2c: ebaff0a00004 lmg %r10,%r15,160(%r15) 000000095bb10d32: c0f4001aa867 brcl 15,000000095be65e00 000000095bb10d38: c004002168e0 brcl 0,000000095bf3def8 000000095bb10d3e: eb6ff0480024 stmg %r6,%r15,72(%r15) Call Trace: [<000000095bb10d28>] iommu_detach_group+0x70/0x80 ([<000000095bb10d24>] iommu_detach_group+0x6c/0x80) [<000003ff80243b0e>] vfio_iommu_type1_detach_group+0x136/0x6c8 [vfio_iommu_type1] [<000003ff80137780>] 
__vfio_group_unset_container+0x58/0x158 [vfio] [<000003ff80138a16>] vfio_group_fops_unl_ioctl+0x1b6/0x210 [vfio] pci 0004:00:00.0: Removing from iommu group 4 [<000000095b5b62e8>] __s390x_sys_ioctl+0xc0/0x100 [<000000095be5d3b4>] __do_syscall+0x1d4/0x200 [<000000095be6c072>] system_call+0x82/0xb0 Last Breaking-Event-Address: [<000000095be4bf80>] __warn_printk+0x60/0x68 It indicates that domain->ops->attach_dev() failed because the driver has already passed the point of destructing the device. Fixes: 9ac8545199a1 ("iommu: Fix use-after-free in iommu_release_device") Reported-by: Matthew Rosato Tested-by: Matthew Rosato Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v2-a3c5f4429e2a+55-iommu_group_lifetime_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.h | 8 +++++ drivers/vfio/vfio_main.c | 68 ++++++++++++++++++++++++++-------------- 2 files changed, 53 insertions(+), 23 deletions(-) diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 56fab31f8e0ff..039e3208d286f 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -41,7 +41,15 @@ enum vfio_group_type { struct vfio_group { struct device dev; struct cdev cdev; + /* + * When drivers is non-zero a driver is attached to the struct device + * that provided the iommu_group and thus the iommu_group is a valid + * pointer. When drivers is 0 the driver is being detached. Once users + * reaches 0 then the iommu_group is invalid. 
+ */ + refcount_t drivers; refcount_t users; + struct completion users_comp; unsigned int container_users; struct iommu_group *iommu_group; struct vfio_container *container; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index af5945c71c417..f19171cad9a25 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -125,8 +125,6 @@ static void vfio_release_device_set(struct vfio_device *device) xa_unlock(&vfio_device_set_xa); } -static void vfio_group_get(struct vfio_group *group); - /* * Group objects - create, release, get, put, search */ @@ -137,7 +135,7 @@ __vfio_group_get_from_iommu(struct iommu_group *iommu_group) list_for_each_entry(group, &vfio.group_list, vfio_next) { if (group->iommu_group == iommu_group) { - vfio_group_get(group); + refcount_inc(&group->drivers); return group; } } @@ -189,6 +187,8 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, group->cdev.owner = THIS_MODULE; refcount_set(&group->users, 1); + refcount_set(&group->drivers, 1); + init_completion(&group->users_comp); init_rwsem(&group->group_rwsem); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); @@ -247,8 +247,41 @@ err_put: static void vfio_group_put(struct vfio_group *group) { - if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock)) + if (refcount_dec_and_test(&group->users)) + complete(&group->users_comp); +} + +static void vfio_device_remove_group(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + + if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) + iommu_group_remove_device(device->dev); + + /* Pairs with vfio_create_group() / vfio_group_get_from_iommu() */ + if (!refcount_dec_and_mutex_lock(&group->drivers, &vfio.group_lock)) return; + list_del(&group->vfio_next); + + /* + * We could concurrently probe another driver in the group that might + * race vfio_device_remove_group() with vfio_get_group(), so we have to + * ensure that the sysfs is 
all cleaned up under lock otherwise the + * cdev_device_add() will fail due to the name already existing. + */ + cdev_device_del(&group->cdev, &group->dev); + mutex_unlock(&vfio.group_lock); + + /* Matches the get from vfio_group_alloc() */ + vfio_group_put(group); + + /* + * Before we allow the last driver in the group to be unplugged the + * group must be sanitized so nothing else is or can reference it. This + * is because the group->iommu_group pointer should only be used so long + * as a device driver is attached to a device in the group. + */ + wait_for_completion(&group->users_comp); /* * These data structures all have paired operations that can only be @@ -259,19 +292,11 @@ static void vfio_group_put(struct vfio_group *group) WARN_ON(!list_empty(&group->device_list)); WARN_ON(group->container || group->container_users); WARN_ON(group->notifier.head); - - list_del(&group->vfio_next); - cdev_device_del(&group->cdev, &group->dev); - mutex_unlock(&vfio.group_lock); + group->iommu_group = NULL; put_device(&group->dev); } -static void vfio_group_get(struct vfio_group *group) -{ - refcount_inc(&group->users); -} - /* * Device objects - create, release, get, put, search */ @@ -494,6 +519,10 @@ static int __vfio_register_dev(struct vfio_device *device, struct vfio_device *existing_device; int ret; + /* + * In all cases group is the output of one of the group allocation + * functions and we have group->drivers incremented for us.
+ */ if (IS_ERR(group)) return PTR_ERR(group); @@ -533,10 +562,7 @@ static int __vfio_register_dev(struct vfio_device *device, return 0; err_out: - if (group->type == VFIO_NO_IOMMU || - group->type == VFIO_EMULATED_IOMMU) - iommu_group_remove_device(device->dev); - vfio_group_put(group); + vfio_device_remove_group(device); return ret; } @@ -627,11 +653,7 @@ void vfio_unregister_group_dev(struct vfio_device *device) /* Balances device_add in register path */ device_del(&device->device); - if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) - iommu_group_remove_device(device->dev); - - /* Matches the get in vfio_register_group_dev() */ - vfio_group_put(group); + vfio_device_remove_group(device); } EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); @@ -884,7 +906,7 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) down_write(&group->group_rwsem); - /* users can be zero if this races with vfio_group_put() */ + /* users can be zero if this races with vfio_device_remove_group() */ if (!refcount_inc_not_zero(&group->users)) { ret = -ENODEV; goto err_unlock; -- GitLab From 948f5ada58b552d975d1937a3f5939414f28cacb Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Mon, 26 Sep 2022 17:33:28 +0800 Subject: [PATCH 0713/2223] hisi_acc_vfio_pci: Fixes error return code issue During the process of compatibility and matching of live migration device information, if the isolation status of the two devices is inconsistent, the live migration needs to be exited. The current driver does not return the error code correctly and needs to be fixed. 
Reviewed-by: Shameer Kolothum Reviewed-by: Jason Gunthorpe Signed-off-by: Longfang Liu Link: https://lore.kernel.org/r/20220926093332.28824-2-liulongfang@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 47174e2b61bd3..4ef9761ef4675 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -397,7 +397,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, if (vf_data->que_iso_cfg != que_iso_state) { dev_err(dev, "failed to match isolation state\n"); - return ret; + return -EINVAL; } ret = qm_write_regs(vf_qm, QM_VF_STATE, &vf_data->vf_qm_state, 1); -- GitLab From 008e5e996f425f64c21755ebe77201895bbee3b8 Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Mon, 26 Sep 2022 17:33:29 +0800 Subject: [PATCH 0714/2223] hisi_acc_vfio_pci: Fix device data address combination problem The queue address of the accelerator device should be combined into a dma address in a way of combining the low and high bits. The previous combination is wrong and needs to be modified. Reviewed-by: Jason Gunthorpe Signed-off-by: Longfang Liu Link: https://lore.kernel.org/r/20220926093332.28824-3-liulongfang@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 4ef9761ef4675..fbe72ce173de0 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -520,12 +520,12 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev, return -EINVAL; /* Every reg is 32 bit, the dma address is 64 bit. 
*/ - vf_data->eqe_dma = vf_data->qm_eqc_dw[2]; + vf_data->eqe_dma = vf_data->qm_eqc_dw[1]; vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET; - vf_data->eqe_dma |= vf_data->qm_eqc_dw[1]; - vf_data->aeqe_dma = vf_data->qm_aeqc_dw[2]; + vf_data->eqe_dma |= vf_data->qm_eqc_dw[0]; + vf_data->aeqe_dma = vf_data->qm_aeqc_dw[1]; vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET; - vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[1]; + vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[0]; /* Through SQC_BT/CQC_BT to get sqc and cqc address */ ret = qm_get_sqc(vf_qm, &vf_data->sqc_dma); -- GitLab From af72f53c1b4e9614448b5d4e7b39d30d3339e3f7 Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Mon, 26 Sep 2022 17:33:30 +0800 Subject: [PATCH 0715/2223] hisi_acc_vfio_pci: Remove useless function parameter Remove unused function parameters for vf_qm_fun_reset() and ensure the device is enabled before the reset operation is performed. Reviewed-by: Jason Gunthorpe Signed-off-by: Longfang Liu Link: https://lore.kernel.org/r/20220926093332.28824-4-liulongfang@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index fbe72ce173de0..c07ed7b0ccf15 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -337,8 +337,7 @@ static int vf_qm_cache_wb(struct hisi_qm *qm) return 0; } -static void vf_qm_fun_reset(struct hisi_acc_vf_core_device *hisi_acc_vdev, - struct hisi_qm *qm) +static void vf_qm_fun_reset(struct hisi_qm *qm) { int i; @@ -662,7 +661,10 @@ static void hisi_acc_vf_start_device(struct hisi_acc_vf_core_device *hisi_acc_vd if (hisi_acc_vdev->vf_qm_state != QM_READY) return; - vf_qm_fun_reset(hisi_acc_vdev, vf_qm); + /* Make sure the device is enabled */ + qm_dev_cmd_init(vf_qm); + + vf_qm_fun_reset(vf_qm); } static int hisi_acc_vf_load_state(struct 
hisi_acc_vf_core_device *hisi_acc_vdev) -- GitLab From 3b7cfba0d873e8a26ac4ec5848dcb7c93098cfab Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Mon, 26 Sep 2022 17:33:31 +0800 Subject: [PATCH 0716/2223] hisi_acc_vfio_pci: Remove useless macro definitions The QM_QUE_ISO_CFG macro definition is no longer used and needs to be deleted from the current driver. Reviewed-by: Jason Gunthorpe Signed-off-by: Longfang Liu Link: https://lore.kernel.org/r/20220926093332.28824-5-liulongfang@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 5494f4983bbe4..8e4bf21deae1a 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -16,7 +16,6 @@ #define SEC_CORE_INT_STATUS 0x301008 #define HPRE_HAC_INT_STATUS 0x301800 #define HZIP_CORE_INT_STATUS 0x3010AC -#define QM_QUE_ISO_CFG 0x301154 #define QM_VFT_CFG_RDY 0x10006c #define QM_VFT_CFG_OP_WR 0x100058 -- GitLab From 42e1d1eed20a17c6cbb1d600c77a6ca69a632d4c Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Mon, 26 Sep 2022 17:33:32 +0800 Subject: [PATCH 0717/2223] hisi_acc_vfio_pci: Update some log and comment formats 1. Modify some annotation information formats to keep the entire driver annotation format consistent. 2. Modify some log description formats to be consistent with the format of the entire driver log. 
Reviewed-by: Jason Gunthorpe Signed-off-by: Longfang Liu Link: https://lore.kernel.org/r/20220926093332.28824-6-liulongfang@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 18 +++++++++--------- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index c07ed7b0ccf15..39eeca18a0f7c 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -16,7 +16,7 @@ #include "hisi_acc_vfio_pci.h" -/* return 0 on VM acc device ready, -ETIMEDOUT hardware timeout */ +/* Return 0 on VM acc device ready, -ETIMEDOUT hardware timeout */ static int qm_wait_dev_not_ready(struct hisi_qm *qm) { u32 val; @@ -189,7 +189,7 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) struct device *dev = &qm->pdev->dev; int ret; - /* check VF state */ + /* Check VF state */ if (unlikely(hisi_qm_wait_mb_ready(qm))) { dev_err(&qm->pdev->dev, "QM device is not ready to write\n"); return -EBUSY; @@ -373,7 +373,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, return -EINVAL; } - /* vf qp num check */ + /* VF qp num check */ ret = qm_get_vft(vf_qm, &vf_qm->qp_base); if (ret <= 0) { dev_err(dev, "failed to get vft qp nums\n"); @@ -387,7 +387,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, vf_qm->qp_num = ret; - /* vf isolation state check */ + /* VF isolation state check */ ret = qm_read_regs(pf_qm, QM_QUE_ISO_CFG_V, &que_iso_state, 1); if (ret) { dev_err(dev, "failed to read QM_QUE_ISO_CFG_V\n"); @@ -418,10 +418,10 @@ static int vf_qm_get_match_data(struct hisi_acc_vf_core_device *hisi_acc_vdev, int ret; vf_data->acc_magic = ACC_DEV_MAGIC; - /* save device id */ + /* Save device id */ vf_data->dev_id = hisi_acc_vdev->vf_dev->device; - /* vf qp num save from 
PF */ + /* VF qp num save from PF */ ret = pf_qm_get_qp_num(pf_qm, vf_id, &vf_data->qp_base); if (ret <= 0) { dev_err(dev, "failed to get vft qp nums!\n"); @@ -465,19 +465,19 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev, ret = qm_set_regs(qm, vf_data); if (ret) { - dev_err(dev, "Set VF regs failed\n"); + dev_err(dev, "set VF regs failed\n"); return ret; } ret = hisi_qm_mb(qm, QM_MB_CMD_SQC_BT, qm->sqc_dma, 0, 0); if (ret) { - dev_err(dev, "Set sqc failed\n"); + dev_err(dev, "set sqc failed\n"); return ret; } ret = hisi_qm_mb(qm, QM_MB_CMD_CQC_BT, qm->cqc_dma, 0, 0); if (ret) { - dev_err(dev, "Set cqc failed\n"); + dev_err(dev, "set cqc failed\n"); return ret; } diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 8e4bf21deae1a..67343325b3201 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -79,7 +79,7 @@ struct acc_vf_data { /* QM reserved 5 regs */ u32 qm_rsv_regs[5]; u32 padding; - /* qm memory init information */ + /* QM memory init information */ u64 eqe_dma; u64 aeqe_dma; u64 sqc_dma; @@ -98,7 +98,7 @@ struct hisi_acc_vf_migration_file { struct hisi_acc_vf_core_device { struct vfio_pci_core_device core_device; u8 deferred_reset:1; - /* for migration state */ + /* For migration state */ struct mutex state_mutex; enum vfio_device_mig_state mig_state; struct pci_dev *pf_dev; @@ -107,7 +107,7 @@ struct hisi_acc_vf_core_device { struct hisi_qm vf_qm; u32 vf_qm_state; int vf_id; - /* for reset handler */ + /* For reset handler */ spinlock_t reset_lock; struct hisi_acc_vf_migration_file *resuming_migf; struct hisi_acc_vf_migration_file *saving_migf; -- GitLab From 2b7672b0fa0b833312aef5a366a741921af3634f Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 27 Sep 2022 08:42:36 -0700 Subject: [PATCH 0718/2223] Input: twl4030-pwrbutton - add missing of.h include The driver is using of_match_ptr() and 
therefore needs to include of.h header. Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220927052217.2784593-1-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/twl4030-pwrbutton.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/misc/twl4030-pwrbutton.c b/drivers/input/misc/twl4030-pwrbutton.c index b307cca170222..e3ee0638ffbaf 100644 --- a/drivers/input/misc/twl4030-pwrbutton.c +++ b/drivers/input/misc/twl4030-pwrbutton.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include -- GitLab From 9dedc915937c33302df7fcab01c45e7936d6195a Mon Sep 17 00:00:00 2001 From: zhang songyi Date: Tue, 27 Sep 2022 08:56:06 -0700 Subject: [PATCH 0719/2223] Input: synaptics-rmi4 - convert to use sysfs_emit() APIs Follow the advice of the Documentation/filesystems/sysfs.rst and show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Reported-by: Zeal Robot Signed-off-by: zhang songyi Link: https://lore.kernel.org/r/20220927070936.258300-1-zhang.songyi@zte.com.cn Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/rmi_f34.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/input/rmi4/rmi_f34.c b/drivers/input/rmi4/rmi_f34.c index 30169b584573c..0d9a5756e3f59 100644 --- a/drivers/input/rmi4/rmi_f34.c +++ b/drivers/input/rmi4/rmi_f34.c @@ -321,13 +321,13 @@ static ssize_t rmi_driver_bootloader_id_show(struct device *dev, f34 = dev_get_drvdata(&fn->dev); if (f34->bl_version == 5) - return scnprintf(buf, PAGE_SIZE, "%c%c\n", - f34->bootloader_id[0], - f34->bootloader_id[1]); + return sysfs_emit(buf, "%c%c\n", + f34->bootloader_id[0], + f34->bootloader_id[1]); else - return scnprintf(buf, PAGE_SIZE, "V%d.%d\n", - f34->bootloader_id[1], - f34->bootloader_id[0]); + return sysfs_emit(buf, "V%d.%d\n", + f34->bootloader_id[1], + f34->bootloader_id[0]); } return 0; @@ -346,7 +346,7 @@ static ssize_t 
rmi_driver_configuration_id_show(struct device *dev, if (fn) { f34 = dev_get_drvdata(&fn->dev); - return scnprintf(buf, PAGE_SIZE, "%s\n", f34->configuration_id); + return sysfs_emit(buf, "%s\n", f34->configuration_id); } return 0; @@ -499,7 +499,7 @@ static ssize_t rmi_driver_update_fw_status_show(struct device *dev, if (data->f34_container) update_status = rmi_f34_status(data->f34_container); - return scnprintf(buf, PAGE_SIZE, "%d\n", update_status); + return sysfs_emit(buf, "%d\n", update_status); } static DEVICE_ATTR(update_fw_status, 0444, -- GitLab From a0d49a8f77f26609036a05e7832393b6279554db Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Tue, 27 Sep 2022 12:19:13 +0100 Subject: [PATCH 0720/2223] dt-bindings: riscv: microchip: document icicle reference design The icicle kit reference design's v2022.09 release made some changes to the memory map - including adding the ability to read the fabric clock controllers via the system controller bus & making the PCI controller work with upstream Linux. While the PCI was not working in the v2022.03 design, so nothing is broken there in terms of backwards compatibility, the fabric clocks used in the v2022.03 design were chosen by the individual run of the synthesis tool. In the v2022.09 reference design, the clocks are fixed to use the "north west" fabric Clock Conditioning Circuitry. In the v2022.10 release, the memory map on the DDR side is also changing, so to avoid making a breaking change here twice, jump over the v2022.09 release and straight to the v2022.10 one. Make use of a new compatible to denote that v2022.{09,10} reference design releases are not backwards compatible. 
Acked-by: Krzysztof Kozlowski Signed-off-by: Conor Dooley --- .../devicetree/bindings/riscv/microchip.yaml | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/riscv/microchip.yaml b/Documentation/devicetree/bindings/riscv/microchip.yaml index 1aa7336a9672f..5c1ad21080497 100644 --- a/Documentation/devicetree/bindings/riscv/microchip.yaml +++ b/Documentation/devicetree/bindings/riscv/microchip.yaml @@ -17,12 +17,18 @@ properties: $nodename: const: '/' compatible: - items: - - enum: - - microchip,mpfs-icicle-kit - - microchip,mpfs-icicle-reference-rtlv2203 - - sundance,polarberry - - const: microchip,mpfs + oneOf: + - items: + - enum: + - microchip,mpfs-icicle-reference-rtlv2203 + - microchip,mpfs-icicle-reference-rtlv2210 + - const: microchip,mpfs-icicle-kit + - const: microchip,mpfs + + - items: + - enum: + - sundance,polarberry + - const: microchip,mpfs additionalProperties: true -- GitLab From 0ebdc51787dbb8ef8d259daa98b8fd35babf8970 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Tue, 27 Sep 2022 12:19:14 +0100 Subject: [PATCH 0721/2223] dt-bindings: riscv: microchip: document the aries m100pfsevp Add a compatible for the Aries Embedded M100PFSEVP SOM + EVK platform. 
Link: https://www.aries-embedded.com/polarfire-soc-fpga-microsemi-m100pfs-som-mpfs025t-pcie-serdes Signed-off-by: Conor Dooley Acked-by: Rob Herring --- Documentation/devicetree/bindings/riscv/microchip.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/riscv/microchip.yaml b/Documentation/devicetree/bindings/riscv/microchip.yaml index 5c1ad21080497..681cedc5578fb 100644 --- a/Documentation/devicetree/bindings/riscv/microchip.yaml +++ b/Documentation/devicetree/bindings/riscv/microchip.yaml @@ -27,6 +27,7 @@ properties: - items: - enum: + - aries,m100pfsevp - sundance,polarberry - const: microchip,mpfs -- GitLab From db3d481698efe8a7a943ecbba9491648c5a49ef3 Mon Sep 17 00:00:00 2001 From: Shravan Chippa Date: Tue, 27 Sep 2022 12:19:15 +0100 Subject: [PATCH 0722/2223] dt-bindings: riscv: microchip: document the sev kit Update devicetree bindings document with PolarFire SoC Video Kit, known by its "sev-kit" product code. Link: https://onlinedocs.microchip.com/pr/GUID-404D3738-DC76-46BA-8683-6A77E837C2DD-en-US-1/index.html?GUID-065AEBEE-7B2C-4895-8579-B1D73D797F06 Signed-off-by: Shravan Chippa Reviewed-by: Krzysztof Kozlowski Signed-off-by: Conor Dooley --- Documentation/devicetree/bindings/riscv/microchip.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/riscv/microchip.yaml b/Documentation/devicetree/bindings/riscv/microchip.yaml index 681cedc5578fb..2b8c6a695e991 100644 --- a/Documentation/devicetree/bindings/riscv/microchip.yaml +++ b/Documentation/devicetree/bindings/riscv/microchip.yaml @@ -28,6 +28,7 @@ properties: - items: - enum: - aries,m100pfsevp + - microchip,mpfs-sev-kit - sundance,polarberry - const: microchip,mpfs -- GitLab From f890e67f292db46c9bd5b5c004ba0f98761d1a33 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Tue, 27 Sep 2022 12:19:16 +0100 Subject: [PATCH 0723/2223] riscv: dts: microchip: add pci dma ranges for the icicle kit The recently removed, accidentally 
included, "matr0" property was used in place of a dma-ranges property. The PCI controller is non-functional with mainline Linux in the v2022.02 or later reference designs and has not worked without configuration of address-translation since v2021.08. Add the address translation that will be used by the v2022.09 reference design & update the compatible used by the dts. Since this change is not backwards compatible, update the compatible to denote this, jumping over v2022.09 directly to v2022.10. Signed-off-by: Conor Dooley --- arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi | 7 ++++++- arch/riscv/boot/dts/microchip/mpfs-icicle-kit.dts | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi index 0d28858b83f28..eec5aba434363 100644 --- a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi +++ b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi @@ -2,7 +2,8 @@ /* Copyright (c) 2020-2021 Microchip Technology Inc */ / { - compatible = "microchip,mpfs-icicle-reference-rtlv2203", "microchip,mpfs"; + compatible = "microchip,mpfs-icicle-reference-rtlv2210", "microchip,mpfs-icicle-kit", + "microchip,mpfs"; core_pwm0: pwm@41000000 { compatible = "microchip,corepwm-rtl-v4"; @@ -37,3 +38,7 @@ clock-frequency = <125000000>; }; }; + +&pcie { + dma-ranges = <0x02000000 0x0 0x00000000 0x0 0x00000000 0x1 0x00000000>; +}; diff --git a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit.dts b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit.dts index 044982a11df50..42d350fe6c6b9 100644 --- a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit.dts +++ b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit.dts @@ -11,7 +11,8 @@ / { model = "Microchip PolarFire-SoC Icicle Kit"; - compatible = "microchip,mpfs-icicle-kit", "microchip,mpfs"; + compatible = "microchip,mpfs-icicle-reference-rtlv2210", "microchip,mpfs-icicle-kit", + "microchip,mpfs"; aliases { 
ethernet0 = &mac1; -- GitLab From 99d451a7db1624308bc9eb94b7befb3722f67b10 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Tue, 27 Sep 2022 12:19:17 +0100 Subject: [PATCH 0724/2223] riscv: dts: microchip: move the mpfs' pci node to -fabric.dtsi In today's edition of moving things around: The PCIe root port on PolarFire SoC is more part of the FPGA than of the Core Complex. It is located on the other side of the chip and, apart from its interrupts, most of its configuration is determined by the FPGA bitstream rather. This includes: - address translation in both directions - the addresses at which the config and data regions appear to the core complex - the clocks used by the AXI bus - the plic interrupt used Moving the PCIe node to the -fabric.dtsi makes it clearer than a singular configuration for root port is not correct & allows the base SoC dtsi to be more easily included. Signed-off-by: Conor Dooley --- .../dts/microchip/mpfs-icicle-kit-fabric.dtsi | 32 +++++++++++++++++-- .../dts/microchip/mpfs-polarberry-fabric.dtsi | 29 +++++++++++++++++ arch/riscv/boot/dts/microchip/mpfs.dtsi | 30 ----------------- 3 files changed, 58 insertions(+), 33 deletions(-) diff --git a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi index eec5aba434363..688ef0fc5a646 100644 --- a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi +++ b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi @@ -37,8 +37,34 @@ #clock-cells = <0>; clock-frequency = <125000000>; }; -}; -&pcie { - dma-ranges = <0x02000000 0x0 0x00000000 0x0 0x00000000 0x1 0x00000000>; + pcie: pcie@2000000000 { + compatible = "microchip,pcie-host-1.0"; + #address-cells = <0x3>; + #interrupt-cells = <0x1>; + #size-cells = <0x2>; + device_type = "pci"; + reg = <0x20 0x0 0x0 0x8000000>, <0x0 0x43000000 0x0 0x10000>; + reg-names = "cfg", "apb"; + bus-range = <0x0 0x7f>; + interrupt-parent = <&plic>; + interrupts = <119>; + interrupt-map = <0 0 
0 1 &pcie_intc 0>, + <0 0 0 2 &pcie_intc 1>, + <0 0 0 3 &pcie_intc 2>, + <0 0 0 4 &pcie_intc 3>; + interrupt-map-mask = <0 0 0 7>; + clocks = <&fabric_clk1>, <&fabric_clk1>, <&fabric_clk3>; + clock-names = "fic0", "fic1", "fic3"; + ranges = <0x3000000 0x0 0x8000000 0x20 0x8000000 0x0 0x80000000>; + dma-ranges = <0x02000000 0x0 0x00000000 0x0 0x00000000 0x1 0x00000000>; + msi-parent = <&pcie>; + msi-controller; + status = "disabled"; + pcie_intc: interrupt-controller { + #address-cells = <0>; + #interrupt-cells = <1>; + interrupt-controller; + }; + }; }; diff --git a/arch/riscv/boot/dts/microchip/mpfs-polarberry-fabric.dtsi b/arch/riscv/boot/dts/microchip/mpfs-polarberry-fabric.dtsi index 49380c428ec91..67303bc0e451b 100644 --- a/arch/riscv/boot/dts/microchip/mpfs-polarberry-fabric.dtsi +++ b/arch/riscv/boot/dts/microchip/mpfs-polarberry-fabric.dtsi @@ -13,4 +13,33 @@ #clock-cells = <0>; clock-frequency = <125000000>; }; + + pcie: pcie@2000000000 { + compatible = "microchip,pcie-host-1.0"; + #address-cells = <0x3>; + #interrupt-cells = <0x1>; + #size-cells = <0x2>; + device_type = "pci"; + reg = <0x20 0x0 0x0 0x8000000>, <0x0 0x43000000 0x0 0x10000>; + reg-names = "cfg", "apb"; + bus-range = <0x0 0x7f>; + interrupt-parent = <&plic>; + interrupts = <119>; + interrupt-map = <0 0 0 1 &pcie_intc 0>, + <0 0 0 2 &pcie_intc 1>, + <0 0 0 3 &pcie_intc 2>, + <0 0 0 4 &pcie_intc 3>; + interrupt-map-mask = <0 0 0 7>; + clocks = <&fabric_clk1>, <&fabric_clk1>, <&fabric_clk3>; + clock-names = "fic0", "fic1", "fic3"; + ranges = <0x3000000 0x0 0x8000000 0x20 0x8000000 0x0 0x80000000>; + msi-parent = <&pcie>; + msi-controller; + status = "disabled"; + pcie_intc: interrupt-controller { + #address-cells = <0>; + #interrupt-cells = <1>; + interrupt-controller; + }; + }; }; diff --git a/arch/riscv/boot/dts/microchip/mpfs.dtsi b/arch/riscv/boot/dts/microchip/mpfs.dtsi index 45e3cc6598825..79fd8dfce96f1 100644 --- a/arch/riscv/boot/dts/microchip/mpfs.dtsi +++ 
b/arch/riscv/boot/dts/microchip/mpfs.dtsi @@ -464,36 +464,6 @@ status = "disabled"; }; - pcie: pcie@2000000000 { - compatible = "microchip,pcie-host-1.0"; - #address-cells = <0x3>; - #interrupt-cells = <0x1>; - #size-cells = <0x2>; - device_type = "pci"; - reg = <0x20 0x0 0x0 0x8000000>, <0x0 0x43000000 0x0 0x10000>; - reg-names = "cfg", "apb"; - bus-range = <0x0 0x7f>; - interrupt-parent = <&plic>; - interrupts = <119>; - interrupt-map = <0 0 0 1 &pcie_intc 0>, - <0 0 0 2 &pcie_intc 1>, - <0 0 0 3 &pcie_intc 2>, - <0 0 0 4 &pcie_intc 3>; - interrupt-map-mask = <0 0 0 7>; - clocks = <&fabric_clk1>, <&fabric_clk1>, <&fabric_clk3>; - clock-names = "fic0", "fic1", "fic3"; - ranges = <0x3000000 0x0 0x8000000 0x20 0x8000000 0x0 0x80000000>; - msi-parent = <&pcie>; - msi-controller; - microchip,axi-m-atr0 = <0x10 0x0>; - status = "disabled"; - pcie_intc: legacy-interrupt-controller { - #address-cells = <0>; - #interrupt-cells = <1>; - interrupt-controller; - }; - }; - mbox: mailbox@37020000 { compatible = "microchip,mpfs-mailbox"; reg = <0x0 0x37020000 0x0 0x1000>, <0x0 0x2000318C 0x0 0x40>; -- GitLab From 6fc655ed4986f88b91e3f7b339222fc1c4ffba08 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Tue, 27 Sep 2022 12:19:18 +0100 Subject: [PATCH 0725/2223] riscv: dts: microchip: icicle: update pci address properties For the v2022.09 reference design the PCI root port's data region has been moved to FIC1 from FIC0. This is a shorter path, allowing for higher clock rates and improved through-put. As a result, the address at which the PCIe's data region appears to the core complex has changed. The config region's address is unchanged. As FIC0 is no longer used, its clock can be removed too. 
Signed-off-by: Conor Dooley --- .../boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi index 688ef0fc5a646..9ca2ac4ad8e27 100644 --- a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi +++ b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi @@ -38,13 +38,13 @@ clock-frequency = <125000000>; }; - pcie: pcie@2000000000 { + pcie: pcie@3000000000 { compatible = "microchip,pcie-host-1.0"; #address-cells = <0x3>; #interrupt-cells = <0x1>; #size-cells = <0x2>; device_type = "pci"; - reg = <0x20 0x0 0x0 0x8000000>, <0x0 0x43000000 0x0 0x10000>; + reg = <0x30 0x0 0x0 0x8000000>, <0x0 0x43000000 0x0 0x10000>; reg-names = "cfg", "apb"; bus-range = <0x0 0x7f>; interrupt-parent = <&plic>; @@ -54,9 +54,9 @@ <0 0 0 3 &pcie_intc 2>, <0 0 0 4 &pcie_intc 3>; interrupt-map-mask = <0 0 0 7>; - clocks = <&fabric_clk1>, <&fabric_clk1>, <&fabric_clk3>; - clock-names = "fic0", "fic1", "fic3"; - ranges = <0x3000000 0x0 0x8000000 0x20 0x8000000 0x0 0x80000000>; + clocks = <&fabric_clk1>, <&fabric_clk3>; + clock-names = "fic1", "fic3"; + ranges = <0x3000000 0x0 0x8000000 0x30 0x8000000 0x0 0x80000000>; dma-ranges = <0x02000000 0x0 0x00000000 0x0 0x00000000 0x1 0x00000000>; msi-parent = <&pcie>; msi-controller; -- GitLab From ab291621a8b85269496ae9a964b6d49cd1e030c8 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Tue, 27 Sep 2022 12:19:19 +0100 Subject: [PATCH 0726/2223] riscv: dts: microchip: icicle: re-jig fabric peripheral addresses When users try to add onto the reference design, they find that the current addresses that peripherals connected to Fabric InterConnect (FIC) 3 use are restrictive. For the v2022.09 reference design, the peripherals have been shifted down, leaving more contiguous address space for their custom IP/peripherals. 
Signed-off-by: Conor Dooley --- arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi index 9ca2ac4ad8e27..35030ea330ee1 100644 --- a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi +++ b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi @@ -5,18 +5,18 @@ compatible = "microchip,mpfs-icicle-reference-rtlv2210", "microchip,mpfs-icicle-kit", "microchip,mpfs"; - core_pwm0: pwm@41000000 { + core_pwm0: pwm@40000000 { compatible = "microchip,corepwm-rtl-v4"; - reg = <0x0 0x41000000 0x0 0xF0>; + reg = <0x0 0x40000000 0x0 0xF0>; microchip,sync-update-mask = /bits/ 32 <0>; #pwm-cells = <2>; clocks = <&fabric_clk3>; status = "disabled"; }; - i2c2: i2c@44000000 { + i2c2: i2c@40000200 { compatible = "microchip,corei2c-rtl-v7"; - reg = <0x0 0x44000000 0x0 0x1000>; + reg = <0x0 0x40000200 0x0 0x1000>; #address-cells = <1>; #size-cells = <0>; clocks = <&fabric_clk3>; -- GitLab From fa52935abef422d119dda3c10c02787a86e6289d Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Tue, 27 Sep 2022 12:19:20 +0100 Subject: [PATCH 0727/2223] riscv: dts: microchip: reduce the fic3 clock rate For the v2022.09 release of the reference design, the fic3 clock rate has been reduced from 62.5 MHz to 50 MHz as it allows timing to be closed significantly more quickly by customers who chose to build the reference design themselves.
Signed-off-by: Conor Dooley --- arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi index 35030ea330ee1..b6bfe177ccb28 100644 --- a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi +++ b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi @@ -29,7 +29,7 @@ fabric_clk3: fabric-clk3 { compatible = "fixed-clock"; #clock-cells = <0>; - clock-frequency = <62500000>; + clock-frequency = <50000000>; }; fabric_clk1: fabric-clk1 { -- GitLab From 978a17d1a688db025275d282665ab3f39407191d Mon Sep 17 00:00:00 2001 From: Vattipalli Praveen Date: Tue, 27 Sep 2022 12:19:21 +0100 Subject: [PATCH 0728/2223] riscv: dts: microchip: add sevkit device tree Add a basic dts for the Microchip Smart Embedded Vision dev kit. The SEV kit is an upcoming first party board, featuring an MPFS250T and: - Dual Sony Camera Sensors (IMX334) - IEEE 802.11 b/g/n 20MHz (1x1) Wi-Fi - Bluetooth 5 Low Energy - 4 GB DDR4 x64 - 2 GB LPDDR4 x32 - 1 GB SPI Flash - 8 GB eMMC flash & SD card slot (multiplexed) - HDMI2.0 Video Input/Output - MIPI DSI Output - MIPI CSI-2 Input Link: https://onlinedocs.microchip.com/pr/GUID-404D3738-DC76-46BA-8683-6A77E837C2DD-en-US-1/index.html?GUID-065AEBEE-7B2C-4895-8579-B1D73D797F06 Signed-off-by: Vattipalli Praveen Signed-off-by: Conor Dooley --- arch/riscv/boot/dts/microchip/Makefile | 1 + .../dts/microchip/mpfs-sev-kit-fabric.dtsi | 45 ++++++ .../riscv/boot/dts/microchip/mpfs-sev-kit.dts | 145 ++++++++++++++++++ 3 files changed, 191 insertions(+) create mode 100644 arch/riscv/boot/dts/microchip/mpfs-sev-kit-fabric.dtsi create mode 100644 arch/riscv/boot/dts/microchip/mpfs-sev-kit.dts diff --git a/arch/riscv/boot/dts/microchip/Makefile b/arch/riscv/boot/dts/microchip/Makefile index 39aae7b04f1cb..f18477b2e86d7 100644 --- a/arch/riscv/boot/dts/microchip/Makefile +++ 
b/arch/riscv/boot/dts/microchip/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 dtb-$(CONFIG_SOC_MICROCHIP_POLARFIRE) += mpfs-icicle-kit.dtb dtb-$(CONFIG_SOC_MICROCHIP_POLARFIRE) += mpfs-polarberry.dtb +dtb-$(CONFIG_SOC_MICROCHIP_POLARFIRE) += mpfs-sev-kit.dtb obj-$(CONFIG_BUILTIN_DTB) += $(addsuffix .o, $(dtb-y)) diff --git a/arch/riscv/boot/dts/microchip/mpfs-sev-kit-fabric.dtsi b/arch/riscv/boot/dts/microchip/mpfs-sev-kit-fabric.dtsi new file mode 100644 index 0000000000000..8545baf4d1290 --- /dev/null +++ b/arch/riscv/boot/dts/microchip/mpfs-sev-kit-fabric.dtsi @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Copyright (c) 2022 Microchip Technology Inc */ + +/ { + fabric_clk3: fabric-clk3 { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <0>; + }; + + fabric_clk1: fabric-clk1 { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <125000000>; + }; + + pcie: pcie@2000000000 { + compatible = "microchip,pcie-host-1.0"; + #address-cells = <0x3>; + #interrupt-cells = <0x1>; + #size-cells = <0x2>; + device_type = "pci"; + reg = <0x20 0x0 0x0 0x8000000>, <0x0 0x43000000 0x0 0x10000>; + reg-names = "cfg", "apb"; + bus-range = <0x0 0x7f>; + interrupt-parent = <&plic>; + interrupts = <119>; + interrupt-map = <0 0 0 1 &pcie_intc 0>, + <0 0 0 2 &pcie_intc 1>, + <0 0 0 3 &pcie_intc 2>, + <0 0 0 4 &pcie_intc 3>; + interrupt-map-mask = <0 0 0 7>; + clocks = <&fabric_clk1>, <&fabric_clk1>, <&fabric_clk3>; + clock-names = "fic0", "fic1", "fic3"; + ranges = <0x3000000 0x0 0x8000000 0x20 0x8000000 0x0 0x80000000>; + msi-parent = <&pcie>; + msi-controller; + status = "disabled"; + pcie_intc: interrupt-controller { + #address-cells = <0>; + #interrupt-cells = <1>; + interrupt-controller; + }; + }; +}; diff --git a/arch/riscv/boot/dts/microchip/mpfs-sev-kit.dts b/arch/riscv/boot/dts/microchip/mpfs-sev-kit.dts new file mode 100644 index 0000000000000..013cb666c72da --- /dev/null +++ 
b/arch/riscv/boot/dts/microchip/mpfs-sev-kit.dts @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Copyright (c) 2022 Microchip Technology Inc */ + +/dts-v1/; + +#include "mpfs.dtsi" +#include "mpfs-sev-kit-fabric.dtsi" + +/* Clock frequency (in Hz) of the rtcclk */ +#define MTIMER_FREQ 1000000 + +/ { + #address-cells = <2>; + #size-cells = <2>; + model = "Microchip PolarFire-SoC SEV Kit"; + compatible = "microchip,mpfs-sev-kit", "microchip,mpfs"; + + aliases { + ethernet0 = &mac1; + serial0 = &mmuart0; + serial1 = &mmuart1; + serial2 = &mmuart2; + serial3 = &mmuart3; + serial4 = &mmuart4; + }; + + chosen { + stdout-path = "serial1:115200n8"; + }; + + cpus { + timebase-frequency = ; + }; + + reserved-memory { + #address-cells = <2>; + #size-cells = <2>; + ranges; + + fabricbuf0ddrc: buffer@80000000 { + compatible = "shared-dma-pool"; + reg = <0x0 0x80000000 0x0 0x2000000>; + }; + + fabricbuf1ddrnc: buffer@c4000000 { + compatible = "shared-dma-pool"; + reg = <0x0 0xc4000000 0x0 0x4000000>; + }; + + fabricbuf2ddrncwcb: buffer@d4000000 { + compatible = "shared-dma-pool"; + reg = <0x0 0xd4000000 0x0 0x4000000>; + }; + }; + + ddrc_cache: memory@1000000000 { + device_type = "memory"; + reg = <0x10 0x0 0x0 0x76000000>; + }; +}; + +&i2c0 { + status = "okay"; +}; + +&gpio2 { + interrupts = <53>, <53>, <53>, <53>, + <53>, <53>, <53>, <53>, + <53>, <53>, <53>, <53>, + <53>, <53>, <53>, <53>, + <53>, <53>, <53>, <53>, + <53>, <53>, <53>, <53>, + <53>, <53>, <53>, <53>, + <53>, <53>, <53>, <53>; + status = "okay"; +}; + +&mac0 { + status = "okay"; + phy-mode = "sgmii"; + phy-handle = <&phy0>; + phy1: ethernet-phy@9 { + reg = <9>; + }; + phy0: ethernet-phy@8 { + reg = <8>; + }; +}; + +&mac1 { + status = "okay"; + phy-mode = "sgmii"; + phy-handle = <&phy1>; +}; + +&mbox { + status = "okay"; +}; + +&mmc { + status = "okay"; + bus-width = <4>; + disable-wp; + cap-sd-highspeed; + cap-mmc-highspeed; + mmc-ddr-1_8v; + mmc-hs200-1_8v; + sd-uhs-sdr12; + sd-uhs-sdr25; + 
sd-uhs-sdr50; + sd-uhs-sdr104; +}; + +&mmuart1 { + status = "okay"; +}; + +&mmuart2 { + status = "okay"; +}; + +&mmuart3 { + status = "okay"; +}; + +&mmuart4 { + status = "okay"; +}; + +&refclk { + clock-frequency = <125000000>; +}; + +&rtc { + status = "okay"; +}; + +&syscontroller { + status = "okay"; +}; + +&usb { + status = "okay"; + dr_mode = "otg"; +}; -- GitLab From d49166646e44064b694a2e631fcdba4f814746d9 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Tue, 27 Sep 2022 12:19:22 +0100 Subject: [PATCH 0729/2223] riscv: dts: microchip: add a devicetree for aries' m100pfsevp Add device trees for both configs used by the Aries Embedded M100PFSEVP. The M100PFSEVP consists of a MPFS250T on a SOM, featuring: - 2GB DDR4 SDRAM dedicated to the HMS - 512MB DDR4 SDRAM dedicated to the FPGA - 32 MB SPI NOR Flash - 4 GByte eMMC and a carrier board with: - 2x Gigabit Ethernet - USB - 2x UART - 2x CAN - TFT connector - HSMC extension connector - 3x PMOD extension connectors - microSD-card slot Link: https://www.aries-embedded.com/polarfire-soc-fpga-microsemi-m100pfs-som-mpfs025t-pcie-serdes Link: https://www.aries-embedded.com/evaluation-kit/fpga/polarfire-microchip-soc-fpga-m100pfsevp-riscv-hsmc-pmod Link: https://downloads.aries-embedded.de/products/M100PFS/Hardware/M100PFSEVP-Schematics.pdf Co-developed-by: Wolfgang Grandegger Signed-off-by: Wolfgang Grandegger Signed-off-by: Conor Dooley --- arch/riscv/boot/dts/microchip/Makefile | 1 + .../dts/microchip/mpfs-m100pfs-fabric.dtsi | 45 +++++ .../boot/dts/microchip/mpfs-m100pfsevp.dts | 179 ++++++++++++++++++ 3 files changed, 225 insertions(+) create mode 100644 arch/riscv/boot/dts/microchip/mpfs-m100pfs-fabric.dtsi create mode 100644 arch/riscv/boot/dts/microchip/mpfs-m100pfsevp.dts diff --git a/arch/riscv/boot/dts/microchip/Makefile b/arch/riscv/boot/dts/microchip/Makefile index f18477b2e86d7..7427a20934f37 100644 --- a/arch/riscv/boot/dts/microchip/Makefile +++ b/arch/riscv/boot/dts/microchip/Makefile @@ -1,5 +1,6 @@
# SPDX-License-Identifier: GPL-2.0 dtb-$(CONFIG_SOC_MICROCHIP_POLARFIRE) += mpfs-icicle-kit.dtb +dtb-$(CONFIG_SOC_MICROCHIP_POLARFIRE) += mpfs-m100pfsevp.dtb dtb-$(CONFIG_SOC_MICROCHIP_POLARFIRE) += mpfs-polarberry.dtb dtb-$(CONFIG_SOC_MICROCHIP_POLARFIRE) += mpfs-sev-kit.dtb obj-$(CONFIG_BUILTIN_DTB) += $(addsuffix .o, $(dtb-y)) diff --git a/arch/riscv/boot/dts/microchip/mpfs-m100pfs-fabric.dtsi b/arch/riscv/boot/dts/microchip/mpfs-m100pfs-fabric.dtsi new file mode 100644 index 0000000000000..7b9ee13b6a3af --- /dev/null +++ b/arch/riscv/boot/dts/microchip/mpfs-m100pfs-fabric.dtsi @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Copyright (c) 2022 Microchip Technology Inc */ + +/ { + fabric_clk3: fabric-clk3 { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <62500000>; + }; + + fabric_clk1: fabric-clk1 { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <125000000>; + }; + + pcie: pcie@2000000000 { + compatible = "microchip,pcie-host-1.0"; + #address-cells = <0x3>; + #interrupt-cells = <0x1>; + #size-cells = <0x2>; + device_type = "pci"; + reg = <0x20 0x0 0x0 0x8000000>, <0x0 0x43000000 0x0 0x10000>; + reg-names = "cfg", "apb"; + bus-range = <0x0 0x7f>; + interrupt-parent = <&plic>; + interrupts = <119>; + interrupt-map = <0 0 0 1 &pcie_intc 0>, + <0 0 0 2 &pcie_intc 1>, + <0 0 0 3 &pcie_intc 2>, + <0 0 0 4 &pcie_intc 3>; + interrupt-map-mask = <0 0 0 7>; + clocks = <&fabric_clk1>, <&fabric_clk1>, <&fabric_clk3>; + clock-names = "fic0", "fic1", "fic3"; + ranges = <0x3000000 0x0 0x8000000 0x20 0x8000000 0x0 0x80000000>; + msi-parent = <&pcie>; + msi-controller; + status = "disabled"; + pcie_intc: interrupt-controller { + #address-cells = <0>; + #interrupt-cells = <1>; + interrupt-controller; + }; + }; +}; diff --git a/arch/riscv/boot/dts/microchip/mpfs-m100pfsevp.dts b/arch/riscv/boot/dts/microchip/mpfs-m100pfsevp.dts new file mode 100644 index 0000000000000..184cb36a175e4 --- /dev/null +++ 
b/arch/riscv/boot/dts/microchip/mpfs-m100pfsevp.dts @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Original all-in-one devicetree: + * Copyright (C) 2021-2022 - Wolfgang Grandegger + * Rewritten to use includes: + * Copyright (C) 2022 - Conor Dooley + */ +/dts-v1/; + +#include "mpfs.dtsi" +#include "mpfs-m100pfs-fabric.dtsi" + +/* Clock frequency (in Hz) of the rtcclk */ +#define MTIMER_FREQ 1000000 + +/ { + model = "Aries Embedded M100PFEVPS"; + compatible = "aries,m100pfsevp", "microchip,mpfs"; + + aliases { + ethernet0 = &mac0; + ethernet1 = &mac1; + serial0 = &mmuart0; + serial1 = &mmuart1; + serial2 = &mmuart2; + serial3 = &mmuart3; + serial4 = &mmuart4; + gpio0 = &gpio0; + gpio1 = &gpio2; + }; + + chosen { + stdout-path = "serial1:115200n8"; + }; + + cpus { + timebase-frequency = ; + }; + + ddrc_cache_lo: memory@80000000 { + device_type = "memory"; + reg = <0x0 0x80000000 0x0 0x40000000>; + }; + ddrc_cache_hi: memory@1040000000 { + device_type = "memory"; + reg = <0x10 0x40000000 0x0 0x40000000>; + }; +}; + +&can0 { + status = "okay"; +}; + +&i2c0 { + status = "okay"; +}; + +&i2c1 { + status = "okay"; +}; + +&gpio0 { + interrupts = <13>, <14>, <15>, <16>, + <17>, <18>, <19>, <20>, + <21>, <22>, <23>, <24>, + <25>, <26>; + ngpios = <14>; + status = "okay"; + + pmic-irq-hog { + gpio-hog; + gpios = <13 0>; + input; + }; + + /* Set to low for eMMC, high for SD-card */ + mmc-sel-hog { + gpio-hog; + gpios = <12 0>; + output-high; + }; +}; + +&gpio2 { + interrupts = <13>, <14>, <15>, <16>, + <17>, <18>, <19>, <20>, + <21>, <22>, <23>, <24>, + <25>, <26>, <27>, <28>, + <29>, <30>, <31>, <32>, + <33>, <34>, <35>, <36>, + <37>, <38>, <39>, <40>, + <41>, <42>, <43>, <44>; + status = "okay"; +}; + +&mac0 { + status = "okay"; + phy-mode = "gmii"; + phy-handle = <&phy0>; + phy0: ethernet-phy@0 { + reg = <0>; + }; +}; + +&mac1 { + status = "okay"; + phy-mode = "gmii"; + phy-handle = <&phy1>; + phy1: ethernet-phy@0 { + reg = <0>; + }; +}; + +&mbox { + status 
= "okay"; +}; + +&mmc { + max-frequency = <50000000>; + bus-width = <4>; + cap-mmc-highspeed; + cap-sd-highspeed; + no-1-8-v; + sd-uhs-sdr12; + sd-uhs-sdr25; + sd-uhs-sdr50; + sd-uhs-sdr104; + disable-wp; + status = "okay"; +}; + +&mmuart1 { + status = "okay"; +}; + +&mmuart2 { + status = "okay"; +}; + +&mmuart3 { + status = "okay"; +}; + +&mmuart4 { + status = "okay"; +}; + +&pcie { + status = "okay"; +}; + +&qspi { + status = "okay"; +}; + +&refclk { + clock-frequency = <125000000>; +}; + +&rtc { + status = "okay"; +}; + +&spi0 { + status = "okay"; +}; + +&spi1 { + status = "okay"; +}; + +&syscontroller { + status = "okay"; +}; + +&usb { + status = "okay"; + dr_mode = "host"; +}; -- GitLab From 6c1193301791d3fcc0ad9ff3b861a8216e00773b Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Tue, 27 Sep 2022 12:19:23 +0100 Subject: [PATCH 0730/2223] riscv: dts: microchip: update memory configuration for v2022.10 In the v2022.10 reference design, the seg registers are going to be changed, resulting in a required change to the memory map in Linux. A small 4M reservation is made at the end of 32-bit DDR to provide some memory for the HSS to use, so that it can cache its payload.bin between reboots of a specific context. 
Signed-off-by: Conor Dooley --- arch/riscv/boot/dts/microchip/mpfs-icicle-kit.dts | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit.dts b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit.dts index 42d350fe6c6b9..31f88cb4d5e5c 100644 --- a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit.dts +++ b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit.dts @@ -33,15 +33,26 @@ ddrc_cache_lo: memory@80000000 { device_type = "memory"; - reg = <0x0 0x80000000 0x0 0x2e000000>; + reg = <0x0 0x80000000 0x0 0x40000000>; status = "okay"; }; ddrc_cache_hi: memory@1000000000 { device_type = "memory"; - reg = <0x10 0x0 0x0 0x40000000>; + reg = <0x10 0x40000000 0x0 0x40000000>; status = "okay"; }; + + reserved-memory { + #address-cells = <2>; + #size-cells = <2>; + ranges; + + hss_payload: region@BFC00000 { + reg = <0x0 0xBFC00000 0x0 0x400000>; + no-map; + }; + }; }; &core_pwm0 { -- GitLab From 5459c0b7046752e519a646e1c2404852bb628459 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 16 Aug 2022 13:20:42 +0300 Subject: [PATCH 0731/2223] PCI/DPC: Quirk PIO log size for certain Intel Root Ports Some Root Ports on Intel Tiger Lake and Alder Lake systems support the RP Extensions for DPC and the RP PIO Log registers but incorrectly advertise an RP PIO Log Size of zero. This means the kernel complains that: DPC: RP PIO log size 0 is invalid and if DPC is triggered, the DPC driver will not dump the RP PIO Log registers when it should. This is caused by a BIOS bug and should be fixed in the BIOS for future CPUs. Add a quirk to set the correct RP PIO Log size for the affected Root Ports.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=209943 Link: https://lore.kernel.org/r/20220816102042.69125-1-mika.westerberg@linux.intel.com Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan --- drivers/pci/pcie/dpc.c | 15 ++++++++++----- drivers/pci/quirks.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 3e9afee02e8d1..f5ffea17c7f87 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -335,11 +335,16 @@ void pci_dpc_init(struct pci_dev *pdev) return; pdev->dpc_rp_extensions = true; - pdev->dpc_rp_log_size = (cap & PCI_EXP_DPC_RP_PIO_LOG_SIZE) >> 8; - if (pdev->dpc_rp_log_size < 4 || pdev->dpc_rp_log_size > 9) { - pci_err(pdev, "RP PIO log size %u is invalid\n", - pdev->dpc_rp_log_size); - pdev->dpc_rp_log_size = 0; + + /* Quirks may set dpc_rp_log_size if device or firmware is buggy */ + if (!pdev->dpc_rp_log_size) { + pdev->dpc_rp_log_size = + (cap & PCI_EXP_DPC_RP_PIO_LOG_SIZE) >> 8; + if (pdev->dpc_rp_log_size < 4 || pdev->dpc_rp_log_size > 9) { + pci_err(pdev, "RP PIO log size %u is invalid\n", + pdev->dpc_rp_log_size); + pdev->dpc_rp_log_size = 0; + } } } diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 4944798e75b5a..285acc4aaccc1 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5956,3 +5956,39 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x56b1, aspm_l1_acceptable_latency DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x56c0, aspm_l1_acceptable_latency); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x56c1, aspm_l1_acceptable_latency); #endif + +#ifdef CONFIG_PCIE_DPC +/* + * Intel Tiger Lake and Alder Lake BIOS has a bug that clears the DPC + * RP PIO Log Size of the integrated Thunderbolt PCIe Root Ports. 
+ */ +static void dpc_log_size(struct pci_dev *dev) +{ + u16 dpc, val; + + dpc = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DPC); + if (!dpc) + return; + + pci_read_config_word(dev, dpc + PCI_EXP_DPC_CAP, &val); + if (!(val & PCI_EXP_DPC_CAP_RP_EXT)) + return; + + if (!((val & PCI_EXP_DPC_RP_PIO_LOG_SIZE) >> 8)) { + pci_info(dev, "Overriding RP PIO Log Size to 4\n"); + dev->dpc_rp_log_size = 4; + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x461f, dpc_log_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x462f, dpc_log_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x463f, dpc_log_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x466e, dpc_log_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x9a23, dpc_log_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x9a25, dpc_log_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x9a27, dpc_log_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x9a29, dpc_log_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x9a2b, dpc_log_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x9a2d, dpc_log_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x9a2f, dpc_log_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x9a31, dpc_log_size); +#endif -- GitLab From 90c9978959dacdecfc30d2e6ad5cefc4823399b8 Mon Sep 17 00:00:00 2001 From: Pavel Rojtberg Date: Tue, 27 Sep 2022 18:01:16 -0700 Subject: [PATCH 0732/2223] Input: xpad - refactor using BIT() macro reduces the amount of magic numbers and makes the code more readable Signed-off-by: Pavel Rojtberg Link: https://lore.kernel.org/r/20220913213133.584979-2-rojtberg@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 113 +++++++++++++++++----------------- 1 file changed, 57 insertions(+), 56 deletions(-) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index fceb0d342945b..2d9a925514999 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -61,6 +61,7 @@ 
* Later changes can be tracked in SCM. */ +#include #include #include #include @@ -709,10 +710,10 @@ static void xpad_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char *d /* digital pad */ if (xpad->mapping & MAP_DPAD_TO_BUTTONS) { /* dpad as buttons (left, right, up, down) */ - input_report_key(dev, BTN_TRIGGER_HAPPY1, data[2] & 0x04); - input_report_key(dev, BTN_TRIGGER_HAPPY2, data[2] & 0x08); - input_report_key(dev, BTN_TRIGGER_HAPPY3, data[2] & 0x01); - input_report_key(dev, BTN_TRIGGER_HAPPY4, data[2] & 0x02); + input_report_key(dev, BTN_TRIGGER_HAPPY1, data[2] & BIT(2)); + input_report_key(dev, BTN_TRIGGER_HAPPY2, data[2] & BIT(3)); + input_report_key(dev, BTN_TRIGGER_HAPPY3, data[2] & BIT(0)); + input_report_key(dev, BTN_TRIGGER_HAPPY4, data[2] & BIT(1)); } else { input_report_abs(dev, ABS_HAT0X, !!(data[2] & 0x08) - !!(data[2] & 0x04)); @@ -721,10 +722,10 @@ static void xpad_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char *d } /* start/back buttons and stick press left/right */ - input_report_key(dev, BTN_START, data[2] & 0x10); - input_report_key(dev, BTN_SELECT, data[2] & 0x20); - input_report_key(dev, BTN_THUMBL, data[2] & 0x40); - input_report_key(dev, BTN_THUMBR, data[2] & 0x80); + input_report_key(dev, BTN_START, data[2] & BIT(4)); + input_report_key(dev, BTN_SELECT, data[2] & BIT(5)); + input_report_key(dev, BTN_THUMBL, data[2] & BIT(6)); + input_report_key(dev, BTN_THUMBR, data[2] & BIT(7)); /* "analog" buttons A, B, X, Y */ input_report_key(dev, BTN_A, data[4]); @@ -759,10 +760,10 @@ static void xpad360_process_packet(struct usb_xpad *xpad, struct input_dev *dev, /* digital pad */ if (xpad->mapping & MAP_DPAD_TO_BUTTONS) { /* dpad as buttons (left, right, up, down) */ - input_report_key(dev, BTN_TRIGGER_HAPPY1, data[2] & 0x04); - input_report_key(dev, BTN_TRIGGER_HAPPY2, data[2] & 0x08); - input_report_key(dev, BTN_TRIGGER_HAPPY3, data[2] & 0x01); - input_report_key(dev, BTN_TRIGGER_HAPPY4, data[2] & 0x02); + 
input_report_key(dev, BTN_TRIGGER_HAPPY1, data[2] & BIT(2)); + input_report_key(dev, BTN_TRIGGER_HAPPY2, data[2] & BIT(3)); + input_report_key(dev, BTN_TRIGGER_HAPPY3, data[2] & BIT(0)); + input_report_key(dev, BTN_TRIGGER_HAPPY4, data[2] & BIT(1)); } /* @@ -780,21 +781,21 @@ static void xpad360_process_packet(struct usb_xpad *xpad, struct input_dev *dev, } /* start/back buttons */ - input_report_key(dev, BTN_START, data[2] & 0x10); - input_report_key(dev, BTN_SELECT, data[2] & 0x20); + input_report_key(dev, BTN_START, data[2] & BIT(4)); + input_report_key(dev, BTN_SELECT, data[2] & BIT(5)); /* stick press left/right */ - input_report_key(dev, BTN_THUMBL, data[2] & 0x40); - input_report_key(dev, BTN_THUMBR, data[2] & 0x80); + input_report_key(dev, BTN_THUMBL, data[2] & BIT(6)); + input_report_key(dev, BTN_THUMBR, data[2] & BIT(7)); /* buttons A,B,X,Y,TL,TR and MODE */ - input_report_key(dev, BTN_A, data[3] & 0x10); - input_report_key(dev, BTN_B, data[3] & 0x20); - input_report_key(dev, BTN_X, data[3] & 0x40); - input_report_key(dev, BTN_Y, data[3] & 0x80); - input_report_key(dev, BTN_TL, data[3] & 0x01); - input_report_key(dev, BTN_TR, data[3] & 0x02); - input_report_key(dev, BTN_MODE, data[3] & 0x04); + input_report_key(dev, BTN_A, data[3] & BIT(4)); + input_report_key(dev, BTN_B, data[3] & BIT(5)); + input_report_key(dev, BTN_X, data[3] & BIT(6)); + input_report_key(dev, BTN_Y, data[3] & BIT(7)); + input_report_key(dev, BTN_TL, data[3] & BIT(0)); + input_report_key(dev, BTN_TR, data[3] & BIT(1)); + input_report_key(dev, BTN_MODE, data[3] & BIT(2)); if (!(xpad->mapping & MAP_STICKS_TO_NULL)) { /* left stick */ @@ -832,7 +833,7 @@ static void xpad360_process_packet(struct usb_xpad *xpad, struct input_dev *dev, } /* mode button down/up */ - if (data[3] & 0x04) + if (data[3] & BIT(2)) xpad->mode_btn_down_ts = ktime_get_seconds(); else xpad->mode_btn_down_ts = 0; @@ -928,7 +929,7 @@ static void xpadone_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char if 
(data[1] == 0x30) xpadone_ack_mode_report(xpad, data[2]); - input_report_key(dev, BTN_MODE, data[4] & 0x01); + input_report_key(dev, BTN_MODE, data[4] & BIT(0)); do_sync = true; } else if (data[0] == 0X0C) { @@ -942,33 +943,33 @@ static void xpadone_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char data[18] = 0; /* Elite Series 2 split packet paddle bits */ - input_report_key(dev, BTN_TRIGGER_HAPPY5, data[18] & 0x01); - input_report_key(dev, BTN_TRIGGER_HAPPY6, data[18] & 0x02); - input_report_key(dev, BTN_TRIGGER_HAPPY7, data[18] & 0x04); - input_report_key(dev, BTN_TRIGGER_HAPPY8, data[18] & 0x08); + input_report_key(dev, BTN_TRIGGER_HAPPY5, data[18] & BIT(0)); + input_report_key(dev, BTN_TRIGGER_HAPPY6, data[18] & BIT(1)); + input_report_key(dev, BTN_TRIGGER_HAPPY7, data[18] & BIT(2)); + input_report_key(dev, BTN_TRIGGER_HAPPY8, data[18] & BIT(3)); do_sync = true; } } else if (data[0] == 0X20) { /* The main valid packet type for inputs */ /* menu/view buttons */ - input_report_key(dev, BTN_START, data[4] & 0x04); - input_report_key(dev, BTN_SELECT, data[4] & 0x08); + input_report_key(dev, BTN_START, data[4] & BIT(2)); + input_report_key(dev, BTN_SELECT, data[4] & BIT(3)); if (xpad->mapping & MAP_SELECT_BUTTON) - input_report_key(dev, KEY_RECORD, data[22] & 0x01); + input_report_key(dev, KEY_RECORD, data[22] & BIT(0)); /* buttons A,B,X,Y */ - input_report_key(dev, BTN_A, data[4] & 0x10); - input_report_key(dev, BTN_B, data[4] & 0x20); - input_report_key(dev, BTN_X, data[4] & 0x40); - input_report_key(dev, BTN_Y, data[4] & 0x80); + input_report_key(dev, BTN_A, data[4] & BIT(4)); + input_report_key(dev, BTN_B, data[4] & BIT(5)); + input_report_key(dev, BTN_X, data[4] & BIT(6)); + input_report_key(dev, BTN_Y, data[4] & BIT(7)); /* digital pad */ if (xpad->mapping & MAP_DPAD_TO_BUTTONS) { /* dpad as buttons (left, right, up, down) */ - input_report_key(dev, BTN_TRIGGER_HAPPY1, data[5] & 0x04); - input_report_key(dev, BTN_TRIGGER_HAPPY2, data[5] & 0x08); - 
input_report_key(dev, BTN_TRIGGER_HAPPY3, data[5] & 0x01); - input_report_key(dev, BTN_TRIGGER_HAPPY4, data[5] & 0x02); + input_report_key(dev, BTN_TRIGGER_HAPPY1, data[5] & BIT(2)); + input_report_key(dev, BTN_TRIGGER_HAPPY2, data[5] & BIT(3)); + input_report_key(dev, BTN_TRIGGER_HAPPY3, data[5] & BIT(0)); + input_report_key(dev, BTN_TRIGGER_HAPPY4, data[5] & BIT(1)); } else { input_report_abs(dev, ABS_HAT0X, !!(data[5] & 0x08) - !!(data[5] & 0x04)); @@ -977,12 +978,12 @@ static void xpadone_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char } /* TL/TR */ - input_report_key(dev, BTN_TL, data[5] & 0x10); - input_report_key(dev, BTN_TR, data[5] & 0x20); + input_report_key(dev, BTN_TL, data[5] & BIT(4)); + input_report_key(dev, BTN_TR, data[5] & BIT(5)); /* stick press left/right */ - input_report_key(dev, BTN_THUMBL, data[5] & 0x40); - input_report_key(dev, BTN_THUMBR, data[5] & 0x80); + input_report_key(dev, BTN_THUMBL, data[5] & BIT(6)); + input_report_key(dev, BTN_THUMBR, data[5] & BIT(7)); if (!(xpad->mapping & MAP_STICKS_TO_NULL)) { /* left stick */ @@ -1023,10 +1024,10 @@ static void xpadone_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char data[32] = 0; /* OG Elite Series Controller paddle bits */ - input_report_key(dev, BTN_TRIGGER_HAPPY5, data[32] & 0x02); - input_report_key(dev, BTN_TRIGGER_HAPPY6, data[32] & 0x08); - input_report_key(dev, BTN_TRIGGER_HAPPY7, data[32] & 0x01); - input_report_key(dev, BTN_TRIGGER_HAPPY8, data[32] & 0x04); + input_report_key(dev, BTN_TRIGGER_HAPPY5, data[32] & BIT(1)); + input_report_key(dev, BTN_TRIGGER_HAPPY6, data[32] & BIT(3)); + input_report_key(dev, BTN_TRIGGER_HAPPY7, data[32] & BIT(0)); + input_report_key(dev, BTN_TRIGGER_HAPPY8, data[32] & BIT(2)); } else if (xpad->packet_type == PKT_XBE2_FW_OLD) { /* Mute paddles if controller has a custom mapping applied. 
* Checked by comparing the current mapping @@ -1036,10 +1037,10 @@ static void xpadone_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char data[18] = 0; /* Elite Series 2 4.x firmware paddle bits */ - input_report_key(dev, BTN_TRIGGER_HAPPY5, data[18] & 0x01); - input_report_key(dev, BTN_TRIGGER_HAPPY6, data[18] & 0x02); - input_report_key(dev, BTN_TRIGGER_HAPPY7, data[18] & 0x04); - input_report_key(dev, BTN_TRIGGER_HAPPY8, data[18] & 0x08); + input_report_key(dev, BTN_TRIGGER_HAPPY5, data[18] & BIT(0)); + input_report_key(dev, BTN_TRIGGER_HAPPY6, data[18] & BIT(1)); + input_report_key(dev, BTN_TRIGGER_HAPPY7, data[18] & BIT(2)); + input_report_key(dev, BTN_TRIGGER_HAPPY8, data[18] & BIT(3)); } else if (xpad->packet_type == PKT_XBE2_FW_5_EARLY) { /* Mute paddles if controller has a custom mapping applied. * Checked by comparing the current mapping @@ -1051,10 +1052,10 @@ static void xpadone_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char /* Elite Series 2 5.x firmware paddle bits * (before the packet was split) */ - input_report_key(dev, BTN_TRIGGER_HAPPY5, data[22] & 0x01); - input_report_key(dev, BTN_TRIGGER_HAPPY6, data[22] & 0x02); - input_report_key(dev, BTN_TRIGGER_HAPPY7, data[22] & 0x04); - input_report_key(dev, BTN_TRIGGER_HAPPY8, data[22] & 0x08); + input_report_key(dev, BTN_TRIGGER_HAPPY5, data[22] & BIT(0)); + input_report_key(dev, BTN_TRIGGER_HAPPY6, data[22] & BIT(1)); + input_report_key(dev, BTN_TRIGGER_HAPPY7, data[22] & BIT(2)); + input_report_key(dev, BTN_TRIGGER_HAPPY8, data[22] & BIT(3)); } } -- GitLab From 677065244aa17265d7933782b1720cdf8727fcae Mon Sep 17 00:00:00 2001 From: Pavel Rojtberg Date: Tue, 27 Sep 2022 18:02:05 -0700 Subject: [PATCH 0733/2223] Input: xpad - decipher xpadone packages with GIP defines only renames, no functional changes. Some of the packets we send seem superfluous now. Unfortunately I don't have the hardware to verify whether they are.
Signed-off-by: Pavel Rojtberg Link: https://lore.kernel.org/r/20220913213133.584979-3-rojtberg@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 99 ++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 30 deletions(-) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 2d9a925514999..60859008372c9 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -514,13 +514,52 @@ struct xboxone_init_packet { .len = ARRAY_SIZE(_data), \ } +/* + * starting with xbox one, the game input protocol is used + * magic numbers are taken from + * - https://github.com/xpadneo/gip-dissector/blob/main/src/gip-dissector.lua + * - https://github.com/medusalix/xone/blob/master/bus/protocol.c + */ +#define GIP_CMD_ACK 0x01 +#define GIP_CMD_IDENTIFY 0x04 +#define GIP_CMD_POWER 0x05 +#define GIP_CMD_AUTHENTICATE 0x06 +#define GIP_CMD_VIRTUAL_KEY 0x07 +#define GIP_CMD_RUMBLE 0x09 +#define GIP_CMD_LED 0x0a +#define GIP_CMD_FIRMWARE 0x0c +#define GIP_CMD_INPUT 0x20 + +#define GIP_SEQ0 0x00 + +#define GIP_OPT_ACK 0x10 +#define GIP_OPT_INTERNAL 0x20 + +/* + * length of the command payload encoded with + * https://en.wikipedia.org/wiki/LEB128 + * which is a no-op for N < 128 + */ +#define GIP_PL_LEN(N) (N) + +/* + * payload specific defines + */ +#define GIP_PWR_ON 0x00 +#define GIP_LED_ON 0x01 + +#define GIP_MOTOR_R BIT(0) +#define GIP_MOTOR_L BIT(1) +#define GIP_MOTOR_RT BIT(2) +#define GIP_MOTOR_LT BIT(3) +#define GIP_MOTOR_ALL (GIP_MOTOR_R | GIP_MOTOR_L | GIP_MOTOR_RT | GIP_MOTOR_LT) /* * This packet is required for all Xbox One pads with 2015 * or later firmware installed (or present from the factory). */ -static const u8 xboxone_fw2015_init[] = { - 0x05, 0x20, 0x00, 0x01, 0x00 +static const u8 xboxone_power_on[] = { + GIP_CMD_POWER, GIP_OPT_INTERNAL, GIP_SEQ0, GIP_PL_LEN(1), GIP_PWR_ON }; /* @@ -530,7 +569,7 @@ static const u8 xboxone_fw2015_init[] = { * Bluetooth mode. 
*/ static const u8 xboxone_s_init[] = { - 0x05, 0x20, 0x00, 0x0f, 0x06 + GIP_CMD_POWER, GIP_OPT_INTERNAL, GIP_SEQ0, 0x0f, 0x06 }; /* @@ -547,9 +586,9 @@ static const u8 extra_input_packet_init[] = { * (0x0e6f:0x0165) to finish initialization and for Hori pads * (0x0f0d:0x0067) to make the analog sticks work. */ -static const u8 xboxone_hori_init[] = { - 0x01, 0x20, 0x00, 0x09, 0x00, 0x04, 0x20, 0x3a, - 0x00, 0x00, 0x00, 0x80, 0x00 +static const u8 xboxone_hori_ack_id[] = { + GIP_CMD_ACK, GIP_OPT_INTERNAL, GIP_SEQ0, GIP_PL_LEN(9), + 0x00, GIP_CMD_IDENTIFY, GIP_OPT_INTERNAL, 0x3a, 0x00, 0x00, 0x00, 0x80, 0x00 }; /* @@ -557,8 +596,8 @@ static const u8 xboxone_hori_init[] = { * sending input reports. These pads include: (0x0e6f:0x02ab), * (0x0e6f:0x02a4), (0x0e6f:0x02a6). */ -static const u8 xboxone_pdp_init1[] = { - 0x0a, 0x20, 0x00, 0x03, 0x00, 0x01, 0x14 +static const u8 xboxone_pdp_led_on[] = { + GIP_CMD_LED, GIP_OPT_INTERNAL, GIP_SEQ0, GIP_PL_LEN(3), 0x00, GIP_LED_ON, 0x14 }; /* @@ -566,8 +605,8 @@ static const u8 xboxone_pdp_init1[] = { * sending input reports. These pads include: (0x0e6f:0x02ab), * (0x0e6f:0x02a4), (0x0e6f:0x02a6). */ -static const u8 xboxone_pdp_init2[] = { - 0x06, 0x20, 0x00, 0x02, 0x01, 0x00 +static const u8 xboxone_pdp_auth[] = { + GIP_CMD_AUTHENTICATE, GIP_OPT_INTERNAL, GIP_SEQ0, GIP_PL_LEN(2), 0x01, 0x00 }; /* @@ -575,8 +614,8 @@ static const u8 xboxone_pdp_init2[] = { * sending input reports. One of those pads is (0x24c6:0x543a). */ static const u8 xboxone_rumblebegin_init[] = { - 0x09, 0x00, 0x00, 0x09, 0x00, 0x0F, 0x00, 0x00, - 0x1D, 0x1D, 0xFF, 0x00, 0x00 + GIP_CMD_RUMBLE, 0x00, GIP_SEQ0, GIP_PL_LEN(9), + 0x00, GIP_MOTOR_ALL, 0x00, 0x00, 0x1D, 0x1D, 0xFF, 0x00, 0x00 }; /* @@ -586,8 +625,8 @@ static const u8 xboxone_rumblebegin_init[] = { * spin up to enough speed to actually vibrate the gamepad. 
*/ static const u8 xboxone_rumbleend_init[] = { - 0x09, 0x00, 0x00, 0x09, 0x00, 0x0F, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00 + GIP_CMD_RUMBLE, 0x00, GIP_SEQ0, GIP_PL_LEN(9), + 0x00, GIP_MOTOR_ALL, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; /* @@ -597,14 +636,14 @@ static const u8 xboxone_rumbleend_init[] = { * packet is going to be sent. */ static const struct xboxone_init_packet xboxone_init_packets[] = { - XBOXONE_INIT_PKT(0x0e6f, 0x0165, xboxone_hori_init), - XBOXONE_INIT_PKT(0x0f0d, 0x0067, xboxone_hori_init), - XBOXONE_INIT_PKT(0x0000, 0x0000, xboxone_fw2015_init), + XBOXONE_INIT_PKT(0x0e6f, 0x0165, xboxone_hori_ack_id), + XBOXONE_INIT_PKT(0x0f0d, 0x0067, xboxone_hori_ack_id), + XBOXONE_INIT_PKT(0x0000, 0x0000, xboxone_power_on), XBOXONE_INIT_PKT(0x045e, 0x02ea, xboxone_s_init), XBOXONE_INIT_PKT(0x045e, 0x0b00, xboxone_s_init), XBOXONE_INIT_PKT(0x045e, 0x0b00, extra_input_packet_init), - XBOXONE_INIT_PKT(0x0e6f, 0x0000, xboxone_pdp_init1), - XBOXONE_INIT_PKT(0x0e6f, 0x0000, xboxone_pdp_init2), + XBOXONE_INIT_PKT(0x0e6f, 0x0000, xboxone_pdp_led_on), + XBOXONE_INIT_PKT(0x0e6f, 0x0000, xboxone_pdp_auth), XBOXONE_INIT_PKT(0x24c6, 0x541a, xboxone_rumblebegin_init), XBOXONE_INIT_PKT(0x24c6, 0x542a, xboxone_rumblebegin_init), XBOXONE_INIT_PKT(0x24c6, 0x543a, xboxone_rumblebegin_init), @@ -920,19 +959,19 @@ static void xpadone_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char bool do_sync = false; /* the xbox button has its own special report */ - if (data[0] == 0X07) { + if (data[0] == GIP_CMD_VIRTUAL_KEY) { /* * The Xbox One S controller requires these reports to be * acked otherwise it continues sending them forever and * won't report further mode button events. 
*/ - if (data[1] == 0x30) + if (data[1] == (GIP_OPT_ACK | GIP_OPT_INTERNAL)) xpadone_ack_mode_report(xpad, data[2]); input_report_key(dev, BTN_MODE, data[4] & BIT(0)); do_sync = true; - } else if (data[0] == 0X0C) { + } else if (data[0] == GIP_CMD_FIRMWARE) { /* Some packet formats force us to use this separate to poll paddle inputs */ if (xpad->packet_type == PKT_XBE2_FW_5_11) { /* Mute paddles if controller is in a custom profile slot @@ -950,7 +989,7 @@ static void xpadone_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char do_sync = true; } - } else if (data[0] == 0X20) { /* The main valid packet type for inputs */ + } else if (data[0] == GIP_CMD_INPUT) { /* The main valid packet type for inputs */ /* menu/view buttons */ input_report_key(dev, BTN_START, data[4] & BIT(2)); input_report_key(dev, BTN_SELECT, data[4] & BIT(3)); @@ -1363,8 +1402,8 @@ static void xpadone_ack_mode_report(struct usb_xpad *xpad, u8 seq_num) struct xpad_output_packet *packet = &xpad->out_packets[XPAD_OUT_CMD_IDX]; static const u8 mode_report_ack[] = { - 0x01, 0x20, 0x00, 0x09, 0x00, 0x07, 0x20, 0x02, - 0x00, 0x00, 0x00, 0x00, 0x00 + GIP_CMD_ACK, GIP_OPT_INTERNAL, GIP_SEQ0, GIP_PL_LEN(9), + 0x00, GIP_CMD_VIRTUAL_KEY, GIP_OPT_INTERNAL, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00 }; spin_lock_irqsave(&xpad->odata_lock, flags); @@ -1442,14 +1481,14 @@ static int xpad_play_effect(struct input_dev *dev, void *data, struct ff_effect break; case XTYPE_XBOXONE: - packet->data[0] = 0x09; /* activate rumble */ + packet->data[0] = GIP_CMD_RUMBLE; /* activate rumble */ packet->data[1] = 0x00; packet->data[2] = xpad->odata_serial++; - packet->data[3] = 0x09; + packet->data[3] = GIP_PL_LEN(9); packet->data[4] = 0x00; - packet->data[5] = 0x0F; - packet->data[6] = 0x00; - packet->data[7] = 0x00; + packet->data[5] = GIP_MOTOR_ALL; + packet->data[6] = 0x00; /* left trigger */ + packet->data[7] = 0x00; /* right trigger */ packet->data[8] = strong / 512; /* left actuator */ packet->data[9] = weak / 512; /* 
right actuator */ packet->data[10] = 0xFF; /* on period */ -- GitLab From 05763c996f72ef934432639fe412f5193816fd9d Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Tue, 27 Sep 2022 13:38:14 +0000 Subject: [PATCH 0734/2223] ipmi: Remove unused struct watcher_entry After commit e86ee2d44b44 ("ipmi: Rework locking and shutdown for hot remove"), no one uses struct watcher_entry, so remove it. Signed-off-by: Yuan Can Message-Id: <20220927133814.98929-1-yuancan@huawei.com> Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index c8a3b208f923e..49a1707693c9f 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -736,12 +736,6 @@ static void intf_free(struct kref *ref) kfree(intf); } -struct watcher_entry { - int intf_num; - struct ipmi_smi *intf; - struct list_head link; -}; - int ipmi_smi_watcher_register(struct ipmi_smi_watcher *watcher) { struct ipmi_smi *intf; -- GitLab From 612d5494aef9bd2ab68d585a8c0ac2b16d12d520 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 27 Sep 2022 20:45:57 +0800 Subject: [PATCH 0735/2223] irqchip: Make irqchip_init() usable on pure ACPI systems Pure ACPI systems (e.g., LoongArch) do not need OF_IRQ, but still require irqchip_init() to perform the ACPI irqchip probing, even when OF_IRQ isn't selected. Relax the dependency to enable the generic irqchip support when ACPI_GENERIC_GSI is configured.
Signed-off-by: Huacai Chen Tested-by: Tiezhu Yang [maz: revamped commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220927124557.3246737-1-chenhuacai@loongson.cn --- drivers/irqchip/Kconfig | 2 +- include/linux/of_irq.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 66b9fa408bf24..93ad04d58f176 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -3,7 +3,7 @@ menu "IRQ chip support" config IRQCHIP def_bool y - depends on OF_IRQ + depends on (OF_IRQ || ACPI_GENERIC_GSI) config ARM_GIC bool diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 83fccd0c9bba2..d6d3eae2f1452 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -37,9 +37,8 @@ extern unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data); extern int of_irq_to_resource(struct device_node *dev, int index, struct resource *r); -extern void of_irq_init(const struct of_device_id *matches); - #ifdef CONFIG_OF_IRQ +extern void of_irq_init(const struct of_device_id *matches); extern int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_args *out_irq); extern int of_irq_count(struct device_node *dev); @@ -57,6 +56,9 @@ extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, extern void of_msi_configure(struct device *dev, struct device_node *np); u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in); #else +static inline void of_irq_init(const struct of_device_id *matches) +{ +} static inline int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_args *out_irq) { -- GitLab From a1cc8a62c2b21d6d71d5a3d5d7c7658e3ab42d47 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Mon, 19 Sep 2022 22:24:41 +0200 Subject: [PATCH 0736/2223] irqchip/realtek-rtl: use irq_domain_add_linear() When using an offset of 0, irq_domain_add_simple() is identical to 
irq_domain_add_linear() on DT-based systems, so use the latter instead. Signed-off-by: Sander Vanheule Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/0c4cd9f7661a30a4cb7ab9881c4a94bc8a379162.1663617425.git.sander@svanheule.net --- drivers/irqchip/irq-realtek-rtl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-realtek-rtl.c b/drivers/irqchip/irq-realtek-rtl.c index 56bf502d9c673..160feae0ded7f 100644 --- a/drivers/irqchip/irq-realtek-rtl.c +++ b/drivers/irqchip/irq-realtek-rtl.c @@ -171,8 +171,7 @@ static int __init realtek_rtl_of_init(struct device_node *node, struct device_no /* Disable all cascaded interrupts */ writel(0, REG(RTL_ICTL_GIMR)); - domain = irq_domain_add_simple(node, 32, 0, - &irq_domain_ops, NULL); + domain = irq_domain_add_linear(node, 32, &irq_domain_ops, NULL); ret = map_interrupts(node, domain); if (ret) { -- GitLab From a3e77b70f19240f8a52bbe1c703aa8db6a8f7450 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Mon, 19 Sep 2022 22:24:42 +0200 Subject: [PATCH 0737/2223] dt-bindings: interrupt-controller: realtek,rtl-intc: require parents The interrupt router has 32 inputs, and up to 15 outputs connected to the MIPS CPU's interrupts. The way these are mapped to each other is runtime configurable. This controller can also mask individual interrupt sources, and has a status register to indicate pending interrupts. This means the controller is not transparent, and the use of "interrupt-map" inappropriate. Instead, a list of parent interrupts should be specified. Two-part compatibles are introduced to be able to require "interrupts" for new devicetrees. For backward compatibility "interrupt-map" is still allowed on these new compatibles, but deprecated. The old compatible, with required "interrupt-map" and "#address-cells", is also deprecated. The relevant descriptions are added or extended to more clearly describe the functionality of this controller. 
To prevent spurious changes to the binding when more SoCs are added, "allOf" is used with one "if", and the compatible enum only has one item. The example is updated to provide a correct example for RTL8380 SoCs. Signed-off-by: Sander Vanheule Reviewed-by: Rob Herring Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/ba3ae8e521ef82dd94f18a602ef53078f4a0d8d5.1663617425.git.sander@svanheule.net --- .../realtek,rtl-intc.yaml | 60 ++++++++++++++----- 1 file changed, 45 insertions(+), 15 deletions(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/realtek,rtl-intc.yaml b/Documentation/devicetree/bindings/interrupt-controller/realtek,rtl-intc.yaml index 9e76fff20323c..13a893b18fb64 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/realtek,rtl-intc.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/realtek,rtl-intc.yaml @@ -6,6 +6,14 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Realtek RTL SoC interrupt controller devicetree bindings +description: + Interrupt controller and router for Realtek MIPS SoCs, allowing each SoC + interrupt to be routed to one parent CPU (hardware) interrupt, or left + disconnected. + All connected input lines from SoC peripherals can be masked individually, + and an interrupt status register is present to indicate which interrupts are + pending. + maintainers: - Birger Koblitz - Bert Vermeulen @@ -13,23 +21,33 @@ maintainers: properties: compatible: - const: realtek,rtl-intc + oneOf: + - items: + - enum: + - realtek,rtl8380-intc + - const: realtek,rtl-intc + - const: realtek,rtl-intc + deprecated: true "#interrupt-cells": + description: + SoC interrupt line index. const: 1 reg: maxItems: 1 interrupts: - maxItems: 1 + minItems: 1 + maxItems: 15 + description: + List of parent interrupts, in the order that they are connected to this + interrupt router's outputs, starting at the first output. 
interrupt-controller: true - "#address-cells": - const: 0 - interrupt-map: + deprecated: true description: Describes mapping from SoC interrupts to CPU interrupts required: @@ -37,21 +55,33 @@ required: - reg - "#interrupt-cells" - interrupt-controller - - "#address-cells" - - interrupt-map + +allOf: + - if: + properties: + compatible: + const: realtek,rtl-intc + then: + properties: + "#address-cells": + const: 0 + required: + - "#address-cells" + - interrupt-map + else: + required: + - interrupts additionalProperties: false examples: - | - intc: interrupt-controller@3000 { - compatible = "realtek,rtl-intc"; + interrupt-controller@3000 { + compatible = "realtek,rtl8380-intc", "realtek,rtl-intc"; #interrupt-cells = <1>; interrupt-controller; - reg = <0x3000 0x20>; - #address-cells = <0>; - interrupt-map = - <31 &cpuintc 2>, - <30 &cpuintc 1>, - <29 &cpuintc 5>; + reg = <0x3000 0x18>; + + interrupt-parent = <&cpuintc>; + interrupts = <2>, <3>, <4>, <5>, <6>; }; -- GitLab From 9070f1ce31c5027821d5f37e9ca8dfb23158e457 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Mon, 19 Sep 2022 22:24:43 +0200 Subject: [PATCH 0738/2223] irqchip/realtek-rtl: use parent interrupts The interrupt-map property for "realtek,rtl-intc" has been deprecated in favor of a list of parent interrupts. Drop the open-coded parser for interrupt-map, and use the first parent interrupt instead. If no parent was provided, the driver will assume that this is the first hardware interrupt of the SoC's MIPS CPU for compatibility with the legacy binding. All SoC interrupts were treated equally, independent of which output they were actually routed to. This means the driver might as well route all interrupts to the first output, and achieve the same behaviour. Without the interrupt-map property, interrupt usage information is no longer available at initialisation. Routing setup will now happen later, when a hardware interrupt is mapped by the subsystem. 
Signed-off-by: Sander Vanheule Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/5f901a82eaa9d97cadf6e9b73a894a92f3f83b7c.1663617425.git.sander@svanheule.net --- drivers/irqchip/irq-realtek-rtl.c | 133 ++++++++++++++---------------- 1 file changed, 61 insertions(+), 72 deletions(-) diff --git a/drivers/irqchip/irq-realtek-rtl.c b/drivers/irqchip/irq-realtek-rtl.c index 160feae0ded7f..2a349082af81d 100644 --- a/drivers/irqchip/irq-realtek-rtl.c +++ b/drivers/irqchip/irq-realtek-rtl.c @@ -21,11 +21,33 @@ #define RTL_ICTL_IRR2 0x10 #define RTL_ICTL_IRR3 0x14 +#define RTL_ICTL_NUM_INPUTS 32 + #define REG(x) (realtek_ictl_base + x) static DEFINE_RAW_SPINLOCK(irq_lock); static void __iomem *realtek_ictl_base; +/* + * IRR0-IRR3 store 4 bits per interrupt, but Realtek uses inverted numbering, + * placing IRQ 31 in the first four bits. A routing value of '0' means the + * interrupt is left disconnected. Routing values {1..15} connect to output + * lines {0..14}. + */ +#define IRR_OFFSET(idx) (4 * (3 - (idx * 4) / 32)) +#define IRR_SHIFT(idx) ((idx * 4) % 32) + +static void write_irr(void __iomem *irr0, int idx, u32 value) +{ + unsigned int offset = IRR_OFFSET(idx); + unsigned int shift = IRR_SHIFT(idx); + u32 irr; + + irr = readl(irr0 + offset) & ~(0xf << shift); + irr |= (value & 0xf) << shift; + writel(irr, irr0 + offset); +} + static void realtek_ictl_unmask_irq(struct irq_data *i) { unsigned long flags; @@ -62,8 +84,14 @@ static struct irq_chip realtek_ictl_irq = { static int intc_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) { + unsigned long flags; + irq_set_chip_and_handler(irq, &realtek_ictl_irq, handle_level_irq); + raw_spin_lock_irqsave(&irq_lock, flags); + write_irr(REG(RTL_ICTL_IRR0), hw, 1); + raw_spin_unlock_irqrestore(&irq_lock, flags); + return 0; } @@ -95,89 +123,50 @@ out: chained_irq_exit(chip, desc); } -/* - * SoC interrupts are cascaded to MIPS CPU interrupts according to the - * interrupt-map in the device tree. 
Each SoC interrupt gets 4 bits for - * the CPU interrupt in an Interrupt Routing Register. Max 32 SoC interrupts - * thus go into 4 IRRs. A routing value of '0' means the interrupt is left - * disconnected. Routing values {1..15} connect to output lines {0..14}. - */ -static int __init map_interrupts(struct device_node *node, struct irq_domain *domain) -{ - struct device_node *cpu_ictl; - const __be32 *imap; - u32 imaplen, soc_int, cpu_int, tmp, regs[4]; - int ret, i, irr_regs[] = { - RTL_ICTL_IRR3, - RTL_ICTL_IRR2, - RTL_ICTL_IRR1, - RTL_ICTL_IRR0, - }; - u8 mips_irqs_set; - - ret = of_property_read_u32(node, "#address-cells", &tmp); - if (ret || tmp) - return -EINVAL; - - imap = of_get_property(node, "interrupt-map", &imaplen); - if (!imap || imaplen % 3) - return -EINVAL; - - mips_irqs_set = 0; - memset(regs, 0, sizeof(regs)); - for (i = 0; i < imaplen; i += 3 * sizeof(u32)) { - soc_int = be32_to_cpup(imap); - if (soc_int > 31) - return -EINVAL; - - cpu_ictl = of_find_node_by_phandle(be32_to_cpup(imap + 1)); - if (!cpu_ictl) - return -EINVAL; - ret = of_property_read_u32(cpu_ictl, "#interrupt-cells", &tmp); - of_node_put(cpu_ictl); - if (ret || tmp != 1) - return -EINVAL; - - cpu_int = be32_to_cpup(imap + 2); - if (cpu_int > 7 || cpu_int < 2) - return -EINVAL; - - if (!(mips_irqs_set & BIT(cpu_int))) { - irq_set_chained_handler_and_data(cpu_int, realtek_irq_dispatch, - domain); - mips_irqs_set |= BIT(cpu_int); - } - - /* Use routing values (1..6) for CPU interrupts (2..7) */ - regs[(soc_int * 4) / 32] |= (cpu_int - 1) << (soc_int * 4) % 32; - imap += 3; - } - - for (i = 0; i < 4; i++) - writel(regs[i], REG(irr_regs[i])); - - return 0; -} - static int __init realtek_rtl_of_init(struct device_node *node, struct device_node *parent) { + struct of_phandle_args oirq; struct irq_domain *domain; - int ret; + unsigned int soc_irq; + int parent_irq; realtek_ictl_base = of_iomap(node, 0); if (!realtek_ictl_base) return -ENXIO; - /* Disable all cascaded interrupts */ + /* 
Disable all cascaded interrupts and clear routing */ writel(0, REG(RTL_ICTL_GIMR)); + for (soc_irq = 0; soc_irq < RTL_ICTL_NUM_INPUTS; soc_irq++) + write_irr(REG(RTL_ICTL_IRR0), soc_irq, 0); + + if (WARN_ON(!of_irq_count(node))) { + /* + * If DT contains no parent interrupts, assume MIPS CPU IRQ 2 + * (HW0) is connected to the first output. This is the case for + * all known hardware anyway. "interrupt-map" is deprecated, so + * don't bother trying to parse that. + */ + oirq.np = of_find_compatible_node(NULL, NULL, "mti,cpu-interrupt-controller"); + oirq.args_count = 1; + oirq.args[0] = 2; + + parent_irq = irq_create_of_mapping(&oirq); + + of_node_put(oirq.np); + } else { + parent_irq = of_irq_get(node, 0); + } - domain = irq_domain_add_linear(node, 32, &irq_domain_ops, NULL); + if (parent_irq < 0) + return parent_irq; + else if (!parent_irq) + return -ENODEV; - ret = map_interrupts(node, domain); - if (ret) { - pr_err("invalid interrupt map\n"); - return ret; - } + domain = irq_domain_add_linear(node, RTL_ICTL_NUM_INPUTS, &irq_domain_ops, NULL); + if (!domain) + return -ENOMEM; + + irq_set_chained_handler_and_data(parent_irq, realtek_irq_dispatch, domain); return 0; } -- GitLab From aecd1de3b1438cc4ead086a025fb49a3a896d615 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Thu, 22 Sep 2022 11:12:41 -0500 Subject: [PATCH 0739/2223] platform-msi: Export symbol platform_msi_create_irq_domain() Allow irqchip drivers using platform MSI to be built as modules. 
Signed-off-by: Frank Li [maz: rewrote commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220922161246.20586-2-Frank.Li@nxp.com --- drivers/base/platform-msi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c index 296ea673d6615..12b044151298b 100644 --- a/drivers/base/platform-msi.c +++ b/drivers/base/platform-msi.c @@ -138,6 +138,7 @@ struct irq_domain *platform_msi_create_irq_domain(struct fwnode_handle *fwnode, return domain; } +EXPORT_SYMBOL_GPL(platform_msi_create_irq_domain); static int platform_msi_alloc_priv_data(struct device *dev, unsigned int nvec, irq_write_msi_msg_t write_msi_msg) -- GitLab From 334f7d42db3eb0274aa6b4aba7ce14d87df3fef0 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Thu, 22 Sep 2022 11:12:42 -0500 Subject: [PATCH 0740/2223] irqchip: Allow extra fields to be passed to IRQCHIP_PLATFORM_DRIVER_END IRQCHIP_PLATFORM_DRIVER_* doesn't allow some fields (such as .pm) to be set in the platform_driver structure. Make IRQCHIP_PLATFORM_DRIVER_END variadic so that .pm or another field can be set if needed. Signed-off-by: Frank Li [maz: revamped commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220922161246.20586-3-Frank.Li@nxp.com --- include/linux/irqchip.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h index 3a091d0710ae1..d5e6024cb2a8c 100644 --- a/include/linux/irqchip.h +++ b/include/linux/irqchip.h @@ -44,7 +44,8 @@ static const struct of_device_id drv_name##_irqchip_match_table[] = { #define IRQCHIP_MATCH(compat, fn) { .compatible = compat, \ .data = typecheck_irq_init_cb(fn), }, -#define IRQCHIP_PLATFORM_DRIVER_END(drv_name) \ + +#define IRQCHIP_PLATFORM_DRIVER_END(drv_name, ...) 
\ {}, \ }; \ MODULE_DEVICE_TABLE(of, drv_name##_irqchip_match_table); \ @@ -56,6 +57,7 @@ static struct platform_driver drv_name##_driver = { \ .owner = THIS_MODULE, \ .of_match_table = drv_name##_irqchip_match_table, \ .suppress_bind_attrs = true, \ + __VA_ARGS__ \ }, \ }; \ builtin_platform_driver(drv_name##_driver) -- GitLab From 448e711693e48d03f7933ab3673334701b0c3f41 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 19 Aug 2022 16:21:00 +0000 Subject: [PATCH 0741/2223] KVM: selftests: Update top-of-file comment in psci_test Fix the comment to accurately describe the test and recently added SYSTEM_SUSPEND test case. What was once psci_cpu_on_test was renamed and extended to squeeze in a test case for PSCI SYSTEM_SUSPEND. Nonetheless, the author of those changes (whoever they may be...) failed to update the file comment to reflect what had changed. Reported-by: Reiji Watanabe Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220819162100.213854-1-oliver.upton@linux.dev --- tools/testing/selftests/kvm/aarch64/psci_test.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index f7621f6e938e4..e0b9e81a3e091 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -1,12 +1,14 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * psci_cpu_on_test - Test that the observable state of a vCPU targeted by the - * CPU_ON PSCI call matches what the caller requested. + * psci_test - Tests relating to KVM's PSCI implementation. * * Copyright (c) 2021 Google LLC. * - * This is a regression test for a race between KVM servicing the PSCI call and - * userspace reading the vCPUs registers. + * This test includes: + * - A regression test for a race between KVM servicing the PSCI CPU_ON call + * and userspace reading the targeted vCPU's registers. 
+ * - A test for KVM's handling of PSCI SYSTEM_SUSPEND and the associated + * KVM_SYSTEM_EVENT_SUSPEND UAPI. */ #define _GNU_SOURCE -- GitLab From 600655cdc076fb7688887b3819628c9d0878601c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 28 Sep 2022 14:05:48 +0300 Subject: [PATCH 0742/2223] Input: icn8505 - utilize acpi_get_subsystem_id() Replace open coded variant of recently introduced acpi_get_subsystem_id(). Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220928110548.43955-1-andriy.shevchenko@linux.intel.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/chipone_icn8505.c | 30 +++++++-------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/drivers/input/touchscreen/chipone_icn8505.c b/drivers/input/touchscreen/chipone_icn8505.c index f9ca5502ac8c5..c421f4be27001 100644 --- a/drivers/input/touchscreen/chipone_icn8505.c +++ b/drivers/input/touchscreen/chipone_icn8505.c @@ -364,32 +364,20 @@ static irqreturn_t icn8505_irq(int irq, void *dev_id) static int icn8505_probe_acpi(struct icn8505_data *icn8505, struct device *dev) { - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; - const char *subsys = "unknown"; - struct acpi_device *adev; - union acpi_object *obj; - acpi_status status; - - adev = ACPI_COMPANION(dev); - if (!adev) - return -ENODEV; + const char *subsys; + int error; - status = acpi_evaluate_object(adev->handle, "_SUB", NULL, &buffer); - if (ACPI_SUCCESS(status)) { - obj = buffer.pointer; - if (obj->type == ACPI_TYPE_STRING) - subsys = obj->string.pointer; - else - dev_warn(dev, "Warning ACPI _SUB did not return a string\n"); - } else { - dev_warn(dev, "Warning ACPI _SUB failed: %#x\n", status); - buffer.pointer = NULL; - } + subsys = acpi_get_subsystem_id(ACPI_HANDLE(dev)); + error = PTR_ERR_OR_ZERO(subsys); + if (error == -ENODATA) + subsys = "unknown"; + else if (error) + return error; snprintf(icn8505->firmware_name, sizeof(icn8505->firmware_name), "chipone/icn8505-%s.fw", 
subsys); - kfree(buffer.pointer); + kfree_const(subsys); return 0; } -- GitLab From 25d0bef5d1d0dc2f919baa033be157d5c313994c Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Wed, 28 Sep 2022 09:02:34 -0700 Subject: [PATCH 0743/2223] Input: ibm-panel - add missing MODULE_DEVICE_TABLE This patch adds missing MODULE_DEVICE_TABLE definition which generates correct modalias for automatic loading of this driver when it is built as an external module. Signed-off-by: Zeng Heng Link: https://lore.kernel.org/r/20220928143133.1809491-1-zengheng4@huawei.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/ibm-panel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/misc/ibm-panel.c b/drivers/input/misc/ibm-panel.c index 094bcdb568f13..a8fba00547190 100644 --- a/drivers/input/misc/ibm-panel.c +++ b/drivers/input/misc/ibm-panel.c @@ -183,6 +183,7 @@ static const struct of_device_id ibm_panel_match[] = { { .compatible = "ibm,op-panel" }, { } }; +MODULE_DEVICE_TABLE(of, ibm_panel_match); static struct i2c_driver ibm_panel_driver = { .driver = { -- GitLab From bf3f11581893494a5fb01eb87b99627edc2a85ff Mon Sep 17 00:00:00 2001 From: Vipin Sharma Date: Wed, 21 Sep 2022 23:24:51 -0700 Subject: [PATCH 0744/2223] KVM: selftests: Check result in hyperv_features for successful hypercalls Commit cc5851c6be86 ("KVM: selftests: Use exception fixup for #UD/#GP Hyper-V MSR/hcall tests") introduced a wrong guest assert in guest_hcall(). It is not checking the successful hypercall results and only checks the result when a fault happens. GUEST_ASSERT_2(!hcall->ud_expected || res == hcall->expect, hcall->expect, res); Correct the assertion by only checking results of the successful hypercalls. This issue was observed when this test started failing after building it in Clang. Above guest assert statement fails because "res" is not equal to "hcall->expect" when "hcall->ud_expected" is true. "res" gets some garbage value in Clang from the RAX register. 
In GCC, RAX is 0 because it is using RAX for @output_address in the asm statement and resetting it to 0 before using it as output operand in the same asm statement. Clang is not using RAX for @output_address. Fixes: cc5851c6be86 ("KVM: selftests: Use exception fixup for #UD/#GP Hyper-V MSR/hcall tests") Signed-off-by: Vipin Sharma Suggested-by: Sean Christopherson Reviewed-by: Jim Mattson Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20220922062451.2927010-1-vipinsh@google.com [sean: wrap changelog at ~75 chars, move -EFAULT change to separate patch] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/hyperv_features.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index 79ab0152d2810..ad01868548f97 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -81,13 +81,13 @@ static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) } vector = hypercall(hcall->control, input, output, &res); - if (hcall->ud_expected) + if (hcall->ud_expected) { GUEST_ASSERT_2(vector == UD_VECTOR, hcall->control, vector); - else + } else { GUEST_ASSERT_2(!vector, hcall->control, vector); + GUEST_ASSERT_2(res == hcall->expect, hcall->expect, res); + } - GUEST_ASSERT_2(!hcall->ud_expected || res == hcall->expect, - hcall->expect, res); GUEST_DONE(); } -- GitLab From dfb45db43e9f6283a79230c8ea9cb589f14791b0 Mon Sep 17 00:00:00 2001 From: Vipin Sharma Date: Wed, 21 Sep 2022 23:24:51 -0700 Subject: [PATCH 0745/2223] KVM: selftests: Load RAX with -EFAULT before Hyper-V hypercall Load RAX with -EFAULT prior to making a Hyper-V hypercall so that tests can't get false negatives due to the compiler coincidentally loading the "right" value into RAX, i.e. to ensure that _KVM_ and not the compiler is correctly clearing RAX on a successful hypercall.
Note, initializing *hv_status (in C code) to -EFAULT is not sufficient to avoid false negatives, as the compiler can still "clobber" RAX and thus load garbage into *hv_status if the hypercall faults (or if KVM doesn't set RAX). Suggested-by: Sean Christopherson Signed-off-by: Vipin Sharma Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20220922062451.2927010-1-vipinsh@google.com [sean: move to separate patch, massage changelog] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/hyperv_features.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index ad01868548f97..4d55e038c2d79 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -26,7 +26,8 @@ static inline uint8_t hypercall(u64 control, vm_vaddr_t input_address, : "=a" (*hv_status), "+c" (control), "+d" (input_address), KVM_ASM_SAFE_OUTPUTS(vector) - : [output_address] "r"(output_address) + : [output_address] "r"(output_address), + "a" (-EFAULT) : "cc", "memory", "r8", KVM_ASM_SAFE_CLOBBERS); return vector; } -- GitLab From 31d3b871f5ee9b195d90b4d14b74a7864209c6e8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 22 Sep 2022 10:39:41 +0200 Subject: [PATCH 0746/2223] KVM: selftests: Don't set reserved bits for invalid Hyper-V hypercall number Bits 27 through 31 in Hyper-V hypercall 'control' are reserved (see HV_HYPERCALL_RSVD0_MASK) but '0xdeadbeef' includes them. This causes KVM to return HV_STATUS_INVALID_HYPERCALL_INPUT instead of the expected HV_STATUS_INVALID_HYPERCALL_CODE. 
Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/all/87fsgjol20.fsf@redhat.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/hyperv_features.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index 4d55e038c2d79..05b32e550a802 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -508,7 +508,7 @@ static void guest_test_hcalls_access(void) switch (stage) { case 0: feat->eax |= HV_MSR_HYPERCALL_AVAILABLE; - hcall->control = 0xdeadbeef; + hcall->control = 0xbeef; hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE; break; -- GitLab From c23981df6642eec1da94a8125ec0ec402f7b1b7b Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 31 Aug 2022 16:53:22 +0800 Subject: [PATCH 0747/2223] KVM: x86/pmu: Avoid setting BIT_ULL(-1) to pmu->host_cross_mapped_mask In the extreme case of host counters multiplexing and contention, the perf_event requested by the guest's pebs counter is not allocated to any actual physical counter, in which case hw.idx is bookkept as -1, resulting in an out-of-bounds access to host_cross_mapped_mask. 
Fixes: 854250329c02 ("KVM: x86/pmu: Disable guest PEBS temporarily in two rare situations") Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20220831085328.45489-2-likexu@tencent.com [sean: expand comment to explain how a negative idx can be encountered] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/pmu_intel.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index c399637a3a79b..78dec4dc6e6fa 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -776,20 +776,23 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) { struct kvm_pmc *pmc = NULL; - int bit; + int bit, hw_idx; for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX) { pmc = intel_pmc_idx_to_pmc(pmu, bit); if (!pmc || !pmc_speculative_in_use(pmc) || - !intel_pmc_is_enabled(pmc)) + !intel_pmc_is_enabled(pmc) || !pmc->perf_event) continue; - if (pmc->perf_event && pmc->idx != pmc->perf_event->hw.idx) { - pmu->host_cross_mapped_mask |= - BIT_ULL(pmc->perf_event->hw.idx); - } + /* + * A negative index indicates the event isn't mapped to a + * physical counter in the host, e.g. due to contention. + */ + hw_idx = pmc->perf_event->hw.idx; + if (hw_idx != pmc->idx && hw_idx > -1) + pmu->host_cross_mapped_mask |= BIT_ULL(hw_idx); } } -- GitLab From f331601c65ad217a5c000ce20c26266d3f0aceb3 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 31 Aug 2022 16:53:23 +0800 Subject: [PATCH 0748/2223] KVM: x86/pmu: Don't generate PEBS records for emulated instructions KVM will accumulate an enabled counter for at least INSTRUCTIONS or BRANCH_INSTRUCTION hw event from any KVM emulated instructions, generating emulated overflow interrupt on counter overflow, which in theory should also happen when the PEBS counter overflows but it currently lacks this part of the underlying support (e.g. 
through software injection of records in the irq context or a lazy approach). In this case, KVM skips the injection of this BUFFER_OVF PMI (effectively dropping one PEBS record) and let the overflow counter move on. The loss of a single sample does not introduce a loss of accuracy, but is easily noticeable for certain specific instructions. This issue is expected to be addressed along with the issue of PEBS cross-mapped counters with a slow-path proposal. Fixes: 79f3e3b58386 ("KVM: x86/pmu: Reprogram PEBS event to emulate guest PEBS counter") Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20220831085328.45489-3-likexu@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 02f9e4f245bd0..390d697efde14 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -106,9 +106,19 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) return; if (pmc->perf_event && pmc->perf_event->attr.precise_ip) { - /* Indicate PEBS overflow PMI to guest. */ - skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, - (unsigned long *)&pmu->global_status); + if (!in_pmi) { + /* + * TODO: KVM is currently _choosing_ to not generate records + * for emulated instructions, avoiding BUFFER_OVF PMI when + * there are no records. Strictly speaking, it should be done + * as well in the right context to improve sampling accuracy. + */ + skip_pmi = true; + } else { + /* Indicate PEBS overflow PMI to guest. 
*/ + skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, + (unsigned long *)&pmu->global_status); + } } else { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); } -- GitLab From c0245b774203f7341ddb1cce29a6ee607857f325 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Thu, 22 Sep 2022 13:40:38 -0700 Subject: [PATCH 0749/2223] KVM: x86/pmu: Refactor PERF_GLOBAL_CTRL update helper for reuse by PEBS Extract the "global ctrl" specific bits out of global_ctrl_changed() so that the helper only deals with reprogramming general purpose counters, and rename the helper accordingly. PEBS needs the same logic, i.e needs to reprogram counters associated when PEBS_ENABLE bits are toggled, and will use the helper in a future fix. No functional change intended. Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20220831085328.45489-4-likexu@tencent.com [sean: split to separate patch, write changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/pmu_intel.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 78dec4dc6e6fa..5592b1259e1bb 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -68,15 +68,11 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) } } -/* function is called when global control register has been updated. 
*/ -static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) +static void reprogram_counters(struct kvm_pmu *pmu, u64 diff) { int bit; - u64 diff = pmu->global_ctrl ^ data; struct kvm_pmc *pmc; - pmu->global_ctrl = data; - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) { pmc = intel_pmc_idx_to_pmc(pmu, bit); if (pmc) @@ -397,7 +393,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) struct kvm_pmc *pmc; u32 msr = msr_info->index; u64 data = msr_info->data; - u64 reserved_bits; + u64 reserved_bits, diff; switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: @@ -418,7 +414,9 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (pmu->global_ctrl == data) return 0; if (kvm_valid_perf_global_ctrl(pmu, data)) { - global_ctrl_changed(pmu, data); + diff = pmu->global_ctrl ^ data; + pmu->global_ctrl = data; + reprogram_counters(pmu, diff); return 0; } break; -- GitLab From cf52de619c67bd1f6b1cf2751c3827815f74a5a5 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 31 Aug 2022 16:53:24 +0800 Subject: [PATCH 0750/2223] KVM: x86/pmu: Avoid using PEBS perf_events for normal counters The check logic in the pmc_resume_counter() to determine whether a perf_event is reusable is partial and flawed, especially when it comes to a pseudocode sequence (contrived, but valid) like: - enabling a counter and its PEBS bit - enable global_ctrl - run workload - disable only the PEBS bit, leaving the global_ctrl bit enabled In this corner case, a perf_event created for PEBS can be reused by a normal counter before it has been released and recreated, and when this normal counter overflows, it triggers a PEBS interrupt (precise_ip != 0). To address this issue, reprogram all affected counters when PEBS_ENABLE changes and reuse a counter if and only if PEBS exactly matches precise.
Fixes: 79f3e3b58386 ("KVM: x86/pmu: Reprogram PEBS event to emulate guest PEBS counter") Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20220831085328.45489-4-likexu@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 4 ++-- arch/x86/kvm/vmx/pmu_intel.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 390d697efde14..d9b9a0f0db17c 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -237,8 +237,8 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) get_sample_period(pmc, pmc->counter))) return false; - if (!test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) && - pmc->perf_event->attr.precise_ip) + if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) != + (!!pmc->perf_event->attr.precise_ip)) return false; /* reuse perf_event to serve as pmc_reprogram_counter() does*/ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5592b1259e1bb..25b70a85bef54 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -431,7 +431,9 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (pmu->pebs_enable == data) return 0; if (!(data & pmu->pebs_enable_mask)) { + diff = pmu->pebs_enable ^ data; pmu->pebs_enable = data; + reprogram_counters(pmu, diff); return 0; } break; -- GitLab From 5c6a67f4f265f84e1b8582f82562dda2a53f52d1 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 31 Aug 2022 16:53:27 +0800 Subject: [PATCH 0751/2223] KVM: x86/svm/pmu: Direct access pmu->gp_counter[] to implement amd_*_to_pmc() Access PMU counters on AMD by directly indexing the array of general purpose counters instead of translating the PMC index to an MSR index. AMD only supports gp counters, there's no need to translate a PMC index to an MSR index and back to a PMC index. 
Opportunistically apply array_index_nospec() to reduce the attack surface for speculative execution and remove the dead code. Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20220831085328.45489-7-likexu@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/pmu.c | 41 +++++------------------------------------ 1 file changed, 5 insertions(+), 36 deletions(-) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index f24613a108c53..d1c3b766841e3 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -33,23 +33,6 @@ enum index { INDEX_ERROR, }; -static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type) -{ - struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); - - if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { - if (type == PMU_TYPE_COUNTER) - return MSR_F15H_PERF_CTR; - else - return MSR_F15H_PERF_CTL; - } else { - if (type == PMU_TYPE_COUNTER) - return MSR_K7_PERFCTR0; - else - return MSR_K7_EVNTSEL0; - } -} - static enum index msr_to_index(u32 msr) { switch (msr) { @@ -141,18 +124,12 @@ static bool amd_pmc_is_enabled(struct kvm_pmc *pmc) static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) { - unsigned int base = get_msr_base(pmu, PMU_TYPE_COUNTER); - struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + unsigned int num_counters = pmu->nr_arch_gp_counters; - if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { - /* - * The idx is contiguous. The MSRs are not. The counter MSRs - * are interleaved with the event select MSRs. 
- */ - pmc_idx *= 2; - } + if (pmc_idx >= num_counters) + return NULL; - return get_gp_pmc_amd(pmu, base + pmc_idx, PMU_TYPE_COUNTER); + return &pmu->gp_counters[array_index_nospec(pmc_idx, num_counters)]; } static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) @@ -168,15 +145,7 @@ static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask) { - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *counters; - - idx &= ~(3u << 30); - if (idx >= pmu->nr_arch_gp_counters) - return NULL; - counters = pmu->gp_counters; - - return &counters[idx]; + return amd_pmc_idx_to_pmc(vcpu_to_pmu(vcpu), idx & ~(3u << 30)); } static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) -- GitLab From ea5cbc9ff839091a86558d4e2c082225b13e0055 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 31 Aug 2022 16:53:28 +0800 Subject: [PATCH 0752/2223] KVM: x86/svm/pmu: Rewrite get_gp_pmc_amd() for more counters scalability If the number of AMD gp counters continues to grow, the code will be very clumsy and the switch-case design of inline get_gp_pmc_amd() will also bloat the kernel text size. The target code is taught to manage two groups of MSRs, each representing a different version of the AMD PMU counter MSRs. The MSR addresses of each group are contiguous, with no holes, and there is no intersection between two sets of addresses, but they are discrete in functionality by design like this: [Group A : All counter MSRs are tightly bound to all event select MSRs ] MSR_K7_EVNTSEL0 0xc0010000 MSR_K7_EVNTSELi 0xc0010000 + i ... MSR_K7_EVNTSEL3 0xc0010003 MSR_K7_PERFCTR0 0xc0010004 MSR_K7_PERFCTRi 0xc0010004 + i ... MSR_K7_PERFCTR3 0xc0010007 [Group B : The counter MSRs are interleaved with the event select MSRs ] MSR_F15H_PERF_CTL0 0xc0010200 MSR_F15H_PERF_CTR0 (0xc0010200 + 1) ... 
MSR_F15H_PERF_CTLi (0xc0010200 + 2 * i) MSR_F15H_PERF_CTRi (0xc0010200 + 2 * i + 1) ... MSR_F15H_PERF_CTL5 (0xc0010200 + 2 * 5) MSR_F15H_PERF_CTR5 (0xc0010200 + 2 * 5 + 1) Rewrite get_gp_pmc_amd() in this way: first determine which group of registers is accessed, then determine if it matches its requested type, applying different scaling ratios respectively, and finally get pmc_idx to pass into amd_pmc_idx_to_pmc(). Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20220831085328.45489-8-likexu@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/pmu.c | 88 ++++++++++-------------------------------- 1 file changed, 20 insertions(+), 68 deletions(-) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index d1c3b766841e3..b68956299fa8e 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -23,90 +23,52 @@ enum pmu_type { PMU_TYPE_EVNTSEL, }; -enum index { - INDEX_ZERO = 0, - INDEX_ONE, - INDEX_TWO, - INDEX_THREE, - INDEX_FOUR, - INDEX_FIVE, - INDEX_ERROR, -}; - -static enum index msr_to_index(u32 msr) +static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) { - switch (msr) { - case MSR_F15H_PERF_CTL0: - case MSR_F15H_PERF_CTR0: - case MSR_K7_EVNTSEL0: - case MSR_K7_PERFCTR0: - return INDEX_ZERO; - case MSR_F15H_PERF_CTL1: - case MSR_F15H_PERF_CTR1: - case MSR_K7_EVNTSEL1: - case MSR_K7_PERFCTR1: - return INDEX_ONE; - case MSR_F15H_PERF_CTL2: - case MSR_F15H_PERF_CTR2: - case MSR_K7_EVNTSEL2: - case MSR_K7_PERFCTR2: - return INDEX_TWO; - case MSR_F15H_PERF_CTL3: - case MSR_F15H_PERF_CTR3: - case MSR_K7_EVNTSEL3: - case MSR_K7_PERFCTR3: - return INDEX_THREE; - case MSR_F15H_PERF_CTL4: - case MSR_F15H_PERF_CTR4: - return INDEX_FOUR; - case MSR_F15H_PERF_CTL5: - case MSR_F15H_PERF_CTR5: - return INDEX_FIVE; - default: - return INDEX_ERROR; - } + unsigned int num_counters = pmu->nr_arch_gp_counters; + + if (pmc_idx >= num_counters) + return NULL; + + return &pmu->gp_counters[array_index_nospec(pmc_idx, 
num_counters)]; } static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, enum pmu_type type) { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + unsigned int idx; if (!vcpu->kvm->arch.enable_pmu) return NULL; switch (msr) { - case MSR_F15H_PERF_CTL0: - case MSR_F15H_PERF_CTL1: - case MSR_F15H_PERF_CTL2: - case MSR_F15H_PERF_CTL3: - case MSR_F15H_PERF_CTL4: - case MSR_F15H_PERF_CTL5: + case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) return NULL; - fallthrough; + /* + * Each PMU counter has a pair of CTL and CTR MSRs. CTLn + * MSRs (accessed via EVNTSEL) are even, CTRn MSRs are odd. + */ + idx = (unsigned int)((msr - MSR_F15H_PERF_CTL0) / 2); + if (!(msr & 0x1) != (type == PMU_TYPE_EVNTSEL)) + return NULL; + break; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: if (type != PMU_TYPE_EVNTSEL) return NULL; + idx = msr - MSR_K7_EVNTSEL0; break; - case MSR_F15H_PERF_CTR0: - case MSR_F15H_PERF_CTR1: - case MSR_F15H_PERF_CTR2: - case MSR_F15H_PERF_CTR3: - case MSR_F15H_PERF_CTR4: - case MSR_F15H_PERF_CTR5: - if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) - return NULL; - fallthrough; case MSR_K7_PERFCTR0 ... 
MSR_K7_PERFCTR3: if (type != PMU_TYPE_COUNTER) return NULL; + idx = msr - MSR_K7_PERFCTR0; break; default: return NULL; } - return &pmu->gp_counters[msr_to_index(msr)]; + return amd_pmc_idx_to_pmc(pmu, idx); } static bool amd_hw_event_available(struct kvm_pmc *pmc) @@ -122,16 +84,6 @@ static bool amd_pmc_is_enabled(struct kvm_pmc *pmc) return true; } -static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) -{ - unsigned int num_counters = pmu->nr_arch_gp_counters; - - if (pmc_idx >= num_counters) - return NULL; - - return &pmu->gp_counters[array_index_nospec(pmc_idx, num_counters)]; -} - static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); -- GitLab From c85c36798bc2ed12af04b6cc274aed5b02984647 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 28 Sep 2022 14:53:33 -0700 Subject: [PATCH 0753/2223] Input: ims-pcu - fix spelling mistake "BOOLTLOADER" -> "BOOTLOADER" There is a spelling mistake in a dev_err message. Fix it. 
Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20220928211003.61872-1-colin.i.king@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/ims-pcu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/misc/ims-pcu.c b/drivers/input/misc/ims-pcu.c index 6f38aa23a1ff6..b2f1292e27ef7 100644 --- a/drivers/input/misc/ims-pcu.c +++ b/drivers/input/misc/ims-pcu.c @@ -744,7 +744,7 @@ static int ims_pcu_switch_to_bootloader(struct ims_pcu *pcu) error = ims_pcu_execute_command(pcu, JUMP_TO_BTLDR, NULL, 0); if (error) { dev_err(pcu->dev, - "Failure when sending JUMP TO BOOLTLOADER command, error: %d\n", + "Failure when sending JUMP TO BOOTLOADER command, error: %d\n", error); return error; } -- GitLab From d218fe04335183518009f29f3270ec4dde1b66a2 Mon Sep 17 00:00:00 2001 From: Nate Yocom Date: Wed, 28 Sep 2022 18:10:35 -0700 Subject: [PATCH 0754/2223] Input: xpad - add X-Box Adaptive support Adds correct VID/PID for this XTYPE_XBOXONE compatible controller to xpad_device[] table. 
Signed-off-by: Nate Yocom Tested-by: Bastien Nocera Reviewed-by: Mattijs Korpershoek Link: https://lore.kernel.org/r/20220908173930.28940-2-nate@yocom.org Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 60859008372c9..1058ed28be491 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -148,6 +148,7 @@ static const struct xpad_device { { 0x045e, 0x0b00, "Microsoft X-Box One Elite 2 pad", MAP_PADDLES, XTYPE_XBOXONE }, { 0x045e, 0x02ea, "Microsoft X-Box One S pad", 0, XTYPE_XBOXONE }, { 0x045e, 0x0719, "Xbox 360 Wireless Receiver", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360W }, + { 0x045e, 0x0b0a, "Microsoft X-Box Adaptive Controller", 0, XTYPE_XBOXONE }, { 0x045e, 0x0b12, "Microsoft Xbox Series S|X Controller", MAP_SELECT_BUTTON, XTYPE_XBOXONE }, { 0x046d, 0xc21d, "Logitech Gamepad F310", 0, XTYPE_XBOX360 }, { 0x046d, 0xc21e, "Logitech Gamepad F510", 0, XTYPE_XBOX360 }, -- GitLab From f45aaae6204d1c7b0200ce043102ec84d805ac34 Mon Sep 17 00:00:00 2001 From: Nate Yocom Date: Wed, 28 Sep 2022 18:18:54 -0700 Subject: [PATCH 0755/2223] Input: xpad - add X-Box Adaptive XBox button Adaptive controller sets 0x02 bit for this button, all others set 0x01 so presence of either is used for BTN_MODE. 
Signed-off-by: Nate Yocom Tested-by: Bastien Nocera Reviewed-by: Mattijs Korpershoek --- drivers/input/joystick/xpad.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 1058ed28be491..c66ae73523d2c 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -969,7 +969,8 @@ static void xpadone_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char if (data[1] == (GIP_OPT_ACK | GIP_OPT_INTERNAL)) xpadone_ack_mode_report(xpad, data[2]); - input_report_key(dev, BTN_MODE, data[4] & BIT(0)); + input_report_key(dev, BTN_MODE, data[4] & GENMASK(1, 0)); + input_sync(dev); do_sync = true; } else if (data[0] == GIP_CMD_FIRMWARE) { -- GitLab From 1260cd04a601e0e02e09fa332111b8639611970d Mon Sep 17 00:00:00 2001 From: Nate Yocom Date: Wed, 28 Sep 2022 18:23:22 -0700 Subject: [PATCH 0756/2223] Input: add ABS_PROFILE to uapi and documentation Define new ABS_PROFILE axis for input devices which need it, e.g. X-Box Adaptive Controller and X-Box Elite 2. Signed-off-by: Nate Yocom Link: https://lore.kernel.org/r/20220908173930.28940-4-nate@yocom.org Signed-off-by: Dmitry Torokhov --- Documentation/input/event-codes.rst | 6 ++++++ Documentation/input/gamepad.rst | 6 ++++++ drivers/hid/hid-debug.c | 3 ++- include/uapi/linux/input-event-codes.h | 1 + 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/Documentation/input/event-codes.rst b/Documentation/input/event-codes.rst index 8741d390b1843..b4557462edd7b 100644 --- a/Documentation/input/event-codes.rst +++ b/Documentation/input/event-codes.rst @@ -235,6 +235,12 @@ A few EV_ABS codes have special meanings: BTN_TOOL_ signals the type of tool that is currently detected by the hardware and is otherwise independent of ABS_DISTANCE and/or BTN_TOUCH. +* ABS_PROFILE: + + - Used to describe the state of a multi-value profile switch. 
An event is + emitted only when the selected profile changes, indicating the newly + selected profile value. + * ABS_MT_: - Used to describe multitouch input events. Please see diff --git a/Documentation/input/gamepad.rst b/Documentation/input/gamepad.rst index 4d5e7fb80a845..71019de460367 100644 --- a/Documentation/input/gamepad.rst +++ b/Documentation/input/gamepad.rst @@ -189,3 +189,9 @@ Gamepads report the following events: - Rumble: Rumble is advertised as FF_RUMBLE. + +- Profile: + + Some pads provide a multi-value profile selection switch. An example is the + XBox Adaptive and the XBox Elite 2 controllers. When the active profile is + switched, its newly selected value is emitted as an ABS_PROFILE event. diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 81e7e404a5fce..2ca6ab600bc9f 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -1014,7 +1014,8 @@ static const char *absolutes[ABS_CNT] = { [ABS_HAT3Y] = "Hat 3Y", [ABS_PRESSURE] = "Pressure", [ABS_DISTANCE] = "Distance", [ABS_TILT_X] = "XTilt", [ABS_TILT_Y] = "YTilt", [ABS_TOOL_WIDTH] = "ToolWidth", - [ABS_VOLUME] = "Volume", [ABS_MISC] = "Misc", + [ABS_VOLUME] = "Volume", [ABS_PROFILE] = "Profile", + [ABS_MISC] = "Misc", [ABS_MT_TOUCH_MAJOR] = "MTMajor", [ABS_MT_TOUCH_MINOR] = "MTMinor", [ABS_MT_WIDTH_MAJOR] = "MTMajorW", diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index dff8e7f170748..7ad931a329706 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -862,6 +862,7 @@ #define ABS_TOOL_WIDTH 0x1c #define ABS_VOLUME 0x20 +#define ABS_PROFILE 0x21 #define ABS_MISC 0x28 -- GitLab From fff1011a26d6cbf26b18c8ee4c61d99943174f8c Mon Sep 17 00:00:00 2001 From: Nate Yocom Date: Wed, 28 Sep 2022 18:23:49 -0700 Subject: [PATCH 0757/2223] Input: xpad - add X-Box Adaptive Profile button Adds a new quirk for controllers that have a Profile button which has 4 states, reflected as an 
ABS_PROFILE axis with 4 values. Signed-off-by: Nate Yocom Tested-by: Bastien Nocera Link: https://lore.kernel.org/r/20220908173930.28940-6-nate@yocom.org Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index c66ae73523d2c..2959d80f7fdb6 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -81,7 +81,9 @@ #define MAP_TRIGGERS_TO_BUTTONS (1 << 1) #define MAP_STICKS_TO_NULL (1 << 2) #define MAP_SELECT_BUTTON (1 << 3) -#define MAP_PADDLES (1 << 4) +#define MAP_PADDLES (1 << 4) +#define MAP_PROFILE_BUTTON (1 << 5) + #define DANCEPAD_MAP_CONFIG (MAP_DPAD_TO_BUTTONS | \ MAP_TRIGGERS_TO_BUTTONS | MAP_STICKS_TO_NULL) @@ -148,7 +150,7 @@ static const struct xpad_device { { 0x045e, 0x0b00, "Microsoft X-Box One Elite 2 pad", MAP_PADDLES, XTYPE_XBOXONE }, { 0x045e, 0x02ea, "Microsoft X-Box One S pad", 0, XTYPE_XBOXONE }, { 0x045e, 0x0719, "Xbox 360 Wireless Receiver", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360W }, - { 0x045e, 0x0b0a, "Microsoft X-Box Adaptive Controller", 0, XTYPE_XBOXONE }, + { 0x045e, 0x0b0a, "Microsoft X-Box Adaptive Controller", MAP_PROFILE_BUTTON, XTYPE_XBOXONE }, { 0x045e, 0x0b12, "Microsoft Xbox Series S|X Controller", MAP_SELECT_BUTTON, XTYPE_XBOXONE }, { 0x046d, 0xc21d, "Logitech Gamepad F310", 0, XTYPE_XBOX360 }, { 0x046d, 0xc21e, "Logitech Gamepad F510", 0, XTYPE_XBOX360 }, @@ -777,6 +779,10 @@ static void xpad_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char *d input_report_key(dev, BTN_C, data[8]); input_report_key(dev, BTN_Z, data[9]); + /* Profile button has a value of 0-3, so it is reported as an axis */ + if (xpad->mapping & MAP_PROFILE_BUTTON) + input_report_abs(dev, ABS_PROFILE, data[34]); + input_sync(dev); } @@ -1800,6 +1806,9 @@ static void xpad_set_up_abs(struct input_dev *input_dev, signed short abs) case ABS_HAT0Y: /* the d-pad (only if dpad is 
mapped to axes */ input_set_abs_params(input_dev, abs, -1, 1, 0, 0); break; + case ABS_PROFILE: /* 4 value profile button (such as on XAC) */ + input_set_abs_params(input_dev, abs, 0, 4, 0, 0); + break; default: input_set_abs_params(input_dev, abs, 0, 0, 0, 0); break; @@ -1898,6 +1907,10 @@ static int xpad_init_input(struct usb_xpad *xpad) xpad_set_up_abs(input_dev, xpad_abs_triggers[i]); } + /* setup profile button as an axis with 4 possible values */ + if (xpad->mapping & MAP_PROFILE_BUTTON) + xpad_set_up_abs(input_dev, ABS_PROFILE); + error = xpad_init_ff(xpad); if (error) goto err_free_input; -- GitLab From 43b233b1582de501e441deb7c4ed1f944e60b1f9 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chang Date: Thu, 29 Sep 2022 12:28:39 +0800 Subject: [PATCH 0758/2223] KVM: arm64: Fix comment typo in nvhe/switch.c Fix the comment of __hyp_vgic_restore_state() from saying VEH to VHE, also change the underscore to a dash to match the comment above __hyp_vgic_save_state(). Signed-off-by: Wei-Lin Chang Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220929042839.24277-1-r09922117@csie.ntu.edu.tw --- arch/arm64/kvm/hyp/nvhe/switch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 9f63857020618..8e9d49a964be6 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -143,7 +143,7 @@ static void __hyp_vgic_save_state(struct kvm_vcpu *vcpu) } } -/* Restore VGICv3 state on non_VEH systems */ +/* Restore VGICv3 state on non-VHE systems */ static void __hyp_vgic_restore_state(struct kvm_vcpu *vcpu) { if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { -- GitLab From b623023225abed7a7d76cf1cc9f7187c1a3e7cff Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 28 Sep 2022 17:54:20 +0200 Subject: [PATCH 0759/2223] PCI: qcom: Drop unused post_deinit callback Drop the unused and confusingly named post_deinit callback that was added for the 
now removed pipe clock handling. If ever needed we can add back a callback named pre_deinit (or perhaps rather pre_phy_power_off) instead. Link: https://lore.kernel.org/r/20220928155421.21660-2-johan+linaro@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/pcie-qcom.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index 39ca06ffe6149..8d6df0db4ebba 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -208,7 +208,6 @@ struct qcom_pcie_ops { int (*init)(struct qcom_pcie *pcie); int (*post_init)(struct qcom_pcie *pcie); void (*deinit)(struct qcom_pcie *pcie); - void (*post_deinit)(struct qcom_pcie *pcie); void (*ltssm_enable)(struct qcom_pcie *pcie); int (*config_sid)(struct qcom_pcie *pcie); }; @@ -1520,8 +1519,6 @@ static int qcom_pcie_host_init(struct dw_pcie_rp *pp) err: qcom_ep_reset_assert(pcie); - if (pcie->cfg->ops->post_deinit) - pcie->cfg->ops->post_deinit(pcie); err_disable_phy: phy_power_off(pcie->phy); err_deinit: -- GitLab From 0e4d9a5cc7670d59e73cc372263a7417330aa56f Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 28 Sep 2022 17:54:21 +0200 Subject: [PATCH 0760/2223] PCI: qcom: Rename host-init error label Use a more descriptive name for the reset host-init error label for consistency. 
Link: https://lore.kernel.org/r/20220928155421.21660-3-johan+linaro@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/pcie-qcom.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index 8d6df0db4ebba..f711acacaeaf8 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -1512,12 +1512,12 @@ static int qcom_pcie_host_init(struct dw_pcie_rp *pp) if (pcie->cfg->ops->config_sid) { ret = pcie->cfg->ops->config_sid(pcie); if (ret) - goto err; + goto err_assert_reset; } return 0; -err: +err_assert_reset: qcom_ep_reset_assert(pcie); err_disable_phy: phy_power_off(pcie->phy); -- GitLab From 8bb7ff12a91429eb76e093b517ae810b146448fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Wed, 28 Sep 2022 14:19:11 +0200 Subject: [PATCH 0761/2223] PCI: tegra: Use PCI_CONF1_EXT_ADDRESS() macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify pci-tegra.c driver code and use new PCI_CONF1_EXT_ADDRESS() macro for accessing PCI config space. Link: https://lore.kernel.org/r/20220928121911.14994-1-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Acked-by: Thierry Reding --- drivers/pci/controller/pci-tegra.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/pci/controller/pci-tegra.c b/drivers/pci/controller/pci-tegra.c index 8e323e93be915..24478ae5a345d 100644 --- a/drivers/pci/controller/pci-tegra.c +++ b/drivers/pci/controller/pci-tegra.c @@ -415,13 +415,6 @@ static inline u32 pads_readl(struct tegra_pcie *pcie, unsigned long offset) * address (access to which generates correct config transaction) falls in * this 4 KiB region. 
*/ -static unsigned int tegra_pcie_conf_offset(u8 bus, unsigned int devfn, - unsigned int where) -{ - return ((where & 0xf00) << 16) | (bus << 16) | (PCI_SLOT(devfn) << 11) | - (PCI_FUNC(devfn) << 8) | (where & 0xff); -} - static void __iomem *tegra_pcie_map_bus(struct pci_bus *bus, unsigned int devfn, int where) @@ -443,7 +436,9 @@ static void __iomem *tegra_pcie_map_bus(struct pci_bus *bus, unsigned int offset; u32 base; - offset = tegra_pcie_conf_offset(bus->number, devfn, where); + offset = PCI_CONF1_EXT_ADDRESS(bus->number, PCI_SLOT(devfn), + PCI_FUNC(devfn), where) & + ~PCI_CONF1_ENABLE; /* move 4 KiB window to offset within the FPCI region */ base = 0xfe100000 + ((offset & ~(SZ_4K - 1)) >> 8); -- GitLab From 8929bc9659640f35dd2ef8373263cbd885b4a072 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Sep 2022 15:51:15 +0100 Subject: [PATCH 0762/2223] KVM: Use acquire/release semantics when accessing dirty ring GFN state The current implementation of the dirty ring has an implicit requirement that stores to the dirty ring from userspace must be: - ordered with one another - visible from another CPU executing a ring reset While these implicit requirements work well for x86 (and any other TSO-like architecture), they do not work for more relaxed architectures such as arm64 where stores to different addresses can be freely reordered, and loads from these addresses not observing writes from another CPU unless the required barriers (or acquire/release semantics) are used. In order to start fixing this, upgrade the ring reset accesses: - the kvm_dirty_gfn_harvested() helper now uses acquire semantics so it is ordered after all previous writes, including those from userspace - the kvm_dirty_gfn_set_invalid() helper now uses release semantics so that the next_slot and next_offset reads don't drift past the entry invalidation This is only a partial fix as the userspace side also needs upgrading.
Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20220926145120.27974-2-maz@kernel.org --- virt/kvm/dirty_ring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index f4c2a6eb1666b..d6fabf238032a 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -74,7 +74,7 @@ int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size) static inline void kvm_dirty_gfn_set_invalid(struct kvm_dirty_gfn *gfn) { - gfn->flags = 0; + smp_store_release(&gfn->flags, 0); } static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn) @@ -84,7 +84,7 @@ static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn) static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn) { - return gfn->flags & KVM_DIRTY_GFN_F_RESET; + return smp_load_acquire(&gfn->flags) & KVM_DIRTY_GFN_F_RESET; } int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring) -- GitLab From 17601bfed909fa080fcfd227b57da2bd4dc2d2a6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Sep 2022 15:51:16 +0100 Subject: [PATCH 0763/2223] KVM: Add KVM_CAP_DIRTY_LOG_RING_ACQ_REL capability and config option In order to differentiate between architectures that require no extra synchronisation when accessing the dirty ring and those that do, add a new capability (KVM_CAP_DIRTY_LOG_RING_ACQ_REL) that identifies the latter sort. TSO architectures can obviously advertise both, while relaxed architectures must only advertise the ACQ_REL version.
This requires some configuration symbol rejigging, with HAVE_KVM_DIRTY_RING being only indirectly selected by two top-level config symbols: - HAVE_KVM_DIRTY_RING_TSO for strongly ordered architectures (x86) - HAVE_KVM_DIRTY_RING_ACQ_REL for weakly ordered architectures (arm64) Suggested-by: Paolo Bonzini Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20220926145120.27974-3-maz@kernel.org --- arch/x86/kvm/Kconfig | 2 +- include/uapi/linux/kvm.h | 1 + virt/kvm/Kconfig | 14 ++++++++++++++ virt/kvm/kvm_main.c | 9 ++++++++- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index e3cbd77061364..876748b236ffe 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -28,7 +28,7 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE select HAVE_KVM_IRQFD - select HAVE_KVM_DIRTY_RING + select HAVE_KVM_DIRTY_RING_TSO select IRQ_BYPASS_MANAGER select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index eed0315a77a6d..0d5d4419139ae 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1177,6 +1177,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220 #define KVM_CAP_S390_ZPCI_OP 221 #define KVM_CAP_S390_CPU_TOPOLOGY 222 +#define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index a8c5c9f06b3cf..800f9470e36b1 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -19,6 +19,20 @@ config HAVE_KVM_IRQ_ROUTING config HAVE_KVM_DIRTY_RING bool +# Only strongly ordered architectures can select this, as it doesn't +# put any explicit constraint on userspace ordering. They can also +# select the _ACQ_REL version. 
+config HAVE_KVM_DIRTY_RING_TSO + bool + select HAVE_KVM_DIRTY_RING + depends on X86 + +# Weakly ordered architectures can only select this, advertising +# to userspace the additional ordering requirements. +config HAVE_KVM_DIRTY_RING_ACQ_REL + bool + select HAVE_KVM_DIRTY_RING + config HAVE_KVM_EVENTFD bool select EVENTFD diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 584a5bab3af39..5b064dbadaf42 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4475,7 +4475,13 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_NR_MEMSLOTS: return KVM_USER_MEM_SLOTS; case KVM_CAP_DIRTY_LOG_RING: -#ifdef CONFIG_HAVE_KVM_DIRTY_RING +#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO + return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); +#else + return 0; +#endif + case KVM_CAP_DIRTY_LOG_RING_ACQ_REL: +#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); #else return 0; @@ -4580,6 +4586,7 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, return 0; } case KVM_CAP_DIRTY_LOG_RING: + case KVM_CAP_DIRTY_LOG_RING_ACQ_REL: return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]); default: return kvm_vm_ioctl_enable_cap(kvm, cap); -- GitLab From fc0693d4e5afe3c110503c3afa9f60600f9e964b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Sep 2022 15:51:17 +0100 Subject: [PATCH 0764/2223] KVM: x86: Select CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL Since x86 is TSO (give or take), allow it to advertise the new ACQ_REL version of the dirty ring capability. No other change is required for it. 
Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20220926145120.27974-4-maz@kernel.org --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 876748b236ffe..67be7f217e37b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -29,6 +29,7 @@ config KVM select HAVE_KVM_PFNCACHE select HAVE_KVM_IRQFD select HAVE_KVM_DIRTY_RING_TSO + select HAVE_KVM_DIRTY_RING_ACQ_REL select IRQ_BYPASS_MANAGER select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING -- GitLab From 671c8c7f9f2349d8b2176ad810f1406794011f63 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Sep 2022 15:51:18 +0100 Subject: [PATCH 0765/2223] KVM: Document weakly ordered architecture requirements for dirty ring Now that the kernel can expose to userspace that its dirty ring management relies on explicit ordering, document these new requirements for VMMs to do the right thing. Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20220926145120.27974-5-maz@kernel.org --- Documentation/virt/kvm/api.rst | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index abd7c32126ce0..32427ea160dfa 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8019,8 +8019,8 @@ guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf (0x40000001). Otherwise, a guest may use the paravirtual features regardless of what has actually been exposed through the CPUID leaf. -8.29 KVM_CAP_DIRTY_LOG_RING ---------------------------- +8.29 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ACQ_REL +---------------------------------------------------------- :Architectures: x86 :Parameters: args[0] - size of the dirty log ring @@ -8078,6 +8078,11 @@ on to the next GFN. 
The userspace should continue to do this until the flags of a GFN have the DIRTY bit cleared, meaning that it has harvested all the dirty GFNs that were available. +Note that on weakly ordered architectures, userspace accesses to the +ring buffer (and more specifically the 'flags' field) must be ordered, +using load-acquire/store-release accessors when available, or any +other memory barrier that will ensure this ordering. + It's not necessary for userspace to harvest the all dirty GFNs at once. However it must collect the dirty GFNs in sequence, i.e., the userspace program cannot skip one dirty GFN to collect the one next to it. @@ -8106,6 +8111,14 @@ KVM_CAP_DIRTY_LOG_RING with an acceptable dirty ring size, the virtual machine will switch to ring-buffer dirty page tracking and further KVM_GET_DIRTY_LOG or KVM_CLEAR_DIRTY_LOG ioctls will fail. +NOTE: KVM_CAP_DIRTY_LOG_RING_ACQ_REL is the only capability that +should be exposed by weakly ordered architecture, in order to indicate +the additional memory ordering requirements imposed on userspace when +reading the state of an entry and mutating it from DIRTY to HARVESTED. +Architecture with TSO-like ordering (such as x86) are allowed to +expose both KVM_CAP_DIRTY_LOG_RING and KVM_CAP_DIRTY_LOG_RING_ACQ_REL +to userspace. + 8.30 KVM_CAP_XEN_HVM -------------------- -- GitLab From 4eb6486cb43c93382c27a2659ba978c660e98498 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Sep 2022 15:51:19 +0100 Subject: [PATCH 0766/2223] KVM: selftests: dirty-log: Upgrade flag accesses to acquire/release semantics In order to preserve ordering, make sure that the flag accesses in the dirty log are done using acquire/release accessors. 
Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20220926145120.27974-6-maz@kernel.org --- tools/testing/selftests/kvm/dirty_log_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 9c883c94d478f..53627add8a7cb 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "kvm_util.h" #include "test_util.h" @@ -279,12 +280,12 @@ static void dirty_ring_create_vm_done(struct kvm_vm *vm) static inline bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn) { - return gfn->flags == KVM_DIRTY_GFN_F_DIRTY; + return smp_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY; } static inline void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn) { - gfn->flags = KVM_DIRTY_GFN_F_RESET; + smp_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET); } static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns, -- GitLab From 4b3402f1f4d9860301d6d5cd7aff3b67f678d577 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Sep 2022 15:51:20 +0100 Subject: [PATCH 0767/2223] KVM: selftests: dirty-log: Use KVM_CAP_DIRTY_LOG_RING_ACQ_REL if available Pick KVM_CAP_DIRTY_LOG_RING_ACQ_REL if exposed by the kernel. 
Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20220926145120.27974-7-maz@kernel.org --- tools/testing/selftests/kvm/dirty_log_test.c | 3 ++- tools/testing/selftests/kvm/lib/kvm_util.c | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 53627add8a7cb..b5234d6efbe15 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -265,7 +265,8 @@ static void default_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) static bool dirty_ring_supported(void) { - return kvm_has_cap(KVM_CAP_DIRTY_LOG_RING); + return (kvm_has_cap(KVM_CAP_DIRTY_LOG_RING) || + kvm_has_cap(KVM_CAP_DIRTY_LOG_RING_ACQ_REL)); } static void dirty_ring_create_vm_done(struct kvm_vm *vm) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 9889fe0d8919c..411a4c0bc81c8 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -82,7 +82,10 @@ unsigned int kvm_check_cap(long cap) void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size) { - vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size); + if (vm_check_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL)) + vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL, ring_size); + else + vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size); vm->dirty_ring_size = ring_size; } -- GitLab From d9fc272bfd76acadf0537901549d07a1b81dbeed Mon Sep 17 00:00:00 2001 From: Apurva Nandan Date: Sat, 20 Aug 2022 00:37:27 +0530 Subject: [PATCH 0768/2223] dt-bindings: irqchip: ti,sci-inta: Fix warning for missing #interrupt-cells ti,sci-inta nodes, or else we will have following warning when building device tree files with W=2 warning level. 
arch/arm64/boot/dts/ti/k3-j721e-main.dtsi:147.51-156.5: Warning (interrupt_provider): /bus@100000/main-navss/interrupt-controller@33d00000: Missing #interrupt-cells in interrupt provider And further, #interrupt-cells is required to be in yaml bindings as well to prevent following schema warnings: k3-j721e-common-proc-board.dtb: interrupt-controller@33d00000: Unevaluated properties are not allowed ('#interrupt-cells' was unexpected) >From schema: linux/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml Add #interrupt-cells property in ti,sci-inta.yaml Signed-off-by: Apurva Nandan Acked-by: Rob Herring Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220819190729.32358-2-a-nandan@ti.com --- .../devicetree/bindings/interrupt-controller/ti,sci-inta.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml b/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml index 88c46e61732e1..1151518859bd0 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml @@ -59,6 +59,9 @@ properties: interrupt-controller: true + '#interrupt-cells': + const: 0 + msi-controller: true ti,interrupt-ranges: -- GitLab From daa0b6d0187599a574cb5cb392b259bda3dcf979 Mon Sep 17 00:00:00 2001 From: Apurva Nandan Date: Sat, 20 Aug 2022 00:37:29 +0530 Subject: [PATCH 0769/2223] dt-bindings: interrupt-controller: ti,sci-intr: Fix missing reg property in the binding Fix the following warning in dtbs_check interrupt-controller@a00000: Unevaluated properties are not allowed ('reg' was unexpected) Add the reg property in the schema. 
Signed-off-by: Apurva Nandan Acked-by: Rob Herring Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220819190729.32358-4-a-nandan@ti.com --- .../devicetree/bindings/interrupt-controller/ti,sci-intr.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/interrupt-controller/ti,sci-intr.yaml b/Documentation/devicetree/bindings/interrupt-controller/ti,sci-intr.yaml index e12aee42b1268..c99cc7323c711 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/ti,sci-intr.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/ti,sci-intr.yaml @@ -58,6 +58,9 @@ properties: 1 = If intr supports edge triggered interrupts. 4 = If intr supports level triggered interrupts. + reg: + maxItems: 1 + interrupt-controller: true '#interrupt-cells': -- GitLab From 4d96829774b7bd70ed81b5e2830afb9d97b9fea2 Mon Sep 17 00:00:00 2001 From: Zhiyuan Dai Date: Wed, 28 Sep 2022 10:39:27 +0800 Subject: [PATCH 0770/2223] irqchip/gic-v3: Fix typo in comment Fix typo in comment (cleanip/cleanup). 
Signed-off-by: Zhiyuan Dai [maz: commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1664332767-6909-1-git-send-email-daizhiyuan@phytium.com.cn --- drivers/irqchip/irq-gic-v3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 262658fd5f9e5..34d58567b78d1 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -978,7 +978,7 @@ static int __gic_update_rdist_properties(struct redist_region *region, u64 typer = gic_read_typer(ptr + GICR_TYPER); u32 ctlr = readl_relaxed(ptr + GICR_CTLR); - /* Boot-time cleanip */ + /* Boot-time cleanup */ if ((typer & GICR_TYPER_VLPIS) && (typer & GICR_TYPER_RVPEID)) { u64 val; -- GitLab From 872f3a4e90ef2a0245f9143558d9f45bfc352194 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 28 Sep 2022 14:33:36 +0200 Subject: [PATCH 0771/2223] dt-bindings: irqchip: renesas,irqc: Add r8a779g0 support Document support for the Interrupt Controller for External Devices (INT-EX) in the Renesas R-Car V4H (R8A779G0) SoC. 
Signed-off-by: Geert Uytterhoeven Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/4fdb6ff47f62814aab3b06efd1d4c2d7de83b109.1664368373.git.geert+renesas@glider.be --- .../devicetree/bindings/interrupt-controller/renesas,irqc.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.yaml b/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.yaml index 620f01775e429..62fd47c88275d 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.yaml @@ -37,6 +37,7 @@ properties: - renesas,intc-ex-r8a77990 # R-Car E3 - renesas,intc-ex-r8a77995 # R-Car D3 - renesas,intc-ex-r8a779a0 # R-Car V3U + - renesas,intc-ex-r8a779g0 # R-Car V4H - const: renesas,irqc '#interrupt-cells': -- GitLab From 70afdab904d2d1e68bffe75fe08e7e48e0b0ff8e Mon Sep 17 00:00:00 2001 From: Frank Li Date: Thu, 22 Sep 2022 11:12:43 -0500 Subject: [PATCH 0772/2223] irqchip: Add IMX MU MSI controller driver The MU block found in a number of Freescale/NXP SoCs supports generating IRQs by writing data to a register. This enables the MU block to be used as a MSI controller, by leveraging the platform-MSI API. Signed-off-by: Frank Li [maz: dropped pointless dma-iommu.h and of_pci.h includes] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220922161246.20586-4-Frank.Li@nxp.com --- drivers/irqchip/Kconfig | 14 + drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-imx-mu-msi.c | 453 +++++++++++++++++++++++++++++++ 3 files changed, 468 insertions(+) create mode 100644 drivers/irqchip/irq-imx-mu-msi.c diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 66b9fa408bf24..a213465f51187 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -481,6 +481,20 @@ config IMX_INTMUX help Support for the i.MX INTMUX interrupt multiplexer. 
+config IMX_MU_MSI + tristate "i.MX MU used as MSI controller" + depends on OF && HAS_IOMEM + default m if ARCH_MXC + select IRQ_DOMAIN + select IRQ_DOMAIN_HIERARCHY + select GENERIC_MSI_IRQ_DOMAIN + help + Provide a driver for the MU block used as a CPU-to-CPU MSI + controller. This requires a specially crafted DT to make use + of this driver. + + If unsure, say N + config LS1X_IRQ bool "Loongson-1 Interrupt Controller" depends on MACH_LOONGSON32 diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index b6acbca2248bc..87b49a10962c7 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -99,6 +99,7 @@ obj-$(CONFIG_RISCV_INTC) += irq-riscv-intc.o obj-$(CONFIG_SIFIVE_PLIC) += irq-sifive-plic.o obj-$(CONFIG_IMX_IRQSTEER) += irq-imx-irqsteer.o obj-$(CONFIG_IMX_INTMUX) += irq-imx-intmux.o +obj-$(CONFIG_IMX_MU_MSI) += irq-imx-mu-msi.o obj-$(CONFIG_MADERA_IRQ) += irq-madera.o obj-$(CONFIG_LS1X_IRQ) += irq-ls1x.o obj-$(CONFIG_TI_SCI_INTR_IRQCHIP) += irq-ti-sci-intr.o diff --git a/drivers/irqchip/irq-imx-mu-msi.c b/drivers/irqchip/irq-imx-mu-msi.c new file mode 100644 index 0000000000000..b62139dc36e82 --- /dev/null +++ b/drivers/irqchip/irq-imx-mu-msi.c @@ -0,0 +1,453 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Freescale MU used as MSI controller + * + * Copyright (c) 2018 Pengutronix, Oleksij Rempel + * Copyright 2022 NXP + * Frank Li + * Peng Fan + * + * Based on drivers/mailbox/imx-mailbox.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IMX_MU_CHANS 4 + +enum imx_mu_xcr { + IMX_MU_GIER, + IMX_MU_GCR, + IMX_MU_TCR, + IMX_MU_RCR, + IMX_MU_xCR_MAX, +}; + +enum imx_mu_xsr { + IMX_MU_SR, + IMX_MU_GSR, + IMX_MU_TSR, + IMX_MU_RSR, + IMX_MU_xSR_MAX +}; + +enum imx_mu_type { + IMX_MU_V2 = BIT(1), +}; + +/* Receive Interrupt Enable */ +#define IMX_MU_xCR_RIEn(data, x) ((data->cfg->type) & IMX_MU_V2 ? 
BIT(x) : BIT(24 + (3 - (x)))) +#define IMX_MU_xSR_RFn(data, x) ((data->cfg->type) & IMX_MU_V2 ? BIT(x) : BIT(24 + (3 - (x)))) + +struct imx_mu_dcfg { + enum imx_mu_type type; + u32 xTR; /* Transmit Register0 */ + u32 xRR; /* Receive Register0 */ + u32 xSR[IMX_MU_xSR_MAX]; /* Status Registers */ + u32 xCR[IMX_MU_xCR_MAX]; /* Control Registers */ +}; + +struct imx_mu_msi { + raw_spinlock_t lock; + struct irq_domain *msi_domain; + void __iomem *regs; + phys_addr_t msiir_addr; + const struct imx_mu_dcfg *cfg; + unsigned long used; + struct clk *clk; +}; + +static void imx_mu_write(struct imx_mu_msi *msi_data, u32 val, u32 offs) +{ + iowrite32(val, msi_data->regs + offs); +} + +static u32 imx_mu_read(struct imx_mu_msi *msi_data, u32 offs) +{ + return ioread32(msi_data->regs + offs); +} + +static u32 imx_mu_xcr_rmw(struct imx_mu_msi *msi_data, enum imx_mu_xcr type, u32 set, u32 clr) +{ + unsigned long flags; + u32 val; + + raw_spin_lock_irqsave(&msi_data->lock, flags); + val = imx_mu_read(msi_data, msi_data->cfg->xCR[type]); + val &= ~clr; + val |= set; + imx_mu_write(msi_data, val, msi_data->cfg->xCR[type]); + raw_spin_unlock_irqrestore(&msi_data->lock, flags); + + return val; +} + +static void imx_mu_msi_parent_mask_irq(struct irq_data *data) +{ + struct imx_mu_msi *msi_data = irq_data_get_irq_chip_data(data); + + imx_mu_xcr_rmw(msi_data, IMX_MU_RCR, 0, IMX_MU_xCR_RIEn(msi_data, data->hwirq)); +} + +static void imx_mu_msi_parent_unmask_irq(struct irq_data *data) +{ + struct imx_mu_msi *msi_data = irq_data_get_irq_chip_data(data); + + imx_mu_xcr_rmw(msi_data, IMX_MU_RCR, IMX_MU_xCR_RIEn(msi_data, data->hwirq), 0); +} + +static void imx_mu_msi_parent_ack_irq(struct irq_data *data) +{ + struct imx_mu_msi *msi_data = irq_data_get_irq_chip_data(data); + + imx_mu_read(msi_data, msi_data->cfg->xRR + data->hwirq * 4); +} + +static struct irq_chip imx_mu_msi_irq_chip = { + .name = "MU-MSI", + .irq_ack = irq_chip_ack_parent, +}; + +static struct msi_domain_ops imx_mu_msi_irq_ops 
= { +}; + +static struct msi_domain_info imx_mu_msi_domain_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS), + .ops = &imx_mu_msi_irq_ops, + .chip = &imx_mu_msi_irq_chip, +}; + +static void imx_mu_msi_parent_compose_msg(struct irq_data *data, + struct msi_msg *msg) +{ + struct imx_mu_msi *msi_data = irq_data_get_irq_chip_data(data); + u64 addr = msi_data->msiir_addr + 4 * data->hwirq; + + msg->address_hi = upper_32_bits(addr); + msg->address_lo = lower_32_bits(addr); + msg->data = data->hwirq; +} + +static int imx_mu_msi_parent_set_affinity(struct irq_data *irq_data, + const struct cpumask *mask, bool force) +{ + return -EINVAL; +} + +static struct irq_chip imx_mu_msi_parent_chip = { + .name = "MU", + .irq_mask = imx_mu_msi_parent_mask_irq, + .irq_unmask = imx_mu_msi_parent_unmask_irq, + .irq_ack = imx_mu_msi_parent_ack_irq, + .irq_compose_msi_msg = imx_mu_msi_parent_compose_msg, + .irq_set_affinity = imx_mu_msi_parent_set_affinity, +}; + +static int imx_mu_msi_domain_irq_alloc(struct irq_domain *domain, + unsigned int virq, + unsigned int nr_irqs, + void *args) +{ + struct imx_mu_msi *msi_data = domain->host_data; + unsigned long flags; + int pos, err = 0; + + WARN_ON(nr_irqs != 1); + + raw_spin_lock_irqsave(&msi_data->lock, flags); + pos = find_first_zero_bit(&msi_data->used, IMX_MU_CHANS); + if (pos < IMX_MU_CHANS) + __set_bit(pos, &msi_data->used); + else + err = -ENOSPC; + raw_spin_unlock_irqrestore(&msi_data->lock, flags); + + if (err) + return err; + + irq_domain_set_info(domain, virq, pos, + &imx_mu_msi_parent_chip, msi_data, + handle_edge_irq, NULL, NULL); + return 0; +} + +static void imx_mu_msi_domain_irq_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct imx_mu_msi *msi_data = irq_data_get_irq_chip_data(d); + unsigned long flags; + + raw_spin_lock_irqsave(&msi_data->lock, flags); + __clear_bit(d->hwirq, &msi_data->used); + 
raw_spin_unlock_irqrestore(&msi_data->lock, flags); +} + +static const struct irq_domain_ops imx_mu_msi_domain_ops = { + .alloc = imx_mu_msi_domain_irq_alloc, + .free = imx_mu_msi_domain_irq_free, +}; + +static void imx_mu_msi_irq_handler(struct irq_desc *desc) +{ + struct imx_mu_msi *msi_data = irq_desc_get_handler_data(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); + u32 status; + int i; + + status = imx_mu_read(msi_data, msi_data->cfg->xSR[IMX_MU_RSR]); + + chained_irq_enter(chip, desc); + for (i = 0; i < IMX_MU_CHANS; i++) { + if (status & IMX_MU_xSR_RFn(msi_data, i)) + generic_handle_domain_irq(msi_data->msi_domain, i); + } + chained_irq_exit(chip, desc); +} + +static int imx_mu_msi_domains_init(struct imx_mu_msi *msi_data, struct device *dev) +{ + struct fwnode_handle *fwnodes = dev_fwnode(dev); + struct irq_domain *parent; + + /* Initialize MSI domain parent */ + parent = irq_domain_create_linear(fwnodes, + IMX_MU_CHANS, + &imx_mu_msi_domain_ops, + msi_data); + if (!parent) { + dev_err(dev, "failed to create IRQ domain\n"); + return -ENOMEM; + } + + irq_domain_update_bus_token(parent, DOMAIN_BUS_NEXUS); + + msi_data->msi_domain = platform_msi_create_irq_domain(fwnodes, + &imx_mu_msi_domain_info, + parent); + + if (!msi_data->msi_domain) { + dev_err(dev, "failed to create MSI domain\n"); + irq_domain_remove(parent); + return -ENOMEM; + } + + irq_domain_set_pm_device(msi_data->msi_domain, dev); + + return 0; +} + +/* Register offset of different version MU IP */ +static const struct imx_mu_dcfg imx_mu_cfg_imx6sx = { + .type = 0, + .xTR = 0x0, + .xRR = 0x10, + .xSR = { + [IMX_MU_SR] = 0x20, + [IMX_MU_GSR] = 0x20, + [IMX_MU_TSR] = 0x20, + [IMX_MU_RSR] = 0x20, + }, + .xCR = { + [IMX_MU_GIER] = 0x24, + [IMX_MU_GCR] = 0x24, + [IMX_MU_TCR] = 0x24, + [IMX_MU_RCR] = 0x24, + }, +}; + +static const struct imx_mu_dcfg imx_mu_cfg_imx7ulp = { + .type = 0, + .xTR = 0x20, + .xRR = 0x40, + .xSR = { + [IMX_MU_SR] = 0x60, + [IMX_MU_GSR] = 0x60, + [IMX_MU_TSR] = 0x60, 
+ [IMX_MU_RSR] = 0x60, + }, + .xCR = { + [IMX_MU_GIER] = 0x64, + [IMX_MU_GCR] = 0x64, + [IMX_MU_TCR] = 0x64, + [IMX_MU_RCR] = 0x64, + }, +}; + +static const struct imx_mu_dcfg imx_mu_cfg_imx8ulp = { + .type = IMX_MU_V2, + .xTR = 0x200, + .xRR = 0x280, + .xSR = { + [IMX_MU_SR] = 0xC, + [IMX_MU_GSR] = 0x118, + [IMX_MU_GSR] = 0x124, + [IMX_MU_RSR] = 0x12C, + }, + .xCR = { + [IMX_MU_GIER] = 0x110, + [IMX_MU_GCR] = 0x114, + [IMX_MU_TCR] = 0x120, + [IMX_MU_RCR] = 0x128 + }, +}; + +static int __init imx_mu_of_init(struct device_node *dn, + struct device_node *parent, + const struct imx_mu_dcfg *cfg) +{ + struct platform_device *pdev = of_find_device_by_node(dn); + struct device_link *pd_link_a; + struct device_link *pd_link_b; + struct imx_mu_msi *msi_data; + struct resource *res; + struct device *pd_a; + struct device *pd_b; + struct device *dev; + int ret; + int irq; + + dev = &pdev->dev; + + msi_data = devm_kzalloc(&pdev->dev, sizeof(*msi_data), GFP_KERNEL); + if (!msi_data) + return -ENOMEM; + + msi_data->cfg = cfg; + + msi_data->regs = devm_platform_ioremap_resource_byname(pdev, "processor-a-side"); + if (IS_ERR(msi_data->regs)) { + dev_err(&pdev->dev, "failed to initialize 'regs'\n"); + return PTR_ERR(msi_data->regs); + } + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "processor-b-side"); + if (!res) + return -EIO; + + msi_data->msiir_addr = res->start + msi_data->cfg->xTR; + + irq = platform_get_irq(pdev, 0); + if (irq <= 0) + return -ENODEV; + + platform_set_drvdata(pdev, msi_data); + + msi_data->clk = devm_clk_get(dev, NULL); + if (IS_ERR(msi_data->clk)) + return PTR_ERR(msi_data->clk); + + pd_a = dev_pm_domain_attach_by_name(dev, "processor-a-side"); + if (IS_ERR(pd_a)) + return PTR_ERR(pd_a); + + pd_b = dev_pm_domain_attach_by_name(dev, "processor-b-side"); + if (IS_ERR(pd_b)) + return PTR_ERR(pd_b); + + pd_link_a = device_link_add(dev, pd_a, + DL_FLAG_STATELESS | + DL_FLAG_PM_RUNTIME | + DL_FLAG_RPM_ACTIVE); + + if (!pd_link_a) { + dev_err(dev, 
"Failed to add device_link to mu a.\n"); + goto err_pd_a; + } + + pd_link_b = device_link_add(dev, pd_b, + DL_FLAG_STATELESS | + DL_FLAG_PM_RUNTIME | + DL_FLAG_RPM_ACTIVE); + + + if (!pd_link_b) { + dev_err(dev, "Failed to add device_link to mu a.\n"); + goto err_pd_b; + } + + ret = imx_mu_msi_domains_init(msi_data, dev); + if (ret) + goto err_dm_init; + + pm_runtime_enable(dev); + + irq_set_chained_handler_and_data(irq, + imx_mu_msi_irq_handler, + msi_data); + + return 0; + +err_dm_init: + device_link_remove(dev, pd_b); +err_pd_b: + device_link_remove(dev, pd_a); +err_pd_a: + return -EINVAL; +} + +static int __maybe_unused imx_mu_runtime_suspend(struct device *dev) +{ + struct imx_mu_msi *priv = dev_get_drvdata(dev); + + clk_disable_unprepare(priv->clk); + + return 0; +} + +static int __maybe_unused imx_mu_runtime_resume(struct device *dev) +{ + struct imx_mu_msi *priv = dev_get_drvdata(dev); + int ret; + + ret = clk_prepare_enable(priv->clk); + if (ret) + dev_err(dev, "failed to enable clock\n"); + + return ret; +} + +static const struct dev_pm_ops imx_mu_pm_ops = { + SET_RUNTIME_PM_OPS(imx_mu_runtime_suspend, + imx_mu_runtime_resume, NULL) +}; + +static int __init imx_mu_imx7ulp_of_init(struct device_node *dn, + struct device_node *parent) +{ + return imx_mu_of_init(dn, parent, &imx_mu_cfg_imx7ulp); +} + +static int __init imx_mu_imx6sx_of_init(struct device_node *dn, + struct device_node *parent) +{ + return imx_mu_of_init(dn, parent, &imx_mu_cfg_imx6sx); +} + +static int __init imx_mu_imx8ulp_of_init(struct device_node *dn, + struct device_node *parent) +{ + return imx_mu_of_init(dn, parent, &imx_mu_cfg_imx8ulp); +} + +IRQCHIP_PLATFORM_DRIVER_BEGIN(imx_mu_msi) +IRQCHIP_MATCH("fsl,imx7ulp-mu-msi", imx_mu_imx7ulp_of_init) +IRQCHIP_MATCH("fsl,imx6sx-mu-msi", imx_mu_imx6sx_of_init) +IRQCHIP_MATCH("fsl,imx8ulp-mu-msi", imx_mu_imx8ulp_of_init) +IRQCHIP_PLATFORM_DRIVER_END(imx_mu_msi, .pm = &imx_mu_pm_ops) + + +MODULE_AUTHOR("Frank Li "); 
+MODULE_DESCRIPTION("Freescale MU MSI controller driver"); +MODULE_LICENSE("GPL"); -- GitLab From 7c025238b47a55c81c61dfe85a200ab82e6a6ece Mon Sep 17 00:00:00 2001 From: Frank Li Date: Thu, 22 Sep 2022 11:12:44 -0500 Subject: [PATCH 0773/2223] dt-bindings: irqchip: Describe the IMX MU block as a MSI controller I.MX MU supports generating IRQs by writing to a register. Describe its use as a MSI controller so that other blocks (such as a PCI EP) can use it directly. Reviewed-by: Rob Herring Signed-off-by: Frank Li [maz: commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220922161246.20586-5-Frank.Li@nxp.com --- .../interrupt-controller/fsl,mu-msi.yaml | 99 +++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 Documentation/devicetree/bindings/interrupt-controller/fsl,mu-msi.yaml diff --git a/Documentation/devicetree/bindings/interrupt-controller/fsl,mu-msi.yaml b/Documentation/devicetree/bindings/interrupt-controller/fsl,mu-msi.yaml new file mode 100644 index 0000000000000..799ae5c3e32ae --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/fsl,mu-msi.yaml @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/interrupt-controller/fsl,mu-msi.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Freescale/NXP i.MX Messaging Unit (MU) work as msi controller + +maintainers: + - Frank Li + +description: | + The Messaging Unit module enables two processors within the SoC to + communicate and coordinate by passing messages (e.g. data, status + and control) through the MU interface. The MU also provides the ability + for one processor (A side) to signal the other processor (B side) using + interrupts. + + Because the MU manages the messaging between processors, the MU uses + different clocks (from each side of the different peripheral buses). 
+ Therefore, the MU must synchronize the accesses from one side to the + other. The MU accomplishes synchronization using two sets of matching + registers (Processor A-side, Processor B-side). + + MU can work as msi interrupt controller to do doorbell + +allOf: + - $ref: /schemas/interrupt-controller/msi-controller.yaml# + +properties: + compatible: + enum: + - fsl,imx6sx-mu-msi + - fsl,imx7ulp-mu-msi + - fsl,imx8ulp-mu-msi + - fsl,imx8ulp-mu-msi-s4 + + reg: + items: + - description: a side register base address + - description: b side register base address + + reg-names: + items: + - const: processor-a-side + - const: processor-b-side + + interrupts: + description: a side interrupt number. + maxItems: 1 + + clocks: + maxItems: 1 + + power-domains: + items: + - description: a side power domain + - description: b side power domain + + power-domain-names: + items: + - const: processor-a-side + - const: processor-b-side + + interrupt-controller: true + + msi-controller: true + + "#msi-cells": + const: 0 + +required: + - compatible + - reg + - interrupts + - interrupt-controller + - msi-controller + - "#msi-cells" + +additionalProperties: false + +examples: + - | + #include + #include + + msi-controller@5d270000 { + compatible = "fsl,imx6sx-mu-msi"; + msi-controller; + #msi-cells = <0>; + interrupt-controller; + reg = <0x5d270000 0x10000>, /* A side */ + <0x5d300000 0x10000>; /* B side */ + reg-names = "processor-a-side", "processor-b-side"; + interrupts = ; + power-domains = <&pd IMX_SC_R_MU_12A>, + <&pd IMX_SC_R_MU_12B>; + power-domain-names = "processor-a-side", "processor-b-side"; + }; -- GitLab From 3e347969a5776947a115649dae740a9ed47473f5 Mon Sep 17 00:00:00 2001 From: Sajid Dalvi Date: Wed, 21 Sep 2022 21:27:35 +0000 Subject: [PATCH 0774/2223] PCI/PM: Reduce D3hot delay with usleep_range() PCIe r6.0, sec 5.9, requires a 10ms delay between programming a device to change to or from D3hot and the time the device is next accessed (unless Readiness Notifications are 
used). The 10ms value (PCI_PM_D3HOT_WAIT) doesn't appear directly here because some chipsets require 120ms for devices *below* them (pci_pm_d3hot_delay) and some devices require more or less than 10ms (dev->d3hot_delay). But msleep(10) typically waits about *20*ms, which is more than we need. Switch to usleep_range() to improve the delay accuracy. Based on a commit from Sajid in the Pixel 6 kernel tree [1]. On a Pixel 6, the 10ms delay for the Exynos PCIe device delayed for an average of 19ms. Switching to usleep_range() decreased the resume time by about 9ms. [1] https://android.googlesource.com/kernel/gs/+/18a8cad68d8e6d50f339a716a18295e6d987cee3 [bhelgaas commit log, add timers-howto.rst link] Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/timers/timers-howto.rst?id=v5.19#n73 Link: https://lore.kernel.org/r/20220921212735.2131588-1-willmcvicker@google.com Signed-off-by: Sajid Dalvi Signed-off-by: Will McVicker Signed-off-by: Bjorn Helgaas Reviewed-by: Matthias Kaehlcke --- drivers/pci/pci.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 107afa0a5b03a..92c6f7e5ca2e5 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -66,13 +66,15 @@ struct pci_pme_device { static void pci_dev_d3_sleep(struct pci_dev *dev) { - unsigned int delay = dev->d3hot_delay; - - if (delay < pci_pm_d3hot_delay) - delay = pci_pm_d3hot_delay; - - if (delay) - msleep(delay); + unsigned int delay_ms = max(dev->d3hot_delay, pci_pm_d3hot_delay); + unsigned int upper; + + if (delay_ms) { + /* Use a 20% upper bound, 1ms minimum */ + upper = max(DIV_ROUND_CLOSEST(delay_ms, 5), 1U); + usleep_range(delay_ms * USEC_PER_MSEC, + (delay_ms + upper) * USEC_PER_MSEC); + } } bool pci_reset_supported(struct pci_dev *dev) -- GitLab From 2d09ac951b7750780ecb3de3ccb642dffd7ef62b Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 29 Sep 2022 21:11:36 +0200 Subject: [PATCH 
0775/2223] input: drop empty comment blocks Commit 1a59d1b8e05e ("treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 156") has left some empty comment blocks. Remove them to save a few lines of code. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/26a2b905b259bfffaf2de5b26f2007b8606970ed.1664478665.git.christophe.jaillet@wanadoo.fr Signed-off-by: Dmitry Torokhov --- drivers/input/ff-core.c | 3 --- drivers/input/ff-memless.c | 3 --- drivers/input/gameport/emu10k1-gp.c | 3 --- drivers/input/gameport/lightning.c | 3 --- drivers/input/gameport/ns558.c | 3 --- drivers/input/joystick/a3d.c | 3 --- drivers/input/joystick/adi.c | 3 --- drivers/input/joystick/amijoy.c | 3 --- drivers/input/joystick/analog.c | 3 --- drivers/input/joystick/cobra.c | 3 --- drivers/input/joystick/db9.c | 3 --- drivers/input/joystick/gamecon.c | 3 --- drivers/input/joystick/gf2k.c | 3 --- drivers/input/joystick/grip.c | 3 --- drivers/input/joystick/guillemot.c | 3 --- drivers/input/joystick/interact.c | 3 --- drivers/input/joystick/joydump.c | 3 --- drivers/input/joystick/magellan.c | 3 --- drivers/input/joystick/sidewinder.c | 3 --- drivers/input/joystick/spaceball.c | 3 --- drivers/input/joystick/spaceorb.c | 3 --- drivers/input/joystick/stinger.c | 3 --- drivers/input/joystick/tmdc.c | 3 --- drivers/input/joystick/turbografx.c | 3 --- drivers/input/joystick/twidjoy.c | 3 --- drivers/input/joystick/warrior.c | 3 --- drivers/input/joystick/zhenhua.c | 3 --- drivers/input/keyboard/amikbd.c | 3 --- drivers/input/keyboard/atakbd.c | 3 --- drivers/input/keyboard/lkkbd.c | 3 --- drivers/input/keyboard/newtonkbd.c | 3 --- drivers/input/keyboard/stowaway.c | 3 --- drivers/input/keyboard/sunkbd.c | 3 --- drivers/input/keyboard/xtkbd.c | 3 --- drivers/input/mouse/inport.c | 3 --- drivers/input/mouse/logibm.c | 3 --- drivers/input/mouse/pc110pad.c | 3 --- drivers/input/mouse/sermouse.c | 3 --- drivers/input/mouse/vsxxxaa.c | 3 --- drivers/input/serio/ct82c710.c | 3 --- 
drivers/input/serio/q40kbd.c | 3 --- drivers/input/serio/rpckbd.c | 3 --- drivers/input/serio/serio.c | 3 --- drivers/input/tablet/acecad.c | 3 --- drivers/input/tablet/hanwang.c | 3 --- drivers/input/touchscreen/gunze.c | 3 --- 46 files changed, 138 deletions(-) diff --git a/drivers/input/ff-core.c b/drivers/input/ff-core.c index fa8d1a4660142..16231fe080b00 100644 --- a/drivers/input/ff-core.c +++ b/drivers/input/ff-core.c @@ -6,9 +6,6 @@ * Copyright (c) 2006 Dmitry Torokhov */ -/* - */ - /* #define DEBUG */ #include diff --git a/drivers/input/ff-memless.c b/drivers/input/ff-memless.c index 8229a90069176..c321cdabd2141 100644 --- a/drivers/input/ff-memless.c +++ b/drivers/input/ff-memless.c @@ -6,9 +6,6 @@ * Copyright (c) 2006 Dmitry Torokhov */ -/* - */ - /* #define DEBUG */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/drivers/input/gameport/emu10k1-gp.c b/drivers/input/gameport/emu10k1-gp.c index 11bbd1edfdb4a..76ce41e58df0c 100644 --- a/drivers/input/gameport/emu10k1-gp.c +++ b/drivers/input/gameport/emu10k1-gp.c @@ -7,9 +7,6 @@ * EMU10k1 - SB Live / Audigy - gameport driver for Linux */ -/* - */ - #include #include diff --git a/drivers/input/gameport/lightning.c b/drivers/input/gameport/lightning.c index 87eeb4b5b5b57..2ce717b25a84f 100644 --- a/drivers/input/gameport/lightning.c +++ b/drivers/input/gameport/lightning.c @@ -7,9 +7,6 @@ * PDPI Lightning 4 gamecard driver for Linux. 
*/ -/* - */ - #include #include #include diff --git a/drivers/input/gameport/ns558.c b/drivers/input/gameport/ns558.c index 2f80b7f1b7362..91a8cd346e9b7 100644 --- a/drivers/input/gameport/ns558.c +++ b/drivers/input/gameport/ns558.c @@ -8,9 +8,6 @@ * NS558 based standard IBM game port driver for Linux */ -/* - */ - #include #include diff --git a/drivers/input/joystick/a3d.c b/drivers/input/joystick/a3d.c index 68475fad177c4..fd1827baf27cd 100644 --- a/drivers/input/joystick/a3d.c +++ b/drivers/input/joystick/a3d.c @@ -7,9 +7,6 @@ * FP-Gaming Assassin 3D joystick driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/adi.c b/drivers/input/joystick/adi.c index e10d57bf1180c..f1a720be458b7 100644 --- a/drivers/input/joystick/adi.c +++ b/drivers/input/joystick/adi.c @@ -7,9 +7,6 @@ * Logitech ADI joystick family driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/amijoy.c b/drivers/input/joystick/amijoy.c index 12456a196dc73..3752dc2a20868 100644 --- a/drivers/input/joystick/amijoy.c +++ b/drivers/input/joystick/amijoy.c @@ -7,9 +7,6 @@ * Driver for Amiga joysticks for Linux/m68k */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c index 3088c5b829f07..0c9e172a98181 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -7,9 +7,6 @@ * Analog joystick and gamepad driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/cobra.c b/drivers/input/joystick/cobra.c index 41e1936a847bd..7ff78c9388bd3 100644 --- a/drivers/input/joystick/cobra.c +++ b/drivers/input/joystick/cobra.c @@ -7,9 +7,6 @@ * Creative Labs Blaster GamePad Cobra driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/db9.c b/drivers/input/joystick/db9.c index 434d265fa2e83..4fba28b1a1e75 100644 --- a/drivers/input/joystick/db9.c +++ 
b/drivers/input/joystick/db9.c @@ -10,9 +10,6 @@ * Atari, Amstrad, Commodore, Amiga, Sega, etc. joystick driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/gamecon.c b/drivers/input/joystick/gamecon.c index d37645e496ff1..41d5dac054481 100644 --- a/drivers/input/joystick/gamecon.c +++ b/drivers/input/joystick/gamecon.c @@ -11,9 +11,6 @@ * Raphael Assenat */ -/* - */ - #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/drivers/input/joystick/gf2k.c b/drivers/input/joystick/gf2k.c index 920feba967f6a..abefbd1484dfe 100644 --- a/drivers/input/joystick/gf2k.c +++ b/drivers/input/joystick/gf2k.c @@ -7,9 +7,6 @@ * Genius Flight 2000 joystick driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/grip.c b/drivers/input/joystick/grip.c index fe798bc879501..0e86b269a90ea 100644 --- a/drivers/input/joystick/grip.c +++ b/drivers/input/joystick/grip.c @@ -7,9 +7,6 @@ * Gravis/Kensington GrIP protocol joystick and gamepad driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/guillemot.c b/drivers/input/joystick/guillemot.c index 8eeacdb007c1d..205eb6f8b84d2 100644 --- a/drivers/input/joystick/guillemot.c +++ b/drivers/input/joystick/guillemot.c @@ -7,9 +7,6 @@ * Guillemot Digital Interface Protocol driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/interact.c b/drivers/input/joystick/interact.c index ca22d84e5c842..03a9f0829f7ed 100644 --- a/drivers/input/joystick/interact.c +++ b/drivers/input/joystick/interact.c @@ -10,9 +10,6 @@ * InterAct digital gamepad/joystick driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/joydump.c b/drivers/input/joystick/joydump.c index 70f63f9550e72..865652a7821da 100644 --- a/drivers/input/joystick/joydump.c +++ b/drivers/input/joystick/joydump.c @@ -8,9 +8,6 @@ * out of the joystick port into the syslog ... 
*/ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/magellan.c b/drivers/input/joystick/magellan.c index edb8e1982e260..017ef8c6170b7 100644 --- a/drivers/input/joystick/magellan.c +++ b/drivers/input/joystick/magellan.c @@ -7,9 +7,6 @@ * Magellan and Space Mouse 6dof controller driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/sidewinder.c b/drivers/input/joystick/sidewinder.c index 8e9672deb1ebe..7282301c3ae73 100644 --- a/drivers/input/joystick/sidewinder.c +++ b/drivers/input/joystick/sidewinder.c @@ -7,9 +7,6 @@ * Microsoft SideWinder joystick family driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/spaceball.c b/drivers/input/joystick/spaceball.c index a85a4f33aea8c..fa8ec533cd696 100644 --- a/drivers/input/joystick/spaceball.c +++ b/drivers/input/joystick/spaceball.c @@ -11,9 +11,6 @@ * SpaceTec SpaceBall 2003/3003/4000 FLX driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/spaceorb.c b/drivers/input/joystick/spaceorb.c index 557171483256d..dbbc69f17c89a 100644 --- a/drivers/input/joystick/spaceorb.c +++ b/drivers/input/joystick/spaceorb.c @@ -10,9 +10,6 @@ * SpaceTec SpaceOrb 360 and Avenger 6dof controller driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/stinger.c b/drivers/input/joystick/stinger.c index c20425f52bd8a..530de468cb617 100644 --- a/drivers/input/joystick/stinger.c +++ b/drivers/input/joystick/stinger.c @@ -8,9 +8,6 @@ * Gravis Stinger gamepad driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/tmdc.c b/drivers/input/joystick/tmdc.c index 7416de84b955c..93562ecc0ca1c 100644 --- a/drivers/input/joystick/tmdc.c +++ b/drivers/input/joystick/tmdc.c @@ -10,9 +10,6 @@ * ThrustMaster DirectConnect (BSP) joystick family driver for Linux */ -/* - */ - #include #include #include diff --git 
a/drivers/input/joystick/turbografx.c b/drivers/input/joystick/turbografx.c index dfe7a2cacce28..dfb9c684651f3 100644 --- a/drivers/input/joystick/turbografx.c +++ b/drivers/input/joystick/turbografx.c @@ -10,9 +10,6 @@ * TurboGraFX parallel port interface driver for Linux. */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/twidjoy.c b/drivers/input/joystick/twidjoy.c index 174c69a188fb1..9b6792ac27f10 100644 --- a/drivers/input/joystick/twidjoy.c +++ b/drivers/input/joystick/twidjoy.c @@ -32,9 +32,6 @@ * Arndt Schoenewald */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/warrior.c b/drivers/input/joystick/warrior.c index 42bdbc28d95d2..f66bddf145c22 100644 --- a/drivers/input/joystick/warrior.c +++ b/drivers/input/joystick/warrior.c @@ -7,9 +7,6 @@ * Logitech WingMan Warrior joystick driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/joystick/zhenhua.c b/drivers/input/joystick/zhenhua.c index d5531179b01f2..3f2460e2b0955 100644 --- a/drivers/input/joystick/zhenhua.c +++ b/drivers/input/joystick/zhenhua.c @@ -28,9 +28,6 @@ * coder :-( */ -/* - */ - #include #include #include diff --git a/drivers/input/keyboard/amikbd.c b/drivers/input/keyboard/amikbd.c index 09551f64d53ff..a20a4e186639c 100644 --- a/drivers/input/keyboard/amikbd.c +++ b/drivers/input/keyboard/amikbd.c @@ -10,9 +10,6 @@ * Amiga keyboard driver for Linux/m68k */ -/* - */ - #include #include #include diff --git a/drivers/input/keyboard/atakbd.c b/drivers/input/keyboard/atakbd.c index 77ed54630601e..07e17e563f9b6 100644 --- a/drivers/input/keyboard/atakbd.c +++ b/drivers/input/keyboard/atakbd.c @@ -21,9 +21,6 @@ * This driver only deals with handing key events off to the input layer. 
*/ -/* - */ - #include #include #include diff --git a/drivers/input/keyboard/lkkbd.c b/drivers/input/keyboard/lkkbd.c index ea9a1d8834c1c..047b654b3752c 100644 --- a/drivers/input/keyboard/lkkbd.c +++ b/drivers/input/keyboard/lkkbd.c @@ -46,9 +46,6 @@ * http://www.vt100.net/manx/details?pn=EK-104AA-TM-001;id=21;cp=1 */ -/* - */ - #include #include #include diff --git a/drivers/input/keyboard/newtonkbd.c b/drivers/input/keyboard/newtonkbd.c index 9742261b2d1a7..df00a119aa9a2 100644 --- a/drivers/input/keyboard/newtonkbd.c +++ b/drivers/input/keyboard/newtonkbd.c @@ -7,9 +7,6 @@ * Newton keyboard driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/keyboard/stowaway.c b/drivers/input/keyboard/stowaway.c index a4977193dd4a5..56e7849360596 100644 --- a/drivers/input/keyboard/stowaway.c +++ b/drivers/input/keyboard/stowaway.c @@ -10,9 +10,6 @@ * by Justin Cormack */ -/* - */ - #include #include #include diff --git a/drivers/input/keyboard/sunkbd.c b/drivers/input/keyboard/sunkbd.c index d450f11b98a70..b123a208ef369 100644 --- a/drivers/input/keyboard/sunkbd.c +++ b/drivers/input/keyboard/sunkbd.c @@ -7,9 +7,6 @@ * Sun keyboard driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/keyboard/xtkbd.c b/drivers/input/keyboard/xtkbd.c index 280796df679a3..c9d7c24817260 100644 --- a/drivers/input/keyboard/xtkbd.c +++ b/drivers/input/keyboard/xtkbd.c @@ -7,9 +7,6 @@ * XT keyboard driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/mouse/inport.c b/drivers/input/mouse/inport.c index df5d1160478c4..401d8bff8e842 100644 --- a/drivers/input/mouse/inport.c +++ b/drivers/input/mouse/inport.c @@ -13,9 +13,6 @@ * Inport (ATI XL and Microsoft) busmouse driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/mouse/logibm.c b/drivers/input/mouse/logibm.c index bd647f9f505a8..0aab63dbc30a3 100644 --- a/drivers/input/mouse/logibm.c +++ b/drivers/input/mouse/logibm.c 
@@ -14,9 +14,6 @@ * Logitech Bus Mouse Driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/mouse/pc110pad.c b/drivers/input/mouse/pc110pad.c index f75574766b85b..efa58049f746e 100644 --- a/drivers/input/mouse/pc110pad.c +++ b/drivers/input/mouse/pc110pad.c @@ -10,9 +10,6 @@ * IBM PC110 touchpad driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/mouse/sermouse.c b/drivers/input/mouse/sermouse.c index caa79c177c559..993f903333808 100644 --- a/drivers/input/mouse/sermouse.c +++ b/drivers/input/mouse/sermouse.c @@ -7,9 +7,6 @@ * Serial mouse driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/mouse/vsxxxaa.c b/drivers/input/mouse/vsxxxaa.c index 3bd6e723a4220..8af8e4a15f95d 100644 --- a/drivers/input/mouse/vsxxxaa.c +++ b/drivers/input/mouse/vsxxxaa.c @@ -12,9 +12,6 @@ * Later on, I had access to the device's documentation (referenced below). */ -/* - */ - /* * Building an adaptor to DE9 / DB25 RS232 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/drivers/input/serio/ct82c710.c b/drivers/input/serio/ct82c710.c index 752ce60e22116..3da751f4a6bf6 100644 --- a/drivers/input/serio/ct82c710.c +++ b/drivers/input/serio/ct82c710.c @@ -7,9 +7,6 @@ * 82C710 C&T mouse port chip driver for Linux */ -/* - */ - #include #include #include diff --git a/drivers/input/serio/q40kbd.c b/drivers/input/serio/q40kbd.c index a1c61f5de0477..ba04058fc3cbd 100644 --- a/drivers/input/serio/q40kbd.c +++ b/drivers/input/serio/q40kbd.c @@ -10,9 +10,6 @@ * Q40 PS/2 keyboard controller driver for Linux/m68k */ -/* - */ - #include #include #include diff --git a/drivers/input/serio/rpckbd.c b/drivers/input/serio/rpckbd.c index 7008bc101415b..ce420eb1f51be 100644 --- a/drivers/input/serio/rpckbd.c +++ b/drivers/input/serio/rpckbd.c @@ -8,9 +8,6 @@ * Acorn RiscPC PS/2 keyboard controller driver for Linux/ARM */ -/* - */ - #include #include #include diff --git a/drivers/input/serio/serio.c 
b/drivers/input/serio/serio.c index ec117be3d8d83..15ce3202322f0 100644 --- a/drivers/input/serio/serio.c +++ b/drivers/input/serio/serio.c @@ -7,9 +7,6 @@ * Copyright (c) 2003 Daniele Bellucci */ -/* - */ - #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/drivers/input/tablet/acecad.c b/drivers/input/tablet/acecad.c index 80e06727464da..b20e5a1afbcca 100644 --- a/drivers/input/tablet/acecad.c +++ b/drivers/input/tablet/acecad.c @@ -9,9 +9,6 @@ * v3.2 - Added sysfs support */ -/* - */ - #include #include #include diff --git a/drivers/input/tablet/hanwang.c b/drivers/input/tablet/hanwang.c index e492a0331b246..9bc631518b92d 100644 --- a/drivers/input/tablet/hanwang.c +++ b/drivers/input/tablet/hanwang.c @@ -5,9 +5,6 @@ * Copyright (c) 2010 Xing Wei */ -/* - */ - #include #include #include diff --git a/drivers/input/touchscreen/gunze.c b/drivers/input/touchscreen/gunze.c index e07e8e0fe8ea9..5a5f9da73fa18 100644 --- a/drivers/input/touchscreen/gunze.c +++ b/drivers/input/touchscreen/gunze.c @@ -7,9 +7,6 @@ * Gunze AHL-51S touchscreen driver for Linux */ -/* - */ - #include #include #include -- GitLab From 2e5021cc42ba26c98fe83b973d774a999fa4f219 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Tue, 30 Aug 2022 00:45:05 -0500 Subject: [PATCH 0776/2223] libnvdimm/region: Allow setting align attribute on regions without mappings The alignment constraint for namespace creation in a region was increased, from 2M to 16M, for non-PowerPC architectures in v5.7 with commit 2522afb86a8c ("libnvdimm/region: Introduce an 'align' attribute"). The thought behind the change was that region alignment should be uniform across all architectures and, since PowerPC had the largest alignment constraint of 16M, all architectures should conform to that alignment. 
The change regressed namespace creation in pre-defined regions that relied on 2M alignment but a workaround was provided in the form of a sysfs attribute, named 'align', that could be adjusted to a non-default alignment value. However, the sysfs attribute's store function returned an error (-ENXIO) when userspace attempted to change the alignment of a region that had no mappings. This affected 2M aligned regions of volatile memory that were defined in a device tree using "pmem-region" and created by the of_pmem_region_driver, since those regions do not contain mappings (ndr_mappings is 0). Allow userspace to set the align attribute on pre-existing regions that do not have mappings so that namespaces can still be within those regions, despite not being aligned to 16M. Link: https://lore.kernel.org/lkml/CA+CK2bDJ3hrWoE91L2wpAk+Yu0_=GtYw=4gLDDD7mxs321b_aA@mail.gmail.com Fixes: 2522afb86a8c ("libnvdimm/region: Introduce an 'align' attribute") Signed-off-by: Tyler Hicks Link: https://lore.kernel.org/r/20220830054505.1159488-1-tyhicks@linux.microsoft.com Signed-off-by: Dan Williams --- drivers/nvdimm/region_devs.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 70f1a23cbe31d..e0875d3697624 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -509,16 +509,13 @@ static ssize_t align_store(struct device *dev, { struct nd_region *nd_region = to_nd_region(dev); unsigned long val, dpa; - u32 remainder; + u32 mappings, remainder; int rc; rc = kstrtoul(buf, 0, &val); if (rc) return rc; - if (!nd_region->ndr_mappings) - return -ENXIO; - /* * Ensure space-align is evenly divisible by the region * interleave-width because the kernel typically has no facility @@ -526,7 +523,8 @@ static ssize_t align_store(struct device *dev, * contribute to the tail capacity in system-physical-address * space for the namespace. 
*/ - dpa = div_u64_rem(val, nd_region->ndr_mappings, &remainder); + mappings = max_t(u32, 1, nd_region->ndr_mappings); + dpa = div_u64_rem(val, mappings, &remainder); if (!is_power_of_2(dpa) || dpa < PAGE_SIZE || val > region_size(nd_region) || remainder) return -EINVAL; -- GitLab From 0f702033a64bd3adcd57c9d5cf91ea64c08fad42 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Sun, 25 Sep 2022 21:26:35 -0400 Subject: [PATCH 0777/2223] dax: Remove usage of the deprecated ida_simple_xxx API ida_alloc_max() makes it clear that the second argument is inclusive, and the alloc/free terminology is more idiomatic and symmetric than get/remove. Signed-off-by: Bo Liu Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220926012635.3205-1-liubo03@inspur.com [djbw: reword changelog] Signed-off-by: Dan Williams --- drivers/dax/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 9b5e2a5eb0ae6..da4438f3188c8 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -363,7 +363,7 @@ static void dax_free_inode(struct inode *inode) { struct dax_device *dax_dev = to_dax_dev(inode); if (inode->i_rdev) - ida_simple_remove(&dax_minor_ida, iminor(inode)); + ida_free(&dax_minor_ida, iminor(inode)); kmem_cache_free(dax_cache, dax_dev); } @@ -445,7 +445,7 @@ struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) if (WARN_ON_ONCE(ops && !ops->zero_page_range)) return ERR_PTR(-EINVAL); - minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL); + minor = ida_alloc_max(&dax_minor_ida, MINORMASK, GFP_KERNEL); if (minor < 0) return ERR_PTR(-ENOMEM); @@ -459,7 +459,7 @@ struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) return dax_dev; err_dev: - ida_simple_remove(&dax_minor_ida, minor); + ida_free(&dax_minor_ida, minor); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL_GPL(alloc_dax); -- GitLab From 6a02124c87f0b61dcaaeb65e7fd406d8afb40fd4 Mon Sep 17 00:00:00 2001
From: Lin Yujun Date: Wed, 14 Sep 2022 11:37:55 +0800 Subject: [PATCH 0778/2223] ACPI: HMAT: Release platform device in case of platform_device_add_data() fails The platform device is not released when platform_device_add_data() fails. And platform_device_put() performs one more pointer check than put_device() to check for errors in the 'pdev' pointer. Use platform_device_put() to release platform device in platform_device_add()/platform_device_add_data()/ platform_device_add_resources() error case. Fixes: c01044cc8191 ("ACPI: HMAT: refactor hmat_register_target_device to hmem_register_device") Signed-off-by: Lin Yujun Link: https://lore.kernel.org/r/20220914033755.99924-1-linyujun809@huawei.com Signed-off-by: Dan Williams --- drivers/dax/hmem/device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c index cb6401c9e9a4f..f87ae005431a5 100644 --- a/drivers/dax/hmem/device.c +++ b/drivers/dax/hmem/device.c @@ -47,7 +47,7 @@ void hmem_register_device(int target_nid, struct resource *r) rc = platform_device_add_data(pdev, &info, sizeof(info)); if (rc < 0) { pr_err("hmem memregion_info allocation failure for %pr\n", &res); - goto out_pdev; + goto out_resource; } rc = platform_device_add_resources(pdev, &res, 1); @@ -65,7 +65,7 @@ void hmem_register_device(int target_nid, struct resource *r) return; out_resource: - put_device(&pdev->dev); + platform_device_put(pdev); out_pdev: memregion_free(id); } -- GitLab From 04f2f60befc9af274c1790e626cc79334b1f4489 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 28 Sep 2022 23:36:48 +0000 Subject: [PATCH 0779/2223] KVM: selftests: Remove unnecessary register shuffling in fix_hypercall_test Use input constraints to load RAX and RBX when testing that KVM correctly does/doesn't patch the "wrong" hypercall. There's no need to manually load RAX and RBX, and no reason to clobber them either (KVM is not supposed to modify anything other than RAX).
Signed-off-by: Sean Christopherson Reviewed-by: Oliver Upton Message-Id: <20220928233652.783504-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/fix_hypercall_test.c | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c index e0004bd265360..6864eb0d5d14e 100644 --- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c +++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c @@ -30,14 +30,11 @@ static uint64_t svm_do_sched_yield(uint8_t apic_id) { uint64_t ret; - asm volatile("mov %1, %%rax\n\t" - "mov %2, %%rbx\n\t" - "svm_hypercall_insn:\n\t" + asm volatile("svm_hypercall_insn:\n\t" "vmmcall\n\t" - "mov %%rax, %0\n\t" - : "=r"(ret) - : "r"((uint64_t)KVM_HC_SCHED_YIELD), "r"((uint64_t)apic_id) - : "rax", "rbx", "memory"); + : "=a"(ret) + : "a"((uint64_t)KVM_HC_SCHED_YIELD), "b"((uint64_t)apic_id) + : "memory"); return ret; } @@ -47,14 +44,11 @@ static uint64_t vmx_do_sched_yield(uint8_t apic_id) { uint64_t ret; - asm volatile("mov %1, %%rax\n\t" - "mov %2, %%rbx\n\t" - "vmx_hypercall_insn:\n\t" + asm volatile("vmx_hypercall_insn:\n\t" "vmcall\n\t" - "mov %%rax, %0\n\t" - : "=r"(ret) - : "r"((uint64_t)KVM_HC_SCHED_YIELD), "r"((uint64_t)apic_id) - : "rax", "rbx", "memory"); + : "=a"(ret) + : "a"((uint64_t)KVM_HC_SCHED_YIELD), "b"((uint64_t)apic_id) + : "memory"); return ret; } -- GitLab From fca6d06cd164c6c3029be6323ed06020fca0d933 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 28 Sep 2022 23:36:49 +0000 Subject: [PATCH 0780/2223] KVM: selftests: Hardcode VMCALL/VMMCALL opcodes in "fix hypercall" test Hardcode the VMCALL/VMMCALL opcodes in dedicated arrays instead of extracting the opcodes from inline asm, and patch in the "other" opcode so as to preserve the original opcode, i.e. the opcode that the test executes in the guest. 
Preserving the original opcode (by not patching the source), will make it easier to implement a check that KVM doesn't modify the opcode (the test currently only verifies that a #UD occurred). Use INT3 (0xcc) as the placeholder so that the guest will likely die a horrible death if the test's patching goes awry. As a bonus, patching from within the test dedups a decent chunk of code. Signed-off-by: Sean Christopherson Message-Id: <20220928233652.783504-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/fix_hypercall_test.c | 43 +++++++------------ 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c index 6864eb0d5d14e..cebc84b263521 100644 --- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c +++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c @@ -25,27 +25,16 @@ static void guest_ud_handler(struct ex_regs *regs) GUEST_DONE(); } -extern uint8_t svm_hypercall_insn[HYPERCALL_INSN_SIZE]; -static uint64_t svm_do_sched_yield(uint8_t apic_id) -{ - uint64_t ret; - - asm volatile("svm_hypercall_insn:\n\t" - "vmmcall\n\t" - : "=a"(ret) - : "a"((uint64_t)KVM_HC_SCHED_YIELD), "b"((uint64_t)apic_id) - : "memory"); +static const uint8_t vmx_vmcall[HYPERCALL_INSN_SIZE] = { 0x0f, 0x01, 0xc1 }; +static const uint8_t svm_vmmcall[HYPERCALL_INSN_SIZE] = { 0x0f, 0x01, 0xd9 }; - return ret; -} - -extern uint8_t vmx_hypercall_insn[HYPERCALL_INSN_SIZE]; -static uint64_t vmx_do_sched_yield(uint8_t apic_id) +extern uint8_t hypercall_insn[HYPERCALL_INSN_SIZE]; +static uint64_t do_sched_yield(uint8_t apic_id) { uint64_t ret; - asm volatile("vmx_hypercall_insn:\n\t" - "vmcall\n\t" + asm volatile("hypercall_insn:\n\t" + ".byte 0xcc,0xcc,0xcc\n\t" : "=a"(ret) : "a"((uint64_t)KVM_HC_SCHED_YIELD), "b"((uint64_t)apic_id) : "memory"); @@ -55,25 +44,25 @@ static uint64_t vmx_do_sched_yield(uint8_t apic_id) static void 
guest_main(void) { - uint8_t *native_hypercall_insn, *hypercall_insn; - uint8_t apic_id; - - apic_id = GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID)); + const uint8_t *native_hypercall_insn; + const uint8_t *other_hypercall_insn; if (is_intel_cpu()) { - native_hypercall_insn = vmx_hypercall_insn; - hypercall_insn = svm_hypercall_insn; - svm_do_sched_yield(apic_id); + native_hypercall_insn = vmx_vmcall; + other_hypercall_insn = svm_vmmcall; } else if (is_amd_cpu()) { - native_hypercall_insn = svm_hypercall_insn; - hypercall_insn = vmx_hypercall_insn; - vmx_do_sched_yield(apic_id); + native_hypercall_insn = svm_vmmcall; + other_hypercall_insn = vmx_vmcall; } else { GUEST_ASSERT(0); /* unreachable */ return; } + memcpy(hypercall_insn, other_hypercall_insn, HYPERCALL_INSN_SIZE); + + do_sched_yield(GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID))); + /* * The hypercall didn't #UD (guest_ud_handler() signals "done" if a #UD * occurs). Verify that a #UD is NOT expected and that KVM patched in -- GitLab From b7ab6d7d2cf7d5400592d1ff0448e8e0a09e5188 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 28 Sep 2022 23:36:50 +0000 Subject: [PATCH 0781/2223] KVM: selftests: Explicitly verify KVM doesn't patch hypercall if quirk==off Explicitly verify that KVM doesn't patch in the native hypercall if the FIX_HYPERCALL_INSN quirk is disabled. The test currently verifies that a #UD occurred, but doesn't actually verify that no patching occurred. 
Signed-off-by: Sean Christopherson Message-Id: <20220928233652.783504-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/fix_hypercall_test.c | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c index cebc84b263521..10b9482fc4d71 100644 --- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c +++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c @@ -21,8 +21,8 @@ static bool ud_expected; static void guest_ud_handler(struct ex_regs *regs) { - GUEST_ASSERT(ud_expected); - GUEST_DONE(); + regs->rax = -EFAULT; + regs->rip += HYPERCALL_INSN_SIZE; } static const uint8_t vmx_vmcall[HYPERCALL_INSN_SIZE] = { 0x0f, 0x01, 0xc1 }; @@ -46,6 +46,7 @@ static void guest_main(void) { const uint8_t *native_hypercall_insn; const uint8_t *other_hypercall_insn; + uint64_t ret; if (is_intel_cpu()) { native_hypercall_insn = vmx_vmcall; @@ -61,15 +62,24 @@ static void guest_main(void) memcpy(hypercall_insn, other_hypercall_insn, HYPERCALL_INSN_SIZE); - do_sched_yield(GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID))); + ret = do_sched_yield(GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID))); /* - * The hypercall didn't #UD (guest_ud_handler() signals "done" if a #UD - * occurs). Verify that a #UD is NOT expected and that KVM patched in - * the native hypercall. + * If the quirk is disabled, verify that guest_ud_handler() "returned" + * -EFAULT and that KVM did NOT patch the hypercall. If the quirk is + * enabled, verify that the hypercall succeeded and that KVM patched in + * the "right" hypercall. 
*/ - GUEST_ASSERT(!ud_expected); - GUEST_ASSERT(!memcmp(native_hypercall_insn, hypercall_insn, HYPERCALL_INSN_SIZE)); + if (ud_expected) { + GUEST_ASSERT(ret == (uint64_t)-EFAULT); + GUEST_ASSERT(!memcmp(other_hypercall_insn, hypercall_insn, + HYPERCALL_INSN_SIZE)); + } else { + GUEST_ASSERT(!ret); + GUEST_ASSERT(!memcmp(native_hypercall_insn, hypercall_insn, + HYPERCALL_INSN_SIZE)); + } + GUEST_DONE(); } -- GitLab From 53c9bdb922f40a7abf3b1642f8a39d3b94d10d62 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 28 Sep 2022 23:36:51 +0000 Subject: [PATCH 0782/2223] KVM: selftests: Dedup subtests of fix_hypercall_test Combine fix_hypercall_test's two subtests into a common routine, the only difference between the two is whether or not the quirk is disabled. Passing a boolean is a little gross, but using an enum to make it super obvious that the callers are enabling/disabling the quirk seems like overkill. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Oliver Upton Message-Id: <20220928233652.783504-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/fix_hypercall_test.c | 45 ++++++------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c index 10b9482fc4d71..32f7e09ef67cb 100644 --- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c +++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c @@ -17,7 +17,7 @@ /* VMCALL and VMMCALL are both 3-byte opcodes. */ #define HYPERCALL_INSN_SIZE 3 -static bool ud_expected; +static bool quirk_disabled; static void guest_ud_handler(struct ex_regs *regs) { @@ -70,7 +70,7 @@ static void guest_main(void) * enabled, verify that the hypercall succeeded and that KVM patched in * the "right" hypercall. 
*/ - if (ud_expected) { + if (quirk_disabled) { GUEST_ASSERT(ret == (uint64_t)-EFAULT); GUEST_ASSERT(!memcmp(other_hypercall_insn, hypercall_insn, HYPERCALL_INSN_SIZE)); @@ -83,13 +83,6 @@ static void guest_main(void) GUEST_DONE(); } -static void setup_ud_vector(struct kvm_vcpu *vcpu) -{ - vm_init_descriptor_tables(vcpu->vm); - vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler); -} - static void enter_guest(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; @@ -110,35 +103,23 @@ static void enter_guest(struct kvm_vcpu *vcpu) } } -static void test_fix_hypercall(void) +static void test_fix_hypercall(bool disable_quirk) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; vm = vm_create_with_one_vcpu(&vcpu, guest_main); - setup_ud_vector(vcpu); - - ud_expected = false; - sync_global_to_guest(vm, ud_expected); - - virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); - - enter_guest(vcpu); -} - -static void test_fix_hypercall_disabled(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - vm = vm_create_with_one_vcpu(&vcpu, guest_main); - setup_ud_vector(vcpu); + vm_init_descriptor_tables(vcpu->vm); + vcpu_init_descriptor_tables(vcpu); + vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler); - vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, - KVM_X86_QUIRK_FIX_HYPERCALL_INSN); + if (disable_quirk) + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, + KVM_X86_QUIRK_FIX_HYPERCALL_INSN); - ud_expected = true; - sync_global_to_guest(vm, ud_expected); + quirk_disabled = disable_quirk; + sync_global_to_guest(vm, quirk_disabled); virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); @@ -149,6 +130,6 @@ int main(void) { TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_FIX_HYPERCALL_INSN); - test_fix_hypercall(); - test_fix_hypercall_disabled(); + test_fix_hypercall(false); + test_fix_hypercall(true); } -- GitLab From c96409d1e58905bfc8c73b630481228382ab8846 Mon Sep 17 00:00:00 2001 From: Sean 
Christopherson Date: Wed, 28 Sep 2022 23:36:52 +0000 Subject: [PATCH 0783/2223] Revert "KVM: selftests: Fix nested SVM tests when built with clang" Revert back to using memset() in generic_svm_setup() now that KVM selftests override memset() and friends specifically to prevent the compiler from generating fancy code and/or linking to the libc implementation. This reverts commit ed290e1c20da19fa100a3e0f421aa31b65984960. Suggested-by: Jim Mattson Signed-off-by: Sean Christopherson Message-Id: <20220928233652.783504-8-seanjc@google.com> Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/x86_64/svm.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c index 6d445886e16c5..5495a92dfd5a4 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/svm.c +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -60,18 +60,6 @@ static void vmcb_set_seg(struct vmcb_seg *seg, u16 selector, seg->base = base; } -/* - * Avoid using memset to clear the vmcb, since libc may not be - * available in L1 (and, even if it is, features that libc memset may - * want to use, like AVX, may not be enabled). 
- */ -static void clear_vmcb(struct vmcb *vmcb) -{ - int n = sizeof(*vmcb) / sizeof(u32); - - asm volatile ("rep stosl" : "+c"(n), "+D"(vmcb) : "a"(0) : "memory"); -} - void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp) { struct vmcb *vmcb = svm->vmcb; @@ -88,7 +76,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r wrmsr(MSR_EFER, efer | EFER_SVME); wrmsr(MSR_VM_HSAVE_PA, svm->save_area_gpa); - clear_vmcb(vmcb); + memset(vmcb, 0, sizeof(*vmcb)); asm volatile ("vmsave %0\n\t" : : "a" (vmcb_gpa) : "memory"); vmcb_set_seg(&save->es, get_es(), 0, -1U, data_seg_attr); vmcb_set_seg(&save->cs, get_cs(), 0, -1U, code_seg_attr); -- GitLab From 62ece2c5a95cc989648c39155173d3bae27e89a3 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 29 Sep 2022 11:12:05 -0700 Subject: [PATCH 0784/2223] KVM: selftests: Tell the compiler that code after TEST_FAIL() is unreachable Add __builtin_unreachable() to TEST_FAIL() so that the compiler knows that any code after a TEST_FAIL() is unreachable. Signed-off-by: David Matlack Message-Id: <20220929181207.2281449-2-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/test_util.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 5c5a88180b6c3..befc754ce9b3b 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -63,8 +63,10 @@ void test_assert(bool exp, const char *exp_str, #a, #b, #a, (unsigned long) __a, #b, (unsigned long) __b); \ } while (0) -#define TEST_FAIL(fmt, ...) \ - TEST_ASSERT(false, fmt, ##__VA_ARGS__) +#define TEST_FAIL(fmt, ...) 
do { \ + TEST_ASSERT(false, fmt, ##__VA_ARGS__); \ + __builtin_unreachable(); \ +} while (0) size_t parse_size(const char *size); -- GitLab From 4d2bd14319e4e0a49fc868c50ca0b2a747b58208 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 29 Sep 2022 11:12:06 -0700 Subject: [PATCH 0785/2223] KVM: selftests: Add helpers to read kvm_{intel,amd} boolean module parameters Add helper functions for reading the value of kvm_intel and kvm_amd boolean module parameters. Use the kvm_intel variant in vm_is_unrestricted_guest() to simplify the check for kvm_intel.unrestricted_guest. No functional change intended. Signed-off-by: David Matlack Message-Id: <20220929181207.2281449-3-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/kvm_util_base.h | 4 ++ tools/testing/selftests/kvm/lib/kvm_util.c | 39 +++++++++++++++++++ .../selftests/kvm/lib/x86_64/processor.c | 13 +------ 3 files changed, 44 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 24fde97f61211..e42a09cd24a04 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -175,6 +175,10 @@ extern const struct vm_guest_mode_params vm_guest_mode_params[]; int open_path_or_exit(const char *path, int flags); int open_kvm_dev_path_or_exit(void); + +bool get_kvm_intel_param_bool(const char *param); +bool get_kvm_amd_param_bool(const char *param); + unsigned int kvm_check_cap(long cap); static inline bool kvm_has_cap(long cap) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 9889fe0d8919c..504c1e1355c3a 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -50,6 +50,45 @@ int open_kvm_dev_path_or_exit(void) return _open_kvm_dev_path_or_exit(O_RDONLY); } +static bool get_module_param_bool(const char *module_name, const 
char *param) +{ + const int path_size = 128; + char path[path_size]; + char value; + ssize_t r; + int fd; + + r = snprintf(path, path_size, "/sys/module/%s/parameters/%s", + module_name, param); + TEST_ASSERT(r < path_size, + "Failed to construct sysfs path in %d bytes.", path_size); + + fd = open_path_or_exit(path, O_RDONLY); + + r = read(fd, &value, 1); + TEST_ASSERT(r == 1, "read(%s) failed", path); + + r = close(fd); + TEST_ASSERT(!r, "close(%s) failed", path); + + if (value == 'Y') + return true; + else if (value == 'N') + return false; + + TEST_FAIL("Unrecognized value '%c' for boolean module param", value); +} + +bool get_kvm_intel_param_bool(const char *param) +{ + return get_module_param_bool("kvm_intel", param); +} + +bool get_kvm_amd_param_bool(const char *param) +{ + return get_module_param_bool("kvm_amd", param); +} + /* * Capability * diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 2e6e61bbe81b3..fab0f526fb818 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1294,20 +1294,9 @@ done: /* Returns true if kvm_intel was loaded with unrestricted_guest=1. */ bool vm_is_unrestricted_guest(struct kvm_vm *vm) { - char val = 'N'; - size_t count; - FILE *f; - /* Ensure that a KVM vendor-specific module is loaded. 
*/ if (vm == NULL) close(open_kvm_dev_path_or_exit()); - f = fopen("/sys/module/kvm_intel/parameters/unrestricted_guest", "r"); - if (f) { - count = fread(&val, sizeof(char), 1, f); - TEST_ASSERT(count == 1, "Unable to read from param file."); - fclose(f); - } - - return val == 'Y'; + return get_kvm_intel_param_bool("unrestricted_guest"); } -- GitLab From 458e98746fa852d744d34b5a8d0b1673959efc2f Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 29 Sep 2022 11:12:07 -0700 Subject: [PATCH 0786/2223] KVM: selftests: Fix nx_huge_pages_test on TDP-disabled hosts Map the test's huge page region with 2MiB virtual mappings when TDP is disabled so that KVM can shadow the region with huge pages. This fixes nx_huge_pages_test on hosts where TDP hardware support is disabled. Purposely do not skip this test on TDP-disabled hosts. While we don't care about NX Huge Pages on TDP-disabled hosts from a security perspective, KVM does support it, and so we should test it. For TDP-enabled hosts, continue mapping the region with 4KiB pages to ensure that KVM can map it with huge pages irrespective of the guest mappings. 
Fixes: 8448ec5993be ("KVM: selftests: Add NX huge pages test") Signed-off-by: David Matlack Message-Id: <20220929181207.2281449-4-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/processor.h | 4 +++ .../selftests/kvm/lib/x86_64/processor.c | 27 +++++++++++++++++++ .../selftests/kvm/x86_64/nx_huge_pages_test.c | 19 +++++++++++-- 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 0cbc71b7af50a..e8ca0d8a6a7e0 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -825,6 +825,8 @@ static inline uint8_t wrmsr_safe(uint32_t msr, uint64_t val) return kvm_asm_safe("wrmsr", "a"(val & -1u), "d"(val >> 32), "c"(msr)); } +bool kvm_is_tdp_enabled(void); + uint64_t vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, uint64_t vaddr); void vm_set_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, @@ -855,6 +857,8 @@ enum pg_level { #define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G) void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level); +void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint64_t nr_bytes, int level); /* * Basic CPU control in CR0 diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index fab0f526fb818..39c4409ef56a6 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -111,6 +111,14 @@ static void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent) } } +bool kvm_is_tdp_enabled(void) +{ + if (is_intel_cpu()) + return get_kvm_intel_param_bool("ept"); + else + return get_kvm_amd_param_bool("npt"); +} + void virt_arch_pgd_alloc(struct kvm_vm *vm) { TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " @@ 
-214,6 +222,25 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K); } +void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint64_t nr_bytes, int level) +{ + uint64_t pg_size = PG_LEVEL_SIZE(level); + uint64_t nr_pages = nr_bytes / pg_size; + int i; + + TEST_ASSERT(nr_bytes % pg_size == 0, + "Region size not aligned: nr_bytes: 0x%lx, page size: 0x%lx", + nr_bytes, pg_size); + + for (i = 0; i < nr_pages; i++) { + __virt_pg_map(vm, vaddr, paddr, level); + + vaddr += pg_size; + paddr += pg_size; + } +} + static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, uint64_t vaddr) diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c index cc64217164005..8c1181a5ba56c 100644 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c @@ -112,6 +112,7 @@ void run_test(int reclaim_period_ms, bool disable_nx_huge_pages, { struct kvm_vcpu *vcpu; struct kvm_vm *vm; + uint64_t nr_bytes; void *hva; int r; @@ -141,10 +142,24 @@ void run_test(int reclaim_period_ms, bool disable_nx_huge_pages, HPAGE_GPA, HPAGE_SLOT, HPAGE_SLOT_NPAGES, 0); - virt_map(vm, HPAGE_GVA, HPAGE_GPA, HPAGE_SLOT_NPAGES); + nr_bytes = HPAGE_SLOT_NPAGES * vm->page_size; + + /* + * Ensure that KVM can map HPAGE_SLOT with huge pages by mapping the + * region into the guest with 2MiB pages whenever TDP is disabled (i.e. + * whenever KVM is shadowing the guest page tables). + * + * When TDP is enabled, KVM should be able to map HPAGE_SLOT with huge + * pages irrespective of the guest page size, so map with 4KiB pages + * to test that that is the case. 
+ */ + if (kvm_is_tdp_enabled()) + virt_map_level(vm, HPAGE_GVA, HPAGE_GPA, nr_bytes, PG_LEVEL_4K); + else + virt_map_level(vm, HPAGE_GVA, HPAGE_GPA, nr_bytes, PG_LEVEL_2M); hva = addr_gpa2hva(vm, HPAGE_GPA); - memset(hva, RETURN_OPCODE, HPAGE_SLOT_NPAGES * PAGE_SIZE); + memset(hva, RETURN_OPCODE, nr_bytes); check_2m_page_count(vm, 0); check_split_count(vm, 0); -- GitLab From f96c48e9ddf40f6abf0a67aa94642701294daf79 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Fri, 23 Sep 2022 23:05:36 +0800 Subject: [PATCH 0787/2223] kvm: mmu: fix typos in struct kvm_arch No 'kvmp_mmu_pages', it should be 'kvm_mmu_page'. And struct kvm_mmu_pages and struct kvm_mmu_page are different structures, here should be kvm_mmu_page. kvm_mmu_pages is defined in arch/x86/kvm/mmu/mmu.c. Suggested-by: David Matlack Signed-off-by: Peng Hao Reviewed-by: David Matlack Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 61b9dd34d333e..7551b6f9c31c5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1280,8 +1280,8 @@ struct kvm_arch { bool tdp_mmu_enabled; /* - * List of struct kvm_mmu_pages being used as roots. - * All struct kvm_mmu_pages in the list should have + * List of kvm_mmu_page structs being used as roots. + * All kvm_mmu_page structs in the list should have * tdp_mmu_page set. * * For reads, this list is protected by: @@ -1300,8 +1300,8 @@ struct kvm_arch { struct list_head tdp_mmu_roots; /* - * List of struct kvmp_mmu_pages not being used as roots. - * All struct kvm_mmu_pages in the list should have + * List of kvm_mmu_page structs not being used as roots. + * All kvm_mmu_page structs in the list should have * tdp_mmu_page set and a tdp_mmu_root_count of 0. 
*/ struct list_head tdp_mmu_pages; @@ -1311,9 +1311,9 @@ struct kvm_arch { * is held in read mode: * - tdp_mmu_roots (above) * - tdp_mmu_pages (above) - * - the link field of struct kvm_mmu_pages used by the TDP MMU + * - the link field of kvm_mmu_page structs used by the TDP MMU * - lpage_disallowed_mmu_pages - * - the lpage_disallowed_link field of struct kvm_mmu_pages used + * - the lpage_disallowed_link field of kvm_mmu_page structs used * by the TDP MMU * It is acceptable, but not necessary, to acquire this lock when * the thread holds the MMU lock in write mode. -- GitLab From e779ce9d17c44a338b4fa3be8715e3b7eb9706f0 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Fri, 23 Sep 2022 23:03:03 +0800 Subject: [PATCH 0788/2223] kvm: vmx: keep constant definition format consistent Keep all constants using lowercase "x". Signed-off-by: Peng Hao Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index c371ef695fcc0..498dc600bd5c8 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -309,7 +309,7 @@ enum vmcs_field { GUEST_LDTR_AR_BYTES = 0x00004820, GUEST_TR_AR_BYTES = 0x00004822, GUEST_INTERRUPTIBILITY_INFO = 0x00004824, - GUEST_ACTIVITY_STATE = 0X00004826, + GUEST_ACTIVITY_STATE = 0x00004826, GUEST_SYSENTER_CS = 0x0000482A, VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, HOST_IA32_SYSENTER_CS = 0x00004c00, -- GitLab From f423fa1bc9fe1978e6b9f54927411b62cb43eb04 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 29 Sep 2022 14:48:35 -0300 Subject: [PATCH 0789/2223] drm/i915/gvt: Add missing vfio_unregister_group_dev() call When converting to directly create the vfio_device the mdev driver has to put a vfio_register_emulated_iommu_dev() in the probe() and a pairing vfio_unregister_group_dev() in the remove. This was missed for gvt, add it. 
Cc: stable@vger.kernel.org Fixes: 978cf586ac35 ("drm/i915/gvt: convert to use vfio_register_emulated_iommu_dev") Reported-by: Alex Williamson Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/0-v1-013609965fe8+9d-vfio_gvt_unregister_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/kvmgt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 41bba40feef8f..9003145adb5a9 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1615,6 +1615,7 @@ static void intel_vgpu_remove(struct mdev_device *mdev) if (WARN_ON_ONCE(vgpu->attached)) return; + vfio_unregister_group_dev(&vgpu->vfio_device); vfio_put_device(&vgpu->vfio_device); } -- GitLab From 1d666ab2dad5b311cd7d742607afcc59a2558925 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 30 Sep 2022 20:50:18 -0700 Subject: [PATCH 0790/2223] dt-bindings: input: Convert hid-over-i2c to DT schema Convert the hid-over-i2c binding to DT schema format. The supplies should probably be specific to a specific device, but it seems they are already in use otherwise. 'wakeup-source' is added as it was not explicitly documented. There's a few warnings for undocumented properties 'vcc-supply' and 'reset-gpios'. Those remain as they probably should have a specific compatible as well. 
Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220927150916.1091217-1-robh@kernel.org Signed-off-by: Dmitry Torokhov --- .../bindings/input/hid-over-i2c.txt | 46 ---------- .../bindings/input/hid-over-i2c.yaml | 83 +++++++++++++++++++ 2 files changed, 83 insertions(+), 46 deletions(-) delete mode 100644 Documentation/devicetree/bindings/input/hid-over-i2c.txt create mode 100644 Documentation/devicetree/bindings/input/hid-over-i2c.yaml diff --git a/Documentation/devicetree/bindings/input/hid-over-i2c.txt b/Documentation/devicetree/bindings/input/hid-over-i2c.txt deleted file mode 100644 index 34c43d3bddfd1..0000000000000 --- a/Documentation/devicetree/bindings/input/hid-over-i2c.txt +++ /dev/null @@ -1,46 +0,0 @@ -* HID over I2C Device-Tree bindings - -HID over I2C provides support for various Human Interface Devices over the -I2C bus. These devices can be for example touchpads, keyboards, touch screens -or sensors. - -The specification has been written by Microsoft and is currently available here: -http://msdn.microsoft.com/en-us/library/windows/hardware/hh852380.aspx - -If this binding is used, the kernel module i2c-hid will handle the communication -with the device and the generic hid core layer will handle the protocol. - -Required properties: -- compatible: must be "hid-over-i2c" -- reg: i2c slave address -- hid-descr-addr: HID descriptor address -- interrupts: interrupt line - -Additional optional properties: - -Some devices may support additional optional properties to help with, e.g., -power sequencing. The following properties can be supported by one or more -device-specific compatible properties, which should be used in addition to the -"hid-over-i2c" string. - -- compatible: - * "wacom,w9013" (Wacom W9013 digitizer). Supports: - - vdd-supply (3.3V) - - vddl-supply (1.8V) - - post-power-on-delay-ms - -- vdd-supply: phandle of the regulator that provides the supply voltage. 
-- post-power-on-delay-ms: time required by the device after enabling its regulators - or powering it on, before it is ready for communication. -- touchscreen-inverted-x: See touchscreen.txt -- touchscreen-inverted-y: See touchscreen.txt - -Example: - - i2c-hid-dev@2c { - compatible = "hid-over-i2c"; - reg = <0x2c>; - hid-descr-addr = <0x0020>; - interrupt-parent = <&gpx3>; - interrupts = <3 2>; - }; diff --git a/Documentation/devicetree/bindings/input/hid-over-i2c.yaml b/Documentation/devicetree/bindings/input/hid-over-i2c.yaml new file mode 100644 index 0000000000000..7156b08f76453 --- /dev/null +++ b/Documentation/devicetree/bindings/input/hid-over-i2c.yaml @@ -0,0 +1,83 @@ +# SPDX-License-Identifier: GPL-2.0-only +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/hid-over-i2c.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: HID over I2C Devices + +maintainers: + - Benjamin Tissoires + - Jiri Kosina + +description: |+ + HID over I2C provides support for various Human Interface Devices over the + I2C bus. These devices can be for example touchpads, keyboards, touch screens + or sensors. + + The specification has been written by Microsoft and is currently available here: + https://msdn.microsoft.com/en-us/library/windows/hardware/hh852380.aspx + + If this binding is used, the kernel module i2c-hid will handle the communication + with the device and the generic hid core layer will handle the protocol. + +allOf: + - $ref: /schemas/input/touchscreen/touchscreen.yaml# + +properties: + compatible: + oneOf: + - items: + - enum: + - wacom,w9013 + - const: hid-over-i2c + - description: Just "hid-over-i2c" alone is allowed, but not recommended. 
+ const: hid-over-i2c + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + hid-descr-addr: + description: HID descriptor address + $ref: /schemas/types.yaml#/definitions/uint32 + + post-power-on-delay-ms: + description: Time required by the device after enabling its regulators + or powering it on, before it is ready for communication. + + touchscreen-inverted-x: true + + touchscreen-inverted-y: true + + vdd-supply: + description: 3.3V supply + + vddl-supply: + description: 1.8V supply + + wakeup-source: true + +required: + - compatible + - reg + - interrupts + +additionalProperties: false + +examples: + - | + i2c { + #address-cells = <1>; + #size-cells = <0>; + + hid@2c { + compatible = "hid-over-i2c"; + reg = <0x2c>; + hid-descr-addr = <0x0020>; + interrupts = <3 2>; + }; + }; +... -- GitLab From 75024261403af74051e6aeb1b0a2dc2bca2458dc Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Fri, 30 Sep 2022 22:52:34 -0700 Subject: [PATCH 0791/2223] dt-bindings: input: Add the PinePhone keyboard binding Add devicetree support for the PinePhone keyboard case, which provides a matrix keyboard interface and a proxied I2C bus. 
Reviewed-by: Krzysztof Kozlowski Signed-off-by: Samuel Holland Link: https://lore.kernel.org/r/20220618165747.55709-2-samuel@sholland.org Signed-off-by: Dmitry Torokhov --- .../input/pine64,pinephone-keyboard.yaml | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 Documentation/devicetree/bindings/input/pine64,pinephone-keyboard.yaml diff --git a/Documentation/devicetree/bindings/input/pine64,pinephone-keyboard.yaml b/Documentation/devicetree/bindings/input/pine64,pinephone-keyboard.yaml new file mode 100644 index 0000000000000..e4a0ac0fff9a7 --- /dev/null +++ b/Documentation/devicetree/bindings/input/pine64,pinephone-keyboard.yaml @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/pine64,pinephone-keyboard.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Pine64 PinePhone keyboard device tree bindings + +maintainers: + - Samuel Holland + +description: + A keyboard accessory is available for the Pine64 PinePhone and PinePhone Pro. + It connects via I2C, providing a raw scan matrix, a flashing interface, and a + subordinate I2C bus for communication with a battery charger IC. 
+ +properties: + compatible: + const: pine64,pinephone-keyboard + + reg: + const: 0x15 + + interrupts: + maxItems: 1 + + vbat-supply: + description: Supply for the keyboard MCU + + wakeup-source: true + + i2c: + $ref: /schemas/i2c/i2c-controller.yaml# + +required: + - compatible + - reg + - interrupts + +additionalProperties: false + +examples: + - | + #include + #include + + i2c { + #address-cells = <1>; + #size-cells = <0>; + + keyboard@15 { + compatible = "pine64,pinephone-keyboard"; + reg = <0x15>; + interrupt-parent = <&r_pio>; + interrupts = <0 12 IRQ_TYPE_EDGE_FALLING>; /* PL12 */ + + i2c { + #address-cells = <1>; + #size-cells = <0>; + + charger@75 { + reg = <0x75>; + }; + }; + }; + }; -- GitLab From ac107abef197660c9db529fe550080ad07b46a67 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 1 Oct 2022 10:12:45 +0100 Subject: [PATCH 0792/2223] KVM: arm64: Advertise new kvmarm mailing list As announced on the kvmarm list, we're moving the mailing list over to kvmarm@lists.linux.dev: As you probably all know, the kvmarm mailing has been hosted on Columbia's machines for as long as the project existed (over 13 years). After all this time, the university has decided to retire the list infrastructure and asked us to find a new hosting. A new mailing list has been created on lists.linux.dev[1], and I'm kindly asking everyone interested in following the KVM/arm64 developments to start subscribing to it (and start posting your patches there). I hope that people will move over to it quickly enough that we can soon give Columbia the green light to turn their systems off. Note that the new list will only get archived automatically once we fully switch over, but I'll make sure we fill any gap and not lose any message. In the meantime, please Cc both lists. [...] 
[1] https://subspace.kernel.org/lists.linux.dev.html Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221001091245.3900668-1-maz@kernel.org --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 589517372408c..f29f27717de4d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11124,7 +11124,8 @@ R: Alexandru Elisei R: Suzuki K Poulose R: Oliver Upton L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -L: kvmarm@lists.cs.columbia.edu (moderated for non-subscribers) +L: kvmarm@lists.linux.dev +L: kvmarm@lists.cs.columbia.edu (deprecated, moderated for non-subscribers) S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git F: arch/arm64/include/asm/kvm* -- GitLab From 7fc4426959e17178654404e6bde4b920b5fee7c7 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Sun, 2 Oct 2022 10:17:58 +0530 Subject: [PATCH 0793/2223] riscv: Add X register names to gpr-nums When encoding instructions it's sometimes necessary to set a register field to a precise number. This is easiest to do using the x naming. 
Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/include/asm/gpr-num.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/riscv/include/asm/gpr-num.h b/arch/riscv/include/asm/gpr-num.h index dfee2829fc7cb..efeb5edf8a3af 100644 --- a/arch/riscv/include/asm/gpr-num.h +++ b/arch/riscv/include/asm/gpr-num.h @@ -3,6 +3,11 @@ #define __ASM_GPR_NUM_H #ifdef __ASSEMBLY__ + + .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 + .equ .L__gpr_num_x\num, \num + .endr + .equ .L__gpr_num_zero, 0 .equ .L__gpr_num_ra, 1 .equ .L__gpr_num_sp, 2 @@ -39,6 +44,9 @@ #else /* __ASSEMBLY__ */ #define __DEFINE_ASM_GPR_NUMS \ +" .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n" \ +" .equ .L__gpr_num_x\\num, \\num\n" \ +" .endr\n" \ " .equ .L__gpr_num_zero, 0\n" \ " .equ .L__gpr_num_ra, 1\n" \ " .equ .L__gpr_num_sp, 2\n" \ -- GitLab From 5ac43ab2e3fe4e5d48ef313a99d0591021c3bbdd Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Sun, 2 Oct 2022 10:18:07 +0530 Subject: [PATCH 0794/2223] riscv: Introduce support for defining instructions When compiling with toolchains that haven't yet been taught about new instructions we need to encode them ourselves. Create a new file where support for instruction definitions will evolve. We initiate the file with a macro called INSN_R(), which implements the R-type instruction encoding. INSN_R() will use the assembler's .insn directive when available, which should give the assembler a chance to do some validation. When .insn is not available we fall back to manual encoding. Not only should using instruction encoding macros improve readability and maintainability of code over the alternative of inserting instructions directly (e.g. '.word 0xc0de'), but we should also gain potential for more optimized code after compilation because the compiler will have control over the input and output registers used. 
Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/Kconfig | 3 ++ arch/riscv/include/asm/insn-def.h | 90 +++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 arch/riscv/include/asm/insn-def.h diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 59d18881f35be..d6b0ffd9bf007 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -227,6 +227,9 @@ config RISCV_DMA_NONCOHERENT select ARCH_HAS_SETUP_DMA_OPS select DMA_DIRECT_REMAP +config AS_HAS_INSN + def_bool $(as-instr,.insn r 51$(comma) 0$(comma) 0$(comma) t0$(comma) t0$(comma) zero) + source "arch/riscv/Kconfig.socs" source "arch/riscv/Kconfig.erratas" diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h new file mode 100644 index 0000000000000..635612e59b25c --- /dev/null +++ b/arch/riscv/include/asm/insn-def.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ASM_INSN_DEF_H +#define __ASM_INSN_DEF_H + +#include + +#define INSN_R_FUNC7_SHIFT 25 +#define INSN_R_RS2_SHIFT 20 +#define INSN_R_RS1_SHIFT 15 +#define INSN_R_FUNC3_SHIFT 12 +#define INSN_R_RD_SHIFT 7 +#define INSN_R_OPCODE_SHIFT 0 + +#ifdef __ASSEMBLY__ + +#ifdef CONFIG_AS_HAS_INSN + + .macro insn_r, opcode, func3, func7, rd, rs1, rs2 + .insn r \opcode, \func3, \func7, \rd, \rs1, \rs2 + .endm + +#else + +#include + + .macro insn_r, opcode, func3, func7, rd, rs1, rs2 + .4byte ((\opcode << INSN_R_OPCODE_SHIFT) | \ + (\func3 << INSN_R_FUNC3_SHIFT) | \ + (\func7 << INSN_R_FUNC7_SHIFT) | \ + (.L__gpr_num_\rd << INSN_R_RD_SHIFT) | \ + (.L__gpr_num_\rs1 << INSN_R_RS1_SHIFT) | \ + (.L__gpr_num_\rs2 << INSN_R_RS2_SHIFT)) + .endm + +#endif + +#define __INSN_R(...) insn_r __VA_ARGS__ + +#else /* ! 
__ASSEMBLY__ */ + +#ifdef CONFIG_AS_HAS_INSN + +#define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \ + ".insn r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" + +#else + +#include +#include + +#define DEFINE_INSN_R \ + __DEFINE_ASM_GPR_NUMS \ +" .macro insn_r, opcode, func3, func7, rd, rs1, rs2\n" \ +" .4byte ((\\opcode << " __stringify(INSN_R_OPCODE_SHIFT) ") |" \ +" (\\func3 << " __stringify(INSN_R_FUNC3_SHIFT) ") |" \ +" (\\func7 << " __stringify(INSN_R_FUNC7_SHIFT) ") |" \ +" (.L__gpr_num_\\rd << " __stringify(INSN_R_RD_SHIFT) ") |" \ +" (.L__gpr_num_\\rs1 << " __stringify(INSN_R_RS1_SHIFT) ") |" \ +" (.L__gpr_num_\\rs2 << " __stringify(INSN_R_RS2_SHIFT) "))\n" \ +" .endm\n" + +#define UNDEFINE_INSN_R \ +" .purgem insn_r\n" + +#define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \ + DEFINE_INSN_R \ + "insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \ + UNDEFINE_INSN_R + +#endif + +#endif /* ! __ASSEMBLY__ */ + +#define INSN_R(opcode, func3, func7, rd, rs1, rs2) \ + __INSN_R(RV_##opcode, RV_##func3, RV_##func7, \ + RV_##rd, RV_##rs1, RV_##rs2) + +#define RV_OPCODE(v) __ASM_STR(v) +#define RV_FUNC3(v) __ASM_STR(v) +#define RV_FUNC7(v) __ASM_STR(v) +#define RV_RD(v) __ASM_STR(v) +#define RV_RS1(v) __ASM_STR(v) +#define RV_RS2(v) __ASM_STR(v) +#define __RV_REG(v) __ASM_STR(x ## v) +#define RV___RD(v) __RV_REG(v) +#define RV___RS1(v) __RV_REG(v) +#define RV___RS2(v) __RV_REG(v) + +#endif /* __ASM_INSN_DEF_H */ -- GitLab From bb233a11dc6b3774fd46087242d7627ecf5293ed Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Sun, 2 Oct 2022 10:18:14 +0530 Subject: [PATCH 0795/2223] riscv: KVM: Apply insn-def to hfence encodings Introduce hfence instruction encodings and apply them to KVM's use. With the self-documenting nature of the instruction encoding macros, and a spec always within arm's reach, it's safe to remove the comments, so we do that too. 
Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/include/asm/insn-def.h | 10 +++ arch/riscv/kvm/tlb.c | 129 ++++-------------------------- 2 files changed, 27 insertions(+), 112 deletions(-) diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h index 635612e59b25c..c8aca3c27433f 100644 --- a/arch/riscv/include/asm/insn-def.h +++ b/arch/riscv/include/asm/insn-def.h @@ -87,4 +87,14 @@ #define RV___RS1(v) __RV_REG(v) #define RV___RS2(v) __RV_REG(v) +#define RV_OPCODE_SYSTEM RV_OPCODE(115) + +#define HFENCE_VVMA(vaddr, asid) \ + INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17), \ + __RD(0), RS1(vaddr), RS2(asid)) + +#define HFENCE_GVMA(gaddr, vmid) \ + INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(49), \ + __RD(0), RS1(gaddr), RS2(vmid)) + #endif /* __ASM_INSN_DEF_H */ diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c index 1a76d0b1907d5..1ce3394b3acfc 100644 --- a/arch/riscv/kvm/tlb.c +++ b/arch/riscv/kvm/tlb.c @@ -12,22 +12,7 @@ #include #include #include - -/* - * Instruction encoding of hfence.gvma is: - * HFENCE.GVMA rs1, rs2 - * HFENCE.GVMA zero, rs2 - * HFENCE.GVMA rs1 - * HFENCE.GVMA - * - * rs1!=zero and rs2!=zero ==> HFENCE.GVMA rs1, rs2 - * rs1==zero and rs2!=zero ==> HFENCE.GVMA zero, rs2 - * rs1!=zero and rs2==zero ==> HFENCE.GVMA rs1 - * rs1==zero and rs2==zero ==> HFENCE.GVMA - * - * Instruction encoding of HFENCE.GVMA is: - * 0110001 rs2(5) rs1(5) 000 00000 1110011 - */ +#include void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid, gpa_t gpa, gpa_t gpsz, @@ -40,32 +25,14 @@ void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid, return; } - for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order)) { - /* - * rs1 = a0 (GPA >> 2) - * rs2 = a1 (VMID) - * HFENCE.GVMA a0, a1 - * 0110001 01011 01010 000 00000 1110011 - */ - asm volatile ("srli a0, %0, 2\n" - "add a1, %1, zero\n" - ".word 0x62b50073\n" - :: "r" (pos), "r" (vmid) - : "a0", "a1", "memory"); - } + for (pos = gpa; pos < 
(gpa + gpsz); pos += BIT(order)) + asm volatile (HFENCE_GVMA(%0, %1) + : : "r" (pos >> 2), "r" (vmid) : "memory"); } void kvm_riscv_local_hfence_gvma_vmid_all(unsigned long vmid) { - /* - * rs1 = zero - * rs2 = a0 (VMID) - * HFENCE.GVMA zero, a0 - * 0110001 01010 00000 000 00000 1110011 - */ - asm volatile ("add a0, %0, zero\n" - ".word 0x62a00073\n" - :: "r" (vmid) : "a0", "memory"); + asm volatile(HFENCE_GVMA(zero, %0) : : "r" (vmid) : "memory"); } void kvm_riscv_local_hfence_gvma_gpa(gpa_t gpa, gpa_t gpsz, @@ -78,46 +45,16 @@ void kvm_riscv_local_hfence_gvma_gpa(gpa_t gpa, gpa_t gpsz, return; } - for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order)) { - /* - * rs1 = a0 (GPA >> 2) - * rs2 = zero - * HFENCE.GVMA a0 - * 0110001 00000 01010 000 00000 1110011 - */ - asm volatile ("srli a0, %0, 2\n" - ".word 0x62050073\n" - :: "r" (pos) : "a0", "memory"); - } + for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order)) + asm volatile(HFENCE_GVMA(%0, zero) + : : "r" (pos >> 2) : "memory"); } void kvm_riscv_local_hfence_gvma_all(void) { - /* - * rs1 = zero - * rs2 = zero - * HFENCE.GVMA - * 0110001 00000 00000 000 00000 1110011 - */ - asm volatile (".word 0x62000073" ::: "memory"); + asm volatile(HFENCE_GVMA(zero, zero) : : : "memory"); } -/* - * Instruction encoding of hfence.gvma is: - * HFENCE.VVMA rs1, rs2 - * HFENCE.VVMA zero, rs2 - * HFENCE.VVMA rs1 - * HFENCE.VVMA - * - * rs1!=zero and rs2!=zero ==> HFENCE.VVMA rs1, rs2 - * rs1==zero and rs2!=zero ==> HFENCE.VVMA zero, rs2 - * rs1!=zero and rs2==zero ==> HFENCE.VVMA rs1 - * rs1==zero and rs2==zero ==> HFENCE.VVMA - * - * Instruction encoding of HFENCE.VVMA is: - * 0010001 rs2(5) rs1(5) 000 00000 1110011 - */ - void kvm_riscv_local_hfence_vvma_asid_gva(unsigned long vmid, unsigned long asid, unsigned long gva, @@ -133,19 +70,9 @@ void kvm_riscv_local_hfence_vvma_asid_gva(unsigned long vmid, hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT); - for (pos = gva; pos < (gva + gvsz); pos += BIT(order)) { - /* - * rs1 = 
a0 (GVA) - * rs2 = a1 (ASID) - * HFENCE.VVMA a0, a1 - * 0010001 01011 01010 000 00000 1110011 - */ - asm volatile ("add a0, %0, zero\n" - "add a1, %1, zero\n" - ".word 0x22b50073\n" - :: "r" (pos), "r" (asid) - : "a0", "a1", "memory"); - } + for (pos = gva; pos < (gva + gvsz); pos += BIT(order)) + asm volatile(HFENCE_VVMA(%0, %1) + : : "r" (pos), "r" (asid) : "memory"); csr_write(CSR_HGATP, hgatp); } @@ -157,15 +84,7 @@ void kvm_riscv_local_hfence_vvma_asid_all(unsigned long vmid, hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT); - /* - * rs1 = zero - * rs2 = a0 (ASID) - * HFENCE.VVMA zero, a0 - * 0010001 01010 00000 000 00000 1110011 - */ - asm volatile ("add a0, %0, zero\n" - ".word 0x22a00073\n" - :: "r" (asid) : "a0", "memory"); + asm volatile(HFENCE_VVMA(zero, %0) : : "r" (asid) : "memory"); csr_write(CSR_HGATP, hgatp); } @@ -183,17 +102,9 @@ void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid, hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT); - for (pos = gva; pos < (gva + gvsz); pos += BIT(order)) { - /* - * rs1 = a0 (GVA) - * rs2 = zero - * HFENCE.VVMA a0 - * 0010001 00000 01010 000 00000 1110011 - */ - asm volatile ("add a0, %0, zero\n" - ".word 0x22050073\n" - :: "r" (pos) : "a0", "memory"); - } + for (pos = gva; pos < (gva + gvsz); pos += BIT(order)) + asm volatile(HFENCE_VVMA(%0, zero) + : : "r" (pos) : "memory"); csr_write(CSR_HGATP, hgatp); } @@ -204,13 +115,7 @@ void kvm_riscv_local_hfence_vvma_all(unsigned long vmid) hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT); - /* - * rs1 = zero - * rs2 = zero - * HFENCE.VVMA - * 0010001 00000 00000 000 00000 1110011 - */ - asm volatile (".word 0x22000073" ::: "memory"); + asm volatile(HFENCE_VVMA(zero, zero) : : : "memory"); csr_write(CSR_HGATP, hgatp); } -- GitLab From 26b73f14933e9c0beb88bb2fcee69d93572558ef Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Sun, 2 Oct 2022 10:18:20 +0530 Subject: [PATCH 0796/2223] riscv: KVM: Apply insn-def to hlv encodings Introduce hlv instruction 
encodings and apply them to KVM's use. We're careful not to introduce hlv.d to 32-bit builds. Indeed, we ensure the build fails if someone tries to use it. Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/include/asm/insn-def.h | 17 ++++++++++++++ arch/riscv/kvm/vcpu_exit.c | 39 +++++++------------------------ 2 files changed, 25 insertions(+), 31 deletions(-) diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h index c8aca3c27433f..af7b0b55815cf 100644 --- a/arch/riscv/include/asm/insn-def.h +++ b/arch/riscv/include/asm/insn-def.h @@ -97,4 +97,21 @@ INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(49), \ __RD(0), RS1(gaddr), RS2(vmid)) +#define HLVX_HU(dest, addr) \ + INSN_R(OPCODE_SYSTEM, FUNC3(4), FUNC7(50), \ + RD(dest), RS1(addr), __RS2(3)) + +#define HLV_W(dest, addr) \ + INSN_R(OPCODE_SYSTEM, FUNC3(4), FUNC7(52), \ + RD(dest), RS1(addr), __RS2(0)) + +#ifdef CONFIG_64BIT +#define HLV_D(dest, addr) \ + INSN_R(OPCODE_SYSTEM, FUNC3(4), FUNC7(54), \ + RD(dest), RS1(addr), __RS2(0)) +#else +#define HLV_D(dest, addr) \ + __ASM_STR(.error "hlv.d requires 64-bit support") +#endif + #endif /* __ASM_INSN_DEF_H */ diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c index d5c36386878a3..c9f741ab26f5b 100644 --- a/arch/riscv/kvm/vcpu_exit.c +++ b/arch/riscv/kvm/vcpu_exit.c @@ -8,6 +8,7 @@ #include #include +#include static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm_cpu_trap *trap) @@ -62,11 +63,7 @@ unsigned long kvm_riscv_vcpu_unpriv_read(struct kvm_vcpu *vcpu, { register unsigned long taddr asm("a0") = (unsigned long)trap; register unsigned long ttmp asm("a1"); - register unsigned long val asm("t0"); - register unsigned long tmp asm("t1"); - register unsigned long addr asm("t2") = guest_addr; - unsigned long flags; - unsigned long old_stvec, old_hstatus; + unsigned long flags, val, tmp, old_stvec, old_hstatus; local_irq_save(flags); @@ -82,29 +79,19 @@ 
unsigned long kvm_riscv_vcpu_unpriv_read(struct kvm_vcpu *vcpu, ".option push\n" ".option norvc\n" "add %[ttmp], %[taddr], 0\n" - /* - * HLVX.HU %[val], (%[addr]) - * HLVX.HU t0, (t2) - * 0110010 00011 00111 100 00101 1110011 - */ - ".word 0x6433c2f3\n" + HLVX_HU(%[val], %[addr]) "andi %[tmp], %[val], 3\n" "addi %[tmp], %[tmp], -3\n" "bne %[tmp], zero, 2f\n" "addi %[addr], %[addr], 2\n" - /* - * HLVX.HU %[tmp], (%[addr]) - * HLVX.HU t1, (t2) - * 0110010 00011 00111 100 00110 1110011 - */ - ".word 0x6433c373\n" + HLVX_HU(%[tmp], %[addr]) "sll %[tmp], %[tmp], 16\n" "add %[val], %[val], %[tmp]\n" "2:\n" ".option pop" : [val] "=&r" (val), [tmp] "=&r" (tmp), [taddr] "+&r" (taddr), [ttmp] "+&r" (ttmp), - [addr] "+&r" (addr) : : "memory"); + [addr] "+&r" (guest_addr) : : "memory"); if (trap->scause == EXC_LOAD_PAGE_FAULT) trap->scause = EXC_INST_PAGE_FAULT; @@ -121,24 +108,14 @@ unsigned long kvm_riscv_vcpu_unpriv_read(struct kvm_vcpu *vcpu, ".option norvc\n" "add %[ttmp], %[taddr], 0\n" #ifdef CONFIG_64BIT - /* - * HLV.D %[val], (%[addr]) - * HLV.D t0, (t2) - * 0110110 00000 00111 100 00101 1110011 - */ - ".word 0x6c03c2f3\n" + HLV_D(%[val], %[addr]) #else - /* - * HLV.W %[val], (%[addr]) - * HLV.W t0, (t2) - * 0110100 00000 00111 100 00101 1110011 - */ - ".word 0x6803c2f3\n" + HLV_W(%[val], %[addr]) #endif ".option pop" : [val] "=&r" (val), [taddr] "+&r" (taddr), [ttmp] "+&r" (ttmp) - : [addr] "r" (addr) : "memory"); + : [addr] "r" (guest_addr) : "memory"); } csr_write(CSR_STVEC, old_stvec); -- GitLab From d837f19195e77f7c89b6645c8311f5ea6ff67905 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Sun, 2 Oct 2022 10:18:25 +0530 Subject: [PATCH 0797/2223] RISC-V: KVM: Change the SBI specification version to v1.0 The SBI v1.0 specification is functionally the same as SBI v0.3 specification except that SBI v1.0 specification went through the full RISC-V International ratification process. Let us change the SBI specification version to v1.0.
Signed-off-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_vcpu_sbi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h index 26a446a34057b..d4e3e600beefb 100644 --- a/arch/riscv/include/asm/kvm_vcpu_sbi.h +++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h @@ -11,8 +11,8 @@ #define KVM_SBI_IMPID 3 -#define KVM_SBI_VERSION_MAJOR 0 -#define KVM_SBI_VERSION_MINOR 3 +#define KVM_SBI_VERSION_MAJOR 1 +#define KVM_SBI_VERSION_MINOR 0 struct kvm_vcpu_sbi_extension { unsigned long extid_start; -- GitLab From 122979aa26cd4a314aae889a0496eb829d50bc9e Mon Sep 17 00:00:00 2001 From: Mayuresh Chitale Date: Sun, 2 Oct 2022 10:18:31 +0530 Subject: [PATCH 0798/2223] RISC-V: Probe Svinval extension from ISA string Just like other ISA extensions, we allow callers/users to detect the presence of Svinval extension from ISA string. Signed-off-by: Mayuresh Chitale Signed-off-by: Anup Patel Reviewed-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/include/asm/hwcap.h | 4 ++++ arch/riscv/kernel/cpu.c | 1 + arch/riscv/kernel/cpufeature.c | 1 + 3 files changed, 6 insertions(+) diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index 6f59ec64175ef..b225252900730 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -58,6 +58,7 @@ enum riscv_isa_ext_id { RISCV_ISA_EXT_ZICBOM, RISCV_ISA_EXT_ZIHINTPAUSE, RISCV_ISA_EXT_SSTC, + RISCV_ISA_EXT_SVINVAL, RISCV_ISA_EXT_ID_MAX = RISCV_ISA_EXT_MAX, }; @@ -69,6 +70,7 @@ enum riscv_isa_ext_id { enum riscv_isa_ext_key { RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */ RISCV_ISA_EXT_KEY_ZIHINTPAUSE, + RISCV_ISA_EXT_KEY_SVINVAL, RISCV_ISA_EXT_KEY_MAX, }; @@ -90,6 +92,8 @@ static __always_inline int riscv_isa_ext2key(int num) return RISCV_ISA_EXT_KEY_FPU; case RISCV_ISA_EXT_ZIHINTPAUSE: return RISCV_ISA_EXT_KEY_ZIHINTPAUSE; + case RISCV_ISA_EXT_SVINVAL: + return
RISCV_ISA_EXT_KEY_SVINVAL; default: return -EINVAL; } diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index 0be8a2403212d..7d1cd653ca027 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -96,6 +96,7 @@ static struct riscv_isa_ext_data isa_ext_arr[] = { __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM), __RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE), __RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC), + __RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL), __RISCV_ISA_EXT_DATA("", RISCV_ISA_EXT_MAX), }; diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 3b5583db9d80e..9774f1271f93e 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -204,6 +204,7 @@ void __init riscv_fill_hwcap(void) SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM); SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE); SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC); + SET_ISA_EXT_MAP("svinval", RISCV_ISA_EXT_SVINVAL); } #undef SET_ISA_EXT_MAP } -- GitLab From 5ff112484f2e63c5cac9f865181ca7ce467d0f89 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Sun, 2 Oct 2022 10:18:37 +0530 Subject: [PATCH 0799/2223] RISC-V: KVM: Use Svinval for local TLB maintenance when available We should prefer HINVAL.GVMA and HINVAL.VVMA instruction for local TLB maintenance when underlying host supports Svinval extension. 
Signed-off-by: Anup Patel Reviewed-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/include/asm/insn-def.h | 20 +++++++++++ arch/riscv/kvm/tlb.c | 60 ++++++++++++++++++++++++------- 2 files changed, 68 insertions(+), 12 deletions(-) diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h index af7b0b55815cf..16044affa57cc 100644 --- a/arch/riscv/include/asm/insn-def.h +++ b/arch/riscv/include/asm/insn-def.h @@ -114,4 +114,24 @@ __ASM_STR(.error "hlv.d requires 64-bit support") #endif +#define SINVAL_VMA(vaddr, asid) \ + INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(11), \ + __RD(0), RS1(vaddr), RS2(asid)) + +#define SFENCE_W_INVAL() \ + INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(12), \ + __RD(0), __RS1(0), __RS2(0)) + +#define SFENCE_INVAL_IR() \ + INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(12), \ + __RD(0), __RS1(0), __RS2(1)) + +#define HINVAL_VVMA(vaddr, asid) \ + INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(19), \ + __RD(0), RS1(vaddr), RS2(asid)) + +#define HINVAL_GVMA(gaddr, vmid) \ + INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(51), \ + __RD(0), RS1(gaddr), RS2(vmid)) + #endif /* __ASM_INSN_DEF_H */ diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c index 1ce3394b3acfc..309d79b3e5cd5 100644 --- a/arch/riscv/kvm/tlb.c +++ b/arch/riscv/kvm/tlb.c @@ -12,8 +12,12 @@ #include #include #include +#include #include +#define has_svinval() \ + static_branch_unlikely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_SVINVAL]) + void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid, gpa_t gpa, gpa_t gpsz, unsigned long order) @@ -25,9 +29,17 @@ void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid, return; } - for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order)) - asm volatile (HFENCE_GVMA(%0, %1) - : : "r" (pos >> 2), "r" (vmid) : "memory"); + if (has_svinval()) { + asm volatile (SFENCE_W_INVAL() ::: "memory"); + for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order)) + asm volatile (HINVAL_GVMA(%0, %1) + : : "r" (pos >> 2), "r" (vmid) : "memory"); + asm 
volatile (SFENCE_INVAL_IR() ::: "memory"); + } else { + for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order)) + asm volatile (HFENCE_GVMA(%0, %1) + : : "r" (pos >> 2), "r" (vmid) : "memory"); + } } void kvm_riscv_local_hfence_gvma_vmid_all(unsigned long vmid) @@ -45,9 +57,17 @@ void kvm_riscv_local_hfence_gvma_gpa(gpa_t gpa, gpa_t gpsz, return; } - for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order)) - asm volatile(HFENCE_GVMA(%0, zero) - : : "r" (pos >> 2) : "memory"); + if (has_svinval()) { + asm volatile (SFENCE_W_INVAL() ::: "memory"); + for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order)) + asm volatile(HINVAL_GVMA(%0, zero) + : : "r" (pos >> 2) : "memory"); + asm volatile (SFENCE_INVAL_IR() ::: "memory"); + } else { + for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order)) + asm volatile(HFENCE_GVMA(%0, zero) + : : "r" (pos >> 2) : "memory"); + } } void kvm_riscv_local_hfence_gvma_all(void) @@ -70,9 +90,17 @@ void kvm_riscv_local_hfence_vvma_asid_gva(unsigned long vmid, hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT); - for (pos = gva; pos < (gva + gvsz); pos += BIT(order)) - asm volatile(HFENCE_VVMA(%0, %1) - : : "r" (pos), "r" (asid) : "memory"); + if (has_svinval()) { + asm volatile (SFENCE_W_INVAL() ::: "memory"); + for (pos = gva; pos < (gva + gvsz); pos += BIT(order)) + asm volatile(HINVAL_VVMA(%0, %1) + : : "r" (pos), "r" (asid) : "memory"); + asm volatile (SFENCE_INVAL_IR() ::: "memory"); + } else { + for (pos = gva; pos < (gva + gvsz); pos += BIT(order)) + asm volatile(HFENCE_VVMA(%0, %1) + : : "r" (pos), "r" (asid) : "memory"); + } csr_write(CSR_HGATP, hgatp); } @@ -102,9 +130,17 @@ void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid, hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT); - for (pos = gva; pos < (gva + gvsz); pos += BIT(order)) - asm volatile(HFENCE_VVMA(%0, zero) - : : "r" (pos) : "memory"); + if (has_svinval()) { + asm volatile (SFENCE_W_INVAL() ::: "memory"); + for (pos = gva; pos < (gva + gvsz); pos += BIT(order)) 
+ asm volatile(HINVAL_VVMA(%0, zero) + : : "r" (pos) : "memory"); + asm volatile (SFENCE_INVAL_IR() ::: "memory"); + } else { + for (pos = gva; pos < (gva + gvsz); pos += BIT(order)) + asm volatile(HFENCE_VVMA(%0, zero) + : : "r" (pos) : "memory"); + } csr_write(CSR_HGATP, hgatp); } -- GitLab From bad6ea07c876a67c4d8f46b0c565ab500150720f Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Sun, 2 Oct 2022 10:18:42 +0530 Subject: [PATCH 0800/2223] RISC-V: KVM: Allow Guest use Svinval extension We should advertise Svinval ISA extension to KVM user-space whenever host supports it. This will allow KVM user-space (i.e. QEMU or KVMTOOL) to pass on this information to Guest via ISA string. Signed-off-by: Anup Patel Reviewed-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/include/uapi/asm/kvm.h | 1 + arch/riscv/kvm/vcpu.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index 7351417afd62e..b6770ee088721 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -98,6 +98,7 @@ enum KVM_RISCV_ISA_EXT_ID { KVM_RISCV_ISA_EXT_M, KVM_RISCV_ISA_EXT_SVPBMT, KVM_RISCV_ISA_EXT_SSTC, + KVM_RISCV_ISA_EXT_SVINVAL, KVM_RISCV_ISA_EXT_MAX, }; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index d0f08d5b42829..901bb5c0cb503 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -53,6 +53,7 @@ static const unsigned long kvm_isa_ext_arr[] = { RISCV_ISA_EXT_m, RISCV_ISA_EXT_SVPBMT, RISCV_ISA_EXT_SSTC, + RISCV_ISA_EXT_SVINVAL, }; static unsigned long kvm_riscv_vcpu_base2isa_ext(unsigned long base_ext) @@ -87,6 +88,7 @@ static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext) case KVM_RISCV_ISA_EXT_I: case KVM_RISCV_ISA_EXT_M: case KVM_RISCV_ISA_EXT_SSTC: + case KVM_RISCV_ISA_EXT_SVINVAL: return false; default: break; -- GitLab From 0bba48978f6b63aee0fa4ee3a8097ec94e75f7f2 Mon Sep 17 00:00:00 2001 From: Mayuresh Chitale Date: Sun, 2 Oct 2022 10:18:48 
+0530 Subject: [PATCH 0801/2223] RISC-V: KVM: Allow Guest use Zihintpause extension We should advertise Zihintpause ISA extension to KVM user-space whenever host supports it. This will allow KVM user-space (i.e. QEMU or KVMTOOL) to pass on this information to Guest via ISA string. Signed-off-by: Mayuresh Chitale Reviewed-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/include/uapi/asm/kvm.h | 1 + arch/riscv/kvm/vcpu.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index b6770ee088721..9085b90cf3247 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -99,6 +99,7 @@ enum KVM_RISCV_ISA_EXT_ID { KVM_RISCV_ISA_EXT_SVPBMT, KVM_RISCV_ISA_EXT_SSTC, KVM_RISCV_ISA_EXT_SVINVAL, + KVM_RISCV_ISA_EXT_ZIHINTPAUSE, KVM_RISCV_ISA_EXT_MAX, }; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 901bb5c0cb503..0de0dd22e734c 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -54,6 +54,7 @@ static const unsigned long kvm_isa_ext_arr[] = { RISCV_ISA_EXT_SVPBMT, RISCV_ISA_EXT_SSTC, RISCV_ISA_EXT_SVINVAL, + RISCV_ISA_EXT_ZIHINTPAUSE, }; static unsigned long kvm_riscv_vcpu_base2isa_ext(unsigned long base_ext) @@ -89,6 +90,7 @@ static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext) case KVM_RISCV_ISA_EXT_M: case KVM_RISCV_ISA_EXT_SSTC: case KVM_RISCV_ISA_EXT_SVINVAL: + case KVM_RISCV_ISA_EXT_ZIHINTPAUSE: return false; default: break; -- GitLab From 1b5cbb8733f924c99bc48a8e4c2a95449f0f514d Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Sun, 2 Oct 2022 10:18:54 +0530 Subject: [PATCH 0802/2223] RISC-V: KVM: Make ISA ext mappings explicit While adding new extensions at the bottom of the array isn't hard to do, it's a pain to review in order to ensure we're not missing any. Also, resolving merge conflicts for multiple new ISA extensions can be error-prone. To make adding new mappings foolproof, explicitly assign the array elements. 
And, now that the order doesn't matter, we can alphabetize the extensions, so we do that too. Signed-off-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 0de0dd22e734c..61fe1604e8ea5 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -42,19 +42,22 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { #define KVM_RISCV_BASE_ISA_MASK GENMASK(25, 0) +#define KVM_ISA_EXT_ARR(ext) [KVM_RISCV_ISA_EXT_##ext] = RISCV_ISA_EXT_##ext + /* Mapping between KVM ISA Extension ID & Host ISA extension ID */ static const unsigned long kvm_isa_ext_arr[] = { - RISCV_ISA_EXT_a, - RISCV_ISA_EXT_c, - RISCV_ISA_EXT_d, - RISCV_ISA_EXT_f, - RISCV_ISA_EXT_h, - RISCV_ISA_EXT_i, - RISCV_ISA_EXT_m, - RISCV_ISA_EXT_SVPBMT, - RISCV_ISA_EXT_SSTC, - RISCV_ISA_EXT_SVINVAL, - RISCV_ISA_EXT_ZIHINTPAUSE, + [KVM_RISCV_ISA_EXT_A] = RISCV_ISA_EXT_a, + [KVM_RISCV_ISA_EXT_C] = RISCV_ISA_EXT_c, + [KVM_RISCV_ISA_EXT_D] = RISCV_ISA_EXT_d, + [KVM_RISCV_ISA_EXT_F] = RISCV_ISA_EXT_f, + [KVM_RISCV_ISA_EXT_H] = RISCV_ISA_EXT_h, + [KVM_RISCV_ISA_EXT_I] = RISCV_ISA_EXT_i, + [KVM_RISCV_ISA_EXT_M] = RISCV_ISA_EXT_m, + + KVM_ISA_EXT_ARR(SSTC), + KVM_ISA_EXT_ARR(SVINVAL), + KVM_ISA_EXT_ARR(SVPBMT), + KVM_ISA_EXT_ARR(ZIHINTPAUSE), }; static unsigned long kvm_riscv_vcpu_base2isa_ext(unsigned long base_ext) -- GitLab From afd5dde9a186b8fc5742fff707f184760c4af1a9 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Sun, 2 Oct 2022 10:18:59 +0530 Subject: [PATCH 0803/2223] RISC-V: KVM: Provide UAPI for Zicbom block size We're about to allow guests to use the Zicbom extension. KVM userspace needs to know the cache block size in order to properly advertise it to the guest. Provide a virtual config register for userspace to get it with the GET_ONE_REG API, but setting it cannot be supported, so disallow SET_ONE_REG. 
Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/include/uapi/asm/kvm.h | 1 + arch/riscv/kvm/vcpu.c | 8 ++++++++ arch/riscv/mm/dma-noncoherent.c | 2 ++ 3 files changed, 11 insertions(+) diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index 9085b90cf3247..3d77713005672 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -48,6 +48,7 @@ struct kvm_sregs { /* CONFIG registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ struct kvm_riscv_config { unsigned long isa; + unsigned long zicbom_block_size; }; /* CORE registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 61fe1604e8ea5..b0a0ce6d16ef5 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -18,6 +18,7 @@ #include #include #include +#include #include const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { @@ -261,6 +262,11 @@ static int kvm_riscv_vcpu_get_reg_config(struct kvm_vcpu *vcpu, case KVM_REG_RISCV_CONFIG_REG(isa): reg_val = vcpu->arch.isa[0] & KVM_RISCV_BASE_ISA_MASK; break; + case KVM_REG_RISCV_CONFIG_REG(zicbom_block_size): + if (!riscv_isa_extension_available(vcpu->arch.isa, ZICBOM)) + return -EINVAL; + reg_val = riscv_cbom_block_size; + break; default: return -EINVAL; } @@ -318,6 +324,8 @@ static int kvm_riscv_vcpu_set_reg_config(struct kvm_vcpu *vcpu, return -EOPNOTSUPP; } break; + case KVM_REG_RISCV_CONFIG_REG(zicbom_block_size): + return -EOPNOTSUPP; default: return -EINVAL; } diff --git a/arch/riscv/mm/dma-noncoherent.c b/arch/riscv/mm/dma-noncoherent.c index e3f9bdf47c5ff..b0add983530ab 100644 --- a/arch/riscv/mm/dma-noncoherent.c +++ b/arch/riscv/mm/dma-noncoherent.c @@ -13,6 +13,8 @@ #include unsigned int riscv_cbom_block_size; +EXPORT_SYMBOL_GPL(riscv_cbom_block_size); + static bool noncoherent_supported; void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, -- GitLab From 
56852c6211971798dfbe4098c8a8528b59234de2 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Sun, 2 Oct 2022 10:19:05 +0530 Subject: [PATCH 0804/2223] RISC-V: KVM: Expose Zicbom to the guest Guests may use the cbo.inval,clean,flush instructions when the CPU has the Zicbom extension and the hypervisor sets henvcfg.CBIE (for cbo.inval) and henvcfg.CBCFE (for cbo.clean,flush). Add Zicbom support for KVM guests which may be enabled and disabled from KVM userspace using the ISA extension ONE_REG API. Also opportunistically switch the other isa extension checks in kvm_riscv_vcpu_update_config() to riscv_isa_extension_available(). Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/include/uapi/asm/kvm.h | 1 + arch/riscv/kvm/vcpu.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index 3d77713005672..8985ff234c01c 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -101,6 +101,7 @@ enum KVM_RISCV_ISA_EXT_ID { KVM_RISCV_ISA_EXT_SSTC, KVM_RISCV_ISA_EXT_SVINVAL, KVM_RISCV_ISA_EXT_ZIHINTPAUSE, + KVM_RISCV_ISA_EXT_ZICBOM, KVM_RISCV_ISA_EXT_MAX, }; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index b0a0ce6d16ef5..f55d15a8a410f 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -59,6 +59,7 @@ static const unsigned long kvm_isa_ext_arr[] = { KVM_ISA_EXT_ARR(SVINVAL), KVM_ISA_EXT_ARR(SVPBMT), KVM_ISA_EXT_ARR(ZIHINTPAUSE), + KVM_ISA_EXT_ARR(ZICBOM), }; static unsigned long kvm_riscv_vcpu_base2isa_ext(unsigned long base_ext) @@ -799,11 +800,15 @@ static void kvm_riscv_vcpu_update_config(const unsigned long *isa) { u64 henvcfg = 0; - if (__riscv_isa_extension_available(isa, RISCV_ISA_EXT_SVPBMT)) + if (riscv_isa_extension_available(isa, SVPBMT)) henvcfg |= ENVCFG_PBMTE; - if (__riscv_isa_extension_available(isa, RISCV_ISA_EXT_SSTC)) + if 
(riscv_isa_extension_available(isa, SSTC)) henvcfg |= ENVCFG_STCE; + + if (riscv_isa_extension_available(isa, ZICBOM)) + henvcfg |= (ENVCFG_CBIE | ENVCFG_CBCFE); + csr_write(CSR_HENVCFG, henvcfg); #ifdef CONFIG_32BIT csr_write(CSR_HENVCFGH, henvcfg >> 32); -- GitLab From f493cdc92d9b9e9a0db0a9049609457e43a56066 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Sun, 2 Oct 2022 10:19:11 +0530 Subject: [PATCH 0805/2223] RISC-V: KVM: add __init annotation to riscv_kvm_init() The riscv_kvm_init() is a module_init entry so let us add __init annotation to it. Signed-off-by: Xiu Jianfeng Signed-off-by: Anup Patel --- arch/riscv/kvm/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 1549205fe5feb..df2d8716851f2 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -122,7 +122,7 @@ void kvm_arch_exit(void) { } -static int riscv_kvm_init(void) +static int __init riscv_kvm_init(void) { return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); } -- GitLab From 54ce3f7ff3395f12ad142d46b628606ab1e926ef Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Sun, 2 Oct 2022 10:19:16 +0530 Subject: [PATCH 0806/2223] RISC-V: KVM: Record number of signal exits as a vCPU stat Record a statistic indicating the number of times a vCPU has exited due to a pending signal. Signed-off-by: Jisheng Zhang Reviewed-by: Guo Ren Reviewed-by: Andrew Jones Signed-off-by: Anup Patel exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; } /* -- GitLab From 9c00fbdd93a22a6657378292f2eb29e9754cde7f Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Sun, 2 Oct 2022 10:19:25 +0530 Subject: [PATCH 0807/2223] RISC-V: KVM: Use generic guest entry infrastructure Use generic guest entry infrastructure to properly handle TIF_NOTIFY_RESUME. 
Signed-off-by: Jisheng Zhang Reviewed-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/kvm/Kconfig | 1 + arch/riscv/kvm/vcpu.c | 18 ++++++------------ 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index f5a342fa1b1d2..f36a737d5f96d 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -24,6 +24,7 @@ config KVM select PREEMPT_NOTIFIERS select KVM_MMIO select KVM_GENERIC_DIRTYLOG_READ_PROTECT + select KVM_XFER_TO_GUEST_WORK select HAVE_KVM_VCPU_ASYNC_IOCTL select HAVE_KVM_EVENTFD select SRCU diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 5414bf56bce55..a032c4f0d6006 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -979,7 +980,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) run->exit_reason = KVM_EXIT_UNKNOWN; while (ret > 0) { /* Check conditions before entering the guest */ - cond_resched(); + ret = xfer_to_guest_mode_handle_work(vcpu); + if (!ret) + ret = 1; kvm_riscv_gstage_vmid_update(vcpu); @@ -987,16 +990,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) local_irq_disable(); - /* - * Exit if we have a signal pending so that we can deliver - * the signal to user space. - */ - if (signal_pending(current)) { - ret = -EINTR; - run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - } - /* * Ensure we set mode to IN_GUEST_MODE after we disable * interrupts and before the final VCPU requests check. 
@@ -1019,7 +1012,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (ret <= 0 || kvm_riscv_gstage_vmid_ver_changed(&vcpu->kvm->arch.vmid) || - kvm_request_pending(vcpu)) { + kvm_request_pending(vcpu) || + xfer_to_guest_mode_work_pending()) { vcpu->mode = OUTSIDE_GUEST_MODE; local_irq_enable(); kvm_vcpu_srcu_read_lock(vcpu); -- GitLab From b60ca69715fcc39a5f4bdd56ca2ea691b7358455 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Sun, 2 Oct 2022 10:19:31 +0530 Subject: [PATCH 0808/2223] riscv: select HAVE_POSIX_CPU_TIMERS_TASK_WORK Move POSIX CPU timer expiry and signal delivery into task context to allow PREEMPT_RT setups to coexist with KVM. Signed-off-by: Jisheng Zhang Reviewed-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index d6b0ffd9bf007..74082e2d7ce8d 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -103,6 +103,7 @@ config RISCV select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_STACKPROTECTOR -- GitLab From 434e5f93ed16f01936bfc492798cf610be60fbe9 Mon Sep 17 00:00:00 2001 From: Nobuhiro Iwamatsu Date: Wed, 25 May 2022 09:46:05 +0900 Subject: [PATCH 0809/2223] dt-bindings: watchdog: toshiba,visconti-wdt: Update the common clock properties The clock for this driver has switched to the common clock controller driver. Therefore, update the common clock properties for the watchdog in the binding document. This also makes the example match the actual dts.
Signed-off-by: Nobuhiro Iwamatsu Acked-by: Rob Herring Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220525004605.2128727-1-nobuhiro1.iwamatsu@toshiba.co.jp Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../bindings/watchdog/toshiba,visconti-wdt.yaml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/Documentation/devicetree/bindings/watchdog/toshiba,visconti-wdt.yaml b/Documentation/devicetree/bindings/watchdog/toshiba,visconti-wdt.yaml index 690e19ce4b878..eba083822d1fb 100644 --- a/Documentation/devicetree/bindings/watchdog/toshiba,visconti-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/toshiba,visconti-wdt.yaml @@ -35,20 +35,16 @@ additionalProperties: false examples: - | + #include + soc { #address-cells = <2>; #size-cells = <2>; - wdt_clk: wdt-clk { - compatible = "fixed-clock"; - clock-frequency = <150000000>; - #clock-cells = <0>; - }; - - watchdog@28330000 { + wdt: watchdog@28330000 { compatible = "toshiba,visconti-wdt"; reg = <0 0x28330000 0 0x1000>; - clocks = <&wdt_clk>; timeout-sec = <20>; + clocks = <&pismu TMPV770X_CLK_WDTCLK>; }; }; -- GitLab From 4f719022a753bb15720c9ddeb0387a93caa372ce Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 4 Sep 2022 23:31:02 -0700 Subject: [PATCH 0810/2223] watchdog: bd9576_wdt: switch to using devm_fwnode_gpiod_get() I would like to stop exporting OF-specific devm_gpiod_get_from_of_node() so that gpiolib can be cleaned a bit, so let's switch to the generic fwnode property API. While at it, switch the rest of the calls to read properties in bd9576_wdt_probe() to the generic device property API as well. 
Signed-off-by: Dmitry Torokhov Reviewed-by: Guenter Roeck Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20220903-gpiod_get_from_of_node-remove-v1-10-b29adfb27a6c@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/bd9576_wdt.c | 51 +++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/drivers/watchdog/bd9576_wdt.c b/drivers/watchdog/bd9576_wdt.c index 0b6999f3b6e83..4a20e07fbb699 100644 --- a/drivers/watchdog/bd9576_wdt.c +++ b/drivers/watchdog/bd9576_wdt.c @@ -9,8 +9,8 @@ #include #include #include -#include #include +#include #include #include @@ -202,10 +202,10 @@ static int bd957x_set_wdt_mode(struct bd9576_wdt_priv *priv, int hw_margin, static int bd9576_wdt_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct device_node *np = dev->parent->of_node; struct bd9576_wdt_priv *priv; u32 hw_margin[2]; u32 hw_margin_max = BD957X_WDT_DEFAULT_MARGIN, hw_margin_min = 0; + int count; int ret; priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); @@ -221,40 +221,51 @@ static int bd9576_wdt_probe(struct platform_device *pdev) return -ENODEV; } - priv->gpiod_en = devm_gpiod_get_from_of_node(dev, dev->parent->of_node, - "rohm,watchdog-enable-gpios", - 0, GPIOD_OUT_LOW, - "watchdog-enable"); + priv->gpiod_en = devm_fwnode_gpiod_get(dev, dev_fwnode(dev->parent), + "rohm,watchdog-enable", + GPIOD_OUT_LOW, + "watchdog-enable"); if (IS_ERR(priv->gpiod_en)) return dev_err_probe(dev, PTR_ERR(priv->gpiod_en), "getting watchdog-enable GPIO failed\n"); - priv->gpiod_ping = devm_gpiod_get_from_of_node(dev, dev->parent->of_node, - "rohm,watchdog-ping-gpios", - 0, GPIOD_OUT_LOW, - "watchdog-ping"); + priv->gpiod_ping = devm_fwnode_gpiod_get(dev, dev_fwnode(dev->parent), + "rohm,watchdog-ping", + GPIOD_OUT_LOW, + "watchdog-ping"); if (IS_ERR(priv->gpiod_ping)) return dev_err_probe(dev, PTR_ERR(priv->gpiod_ping), "getting watchdog-ping GPIO failed\n"); - ret = 
of_property_read_variable_u32_array(np, "rohm,hw-timeout-ms", - &hw_margin[0], 1, 2); - if (ret < 0 && ret != -EINVAL) - return ret; + count = device_property_count_u32(dev->parent, "rohm,hw-timeout-ms"); + if (count < 0 && count != -EINVAL) + return count; + + if (count > 0) { + if (count > ARRAY_SIZE(hw_margin)) + return -EINVAL; - if (ret == 1) - hw_margin_max = hw_margin[0]; + ret = device_property_read_u32_array(dev->parent, + "rohm,hw-timeout-ms", + hw_margin, count); + if (ret < 0) + return ret; - if (ret == 2) { - hw_margin_max = hw_margin[1]; - hw_margin_min = hw_margin[0]; + if (count == 1) + hw_margin_max = hw_margin[0]; + + if (count == 2) { + hw_margin_max = hw_margin[1]; + hw_margin_min = hw_margin[0]; + } } ret = bd957x_set_wdt_mode(priv, hw_margin_max, hw_margin_min); if (ret) return ret; - priv->always_running = of_property_read_bool(np, "always-running"); + priv->always_running = device_property_read_bool(dev->parent, + "always-running"); watchdog_set_drvdata(&priv->wdd, priv); -- GitLab From 926e099267950f3b4442eb48dffc5cc3a870ad34 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 23 Aug 2022 15:47:13 +0200 Subject: [PATCH 0811/2223] watchdog: wdat_wdt: Set the min and max timeout values properly The wdat_wdt driver is misusing the min_hw_heartbeat_ms field. This field should only be used when the hardware watchdog device should not be pinged more frequently than a specific period. The ACPI WDAT "Minimum Count" field, on the other hand, specifies the minimum timeout value that can be set. This corresponds to the min_timeout field in Linux's watchdog infrastructure. Setting min_hw_heartbeat_ms instead can cause pings to the hardware to be delayed when there is no reason for that, eventually leading to unexpected firing of the watchdog timer (and thus unexpected reboot). 
Since commit 6d72c7ac9fbe ("watchdog: wdat_wdt: Using the existing function to check parameter timeout"), min_timeout is being set too, but to the arbitrary value of 1 second, which doesn't make sense and allows setting timeout values lower than the ACPI WDAT "Minimum Count" field. I'm also changing max_hw_heartbeat_ms to max_timeout for symmetry, although the use of this one isn't fundamentally wrong, but there is also no reason to enable the software-driven ping mechanism for the wdat_wdt driver. Signed-off-by: Jean Delvare Fixes: 058dfc767008 ("ACPI / watchdog: Add support for WDAT hardware watchdog") Fixes: 6d72c7ac9fbe ("watchdog: wdat_wdt: Using the existing function to check parameter timeout") Reviewed-by: Mika Westerberg Reviewed-by: Guenter Roeck Cc: Wim Van Sebroeck Cc: Rafael J. Wysocki Cc: Liu Xinpeng Link: https://lore.kernel.org/r/20220823154713.023ee771@endymion.delvare Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/wdat_wdt.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/watchdog/wdat_wdt.c b/drivers/watchdog/wdat_wdt.c index aeadaa07c891d..ce7a4a9e4b03c 100644 --- a/drivers/watchdog/wdat_wdt.c +++ b/drivers/watchdog/wdat_wdt.c @@ -342,9 +342,8 @@ static int wdat_wdt_probe(struct platform_device *pdev) return -EINVAL; wdat->period = tbl->timer_period; - wdat->wdd.min_hw_heartbeat_ms = wdat->period * tbl->min_count; - wdat->wdd.max_hw_heartbeat_ms = wdat->period * tbl->max_count; - wdat->wdd.min_timeout = 1; + wdat->wdd.min_timeout = DIV_ROUND_UP(wdat->period * tbl->min_count, 1000); + wdat->wdd.max_timeout = wdat->period * tbl->max_count / 1000; wdat->stopped_in_sleep = tbl->flags & ACPI_WDAT_STOPPED; wdat->wdd.info = &wdat_wdt_info; wdat->wdd.ops = &wdat_wdt_ops; -- GitLab From ed835d8171fc884c7750cdd54128df16d4571e3a Mon Sep 17 00:00:00 2001 From: Jerry Hoemann Date: Sat, 20 Aug 2022 14:28:20 -0600 Subject: [PATCH 0812/2223] watchdog/hpwdt: Include nmi.h only if
CONFIG_HPWDT_NMI_DECODING Fixes: d48b0e173715 ("x86, nmi, drivers: Fix nmi splitup build bug") Arm64 does not support NMI and has no <asm/nmi.h>. Include <asm/nmi.h> only if CONFIG_HPWDT_NMI_DECODING is defined to avoid build failure on non-existent header file on Arm64. Signed-off-by: Jerry Hoemann Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220820202821.1263837-2-jerry.hoemann@hpe.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/hpwdt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index a5006a58e0dbb..f79f932bca148 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -20,7 +20,9 @@ #include #include #include +#ifdef CONFIG_HPWDT_NMI_DECODING #include +#endif #include #define HPWDT_VERSION "2.0.4" -- GitLab From 891862d5ba11da739ac796221ff64e4ccf5a275f Mon Sep 17 00:00:00 2001 From: Jerry Hoemann Date: Sat, 20 Aug 2022 14:28:21 -0600 Subject: [PATCH 0813/2223] watchdog/hpwdt: Enable HP_WATCHDOG for ARM64 systems. Enable HP_WATCHDOG for ARM64 systems. HPWDT_NMI_DECODING requires X86 as NMI handlers are X86 specific. Signed-off-by: Jerry Hoemann Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220820202821.1263837-3-jerry.hoemann@hpe.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 9295492d24f74..cd643e50681e1 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -1315,7 +1315,7 @@ config IT87_WDT config HP_WATCHDOG tristate "HP ProLiant iLO2+ Hardware Watchdog Timer" select WATCHDOG_CORE - depends on X86 && PCI + depends on (ARM64 || X86) && PCI help A software monitoring watchdog and NMI handling driver. This driver will detect lockups and provide a stack trace.
This is a driver that @@ -1325,7 +1325,7 @@ config HP_WATCHDOG config HPWDT_NMI_DECODING bool "NMI support for the HP ProLiant iLO2+ Hardware Watchdog Timer" - depends on HP_WATCHDOG + depends on X86 && HP_WATCHDOG default y help Enables the NMI handler for the watchdog pretimeout NMI and the iLO -- GitLab From 19f04459f019743310d17e8d426ff5d1a4b81041 Mon Sep 17 00:00:00 2001 From: Chin-Ting Kuo Date: Fri, 19 Aug 2022 17:49:05 +0800 Subject: [PATCH 0814/2223] watchdog: aspeed_wdt: Reorder output signal register configuration If the output driving type is push-pull mode, the output polarity should be selected in advance. Otherwise, an unexpected value will be output at the moment of changing to push-pull mode. Thus, output polarity, WDT18[31], must be configured before changing driving type, WDT18[30]. Signed-off-by: Chin-Ting Kuo Reviewed-by: Guenter Roeck Tested-by: Bonnie Lo Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20220819094905.1962513-1-chin-ting_kuo@aspeedtech.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/aspeed_wdt.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/watchdog/aspeed_wdt.c b/drivers/watchdog/aspeed_wdt.c index bd06622813eb4..0cff2adfbfc96 100644 --- a/drivers/watchdog/aspeed_wdt.c +++ b/drivers/watchdog/aspeed_wdt.c @@ -332,18 +332,18 @@ static int aspeed_wdt_probe(struct platform_device *pdev) u32 reg = readl(wdt->base + WDT_RESET_WIDTH); reg &= config->ext_pulse_width_mask; - if (of_property_read_bool(np, "aspeed,ext-push-pull")) - reg |= WDT_PUSH_PULL_MAGIC; + if (of_property_read_bool(np, "aspeed,ext-active-high")) + reg |= WDT_ACTIVE_HIGH_MAGIC; else - reg |= WDT_OPEN_DRAIN_MAGIC; + reg |= WDT_ACTIVE_LOW_MAGIC; writel(reg, wdt->base + WDT_RESET_WIDTH); reg &= config->ext_pulse_width_mask; - if (of_property_read_bool(np, "aspeed,ext-active-high")) - reg |= WDT_ACTIVE_HIGH_MAGIC; + if (of_property_read_bool(np, "aspeed,ext-push-pull")) + reg |= 
WDT_PUSH_PULL_MAGIC; else - reg |= WDT_ACTIVE_LOW_MAGIC; + reg |= WDT_OPEN_DRAIN_MAGIC; writel(reg, wdt->base + WDT_RESET_WIDTH); } -- GitLab From dc1f12b916005e1a1a908fbfcded356634a07038 Mon Sep 17 00:00:00 2001 From: Srinivas Neeli Date: Thu, 18 Aug 2022 20:36:37 +0530 Subject: [PATCH 0815/2223] dt-bindings: watchdog: Convert Xilinx watchdog bindings to json-schema Convert Xilinx watchdog bindings to DT schema format using json-schema Signed-off-by: Shubhrajyoti Datta Signed-off-by: Radhey Shyam Pandey Signed-off-by: Srinivas Neeli Reviewed-by: Krzysztof Kozlowski Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220818150637.815-1-srinivas.neeli@xilinx.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../bindings/watchdog/of-xilinx-wdt.txt | 26 ------- .../watchdog/xlnx,xps-timebase-wdt.yaml | 68 +++++++++++++++++++ 2 files changed, 68 insertions(+), 26 deletions(-) delete mode 100644 Documentation/devicetree/bindings/watchdog/of-xilinx-wdt.txt create mode 100644 Documentation/devicetree/bindings/watchdog/xlnx,xps-timebase-wdt.yaml diff --git a/Documentation/devicetree/bindings/watchdog/of-xilinx-wdt.txt b/Documentation/devicetree/bindings/watchdog/of-xilinx-wdt.txt deleted file mode 100644 index c6ae9c9d5e3e2..0000000000000 --- a/Documentation/devicetree/bindings/watchdog/of-xilinx-wdt.txt +++ /dev/null @@ -1,26 +0,0 @@ -Xilinx AXI/PLB soft-core watchdog Device Tree Bindings ---------------------------------------------------------- - -Required properties: -- compatible : Should be "xlnx,xps-timebase-wdt-1.00.a" or - "xlnx,xps-timebase-wdt-1.01.a". -- reg : Physical base address and size - -Optional properties: -- clocks : Input clock specifier. Refer to common clock - bindings. -- clock-frequency : Frequency of clock in Hz -- xlnx,wdt-enable-once : 0 - Watchdog can be restarted - 1 - Watchdog can be enabled just once -- xlnx,wdt-interval : Watchdog timeout interval in 2^ clock cycles, - is integer from 8 to 31. 
- -Example: -axi-timebase-wdt@40100000 { - clock-frequency = <50000000>; - compatible = "xlnx,xps-timebase-wdt-1.00.a"; - clocks = <&clkc 15>; - reg = <0x40100000 0x10000>; - xlnx,wdt-enable-once = <0x0>; - xlnx,wdt-interval = <0x1b>; -} ; diff --git a/Documentation/devicetree/bindings/watchdog/xlnx,xps-timebase-wdt.yaml b/Documentation/devicetree/bindings/watchdog/xlnx,xps-timebase-wdt.yaml new file mode 100644 index 0000000000000..493a1c9547077 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/xlnx,xps-timebase-wdt.yaml @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: GPL-2.0-or-later OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/watchdog/xlnx,xps-timebase-wdt.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Xilinx AXI/PLB softcore and window Watchdog Timer + +maintainers: + - Shubhrajyoti Datta + - Srinivas Neeli + +description: + The Timebase watchdog timer(WDT) is a free-running 32 bit counter. + WDT uses a dual-expiration architecture. After one expiration of + the timeout interval, an interrupt is generated and the WDT state + bit is set to one in the status register. If the state bit is not + cleared (by writing a one to the state bit) before the next + expiration of the timeout interval, a WDT reset is generated. + +allOf: + - $ref: watchdog.yaml# + +properties: + compatible: + enum: + - xlnx,xps-timebase-wdt-1.01.a + - xlnx,xps-timebase-wdt-1.00.a + + reg: + maxItems: 1 + + clocks: + maxItems: 1 + + clock-frequency: + description: Frequency of clock in Hz + + xlnx,wdt-interval: + $ref: /schemas/types.yaml#/definitions/uint32 + description: Watchdog timeout interval + minimum: 8 + maximum: 32 + + xlnx,wdt-enable-once: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1] + description: If watchdog is configured as enable once, + then the watchdog cannot be disabled after + it has been enabled. 
+ +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + watchdog@40100000 { + compatible = "xlnx,xps-timebase-wdt-1.00.a"; + reg = <0x40100000 0x1000>; + clock-frequency = <50000000>; + clocks = <&clkc 15>; + xlnx,wdt-enable-once = <0x0>; + xlnx,wdt-interval = <0x1b>; + }; +... -- GitLab From 5a9fbf8b807c0e35fc99bb65a9559ec9b0abde66 Mon Sep 17 00:00:00 2001 From: Henning Schild Date: Wed, 24 Aug 2022 17:24:48 +0200 Subject: [PATCH 0816/2223] watchdog: w83627hf_wdt: add bootstatus support The status bit in the status and control register can tell us whether the last reboot was caused by the watchdog. Make sure to take that into the bootstatus before clearing it. Signed-off-by: Henning Schild Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220824152448.7736-1-henning.schild@siemens.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/w83627hf_wdt.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/w83627hf_wdt.c b/drivers/watchdog/w83627hf_wdt.c index 56a4a4030ca96..bc33b63c5a5df 100644 --- a/drivers/watchdog/w83627hf_wdt.c +++ b/drivers/watchdog/w83627hf_wdt.c @@ -113,6 +113,10 @@ MODULE_PARM_DESC(early_disable, "Disable watchdog at boot time (default=0)"); #define W836X7HF_WDT_CSR 0xf7 #define NCT6102D_WDT_CSR 0xf2 +#define WDT_CSR_STATUS 0x10 +#define WDT_CSR_KBD 0x40 +#define WDT_CSR_MOUSE 0x80 + static void superio_outb(int reg, int val) { outb(reg, WDT_EFER); @@ -244,8 +248,12 @@ static int w83627hf_init(struct watchdog_device *wdog, enum chips chip) t = superio_inb(cr_wdt_control) & ~0x0C; superio_outb(cr_wdt_control, t); - /* reset trigger, disable keyboard & mouse turning off watchdog */ - t = superio_inb(cr_wdt_csr) & ~0xD0; + t = superio_inb(cr_wdt_csr); + if (t & WDT_CSR_STATUS) + wdog->bootstatus |= WDIOF_CARDRESET; + + /* reset status, disable keyboard & mouse turning off watchdog */ + t &= ~(WDT_CSR_STATUS | WDT_CSR_KBD | 
WDT_CSR_MOUSE); superio_outb(cr_wdt_csr, t); superio_exit(); -- GitLab From 64ee9375090e3c677b6e4e089d41362ac16e4357 Mon Sep 17 00:00:00 2001 From: Sergei Antonov Date: Mon, 29 Aug 2022 12:04:36 +0300 Subject: [PATCH 0817/2223] watchdog: ftwdt010_wdt: implement _restart() function Implement ftwdt010_wdt_restart(). It enables watchdog with timeout = 0 and disabled IRQ. Since it needs code similar to ftwdt010_wdt_start(), add a new function ftwdt010_enable() and move common code there. Suggested-by: Guenter Roeck Signed-off-by: Sergei Antonov Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220829090436.452742-1-saproj@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ftwdt010_wdt.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/watchdog/ftwdt010_wdt.c b/drivers/watchdog/ftwdt010_wdt.c index 21dcc7765688a..0a5bbfd2823ff 100644 --- a/drivers/watchdog/ftwdt010_wdt.c +++ b/drivers/watchdog/ftwdt010_wdt.c @@ -47,21 +47,28 @@ struct ftwdt010_wdt *to_ftwdt010_wdt(struct watchdog_device *wdd) return container_of(wdd, struct ftwdt010_wdt, wdd); } -static int ftwdt010_wdt_start(struct watchdog_device *wdd) +static void ftwdt010_enable(struct ftwdt010_wdt *gwdt, + unsigned int timeout, + bool need_irq) { - struct ftwdt010_wdt *gwdt = to_ftwdt010_wdt(wdd); u32 enable; - writel(wdd->timeout * WDT_CLOCK, gwdt->base + FTWDT010_WDLOAD); + writel(timeout * WDT_CLOCK, gwdt->base + FTWDT010_WDLOAD); writel(WDRESTART_MAGIC, gwdt->base + FTWDT010_WDRESTART); /* set clock before enabling */ enable = WDCR_CLOCK_5MHZ | WDCR_SYS_RST; writel(enable, gwdt->base + FTWDT010_WDCR); - if (gwdt->has_irq) + if (need_irq) enable |= WDCR_WDINTR; enable |= WDCR_ENABLE; writel(enable, gwdt->base + FTWDT010_WDCR); +} +static int ftwdt010_wdt_start(struct watchdog_device *wdd) +{ + struct ftwdt010_wdt *gwdt = to_ftwdt010_wdt(wdd); + + ftwdt010_enable(gwdt, wdd->timeout, gwdt->has_irq); return 0; } @@ 
-93,6 +100,13 @@ static int ftwdt010_wdt_set_timeout(struct watchdog_device *wdd, return 0; } +static int ftwdt010_wdt_restart(struct watchdog_device *wdd, + unsigned long action, void *data) +{ + ftwdt010_enable(to_ftwdt010_wdt(wdd), 0, false); + return 0; +} + static irqreturn_t ftwdt010_wdt_interrupt(int irq, void *data) { struct ftwdt010_wdt *gwdt = data; @@ -107,6 +121,7 @@ static const struct watchdog_ops ftwdt010_wdt_ops = { .stop = ftwdt010_wdt_stop, .ping = ftwdt010_wdt_ping, .set_timeout = ftwdt010_wdt_set_timeout, + .restart = ftwdt010_wdt_restart, .owner = THIS_MODULE, }; -- GitLab From 81126222bd3ad30eed486aafa66b52b5fc88b236 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=BCller?= Date: Wed, 14 Sep 2022 11:46:05 +0200 Subject: [PATCH 0818/2223] watchdog: Exar/MaxLinear XR28V38x driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simple driver for the watchdog present in some Exar/MaxLinear UART chips. Please see https://www.maxlinear.com/product/interface/uarts/lpc-uarts/xr28v384 for more info. Signed-off-by: David Müller Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220914094605.93377-1-d.mueller@elsoft.ch Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 11 + drivers/watchdog/Makefile | 1 + drivers/watchdog/exar_wdt.c | 427 ++++++++++++++++++++++++++++++++++++ 3 files changed, 439 insertions(+) create mode 100644 drivers/watchdog/exar_wdt.c diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index cd643e50681e1..bd3bb4abca51c 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -1089,6 +1089,17 @@ config EBC_C384_WDT WinSystems EBC-C384 motherboard. The timeout may be configured via the timeout module parameter. 
+config EXAR_WDT + tristate "Exar Watchdog Timer" + depends on X86 + select WATCHDOG_CORE + help + Enables watchdog timer support for the watchdog timer present + in some Exar/MaxLinear UART chips like the XR28V38x. + + To compile this driver as a module, choose M here: the + module will be called exar_wdt. + config F71808E_WDT tristate "Fintek F718xx, F818xx Super I/O Watchdog" depends on X86 diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index cdeb119e6e61a..d41e5f830ae7f 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -105,6 +105,7 @@ obj-$(CONFIG_ADVANTECH_WDT) += advantechwdt.o obj-$(CONFIG_ALIM1535_WDT) += alim1535_wdt.o obj-$(CONFIG_ALIM7101_WDT) += alim7101_wdt.o obj-$(CONFIG_EBC_C384_WDT) += ebc-c384_wdt.o +obj-$(CONFIG_EXAR_WDT) += exar_wdt.o obj-$(CONFIG_F71808E_WDT) += f71808e_wdt.o obj-$(CONFIG_SP5100_TCO) += sp5100_tco.o obj-$(CONFIG_GEODE_WDT) += geodewdt.o diff --git a/drivers/watchdog/exar_wdt.c b/drivers/watchdog/exar_wdt.c new file mode 100644 index 0000000000000..35058d8b21bc7 --- /dev/null +++ b/drivers/watchdog/exar_wdt.c @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * exar_wdt.c - Driver for the watchdog present in some + * Exar/MaxLinear UART chips like the XR28V38x. + * + * (c) Copyright 2022 D. Müller . 
+ * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#define DRV_NAME "exar_wdt" + +static const unsigned short sio_config_ports[] = { 0x2e, 0x4e }; +static const unsigned char sio_enter_keys[] = { 0x67, 0x77, 0x87, 0xA0 }; +#define EXAR_EXIT_KEY 0xAA + +#define EXAR_LDN 0x07 +#define EXAR_DID 0x20 +#define EXAR_VID 0x23 +#define EXAR_WDT 0x26 +#define EXAR_ACT 0x30 +#define EXAR_RTBASE 0x60 + +#define EXAR_WDT_LDEV 0x08 + +#define EXAR_VEN_ID 0x13A8 +#define EXAR_DEV_382 0x0382 +#define EXAR_DEV_384 0x0384 + +/* WDT runtime registers */ +#define WDT_CTRL 0x00 +#define WDT_VAL 0x01 + +#define WDT_UNITS_10MS 0x0 /* the 10 millisec unit of the HW is not used */ +#define WDT_UNITS_SEC 0x2 +#define WDT_UNITS_MIN 0x4 + +/* default WDT control for WDTOUT signal active / rearm by read */ +#define EXAR_WDT_DEF_CONF 0 + +struct wdt_pdev_node { + struct list_head list; + struct platform_device *pdev; + const char name[16]; +}; + +struct wdt_priv { + /* the lock for WDT io operations */ + spinlock_t io_lock; + struct resource wdt_res; + struct watchdog_device wdt_dev; + unsigned short did; + unsigned short config_port; + unsigned char enter_key; + unsigned char unit; + unsigned char timeout; +}; + +#define WATCHDOG_TIMEOUT 60 + +static int timeout = WATCHDOG_TIMEOUT; +module_param(timeout, int, 0); +MODULE_PARM_DESC(timeout, + "Watchdog timeout in seconds. 
1<=timeout<=15300, default=" + __MODULE_STRING(WATCHDOG_TIMEOUT) "."); + +static bool nowayout = WATCHDOG_NOWAYOUT; +module_param(nowayout, bool, 0); +MODULE_PARM_DESC(nowayout, + "Watchdog cannot be stopped once started (default=" + __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); + +static int exar_sio_enter(const unsigned short config_port, + const unsigned char key) +{ + if (!request_muxed_region(config_port, 2, DRV_NAME)) + return -EBUSY; + + /* write the ENTER-KEY twice */ + outb(key, config_port); + outb(key, config_port); + + return 0; +} + +static void exar_sio_exit(const unsigned short config_port) +{ + outb(EXAR_EXIT_KEY, config_port); + release_region(config_port, 2); +} + +static unsigned char exar_sio_read(const unsigned short config_port, + const unsigned char reg) +{ + outb(reg, config_port); + return inb(config_port + 1); +} + +static void exar_sio_write(const unsigned short config_port, + const unsigned char reg, const unsigned char val) +{ + outb(reg, config_port); + outb(val, config_port + 1); +} + +static unsigned short exar_sio_read16(const unsigned short config_port, + const unsigned char reg) +{ + unsigned char msb, lsb; + + msb = exar_sio_read(config_port, reg); + lsb = exar_sio_read(config_port, reg + 1); + + return (msb << 8) | lsb; +} + +static void exar_sio_select_wdt(const unsigned short config_port) +{ + exar_sio_write(config_port, EXAR_LDN, EXAR_WDT_LDEV); +} + +static void exar_wdt_arm(const struct wdt_priv *priv) +{ + unsigned short rt_base = priv->wdt_res.start; + + /* write timeout value twice to arm watchdog */ + outb(priv->timeout, rt_base + WDT_VAL); + outb(priv->timeout, rt_base + WDT_VAL); +} + +static void exar_wdt_disarm(const struct wdt_priv *priv) +{ + unsigned short rt_base = priv->wdt_res.start; + + /* + * use two accesses with different values to make sure + * that a combination of a previous single access and + * the ones below with the same value are not falsely + * interpreted as "arm watchdog" + */ + outb(0xFF, rt_base + 
WDT_VAL); + outb(0, rt_base + WDT_VAL); +} + +static int exar_wdt_start(struct watchdog_device *wdog) +{ + struct wdt_priv *priv = watchdog_get_drvdata(wdog); + unsigned short rt_base = priv->wdt_res.start; + + spin_lock(&priv->io_lock); + + exar_wdt_disarm(priv); + outb(priv->unit, rt_base + WDT_CTRL); + exar_wdt_arm(priv); + + spin_unlock(&priv->io_lock); + return 0; +} + +static int exar_wdt_stop(struct watchdog_device *wdog) +{ + struct wdt_priv *priv = watchdog_get_drvdata(wdog); + + spin_lock(&priv->io_lock); + + exar_wdt_disarm(priv); + + spin_unlock(&priv->io_lock); + return 0; +} + +static int exar_wdt_keepalive(struct watchdog_device *wdog) +{ + struct wdt_priv *priv = watchdog_get_drvdata(wdog); + unsigned short rt_base = priv->wdt_res.start; + + spin_lock(&priv->io_lock); + + /* reading the WDT_VAL reg will feed the watchdog */ + inb(rt_base + WDT_VAL); + + spin_unlock(&priv->io_lock); + return 0; +} + +static int exar_wdt_set_timeout(struct watchdog_device *wdog, unsigned int t) +{ + struct wdt_priv *priv = watchdog_get_drvdata(wdog); + bool unit_min = false; + + /* + * if new timeout is bigger than 255 seconds, change the + * unit to minutes and round the timeout up to the next whole minute + */ + if (t > 255) { + unit_min = true; + t = DIV_ROUND_UP(t, 60); + } + + /* save for later use in exar_wdt_start() */ + priv->unit = unit_min ? WDT_UNITS_MIN : WDT_UNITS_SEC; + priv->timeout = t; + + wdog->timeout = unit_min ? 
t * 60 : t; + + if (watchdog_hw_running(wdog)) + exar_wdt_start(wdog); + + return 0; +} + +static const struct watchdog_info exar_wdt_info = { + .options = WDIOF_KEEPALIVEPING | + WDIOF_SETTIMEOUT | + WDIOF_MAGICCLOSE, + .identity = "Exar/MaxLinear XR28V38x Watchdog", +}; + +static const struct watchdog_ops exar_wdt_ops = { + .owner = THIS_MODULE, + .start = exar_wdt_start, + .stop = exar_wdt_stop, + .ping = exar_wdt_keepalive, + .set_timeout = exar_wdt_set_timeout, +}; + +static int exar_wdt_config(struct watchdog_device *wdog, + const unsigned char conf) +{ + struct wdt_priv *priv = watchdog_get_drvdata(wdog); + int ret; + + ret = exar_sio_enter(priv->config_port, priv->enter_key); + if (ret) + return ret; + + exar_sio_select_wdt(priv->config_port); + exar_sio_write(priv->config_port, EXAR_WDT, conf); + + exar_sio_exit(priv->config_port); + + return 0; +} + +static int __init exar_wdt_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct wdt_priv *priv = dev->platform_data; + struct watchdog_device *wdt_dev = &priv->wdt_dev; + struct resource *res; + int ret; + + res = platform_get_resource(pdev, IORESOURCE_IO, 0); + if (!res) + return -ENXIO; + + spin_lock_init(&priv->io_lock); + + wdt_dev->info = &exar_wdt_info; + wdt_dev->ops = &exar_wdt_ops; + wdt_dev->min_timeout = 1; + wdt_dev->max_timeout = 255 * 60; + + watchdog_init_timeout(wdt_dev, timeout, NULL); + watchdog_set_nowayout(wdt_dev, nowayout); + watchdog_stop_on_reboot(wdt_dev); + watchdog_stop_on_unregister(wdt_dev); + watchdog_set_drvdata(wdt_dev, priv); + + ret = exar_wdt_config(wdt_dev, EXAR_WDT_DEF_CONF); + if (ret) + return ret; + + exar_wdt_set_timeout(wdt_dev, timeout); + /* Make sure that the watchdog is not running */ + exar_wdt_stop(wdt_dev); + + ret = devm_watchdog_register_device(dev, wdt_dev); + if (ret) + return ret; + + dev_info(dev, "XR28V%X WDT initialized. 
timeout=%d sec (nowayout=%d)\n", + priv->did, timeout, nowayout); + + return 0; +} + +static unsigned short __init exar_detect(const unsigned short config_port, + const unsigned char key, + unsigned short *rt_base) +{ + int ret; + unsigned short base = 0; + unsigned short vid, did; + + ret = exar_sio_enter(config_port, key); + if (ret) + return 0; + + vid = exar_sio_read16(config_port, EXAR_VID); + did = exar_sio_read16(config_port, EXAR_DID); + + /* check for the vendor and device IDs we currently know about */ + if (vid == EXAR_VEN_ID && + (did == EXAR_DEV_382 || + did == EXAR_DEV_384)) { + exar_sio_select_wdt(config_port); + /* is device active? */ + if (exar_sio_read(config_port, EXAR_ACT) == 0x01) + base = exar_sio_read16(config_port, EXAR_RTBASE); + } + + exar_sio_exit(config_port); + + if (base) { + pr_debug("Found a XR28V%X WDT (conf: 0x%x / rt: 0x%04x)\n", + did, config_port, base); + *rt_base = base; + return did; + } + + return 0; +} + +static struct platform_driver exar_wdt_driver = { + .driver = { + .name = DRV_NAME, + }, +}; + +static LIST_HEAD(pdev_list); + +/* + * Allocate a list node for one detected watchdog, register a platform + * device for it with priv passed as platform data, and queue the node + * on pdev_list. Returns 0 on success or a negative errno. + */ +static int __init exar_wdt_register(struct wdt_priv *priv, const int idx) +{ + struct wdt_pdev_node *n; + + n = kzalloc(sizeof(*n), GFP_KERNEL); + if (!n) + return -ENOMEM; + + INIT_LIST_HEAD(&n->list); + + scnprintf((char *)n->name, sizeof(n->name), DRV_NAME ".%d", idx); + priv->wdt_res.name = n->name; + + n->pdev = platform_device_register_resndata(NULL, DRV_NAME, idx, + &priv->wdt_res, 1, + priv, sizeof(*priv)); + if (IS_ERR(n->pdev)) { + /* read the error code before n is freed */ + int err = PTR_ERR(n->pdev); + + kfree(n); + return err; + } + + list_add_tail(&n->list, &pdev_list); + + return 0; +} + +static void exar_wdt_unregister(void) +{ + struct wdt_pdev_node *n, *t; + + list_for_each_entry_safe(n, t, &pdev_list, list) { + platform_device_unregister(n->pdev); + list_del(&n->list); + kfree(n); + } +} + +static int __init exar_wdt_init(void) +{ + int ret, i, j, idx = 0; + + /* search for active Exar watchdogs on all possible locations */ + for (i = 0; i 
< ARRAY_SIZE(sio_config_ports); i++) { + for (j = 0; j < ARRAY_SIZE(sio_enter_keys); j++) { + unsigned short did, rt_base = 0; + + did = exar_detect(sio_config_ports[i], + sio_enter_keys[j], + &rt_base); + + if (did) { + struct wdt_priv priv = { + .wdt_res = DEFINE_RES_IO(rt_base, 2), + .did = did, + .config_port = sio_config_ports[i], + .enter_key = sio_enter_keys[j], + }; + + ret = exar_wdt_register(&priv, idx); + if (!ret) + idx++; + } + } + } + + if (!idx) + return -ENODEV; + + ret = platform_driver_probe(&exar_wdt_driver, exar_wdt_probe); + if (ret) + exar_wdt_unregister(); + + return ret; +} + +static void __exit exar_wdt_exit(void) +{ + exar_wdt_unregister(); + platform_driver_unregister(&exar_wdt_driver); +} + +module_init(exar_wdt_init); +module_exit(exar_wdt_exit); + +MODULE_AUTHOR("David Müller "); +MODULE_DESCRIPTION("Exar/MaxLinear Watchdog Driver"); +MODULE_LICENSE("GPL"); -- GitLab From 22b455eecca0a3b73673898099d55db516eddbe1 Mon Sep 17 00:00:00 2001 From: Thanh Quan Date: Fri, 9 Sep 2022 11:08:11 +0200 Subject: [PATCH 0819/2223] dt-bindings: watchdog: renesas-wdt: Add r8a779g0 support Document support for the Watchdog Timer (WDT) Controller in the Renesas R-Car V4H (R8A779G0) SoC. 
Signed-off-by: Thanh Quan Signed-off-by: Geert Uytterhoeven Acked-by: Krzysztof Kozlowski Reviewed-by: Wolfram Sang Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/e3a246be066d5e9c2231285bc1488fc12866cf5d.1662714387.git.geert+renesas@glider.be Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml b/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml index a8d7dde5271b8..b2647bbaa19ce 100644 --- a/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml @@ -65,6 +65,7 @@ properties: - enum: - renesas,r8a779a0-wdt # R-Car V3U - renesas,r8a779f0-wdt # R-Car S4-8 + - renesas,r8a779g0-wdt # R-Car V4H - const: renesas,rcar-gen4-wdt # R-Car Gen4 reg: -- GitLab From 695bfff55327caf6e9b098ada32b39b1d81dafc4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 11 Aug 2022 13:56:06 +0300 Subject: [PATCH 0820/2223] watchdog: ftwdt010_wdt: fix test for platform_get_irq() failure This code assumes that platform_get_irq() function returns zero on failure. In fact, platform_get_irq() never returns zero. It returns negative error codes or positive non-zero values on success. 
Fixes: eca10ae6000d ("watchdog: add driver for Cortina Gemini watchdog") Signed-off-by: Dan Carpenter Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/YvTgRk/ABp62/hNA@kili Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ftwdt010_wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/ftwdt010_wdt.c b/drivers/watchdog/ftwdt010_wdt.c index 0a5bbfd2823ff..442c5bf63ff4d 100644 --- a/drivers/watchdog/ftwdt010_wdt.c +++ b/drivers/watchdog/ftwdt010_wdt.c @@ -171,7 +171,7 @@ static int ftwdt010_wdt_probe(struct platform_device *pdev) } irq = platform_get_irq(pdev, 0); - if (irq) { + if (irq > 0) { ret = devm_request_irq(dev, irq, ftwdt010_wdt_interrupt, 0, "watchdog bark", gwdt); if (ret) -- GitLab From d59913b0a5b6b8c52c8fbceca910d4aedbbd4cf1 Mon Sep 17 00:00:00 2001 From: Phil Edworthy Date: Tue, 23 Aug 2022 10:32:32 +0100 Subject: [PATCH 0821/2223] dt-bindings: watchdog: renesas,wdt: Add r9a09g011 (RZ/V2M) support Add the documentation for the r9a09g011 SoC, but in doing so also reorganise the doc to make it easier to read. Additionally, make the binding require an interrupt to be specified. Whilst the driver does not need an interrupt, all of the SoCs that use this binding actually provide one. 
Signed-off-by: Phil Edworthy Reviewed-by: Biju Das Reviewed-by: Rob Herring Reviewed-by: Geert Uytterhoeven Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220823093233.8577-2-phil.edworthy@renesas.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../bindings/watchdog/renesas,wdt.yaml | 73 +++++++++++++------ 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml b/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml index b2647bbaa19ce..26b1815a6753a 100644 --- a/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml @@ -31,6 +31,11 @@ properties: - renesas,r9a07g054-wdt # RZ/V2L - const: renesas,rzg2l-wdt + - items: + - enum: + - renesas,r9a09g011-wdt # RZ/V2M + - const: renesas,rzv2m-wdt # RZ/V2M + - items: - enum: - renesas,r8a7742-wdt # RZ/G1H @@ -71,13 +76,29 @@ properties: reg: maxItems: 1 - interrupts: true - - interrupt-names: true - - clocks: true - - clock-names: true + interrupts: + minItems: 1 + items: + - description: Timeout + - description: Parity error + + interrupt-names: + minItems: 1 + items: + - const: wdt + - const: perrout + + clocks: + minItems: 1 + items: + - description: Register access clock + - description: Main clock + + clock-names: + minItems: 1 + items: + - const: pclk + - const: oscclk power-domains: maxItems: 1 @@ -90,6 +111,7 @@ properties: required: - compatible - reg + - interrupts - clocks allOf: @@ -114,31 +136,38 @@ allOf: contains: enum: - renesas,rzg2l-wdt + - renesas,rzv2m-wdt then: properties: - interrupts: - maxItems: 2 - interrupt-names: - items: - - const: wdt - - const: perrout clocks: - items: - - description: Register access clock - - description: Main clock + minItems: 2 clock-names: - items: - - const: pclk - - const: oscclk + minItems: 2 required: - clock-names + else: + properties: + clocks: + maxItems: 1 + + - if: + properties: + 
compatible: + contains: + enum: + - renesas,rzg2l-wdt + then: + properties: + interrupts: + minItems: 2 + interrupt-names: + minItems: 2 + required: - interrupt-names else: properties: interrupts: maxItems: 1 - clocks: - maxItems: 1 additionalProperties: false @@ -146,9 +175,11 @@ examples: - | #include #include + #include wdt0: watchdog@e6020000 { compatible = "renesas,r8a7795-wdt", "renesas,rcar-gen3-wdt"; reg = <0xe6020000 0x0c>; + interrupts = ; clocks = <&cpg CPG_MOD 402>; power-domains = <&sysc R8A7795_PD_ALWAYS_ON>; resets = <&cpg 402>; -- GitLab From ec122fd94eeb87b2e906360efe7447362f83e9ae Mon Sep 17 00:00:00 2001 From: Phil Edworthy Date: Tue, 23 Aug 2022 10:32:33 +0100 Subject: [PATCH 0822/2223] watchdog: rzg2l_wdt: Add rzv2m support The WDT on RZ/V2M devices is basically the same as RZ/G2L, but without the parity error registers. This means the driver has to reset the hardware plus set the minimum timeout in order to do a restart and has a single interrupt. Signed-off-by: Phil Edworthy Reviewed-by: Biju Das Reviewed-by: Geert Uytterhoeven Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220823093233.8577-3-phil.edworthy@renesas.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/rzg2l_wdt.c | 39 ++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/drivers/watchdog/rzg2l_wdt.c b/drivers/watchdog/rzg2l_wdt.c index 6eea0ee4af49e..974a4194a8fd6 100644 --- a/drivers/watchdog/rzg2l_wdt.c +++ b/drivers/watchdog/rzg2l_wdt.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include @@ -40,6 +40,11 @@ module_param(nowayout, bool, 0); MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); +enum rz_wdt_type { + WDT_RZG2L, + WDT_RZV2M, +}; + struct rzg2l_wdt_priv { void __iomem *base; struct watchdog_device wdev; @@ -48,6 +53,7 @@ struct rzg2l_wdt_priv { unsigned long delay; 
struct clk *pclk; struct clk *osc_clk; + enum rz_wdt_type devtype; }; static void rzg2l_wdt_wait_delay(struct rzg2l_wdt_priv *priv) @@ -142,11 +148,29 @@ static int rzg2l_wdt_restart(struct watchdog_device *wdev, clk_prepare_enable(priv->pclk); clk_prepare_enable(priv->osc_clk); - /* Generate Reset (WDTRSTB) Signal on parity error */ - rzg2l_wdt_write(priv, 0, PECR); + if (priv->devtype == WDT_RZG2L) { + /* Generate Reset (WDTRSTB) Signal on parity error */ + rzg2l_wdt_write(priv, 0, PECR); + + /* Force parity error */ + rzg2l_wdt_write(priv, PEEN_FORCE, PEEN); + } else { + /* RZ/V2M doesn't have parity error registers */ + + wdev->timeout = 0; + + /* Initialize time out */ + rzg2l_wdt_init_timeout(wdev); - /* Force parity error */ - rzg2l_wdt_write(priv, PEEN_FORCE, PEEN); + /* Initialize watchdog counter register */ + rzg2l_wdt_write(priv, 0, WDTTIM); + + /* Enable watchdog timer*/ + rzg2l_wdt_write(priv, WDTCNT_WDTEN, WDTCNT); + + /* Wait 2 consecutive overflow cycles for reset */ + mdelay(DIV_ROUND_UP(2 * 0xFFFFF * 1000, priv->osc_clk_rate)); + } return 0; } @@ -227,6 +251,8 @@ static int rzg2l_wdt_probe(struct platform_device *pdev) if (ret) return dev_err_probe(dev, ret, "failed to deassert"); + priv->devtype = (uintptr_t)of_device_get_match_data(dev); + pm_runtime_enable(&pdev->dev); priv->wdev.info = &rzg2l_wdt_ident; @@ -255,7 +281,8 @@ static int rzg2l_wdt_probe(struct platform_device *pdev) } static const struct of_device_id rzg2l_wdt_ids[] = { - { .compatible = "renesas,rzg2l-wdt", }, + { .compatible = "renesas,rzg2l-wdt", .data = (void *)WDT_RZG2L }, + { .compatible = "renesas,rzv2m-wdt", .data = (void *)WDT_RZV2M }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, rzg2l_wdt_ids); -- GitLab From 0e01297212244b5a769aa956854e45da1f0cd1f4 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Mon, 19 Sep 2022 22:03:12 -0400 Subject: [PATCH 0823/2223] watchdog: Check dev_set_name() return value It's possible that dev_set_name() returns -ENOMEM, catch and handle this. 
Signed-off-by: Bo Liu Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220920020312.2383-1-liubo03@inspur.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/watchdog_dev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c index 54903f3c851eb..744b2ab75288d 100644 --- a/drivers/watchdog/watchdog_dev.c +++ b/drivers/watchdog/watchdog_dev.c @@ -1015,7 +1015,11 @@ static int watchdog_cdev_register(struct watchdog_device *wdd) wd_data->dev.groups = wdd->groups; wd_data->dev.release = watchdog_core_data_release; dev_set_drvdata(&wd_data->dev, wdd); - dev_set_name(&wd_data->dev, "watchdog%d", wdd->id); + err = dev_set_name(&wd_data->dev, "watchdog%d", wdd->id); + if (err) { + put_device(&wd_data->dev); + return err; + } kthread_init_work(&wd_data->work, watchdog_ping_work); hrtimer_init(&wd_data->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); -- GitLab From 08a884cf03048142629e6dd748c7633e11d98b9b Mon Sep 17 00:00:00 2001 From: shaomin Deng Date: Mon, 8 Aug 2022 11:39:56 -0400 Subject: [PATCH 0824/2223] watchdog: eurotechwdt: Remove redundant word in comments There is a redundant word "we" in comments, so remove it. Signed-off-by: shaomin Deng Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220808153956.8374-1-dengshaomin@cdjrlc.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/eurotechwdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/eurotechwdt.c b/drivers/watchdog/eurotechwdt.c index ce682942662cd..e26609ad4c17c 100644 --- a/drivers/watchdog/eurotechwdt.c +++ b/drivers/watchdog/eurotechwdt.c @@ -192,7 +192,7 @@ static void eurwdt_ping(void) * @ppos: pointer to the position to write. No seeks allowed * * A write to a watchdog device is defined as a keepalive signal. Any - * write of data will do, as we we don't define content meaning. 
+ * write of data will do, as we don't define content meaning. */ static ssize_t eurwdt_write(struct file *file, const char __user *buf, -- GitLab From b26b96085d521466bd8ddf624c0853842215d0f0 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 3 Aug 2022 04:11:09 +0800 Subject: [PATCH 0825/2223] watchdog: w83977f_wdt: Fix comment typo The double `we' is duplicated in the comment, remove one. Signed-off-by: Jason Wang Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220802201109.6843-1-wangborong@cdjrlc.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/w83977f_wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/w83977f_wdt.c b/drivers/watchdog/w83977f_wdt.c index fd64ae77780a6..31bf21ceaf48d 100644 --- a/drivers/watchdog/w83977f_wdt.c +++ b/drivers/watchdog/w83977f_wdt.c @@ -321,7 +321,7 @@ static int wdt_release(struct inode *inode, struct file *file) * @ppos: pointer to the position to write. No seeks allowed * * A write to a watchdog device is defined as a keepalive signal. Any - * write of data will do, as we we don't define content meaning. + * write of data will do, as we don't define content meaning. */ static ssize_t wdt_write(struct file *file, const char __user *buf, -- GitLab From 74b31987e281e31d7bd4184c027d57543e9e0392 Mon Sep 17 00:00:00 2001 From: sunliming Date: Fri, 26 Aug 2022 16:52:43 +0800 Subject: [PATCH 0826/2223] watchdog: sa1100: make variable sa1100dog_driver static This symbol is not used outside of sa1100_wdt.c, so marks it static. Fixes the following warning: >> drivers/watchdog/sa1100_wdt.c:241:24: sparse: sparse: symbol 'sa1100dog_driver' was not declared. Should it be static? 
Reported-by: kernel test robot Signed-off-by: sunliming Acked-by: Arnd Bergmann Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220802020819.1226454-1-sunliming@kylinos.cn Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/sa1100_wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/sa1100_wdt.c b/drivers/watchdog/sa1100_wdt.c index 2d0a06a158a85..82ac5d19f519e 100644 --- a/drivers/watchdog/sa1100_wdt.c +++ b/drivers/watchdog/sa1100_wdt.c @@ -238,7 +238,7 @@ static int sa1100dog_remove(struct platform_device *pdev) return 0; } -struct platform_driver sa1100dog_driver = { +static struct platform_driver sa1100dog_driver = { .driver.name = "sa1100_wdt", .probe = sa1100dog_probe, .remove = sa1100dog_remove, -- GitLab From 8007935305610d577746b888bd1864b34fb0ea13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 26 Jul 2022 10:56:12 +0200 Subject: [PATCH 0827/2223] watchdog: armada_37xx_wdt: Fix .set_timeout callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ioctl(WDIOC_SETTIMEOUT) calls .set_timeout and .ping callbacks and it is expected that it changes current watchdog timeout. armada_37xx_wdt's .ping callback just reping counter 0 and does not touch counter 1 used for timeout. So it is needed to set counter 1 to the new value in .set_timeout callback to ensure ioctl(WDIOC_SETTIMEOUT) functionality. Fix it. 
Fixes: 54e3d9b518c8 ("watchdog: Add support for Armada 37xx CPU watchdog") Signed-off-by: Pali Rohár Reviewed-by: Guenter Roeck Reviewed-by: Marek Behún Link: https://lore.kernel.org/r/20220726085612.10672-1-pali@kernel.org Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/armada_37xx_wdt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/watchdog/armada_37xx_wdt.c b/drivers/watchdog/armada_37xx_wdt.c index 854b1cc723cb6..ac9fed1ef681b 100644 --- a/drivers/watchdog/armada_37xx_wdt.c +++ b/drivers/watchdog/armada_37xx_wdt.c @@ -179,6 +179,8 @@ static int armada_37xx_wdt_set_timeout(struct watchdog_device *wdt, dev->timeout = (u64)dev->clk_rate * timeout; do_div(dev->timeout, CNTR_CTRL_PRESCALE_MIN); + set_counter_value(dev, CNTR_ID_WDOG, dev->timeout); + return 0; } -- GitLab From b24620608dc2b54cb9df511e3d2c789f99497538 Mon Sep 17 00:00:00 2001 From: Sergiu Moga Date: Thu, 14 Jul 2022 15:51:24 +0300 Subject: [PATCH 0828/2223] watchdog: dt-bindings: atmel,at91sam9-wdt: convert to json-schema Convert at91sam9 WDT binding for Atmel/Microchip SoCs to json-schema format. 
Signed-off-by: Sergiu Moga Reviewed-by: Krzysztof Kozlowski Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220714125122.144377-1-sergiu.moga@microchip.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../bindings/watchdog/atmel,at91sam9-wdt.yaml | 127 ++++++++++++++++++ .../bindings/watchdog/atmel-wdt.txt | 51 ------- 2 files changed, 127 insertions(+), 51 deletions(-) create mode 100644 Documentation/devicetree/bindings/watchdog/atmel,at91sam9-wdt.yaml delete mode 100644 Documentation/devicetree/bindings/watchdog/atmel-wdt.txt diff --git a/Documentation/devicetree/bindings/watchdog/atmel,at91sam9-wdt.yaml b/Documentation/devicetree/bindings/watchdog/atmel,at91sam9-wdt.yaml new file mode 100644 index 0000000000000..ad27bc518670a --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/atmel,at91sam9-wdt.yaml @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) 2022 Microchip Technology, Inc. and its subsidiaries +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/watchdog/atmel,at91sam9-wdt.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Atmel Watchdog Timers + +maintainers: + - Eugen Hristev + +properties: + compatible: + const: atmel,at91sam9260-wdt + + reg: + maxItems: 1 + + clocks: + maxItems: 1 + + interrupts: + maxItems: 1 + + atmel,max-heartbeat-sec: + description: + Should contain the maximum heartbeat value in seconds. This value + should be less or equal to 16. It is used to compute the WDV field. + maximum: 16 + + atmel,min-heartbeat-sec: + description: + Should contain the minimum heartbeat value in seconds. This value + must be smaller than the max-heartbeat-sec value. It is used to + compute the WDD field. + maximum: 16 + + atmel,watchdog-type: + $ref: /schemas/types.yaml#/definitions/string + description: | + Should be hardware or software. + oneOf: + - description: + Hardware watchdog uses the at91 watchdog reset. 
+ const: hardware + - description: | + Software watchdog uses the watchdog interrupt + to trigger a software reset. + const: software + default: hardware + + atmel,reset-type: + $ref: /schemas/types.yaml#/definitions/string + description: | + Should be proc or all. This is valid only when using hardware watchdog. + oneOf: + - description: + Assert peripherals and processor reset signals. + const: all + - description: + Assert the processor reset signal. + const: proc + default: all + + atmel,disable: + $ref: /schemas/types.yaml#/definitions/flag + description: + Should be present if you want to stop the watchdog. + + atmel,idle-halt: + $ref: /schemas/types.yaml#/definitions/flag + description: | + Should be present if you want to stop the watchdog when + entering idle state. + CAUTION: This property should be used with care, it actually makes the + watchdog not counting when the CPU is in idle state, therefore the + watchdog reset time depends on mean CPU usage and will not reset at all + if the CPU stops working while it is in idle state, which is probably + not what you want. + + atmel,dbg-halt: + $ref: /schemas/types.yaml#/definitions/flag + description: | + Should be present if you want to stop the watchdog when + entering debug state. 
+ +required: + - compatible + - reg + - clocks + +allOf: + - $ref: watchdog.yaml# + - if: + properties: + atmel,reset-type: + enum: + - all + - proc + then: + properties: + atmel,watchdog-type: + const: hardware + +dependencies: + atmel,reset-type: ['atmel,watchdog-type'] + +unevaluatedProperties: false + +examples: + - | + #include + + watchdog@fffffd40 { + compatible = "atmel,at91sam9260-wdt"; + reg = <0xfffffd40 0x10>; + interrupts = <1 IRQ_TYPE_LEVEL_HIGH 7>; + clocks = <&clk32k>; + timeout-sec = <15>; + atmel,watchdog-type = "hardware"; + atmel,reset-type = "all"; + atmel,dbg-halt; + atmel,idle-halt; + atmel,max-heartbeat-sec = <16>; + atmel,min-heartbeat-sec = <0>; + }; diff --git a/Documentation/devicetree/bindings/watchdog/atmel-wdt.txt b/Documentation/devicetree/bindings/watchdog/atmel-wdt.txt deleted file mode 100644 index 711a880b3d3bf..0000000000000 --- a/Documentation/devicetree/bindings/watchdog/atmel-wdt.txt +++ /dev/null @@ -1,51 +0,0 @@ -* Atmel Watchdog Timers - -** at91sam9-wdt - -Required properties: -- compatible: must be "atmel,at91sam9260-wdt". -- reg: physical base address of the controller and length of memory mapped - region. -- clocks: phandle to input clock. - -Optional properties: -- timeout-sec: contains the watchdog timeout in seconds. -- interrupts : Should contain WDT interrupt. -- atmel,max-heartbeat-sec : Should contain the maximum heartbeat value in - seconds. This value should be less or equal to 16. It is used to - compute the WDV field. -- atmel,min-heartbeat-sec : Should contain the minimum heartbeat value in - seconds. This value must be smaller than the max-heartbeat-sec value. - It is used to compute the WDD field. -- atmel,watchdog-type : Should be "hardware" or "software". Hardware watchdog - use the at91 watchdog reset. Software watchdog use the watchdog - interrupt to trigger a software reset. -- atmel,reset-type : Should be "proc" or "all". 
- "all" : assert peripherals and processor reset signals - "proc" : assert the processor reset signal - This is valid only when using "hardware" watchdog. -- atmel,disable : Should be present if you want to disable the watchdog. -- atmel,idle-halt : Should be present if you want to stop the watchdog when - entering idle state. - CAUTION: This property should be used with care, it actually makes the - watchdog not counting when the CPU is in idle state, therefore the - watchdog reset time depends on mean CPU usage and will not reset at all - if the CPU stop working while it is in idle state, which is probably - not what you want. -- atmel,dbg-halt : Should be present if you want to stop the watchdog when - entering debug state. - -Example: - watchdog@fffffd40 { - compatible = "atmel,at91sam9260-wdt"; - reg = <0xfffffd40 0x10>; - interrupts = <1 IRQ_TYPE_LEVEL_HIGH 7>; - clocks = <&clk32k>; - timeout-sec = <15>; - atmel,watchdog-type = "hardware"; - atmel,reset-type = "all"; - atmel,dbg-halt; - atmel,idle-halt; - atmel,max-heartbeat-sec = <16>; - atmel,min-heartbeat-sec = <0>; - }; -- GitLab From 6adbfbab0f039bb89edb9d3ad0d9ac8a18efa6db Mon Sep 17 00:00:00 2001 From: Philippe Boos Date: Mon, 1 Aug 2022 11:21:50 +0200 Subject: [PATCH 0829/2223] watchdog: meson: keep running if already active If the watchdog is already running (e.g.: started by bootloader) then the kernel driver should keep the watchdog active but the amlogic driver turns it off. Let the driver fix the clock rate if already active because we do not know the previous timebase value. To avoid unintentional resetting we temporarily set it to its maximum value. Then keep the enable bit if is was previously active. 
Signed-off-by: Philippe Boos Reviewed-by: Jerome Brunet Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220801092150.4449-1-pboos@baylibre.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/meson_gxbb_wdt.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/watchdog/meson_gxbb_wdt.c b/drivers/watchdog/meson_gxbb_wdt.c index d3c9e2f6e63b9..981a2f7c3bec2 100644 --- a/drivers/watchdog/meson_gxbb_wdt.c +++ b/drivers/watchdog/meson_gxbb_wdt.c @@ -156,6 +156,7 @@ static int meson_gxbb_wdt_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct meson_gxbb_wdt *data; int ret; + u32 ctrl_reg; data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); if (!data) @@ -189,13 +190,26 @@ static int meson_gxbb_wdt_probe(struct platform_device *pdev) watchdog_set_nowayout(&data->wdt_dev, nowayout); watchdog_set_drvdata(&data->wdt_dev, data); + ctrl_reg = readl(data->reg_base + GXBB_WDT_CTRL_REG) & + GXBB_WDT_CTRL_EN; + + if (ctrl_reg) { + /* Watchdog is running - keep it running but extend timeout + * to the maximum while setting the timebase + */ + set_bit(WDOG_HW_RUNNING, &data->wdt_dev.status); + meson_gxbb_wdt_set_timeout(&data->wdt_dev, + GXBB_WDT_TCNT_SETUP_MASK / 1000); + } + /* Setup with 1ms timebase */ - writel(((clk_get_rate(data->clk) / 1000) & GXBB_WDT_CTRL_DIV_MASK) | - GXBB_WDT_CTRL_EE_RESET | - GXBB_WDT_CTRL_CLK_EN | - GXBB_WDT_CTRL_CLKDIV_EN, - data->reg_base + GXBB_WDT_CTRL_REG); + ctrl_reg |= ((clk_get_rate(data->clk) / 1000) & + GXBB_WDT_CTRL_DIV_MASK) | + GXBB_WDT_CTRL_EE_RESET | + GXBB_WDT_CTRL_CLK_EN | + GXBB_WDT_CTRL_CLKDIV_EN; + writel(ctrl_reg, data->reg_base + GXBB_WDT_CTRL_REG); meson_gxbb_wdt_set_timeout(&data->wdt_dev, data->wdt_dev.timeout); return devm_watchdog_register_device(dev, &data->wdt_dev); -- GitLab From af084fdccfafa79ad30a6a42c8eced79b71fb0e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Fri, 10 Jun 
2022 09:21:37 +0200 Subject: [PATCH 0830/2223] watchdog: npcm: Enable clock if provided MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On the Nuvoton WPCM450 SoC, with its upcoming clock driver, peripheral clocks are individually gated and ungated. Therefore, the watchdog driver must be able to ungate the watchdog clock. Signed-off-by: Jonathan Neuschäfer Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220610072141.347795-3-j.neuschaefer@gmx.net Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/npcm_wdt.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/watchdog/npcm_wdt.c b/drivers/watchdog/npcm_wdt.c index 28a24caa2627c..a5dd1c2301374 100644 --- a/drivers/watchdog/npcm_wdt.c +++ b/drivers/watchdog/npcm_wdt.c @@ -3,6 +3,7 @@ // Copyright (c) 2018 IBM Corp. #include +#include #include #include #include @@ -43,6 +44,7 @@ struct npcm_wdt { struct watchdog_device wdd; void __iomem *reg; + struct clk *clk; }; static inline struct npcm_wdt *to_npcm_wdt(struct watchdog_device *wdd) @@ -66,6 +68,9 @@ static int npcm_wdt_start(struct watchdog_device *wdd) struct npcm_wdt *wdt = to_npcm_wdt(wdd); u32 val; + if (wdt->clk) + clk_prepare_enable(wdt->clk); + if (wdd->timeout < 2) val = 0x800; else if (wdd->timeout < 3) @@ -100,6 +105,9 @@ static int npcm_wdt_stop(struct watchdog_device *wdd) writel(0, wdt->reg); + if (wdt->clk) + clk_disable_unprepare(wdt->clk); + return 0; } @@ -147,6 +155,10 @@ static int npcm_wdt_restart(struct watchdog_device *wdd, { struct npcm_wdt *wdt = to_npcm_wdt(wdd); + /* For reset, we start the WDT clock and leave it running. 
*/ + if (wdt->clk) + clk_prepare_enable(wdt->clk); + writel(NPCM_WTR | NPCM_WTRE | NPCM_WTE, wdt->reg); udelay(1000); @@ -191,6 +203,10 @@ static int npcm_wdt_probe(struct platform_device *pdev) if (IS_ERR(wdt->reg)) return PTR_ERR(wdt->reg); + wdt->clk = devm_clk_get_optional(&pdev->dev, NULL); + if (IS_ERR(wdt->clk)) + return PTR_ERR(wdt->clk); + irq = platform_get_irq(pdev, 0); if (irq < 0) return irq; -- GitLab From eadf8c4c737f6e88e1b1e86f2f52ea5acf28bb04 Mon Sep 17 00:00:00 2001 From: Chanho Park Date: Fri, 20 May 2022 21:17:47 +0900 Subject: [PATCH 0831/2223] dt-bindings: watchdog: add exynosautov9 compatible Adds "samsung,exynosautov9-wdt" to samsung-wdt compatible. This has two cpu watchdogs like exynos850. Signed-off-by: Chanho Park Reviewed-by: Krzysztof Kozlowski Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220520121750.71473-2-chanho61.park@samsung.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/devicetree/bindings/watchdog/samsung-wdt.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/watchdog/samsung-wdt.yaml b/Documentation/devicetree/bindings/watchdog/samsung-wdt.yaml index b08373336b161..8fb6656ba0c28 100644 --- a/Documentation/devicetree/bindings/watchdog/samsung-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/samsung-wdt.yaml @@ -23,6 +23,7 @@ properties: - samsung,exynos5420-wdt # for Exynos5420 - samsung,exynos7-wdt # for Exynos7 - samsung,exynos850-wdt # for Exynos850 + - samsung,exynosautov9-wdt # for Exynosautov9 reg: maxItems: 1 @@ -67,6 +68,7 @@ allOf: - samsung,exynos5420-wdt - samsung,exynos7-wdt - samsung,exynos850-wdt + - samsung,exynosautov9-wdt then: required: - samsung,syscon-phandle @@ -76,6 +78,7 @@ allOf: contains: enum: - samsung,exynos850-wdt + - samsung,exynosautov9-wdt then: properties: clocks: -- GitLab From 0c91aa185a63324183c67eff2d3bb2af605f05a7 Mon Sep 17 00:00:00 2001 From: Chanho Park Date: Fri, 20 May 2022 21:17:48 
+0900 Subject: [PATCH 0832/2223] watchdog: s3c2410_wdt: support exynosautov9 watchdog Like exynos850, exynosautov9 SoC also has two cpu watchdogs. Unfortunately, some configurations are slightly different so we need to add samsung,exynosautov9-wdt and separate drv data for those watchdogs. Signed-off-by: Chanho Park Reviewed-by: Krzysztof Kozlowski Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220520121750.71473-3-chanho61.park@samsung.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/s3c2410_wdt.c | 41 ++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/drivers/watchdog/s3c2410_wdt.c b/drivers/watchdog/s3c2410_wdt.c index 95919392927fc..d3fc8ed886fff 100644 --- a/drivers/watchdog/s3c2410_wdt.c +++ b/drivers/watchdog/s3c2410_wdt.c @@ -60,9 +60,13 @@ #define EXYNOS850_CLUSTER0_NONCPU_INT_EN 0x1244 #define EXYNOS850_CLUSTER1_NONCPU_OUT 0x1620 #define EXYNOS850_CLUSTER1_NONCPU_INT_EN 0x1644 +#define EXYNOSAUTOV9_CLUSTER1_NONCPU_OUT 0x1520 +#define EXYNOSAUTOV9_CLUSTER1_NONCPU_INT_EN 0x1544 #define EXYNOS850_CLUSTER0_WDTRESET_BIT 24 #define EXYNOS850_CLUSTER1_WDTRESET_BIT 23 +#define EXYNOSAUTOV9_CLUSTER0_WDTRESET_BIT 25 +#define EXYNOSAUTOV9_CLUSTER1_WDTRESET_BIT 24 /** * DOC: Quirk flags for different Samsung watchdog IP-cores @@ -236,6 +240,30 @@ static const struct s3c2410_wdt_variant drv_data_exynos850_cl1 = { QUIRK_HAS_PMU_RST_STAT | QUIRK_HAS_PMU_CNT_EN, }; +static const struct s3c2410_wdt_variant drv_data_exynosautov9_cl0 = { + .mask_reset_reg = EXYNOS850_CLUSTER0_NONCPU_INT_EN, + .mask_bit = 2, + .mask_reset_inv = true, + .rst_stat_reg = EXYNOS5_RST_STAT_REG_OFFSET, + .rst_stat_bit = EXYNOSAUTOV9_CLUSTER0_WDTRESET_BIT, + .cnt_en_reg = EXYNOS850_CLUSTER0_NONCPU_OUT, + .cnt_en_bit = 7, + .quirks = QUIRK_HAS_WTCLRINT_REG | QUIRK_HAS_PMU_MASK_RESET | + QUIRK_HAS_PMU_RST_STAT | QUIRK_HAS_PMU_CNT_EN, +}; + +static const struct s3c2410_wdt_variant drv_data_exynosautov9_cl1 
= { + .mask_reset_reg = EXYNOSAUTOV9_CLUSTER1_NONCPU_INT_EN, + .mask_bit = 2, + .mask_reset_inv = true, + .rst_stat_reg = EXYNOS5_RST_STAT_REG_OFFSET, + .rst_stat_bit = EXYNOSAUTOV9_CLUSTER1_WDTRESET_BIT, + .cnt_en_reg = EXYNOSAUTOV9_CLUSTER1_NONCPU_OUT, + .cnt_en_bit = 7, + .quirks = QUIRK_HAS_WTCLRINT_REG | QUIRK_HAS_PMU_MASK_RESET | + QUIRK_HAS_PMU_RST_STAT | QUIRK_HAS_PMU_CNT_EN, +}; + static const struct of_device_id s3c2410_wdt_match[] = { { .compatible = "samsung,s3c2410-wdt", .data = &drv_data_s3c2410 }, @@ -249,6 +277,8 @@ static const struct of_device_id s3c2410_wdt_match[] = { .data = &drv_data_exynos7 }, { .compatible = "samsung,exynos850-wdt", .data = &drv_data_exynos850_cl0 }, + { .compatible = "samsung,exynosautov9-wdt", + .data = &drv_data_exynosautov9_cl0 }, {}, }; MODULE_DEVICE_TABLE(of, s3c2410_wdt_match); @@ -630,8 +660,9 @@ s3c2410_get_wdt_drv_data(struct platform_device *pdev) } #ifdef CONFIG_OF - /* Choose Exynos850 driver data w.r.t. cluster index */ - if (variant == &drv_data_exynos850_cl0) { + /* Choose Exynos850/ExynosAutov9 driver data w.r.t. cluster index */ + if (variant == &drv_data_exynos850_cl0 || + variant == &drv_data_exynosautov9_cl0) { u32 index; int err; @@ -644,9 +675,11 @@ s3c2410_get_wdt_drv_data(struct platform_device *pdev) switch (index) { case 0: - return &drv_data_exynos850_cl0; + return variant; case 1: - return &drv_data_exynos850_cl1; + return (variant == &drv_data_exynos850_cl0) ? + &drv_data_exynos850_cl1 : + &drv_data_exynosautov9_cl1; default: dev_err(dev, "wrong cluster index: %u\n", index); return NULL; -- GitLab From 5946401e25b36f755f401447e3ffb6d0e6a3769a Mon Sep 17 00:00:00 2001 From: Johan Jonker Date: Sat, 10 Sep 2022 00:01:56 +0200 Subject: [PATCH 0833/2223] dt-bindings: watchdog: rockchip: add rockchip,rk3128-wdt Add rockchip,rk3128-wdt compatible string. 
Signed-off-by: Johan Jonker Reviewed-by: Guenter Roeck Acked-by: Rob Herring Link: https://lore.kernel.org/r/a4da79fe-3449-6538-742f-790835ffe43a@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/devicetree/bindings/watchdog/snps,dw-wdt.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/watchdog/snps,dw-wdt.yaml b/Documentation/devicetree/bindings/watchdog/snps,dw-wdt.yaml index 6461eb4f4a278..92df6e453f64b 100644 --- a/Documentation/devicetree/bindings/watchdog/snps,dw-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/snps,dw-wdt.yaml @@ -20,6 +20,7 @@ properties: - enum: - rockchip,px30-wdt - rockchip,rk3066-wdt + - rockchip,rk3128-wdt - rockchip,rk3188-wdt - rockchip,rk3228-wdt - rockchip,rk3288-wdt -- GitLab From a1f136fd8725243a69681e4e20e29f7b2043ad93 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Tue, 2 Aug 2022 07:46:43 +0000 Subject: [PATCH 0834/2223] watchdog: rti-wdt:using the pm_runtime_resume_and_get to simplify the code Use pm_runtime_resume_and_get() instead of pm_runtime_get_sync() and pm_runtime_put_noidle().
Reported-by: Zeal Robot Signed-off-by: ye xingchen Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220802074643.1648660-1-ye.xingchen@zte.com.cn Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/rti_wdt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/watchdog/rti_wdt.c b/drivers/watchdog/rti_wdt.c index 053ef3bde12d4..6e9253761fc10 100644 --- a/drivers/watchdog/rti_wdt.c +++ b/drivers/watchdog/rti_wdt.c @@ -225,9 +225,8 @@ static int rti_wdt_probe(struct platform_device *pdev) wdt->freq = wdt->freq * 9 / 10; pm_runtime_enable(dev); - ret = pm_runtime_get_sync(dev); + ret = pm_runtime_resume_and_get(dev); if (ret < 0) { - pm_runtime_put_noidle(dev); pm_runtime_disable(&pdev->dev); return dev_err_probe(dev, ret, "runtime pm failed\n"); } -- GitLab From f182683333b5d8ac4af64517b6e3c444c4579e6e Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Thu, 25 Aug 2022 16:32:50 +0800 Subject: [PATCH 0835/2223] watchdog: imx7ulp: Move suspend/resume to noirq phase The i.MX7ULP's watchdog is enabled by default when out of reset, so the resume callback which is to disable watchdog should be called earlier to avoid unexpected timeout, move suspend/resume callback to noirq phase. 
Signed-off-by: Anson Huang Signed-off-by: Alice Guo Reviewed-by: Jacky Bai Tested-by: Peter Chen Tested-by: Li Jun Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220825083256.14565-2-alice.guo@oss.nxp.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/imx7ulp_wdt.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/watchdog/imx7ulp_wdt.c b/drivers/watchdog/imx7ulp_wdt.c index 922b603742952..014f497ea0dc7 100644 --- a/drivers/watchdog/imx7ulp_wdt.c +++ b/drivers/watchdog/imx7ulp_wdt.c @@ -255,7 +255,7 @@ static int imx7ulp_wdt_probe(struct platform_device *pdev) return devm_watchdog_register_device(dev, wdog); } -static int __maybe_unused imx7ulp_wdt_suspend(struct device *dev) +static int __maybe_unused imx7ulp_wdt_suspend_noirq(struct device *dev) { struct imx7ulp_wdt_device *imx7ulp_wdt = dev_get_drvdata(dev); @@ -267,7 +267,7 @@ static int __maybe_unused imx7ulp_wdt_suspend(struct device *dev) return 0; } -static int __maybe_unused imx7ulp_wdt_resume(struct device *dev) +static int __maybe_unused imx7ulp_wdt_resume_noirq(struct device *dev) { struct imx7ulp_wdt_device *imx7ulp_wdt = dev_get_drvdata(dev); u32 timeout = imx7ulp_wdt->wdd.timeout * WDOG_CLOCK_RATE; @@ -286,8 +286,10 @@ static int __maybe_unused imx7ulp_wdt_resume(struct device *dev) return 0; } -static SIMPLE_DEV_PM_OPS(imx7ulp_wdt_pm_ops, imx7ulp_wdt_suspend, - imx7ulp_wdt_resume); +static const struct dev_pm_ops imx7ulp_wdt_pm_ops = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(imx7ulp_wdt_suspend_noirq, + imx7ulp_wdt_resume_noirq) +}; static const struct of_device_id imx7ulp_wdt_dt_ids[] = { { .compatible = "fsl,imx7ulp-wdt", }, -- GitLab From 6371593fbad75cfb9ee14e8b462a5ebb1aa38c02 Mon Sep 17 00:00:00 2001 From: Jacky Bai Date: Thu, 25 Aug 2022 16:32:51 +0800 Subject: [PATCH 0836/2223] watchdog: imx7ulp: Add explict memory barrier for unlock sequence When reconfiguring the WDOG Timer of i.MX7ULP, there is a certain 
probability causes it to reset. The reason is that the CMD32EN of the WDOG Timer of i.MX7ULP is disabled in bootloader. The unlock sequence are two 16-bit writes to the CNT register within 16 bus clocks. Adding mb() is to guarantee that two 16-bit writes are finished within 16 bus clocks. Memory barriers cannot be added between these two 16-bit writes so that writel_relaxed is used. Suggested-by: Ye Li Signed-off-by: Jacky Bai Signed-off-by: Alice Guo Reviewed-by: Ye Li Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220825083256.14565-3-alice.guo@oss.nxp.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/imx7ulp_wdt.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/watchdog/imx7ulp_wdt.c b/drivers/watchdog/imx7ulp_wdt.c index 014f497ea0dc7..b8ac0cb04d2f1 100644 --- a/drivers/watchdog/imx7ulp_wdt.c +++ b/drivers/watchdog/imx7ulp_wdt.c @@ -179,9 +179,13 @@ static int imx7ulp_wdt_init(void __iomem *base, unsigned int timeout) int ret; local_irq_disable(); + + mb(); /* unlock the wdog for reconfiguration */ writel_relaxed(UNLOCK_SEQ0, base + WDOG_CNT); writel_relaxed(UNLOCK_SEQ1, base + WDOG_CNT); + mb(); + ret = imx7ulp_wdt_wait(base, WDOG_CS_ULK); if (ret) goto init_out; -- GitLab From e809daec17572216d91b6c41a8e04f9bb24d00a5 Mon Sep 17 00:00:00 2001 From: Ye Li Date: Thu, 25 Aug 2022 16:32:52 +0800 Subject: [PATCH 0837/2223] watchdog: imx7ulp_wdt: Check CMD32EN in wdog init When bootloader has enabled the CMD32EN bit, switch to use 32bits unlock command to unlock the CS register. Using 32bits command will help on avoiding 16 bus cycle window violation for two 16 bits commands. 
Signed-off-by: Ye Li Signed-off-by: Alice Guo Reviewed-by: Jacky Bai Acked-by: Jason Liu Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220825083256.14565-4-alice.guo@oss.nxp.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/imx7ulp_wdt.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/watchdog/imx7ulp_wdt.c b/drivers/watchdog/imx7ulp_wdt.c index b8ac0cb04d2f1..a0f6b8cea78f8 100644 --- a/drivers/watchdog/imx7ulp_wdt.c +++ b/drivers/watchdog/imx7ulp_wdt.c @@ -180,11 +180,16 @@ static int imx7ulp_wdt_init(void __iomem *base, unsigned int timeout) local_irq_disable(); - mb(); - /* unlock the wdog for reconfiguration */ - writel_relaxed(UNLOCK_SEQ0, base + WDOG_CNT); - writel_relaxed(UNLOCK_SEQ1, base + WDOG_CNT); - mb(); + val = readl(base + WDOG_CS); + if (val & WDOG_CS_CMD32EN) { + writel(UNLOCK, base + WDOG_CNT); + } else { + mb(); + /* unlock the wdog for reconfiguration */ + writel_relaxed(UNLOCK_SEQ0, base + WDOG_CNT); + writel_relaxed(UNLOCK_SEQ1, base + WDOG_CNT); + mb(); + } ret = imx7ulp_wdt_wait(base, WDOG_CS_ULK); if (ret) -- GitLab From 52c4d05113264aa406d8d33751f09178e2476177 Mon Sep 17 00:00:00 2001 From: Ye Li Date: Thu, 25 Aug 2022 16:32:53 +0800 Subject: [PATCH 0838/2223] watchdog: imx7ulp_wdt: Fix RCS timeout issue According to measure on i.MX7ULP and i.MX8ULP, the RCS done needs about 3400us and 6700us respectively. So current 20us timeout is not enough. When reconfiguring is on-going, unlock and configure CS will lead to unknown result. 
Increase the wait timeout value to 10ms and check the return value of RCS wait to fix the issue Signed-off-by: Ye Li Signed-off-by: Alice Guo Reviewed-by: Jacky Bai Acked-by: Jason Liu Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220825083256.14565-5-alice.guo@oss.nxp.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/imx7ulp_wdt.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/watchdog/imx7ulp_wdt.c b/drivers/watchdog/imx7ulp_wdt.c index a0f6b8cea78f8..12715c2486880 100644 --- a/drivers/watchdog/imx7ulp_wdt.c +++ b/drivers/watchdog/imx7ulp_wdt.c @@ -39,7 +39,7 @@ #define DEFAULT_TIMEOUT 60 #define MAX_TIMEOUT 128 #define WDOG_CLOCK_RATE 1000 -#define WDOG_WAIT_TIMEOUT 20 +#define WDOG_WAIT_TIMEOUT 10000 static bool nowayout = WATCHDOG_NOWAYOUT; module_param(nowayout, bool, 0000); @@ -80,7 +80,7 @@ static int imx7ulp_wdt_enable(struct watchdog_device *wdog, bool enable) writel(val | WDOG_CS_EN, wdt->base + WDOG_CS); else writel(val & ~WDOG_CS_EN, wdt->base + WDOG_CS); - imx7ulp_wdt_wait(wdt->base, WDOG_CS_RCS); + ret = imx7ulp_wdt_wait(wdt->base, WDOG_CS_RCS); enable_out: local_irq_enable(); @@ -127,7 +127,9 @@ static int imx7ulp_wdt_set_timeout(struct watchdog_device *wdog, if (ret) goto timeout_out; writel(val, wdt->base + WDOG_TOVAL); - imx7ulp_wdt_wait(wdt->base, WDOG_CS_RCS); + ret = imx7ulp_wdt_wait(wdt->base, WDOG_CS_RCS); + if (ret) + goto timeout_out; wdog->timeout = timeout; -- GitLab From c32b53f965edcab53e16a2dea02d34e1c2c8173c Mon Sep 17 00:00:00 2001 From: Ye Li Date: Thu, 25 Aug 2022 16:32:54 +0800 Subject: [PATCH 0839/2223] watchdog: imx7ulp_wdt: Handle wdog reconfigure failure Current driver may meet reconfigure failure caused by below reasons: 1. The wdog on iMX7ULP has different behavior after RCS valid. It needs to wait more than 2.5 wdog clock for clock sync before next reconfiguration, while imx8ulp wdog does not need such delay. 2. 
After unlock, there is 128 bus clock window opened for reconfiguration, but on iMX8ULP, the HW can't guarantee the latency. So it is possible the window is closed before the writing arrives to wdog. 3. If the PRES is enabled, the RCS valid time becomes x256 to the time of PRES disabled. It is about 1715ms on iMX8ULP. So We have to increase the RCS timeout and can't wait it in IRQ disabled. The patch updates the driver to handle failures 1. Using different wait for unlock and RCS. Unlock valid time is very short and only related to bus clock. It must be in IRQ disabled to avoid being interrupted in 128 clock window. But for RCS time, it is longer and ok for IRQ enabled. 2. Add retry for any reconfigure failure with default 5 times. 3. Add "fsl,imx8ulp-wdt" compatile string for iMX8ULP and afterwards platform which don't need more 2.5 wdog clock after RCS valid. For imx7ulp, add post delay of 2.5 clock after RCS valid. Signed-off-by: Ye Li Signed-off-by: Alice Guo Reviewed-by: Jacky Bai Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220825083256.14565-6-alice.guo@oss.nxp.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/imx7ulp_wdt.c | 163 ++++++++++++++++++++++++++------- 1 file changed, 129 insertions(+), 34 deletions(-) diff --git a/drivers/watchdog/imx7ulp_wdt.c b/drivers/watchdog/imx7ulp_wdt.c index 12715c2486880..0cafa86fff7f9 100644 --- a/drivers/watchdog/imx7ulp_wdt.c +++ b/drivers/watchdog/imx7ulp_wdt.c @@ -14,7 +14,9 @@ #include #define WDOG_CS 0x0 +#define WDOG_CS_FLG BIT(14) #define WDOG_CS_CMD32EN BIT(13) +#define WDOG_CS_PRES BIT(12) #define WDOG_CS_ULK BIT(11) #define WDOG_CS_RCS BIT(10) #define LPO_CLK 0x1 @@ -39,7 +41,11 @@ #define DEFAULT_TIMEOUT 60 #define MAX_TIMEOUT 128 #define WDOG_CLOCK_RATE 1000 -#define WDOG_WAIT_TIMEOUT 10000 +#define WDOG_ULK_WAIT_TIMEOUT 1000 +#define WDOG_RCS_WAIT_TIMEOUT 10000 +#define WDOG_RCS_POST_WAIT 3000 + +#define RETRY_MAX 5 static bool nowayout = 
WATCHDOG_NOWAYOUT; module_param(nowayout, bool, 0000); @@ -50,40 +56,82 @@ struct imx7ulp_wdt_device { struct watchdog_device wdd; void __iomem *base; struct clk *clk; + bool post_rcs_wait; }; -static int imx7ulp_wdt_wait(void __iomem *base, u32 mask) +static int imx7ulp_wdt_wait_ulk(void __iomem *base) { u32 val = readl(base + WDOG_CS); - if (!(val & mask) && readl_poll_timeout_atomic(base + WDOG_CS, val, - val & mask, 0, - WDOG_WAIT_TIMEOUT)) + if (!(val & WDOG_CS_ULK) && + readl_poll_timeout_atomic(base + WDOG_CS, val, + val & WDOG_CS_ULK, 0, + WDOG_ULK_WAIT_TIMEOUT)) return -ETIMEDOUT; return 0; } -static int imx7ulp_wdt_enable(struct watchdog_device *wdog, bool enable) +static int imx7ulp_wdt_wait_rcs(struct imx7ulp_wdt_device *wdt) { - struct imx7ulp_wdt_device *wdt = watchdog_get_drvdata(wdog); + int ret = 0; + u32 val = readl(wdt->base + WDOG_CS); + u64 timeout = (val & WDOG_CS_PRES) ? + WDOG_RCS_WAIT_TIMEOUT * 256 : WDOG_RCS_WAIT_TIMEOUT; + unsigned long wait_min = (val & WDOG_CS_PRES) ? 
+ WDOG_RCS_POST_WAIT * 256 : WDOG_RCS_POST_WAIT; + if (!(val & WDOG_CS_RCS) && + readl_poll_timeout(wdt->base + WDOG_CS, val, val & WDOG_CS_RCS, 100, + timeout)) + ret = -ETIMEDOUT; + + /* Wait 2.5 clocks after RCS done */ + if (wdt->post_rcs_wait) + usleep_range(wait_min, wait_min + 2000); + + return ret; +} + +static int _imx7ulp_wdt_enable(struct imx7ulp_wdt_device *wdt, bool enable) +{ u32 val = readl(wdt->base + WDOG_CS); int ret; local_irq_disable(); writel(UNLOCK, wdt->base + WDOG_CNT); - ret = imx7ulp_wdt_wait(wdt->base, WDOG_CS_ULK); + ret = imx7ulp_wdt_wait_ulk(wdt->base); if (ret) goto enable_out; if (enable) writel(val | WDOG_CS_EN, wdt->base + WDOG_CS); else writel(val & ~WDOG_CS_EN, wdt->base + WDOG_CS); - ret = imx7ulp_wdt_wait(wdt->base, WDOG_CS_RCS); + + local_irq_enable(); + ret = imx7ulp_wdt_wait_rcs(wdt); + + return ret; enable_out: local_irq_enable(); + return ret; +} + +static int imx7ulp_wdt_enable(struct watchdog_device *wdog, bool enable) +{ + struct imx7ulp_wdt_device *wdt = watchdog_get_drvdata(wdog); + int ret; + u32 val; + u32 loop = RETRY_MAX; + + do { + ret = _imx7ulp_wdt_enable(wdt, enable); + val = readl(wdt->base + WDOG_CS); + } while (--loop > 0 && ((!!(val & WDOG_CS_EN)) != enable || ret)); + + if (loop == 0) + return -EBUSY; return ret; } @@ -114,28 +162,44 @@ static int imx7ulp_wdt_stop(struct watchdog_device *wdog) return imx7ulp_wdt_enable(wdog, false); } -static int imx7ulp_wdt_set_timeout(struct watchdog_device *wdog, - unsigned int timeout) +static int _imx7ulp_wdt_set_timeout(struct imx7ulp_wdt_device *wdt, + unsigned int toval) { - struct imx7ulp_wdt_device *wdt = watchdog_get_drvdata(wdog); - u32 val = WDOG_CLOCK_RATE * timeout; int ret; local_irq_disable(); writel(UNLOCK, wdt->base + WDOG_CNT); - ret = imx7ulp_wdt_wait(wdt->base, WDOG_CS_ULK); + ret = imx7ulp_wdt_wait_ulk(wdt->base); if (ret) goto timeout_out; - writel(val, wdt->base + WDOG_TOVAL); - ret = imx7ulp_wdt_wait(wdt->base, WDOG_CS_RCS); - if (ret) - goto 
timeout_out; - - wdog->timeout = timeout; + writel(toval, wdt->base + WDOG_TOVAL); + local_irq_enable(); + ret = imx7ulp_wdt_wait_rcs(wdt); + return ret; timeout_out: local_irq_enable(); + return ret; +} +static int imx7ulp_wdt_set_timeout(struct watchdog_device *wdog, + unsigned int timeout) +{ + struct imx7ulp_wdt_device *wdt = watchdog_get_drvdata(wdog); + u32 toval = WDOG_CLOCK_RATE * timeout; + u32 val; + int ret; + u32 loop = RETRY_MAX; + + do { + ret = _imx7ulp_wdt_set_timeout(wdt, toval); + val = readl(wdt->base + WDOG_TOVAL); + } while (--loop > 0 && (val != toval || ret)); + + if (loop == 0) + return -EBUSY; + + wdog->timeout = timeout; return ret; } @@ -175,38 +239,59 @@ static const struct watchdog_info imx7ulp_wdt_info = { WDIOF_MAGICCLOSE, }; -static int imx7ulp_wdt_init(void __iomem *base, unsigned int timeout) +static int _imx7ulp_wdt_init(struct imx7ulp_wdt_device *wdt, unsigned int timeout, unsigned int cs) { u32 val; int ret; local_irq_disable(); - val = readl(base + WDOG_CS); + val = readl(wdt->base + WDOG_CS); if (val & WDOG_CS_CMD32EN) { - writel(UNLOCK, base + WDOG_CNT); + writel(UNLOCK, wdt->base + WDOG_CNT); } else { mb(); /* unlock the wdog for reconfiguration */ - writel_relaxed(UNLOCK_SEQ0, base + WDOG_CNT); - writel_relaxed(UNLOCK_SEQ1, base + WDOG_CNT); + writel_relaxed(UNLOCK_SEQ0, wdt->base + WDOG_CNT); + writel_relaxed(UNLOCK_SEQ1, wdt->base + WDOG_CNT); mb(); } - ret = imx7ulp_wdt_wait(base, WDOG_CS_ULK); + ret = imx7ulp_wdt_wait_ulk(wdt->base); if (ret) goto init_out; /* set an initial timeout value in TOVAL */ - writel(timeout, base + WDOG_TOVAL); - /* enable 32bit command sequence and reconfigure */ - val = WDOG_CS_CMD32EN | WDOG_CS_CLK | WDOG_CS_UPDATE | - WDOG_CS_WAIT | WDOG_CS_STOP; - writel(val, base + WDOG_CS); - imx7ulp_wdt_wait(base, WDOG_CS_RCS); + writel(timeout, wdt->base + WDOG_TOVAL); + writel(cs, wdt->base + WDOG_CS); + local_irq_enable(); + ret = imx7ulp_wdt_wait_rcs(wdt); + + return ret; init_out: 
local_irq_enable(); + return ret; +} + +static int imx7ulp_wdt_init(struct imx7ulp_wdt_device *wdt, unsigned int timeout) +{ + /* enable 32bit command sequence and reconfigure */ + u32 val = WDOG_CS_CMD32EN | WDOG_CS_CLK | WDOG_CS_UPDATE | + WDOG_CS_WAIT | WDOG_CS_STOP; + u32 cs, toval; + int ret; + u32 loop = RETRY_MAX; + + do { + ret = _imx7ulp_wdt_init(wdt, timeout, val); + toval = readl(wdt->base + WDOG_TOVAL); + cs = readl(wdt->base + WDOG_CS); + cs &= ~(WDOG_CS_FLG | WDOG_CS_ULK | WDOG_CS_RCS); + } while (--loop > 0 && (cs != val || toval != timeout || ret)); + + if (loop == 0) + return -EBUSY; return ret; } @@ -239,6 +324,15 @@ static int imx7ulp_wdt_probe(struct platform_device *pdev) return PTR_ERR(imx7ulp_wdt->clk); } + imx7ulp_wdt->post_rcs_wait = true; + if (of_device_is_compatible(dev->of_node, + "fsl,imx8ulp-wdt")) { + dev_info(dev, "imx8ulp wdt probe\n"); + imx7ulp_wdt->post_rcs_wait = false; + } else { + dev_info(dev, "imx7ulp wdt probe\n"); + } + ret = clk_prepare_enable(imx7ulp_wdt->clk); if (ret) return ret; @@ -259,7 +353,7 @@ static int imx7ulp_wdt_probe(struct platform_device *pdev) watchdog_stop_on_reboot(wdog); watchdog_stop_on_unregister(wdog); watchdog_set_drvdata(wdog, imx7ulp_wdt); - ret = imx7ulp_wdt_init(imx7ulp_wdt->base, wdog->timeout * WDOG_CLOCK_RATE); + ret = imx7ulp_wdt_init(imx7ulp_wdt, wdog->timeout * WDOG_CLOCK_RATE); if (ret) return ret; @@ -289,7 +383,7 @@ static int __maybe_unused imx7ulp_wdt_resume_noirq(struct device *dev) return ret; if (imx7ulp_wdt_is_enabled(imx7ulp_wdt->base)) - imx7ulp_wdt_init(imx7ulp_wdt->base, timeout); + imx7ulp_wdt_init(imx7ulp_wdt, timeout); if (watchdog_active(&imx7ulp_wdt->wdd)) imx7ulp_wdt_start(&imx7ulp_wdt->wdd); @@ -303,6 +397,7 @@ static const struct dev_pm_ops imx7ulp_wdt_pm_ops = { }; static const struct of_device_id imx7ulp_wdt_dt_ids[] = { + { .compatible = "fsl,imx8ulp-wdt", }, { .compatible = "fsl,imx7ulp-wdt", }, { /* sentinel */ } }; -- GitLab From 
cef6bc98d50da24252fb289759f1790e17afa448 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Thu, 25 Aug 2022 16:32:55 +0800 Subject: [PATCH 0840/2223] watchdog: imx7ulp_wdt: init wdog when it was active Paired with suspend, we can only init wdog again when it was active and ping it once to avoid the watchdog timeout after it resumed. Signed-off-by: Jason Liu Signed-off-by: Alice Guo Reviewed-by: Ye Li Reviewed-by: Jacky Bai Tested-by: Jacky Bai Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220825083256.14565-7-alice.guo@oss.nxp.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/imx7ulp_wdt.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/watchdog/imx7ulp_wdt.c b/drivers/watchdog/imx7ulp_wdt.c index 0cafa86fff7f9..dee02c2a52c95 100644 --- a/drivers/watchdog/imx7ulp_wdt.c +++ b/drivers/watchdog/imx7ulp_wdt.c @@ -136,13 +136,6 @@ static int imx7ulp_wdt_enable(struct watchdog_device *wdog, bool enable) return ret; } -static bool imx7ulp_wdt_is_enabled(void __iomem *base) -{ - u32 val = readl(base + WDOG_CS); - - return val & WDOG_CS_EN; -} - static int imx7ulp_wdt_ping(struct watchdog_device *wdog) { struct imx7ulp_wdt_device *wdt = watchdog_get_drvdata(wdog); @@ -382,11 +375,11 @@ static int __maybe_unused imx7ulp_wdt_resume_noirq(struct device *dev) if (ret) return ret; - if (imx7ulp_wdt_is_enabled(imx7ulp_wdt->base)) + if (watchdog_active(&imx7ulp_wdt->wdd)) { imx7ulp_wdt_init(imx7ulp_wdt, timeout); - - if (watchdog_active(&imx7ulp_wdt->wdd)) imx7ulp_wdt_start(&imx7ulp_wdt->wdd); + imx7ulp_wdt_ping(&imx7ulp_wdt->wdd); + } return 0; } -- GitLab From 8ed2dc48551354bbf33df869f3968b7805cbaa61 Mon Sep 17 00:00:00 2001 From: Alice Guo Date: Thu, 25 Aug 2022 16:32:56 +0800 Subject: [PATCH 0841/2223] watchdog: imx93: add watchdog timer on imx93 The WDOG clocks are sourced from lpo_clk, and lpo_clk is the fixed 32KHz. 
TOVAL contains the 16-bit value used to set the timeout period of the watchdog. When the timeout period exceeds 2 seconds, the value written to the TOVAL register is larger than 16-bit can represent. Enabling watchdog prescaler can solve this problem. Two points need to be aware of: 1. watchdog prescaler enables a fixed 256 pre-scaling of watchdog counter reference clock 2. reconfiguration takes about 55ms on imx93 Reviewed-by: Jacky Bai Signed-off-by: Alice Guo Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220825083256.14565-8-alice.guo@oss.nxp.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/imx7ulp_wdt.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/drivers/watchdog/imx7ulp_wdt.c b/drivers/watchdog/imx7ulp_wdt.c index dee02c2a52c95..2897902090b39 100644 --- a/drivers/watchdog/imx7ulp_wdt.c +++ b/drivers/watchdog/imx7ulp_wdt.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -52,11 +53,17 @@ module_param(nowayout, bool, 0000); MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); +struct imx_wdt_hw_feature { + bool prescaler_enable; + u32 wdog_clock_rate; +}; + struct imx7ulp_wdt_device { struct watchdog_device wdd; void __iomem *base; struct clk *clk; bool post_rcs_wait; + const struct imx_wdt_hw_feature *hw; }; static int imx7ulp_wdt_wait_ulk(void __iomem *base) @@ -179,7 +186,7 @@ static int imx7ulp_wdt_set_timeout(struct watchdog_device *wdog, unsigned int timeout) { struct imx7ulp_wdt_device *wdt = watchdog_get_drvdata(wdog); - u32 toval = WDOG_CLOCK_RATE * timeout; + u32 toval = wdt->hw->wdog_clock_rate * timeout; u32 val; int ret; u32 loop = RETRY_MAX; @@ -276,6 +283,9 @@ static int imx7ulp_wdt_init(struct imx7ulp_wdt_device *wdt, unsigned int timeout int ret; u32 loop = RETRY_MAX; + if (wdt->hw->prescaler_enable) + val |= WDOG_CS_PRES; + do { ret = 
_imx7ulp_wdt_init(wdt, timeout, val); toval = readl(wdt->base + WDOG_TOVAL); @@ -346,7 +356,9 @@ static int imx7ulp_wdt_probe(struct platform_device *pdev) watchdog_stop_on_reboot(wdog); watchdog_stop_on_unregister(wdog); watchdog_set_drvdata(wdog, imx7ulp_wdt); - ret = imx7ulp_wdt_init(imx7ulp_wdt, wdog->timeout * WDOG_CLOCK_RATE); + + imx7ulp_wdt->hw = of_device_get_match_data(dev); + ret = imx7ulp_wdt_init(imx7ulp_wdt, wdog->timeout * imx7ulp_wdt->hw->wdog_clock_rate); if (ret) return ret; @@ -368,7 +380,7 @@ static int __maybe_unused imx7ulp_wdt_suspend_noirq(struct device *dev) static int __maybe_unused imx7ulp_wdt_resume_noirq(struct device *dev) { struct imx7ulp_wdt_device *imx7ulp_wdt = dev_get_drvdata(dev); - u32 timeout = imx7ulp_wdt->wdd.timeout * WDOG_CLOCK_RATE; + u32 timeout = imx7ulp_wdt->wdd.timeout * imx7ulp_wdt->hw->wdog_clock_rate; int ret; ret = clk_prepare_enable(imx7ulp_wdt->clk); @@ -389,9 +401,20 @@ static const struct dev_pm_ops imx7ulp_wdt_pm_ops = { imx7ulp_wdt_resume_noirq) }; +static const struct imx_wdt_hw_feature imx7ulp_wdt_hw = { + .prescaler_enable = false, + .wdog_clock_rate = 1000, +}; + +static const struct imx_wdt_hw_feature imx93_wdt_hw = { + .prescaler_enable = true, + .wdog_clock_rate = 125, +}; + static const struct of_device_id imx7ulp_wdt_dt_ids[] = { - { .compatible = "fsl,imx8ulp-wdt", }, - { .compatible = "fsl,imx7ulp-wdt", }, + { .compatible = "fsl,imx8ulp-wdt", .data = &imx7ulp_wdt_hw, }, + { .compatible = "fsl,imx7ulp-wdt", .data = &imx7ulp_wdt_hw, }, + { .compatible = "fsl,imx93-wdt", .data = &imx93_wdt_hw, }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, imx7ulp_wdt_dt_ids); -- GitLab From 081574f76d010532ff406d682f532ac410559a3b Mon Sep 17 00:00:00 2001 From: Vladimir Panteleev Date: Tue, 20 Sep 2022 09:27:21 +0000 Subject: [PATCH 0842/2223] watchdog: sp5100_tco: Add "action" module parameter Allow configuring the "action" bit, as documented in [1]. 
Previously, the only action supported by this module was to reset the system (0). It can now be configured to power off (1) instead. [1]: https://www.amd.com/system/files/TechDocs/44413.pdf Signed-off-by: Vladimir Panteleev Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220920092721.7686-1-git@vladimir.panteleev.md Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/sp5100_tco.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/sp5100_tco.c b/drivers/watchdog/sp5100_tco.c index ae54dd33e2336..fb426b7d81dac 100644 --- a/drivers/watchdog/sp5100_tco.c +++ b/drivers/watchdog/sp5100_tco.c @@ -65,6 +65,12 @@ static struct pci_dev *sp5100_tco_pci; /* module parameters */ +#define WATCHDOG_ACTION 0 +static bool action = WATCHDOG_ACTION; +module_param(action, bool, 0); +MODULE_PARM_DESC(action, "Action taken when watchdog expires, 0 to reset, 1 to poweroff (default=" + __MODULE_STRING(WATCHDOG_ACTION) ")"); + #define WATCHDOG_HEARTBEAT 60 /* 60 sec default heartbeat. */ static int heartbeat = WATCHDOG_HEARTBEAT; /* in seconds */ module_param(heartbeat, int, 0); @@ -297,8 +303,11 @@ static int sp5100_tco_timer_init(struct sp5100_tco *tco) if (val & SP5100_WDT_FIRED) wdd->bootstatus = WDIOF_CARDRESET; - /* Set watchdog action to reset the system */ - val &= ~SP5100_WDT_ACTION_RESET; + /* Set watchdog action */ + if (action) + val |= SP5100_WDT_ACTION_RESET; + else + val &= ~SP5100_WDT_ACTION_RESET; writel(val, SP5100_WDT_CONTROL(tco->tcobase)); /* Set a reasonable heartbeat before we stop the timer */ -- GitLab From 9023e05b7a5809593a7ea09896eee0bbb6ae1685 Mon Sep 17 00:00:00 2001 From: Sergio Paracuellos Date: Mon, 26 Sep 2022 18:25:49 +0200 Subject: [PATCH 0843/2223] dt-bindings: watchdog: migrate mt7621 text bindings to YAML Soc Mt7621 Watchdog bindings used text format, so migrate them to YAML. 
Reviewed-by: Krzysztof Kozlowski Signed-off-by: Sergio Paracuellos Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220926162549.805108-1-sergio.paracuellos@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../watchdog/mediatek,mt7621-wdt.yaml | 33 +++++++++++++++++++ .../bindings/watchdog/mt7621-wdt.txt | 12 ------- 2 files changed, 33 insertions(+), 12 deletions(-) create mode 100644 Documentation/devicetree/bindings/watchdog/mediatek,mt7621-wdt.yaml delete mode 100644 Documentation/devicetree/bindings/watchdog/mt7621-wdt.txt diff --git a/Documentation/devicetree/bindings/watchdog/mediatek,mt7621-wdt.yaml b/Documentation/devicetree/bindings/watchdog/mediatek,mt7621-wdt.yaml new file mode 100644 index 0000000000000..b2b17fdf4e398 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/mediatek,mt7621-wdt.yaml @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/watchdog/mediatek,mt7621-wdt.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Ralink Watchdog Timers + +maintainers: + - Sergio Paracuellos + +allOf: + - $ref: watchdog.yaml# + +properties: + compatible: + const: mediatek,mt7621-wdt + + reg: + maxItems: 1 + +required: + - compatible + - reg + +additionalProperties: false + +examples: + - | + watchdog@100 { + compatible = "mediatek,mt7621-wdt"; + reg = <0x100 0x100>; + }; diff --git a/Documentation/devicetree/bindings/watchdog/mt7621-wdt.txt b/Documentation/devicetree/bindings/watchdog/mt7621-wdt.txt deleted file mode 100644 index c15ef0ef609f3..0000000000000 --- a/Documentation/devicetree/bindings/watchdog/mt7621-wdt.txt +++ /dev/null @@ -1,12 +0,0 @@ -Ralink Watchdog Timers - -Required properties: -- compatible: must be "mediatek,mt7621-wdt" -- reg: physical base address of the controller and length of the register range - -Example: - - watchdog@100 { - compatible = "mediatek,mt7621-wdt"; - reg = <0x100 0x10>; - }; 
-- GitLab From 0dbc45241dc3f8d51957d4c770c16e49387cd6c2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Aug 2022 21:33:10 +0300 Subject: [PATCH 0844/2223] PCI: dwc: Replace of_gpio_named_count() by gpiod_count() As a preparation to unexport of_gpio_named_count(), convert the driver to use gpiod_count() instead. Link: https://lore.kernel.org/r/20220830183310.48541-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring --- drivers/pci/controller/dwc/pcie-kirin.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index 7f67aad71df4e..d09507f822a7d 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -366,12 +367,11 @@ static int kirin_pcie_get_gpio_enable(struct kirin_pcie *pcie, struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct device_node *np = dev->of_node; char name[32]; int ret, i; /* This is an optional property */ - ret = of_gpio_named_count(np, "hisilicon,clken-gpios"); + ret = gpiod_count(dev, "hisilicon,clken"); if (ret < 0) return 0; -- GitLab From 3db1e531e444290f0f54dd794b5cc22cf189930a Mon Sep 17 00:00:00 2001 From: Richard Zhu Date: Fri, 2 Sep 2022 16:58:06 +0800 Subject: [PATCH 0845/2223] PCI: imx6: Add i.MX8MP PCIe support Add i.MX8MP PCIe support. To avoid codes duplication when find the syscon regmap, add the iomux gpr syscon compatible into drvdata. 
Link: https://lore.kernel.org/r/1662109086-15881-8-git-send-email-hongxing.zhu@nxp.com Tested-by: Marek Vasut Tested-by: Richard Leitner Tested-by: Alexander Stein Signed-off-by: Richard Zhu Signed-off-by: Lorenzo Pieralisi Reviewed-by: Lucas Stach --- drivers/pci/controller/dwc/pci-imx6.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 6e5debdbc55b9..facc8e7b01c22 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -51,6 +51,7 @@ enum imx6_pcie_variants { IMX7D, IMX8MQ, IMX8MM, + IMX8MP, }; #define IMX6_PCIE_FLAG_IMX6_PHY BIT(0) @@ -61,6 +62,7 @@ struct imx6_pcie_drvdata { enum imx6_pcie_variants variant; u32 flags; int dbi_length; + const char *gpr; }; struct imx6_pcie { @@ -150,7 +152,8 @@ struct imx6_pcie { static unsigned int imx6_pcie_grp_offset(const struct imx6_pcie *imx6_pcie) { WARN_ON(imx6_pcie->drvdata->variant != IMX8MQ && - imx6_pcie->drvdata->variant != IMX8MM); + imx6_pcie->drvdata->variant != IMX8MM && + imx6_pcie->drvdata->variant != IMX8MP); return imx6_pcie->controller_id == 1 ? IOMUXC_GPR16 : IOMUXC_GPR14; } @@ -301,6 +304,7 @@ static void imx6_pcie_init_phy(struct imx6_pcie *imx6_pcie) { switch (imx6_pcie->drvdata->variant) { case IMX8MM: + case IMX8MP: /* * The PHY initialization had been done in the PHY * driver, break here directly. 
@@ -558,6 +562,7 @@ static int imx6_pcie_enable_ref_clk(struct imx6_pcie *imx6_pcie) break; case IMX8MM: case IMX8MQ: + case IMX8MP: ret = clk_prepare_enable(imx6_pcie->pcie_aux); if (ret) { dev_err(dev, "unable to enable pcie_aux clock\n"); @@ -602,6 +607,7 @@ static void imx6_pcie_disable_ref_clk(struct imx6_pcie *imx6_pcie) break; case IMX8MM: case IMX8MQ: + case IMX8MP: clk_disable_unprepare(imx6_pcie->pcie_aux); break; default: @@ -669,6 +675,7 @@ static void imx6_pcie_assert_core_reset(struct imx6_pcie *imx6_pcie) reset_control_assert(imx6_pcie->pciephy_reset); fallthrough; case IMX8MM: + case IMX8MP: reset_control_assert(imx6_pcie->apps_reset); break; case IMX6SX: @@ -744,6 +751,7 @@ static int imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) break; case IMX6Q: /* Nothing to do */ case IMX8MM: + case IMX8MP: break; } @@ -793,6 +801,7 @@ static void imx6_pcie_ltssm_enable(struct device *dev) case IMX7D: case IMX8MQ: case IMX8MM: + case IMX8MP: reset_control_deassert(imx6_pcie->apps_reset); break; } @@ -812,6 +821,7 @@ static void imx6_pcie_ltssm_disable(struct device *dev) case IMX7D: case IMX8MQ: case IMX8MM: + case IMX8MP: reset_control_assert(imx6_pcie->apps_reset); break; } @@ -1179,6 +1189,7 @@ static int imx6_pcie_probe(struct platform_device *pdev) } break; case IMX8MM: + case IMX8MP: imx6_pcie->pcie_aux = devm_clk_get(dev, "pcie_aux"); if (IS_ERR(imx6_pcie->pcie_aux)) return dev_err_probe(dev, PTR_ERR(imx6_pcie->pcie_aux), @@ -1216,7 +1227,7 @@ static int imx6_pcie_probe(struct platform_device *pdev) /* Grab GPR config register range */ imx6_pcie->iomuxc_gpr = - syscon_regmap_lookup_by_compatible("fsl,imx6q-iomuxc-gpr"); + syscon_regmap_lookup_by_compatible(imx6_pcie->drvdata->gpr); if (IS_ERR(imx6_pcie->iomuxc_gpr)) { dev_err(dev, "unable to find iomuxc registers\n"); return PTR_ERR(imx6_pcie->iomuxc_gpr); @@ -1295,12 +1306,14 @@ static const struct imx6_pcie_drvdata drvdata[] = { .flags = IMX6_PCIE_FLAG_IMX6_PHY | 
IMX6_PCIE_FLAG_IMX6_SPEED_CHANGE, .dbi_length = 0x200, + .gpr = "fsl,imx6q-iomuxc-gpr", }, [IMX6SX] = { .variant = IMX6SX, .flags = IMX6_PCIE_FLAG_IMX6_PHY | IMX6_PCIE_FLAG_IMX6_SPEED_CHANGE | IMX6_PCIE_FLAG_SUPPORTS_SUSPEND, + .gpr = "fsl,imx6q-iomuxc-gpr", }, [IMX6QP] = { .variant = IMX6QP, @@ -1308,17 +1321,26 @@ static const struct imx6_pcie_drvdata drvdata[] = { IMX6_PCIE_FLAG_IMX6_SPEED_CHANGE | IMX6_PCIE_FLAG_SUPPORTS_SUSPEND, .dbi_length = 0x200, + .gpr = "fsl,imx6q-iomuxc-gpr", }, [IMX7D] = { .variant = IMX7D, .flags = IMX6_PCIE_FLAG_SUPPORTS_SUSPEND, + .gpr = "fsl,imx7d-iomuxc-gpr", }, [IMX8MQ] = { .variant = IMX8MQ, + .gpr = "fsl,imx8mq-iomuxc-gpr", }, [IMX8MM] = { .variant = IMX8MM, .flags = IMX6_PCIE_FLAG_SUPPORTS_SUSPEND, + .gpr = "fsl,imx8mm-iomuxc-gpr", + }, + [IMX8MP] = { + .variant = IMX8MP, + .flags = IMX6_PCIE_FLAG_SUPPORTS_SUSPEND, + .gpr = "fsl,imx8mp-iomuxc-gpr", }, }; @@ -1329,6 +1351,7 @@ static const struct of_device_id imx6_pcie_of_match[] = { { .compatible = "fsl,imx7d-pcie", .data = &drvdata[IMX7D], }, { .compatible = "fsl,imx8mq-pcie", .data = &drvdata[IMX8MQ], }, { .compatible = "fsl,imx8mm-pcie", .data = &drvdata[IMX8MM], }, + { .compatible = "fsl,imx8mp-pcie", .data = &drvdata[IMX8MP], }, {}, }; -- GitLab From cbcf8722b523dcf0970ab67dc3d5ced1ea7b334e Mon Sep 17 00:00:00 2001 From: Richard Zhu Date: Mon, 5 Sep 2022 10:23:03 +0800 Subject: [PATCH 0846/2223] phy: freescale: imx8m-pcie: Fix the wrong order of phy_init() and phy_power_on() Refer to phy_core driver, phy_init() must be called before phy_power_on(). Fix the wrong order of phy_init() and phy_power_on() here. 
Link: https://lore.kernel.org/r/1662344583-18874-1-git-send-email-hongxing.zhu@nxp.com Fixes: 1aa97b002258 ("phy: freescale: pcie: Initialize the imx8 pcie standalone phy driver") Tested-by: Alexander Stein Signed-off-by: Richard Zhu Signed-off-by: Lorenzo Pieralisi Acked-by: Vinod Koul Acked-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/pci-imx6.c | 6 +++--- drivers/phy/freescale/phy-fsl-imx8m-pcie.c | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index facc8e7b01c22..2616585ca5f8a 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -945,7 +945,7 @@ static int imx6_pcie_host_init(struct dw_pcie_rp *pp) } if (imx6_pcie->phy) { - ret = phy_power_on(imx6_pcie->phy); + ret = phy_init(imx6_pcie->phy); if (ret) { dev_err(dev, "pcie PHY power up failed\n"); goto err_clk_disable; @@ -959,7 +959,7 @@ static int imx6_pcie_host_init(struct dw_pcie_rp *pp) } if (imx6_pcie->phy) { - ret = phy_init(imx6_pcie->phy); + ret = phy_power_on(imx6_pcie->phy); if (ret) { dev_err(dev, "waiting for PHY ready timeout!\n"); goto err_phy_off; @@ -971,7 +971,7 @@ static int imx6_pcie_host_init(struct dw_pcie_rp *pp) err_phy_off: if (imx6_pcie->phy) - phy_power_off(imx6_pcie->phy); + phy_exit(imx6_pcie->phy); err_clk_disable: imx6_pcie_clk_disable(imx6_pcie); err_reg_disable: diff --git a/drivers/phy/freescale/phy-fsl-imx8m-pcie.c b/drivers/phy/freescale/phy-fsl-imx8m-pcie.c index ad7d2edfc4146..c93286483b425 100644 --- a/drivers/phy/freescale/phy-fsl-imx8m-pcie.c +++ b/drivers/phy/freescale/phy-fsl-imx8m-pcie.c @@ -59,7 +59,7 @@ struct imx8_pcie_phy { bool clkreq_unused; }; -static int imx8_pcie_phy_init(struct phy *phy) +static int imx8_pcie_phy_power_on(struct phy *phy) { int ret; u32 val, pad_mode; @@ -137,14 +137,14 @@ static int imx8_pcie_phy_init(struct phy *phy) return ret; } -static int imx8_pcie_phy_power_on(struct phy *phy) 
+static int imx8_pcie_phy_init(struct phy *phy) { struct imx8_pcie_phy *imx8_phy = phy_get_drvdata(phy); return clk_prepare_enable(imx8_phy->clk); } -static int imx8_pcie_phy_power_off(struct phy *phy) +static int imx8_pcie_phy_exit(struct phy *phy) { struct imx8_pcie_phy *imx8_phy = phy_get_drvdata(phy); @@ -155,8 +155,8 @@ static int imx8_pcie_phy_power_off(struct phy *phy) static const struct phy_ops imx8_pcie_phy_ops = { .init = imx8_pcie_phy_init, + .exit = imx8_pcie_phy_exit, .power_on = imx8_pcie_phy_power_on, - .power_off = imx8_pcie_phy_power_off, .owner = THIS_MODULE, }; -- GitLab From f1bfbd000f3bc42a34aec9208c6aaa9076682601 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Wed, 14 Sep 2022 13:23:39 +0530 Subject: [PATCH 0847/2223] PCI: qcom-ep: Add kernel-doc for qcom_pcie_ep structure Add kernel-doc for qcom_pcie_ep structure. Link: https://lore.kernel.org/r/20220914075350.7992-2-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/pcie-qcom-ep.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c index 4c87167861fd6..98c64a85d01f8 100644 --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -140,6 +140,23 @@ static struct clk_bulk_data qcom_pcie_ep_clks[] = { { .id = "slave_q2a" }, }; +/** + * struct qcom_pcie_ep - Qualcomm PCIe Endpoint Controller + * @pci: Designware PCIe controller struct + * @parf: Qualcomm PCIe specific PARF register base + * @elbi: Designware PCIe specific ELBI register base + * @perst_map: PERST regmap + * @mmio_res: MMIO region resource + * @core_reset: PCIe Endpoint core reset + * @reset: PERST# GPIO + * @wake: WAKE# GPIO + * @phy: PHY controller block + * @perst_en: Flag for PERST enable + * @perst_sep_en: Flag for PERST separation enable + * @link_status: PCIe Link status + * @global_irq: 
Qualcomm PCIe specific Global IRQ + * @perst_irq: PERST# IRQ + */ struct qcom_pcie_ep { struct dw_pcie pci; -- GitLab From e2efd31465b1d97a0bca6f93cb75ccdc8001c8d3 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Wed, 14 Sep 2022 13:23:40 +0530 Subject: [PATCH 0848/2223] PCI: qcom-ep: Rely on the clocks supplied by devicetree Generally, device drivers should just rely on the platform data like devicetree to supply the clocks required for the functioning of the peripheral. There is no need to hardcode the clk info in the driver. So get rid of the static clk info and obtain the platform supplied clks. The total number of clocks supplied is obtained using the devm_clk_bulk_get_all() API and used for the rest of the clk_bulk_ APIs. Link: https://lore.kernel.org/r/20220914075350.7992-3-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/pcie-qcom-ep.c | 33 +++++++++-------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c index 98c64a85d01f8..e6ba781594a60 100644 --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -130,16 +130,6 @@ enum qcom_pcie_ep_link_status { QCOM_PCIE_EP_LINK_DOWN, }; -static struct clk_bulk_data qcom_pcie_ep_clks[] = { - { .id = "cfg" }, - { .id = "aux" }, - { .id = "bus_master" }, - { .id = "bus_slave" }, - { .id = "ref" }, - { .id = "sleep" }, - { .id = "slave_q2a" }, -}; - /** * struct qcom_pcie_ep - Qualcomm PCIe Endpoint Controller * @pci: Designware PCIe controller struct @@ -151,6 +141,8 @@ static struct clk_bulk_data qcom_pcie_ep_clks[] = { * @reset: PERST# GPIO * @wake: WAKE# GPIO * @phy: PHY controller block + * @clks: PCIe clocks + * @num_clks: PCIe clocks count * @perst_en: Flag for PERST enable * @perst_sep_en: Flag for PERST separation enable * @link_status: PCIe Link status @@ -170,6 +162,9 
@@ struct qcom_pcie_ep { struct gpio_desc *wake; struct phy *phy; + struct clk_bulk_data *clks; + int num_clks; + u32 perst_en; u32 perst_sep_en; @@ -244,8 +239,7 @@ static int qcom_pcie_enable_resources(struct qcom_pcie_ep *pcie_ep) { int ret; - ret = clk_bulk_prepare_enable(ARRAY_SIZE(qcom_pcie_ep_clks), - qcom_pcie_ep_clks); + ret = clk_bulk_prepare_enable(pcie_ep->num_clks, pcie_ep->clks); if (ret) return ret; @@ -266,8 +260,7 @@ static int qcom_pcie_enable_resources(struct qcom_pcie_ep *pcie_ep) err_phy_exit: phy_exit(pcie_ep->phy); err_disable_clk: - clk_bulk_disable_unprepare(ARRAY_SIZE(qcom_pcie_ep_clks), - qcom_pcie_ep_clks); + clk_bulk_disable_unprepare(pcie_ep->num_clks, pcie_ep->clks); return ret; } @@ -276,8 +269,7 @@ static void qcom_pcie_disable_resources(struct qcom_pcie_ep *pcie_ep) { phy_power_off(pcie_ep->phy); phy_exit(pcie_ep->phy); - clk_bulk_disable_unprepare(ARRAY_SIZE(qcom_pcie_ep_clks), - qcom_pcie_ep_clks); + clk_bulk_disable_unprepare(pcie_ep->num_clks, pcie_ep->clks); } static int qcom_pcie_perst_deassert(struct dw_pcie *pci) @@ -495,10 +487,11 @@ static int qcom_pcie_ep_get_resources(struct platform_device *pdev, return ret; } - ret = devm_clk_bulk_get(dev, ARRAY_SIZE(qcom_pcie_ep_clks), - qcom_pcie_ep_clks); - if (ret) - return ret; + pcie_ep->num_clks = devm_clk_bulk_get_all(dev, &pcie_ep->clks); + if (pcie_ep->num_clks < 0) { + dev_err(dev, "Failed to get clocks\n"); + return pcie_ep->num_clks; + } pcie_ep->core_reset = devm_reset_control_get_exclusive(dev, "core"); if (IS_ERR(pcie_ep->core_reset)) -- GitLab From 9cf4843e1acf08ab5c523bc4fa8f7b24de2bea3a Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Wed, 14 Sep 2022 13:23:41 +0530 Subject: [PATCH 0849/2223] PCI: qcom-ep: Make use of the cached dev pointer In the qcom_pcie_ep_get_resources() function, dev pointer is already cached in a local variable. So let's make use of it instead of getting the dev pointer again from pdev struct. 
Link: https://lore.kernel.org/r/20220914075350.7992-4-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/pcie-qcom-ep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c index e6ba781594a60..51afd9c547f5a 100644 --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -483,7 +483,7 @@ static int qcom_pcie_ep_get_resources(struct platform_device *pdev, ret = qcom_pcie_ep_get_io_resources(pdev, pcie_ep); if (ret) { - dev_err(&pdev->dev, "Failed to get io resources %d\n", ret); + dev_err(dev, "Failed to get io resources %d\n", ret); return ret; } @@ -505,7 +505,7 @@ static int qcom_pcie_ep_get_resources(struct platform_device *pdev, if (IS_ERR(pcie_ep->wake)) return PTR_ERR(pcie_ep->wake); - pcie_ep->phy = devm_phy_optional_get(&pdev->dev, "pciephy"); + pcie_ep->phy = devm_phy_optional_get(dev, "pciephy"); if (IS_ERR(pcie_ep->phy)) ret = PTR_ERR(pcie_ep->phy); -- GitLab From 54b978e03a3ce7aa3b40deeb1b4c0c9dd6660aa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 26 Sep 2022 22:30:21 +0300 Subject: [PATCH 0850/2223] drm/i915: Round to closest in g4x+ HDMI clock readout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On pre-ddi platforms we have slightly different code being used for HDMI TMDS clock to dotclock conversion between the state computation and state readout. Both of these need to round the same way in order to not get a mismatch between the computed and read out states. Fix up the rounding direction in the readout path to match what is used during state computation. 
Another option would to just use intel_crtc_dotclock() in the readout path as well, but I don't really want to do that as the current code more accurately represents how the hardware really works; The HDMI port register defines whether we're actually outputting 8bpc or 12bpc over HDMI, and the PIPECONF bpc setting just defines what goes over FDI between the CPU and PCH. The fact that we try to cram all that into a single pipe_bpp during state computation is perhaps not entirely great... Fixes: f2c9df101095 ("drm/i915: Round TMDS clock to nearest") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220926193021.23287-1-ville.syrjala@linux.intel.com Reviewed-by: Jani Nikula (cherry picked from commit 86b972ef1091882d66672399c6f8ebdd12a3b707) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/g4x_hdmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/g4x_hdmi.c b/drivers/gpu/drm/i915/display/g4x_hdmi.c index 5fbd2ae958692..2b73f5ff0d02b 100644 --- a/drivers/gpu/drm/i915/display/g4x_hdmi.c +++ b/drivers/gpu/drm/i915/display/g4x_hdmi.c @@ -120,7 +120,7 @@ static void intel_hdmi_get_config(struct intel_encoder *encoder, pipe_config->hw.adjusted_mode.flags |= flags; if ((tmp & SDVO_COLOR_FORMAT_MASK) == HDMI_COLOR_FORMAT_12bpc) - dotclock = pipe_config->port_clock * 2 / 3; + dotclock = DIV_ROUND_CLOSEST(pipe_config->port_clock * 2, 3); else dotclock = pipe_config->port_clock; -- GitLab From 9947e57b22ddfb6f697fa45ef5c92d2aa17b2edf Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Wed, 15 Jun 2022 04:20:02 -0400 Subject: [PATCH 0851/2223] SUNRPC: Directly use ida_alloc()/free() Use ida_alloc()/ida_free() instead of ida_simple_get()/ida_simple_remove(). The latter is deprecated and more verbose. 
Signed-off-by: Bo Liu Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 4 ++-- net/sunrpc/xprt.c | 4 ++-- net/sunrpc/xprtmultipath.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index c284efa3d1efc..4d8665f15dd7e 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -345,7 +345,7 @@ static int rpc_alloc_clid(struct rpc_clnt *clnt) { int clid; - clid = ida_simple_get(&rpc_clids, 0, 0, GFP_KERNEL); + clid = ida_alloc(&rpc_clids, GFP_KERNEL); if (clid < 0) return clid; clnt->cl_clid = clid; @@ -354,7 +354,7 @@ static int rpc_alloc_clid(struct rpc_clnt *clnt) static void rpc_free_clid(struct rpc_clnt *clnt) { - ida_simple_remove(&rpc_clids, clnt->cl_clid); + ida_free(&rpc_clids, clnt->cl_clid); } static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index f8fae78156494..a50febadb37e9 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1788,7 +1788,7 @@ static int xprt_alloc_id(struct rpc_xprt *xprt) { int id; - id = ida_simple_get(&rpc_xprt_ids, 0, 0, GFP_KERNEL); + id = ida_alloc(&rpc_xprt_ids, GFP_KERNEL); if (id < 0) return id; @@ -1798,7 +1798,7 @@ static int xprt_alloc_id(struct rpc_xprt *xprt) static void xprt_free_id(struct rpc_xprt *xprt) { - ida_simple_remove(&rpc_xprt_ids, xprt->id); + ida_free(&rpc_xprt_ids, xprt->id); } struct rpc_xprt *xprt_alloc(struct net *net, size_t size, diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 685db598acbe1..701250b305dba 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -103,7 +103,7 @@ static int xprt_switch_alloc_id(struct rpc_xprt_switch *xps, gfp_t gfp_flags) { int id; - id = ida_simple_get(&rpc_xprtswitch_ids, 0, 0, gfp_flags); + id = ida_alloc(&rpc_xprtswitch_ids, gfp_flags); if (id < 0) return id; @@ -113,7 +113,7 @@ static int xprt_switch_alloc_id(struct rpc_xprt_switch *xps, gfp_t gfp_flags) static void 
xprt_switch_free_id(struct rpc_xprt_switch *xps) { - ida_simple_remove(&rpc_xprtswitch_ids, xps->xps_id); + ida_free(&rpc_xprtswitch_ids, xps->xps_id); } /** -- GitLab From 724e2df95b08a7e6ca989a9f96b29dc92ece9cd9 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Wed, 15 Jun 2022 02:27:45 -0400 Subject: [PATCH 0852/2223] NFSv4: Directly use ida_alloc()/free() Use ida_alloc()/ida_free() instead of ida_simple_get()/ida_simple_remove(). The latter is deprecated and more verbose. Signed-off-by: Bo Liu Signed-off-by: Anna Schumaker --- fs/nfs/nfs4state.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9bab3e9c702a4..beb9448df515f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -497,8 +497,7 @@ nfs4_alloc_state_owner(struct nfs_server *server, sp = kzalloc(sizeof(*sp), gfp_flags); if (!sp) return NULL; - sp->so_seqid.owner_id = ida_simple_get(&server->openowner_id, 0, 0, - gfp_flags); + sp->so_seqid.owner_id = ida_alloc(&server->openowner_id, gfp_flags); if (sp->so_seqid.owner_id < 0) { kfree(sp); return NULL; @@ -534,7 +533,7 @@ static void nfs4_free_state_owner(struct nfs4_state_owner *sp) { nfs4_destroy_seqid_counter(&sp->so_seqid); put_cred(sp->so_cred); - ida_simple_remove(&sp->so_server->openowner_id, sp->so_seqid.owner_id); + ida_free(&sp->so_server->openowner_id, sp->so_seqid.owner_id); kfree(sp); } @@ -877,8 +876,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f refcount_set(&lsp->ls_count, 1); lsp->ls_state = state; lsp->ls_owner = fl_owner; - lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, - 0, 0, GFP_KERNEL_ACCOUNT); + lsp->ls_seqid.owner_id = ida_alloc(&server->lockowner_id, GFP_KERNEL_ACCOUNT); if (lsp->ls_seqid.owner_id < 0) goto out_free; INIT_LIST_HEAD(&lsp->ls_locks); @@ -890,7 +888,7 @@ out_free: void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) { - ida_simple_remove(&server->lockowner_id, 
lsp->ls_seqid.owner_id); + ida_free(&server->lockowner_id, lsp->ls_seqid.owner_id); nfs4_destroy_seqid_counter(&lsp->ls_seqid); kfree(lsp); } -- GitLab From d6abc719a213b8c409789799786e11d203adb3b0 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Tue, 7 Jun 2022 15:32:01 +0800 Subject: [PATCH 0853/2223] SUNRPC: use max_t() to simplify open code Use max_t() to simplify open code which uses "if...else" to get maximum of two values. Generated by coccinelle script: scripts/coccinelle/misc/minmax.cocci Signed-off-by: Ziyang Xuan Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index a50febadb37e9..71dc263734441 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1822,10 +1822,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size, goto out_free; list_add(&req->rq_list, &xprt->free); } - if (max_alloc > num_prealloc) - xprt->max_reqs = max_alloc; - else - xprt->max_reqs = num_prealloc; + xprt->max_reqs = max_t(unsigned int, max_alloc, num_prealloc); xprt->min_reqs = num_prealloc; xprt->num_reqs = num_prealloc; -- GitLab From 7e7ce2ccbae746a88e21b4ce94dbf372b31c152c Mon Sep 17 00:00:00 2001 From: yuzhe Date: Wed, 15 Jun 2022 13:39:24 +0800 Subject: [PATCH 0854/2223] nfs: remove unnecessary (void*) conversions. remove unnecessary void* type castings. 
Signed-off-by: yuzhe Signed-off-by: Anna Schumaker --- fs/nfs/inode.c | 6 +++--- fs/nfs/nfs42xattr.c | 2 +- fs/nfs/nfs4idmap.c | 2 +- fs/nfs/nfs4proc.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bea7c005119c3..a87e529065f91 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -318,7 +318,7 @@ struct nfs_find_desc { static int nfs_find_actor(struct inode *inode, void *opaque) { - struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_find_desc *desc = opaque; struct nfs_fh *fh = desc->fh; struct nfs_fattr *fattr = desc->fattr; @@ -336,7 +336,7 @@ nfs_find_actor(struct inode *inode, void *opaque) static int nfs_init_locked(struct inode *inode, void *opaque) { - struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_find_desc *desc = opaque; struct nfs_fattr *fattr = desc->fattr; set_nfs_fileid(inode, fattr->fileid); @@ -2271,7 +2271,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) static void init_once(void *foo) { - struct nfs_inode *nfsi = (struct nfs_inode *) foo; + struct nfs_inode *nfsi = foo; inode_init_once(&nfsi->vfs_inode); INIT_LIST_HEAD(&nfsi->open_files); diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index a9bf09fdf2c32..76ae118342066 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -981,7 +981,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc) static void nfs4_xattr_cache_init_once(void *p) { - struct nfs4_xattr_cache *cache = (struct nfs4_xattr_cache *)p; + struct nfs4_xattr_cache *cache = p; spin_lock_init(&cache->listxattr_lock); atomic_long_set(&cache->nent, 0); diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index ec6afd3c4bca6..e3fdd2f45b01f 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -583,7 +583,7 @@ static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux) struct request_key_auth *rka = get_request_key_auth(authkey); struct rpc_pipe_msg *msg; struct 
idmap_msg *im; - struct idmap *idmap = (struct idmap *)aux; + struct idmap *idmap = aux; struct key *key = rka->target_key; int ret = -ENOKEY; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3ed14a2a84a44..17362ba94592c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6607,7 +6607,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) struct nfs4_delegreturndata *d_data; struct pnfs_layout_hdr *lo; - d_data = (struct nfs4_delegreturndata *)data; + d_data = data; if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) { nfs4_sequence_done(task, &d_data->res.seq_res); @@ -8900,7 +8900,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred) void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data) { - struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data; + struct nfs4_add_xprt_data *adata = data; struct rpc_task *task; int status; -- GitLab From 384edeb46f07f4ee1b3adda9416e724421e2fad5 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 10 Aug 2022 13:40:01 +0200 Subject: [PATCH 0855/2223] NFS: clean up a needless assignment in nfs_file_write() Commit 064109db53ec ("NFS: remove redundant code in nfs_file_write()") identifies that filemap_fdatawait_range() will always return 0 and removes a dead error-handling case in nfs_file_write(). With this change however, assigning the return of filemap_fdatawait_range() to the result variable is a dead store. Remove this needless assignment. No functional change. No change in object code. 
Signed-off-by: Lukas Bulwahn Signed-off-by: Anna Schumaker --- fs/nfs/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index e032fe201a367..7a4d2fe9c0b38 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -655,9 +655,9 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; } if (mntflags & NFS_MOUNT_WRITE_WAIT) { - result = filemap_fdatawait_range(file->f_mapping, - iocb->ki_pos - written, - iocb->ki_pos - 1); + filemap_fdatawait_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); } result = generic_write_sync(iocb, written); if (result < 0) -- GitLab From 15bcdc92d108b3bd85a44d7712496c89cf3ffddd Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:02:29 +0200 Subject: [PATCH 0856/2223] SUNRPC: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. 
Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Signed-off-by: Wolfram Sang Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e976007f4fd00..b3341c202ea07 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -261,7 +261,7 @@ static void xs_format_common_peer_addresses(struct rpc_xprt *xprt) switch (sap->sa_family) { case AF_LOCAL: sun = xs_addr_un(xprt); - strlcpy(buf, sun->sun_path, sizeof(buf)); + strscpy(buf, sun->sun_path, sizeof(buf)); xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); break; -- GitLab From 0dd7439f382518e9997cfa7ca9d06799dbeb33fa Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:01:15 +0200 Subject: [PATCH 0857/2223] NFS: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. 
Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Signed-off-by: Wolfram Sang Signed-off-by: Anna Schumaker --- fs/nfs/nfs4client.c | 2 +- fs/nfs/nfsroot.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 3c5678aec006f..7a5162afa5c0d 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -254,7 +254,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) goto error; ip_addr = (const char *)buf; } - strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + strscpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); err = nfs_idmap_new(clp); if (err < 0) { diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index fa148308822cc..620329b7e6aeb 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -139,7 +139,7 @@ static int __init nfs_root_setup(char *line) ROOT_DEV = Root_NFS; if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { - strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms)); + strscpy(nfs_root_parms, line, sizeof(nfs_root_parms)); } else { size_t n = strlen(line) + sizeof(NFS_ROOT) - 1; if (n >= sizeof(nfs_root_parms)) -- GitLab From 90377158bd2d2acd20e6131e84c234d715b7aa42 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 24 Aug 2022 16:56:48 -0400 Subject: [PATCH 0858/2223] NFSv4/pNFS: Always return layout stats on layout return for flexfiles We want to ensure that the server never misses the layout stats when we're closing the file, so that it knows whether or not to update its internal state. Otherwise, if we were racing with a layout stat, we might cause the server to invalidate its layout before the layout stat got processed. 
Fixes: 06946c6a3d8b ("pNFS/flexfiles: Only send layoutstats updates for mirrors that were updated") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/flexfilelayout/flexfilelayout.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 7d285561e59f6..1443330ae9985 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -30,14 +30,20 @@ #define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) #define FF_LAYOUTRETURN_MAXERR 20 +enum nfs4_ff_op_type { + NFS4_FF_OP_LAYOUTSTATS, + NFS4_FF_OP_LAYOUTRETURN, +}; + static unsigned short io_maxretrans; static const struct pnfs_commit_ops ff_layout_commit_ops; static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr); -static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, +static int +ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, struct nfs42_layoutstat_devinfo *devinfo, - int dev_limit); + int dev_limit, enum nfs4_ff_op_type type); static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, const struct nfs42_layoutstat_devinfo *devinfo, struct nfs4_ff_layout_mirror *mirror); @@ -2161,8 +2167,9 @@ ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args) FF_LAYOUTRETURN_MAXERR); spin_lock(&args->inode->i_lock); - ff_args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, - &ff_args->devinfo[0], ARRAY_SIZE(ff_args->devinfo)); + ff_args->num_dev = ff_layout_mirror_prepare_stats( + &ff_layout->generic_hdr, &ff_args->devinfo[0], + ARRAY_SIZE(ff_args->devinfo), NFS4_FF_OP_LAYOUTRETURN); spin_unlock(&args->inode->i_lock); args->ld_private->ops = &layoutreturn_ops; @@ -2396,7 +2403,7 @@ static const struct nfs4_xdr_opaque_ops layoutstat_ops = { static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, struct nfs42_layoutstat_devinfo 
*devinfo, - int dev_limit) + int dev_limit, enum nfs4_ff_op_type type) { struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_mirror *mirror; @@ -2408,7 +2415,9 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, break; if (IS_ERR_OR_NULL(mirror->mirror_ds)) continue; - if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags)) + if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, + &mirror->flags) && + type != NFS4_FF_OP_LAYOUTRETURN) continue; /* mirror refcount put in cleanup_layoutstats */ if (!refcount_inc_not_zero(&mirror->ref)) @@ -2448,7 +2457,9 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) spin_lock(&args->inode->i_lock); ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, - &args->devinfo[0], dev_count); + &args->devinfo[0], + dev_count, + NFS4_FF_OP_LAYOUTSTATS); spin_unlock(&args->inode->i_lock); if (!args->num_dev) { kfree(args->devinfo); -- GitLab From 1b00adce8afdb842615a5bf3774510f14a9b769a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 28 Jul 2022 17:42:54 +0300 Subject: [PATCH 0859/2223] irqchip/ls-extirq: Fix invalid wait context by avoiding to use regmap The irqchip->irq_set_type method is called by __irq_set_trigger() under the desc->lock raw spinlock. The ls-extirq implementation, ls_extirq_irq_set_type(), uses an MMIO regmap created by of_syscon_register(), which uses plain spinlocks (the kind that are sleepable on RT). Therefore, this is an invalid locking scheme for which we get a kernel splat stating just that ("[ BUG: Invalid wait context ]"), because the context in which the plain spinlock may sleep is atomic due to the raw spinlock. We need to go raw spinlocks all the way. Make this driver ioremap its INTPCR register on its own, and stop relying on syscon to provide a regmap. 
Fixes: 0dcd9f872769 ("irqchip: Add support for Layerscape external interrupt lines") Signed-off-by: Vladimir Oltean [maz: trimmed down commit log] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220728144254.175385-1-vladimir.oltean@nxp.com --- drivers/irqchip/irq-ls-extirq.c | 87 ++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 24 deletions(-) diff --git a/drivers/irqchip/irq-ls-extirq.c b/drivers/irqchip/irq-ls-extirq.c index 853b3972dbe78..d8d48b1f7c29d 100644 --- a/drivers/irqchip/irq-ls-extirq.c +++ b/drivers/irqchip/irq-ls-extirq.c @@ -6,8 +6,7 @@ #include #include #include -#include -#include +#include #include #include @@ -16,13 +15,41 @@ #define LS1021A_SCFGREVCR 0x200 struct ls_extirq_data { - struct regmap *syscon; - u32 intpcr; + void __iomem *intpcr; + raw_spinlock_t lock; + bool big_endian; bool is_ls1021a_or_ls1043a; u32 nirq; struct irq_fwspec map[MAXIRQ]; }; +static void ls_extirq_intpcr_rmw(struct ls_extirq_data *priv, u32 mask, + u32 value) +{ + u32 intpcr; + + /* + * Serialize concurrent calls to ls_extirq_set_type() from multiple + * IRQ descriptors, making sure the read-modify-write is atomic. 
+ */ + raw_spin_lock(&priv->lock); + + if (priv->big_endian) + intpcr = ioread32be(priv->intpcr); + else + intpcr = ioread32(priv->intpcr); + + intpcr &= ~mask; + intpcr |= value; + + if (priv->big_endian) + iowrite32be(intpcr, priv->intpcr); + else + iowrite32(intpcr, priv->intpcr); + + raw_spin_unlock(&priv->lock); +} + static int ls_extirq_set_type(struct irq_data *data, unsigned int type) { @@ -51,7 +78,8 @@ ls_extirq_set_type(struct irq_data *data, unsigned int type) default: return -EINVAL; } - regmap_update_bits(priv->syscon, priv->intpcr, mask, value); + + ls_extirq_intpcr_rmw(priv, mask, value); return irq_chip_set_type_parent(data, type); } @@ -143,7 +171,6 @@ ls_extirq_parse_map(struct ls_extirq_data *priv, struct device_node *node) static int __init ls_extirq_of_init(struct device_node *node, struct device_node *parent) { - struct irq_domain *domain, *parent_domain; struct ls_extirq_data *priv; int ret; @@ -151,40 +178,52 @@ ls_extirq_of_init(struct device_node *node, struct device_node *parent) parent_domain = irq_find_host(parent); if (!parent_domain) { pr_err("Cannot find parent domain\n"); - return -ENODEV; + ret = -ENODEV; + goto err_irq_find_host; } priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (!priv) - return -ENOMEM; - - priv->syscon = syscon_node_to_regmap(node->parent); - if (IS_ERR(priv->syscon)) { - ret = PTR_ERR(priv->syscon); - pr_err("Failed to lookup parent regmap\n"); - goto out; + if (!priv) { + ret = -ENOMEM; + goto err_alloc_priv; } - ret = of_property_read_u32(node, "reg", &priv->intpcr); - if (ret) { - pr_err("Missing INTPCR offset value\n"); - goto out; + + /* + * All extirq OF nodes are under a scfg/syscon node with + * the 'ranges' property + */ + priv->intpcr = of_iomap(node, 0); + if (!priv->intpcr) { + pr_err("Cannot ioremap OF node %pOF\n", node); + ret = -ENOMEM; + goto err_iomap; } ret = ls_extirq_parse_map(priv, node); if (ret) - goto out; + goto err_parse_map; + priv->big_endian = of_device_is_big_endian(parent); 
priv->is_ls1021a_or_ls1043a = of_device_is_compatible(node, "fsl,ls1021a-extirq") || of_device_is_compatible(node, "fsl,ls1043a-extirq"); + raw_spin_lock_init(&priv->lock); domain = irq_domain_add_hierarchy(parent_domain, 0, priv->nirq, node, &extirq_domain_ops, priv); - if (!domain) + if (!domain) { ret = -ENOMEM; + goto err_add_hierarchy; + } -out: - if (ret) - kfree(priv); + return 0; + +err_add_hierarchy: +err_parse_map: + iounmap(priv->intpcr); +err_iomap: + kfree(priv); +err_alloc_priv: +err_irq_find_host: return ret; } -- GitLab From af4e20d335d4414814030ba26f1689884c831269 Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Fri, 30 Sep 2022 19:02:23 +0530 Subject: [PATCH 0860/2223] drm/i915/ehl: Update MOCS table for EHL Add these extra EHL entries back since we have drm-tip commit 13d29c823738 ("drm/i915/ehl: unconditionally flush the pages on acquire") introduces proper flushing to make it work as expected. Cc: Chris Wilson Cc: Matthew Auld Fixes: 046091758b50 ("Revert "drm/i915/ehl: Update MOCS table for EHL"") Signed-off-by: Matt Roper Signed-off-by: Tejas Upadhyay Acked-by: Matthew Auld Signed-off-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20220930133223.2757282-1-tejas.upadhyay@intel.com (cherry picked from commit 6fa964c045a6bc3321a9186e87bfbcfd1059b0f1) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/intel_mocs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c index c6ebe27810764..152244d7f62a0 100644 --- a/drivers/gpu/drm/i915/gt/intel_mocs.c +++ b/drivers/gpu/drm/i915/gt/intel_mocs.c @@ -207,6 +207,14 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = { MOCS_ENTRY(15, \ LE_3_WB | LE_TC_1_LLC | LE_LRUM(2) | LE_AOM(1), \ L3_3_WB), \ + /* Bypass LLC - Uncached (EHL+) */ \ + MOCS_ENTRY(16, \ + LE_1_UC | LE_TC_1_LLC | LE_SCF(1), \ + L3_1_UC), \ + /* Bypass LLC - L3 (Read-Only) (EHL+) */ \ + MOCS_ENTRY(17, \ + LE_1_UC 
| LE_TC_1_LLC | LE_SCF(1), \ + L3_3_WB), \ /* Self-Snoop - L3 + LLC */ \ MOCS_ENTRY(18, \ LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_SSE(3), \ -- GitLab From 1de2e7e08e8cd0f281ba9f079a25e72543fe82f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20H=C3=B6gander?= Date: Mon, 3 Oct 2022 10:20:11 +0300 Subject: [PATCH 0861/2223] drm/i915/psr: Fix PSR_IMR/IIR field handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current PSR code is supposed to use TRANSCODER_EDP to force 0 shift for bits in PSR_IMR/IIR registers: /* * gen12+ has registers relative to transcoder and one per transcoder * using the same bit definition: handle it as TRANSCODER_EDP to force * 0 shift in bit definition */ At the time of writing the code assumption "TRANSCODER_EDP == 0" was made. This is not the case and all fields in PSR_IMR and PSR_IIR are shifted incorrectly if DISPLAY_VER >= 12. Fix this by adding separate register field defines for >=12 and add bit getter functions to keep code readability. 
v4: - Remove EDP from TGL definitions (José) - Use REG_BIT and REG_GENMASK (José) v3: - Add separate register field defines (José) - Add bit getter functions (José) v2: - Improve commit message (José) Cc: José Roberto de Souza Cc: Mika Kahola Fixes: 8241cfbe67f4 ("drm/i915/tgl: Access the right register when handling PSR interruptions") Signed-off-by: Jouni Högander Reviewed-by: José Roberto de Souza Signed-off-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20221003072011.72408-1-jouni.hogander@intel.com (cherry picked from commit 8da8e32e0b095613af2c2ce4b322240269164a8e) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_psr.c | 78 ++++++++++++++---------- drivers/gpu/drm/i915/i915_reg.h | 16 +++-- 2 files changed, 59 insertions(+), 35 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index 9def8d9fade6b..d4cce627d7a87 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -116,34 +116,56 @@ static bool psr2_global_enabled(struct intel_dp *intel_dp) } } +static u32 psr_irq_psr_error_bit_get(struct intel_dp *intel_dp) +{ + struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); + + return DISPLAY_VER(dev_priv) >= 12 ? TGL_PSR_ERROR : + EDP_PSR_ERROR(intel_dp->psr.transcoder); +} + +static u32 psr_irq_post_exit_bit_get(struct intel_dp *intel_dp) +{ + struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); + + return DISPLAY_VER(dev_priv) >= 12 ? TGL_PSR_POST_EXIT : + EDP_PSR_POST_EXIT(intel_dp->psr.transcoder); +} + +static u32 psr_irq_pre_entry_bit_get(struct intel_dp *intel_dp) +{ + struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); + + return DISPLAY_VER(dev_priv) >= 12 ? 
TGL_PSR_PRE_ENTRY : + EDP_PSR_PRE_ENTRY(intel_dp->psr.transcoder); +} + +static u32 psr_irq_mask_get(struct intel_dp *intel_dp) +{ + struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); + + return DISPLAY_VER(dev_priv) >= 12 ? TGL_PSR_MASK : + EDP_PSR_MASK(intel_dp->psr.transcoder); +} + static void psr_irq_control(struct intel_dp *intel_dp) { struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); - enum transcoder trans_shift; i915_reg_t imr_reg; u32 mask, val; - /* - * gen12+ has registers relative to transcoder and one per transcoder - * using the same bit definition: handle it as TRANSCODER_EDP to force - * 0 shift in bit definition - */ - if (DISPLAY_VER(dev_priv) >= 12) { - trans_shift = 0; + if (DISPLAY_VER(dev_priv) >= 12) imr_reg = TRANS_PSR_IMR(intel_dp->psr.transcoder); - } else { - trans_shift = intel_dp->psr.transcoder; + else imr_reg = EDP_PSR_IMR; - } - mask = EDP_PSR_ERROR(trans_shift); + mask = psr_irq_psr_error_bit_get(intel_dp); if (intel_dp->psr.debug & I915_PSR_DEBUG_IRQ) - mask |= EDP_PSR_POST_EXIT(trans_shift) | - EDP_PSR_PRE_ENTRY(trans_shift); + mask |= psr_irq_post_exit_bit_get(intel_dp) | + psr_irq_pre_entry_bit_get(intel_dp); - /* Warning: it is masking/setting reserved bits too */ val = intel_de_read(dev_priv, imr_reg); - val &= ~EDP_PSR_TRANS_MASK(trans_shift); + val &= ~psr_irq_mask_get(intel_dp); val |= ~mask; intel_de_write(dev_priv, imr_reg, val); } @@ -191,25 +213,21 @@ void intel_psr_irq_handler(struct intel_dp *intel_dp, u32 psr_iir) enum transcoder cpu_transcoder = intel_dp->psr.transcoder; struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); ktime_t time_ns = ktime_get(); - enum transcoder trans_shift; i915_reg_t imr_reg; - if (DISPLAY_VER(dev_priv) >= 12) { - trans_shift = 0; + if (DISPLAY_VER(dev_priv) >= 12) imr_reg = TRANS_PSR_IMR(intel_dp->psr.transcoder); - } else { - trans_shift = intel_dp->psr.transcoder; + else imr_reg = EDP_PSR_IMR; - } - if (psr_iir & EDP_PSR_PRE_ENTRY(trans_shift)) { + if (psr_iir & 
psr_irq_pre_entry_bit_get(intel_dp)) { intel_dp->psr.last_entry_attempt = time_ns; drm_dbg_kms(&dev_priv->drm, "[transcoder %s] PSR entry attempt in 2 vblanks\n", transcoder_name(cpu_transcoder)); } - if (psr_iir & EDP_PSR_POST_EXIT(trans_shift)) { + if (psr_iir & psr_irq_post_exit_bit_get(intel_dp)) { intel_dp->psr.last_exit = time_ns; drm_dbg_kms(&dev_priv->drm, "[transcoder %s] PSR exit completed\n", @@ -226,7 +244,7 @@ void intel_psr_irq_handler(struct intel_dp *intel_dp, u32 psr_iir) } } - if (psr_iir & EDP_PSR_ERROR(trans_shift)) { + if (psr_iir & psr_irq_psr_error_bit_get(intel_dp)) { u32 val; drm_warn(&dev_priv->drm, "[transcoder %s] PSR aux error\n", @@ -243,7 +261,7 @@ void intel_psr_irq_handler(struct intel_dp *intel_dp, u32 psr_iir) * or unset irq_aux_error. */ val = intel_de_read(dev_priv, imr_reg); - val |= EDP_PSR_ERROR(trans_shift); + val |= psr_irq_psr_error_bit_get(intel_dp); intel_de_write(dev_priv, imr_reg, val); schedule_work(&intel_dp->psr.work); @@ -1194,14 +1212,12 @@ static bool psr_interrupt_error_check(struct intel_dp *intel_dp) * first time that PSR HW tries to activate so lets keep PSR disabled * to avoid any rendering problems. */ - if (DISPLAY_VER(dev_priv) >= 12) { + if (DISPLAY_VER(dev_priv) >= 12) val = intel_de_read(dev_priv, TRANS_PSR_IIR(intel_dp->psr.transcoder)); - val &= EDP_PSR_ERROR(0); - } else { + else val = intel_de_read(dev_priv, EDP_PSR_IIR); - val &= EDP_PSR_ERROR(intel_dp->psr.transcoder); - } + val &= psr_irq_psr_error_bit_get(intel_dp); if (val) { intel_dp->psr.sink_not_reliable = true; drm_dbg_kms(&dev_priv->drm, diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 1a9bd829fc7ea..0b287a59dc2f4 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -2157,10 +2157,18 @@ #define TRANS_PSR_IIR(tran) _MMIO_TRANS2(tran, _PSR_IIR_A) #define _EDP_PSR_TRANS_SHIFT(trans) ((trans) == TRANSCODER_EDP ? 
\ 0 : ((trans) - TRANSCODER_A + 1) * 8) -#define EDP_PSR_TRANS_MASK(trans) (0x7 << _EDP_PSR_TRANS_SHIFT(trans)) -#define EDP_PSR_ERROR(trans) (0x4 << _EDP_PSR_TRANS_SHIFT(trans)) -#define EDP_PSR_POST_EXIT(trans) (0x2 << _EDP_PSR_TRANS_SHIFT(trans)) -#define EDP_PSR_PRE_ENTRY(trans) (0x1 << _EDP_PSR_TRANS_SHIFT(trans)) +#define TGL_PSR_MASK REG_GENMASK(2, 0) +#define TGL_PSR_ERROR REG_BIT(2) +#define TGL_PSR_POST_EXIT REG_BIT(1) +#define TGL_PSR_PRE_ENTRY REG_BIT(0) +#define EDP_PSR_MASK(trans) (TGL_PSR_MASK << \ + _EDP_PSR_TRANS_SHIFT(trans)) +#define EDP_PSR_ERROR(trans) (TGL_PSR_ERROR << \ + _EDP_PSR_TRANS_SHIFT(trans)) +#define EDP_PSR_POST_EXIT(trans) (TGL_PSR_POST_EXIT << \ + _EDP_PSR_TRANS_SHIFT(trans)) +#define EDP_PSR_PRE_ENTRY(trans) (TGL_PSR_PRE_ENTRY << \ + _EDP_PSR_TRANS_SHIFT(trans)) #define _SRD_AUX_DATA_A 0x60814 #define _SRD_AUX_DATA_EDP 0x6f814 -- GitLab From c56453a00f19ccddee302f5f9fe96b80e0b47fd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 3 Oct 2022 14:15:39 +0300 Subject: [PATCH 0862/2223] drm/i915: Fix watermark calculations for gen12+ RC CCS modifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Take the gen12+ RC CCS modifier into account when calculating the watermarks. Otherwise we'll calculate the watermarks thinking this Y-tiled modifier is linear. The rc_surface part is actually a nop since that is not used for any glk+ platform. v2: Split RC CCS vs. 
MC CCS to separate patches Cc: stable@vger.kernel.org Fixes: b3e57bccd68a ("drm/i915/tgl: Gen-12 render decompression") Reviewed-by: Juha-Pekka Heikkila Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20221003111544.8007-2-ville.syrjala@linux.intel.com (cherry picked from commit a89a96a586114f67598c6391c75678b4dba5c2da) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/skl_watermark.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/skl_watermark.c b/drivers/gpu/drm/i915/display/skl_watermark.c index 01b0932757ed7..132baada3e115 100644 --- a/drivers/gpu/drm/i915/display/skl_watermark.c +++ b/drivers/gpu/drm/i915/display/skl_watermark.c @@ -1710,10 +1710,12 @@ skl_compute_wm_params(const struct intel_crtc_state *crtc_state, modifier == I915_FORMAT_MOD_4_TILED || modifier == I915_FORMAT_MOD_Yf_TILED || modifier == I915_FORMAT_MOD_Y_TILED_CCS || - modifier == I915_FORMAT_MOD_Yf_TILED_CCS; + modifier == I915_FORMAT_MOD_Yf_TILED_CCS || + modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS; wp->x_tiled = modifier == I915_FORMAT_MOD_X_TILED; wp->rc_surface = modifier == I915_FORMAT_MOD_Y_TILED_CCS || - modifier == I915_FORMAT_MOD_Yf_TILED_CCS; + modifier == I915_FORMAT_MOD_Yf_TILED_CCS || + modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS; wp->is_planar = intel_format_info_is_yuv_semiplanar(format, modifier); wp->width = width; -- GitLab From 484b2b9281000274ef7c5cb0a9ebc5da6f5c281c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 3 Oct 2022 14:15:40 +0300 Subject: [PATCH 0863/2223] drm/i915: Fix watermark calculations for gen12+ MC CCS modifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Take the gen12+ MC CCS modifier into account when calculating the watermarks. Otherwise we'll calculate the watermarks thinking this Y-tiled modifier is linear. 
The rc_surface part is actually a nop since that is not used for any glk+ platform. v2: Split RC CCS vs. MC CCS to separate patches Cc: stable@vger.kernel.org Fixes: 2dfbf9d2873a ("drm/i915/tgl: Gen-12 display can decompress surfaces compressed by the media engine") Reviewed-by: Juha-Pekka Heikkila Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20221003111544.8007-3-ville.syrjala@linux.intel.com (cherry picked from commit 91c9651425fe955b1387f3637607dda005f3f710) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/skl_watermark.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/skl_watermark.c b/drivers/gpu/drm/i915/display/skl_watermark.c index 132baada3e115..49fc5e2b56fd1 100644 --- a/drivers/gpu/drm/i915/display/skl_watermark.c +++ b/drivers/gpu/drm/i915/display/skl_watermark.c @@ -1711,11 +1711,13 @@ skl_compute_wm_params(const struct intel_crtc_state *crtc_state, modifier == I915_FORMAT_MOD_Yf_TILED || modifier == I915_FORMAT_MOD_Y_TILED_CCS || modifier == I915_FORMAT_MOD_Yf_TILED_CCS || - modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS; + modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS || + modifier == I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS; wp->x_tiled = modifier == I915_FORMAT_MOD_X_TILED; wp->rc_surface = modifier == I915_FORMAT_MOD_Y_TILED_CCS || modifier == I915_FORMAT_MOD_Yf_TILED_CCS || - modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS; + modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS || + modifier == I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS; wp->is_planar = intel_format_info_is_yuv_semiplanar(format, modifier); wp->width = width; -- GitLab From 070a2855900de17b1e11a0dc35af9794e80f1a28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 3 Oct 2022 14:15:41 +0300 Subject: [PATCH 0864/2223] drm/i915: Fix watermark calculations for gen12+ CCS+CC modifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Take the gen12+ CCS+CC modifier into account when calculating the watermarks. Otherwise we'll calculate the watermarks thinking this Y-tiled modifier is linear. The rc_surface part is actually a nop since that is not used for any glk+ platform. Cc: stable@vger.kernel.org Fixes: d1e2775e9b96 ("drm/i915/tgl: Add Clear Color support for TGL Render Decompression") Reviewed-by: Juha-Pekka Heikkila Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20221003111544.8007-4-ville.syrjala@linux.intel.com (cherry picked from commit a627455bbe50a111475d7a42beb58fa64bd96c83) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/skl_watermark.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/skl_watermark.c b/drivers/gpu/drm/i915/display/skl_watermark.c index 49fc5e2b56fd1..3676662897e73 100644 --- a/drivers/gpu/drm/i915/display/skl_watermark.c +++ b/drivers/gpu/drm/i915/display/skl_watermark.c @@ -1712,12 +1712,14 @@ skl_compute_wm_params(const struct intel_crtc_state *crtc_state, modifier == I915_FORMAT_MOD_Y_TILED_CCS || modifier == I915_FORMAT_MOD_Yf_TILED_CCS || modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS || - modifier == I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS; + modifier == I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS || + modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC; wp->x_tiled = modifier == I915_FORMAT_MOD_X_TILED; wp->rc_surface = modifier == I915_FORMAT_MOD_Y_TILED_CCS || modifier == I915_FORMAT_MOD_Yf_TILED_CCS || modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS || - modifier == I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS; + modifier == I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS || + modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC; wp->is_planar = intel_format_info_is_yuv_semiplanar(format, modifier); wp->width = width; -- GitLab From ccfa6d35f9233702c924316cdf40c05b6ce88113 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: 
Mon, 3 Oct 2022 14:15:42 +0300 Subject: [PATCH 0865/2223] drm/i915: Fix watermark calculations for DG2 CCS modifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Take the DG2 CCS modifiers into account when calculating the watermarks. Otherwise we'll calculate the watermarks thinking these tile-4 modifiers are linear. The rc_surface part is actually a nop since that is not used for any glk+ platform. Cc: stable@vger.kernel.org Fixes: 4c3afa72138c ("drm/i915/dg2: Add support for DG2 render and media compression") Reviewed-by: Juha-Pekka Heikkila Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20221003111544.8007-5-ville.syrjala@linux.intel.com (cherry picked from commit f25d9f81a8e09ace4f04106995550bae1f522143) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/skl_watermark.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/skl_watermark.c b/drivers/gpu/drm/i915/display/skl_watermark.c index 3676662897e73..a120d49b95ca3 100644 --- a/drivers/gpu/drm/i915/display/skl_watermark.c +++ b/drivers/gpu/drm/i915/display/skl_watermark.c @@ -1713,13 +1713,17 @@ skl_compute_wm_params(const struct intel_crtc_state *crtc_state, modifier == I915_FORMAT_MOD_Yf_TILED_CCS || modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS || modifier == I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS || - modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC; + modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC || + modifier == I915_FORMAT_MOD_4_TILED_DG2_RC_CCS || + modifier == I915_FORMAT_MOD_4_TILED_DG2_MC_CCS; wp->x_tiled = modifier == I915_FORMAT_MOD_X_TILED; wp->rc_surface = modifier == I915_FORMAT_MOD_Y_TILED_CCS || modifier == I915_FORMAT_MOD_Yf_TILED_CCS || modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS || modifier == I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS || - modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC; + modifier ==
I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC || + modifier == I915_FORMAT_MOD_4_TILED_DG2_RC_CCS || + modifier == I915_FORMAT_MOD_4_TILED_DG2_MC_CCS; wp->is_planar = intel_format_info_is_yuv_semiplanar(format, modifier); wp->width = width; -- GitLab From b2e3a1af8cce4117de06ff1a4eab0749753ede27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 3 Oct 2022 14:15:43 +0300 Subject: [PATCH 0866/2223] drm/i915: Fix watermark calculations for DG2 CCS+CC modifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Take the DG2 CCS+CC modifier into account when calculating the watermarks. Otherwise we'll calculate the watermarks thinking this tile-4 modifier is linear. The rc_surface part is actually a nop since that is not used for any glk+ platform. Cc: stable@vger.kernel.org Fixes: 680025dcc400 ("drm/i915/dg2: Add support for DG2 clear color compression") Reviewed-by: Juha-Pekka Heikkila Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20221003111544.8007-6-ville.syrjala@linux.intel.com (cherry picked from commit 334810f82024815283a6e7febd3d2de1fed6c232) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/skl_watermark.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/skl_watermark.c b/drivers/gpu/drm/i915/display/skl_watermark.c index a120d49b95ca3..18178b01375e4 100644 --- a/drivers/gpu/drm/i915/display/skl_watermark.c +++ b/drivers/gpu/drm/i915/display/skl_watermark.c @@ -1715,7 +1715,8 @@ skl_compute_wm_params(const struct intel_crtc_state *crtc_state, modifier == I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS || modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC || modifier == I915_FORMAT_MOD_4_TILED_DG2_RC_CCS || - modifier == I915_FORMAT_MOD_4_TILED_DG2_MC_CCS; + modifier == I915_FORMAT_MOD_4_TILED_DG2_MC_CCS || + modifier == I915_FORMAT_MOD_4_TILED_DG2_RC_CCS_CC; wp->x_tiled = modifier == I915_FORMAT_MOD_X_TILED;
wp->rc_surface = modifier == I915_FORMAT_MOD_Y_TILED_CCS || modifier == I915_FORMAT_MOD_Yf_TILED_CCS || @@ -1723,7 +1724,8 @@ skl_compute_wm_params(const struct intel_crtc_state *crtc_state, modifier == I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS || modifier == I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC || modifier == I915_FORMAT_MOD_4_TILED_DG2_RC_CCS || - modifier == I915_FORMAT_MOD_4_TILED_DG2_MC_CCS; + modifier == I915_FORMAT_MOD_4_TILED_DG2_MC_CCS || + modifier == I915_FORMAT_MOD_4_TILED_DG2_RC_CCS_CC; wp->is_planar = intel_format_info_is_yuv_semiplanar(format, modifier); wp->width = width; -- GitLab From cdf6428dd518435a05739abf7659589de30970f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Tue, 27 Sep 2022 21:24:55 +0300 Subject: [PATCH 0867/2223] drm/i915: Reject excessive dotclocks early MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure modes with crazy big dotclocks are rejected early, so as to not cause problems for subsequent code via integer overflows and whatnot. These would eventually be rejected in intel_crtc_compute_pipe_mode() but that is now too late as we do the clock computations a bit earlier than that. And we don't want to just reorder the two since we still want to check the final computed dotclock against the hardware limit to make sure we didn't end up above the limit due to rounding/etc. 
Fixes: 0ff0e219d9b8 ("drm/i915: Compute clocks earlier") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220927182455.3422-1-ville.syrjala@linux.intel.com Reviewed-by: Jani Nikula (cherry picked from commit df2f59c5857b56a5cc40b6562b032c5d8d50cdfc) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_display.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index accf5311b664f..f0063e0d4ed3d 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -8130,6 +8130,17 @@ static void intel_setup_outputs(struct drm_i915_private *dev_priv) drm_helper_move_panel_connectors_to_head(&dev_priv->drm); } +static int max_dotclock(struct drm_i915_private *i915) +{ + int max_dotclock = i915->max_dotclk_freq; + + /* icl+ might use bigjoiner */ + if (DISPLAY_VER(i915) >= 11) + max_dotclock *= 2; + + return max_dotclock; +} + static enum drm_mode_status intel_mode_valid(struct drm_device *dev, const struct drm_display_mode *mode) @@ -8167,6 +8178,13 @@ intel_mode_valid(struct drm_device *dev, DRM_MODE_FLAG_CLKDIV2)) return MODE_BAD; + /* + * Reject clearly excessive dotclocks early to + * avoid having to worry about huge integers later. + */ + if (mode->clock > max_dotclock(dev_priv)) + return MODE_CLOCK_HIGH; + /* Transcoder timing limits */ if (DISPLAY_VER(dev_priv) >= 11) { hdisplay_max = 16384; -- GitLab From 2ad4b6f5e1179f3879b6d4392070039e32ce55a3 Mon Sep 17 00:00:00 2001 From: Mike Marshall Date: Mon, 3 Oct 2022 13:05:38 -0400 Subject: [PATCH 0868/2223] Orangefs: change iterate to iterate_shared Changed .iterate to .iterate_shared in orangefs_dir_operations. I didn't change anything else, there were no xfstests regressions and no problem with any of my other tests... 
Signed-off-by: Mike Marshall --- fs/orangefs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index e2c2699d80162..9cacce5d55c1b 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -398,7 +398,7 @@ static int orangefs_dir_release(struct inode *inode, struct file *file) const struct file_operations orangefs_dir_operations = { .llseek = orangefs_dir_llseek, .read = generic_read_dir, - .iterate = orangefs_dir_iterate, + .iterate_shared = orangefs_dir_iterate, .open = orangefs_dir_open, .release = orangefs_dir_release }; -- GitLab From 7e736b8e36ff87080890690670c90c91d6d80091 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:45 +0800 Subject: [PATCH 0869/2223] mm: introduce common struct mm_slot Patch series "add common struct mm_slot and use it in THP and KSM", v2. At present, both THP and KSM module have similar structures mm_slot for organizing and recording the information required for scanning mm, and each defines the following exactly the same operation functions: - alloc_mm_slot - free_mm_slot - get_mm_slot - insert_to_mm_slots_hash In order to de-duplicate these codes, this patchset introduces a common struct mm_slot, and lets THP and KSM use it. This patch (of 7): At present, both THP and KSM module have similar structures mm_slot for organizing and recording the information required for scanning mm, and each defines the following exactly the same operation functions: - alloc_mm_slot - free_mm_slot - get_mm_slot - insert_to_mm_slots_hash In order to de-duplicate these codes, this patch introduces a common struct mm_slot, and subsequent patches will let THP and KSM use it.
Link: https://lkml.kernel.org/r/20220831031951.43152-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20220831031951.43152-2-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/mm_slot.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 mm/mm_slot.h diff --git a/mm/mm_slot.h b/mm/mm_slot.h new file mode 100644 index 0000000000000..83f18ed1c4bde --- /dev/null +++ b/mm/mm_slot.h @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef _LINUX_MM_SLOT_H +#define _LINUX_MM_SLOT_H + +#include +#include + +/* + * struct mm_slot - hash lookup from mm to mm_slot + * @hash: link to the mm_slots hash list + * @mm_node: link into the mm_slots list + * @mm: the mm that this information is valid for + */ +struct mm_slot { + struct hlist_node hash; + struct list_head mm_node; + struct mm_struct *mm; +}; + +#define mm_slot_entry(ptr, type, member) \ + container_of(ptr, type, member) + +static inline void *mm_slot_alloc(struct kmem_cache *cache) +{ + if (!cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(cache, GFP_KERNEL); +} + +static inline void mm_slot_free(struct kmem_cache *cache, void *objp) +{ + kmem_cache_free(cache, objp); +} + +#define mm_slot_lookup(_hashtable, _mm) \ +({ \ + struct mm_slot *tmp_slot, *mm_slot = NULL; \ + \ + hash_for_each_possible(_hashtable, tmp_slot, hash, (unsigned long)_mm) \ + if (_mm == tmp_slot->mm) { \ + mm_slot = tmp_slot; \ + break; \ + } \ + \ + mm_slot; \ +}) + +#define mm_slot_insert(_hashtable, _mm, _mm_slot) \ +({ \ + _mm_slot->mm = _mm; \ + hash_add(_hashtable, &_mm_slot->hash, (unsigned long)_mm); \ +}) + +#endif /* _LINUX_MM_SLOT_H */ -- GitLab From b26e27015ec9a47eed3c960b7e3065c8ba8d16d7 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:46 +0800 Subject: [PATCH 
0870/2223] mm: thp: convert to use common struct mm_slot Rename private struct mm_slot to struct khugepaged_mm_slot and convert to use common struct mm_slot with no functional change. [zhengqi.arch@bytedance.com: fix build error with CONFIG_SHMEM disabled] Link: https://lkml.kernel.org/r/639fa8d5-8e5b-2333-69dc-40ed46219364@bytedance.com Link: https://lkml.kernel.org/r/20220831031951.43152-3-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/khugepaged.c | 123 ++++++++++++++++++++---------------------------- 1 file changed, 52 insertions(+), 71 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7c13d65aeb14e..1e59fe7bfae36 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -23,6 +23,7 @@ #include #include #include "internal.h" +#include "mm_slot.h" enum scan_result { SCAN_FAIL, @@ -99,17 +100,13 @@ struct collapse_control { }; /** - * struct mm_slot - hash lookup from mm to mm_slot - * @hash: hash collision list - * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head - * @mm: the mm that this information is valid for + * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned + * @slot: hash lookup from mm to mm_slot * @nr_pte_mapped_thp: number of pte mapped THP * @pte_mapped_thp: address array corresponding pte mapped THP */ -struct mm_slot { - struct hlist_node hash; - struct list_head mm_node; - struct mm_struct *mm; +struct khugepaged_mm_slot { + struct mm_slot slot; /* pte-mapped THP in this mm */ int nr_pte_mapped_thp; @@ -126,7 +123,7 @@ struct mm_slot { */ struct khugepaged_scan { struct list_head mm_head; - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; unsigned long address; }; @@ -390,8 +387,9 @@ int hugepage_madvise(struct vm_area_struct *vma, int __init khugepaged_init(void) { mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", - 
sizeof(struct mm_slot), - __alignof__(struct mm_slot), 0, NULL); + sizeof(struct khugepaged_mm_slot), + __alignof__(struct khugepaged_mm_slot), + 0, NULL); if (!mm_slot_cache) return -ENOMEM; @@ -408,36 +406,6 @@ void __init khugepaged_destroy(void) kmem_cache_destroy(mm_slot_cache); } -static inline struct mm_slot *alloc_mm_slot(void) -{ - if (!mm_slot_cache) /* initialization failed */ - return NULL; - return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); -} - -static inline void free_mm_slot(struct mm_slot *mm_slot) -{ - kmem_cache_free(mm_slot_cache, mm_slot); -} - -static struct mm_slot *get_mm_slot(struct mm_struct *mm) -{ - struct mm_slot *mm_slot; - - hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) - if (mm == mm_slot->mm) - return mm_slot; - - return NULL; -} - -static void insert_to_mm_slots_hash(struct mm_struct *mm, - struct mm_slot *mm_slot) -{ - mm_slot->mm = mm; - hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); -} - static inline int hpage_collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; @@ -445,28 +413,31 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) void __khugepaged_enter(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; int wakeup; - mm_slot = alloc_mm_slot(); + mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return; + slot = &mm_slot->slot; + /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); return; } spin_lock(&khugepaged_mm_lock); - insert_to_mm_slots_hash(mm, mm_slot); + mm_slot_insert(mm_slots_hash, mm, slot); /* * Insert just behind the scanning cursor, to let the area settle * down a little. 
*/ wakeup = list_empty(&khugepaged_scan.mm_head); - list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head); spin_unlock(&khugepaged_mm_lock); mmgrab(mm); @@ -486,21 +457,23 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, void __khugepaged_exit(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; int free = 0; spin_lock(&khugepaged_mm_lock); - mm_slot = get_mm_slot(mm); + slot = mm_slot_lookup(mm_slots_hash, mm); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&slot->hash); + list_del(&slot->mm_node); free = 1; } spin_unlock(&khugepaged_mm_lock); if (free) { clear_bit(MMF_VM_HUGEPAGE, &mm->flags); - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } else if (mm_slot) { /* @@ -1318,16 +1291,17 @@ out: return result; } -static void collect_mm_slot(struct mm_slot *mm_slot) +static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) { - struct mm_struct *mm = mm_slot->mm; + struct mm_slot *slot = &mm_slot->slot; + struct mm_struct *mm = slot->mm; lockdep_assert_held(&khugepaged_mm_lock); if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&slot->hash); + list_del(&slot->mm_node); /* * Not strictly needed because the mm exited already. 
@@ -1336,7 +1310,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) */ /* khugepaged_mm_lock actually not necessary for the below */ - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } } @@ -1349,12 +1323,14 @@ static void collect_mm_slot(struct mm_slot *mm_slot) static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; VM_BUG_ON(addr & ~HPAGE_PMD_MASK); spin_lock(&khugepaged_mm_lock); - mm_slot = get_mm_slot(mm); + slot = mm_slot_lookup(mm_slots_hash, mm); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; spin_unlock(&khugepaged_mm_lock); @@ -1486,9 +1462,10 @@ abort: goto drop_hpage; } -static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) { - struct mm_struct *mm = mm_slot->mm; + struct mm_slot *slot = &mm_slot->slot; + struct mm_struct *mm = slot->mm; int i; if (likely(mm_slot->nr_pte_mapped_thp == 0)) @@ -2040,7 +2017,7 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, BUILD_BUG(); } -static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) { } #endif @@ -2051,7 +2028,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, __acquires(&khugepaged_mm_lock) { struct vma_iterator vmi; - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int progress = 0; @@ -2060,18 +2038,20 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, lockdep_assert_held(&khugepaged_mm_lock); *result = SCAN_FAIL; - if 
(khugepaged_scan.mm_slot) + if (khugepaged_scan.mm_slot) { mm_slot = khugepaged_scan.mm_slot; - else { - mm_slot = list_entry(khugepaged_scan.mm_head.next, + slot = &mm_slot->slot; + } else { + slot = list_entry(khugepaged_scan.mm_head.next, struct mm_slot, mm_node); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; khugepaged_scan.mm_slot = mm_slot; } spin_unlock(&khugepaged_mm_lock); khugepaged_collapse_pte_mapped_thps(mm_slot); - mm = mm_slot->mm; + mm = slot->mm; /* * Don't wait for semaphore (to avoid long wait times). Just move to * the next mm on the list. @@ -2166,10 +2146,11 @@ breakouterloop_mmap_lock: * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. */ - if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { - khugepaged_scan.mm_slot = list_entry( - mm_slot->mm_node.next, - struct mm_slot, mm_node); + if (slot->mm_node.next != &khugepaged_scan.mm_head) { + slot = list_entry(slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.mm_slot = + mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; } else { khugepaged_scan.mm_slot = NULL; @@ -2264,7 +2245,7 @@ static void khugepaged_wait_work(void) static int khugepaged(void *none) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; set_freezable(); set_user_nice(current, MAX_NICE); -- GitLab From 79e1119b7e0099c6c9379ca3129ffb7aa2a1c249 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:47 +0800 Subject: [PATCH 0871/2223] ksm: remove redundant declarations in ksm.h Currently, for struct stable_node, no one uses it in both the include/linux/ksm.h file and the file that contains it. For struct mem_cgroup, it's also not used in ksm.h. So they're all redundant, just remove them. 
Link: https://lkml.kernel.org/r/20220831031951.43152-4-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/ksm.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 0b4f17418f64c..7e232ba59b865 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -15,9 +15,6 @@ #include #include -struct stable_node; -struct mem_cgroup; - #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); -- GitLab From 21fbd59136e0773e0b920371860d9b6757cdb250 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:48 +0800 Subject: [PATCH 0872/2223] ksm: add the ksm prefix to the names of the ksm private structures In order to prevent the name of the private structure of ksm from being the same as the name of the common structure used in subsequent patches, prefix their names with ksm in advance. Link: https://lkml.kernel.org/r/20220831031951.43152-5-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- Documentation/mm/ksm.rst | 2 +- mm/ksm.c | 216 +++++++++++++++++++-------------------- 2 files changed, 109 insertions(+), 109 deletions(-) diff --git a/Documentation/mm/ksm.rst b/Documentation/mm/ksm.rst index 9e37add068e64..f83cfbc12f4ca 100644 --- a/Documentation/mm/ksm.rst +++ b/Documentation/mm/ksm.rst @@ -26,7 +26,7 @@ tree. If a KSM page is shared between less than ``max_page_sharing`` VMAs, the node of the stable tree that represents such KSM page points to a -list of struct rmap_item and the ``page->mapping`` of the +list of struct ksm_rmap_item and the ``page->mapping`` of the KSM page points to the stable tree node. 
When the sharing passes this threshold, KSM adds a second dimension to diff --git a/mm/ksm.c b/mm/ksm.c index 0cd2f4b623345..de61946106ce4 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -82,7 +82,7 @@ * different KSM page copy of that content * * Internally, the regular nodes, "dups" and "chains" are represented - * using the same struct stable_node structure. + * using the same struct ksm_stable_node structure. * * In addition to the stable tree, KSM uses a second data structure called the * unstable tree: this tree holds pointers to pages which have been found to @@ -112,16 +112,16 @@ */ /** - * struct mm_slot - ksm information per mm that is being scanned + * struct ksm_mm_slot - ksm information per mm that is being scanned * @link: link to the mm_slots hash list * @mm_list: link into the mm_slots list, rooted in ksm_mm_head * @rmap_list: head for this mm_slot's singly-linked list of rmap_items * @mm: the mm that this information is valid for */ -struct mm_slot { +struct ksm_mm_slot { struct hlist_node link; struct list_head mm_list; - struct rmap_item *rmap_list; + struct ksm_rmap_item *rmap_list; struct mm_struct *mm; }; @@ -135,14 +135,14 @@ struct mm_slot { * There is only the one ksm_scan instance of this cursor structure. 
*/ struct ksm_scan { - struct mm_slot *mm_slot; + struct ksm_mm_slot *mm_slot; unsigned long address; - struct rmap_item **rmap_list; + struct ksm_rmap_item **rmap_list; unsigned long seqnr; }; /** - * struct stable_node - node of the stable rbtree + * struct ksm_stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list * @hlist_dup: linked into the stable_node->hlist with a stable_node chain @@ -153,7 +153,7 @@ struct ksm_scan { * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ -struct stable_node { +struct ksm_stable_node { union { struct rb_node node; /* when node of stable tree */ struct { /* when listed for migration */ @@ -182,7 +182,7 @@ struct stable_node { }; /** - * struct rmap_item - reverse mapping item for virtual addresses + * struct ksm_rmap_item - reverse mapping item for virtual addresses * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree * @nid: NUMA node id of unstable tree in which linked (may not match page) @@ -193,8 +193,8 @@ struct stable_node { * @head: pointer to stable_node heading this list in the stable tree * @hlist: link into hlist of rmap_items hanging off that stable_node */ -struct rmap_item { - struct rmap_item *rmap_list; +struct ksm_rmap_item { + struct ksm_rmap_item *rmap_list; union { struct anon_vma *anon_vma; /* when stable */ #ifdef CONFIG_NUMA @@ -207,7 +207,7 @@ struct rmap_item { union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ - struct stable_node *head; + struct ksm_stable_node *head; struct hlist_node hlist; }; }; @@ -230,7 +230,7 @@ static LIST_HEAD(migrate_nodes); #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); -static 
struct mm_slot ksm_mm_head = { +static struct ksm_mm_slot ksm_mm_head = { .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), }; static struct ksm_scan ksm_scan = { @@ -298,21 +298,21 @@ static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); static DEFINE_MUTEX(ksm_thread_mutex); static DEFINE_SPINLOCK(ksm_mmlist_lock); -#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\ +#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\ sizeof(struct __struct), __alignof__(struct __struct),\ (__flags), NULL) static int __init ksm_slab_init(void) { - rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); + rmap_item_cache = KSM_KMEM_CACHE(ksm_rmap_item, 0); if (!rmap_item_cache) goto out; - stable_node_cache = KSM_KMEM_CACHE(stable_node, 0); + stable_node_cache = KSM_KMEM_CACHE(ksm_stable_node, 0); if (!stable_node_cache) goto out_free1; - mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); + mm_slot_cache = KSM_KMEM_CACHE(ksm_mm_slot, 0); if (!mm_slot_cache) goto out_free2; @@ -334,18 +334,18 @@ static void __init ksm_slab_free(void) mm_slot_cache = NULL; } -static __always_inline bool is_stable_node_chain(struct stable_node *chain) +static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain) { return chain->rmap_hlist_len == STABLE_NODE_CHAIN; } -static __always_inline bool is_stable_node_dup(struct stable_node *dup) +static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup) { return dup->head == STABLE_NODE_DUP_HEAD; } -static inline void stable_node_chain_add_dup(struct stable_node *dup, - struct stable_node *chain) +static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup, + struct ksm_stable_node *chain) { VM_BUG_ON(is_stable_node_dup(dup)); dup->head = STABLE_NODE_DUP_HEAD; @@ -354,14 +354,14 @@ static inline void stable_node_chain_add_dup(struct stable_node *dup, ksm_stable_node_dups++; } -static inline void __stable_node_dup_del(struct stable_node *dup) +static inline void 
__stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(!is_stable_node_dup(dup)); hlist_del(&dup->hlist_dup); ksm_stable_node_dups--; } -static inline void stable_node_dup_del(struct stable_node *dup) +static inline void stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(is_stable_node_chain(dup)); if (is_stable_node_dup(dup)) @@ -373,9 +373,9 @@ static inline void stable_node_dup_del(struct stable_node *dup) #endif } -static inline struct rmap_item *alloc_rmap_item(void) +static inline struct ksm_rmap_item *alloc_rmap_item(void) { - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); @@ -384,7 +384,7 @@ static inline struct rmap_item *alloc_rmap_item(void) return rmap_item; } -static inline void free_rmap_item(struct rmap_item *rmap_item) +static inline void free_rmap_item(struct ksm_rmap_item *rmap_item) { ksm_rmap_items--; rmap_item->mm->ksm_rmap_items--; @@ -392,7 +392,7 @@ static inline void free_rmap_item(struct rmap_item *rmap_item) kmem_cache_free(rmap_item_cache, rmap_item); } -static inline struct stable_node *alloc_stable_node(void) +static inline struct ksm_stable_node *alloc_stable_node(void) { /* * The allocation can take too long with GFP_KERNEL when memory is under @@ -402,28 +402,28 @@ static inline struct stable_node *alloc_stable_node(void) return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH); } -static inline void free_stable_node(struct stable_node *stable_node) +static inline void free_stable_node(struct ksm_stable_node *stable_node) { VM_BUG_ON(stable_node->rmap_hlist_len && !is_stable_node_chain(stable_node)); kmem_cache_free(stable_node_cache, stable_node); } -static inline struct mm_slot *alloc_mm_slot(void) +static inline struct ksm_mm_slot *alloc_mm_slot(void) { if (!mm_slot_cache) /* initialization failed */ return NULL; return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); } -static inline void 
free_mm_slot(struct mm_slot *mm_slot) +static inline void free_mm_slot(struct ksm_mm_slot *mm_slot) { kmem_cache_free(mm_slot_cache, mm_slot); } -static struct mm_slot *get_mm_slot(struct mm_struct *mm) +static struct ksm_mm_slot *get_mm_slot(struct mm_struct *mm) { - struct mm_slot *slot; + struct ksm_mm_slot *slot; hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm) if (slot->mm == mm) @@ -433,7 +433,7 @@ static struct mm_slot *get_mm_slot(struct mm_struct *mm) } static void insert_to_mm_slots_hash(struct mm_struct *mm, - struct mm_slot *mm_slot) + struct ksm_mm_slot *mm_slot) { mm_slot->mm = mm; hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); @@ -529,7 +529,7 @@ static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, return vma; } -static void break_cow(struct rmap_item *rmap_item) +static void break_cow(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; @@ -548,7 +548,7 @@ static void break_cow(struct rmap_item *rmap_item) mmap_read_unlock(mm); } -static struct page *get_mergeable_page(struct rmap_item *rmap_item) +static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; @@ -589,10 +589,10 @@ static inline int get_kpfn_nid(unsigned long kpfn) return ksm_merge_across_nodes ? 
0 : NUMA(pfn_to_nid(kpfn)); } -static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, +static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup, struct rb_root *root) { - struct stable_node *chain = alloc_stable_node(); + struct ksm_stable_node *chain = alloc_stable_node(); VM_BUG_ON(is_stable_node_chain(dup)); if (likely(chain)) { INIT_HLIST_HEAD(&chain->hlist); @@ -622,7 +622,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, return chain; } -static inline void free_stable_node_chain(struct stable_node *chain, +static inline void free_stable_node_chain(struct ksm_stable_node *chain, struct rb_root *root) { rb_erase(&chain->node, root); @@ -630,9 +630,9 @@ static inline void free_stable_node_chain(struct stable_node *chain, ksm_stable_node_chains--; } -static void remove_node_from_stable_tree(struct stable_node *stable_node) +static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node) { - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; /* check it's not STABLE_NODE_CHAIN or negative */ BUG_ON(stable_node->rmap_hlist_len < 0); @@ -694,7 +694,7 @@ enum get_ksm_page_flags { * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. */ -static struct page *get_ksm_page(struct stable_node *stable_node, +static struct page *get_ksm_page(struct ksm_stable_node *stable_node, enum get_ksm_page_flags flags) { struct page *page; @@ -773,10 +773,10 @@ stale: * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. 
*/ -static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) +static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item) { if (rmap_item->address & STABLE_FLAG) { - struct stable_node *stable_node; + struct ksm_stable_node *stable_node; struct page *page; stable_node = rmap_item->head; @@ -823,10 +823,10 @@ out: cond_resched(); /* we're called from many long loops */ } -static void remove_trailing_rmap_items(struct rmap_item **rmap_list) +static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) { while (*rmap_list) { - struct rmap_item *rmap_item = *rmap_list; + struct ksm_rmap_item *rmap_item = *rmap_list; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); @@ -863,18 +863,18 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, return err; } -static inline struct stable_node *folio_stable_node(struct folio *folio) +static inline struct ksm_stable_node *folio_stable_node(struct folio *folio) { return folio_test_ksm(folio) ? 
folio_raw_mapping(folio) : NULL; } -static inline struct stable_node *page_stable_node(struct page *page) +static inline struct ksm_stable_node *page_stable_node(struct page *page) { return folio_stable_node(page_folio(page)); } static inline void set_page_stable_node(struct page *page, - struct stable_node *stable_node) + struct ksm_stable_node *stable_node) { VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page); page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); @@ -884,7 +884,7 @@ static inline void set_page_stable_node(struct page *page, /* * Only called through the sysfs control interface: */ -static int remove_stable_node(struct stable_node *stable_node) +static int remove_stable_node(struct ksm_stable_node *stable_node) { struct page *page; int err; @@ -922,10 +922,10 @@ static int remove_stable_node(struct stable_node *stable_node) return err; } -static int remove_stable_node_chain(struct stable_node *stable_node, +static int remove_stable_node_chain(struct ksm_stable_node *stable_node, struct rb_root *root) { - struct stable_node *dup; + struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { @@ -949,14 +949,14 @@ static int remove_stable_node_chain(struct stable_node *stable_node, static int remove_all_stable_nodes(void) { - struct stable_node *stable_node, *next; + struct ksm_stable_node *stable_node, *next; int nid; int err = 0; for (nid = 0; nid < ksm_nr_node_ids; nid++) { while (root_stable_tree[nid].rb_node) { stable_node = rb_entry(root_stable_tree[nid].rb_node, - struct stable_node, node); + struct ksm_stable_node, node); if (remove_stable_node_chain(stable_node, root_stable_tree + nid)) { err = -EBUSY; @@ -975,14 +975,14 @@ static int remove_all_stable_nodes(void) static int unmerge_and_remove_all_rmap_items(void) { - struct mm_slot *mm_slot; + struct ksm_mm_slot *mm_slot; struct mm_struct *mm; struct vm_area_struct *vma; int err = 0; spin_lock(&ksm_mmlist_lock); 
ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next, - struct mm_slot, mm_list); + struct ksm_mm_slot, mm_list); spin_unlock(&ksm_mmlist_lock); for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; @@ -1007,7 +1007,7 @@ static int unmerge_and_remove_all_rmap_items(void) spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, - struct mm_slot, mm_list); + struct ksm_mm_slot, mm_list); if (ksm_test_exit(mm)) { hash_del(&mm_slot->link); list_del(&mm_slot->mm_list); @@ -1295,7 +1295,7 @@ out: * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ -static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, +static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item, struct page *page, struct page *kpage) { struct mm_struct *mm = rmap_item->mm; @@ -1332,9 +1332,9 @@ out: * Note that this function upgrades page to ksm page: if one of the pages * is already a ksm page, try_to_merge_with_ksm_page should be used. */ -static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, +static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, struct page *page, - struct rmap_item *tree_rmap_item, + struct ksm_rmap_item *tree_rmap_item, struct page *tree_page) { int err; @@ -1354,7 +1354,7 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, } static __always_inline -bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset) +bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset) { VM_BUG_ON(stable_node->rmap_hlist_len < 0); /* @@ -1368,17 +1368,17 @@ bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset) } static __always_inline -bool is_page_sharing_candidate(struct stable_node *stable_node) +bool is_page_sharing_candidate(struct ksm_stable_node *stable_node) { return __is_page_sharing_candidate(stable_node, 0); } -static struct page *stable_node_dup(struct stable_node **_stable_node_dup, - 
struct stable_node **_stable_node, +static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup, + struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { - struct stable_node *dup, *found = NULL, *stable_node = *_stable_node; + struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; struct page *_tree_page, *tree_page = NULL; int nr = 0; @@ -1492,7 +1492,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup, return tree_page; } -static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, +static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node, struct rb_root *root) { if (!is_stable_node_chain(stable_node)) @@ -1519,12 +1519,12 @@ static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, * function and will be overwritten in all cases, the caller doesn't * need to initialize it. */ -static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, - struct stable_node **_stable_node, +static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, + struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { - struct stable_node *stable_node = *_stable_node; + struct ksm_stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { if (is_page_sharing_candidate(stable_node)) { *_stable_node_dup = stable_node; @@ -1541,18 +1541,18 @@ static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, prune_stale_stable_nodes); } -static __always_inline struct page *chain_prune(struct stable_node **s_n_d, - struct stable_node **s_n, +static __always_inline struct page *chain_prune(struct ksm_stable_node **s_n_d, + struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, true); } -static __always_inline struct page *chain(struct 
stable_node **s_n_d, - struct stable_node *s_n, +static __always_inline struct page *chain(struct ksm_stable_node **s_n_d, + struct ksm_stable_node *s_n, struct rb_root *root) { - struct stable_node *old_stable_node = s_n; + struct ksm_stable_node *old_stable_node = s_n; struct page *tree_page; tree_page = __stable_node_chain(s_n_d, &s_n, root, false); @@ -1576,8 +1576,8 @@ static struct page *stable_tree_search(struct page *page) struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node, *stable_node_dup, *stable_node_any; - struct stable_node *page_node; + struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; + struct ksm_stable_node *page_node; page_node = page_stable_node(page); if (page_node && page_node->head != &migrate_nodes) { @@ -1597,7 +1597,7 @@ again: int ret; cond_resched(); - stable_node = rb_entry(*new, struct stable_node, node); + stable_node = rb_entry(*new, struct ksm_stable_node, node); stable_node_any = NULL; tree_page = chain_prune(&stable_node_dup, &stable_node, root); /* @@ -1820,14 +1820,14 @@ chain_append: * This function returns the stable tree node just allocated on success, * NULL otherwise. 
*/ -static struct stable_node *stable_tree_insert(struct page *kpage) +static struct ksm_stable_node *stable_tree_insert(struct page *kpage) { int nid; unsigned long kpfn; struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node, *stable_node_dup, *stable_node_any; + struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; bool need_chain = false; kpfn = page_to_pfn(kpage); @@ -1842,7 +1842,7 @@ again: int ret; cond_resched(); - stable_node = rb_entry(*new, struct stable_node, node); + stable_node = rb_entry(*new, struct ksm_stable_node, node); stable_node_any = NULL; tree_page = chain(&stable_node_dup, stable_node, root); if (!stable_node_dup) { @@ -1911,7 +1911,7 @@ again: rb_insert_color(&stable_node_dup->node, root); } else { if (!is_stable_node_chain(stable_node)) { - struct stable_node *orig = stable_node; + struct ksm_stable_node *orig = stable_node; /* chain is missing so create it */ stable_node = alloc_stable_node_chain(orig, root); if (!stable_node) { @@ -1940,7 +1940,7 @@ again: * the same walking algorithm in an rbtree. */ static -struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, +struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item, struct page *page, struct page **tree_pagep) { @@ -1954,12 +1954,12 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, new = &root->rb_node; while (*new) { - struct rmap_item *tree_rmap_item; + struct ksm_rmap_item *tree_rmap_item; struct page *tree_page; int ret; cond_resched(); - tree_rmap_item = rb_entry(*new, struct rmap_item, node); + tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); if (!tree_page) return NULL; @@ -2011,8 +2011,8 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, * rmap_items hanging off a given node of the stable tree, all sharing * the same ksm page. 
*/ -static void stable_tree_append(struct rmap_item *rmap_item, - struct stable_node *stable_node, +static void stable_tree_append(struct ksm_rmap_item *rmap_item, + struct ksm_stable_node *stable_node, bool max_page_sharing_bypass) { /* @@ -2054,12 +2054,12 @@ static void stable_tree_append(struct rmap_item *rmap_item, * @page: the page that we are searching identical page to. * @rmap_item: the reverse mapping into the virtual address of this page */ -static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) +static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; - struct rmap_item *tree_rmap_item; + struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; - struct stable_node *stable_node; + struct ksm_stable_node *stable_node; struct page *kpage; unsigned int checksum; int err; @@ -2215,11 +2215,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) } } -static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, - struct rmap_item **rmap_list, +static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, + struct ksm_rmap_item **rmap_list, unsigned long addr) { - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; while (*rmap_list) { rmap_item = *rmap_list; @@ -2244,12 +2244,12 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, return rmap_item; } -static struct rmap_item *scan_get_next_rmap_item(struct page **page) +static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; - struct mm_slot *slot; + struct ksm_mm_slot *slot; struct vm_area_struct *vma; - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; struct vma_iterator vmi; int nid; @@ -2277,7 +2277,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) * so prune them once before each full scan. 
*/ if (!ksm_merge_across_nodes) { - struct stable_node *stable_node, *next; + struct ksm_stable_node *stable_node, *next; struct page *page; list_for_each_entry_safe(stable_node, next, @@ -2294,7 +2294,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); - slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); + slot = list_entry(slot->mm_list.next, struct ksm_mm_slot, mm_list); ksm_scan.mm_slot = slot; spin_unlock(&ksm_mmlist_lock); /* @@ -2368,7 +2368,7 @@ no_vmas: spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(slot->mm_list.next, - struct mm_slot, mm_list); + struct ksm_mm_slot, mm_list); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock @@ -2414,7 +2414,7 @@ no_vmas: */ static void ksm_do_scan(unsigned int scan_npages) { - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; struct page *page; while (scan_npages-- && likely(!freezing(current))) { @@ -2518,7 +2518,7 @@ EXPORT_SYMBOL_GPL(ksm_madvise); int __ksm_enter(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct ksm_mm_slot *mm_slot; int needs_wakeup; mm_slot = alloc_mm_slot(); @@ -2557,7 +2557,7 @@ int __ksm_enter(struct mm_struct *mm) void __ksm_exit(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct ksm_mm_slot *mm_slot; int easy_to_free = 0; /* @@ -2635,8 +2635,8 @@ struct page *ksm_might_need_to_copy(struct page *page, void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { - struct stable_node *stable_node; - struct rmap_item *rmap_item; + struct ksm_stable_node *stable_node; + struct ksm_rmap_item *rmap_item; int search_new_forks = 0; VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio); @@ -2706,7 +2706,7 @@ again: #ifdef CONFIG_MIGRATION void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) { - struct stable_node *stable_node; + struct ksm_stable_node *stable_node; 
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio); @@ -2739,7 +2739,7 @@ static void wait_while_offlining(void) } } -static bool stable_node_dup_remove_range(struct stable_node *stable_node, +static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn) { @@ -2755,12 +2755,12 @@ static bool stable_node_dup_remove_range(struct stable_node *stable_node, return false; } -static bool stable_node_chain_remove_range(struct stable_node *stable_node, +static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn, struct rb_root *root) { - struct stable_node *dup; + struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { @@ -2784,14 +2784,14 @@ static bool stable_node_chain_remove_range(struct stable_node *stable_node, static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { - struct stable_node *stable_node, *next; + struct ksm_stable_node *stable_node, *next; struct rb_node *node; int nid; for (nid = 0; nid < ksm_nr_node_ids; nid++) { node = rb_first(root_stable_tree + nid); while (node) { - stable_node = rb_entry(node, struct stable_node, node); + stable_node = rb_entry(node, struct ksm_stable_node, node); if (stable_node_chain_remove_range(stable_node, start_pfn, end_pfn, root_stable_tree + -- GitLab From 23f746e412b405fbd6fb9652c0f7c33818713c43 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:49 +0800 Subject: [PATCH 0873/2223] ksm: convert ksm_mm_slot.mm_list to ksm_mm_slot.mm_node In order to use common struct mm_slot, convert ksm_mm_slot.mm_list to ksm_mm_slot.mm_node in advance, no functional change. 
Link: https://lkml.kernel.org/r/20220831031951.43152-6-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/ksm.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index de61946106ce4..f9cd502233f09 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -114,13 +114,13 @@ /** * struct ksm_mm_slot - ksm information per mm that is being scanned * @link: link to the mm_slots hash list - * @mm_list: link into the mm_slots list, rooted in ksm_mm_head + * @mm_node: link into the mm_slots list, rooted in ksm_mm_head * @rmap_list: head for this mm_slot's singly-linked list of rmap_items * @mm: the mm that this information is valid for */ struct ksm_mm_slot { struct hlist_node link; - struct list_head mm_list; + struct list_head mm_node; struct ksm_rmap_item *rmap_list; struct mm_struct *mm; }; @@ -231,7 +231,7 @@ static LIST_HEAD(migrate_nodes); static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct ksm_mm_slot ksm_mm_head = { - .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), + .mm_node = LIST_HEAD_INIT(ksm_mm_head.mm_node), }; static struct ksm_scan ksm_scan = { .mm_slot = &ksm_mm_head, @@ -981,8 +981,8 @@ static int unmerge_and_remove_all_rmap_items(void) int err = 0; spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next, - struct ksm_mm_slot, mm_list); + ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_node.next, + struct ksm_mm_slot, mm_node); spin_unlock(&ksm_mmlist_lock); for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; @@ -1006,11 +1006,11 @@ static int unmerge_and_remove_all_rmap_items(void) mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, - struct ksm_mm_slot, mm_list); + ksm_scan.mm_slot = list_entry(mm_slot->mm_node.next, + struct ksm_mm_slot, 
mm_node); if (ksm_test_exit(mm)) { hash_del(&mm_slot->link); - list_del(&mm_slot->mm_list); + list_del(&mm_slot->mm_node); spin_unlock(&ksm_mmlist_lock); free_mm_slot(mm_slot); @@ -2253,7 +2253,7 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) struct vma_iterator vmi; int nid; - if (list_empty(&ksm_mm_head.mm_list)) + if (list_empty(&ksm_mm_head.mm_node)) return NULL; slot = ksm_scan.mm_slot; @@ -2294,7 +2294,7 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); - slot = list_entry(slot->mm_list.next, struct ksm_mm_slot, mm_list); + slot = list_entry(slot->mm_node.next, struct ksm_mm_slot, mm_node); ksm_scan.mm_slot = slot; spin_unlock(&ksm_mmlist_lock); /* @@ -2367,8 +2367,8 @@ no_vmas: remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(slot->mm_list.next, - struct ksm_mm_slot, mm_list); + ksm_scan.mm_slot = list_entry(slot->mm_node.next, + struct ksm_mm_slot, mm_node); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock @@ -2380,7 +2380,7 @@ no_vmas: * mmap_lock then protects against race with MADV_MERGEABLE). */ hash_del(&slot->link); - list_del(&slot->mm_list); + list_del(&slot->mm_node); spin_unlock(&ksm_mmlist_lock); free_mm_slot(slot); @@ -2429,7 +2429,7 @@ static void ksm_do_scan(unsigned int scan_npages) static int ksmd_should_run(void) { - return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); + return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_node); } static int ksm_scan_thread(void *nothing) @@ -2526,7 +2526,7 @@ int __ksm_enter(struct mm_struct *mm) return -ENOMEM; /* Check ksm_run too? 
Would need tighter locking */ - needs_wakeup = list_empty(&ksm_mm_head.mm_list); + needs_wakeup = list_empty(&ksm_mm_head.mm_node); spin_lock(&ksm_mmlist_lock); insert_to_mm_slots_hash(mm, mm_slot); @@ -2541,9 +2541,9 @@ int __ksm_enter(struct mm_struct *mm) * missed: then we might as well insert at the end of the list. */ if (ksm_run & KSM_RUN_UNMERGE) - list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list); + list_add_tail(&mm_slot->mm_node, &ksm_mm_head.mm_node); else - list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); + list_add_tail(&mm_slot->mm_node, &ksm_scan.mm_slot->mm_node); spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); @@ -2574,11 +2574,11 @@ void __ksm_exit(struct mm_struct *mm) if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { hash_del(&mm_slot->link); - list_del(&mm_slot->mm_list); + list_del(&mm_slot->mm_node); easy_to_free = 1; } else { - list_move(&mm_slot->mm_list, - &ksm_scan.mm_slot->mm_list); + list_move(&mm_slot->mm_node, + &ksm_scan.mm_slot->mm_node); } } spin_unlock(&ksm_mmlist_lock); -- GitLab From 79b09941563737fad52a6b5ce9b9f0e1abf01bec Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:50 +0800 Subject: [PATCH 0874/2223] ksm: convert ksm_mm_slot.link to ksm_mm_slot.hash In order to use common struct mm_slot, convert ksm_mm_slot.link to ksm_mm_slot.hash in advance, no functional change. 
Link: https://lkml.kernel.org/r/20220831031951.43152-7-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/ksm.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index f9cd502233f09..9300e7a48e887 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -113,13 +113,13 @@ /** * struct ksm_mm_slot - ksm information per mm that is being scanned - * @link: link to the mm_slots hash list + * @hash: link to the mm_slots hash list * @mm_node: link into the mm_slots list, rooted in ksm_mm_head * @rmap_list: head for this mm_slot's singly-linked list of rmap_items * @mm: the mm that this information is valid for */ struct ksm_mm_slot { - struct hlist_node link; + struct hlist_node hash; struct list_head mm_node; struct ksm_rmap_item *rmap_list; struct mm_struct *mm; @@ -425,7 +425,7 @@ static struct ksm_mm_slot *get_mm_slot(struct mm_struct *mm) { struct ksm_mm_slot *slot; - hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm) + hash_for_each_possible(mm_slots_hash, slot, hash, (unsigned long)mm) if (slot->mm == mm) return slot; @@ -436,7 +436,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, struct ksm_mm_slot *mm_slot) { mm_slot->mm = mm; - hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); + hash_add(mm_slots_hash, &mm_slot->hash, (unsigned long)mm); } /* @@ -1009,7 +1009,7 @@ static int unmerge_and_remove_all_rmap_items(void) ksm_scan.mm_slot = list_entry(mm_slot->mm_node.next, struct ksm_mm_slot, mm_node); if (ksm_test_exit(mm)) { - hash_del(&mm_slot->link); + hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); spin_unlock(&ksm_mmlist_lock); @@ -2379,7 +2379,7 @@ no_vmas: * or when all VM_MERGEABLE areas have been unmapped (and * mmap_lock then protects against race with MADV_MERGEABLE). 
*/ - hash_del(&slot->link); + hash_del(&slot->hash); list_del(&slot->mm_node); spin_unlock(&ksm_mmlist_lock); @@ -2573,7 +2573,7 @@ void __ksm_exit(struct mm_struct *mm) mm_slot = get_mm_slot(mm); if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { - hash_del(&mm_slot->link); + hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); easy_to_free = 1; } else { -- GitLab From 58730ab6c7cab4e8525b7492ac369ccbfff5093a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:51 +0800 Subject: [PATCH 0875/2223] ksm: convert to use common struct mm_slot Convert to use common struct mm_slot, no functional change. Link: https://lkml.kernel.org/r/20220831031951.43152-8-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/ksm.c | 132 +++++++++++++++++++++++-------------------------------- 1 file changed, 56 insertions(+), 76 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 9300e7a48e887..c3edb5836a441 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -42,6 +42,7 @@ #include #include "internal.h" +#include "mm_slot.h" #ifdef CONFIG_NUMA #define NUMA(x) (x) @@ -113,16 +114,12 @@ /** * struct ksm_mm_slot - ksm information per mm that is being scanned - * @hash: link to the mm_slots hash list - * @mm_node: link into the mm_slots list, rooted in ksm_mm_head + * @slot: hash lookup from mm to mm_slot * @rmap_list: head for this mm_slot's singly-linked list of rmap_items - * @mm: the mm that this information is valid for */ struct ksm_mm_slot { - struct hlist_node hash; - struct list_head mm_node; + struct mm_slot slot; struct ksm_rmap_item *rmap_list; - struct mm_struct *mm; }; /** @@ -231,7 +228,7 @@ static LIST_HEAD(migrate_nodes); static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct ksm_mm_slot ksm_mm_head = { - .mm_node = LIST_HEAD_INIT(ksm_mm_head.mm_node), + .slot.mm_node = 
LIST_HEAD_INIT(ksm_mm_head.slot.mm_node), }; static struct ksm_scan ksm_scan = { .mm_slot = &ksm_mm_head, @@ -409,36 +406,6 @@ static inline void free_stable_node(struct ksm_stable_node *stable_node) kmem_cache_free(stable_node_cache, stable_node); } -static inline struct ksm_mm_slot *alloc_mm_slot(void) -{ - if (!mm_slot_cache) /* initialization failed */ - return NULL; - return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); -} - -static inline void free_mm_slot(struct ksm_mm_slot *mm_slot) -{ - kmem_cache_free(mm_slot_cache, mm_slot); -} - -static struct ksm_mm_slot *get_mm_slot(struct mm_struct *mm) -{ - struct ksm_mm_slot *slot; - - hash_for_each_possible(mm_slots_hash, slot, hash, (unsigned long)mm) - if (slot->mm == mm) - return slot; - - return NULL; -} - -static void insert_to_mm_slots_hash(struct mm_struct *mm, - struct ksm_mm_slot *mm_slot) -{ - mm_slot->mm = mm; - hash_add(mm_slots_hash, &mm_slot->hash, (unsigned long)mm); -} - /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, @@ -976,20 +943,22 @@ static int remove_all_stable_nodes(void) static int unmerge_and_remove_all_rmap_items(void) { struct ksm_mm_slot *mm_slot; + struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int err = 0; spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_node.next, - struct ksm_mm_slot, mm_node); + slot = list_entry(ksm_mm_head.slot.mm_node.next, + struct mm_slot, mm_node); + ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); spin_unlock(&ksm_mmlist_lock); for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { - VMA_ITERATOR(vmi, mm_slot->mm, 0); + VMA_ITERATOR(vmi, mm_slot->slot.mm, 0); - mm = mm_slot->mm; + mm = mm_slot->slot.mm; mmap_read_lock(mm); for_each_vma(vmi, vma) { if (ksm_test_exit(mm)) @@ -1006,14 +975,15 @@ static int unmerge_and_remove_all_rmap_items(void) 
mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(mm_slot->mm_node.next, - struct ksm_mm_slot, mm_node); + slot = list_entry(mm_slot->slot.mm_node.next, + struct mm_slot, mm_node); + ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_test_exit(mm)) { - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&mm_slot->slot.hash); + list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else @@ -2235,7 +2205,7 @@ static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ - rmap_item->mm = mm_slot->mm; + rmap_item->mm = mm_slot->slot.mm; rmap_item->mm->ksm_rmap_items++; rmap_item->address = addr; rmap_item->rmap_list = *rmap_list; @@ -2247,17 +2217,18 @@ static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; - struct ksm_mm_slot *slot; + struct ksm_mm_slot *mm_slot; + struct mm_slot *slot; struct vm_area_struct *vma; struct ksm_rmap_item *rmap_item; struct vma_iterator vmi; int nid; - if (list_empty(&ksm_mm_head.mm_node)) + if (list_empty(&ksm_mm_head.slot.mm_node)) return NULL; - slot = ksm_scan.mm_slot; - if (slot == &ksm_mm_head) { + mm_slot = ksm_scan.mm_slot; + if (mm_slot == &ksm_mm_head) { /* * A number of pages can hang around indefinitely on per-cpu * pagevecs, raised page count preventing write_protect_page @@ -2294,20 +2265,23 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); - slot = list_entry(slot->mm_node.next, struct ksm_mm_slot, mm_node); - ksm_scan.mm_slot = slot; + slot = list_entry(mm_slot->slot.mm_node.next, + struct mm_slot, mm_node); + mm_slot = 
mm_slot_entry(slot, struct ksm_mm_slot, slot); + ksm_scan.mm_slot = mm_slot; spin_unlock(&ksm_mmlist_lock); /* * Although we tested list_empty() above, a racing __ksm_exit * of the last mm on the list may have removed it since then. */ - if (slot == &ksm_mm_head) + if (mm_slot == &ksm_mm_head) return NULL; next_mm: ksm_scan.address = 0; - ksm_scan.rmap_list = &slot->rmap_list; + ksm_scan.rmap_list = &mm_slot->rmap_list; } + slot = &mm_slot->slot; mm = slot->mm; vma_iter_init(&vmi, mm, ksm_scan.address); @@ -2337,7 +2311,7 @@ next_mm: if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); - rmap_item = get_next_rmap_item(slot, + rmap_item = get_next_rmap_item(mm_slot, ksm_scan.rmap_list, ksm_scan.address); if (rmap_item) { ksm_scan.rmap_list = @@ -2358,7 +2332,7 @@ next_page: if (ksm_test_exit(mm)) { no_vmas: ksm_scan.address = 0; - ksm_scan.rmap_list = &slot->rmap_list; + ksm_scan.rmap_list = &mm_slot->rmap_list; } /* * Nuke all the rmap_items that are above this current rmap: @@ -2367,8 +2341,9 @@ no_vmas: remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(slot->mm_node.next, - struct ksm_mm_slot, mm_node); + slot = list_entry(mm_slot->slot.mm_node.next, + struct mm_slot, mm_node); + ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock @@ -2379,11 +2354,11 @@ no_vmas: * or when all VM_MERGEABLE areas have been unmapped (and * mmap_lock then protects against race with MADV_MERGEABLE). 
*/ - hash_del(&slot->hash); - list_del(&slot->mm_node); + hash_del(&mm_slot->slot.hash); + list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); - free_mm_slot(slot); + mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmap_read_unlock(mm); mmdrop(mm); @@ -2400,8 +2375,8 @@ no_vmas: } /* Repeat until we've completed scanning the whole list */ - slot = ksm_scan.mm_slot; - if (slot != &ksm_mm_head) + mm_slot = ksm_scan.mm_slot; + if (mm_slot != &ksm_mm_head) goto next_mm; ksm_scan.seqnr++; @@ -2429,7 +2404,7 @@ static void ksm_do_scan(unsigned int scan_npages) static int ksmd_should_run(void) { - return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_node); + return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node); } static int ksm_scan_thread(void *nothing) @@ -2519,17 +2494,20 @@ EXPORT_SYMBOL_GPL(ksm_madvise); int __ksm_enter(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; + struct mm_slot *slot; int needs_wakeup; - mm_slot = alloc_mm_slot(); + mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return -ENOMEM; + slot = &mm_slot->slot; + /* Check ksm_run too? Would need tighter locking */ - needs_wakeup = list_empty(&ksm_mm_head.mm_node); + needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node); spin_lock(&ksm_mmlist_lock); - insert_to_mm_slots_hash(mm, mm_slot); + mm_slot_insert(mm_slots_hash, mm, slot); /* * When KSM_RUN_MERGE (or KSM_RUN_STOP), * insert just behind the scanning cursor, to let the area settle @@ -2541,9 +2519,9 @@ int __ksm_enter(struct mm_struct *mm) * missed: then we might as well insert at the end of the list. 
*/ if (ksm_run & KSM_RUN_UNMERGE) - list_add_tail(&mm_slot->mm_node, &ksm_mm_head.mm_node); + list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node); else - list_add_tail(&mm_slot->mm_node, &ksm_scan.mm_slot->mm_node); + list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); @@ -2558,6 +2536,7 @@ int __ksm_enter(struct mm_struct *mm) void __ksm_exit(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; + struct mm_slot *slot; int easy_to_free = 0; /* @@ -2570,21 +2549,22 @@ void __ksm_exit(struct mm_struct *mm) */ spin_lock(&ksm_mmlist_lock); - mm_slot = get_mm_slot(mm); + slot = mm_slot_lookup(mm_slots_hash, mm); + mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&slot->hash); + list_del(&slot->mm_node); easy_to_free = 1; } else { - list_move(&mm_slot->mm_node, - &ksm_scan.mm_slot->mm_node); + list_move(&slot->mm_node, + &ksm_scan.mm_slot->slot.mm_node); } } spin_unlock(&ksm_mmlist_lock); if (easy_to_free) { - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else if (mm_slot) { -- GitLab From 49fd9b6df54e610d817f04ab0f94919f5c1a4f66 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:45:57 +0100 Subject: [PATCH 0876/2223] mm/vmscan: fix a lot of comments Patch series "MM folio changes for 6.1", v2. My focus this round has been on shmem. I believe it is now fully converted to folios. Of course, shmem interacts with a lot of the swap cache and other parts of the kernel, so there are patches all over the MM. This patch series survives a round of xfstests on tmpfs, which is nice, but hardly an exhaustive test. Hugh was nice enough to run a round of tests on it and found a bug which is fixed in this edition. 
This patch (of 57): A lot of comments mention pages when they should say folios. Fix them up. [akpm@linux-foundation.org: fixups for mglru additions] Link: https://lkml.kernel.org/r/20220902194653.1739778-1-willy@infradead.org Link: https://lkml.kernel.org/r/20220902194653.1739778-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/vmscan.c | 263 ++++++++++++++++++++++++++-------------------------- 1 file changed, 130 insertions(+), 133 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3ba9423b141de..9ce6cc74d9eae 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -90,7 +90,7 @@ struct scan_control { unsigned long anon_cost; unsigned long file_cost; - /* Can active pages be deactivated as part of reclaim? */ + /* Can active folios be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 unsigned int may_deactivate:2; @@ -100,10 +100,10 @@ struct scan_control { /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; - /* Can mapped pages be reclaimed? */ + /* Can mapped folios be reclaimed? */ unsigned int may_unmap:1; - /* Can pages be swapped as part of reclaim? */ + /* Can folios be swapped as part of reclaim? 
*/ unsigned int may_swap:1; /* Proactive reclaim invoked by userspace through memory.reclaim */ @@ -128,7 +128,7 @@ struct scan_control { /* There is easily reclaimable cold cache in the current node */ unsigned int cache_trim_mode:1; - /* The file pages on the current node are dangerously low */ + /* The file folios on the current node are dangerously low */ unsigned int file_is_tiny:1; /* Always discard instead of demoting to lower tier memory */ @@ -146,7 +146,7 @@ struct scan_control { /* Scan (total_size >> priority) pages at once */ s8 priority; - /* The highest zone to isolate pages for reclaim from */ + /* The highest zone to isolate folios for reclaim from */ s8 reclaim_idx; /* This context's GFP mask */ @@ -454,7 +454,7 @@ static bool cgroup_reclaim(struct scan_control *sc) * * The normal page dirty throttling mechanism in balance_dirty_pages() is * completely broken with the legacy memcg and direct stalling in - * shrink_page_list() is used for throttling instead, which lacks all the + * shrink_folio_list() is used for throttling instead, which lacks all the * niceties such as fairness, adaptive pausing, bandwidth proportional * allocation and configurability. * @@ -575,9 +575,9 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, } /* - * This misses isolated pages which are not accounted for to save counters. + * This misses isolated folios which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is - * not expected that isolated pages will be a dominating factor. + * not expected that isolated folios will be a dominating factor. */ unsigned long zone_reclaimable_pages(struct zone *zone) { @@ -1050,9 +1050,9 @@ void drop_slab(void) static inline int is_page_cache_freeable(struct folio *folio) { /* - * A freeable page cache page is referenced only by the caller - * that isolated the page, the page cache and optional buffer - * heads at page->private. 
+ * A freeable page cache folio is referenced only by the caller + * that isolated the folio, the page cache and optional filesystem + * private data at folio->private. */ return folio_ref_count(folio) - folio_test_private(folio) == 1 + folio_nr_pages(folio); @@ -1092,8 +1092,8 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) return true; /* - * If there are a lot of dirty/writeback pages then do not - * throttle as throttling will occur when the pages cycle + * If there are a lot of dirty/writeback folios then do not + * throttle as throttling will occur when the folios cycle * towards the end of the LRU if still under writeback. */ for (i = 0; i < MAX_NR_ZONES; i++) { @@ -1136,7 +1136,7 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) * short. Failing to make progress or waiting on writeback are * potentially long-lived events so use a longer timeout. This is shaky * logic as a failure to make progress could be due to anything from - * writeback to a slow device to excessive references pages at the tail + * writeback to a slow device to excessive referenced folios at the tail * of the inactive LRU. */ switch(reason) { @@ -1182,8 +1182,8 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) } /* - * Account for pages written if tasks are throttled waiting on dirty - * pages to clean. If enough pages have been cleaned since throttling + * Account for folios written if tasks are throttled waiting on dirty + * folios to clean. If enough folios have been cleaned since throttling * started then wakeup the throttled tasks. 
*/ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, @@ -1209,18 +1209,18 @@ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, /* possible outcome of pageout() */ typedef enum { - /* failed to write page out, page is locked */ + /* failed to write folio out, folio is locked */ PAGE_KEEP, - /* move page to the active list, page is locked */ + /* move folio to the active list, folio is locked */ PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ + /* folio has been sent to the disk successfully, folio is unlocked */ PAGE_SUCCESS, - /* page is clean and locked */ + /* folio is clean and locked */ PAGE_CLEAN, } pageout_t; /* - * pageout is called by shrink_page_list() for each dirty page. + * pageout is called by shrink_folio_list() for each dirty folio. * Calls ->writepage(). */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, @@ -1294,7 +1294,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, } /* - * Same as remove_mapping, but if the page is removed from the mapping, it + * Same as remove_mapping, but if the folio is removed from the mapping, it * gets returned with a refcount of 0. */ static int __remove_mapping(struct address_space *mapping, struct folio *folio, @@ -1310,34 +1310,34 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); /* - * The non racy check for a busy page. + * The non racy check for a busy folio. * * Must be careful with the order of the tests. When someone has - * a ref to the page, it may be possible that they dirty it then - * drop the reference. So if PageDirty is tested before page_count - * here, then the following race may occur: + * a ref to the folio, it may be possible that they dirty it then + * drop the reference. 
So if the dirty flag is tested before the + * refcount here, then the following race may occur: * * get_user_pages(&page); * [user mapping goes away] * write_to(page); - * !PageDirty(page) [good] - * SetPageDirty(page); - * put_page(page); - * !page_count(page) [good, discard it] + * !folio_test_dirty(folio) [good] + * folio_set_dirty(folio); + * folio_put(folio); + * !refcount(folio) [good, discard it] * * [oops, our write_to data is lost] * * Reversing the order of the tests ensures such a situation cannot - * escape unnoticed. The smp_rmb is needed to ensure the page->flags - * load is not satisfied before that of page->_refcount. + * escape unnoticed. The smp_rmb is needed to ensure the folio->flags + * load is not satisfied before that of folio->_refcount. * - * Note that if SetPageDirty is always performed via set_page_dirty, + * Note that if the dirty flag is always set via folio_mark_dirty, * and thus under the i_pages lock, then this ordering is not required. */ refcount = 1 + folio_nr_pages(folio); if (!folio_ref_freeze(folio, refcount)) goto cannot_free; - /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */ + /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */ if (unlikely(folio_test_dirty(folio))) { folio_ref_unfreeze(folio, refcount); goto cannot_free; @@ -1368,7 +1368,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, * back. * * We also don't store shadows for DAX mappings because the - * only page cache pages found in these are zero pages + * only page cache folios found in these are zero pages * covering holes, and because we don't want to mix DAX * exceptional entries and shadow exceptional entries in the * same address_space. 
@@ -1436,14 +1436,14 @@ void folio_putback_lru(struct folio *folio) folio_put(folio); /* drop ref from isolate */ } -enum page_references { - PAGEREF_RECLAIM, - PAGEREF_RECLAIM_CLEAN, - PAGEREF_KEEP, - PAGEREF_ACTIVATE, +enum folio_references { + FOLIOREF_RECLAIM, + FOLIOREF_RECLAIM_CLEAN, + FOLIOREF_KEEP, + FOLIOREF_ACTIVATE, }; -static enum page_references folio_check_references(struct folio *folio, +static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { int referenced_ptes, referenced_folio; @@ -1458,11 +1458,11 @@ static enum page_references folio_check_references(struct folio *folio, * Let the folio, now marked Mlocked, be moved to the unevictable list. */ if (vm_flags & VM_LOCKED) - return PAGEREF_ACTIVATE; + return FOLIOREF_ACTIVATE; /* rmap lock contention: rotate */ if (referenced_ptes == -1) - return PAGEREF_KEEP; + return FOLIOREF_KEEP; if (referenced_ptes) { /* @@ -1482,34 +1482,34 @@ static enum page_references folio_check_references(struct folio *folio, folio_set_referenced(folio); if (referenced_folio || referenced_ptes > 1) - return PAGEREF_ACTIVATE; + return FOLIOREF_ACTIVATE; /* * Activate file-backed executable folios after first usage. */ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) - return PAGEREF_ACTIVATE; + return FOLIOREF_ACTIVATE; - return PAGEREF_KEEP; + return FOLIOREF_KEEP; } /* Reclaim if clean, defer dirty folios to writeback */ if (referenced_folio && folio_is_file_lru(folio)) - return PAGEREF_RECLAIM_CLEAN; + return FOLIOREF_RECLAIM_CLEAN; - return PAGEREF_RECLAIM; + return FOLIOREF_RECLAIM; } -/* Check if a page is dirty or under writeback */ +/* Check if a folio is dirty or under writeback */ static void folio_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback) { struct address_space *mapping; /* - * Anonymous pages are not handled by flushers and must be written + * Anonymous folios are not handled by flushers and must be written * from reclaim context. 
Do not stall reclaim based on them. - * MADV_FREE anonymous pages are put into inactive file list too. + * MADV_FREE anonymous folios are put into inactive file list too. * They could be mistakenly treated as file lru. So further anon * test is needed. */ @@ -1564,11 +1564,10 @@ static struct page *alloc_demote_page(struct page *page, unsigned long private) } /* - * Take pages on @demote_list and attempt to demote them to - * another node. Pages which are not demoted are left on - * @demote_pages. + * Take folios on @demote_folios and attempt to demote them to another node. + * Folios which are not demoted are left on @demote_folios. */ -static unsigned int demote_page_list(struct list_head *demote_pages, +static unsigned int demote_folio_list(struct list_head *demote_folios, struct pglist_data *pgdat) { int target_nid = next_demotion_node(pgdat->node_id); @@ -1587,7 +1586,7 @@ static unsigned int demote_page_list(struct list_head *demote_pages, .nmask = &allowed_mask }; - if (list_empty(demote_pages)) + if (list_empty(demote_folios)) return 0; if (target_nid == NUMA_NO_NODE) @@ -1596,7 +1595,7 @@ static unsigned int demote_page_list(struct list_head *demote_pages, node_get_allowed_targets(pgdat, &allowed_mask); /* Demotion ignores all cpuset and mempolicy settings */ - migrate_pages(demote_pages, alloc_demote_page, NULL, + migrate_pages(demote_folios, alloc_demote_page, NULL, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); @@ -1625,17 +1624,15 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) } /* - * shrink_page_list() returns the number of reclaimed pages + * shrink_folio_list() returns the number of reclaimed pages */ -static unsigned int shrink_page_list(struct list_head *page_list, - struct pglist_data *pgdat, - struct scan_control *sc, - struct reclaim_stat *stat, - bool ignore_references) -{ - LIST_HEAD(ret_pages); - LIST_HEAD(free_pages); - LIST_HEAD(demote_pages); +static unsigned int shrink_folio_list(struct list_head 
*folio_list, + struct pglist_data *pgdat, struct scan_control *sc, + struct reclaim_stat *stat, bool ignore_references) +{ + LIST_HEAD(ret_folios); + LIST_HEAD(free_folios); + LIST_HEAD(demote_folios); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; bool do_demote_pass; @@ -1646,16 +1643,16 @@ static unsigned int shrink_page_list(struct list_head *page_list, do_demote_pass = can_demote(pgdat->node_id, sc); retry: - while (!list_empty(page_list)) { + while (!list_empty(folio_list)) { struct address_space *mapping; struct folio *folio; - enum page_references references = PAGEREF_RECLAIM; + enum folio_references references = FOLIOREF_RECLAIM; bool dirty, writeback; unsigned int nr_pages; cond_resched(); - folio = lru_to_folio(page_list); + folio = lru_to_folio(folio_list); list_del(&folio->lru); if (!folio_trylock(folio)) @@ -1779,7 +1776,7 @@ retry: folio_unlock(folio); folio_wait_writeback(folio); /* then go back and try same folio again */ - list_add_tail(&folio->lru, page_list); + list_add_tail(&folio->lru, folio_list); continue; } } @@ -1788,13 +1785,13 @@ retry: references = folio_check_references(folio, sc); switch (references) { - case PAGEREF_ACTIVATE: + case FOLIOREF_ACTIVATE: goto activate_locked; - case PAGEREF_KEEP: + case FOLIOREF_KEEP: stat->nr_ref_keep += nr_pages; goto keep_locked; - case PAGEREF_RECLAIM: - case PAGEREF_RECLAIM_CLEAN: + case FOLIOREF_RECLAIM: + case FOLIOREF_RECLAIM_CLEAN: ; /* try to reclaim the folio below */ } @@ -1804,7 +1801,7 @@ retry: */ if (do_demote_pass && (thp_migration_supported() || !folio_test_large(folio))) { - list_add(&folio->lru, &demote_pages); + list_add(&folio->lru, &demote_folios); folio_unlock(folio); continue; } @@ -1831,7 +1828,7 @@ retry: */ if (!folio_entire_mapcount(folio) && split_folio_to_list(folio, - page_list)) + folio_list)) goto activate_locked; } if (!add_to_swap(folio)) { @@ -1839,7 +1836,7 @@ retry: goto activate_locked_split; /* Fallback to swap normal pages */ if 
(split_folio_to_list(folio, - page_list)) + folio_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE count_vm_event(THP_SWPOUT_FALLBACK); @@ -1851,7 +1848,7 @@ retry: } else if (folio_test_swapbacked(folio) && folio_test_large(folio)) { /* Split shmem folio */ - if (split_folio_to_list(folio, page_list)) + if (split_folio_to_list(folio, folio_list)) goto keep_locked; } @@ -1916,7 +1913,7 @@ retry: goto activate_locked; } - if (references == PAGEREF_RECLAIM_CLEAN) + if (references == FOLIOREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs(folio, sc->gfp_mask)) goto keep_locked; @@ -2029,13 +2026,13 @@ free_it: nr_reclaimed += nr_pages; /* - * Is there need to periodically free_page_list? It would + * Is there need to periodically free_folio_list? It would * appear not as the counts should be low */ if (unlikely(folio_test_large(folio))) destroy_large_folio(folio); else - list_add(&folio->lru, &free_pages); + list_add(&folio->lru, &free_folios); continue; activate_locked_split: @@ -2063,29 +2060,29 @@ activate_locked: keep_locked: folio_unlock(folio); keep: - list_add(&folio->lru, &ret_pages); + list_add(&folio->lru, &ret_folios); VM_BUG_ON_FOLIO(folio_test_lru(folio) || folio_test_unevictable(folio), folio); } - /* 'page_list' is always empty here */ + /* 'folio_list' is always empty here */ /* Migrate folios selected for demotion */ - nr_reclaimed += demote_page_list(&demote_pages, pgdat); - /* Folios that could not be demoted are still in @demote_pages */ - if (!list_empty(&demote_pages)) { - /* Folios which weren't demoted go back on @page_list for retry: */ - list_splice_init(&demote_pages, page_list); + nr_reclaimed += demote_folio_list(&demote_folios, pgdat); + /* Folios that could not be demoted are still in @demote_folios */ + if (!list_empty(&demote_folios)) { + /* Folios which weren't demoted go back on @folio_list for retry: */ + list_splice_init(&demote_folios, folio_list); do_demote_pass = false; goto retry; } pgactivate = 
stat->nr_activate[0] + stat->nr_activate[1]; - mem_cgroup_uncharge_list(&free_pages); + mem_cgroup_uncharge_list(&free_folios); try_to_unmap_flush(); - free_unref_page_list(&free_pages); + free_unref_page_list(&free_folios); - list_splice(&ret_pages, page_list); + list_splice(&ret_folios, folio_list); count_vm_events(PGACTIVATE, pgactivate); if (plug) @@ -2094,7 +2091,7 @@ keep: } unsigned int reclaim_clean_pages_from_list(struct zone *zone, - struct list_head *folio_list) + struct list_head *folio_list) { struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -2122,7 +2119,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, * change in the future. */ noreclaim_flag = memalloc_noreclaim_save(); - nr_reclaimed = shrink_page_list(&clean_folios, zone->zone_pgdat, &sc, + nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, &stat, true); memalloc_noreclaim_restore(noreclaim_flag); @@ -2181,7 +2178,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, * * returns how many pages were moved onto *@dst. */ -static unsigned long isolate_lru_pages(unsigned long nr_to_scan, +static unsigned long isolate_lru_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, enum lru_list lru) @@ -2288,8 +2285,8 @@ move: * * Context: * - * (1) Must be called with an elevated refcount on the page. This is a - * fundamental difference from isolate_lru_pages() (which is called + * (1) Must be called with an elevated refcount on the folio. This is a + * fundamental difference from isolate_lru_folios() (which is called * without a stable reference). * (2) The lru_lock must not be held. * (3) Interrupts must be enabled. @@ -2361,13 +2358,13 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, } /* - * move_pages_to_lru() moves folios from private @list to appropriate LRU list. 
+ * move_folios_to_lru() moves folios from private @list to appropriate LRU list. * On return, @list is reused as a list of folios to be freed by the caller. * * Returns the number of pages moved to the given lruvec. */ -static unsigned int move_pages_to_lru(struct lruvec *lruvec, - struct list_head *list) +static unsigned int move_folios_to_lru(struct lruvec *lruvec, + struct list_head *list) { int nr_pages, nr_moved = 0; LIST_HEAD(folios_to_free); @@ -2387,7 +2384,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, /* * The folio_set_lru needs to be kept here for list integrity. * Otherwise: - * #0 move_pages_to_lru #1 release_pages + * #0 move_folios_to_lru #1 release_pages * if (!folio_put_testzero()) * if (folio_put_testzero()) * !lru //skip lru_lock @@ -2444,11 +2441,11 @@ static int current_may_throttle(void) * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages */ -static unsigned long -shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, - struct scan_control *sc, enum lru_list lru) +static unsigned long shrink_inactive_list(unsigned long nr_to_scan, + struct lruvec *lruvec, struct scan_control *sc, + enum lru_list lru) { - LIST_HEAD(page_list); + LIST_HEAD(folio_list); unsigned long nr_scanned; unsigned int nr_reclaimed = 0; unsigned long nr_taken; @@ -2475,7 +2472,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&lruvec->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, + nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); @@ -2490,10 +2487,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false); + nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); spin_lock_irq(&lruvec->lru_lock); 
- move_pages_to_lru(lruvec, &page_list); + move_folios_to_lru(lruvec, &folio_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; @@ -2504,16 +2501,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&lruvec->lru_lock); lru_note_cost(lruvec, file, stat.nr_pageout); - mem_cgroup_uncharge_list(&page_list); - free_unref_page_list(&page_list); + mem_cgroup_uncharge_list(&folio_list); + free_unref_page_list(&folio_list); /* - * If dirty pages are scanned that are not queued for IO, it + * If dirty folios are scanned that are not queued for IO, it * implies that flushers are not doing their job. This can - * happen when memory pressure pushes dirty pages to the end of + * happen when memory pressure pushes dirty folios to the end of * the LRU before the dirty limits are breached and the dirty * data has expired. It can also happen when the proportion of - * dirty pages grows not through writes but through memory + * dirty folios grows not through writes but through memory * pressure reclaiming all the clean cache. And in some cases, * the flushers simply cannot keep up with the allocation * rate. Nudge the flusher threads in case they are asleep. 
@@ -2572,7 +2569,7 @@ static void shrink_active_list(unsigned long nr_to_scan, spin_lock_irq(&lruvec->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, + nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); @@ -2632,8 +2629,8 @@ static void shrink_active_list(unsigned long nr_to_scan, */ spin_lock_irq(&lruvec->lru_lock); - nr_activate = move_pages_to_lru(lruvec, &l_active); - nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); + nr_activate = move_folios_to_lru(lruvec, &l_active); + nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); /* Keep all free folios in l_active list */ list_splice(&l_inactive, &l_active); @@ -2649,7 +2646,7 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_deactivate, nr_rotated, sc->priority, file); } -static unsigned int reclaim_page_list(struct list_head *page_list, +static unsigned int reclaim_folio_list(struct list_head *folio_list, struct pglist_data *pgdat) { struct reclaim_stat dummy_stat; @@ -2663,9 +2660,9 @@ static unsigned int reclaim_page_list(struct list_head *page_list, .no_demotion = 1, }; - nr_reclaimed = shrink_page_list(page_list, pgdat, &sc, &dummy_stat, false); - while (!list_empty(page_list)) { - folio = lru_to_folio(page_list); + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false); + while (!list_empty(folio_list)) { + folio = lru_to_folio(folio_list); list_del(&folio->lru); folio_putback_lru(folio); } @@ -2695,11 +2692,11 @@ unsigned long reclaim_pages(struct list_head *folio_list) continue; } - nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid)); + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); nid = folio_nid(lru_to_folio(folio_list)); } while (!list_empty(folio_list)); - nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid)); + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); 
memalloc_noreclaim_restore(noreclaim_flag); @@ -2729,13 +2726,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, * but large enough to avoid thrashing the aggregate readahead window. * * Both inactive lists should also be large enough that each inactive - * page has a chance to be referenced again before it is reclaimed. + * folio has a chance to be referenced again before it is reclaimed. * * If that fails and refaulting is observed, the inactive list grows. * - * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages + * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios * on this LRU, maintained by the pageout code. An inactive_ratio - * of 3 means 3:1 or 25% of the pages are kept on the inactive list. + * of 3 means 3:1 or 25% of the folios are kept on the inactive list. * * total target max * memory ratio inactive @@ -2884,8 +2881,8 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) * Determine how aggressively the anon and file LRU lists should be * scanned. * - * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan - * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan + * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan + * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan */ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) @@ -2900,7 +2897,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long ap, fp; enum lru_list lru; - /* If we have no swap space, do not bother scanning anon pages. */ + /* If we have no swap space, do not bother scanning anon folios. */ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { scan_balance = SCAN_FILE; goto out; @@ -3647,7 +3644,7 @@ static int folio_update_gen(struct folio *folio, int gen) do { /* lru_gen_del_folio() has isolated this page? 
*/ if (!(old_flags & LRU_GEN_MASK)) { - /* for shrink_page_list() */ + /* for shrink_folio_list() */ new_flags = old_flags | BIT(PG_referenced); continue; } @@ -4574,7 +4571,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) } /* - * This function exploits spatial locality when shrink_page_list() walks the + * This function exploits spatial locality when shrink_folio_list() walks the * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If * the scan was done cacheline efficiently, it adds the PMD entry pointing to * the PTE table to the Bloom filter. This forms a feedback loop between the @@ -4795,7 +4792,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca if (!folio_test_referenced(folio)) set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); - /* for shrink_page_list() */ + /* for shrink_folio_list() */ folio_clear_reclaim(folio); folio_clear_referenced(folio); @@ -4998,7 +4995,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap if (list_empty(&list)) return scanned; - reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); + reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); list_for_each_entry(folio, &list, lru) { /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ @@ -5015,7 +5012,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap spin_lock_irq(&lruvec->lru_lock); - move_pages_to_lru(lruvec, &list); + move_folios_to_lru(lruvec, &list); walk = current->reclaim_state->mm_walk; if (walk && walk->batched) -- GitLab From 379708ffde1b049bc41084e0a0572c44c8a1d2c4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:45:58 +0100 Subject: [PATCH 0877/2223] mm: add the first tail page to struct folio Some of the static checkers get confused by extracting the page from the folio and referring to fields in the first tail page. 
Adding these fields to struct folio lets us avoid doing that. It has the risk that people will refer to those fields without checking that the folio is actually a large folio, so prefix them with underscores and document the preferred function to use instead. Link: https://lkml.kernel.org/r/20220902194653.1739778-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8f30f262431c9..5c87d0f292a23 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -245,6 +245,13 @@ struct page { * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. + * @_flags_1: For large folios, additional page flags. + * @__head: Points to the folio. Do not use. + * @_folio_dtor: Which destructor to use for this folio. + * @_folio_order: Do not use directly, call folio_order(). + * @_total_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). + * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). * * A folio is a physically, virtually and logically contiguous set * of bytes. 
It is a power-of-two in size, and it is aligned to that @@ -283,9 +290,17 @@ struct folio { }; struct page page; }; + unsigned long _flags_1; + unsigned long __head; + unsigned char _folio_dtor; + unsigned char _folio_order; + atomic_t _total_mapcount; + atomic_t _pincount; +#ifdef CONFIG_64BIT + unsigned int _folio_nr_pages; +#endif }; -static_assert(sizeof(struct page) == sizeof(struct folio)); #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl)) FOLIO_MATCH(flags, flags); @@ -300,6 +315,19 @@ FOLIO_MATCH(_refcount, _refcount); FOLIO_MATCH(memcg_data, memcg_data); #endif #undef FOLIO_MATCH +#define FOLIO_MATCH(pg, fl) \ + static_assert(offsetof(struct folio, fl) == \ + offsetof(struct page, pg) + sizeof(struct page)) +FOLIO_MATCH(flags, _flags_1); +FOLIO_MATCH(compound_head, __head); +FOLIO_MATCH(compound_dtor, _folio_dtor); +FOLIO_MATCH(compound_order, _folio_order); +FOLIO_MATCH(compound_mapcount, _total_mapcount); +FOLIO_MATCH(compound_pincount, _pincount); +#ifdef CONFIG_64BIT +FOLIO_MATCH(compound_nr, _folio_nr_pages); +#endif +#undef FOLIO_MATCH static inline atomic_t *folio_mapcount_ptr(struct folio *folio) { -- GitLab From c3a15bff46cb5149aeae4c8ae69443d791fa6578 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:45:59 +0100 Subject: [PATCH 0878/2223] mm: reimplement folio_order() and folio_nr_pages() Instead of calling compound_order() and compound_nr_pages(), use the folio directly. Saves 1905 bytes from mm/filemap.o due to folio_test_large() now being a cheaper check than PageHead(). 
Link: https://lkml.kernel.org/r/20220902194653.1739778-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e56dd8f7eae19..a37c8a29c49ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -729,7 +729,9 @@ static inline unsigned int compound_order(struct page *page) */ static inline unsigned int folio_order(struct folio *folio) { - return compound_order(&folio->page); + if (!folio_test_large(folio)) + return 0; + return folio->_folio_order; } #include @@ -1659,7 +1661,13 @@ static inline void set_page_links(struct page *page, enum zone_type zone, */ static inline long folio_nr_pages(struct folio *folio) { - return compound_nr(&folio->page); + if (!folio_test_large(folio)) + return 1; +#ifdef CONFIG_64BIT + return folio->_folio_nr_pages; +#else + return 1L << folio->_folio_order; +#endif } /** -- GitLab From d788f5b374c2ba204fed57e39acf2452acc24812 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:00 +0100 Subject: [PATCH 0879/2223] mm: add split_folio() This wrapper removes a need to use split_huge_page(&folio->page). Convert two callers. 
Link: https://lkml.kernel.org/r/20220902194653.1739778-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 5 +++++ mm/shmem.c | 2 +- mm/truncate.c | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 38265f9f782e9..a1341fdcf666d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -444,6 +444,11 @@ static inline int split_folio_to_list(struct folio *folio, return split_huge_page_to_list(&folio->page, list); } +static inline int split_folio(struct folio *folio) +{ + return split_folio_to_list(folio, NULL); +} + /* * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to * limitations in the implementation like arm64 MTE can override this to diff --git a/mm/shmem.c b/mm/shmem.c index 42e5888bf84d8..674bde8b30850 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -629,7 +629,7 @@ next: goto move_back; } - ret = split_huge_page(&folio->page); + ret = split_folio(folio); folio_unlock(folio); folio_put(folio); diff --git a/mm/truncate.c b/mm/truncate.c index 0b0708bf935f3..c0be77e5c0083 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -240,7 +240,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) folio_invalidate(folio, offset, length); if (!folio_test_large(folio)) return true; - if (split_huge_page(&folio->page) == 0) + if (split_folio(folio) == 0) return true; if (folio_test_dirty(folio)) return false; -- GitLab From 681ecf6301786cb06942b57f0ef7103b07ae6813 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:01 +0100 Subject: [PATCH 0880/2223] mm: add folio_add_lru_vma() Convert lru_cache_add_inactive_or_unevictable() to folio_add_lru_vma() and add a compatibility wrapper. 
Link: https://lkml.kernel.org/r/20220902194653.1739778-6-willy@infradead.org Signed-off-by: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- include/linux/swap.h | 10 +++++----- mm/folio-compat.c | 6 ++++++ mm/swap.c | 19 +++++++++---------- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 6308150b234a4..2ede1e3695d9b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -379,11 +379,11 @@ extern unsigned long totalreserve_pages; /* linux/mm/swap.c */ -extern void lru_note_cost(struct lruvec *lruvec, bool file, - unsigned int nr_pages); -extern void lru_note_cost_folio(struct folio *); -extern void folio_add_lru(struct folio *); -extern void lru_cache_add(struct page *); +void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages); +void lru_note_cost_folio(struct folio *); +void folio_add_lru(struct folio *); +void folio_add_lru_vma(struct folio *, struct vm_area_struct *); +void lru_cache_add(struct page *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 458618c7302c3..e1e23b4947d73 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -88,6 +88,12 @@ void lru_cache_add(struct page *page) } EXPORT_SYMBOL(lru_cache_add); +void lru_cache_add_inactive_or_unevictable(struct page *page, + struct vm_area_struct *vma) +{ + folio_add_lru_vma(page_folio(page), vma); +} + int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp) { diff --git a/mm/swap.c b/mm/swap.c index 0a3871a70952f..955930f41d20c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -537,22 +537,21 @@ void folio_add_lru(struct folio *folio) EXPORT_SYMBOL(folio_add_lru); /** - * lru_cache_add_inactive_or_unevictable - * @page: the page to be added to LRU - * @vma: vma in which page is mapped for determining reclaimability + * folio_add_lru_vma() - Add a folio to the appropriate LRU 
list for this VMA. + * @folio: The folio to be added to the LRU. + * @vma: VMA in which the folio is mapped. * - * Place @page on the inactive or unevictable LRU list, depending on its - * evictability. + * If the VMA is mlocked, @folio is added to the unevictable list. + * Otherwise, it is treated the same way as folio_add_lru(). */ -void lru_cache_add_inactive_or_unevictable(struct page *page, - struct vm_area_struct *vma) +void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) { - VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) - mlock_new_page(page); + mlock_new_page(&folio->page); else - lru_cache_add(page); + folio_add_lru(folio); } /* -- GitLab From f530ed0e2d01aafc4d0e3cf8ab6b64bbdb7696a7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:02 +0100 Subject: [PATCH 0881/2223] shmem: convert shmem_writepage() to use a folio throughout Even though we will split any large folio that comes in, write the code to handle large folios so as to not leave a trap for whoever tries to handle large folios in the swap cache. Link: https://lkml.kernel.org/r/20220902194653.1739778-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 674bde8b30850..3d2d35728793b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1328,17 +1328,18 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, * and its shmem_writeback() needs them to be split when swapping. 
*/ - if (PageTransCompound(page)) { + if (folio_test_large(folio)) { /* Ensure the subpages are still dirty */ - SetPageDirty(page); + folio_test_set_dirty(folio); if (split_huge_page(page) < 0) goto redirty; - ClearPageDirty(page); + folio = page_folio(page); + folio_clear_dirty(folio); } - BUG_ON(!PageLocked(page)); - mapping = page->mapping; - index = page->index; + BUG_ON(!folio_test_locked(folio)); + mapping = folio->mapping; + index = folio->index; inode = mapping->host; info = SHMEM_I(inode); if (info->flags & VM_LOCKED) @@ -1361,15 +1362,15 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC * value into swapfile.c, the only way we can correctly account for a - * fallocated page arriving here is now to initialize it and write it. + * fallocated folio arriving here is now to initialize it and write it. * - * That's okay for a page already fallocated earlier, but if we have + * That's okay for a folio already fallocated earlier, but if we have * not yet completed the fallocation, then (a) we want to keep track - * of this page in case we have to undo it, and (b) it may not be a + * of this folio in case we have to undo it, and (b) it may not be a * good idea to continue anyway, once we're pushing into swap. So - * reactivate the page, and let shmem_fallocate() quit when too many. + * reactivate the folio, and let shmem_fallocate() quit when too many. 
*/ - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { if (inode->i_private) { struct shmem_falloc *shmem_falloc; spin_lock(&inode->i_lock); @@ -1385,9 +1386,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (shmem_falloc) goto redirty; } - clear_highpage(page); - flush_dcache_page(page); - SetPageUptodate(page); + folio_zero_range(folio, 0, folio_size(folio)); + flush_dcache_folio(folio); + folio_mark_uptodate(folio); } swap = folio_alloc_swap(folio); @@ -1396,7 +1397,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) /* * Add inode to shmem_unuse()'s list of swapped-out inodes, - * if it's not already there. Do it now before the page is + * if it's not already there. Do it now before the folio is * moved to swap cache, when its pagelock no longer protects * the inode from eviction. But don't unlock the mutex until * we've incremented swapped, because shmem_unuse_inode() will @@ -1406,7 +1407,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); - if (add_to_swap_cache(page, swap, + if (add_to_swap_cache(&folio->page, swap, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, NULL) == 0) { spin_lock_irq(&info->lock); @@ -1415,21 +1416,21 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_unlock_irq(&info->lock); swap_shmem_alloc(swap); - shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); + shmem_delete_from_page_cache(&folio->page, swp_to_radix_entry(swap)); mutex_unlock(&shmem_swaplist_mutex); - BUG_ON(page_mapped(page)); - swap_writepage(page, wbc); + BUG_ON(folio_mapped(folio)); + swap_writepage(&folio->page, wbc); return 0; } mutex_unlock(&shmem_swaplist_mutex); - put_swap_page(page, swap); + put_swap_page(&folio->page, swap); redirty: - set_page_dirty(page); + folio_mark_dirty(folio); if (wbc->for_reclaim) - return AOP_WRITEPAGE_ACTIVATE; 
/* Return with page locked */ - unlock_page(page); + return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ + folio_unlock(folio); return 0; } -- GitLab From 4cd400fd1f55dde1fa430a706828042daed94c43 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:03 +0100 Subject: [PATCH 0882/2223] shmem: convert shmem_delete_from_page_cache() to take a folio Remove the assertion that the page is not Compound as this function now handles large folios correctly. Link: https://lkml.kernel.org/r/20220902194653.1739778-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 3d2d35728793b..9e851fc876016 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -763,23 +763,22 @@ error: } /* - * Like delete_from_page_cache, but substitutes swap for page. + * Like delete_from_page_cache, but substitutes swap for @folio. 
*/ -static void shmem_delete_from_page_cache(struct page *page, void *radswap) +static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; + long nr = folio_nr_pages(folio); int error; - VM_BUG_ON_PAGE(PageCompound(page), page); - xa_lock_irq(&mapping->i_pages); - error = shmem_replace_entry(mapping, page->index, page, radswap); - page->mapping = NULL; - mapping->nrpages--; - __dec_lruvec_page_state(page, NR_FILE_PAGES); - __dec_lruvec_page_state(page, NR_SHMEM); + error = shmem_replace_entry(mapping, folio->index, folio, radswap); + folio->mapping = NULL; + mapping->nrpages -= nr; + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); xa_unlock_irq(&mapping->i_pages); - put_page(page); + folio_put(folio); BUG_ON(error); } @@ -1416,7 +1415,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_unlock_irq(&info->lock); swap_shmem_alloc(swap); - shmem_delete_from_page_cache(&folio->page, swp_to_radix_entry(swap)); + shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); mutex_unlock(&shmem_swaplist_mutex); BUG_ON(folio_mapped(folio)); -- GitLab From 907ea17eb2b436f07332c935476d77893abae735 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:04 +0100 Subject: [PATCH 0883/2223] shmem: convert shmem_replace_page() to use folios throughout Introduce folio_set_swap_entry() to abstract how both folio->private and swp_entry_t work. Use swap_address_space() directly instead of indirecting through folio_mapping(). Include an assertion that the old folio is not large as we only allocate a single-page folio to replace it. Use folio_put_refs() instead of calling folio_put() twice. 
Link: https://lkml.kernel.org/r/20220902194653.1739778-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 ++++ mm/shmem.c | 67 +++++++++++++++++++++----------------------- 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 2ede1e3695d9b..61e13d1a4caba 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -355,6 +355,11 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio) return entry; } +static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) +{ + folio->private = (void *)entry.val; +} + /* linux/mm/workingset.c */ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); diff --git a/mm/shmem.c b/mm/shmem.c index 9e851fc876016..4113f1b9d4a82 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1560,12 +1560,6 @@ static struct folio *shmem_alloc_folio(gfp_t gfp, return folio; } -static struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return &shmem_alloc_folio(gfp, info, index)->page; -} - static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, pgoff_t index, bool huge) { @@ -1617,51 +1611,49 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct page *oldpage, *newpage; struct folio *old, *new; struct address_space *swap_mapping; swp_entry_t entry; pgoff_t swap_index; int error; - oldpage = *pagep; - entry.val = page_private(oldpage); + old = page_folio(*pagep); + entry = folio_swap_entry(old); swap_index = swp_offset(entry); - swap_mapping = page_mapping(oldpage); + swap_mapping = swap_address_space(entry); /* * We have arrived here because our zones are constrained, so don't * limit chance 
of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - newpage = shmem_alloc_page(gfp, info, index); - if (!newpage) + VM_BUG_ON_FOLIO(folio_test_large(old), old); + new = shmem_alloc_folio(gfp, info, index); + if (!new) return -ENOMEM; - get_page(newpage); - copy_highpage(newpage, oldpage); - flush_dcache_page(newpage); + folio_get(new); + folio_copy(new, old); + flush_dcache_folio(new); - __SetPageLocked(newpage); - __SetPageSwapBacked(newpage); - SetPageUptodate(newpage); - set_page_private(newpage, entry.val); - SetPageSwapCache(newpage); + __folio_set_locked(new); + __folio_set_swapbacked(new); + folio_mark_uptodate(new); + folio_set_swap_entry(new, entry); + folio_set_swapcache(new); /* * Our caller will very soon move newpage out of swapcache, but it's * a nice clean interface for us to replace oldpage by newpage there. */ xa_lock_irq(&swap_mapping->i_pages); - error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); + error = shmem_replace_entry(swap_mapping, swap_index, old, new); if (!error) { - old = page_folio(oldpage); - new = page_folio(newpage); mem_cgroup_migrate(old, new); - __inc_lruvec_page_state(newpage, NR_FILE_PAGES); - __inc_lruvec_page_state(newpage, NR_SHMEM); - __dec_lruvec_page_state(oldpage, NR_FILE_PAGES); - __dec_lruvec_page_state(oldpage, NR_SHMEM); + __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); + __lruvec_stat_mod_folio(new, NR_SHMEM, 1); + __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); + __lruvec_stat_mod_folio(old, NR_SHMEM, -1); } xa_unlock_irq(&swap_mapping->i_pages); @@ -1671,18 +1663,17 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * both PageSwapCache and page_private after getting page lock; * but be defensive. Reverse old to newpage for clear and free. 
*/ - oldpage = newpage; + old = new; } else { - lru_cache_add(newpage); - *pagep = newpage; + folio_add_lru(new); + *pagep = &new->page; } - ClearPageSwapCache(oldpage); - set_page_private(oldpage, 0); + folio_clear_swapcache(old); + old->private = NULL; - unlock_page(oldpage); - put_page(oldpage); - put_page(oldpage); + folio_unlock(old); + folio_put_refs(old, 2); return error; } @@ -2383,6 +2374,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, } #ifdef CONFIG_USERFAULTFD +static struct page *shmem_alloc_page(gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) +{ + return &shmem_alloc_folio(gfp, info, index)->page; +} + int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, -- GitLab From 14d01ee9fcb901c9e020f2dcd71c500f10c3bd03 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:05 +0100 Subject: [PATCH 0884/2223] mm/swapfile: remove page_swapcount() By restructuring folio_swapped(), it can use swap_swapcount() instead of page_swapcount(). It's even a little more efficient. Link: https://lkml.kernel.org/r/20220902194653.1739778-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 46 +++++++++++++--------------------------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 469d9af86be2f..e0aaeac5c829d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1431,30 +1431,6 @@ void swapcache_free_entries(swp_entry_t *entries, int n) spin_unlock(&p->lock); } -/* - * How many references to page are currently swapped out? - * This does not give an exact answer when swap count is continued, - * but does include the high COUNT_CONTINUED flag to allow for that. 
- */ -static int page_swapcount(struct page *page) -{ - int count = 0; - struct swap_info_struct *p; - struct swap_cluster_info *ci; - swp_entry_t entry; - unsigned long offset; - - entry.val = page_private(page); - p = _swap_info_get(entry); - if (p) { - offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(p, offset); - count = swap_count(p->swap_map[offset]); - unlock_cluster_or_swap_info(p, ci); - } - return count; -} - int __swap_count(swp_entry_t entry) { struct swap_info_struct *si; @@ -1469,11 +1445,16 @@ int __swap_count(swp_entry_t entry) return count; } +/* + * How many references to @entry are currently swapped out? + * This does not give an exact answer when swap count is continued, + * but does include the high COUNT_CONTINUED flag to allow for that. + */ static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { - int count = 0; pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; + int count; ci = lock_cluster_or_swap_info(si, offset); count = swap_count(si->swap_map[offset]); @@ -1574,17 +1555,16 @@ unlock_out: static bool folio_swapped(struct folio *folio) { - swp_entry_t entry; - struct swap_info_struct *si; + swp_entry_t entry = folio_swap_entry(folio); + struct swap_info_struct *si = _swap_info_get(entry); + + if (!si) + return false; if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) - return page_swapcount(&folio->page) != 0; + return swap_swapcount(si, entry) != 0; - entry = folio_swap_entry(folio); - si = _swap_info_get(entry); - if (si) - return swap_page_trans_huge_swapped(si, entry); - return false; + return swap_page_trans_huge_swapped(si, entry); } /* -- GitLab From bdb0ed54a4768dc3c2613d4c45f94c887d43cd7a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:06 +0100 Subject: [PATCH 0885/2223] mm/swapfile: convert try_to_free_swap() to folio_free_swap() Add kernel-doc for folio_free_swap() and make it return bool. 
Add a try_to_free_swap() compatibility wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 ++++++ mm/folio-compat.c | 7 +++++++ mm/swapfile.c | 32 ++++++++++++++++++-------------- mm/vmscan.c | 2 +- 4 files changed, 32 insertions(+), 15 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 61e13d1a4caba..dac6308d878e9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -490,6 +490,7 @@ static inline long get_nr_swap_pages(void) extern void si_swapinfo(struct sysinfo *); swp_entry_t folio_alloc_swap(struct folio *folio); +bool folio_free_swap(struct folio *folio); extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); @@ -606,6 +607,11 @@ static inline swp_entry_t folio_alloc_swap(struct folio *folio) return entry; } +static inline bool folio_free_swap(struct folio *folio) +{ + return false; +} + static inline int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) diff --git a/mm/folio-compat.c b/mm/folio-compat.c index e1e23b4947d73..06d47f00609b5 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -146,3 +146,10 @@ void putback_lru_page(struct page *page) { folio_putback_lru(page_folio(page)); } + +#ifdef CONFIG_SWAP +int try_to_free_swap(struct page *page) +{ + return folio_free_swap(page_folio(page)); +} +#endif diff --git a/mm/swapfile.c b/mm/swapfile.c index e0aaeac5c829d..f2a446799a393 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1567,43 +1567,47 @@ static bool folio_swapped(struct folio *folio) return swap_page_trans_huge_swapped(si, entry); } -/* - * If swap is getting full, or if there are no more mappings of this page, - * then try_to_free_swap is called to free its swap space. 
+/** + * folio_free_swap() - Free the swap space used for this folio. + * @folio: The folio to remove. + * + * If swap is getting full, or if there are no more mappings of this folio, + * then call folio_free_swap to free its swap space. + * + * Return: true if we were able to release the swap space. */ -int try_to_free_swap(struct page *page) +bool folio_free_swap(struct folio *folio) { - struct folio *folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!folio_test_swapcache(folio)) - return 0; + return false; if (folio_test_writeback(folio)) - return 0; + return false; if (folio_swapped(folio)) - return 0; + return false; /* * Once hibernation has begun to create its image of memory, - * there's a danger that one of the calls to try_to_free_swap() + * there's a danger that one of the calls to folio_free_swap() * - most probably a call from __try_to_reclaim_swap() while * hibernation is allocating its own swap pages for the image, * but conceivably even a call from memory reclaim - will free - * the swap from a page which has already been recorded in the - * image as a clean swapcache page, and then reuse its swap for + * the swap from a folio which has already been recorded in the + * image as a clean swapcache folio, and then reuse its swap for * another page of the image. On waking from hibernation, the - * original page might be freed under memory pressure, then + * original folio might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * * Hibernation suspends storage while it is writing the image * to disk so check that here. 
*/ if (pm_suspended_storage()) - return 0; + return false; delete_from_swap_cache(folio); folio_set_dirty(folio); - return 1; + return true; } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 9ce6cc74d9eae..9268e64590e4d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2049,7 +2049,7 @@ activate_locked: if (folio_test_swapcache(folio) && (mem_cgroup_swap_full(&folio->page) || folio_test_mlocked(folio))) - try_to_free_swap(&folio->page); + folio_free_swap(folio); VM_BUG_ON_FOLIO(folio_test_active(folio), folio); if (!folio_test_mlocked(folio)) { int type = folio_is_file_lru(folio); -- GitLab From a0d3374b070776e985bbd7b165b178fa688bf37a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:07 +0100 Subject: [PATCH 0886/2223] mm/swap: convert __read_swap_cache_async() to use a folio Remove a few hidden (and one visible) calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap_state.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 41afa6d45b239..b1e181fc52687 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -411,7 +411,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, bool *new_page_allocated) { struct swap_info_struct *si; - struct page *page; + struct folio *folio; void *shadow = NULL; *new_page_allocated = false; @@ -426,11 +426,11 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, si = get_swap_device(entry); if (!si) return NULL; - page = find_get_page(swap_address_space(entry), - swp_offset(entry)); + folio = filemap_get_folio(swap_address_space(entry), + swp_offset(entry)); put_swap_device(si); - if (page) - return page; + if (folio) + return folio_file_page(folio, swp_offset(entry)); /* * Just skip read ahead for unused swap slot. 
@@ -448,8 +448,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. */ - page = alloc_page_vma(gfp_mask, vma, addr); - if (!page) + folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false); + if (!folio) return NULL; /* @@ -459,7 +459,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (!err) break; - put_page(page); + folio_put(folio); if (err != -EEXIST) return NULL; @@ -477,30 +477,30 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * The swap entry is ours to swap in. Prepare the new page. */ - __SetPageLocked(page); - __SetPageSwapBacked(page); + __folio_set_locked(folio); + __folio_set_swapbacked(folio); - if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry)) + if (mem_cgroup_swapin_charge_page(&folio->page, NULL, gfp_mask, entry)) goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. 
*/ - if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) + if (add_to_swap_cache(&folio->page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; mem_cgroup_swapin_uncharge_swap(entry); if (shadow) - workingset_refault(page_folio(page), shadow); + workingset_refault(folio, shadow); - /* Caller will initiate read into locked page */ - lru_cache_add(page); + /* Caller will initiate read into locked folio */ + folio_add_lru(folio); *new_page_allocated = true; - return page; + return &folio->page; fail_unlock: - put_swap_page(page, entry); - unlock_page(page); - put_page(page); + put_swap_page(&folio->page, entry); + folio_unlock(folio); + folio_put(folio); return NULL; } -- GitLab From a4c366f01f10073e0220656561b875627ff7cd90 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:08 +0100 Subject: [PATCH 0887/2223] mm/swap: convert add_to_swap_cache() to take a folio With all callers using folios, we can convert add_to_swap_cache() to take a folio and use it throughout. 
Link: https://lkml.kernel.org/r/20220902194653.1739778-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 2 +- mm/swap.h | 4 ++-- mm/swap_state.c | 34 +++++++++++++++++----------------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 4113f1b9d4a82..ced76c229b960 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1406,7 +1406,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); - if (add_to_swap_cache(&folio->page, swap, + if (add_to_swap_cache(folio, swap, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, NULL) == 0) { spin_lock_irq(&info->lock); diff --git a/mm/swap.h b/mm/swap.h index 0ffa5b478051a..29e38f3d82d03 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -32,7 +32,7 @@ extern struct address_space *swapper_spaces[]; void show_swap_cache_info(void); bool add_to_swap(struct folio *folio); void *get_shadow_from_swap_cache(swp_entry_t entry); -int add_to_swap_cache(struct page *page, swp_entry_t entry, +int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp); void __delete_from_swap_cache(struct folio *folio, swp_entry_t entry, void *shadow); @@ -122,7 +122,7 @@ static inline void *get_shadow_from_swap_cache(swp_entry_t entry) return NULL; } -static inline int add_to_swap_cache(struct page *page, swp_entry_t entry, +static inline int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp_mask, void **shadowp) { return -1; diff --git a/mm/swap_state.c b/mm/swap_state.c index b1e181fc52687..ecf1accc2fb18 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -85,21 +85,21 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) * add_to_swap_cache resembles filemap_add_folio on swapper_space, * but sets SwapCache flag and private instead of mapping and index. 
*/ -int add_to_swap_cache(struct page *page, swp_entry_t entry, +int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); - XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); - unsigned long i, nr = thp_nr_pages(page); + XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); + unsigned long i, nr = folio_nr_pages(folio); void *old; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageSwapCache(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); - page_ref_add(page, nr); - SetPageSwapCache(page); + folio_ref_add(folio, nr); + folio_set_swapcache(folio); do { xas_lock_irq(&xas); @@ -107,19 +107,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, if (xas_error(&xas)) goto unlock; for (i = 0; i < nr; i++) { - VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); + VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); old = xas_load(&xas); if (xa_is_value(old)) { if (shadowp) *shadowp = old; } - set_page_private(page + i, entry.val + i); - xas_store(&xas, page); + set_page_private(folio_page(folio, i), entry.val + i); + xas_store(&xas, folio); xas_next(&xas); } address_space->nrpages += nr; - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); - __mod_lruvec_page_state(page, NR_SWAPCACHE, nr); + __node_stat_mod_folio(folio, NR_FILE_PAGES, nr); + __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); @@ -127,8 +127,8 @@ unlock: if (!xas_error(&xas)) return 0; - ClearPageSwapCache(page); - page_ref_sub(page, nr); + folio_clear_swapcache(folio); + folio_ref_sub(folio, nr); return xas_error(&xas); } @@ -194,7 +194,7 @@ bool add_to_swap(struct folio *folio) /* 
* Add it to the swap cache. */ - err = add_to_swap_cache(&folio->page, entry, + err = add_to_swap_cache(folio, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL); if (err) /* @@ -484,7 +484,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(&folio->page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) + if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; mem_cgroup_swapin_uncharge_swap(entry); -- GitLab From 4081f7446d95a9d3ced12dc04ff02c187a761e90 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:09 +0100 Subject: [PATCH 0888/2223] mm/swap: convert put_swap_page() to put_swap_folio() With all callers now using a folio, we can convert this function. Link: https://lkml.kernel.org/r/20220902194653.1739778-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- mm/shmem.c | 2 +- mm/swap_slots.c | 2 +- mm/swap_state.c | 6 +++--- mm/swapfile.c | 4 ++-- mm/vmscan.c | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index dac6308d878e9..42cbef554de68 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -491,7 +491,7 @@ static inline long get_nr_swap_pages(void) extern void si_swapinfo(struct sysinfo *); swp_entry_t folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); -extern void put_swap_page(struct page *page, swp_entry_t entry); +void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); extern int add_swap_count_continuation(swp_entry_t, gfp_t); @@ -576,7 +576,7 @@ static inline void swap_free(swp_entry_t swp) { } -static inline void put_swap_page(struct page *page, 
swp_entry_t swp) +static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) { } diff --git a/mm/shmem.c b/mm/shmem.c index ced76c229b960..56cabf9bb947b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1424,7 +1424,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } mutex_unlock(&shmem_swaplist_mutex); - put_swap_page(&folio->page, swap); + put_swap_folio(folio, swap); redirty: folio_mark_dirty(folio); if (wbc->for_reclaim) diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 10b94d64cc257..0bec1f705f8e0 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -343,7 +343,7 @@ repeat: get_swap_pages(1, &entry, 1); out: if (mem_cgroup_try_charge_swap(folio, entry)) { - put_swap_page(&folio->page, entry); + put_swap_folio(folio, entry); entry.val = 0; } return entry; diff --git a/mm/swap_state.c b/mm/swap_state.c index ecf1accc2fb18..ea354efd37356 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -218,7 +218,7 @@ bool add_to_swap(struct folio *folio) return true; fail: - put_swap_page(&folio->page, entry); + put_swap_folio(folio, entry); return false; } @@ -237,7 +237,7 @@ void delete_from_swap_cache(struct folio *folio) __delete_from_swap_cache(folio, entry, NULL); xa_unlock_irq(&address_space->i_pages); - put_swap_page(&folio->page, entry); + put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); } @@ -498,7 +498,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return &folio->page; fail_unlock: - put_swap_page(&folio->page, entry); + put_swap_folio(folio, entry); folio_unlock(folio); folio_put(folio); return NULL; diff --git a/mm/swapfile.c b/mm/swapfile.c index f2a446799a393..aafe739dc2a62 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1332,7 +1332,7 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. 
*/ -void put_swap_page(struct page *page, swp_entry_t entry) +void put_swap_folio(struct folio *folio, swp_entry_t entry) { unsigned long offset = swp_offset(entry); unsigned long idx = offset / SWAPFILE_CLUSTER; @@ -1341,7 +1341,7 @@ void put_swap_page(struct page *page, swp_entry_t entry) unsigned char *map; unsigned int i, free_entries = 0; unsigned char val; - int size = swap_entry_size(thp_nr_pages(page)); + int size = swap_entry_size(folio_nr_pages(folio)); si = _swap_info_get(entry); if (!si) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9268e64590e4d..1707e3bfcfe42 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1352,7 +1352,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, mem_cgroup_swapout(folio, swap); __delete_from_swap_cache(folio, swap, shadow); xa_unlock_irq(&mapping->i_pages); - put_swap_page(&folio->page, swap); + put_swap_folio(folio, swap); } else { void (*free_folio)(struct folio *); -- GitLab From 63ad4add3823051aeb1fcd1ba981f6efd07086bf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:10 +0100 Subject: [PATCH 0889/2223] mm: convert do_swap_page() to use a folio Removes quite a lot of calls to compound_head(). 
Link: https://lkml.kernel.org/r/20220902194653.1739778-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 57 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index e49faa0a1f9a6..04f54abdf9d2d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3724,6 +3724,7 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct folio *folio; struct page *page = NULL, *swapcache; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; @@ -3768,19 +3769,23 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = lookup_swap_cache(entry, vma, vmf->address); swapcache = page; + if (page) + folio = page_folio(page); if (!page) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { /* skip swapcache */ - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - vmf->address); - if (page) { - __SetPageLocked(page); - __SetPageSwapBacked(page); + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, + vma, vmf->address, false); + page = &folio->page; + if (folio) { + __folio_set_locked(folio); + __folio_set_swapbacked(folio); if (mem_cgroup_swapin_charge_page(page, - vma->vm_mm, GFP_KERNEL, entry)) { + vma->vm_mm, GFP_KERNEL, + entry)) { ret = VM_FAULT_OOM; goto out_page; } @@ -3788,20 +3793,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) shadow = get_shadow_from_swap_cache(entry); if (shadow) - workingset_refault(page_folio(page), - shadow); + workingset_refault(folio, shadow); - lru_cache_add(page); + folio_add_lru(folio); /* To provide entry to swap_readpage() */ - set_page_private(page, entry.val); + folio_set_swap_entry(folio, entry); swap_readpage(page, true, NULL); - set_page_private(page, 0); + folio->private = NULL; } } else { page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); swapcache = page; + if 
(page) + folio = page_folio(page); } if (!page) { @@ -3844,7 +3850,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * swapcache, we need to check that the page's swap has not * changed. */ - if (unlikely(!PageSwapCache(page) || + if (unlikely(!folio_test_swapcache(folio) || page_private(page) != entry.val)) goto out_page; @@ -3859,6 +3865,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = swapcache; goto out_page; } + folio = page_folio(page); /* * If we want to map a page that's in the swapcache writable, we @@ -3867,7 +3874,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * pagevecs if required. */ if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache && - !PageKsm(page) && !PageLRU(page)) + !folio_test_ksm(folio) && !folio_test_lru(folio)) lru_add_drain(); } @@ -3881,7 +3888,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) goto out_nomap; - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { ret = VM_FAULT_SIGBUS; goto out_nomap; } @@ -3894,14 +3901,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * check after taking the PT lock and making sure that nobody * concurrently faulted in this page and set PG_anon_exclusive. */ - BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); - BUG_ON(PageAnon(page) && PageAnonExclusive(page)); + BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio)); + BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page)); /* * Check under PT lock (to protect against concurrent fork() sharing * the swap entry concurrently) for certainly exclusive pages. */ - if (!PageKsm(page)) { + if (!folio_test_ksm(folio)) { /* * Note that pte_swp_exclusive() == false for architectures * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE. @@ -3913,7 +3920,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * swapcache -> certainly exclusive. 
*/ exclusive = true; - } else if (exclusive && PageWriteback(page) && + } else if (exclusive && folio_test_writeback(folio) && data_race(si->flags & SWP_STABLE_WRITES)) { /* * This is tricky: not all swap backends support @@ -3956,7 +3963,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * exposing them to the swapcache or because the swap entry indicates * exclusivity. */ - if (!PageKsm(page) && (exclusive || page_count(page) == 1)) { + if (!folio_test_ksm(folio) && + (exclusive || folio_ref_count(folio) == 1)) { if (vmf->flags & FAULT_FLAG_WRITE) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; @@ -3976,16 +3984,17 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address); - lru_cache_add_inactive_or_unevictable(page, vma); + folio_add_lru_vma(folio, vma); } else { page_add_anon_rmap(page, vma, vmf->address, rmap_flags); } - VM_BUG_ON(!PageAnon(page) || (pte_write(pte) && !PageAnonExclusive(page))); + VM_BUG_ON(!folio_test_anon(folio) || + (pte_write(pte) && !PageAnonExclusive(page))); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); - unlock_page(page); + folio_unlock(folio); if (page != swapcache && swapcache) { /* * Hold the lock to avoid the swap entry to be reused @@ -4017,9 +4026,9 @@ out: out_nomap: pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: - unlock_page(page); + folio_unlock(folio); out_release: - put_page(page); + folio_put(folio); if (page != swapcache && swapcache) { unlock_page(swapcache); put_page(swapcache); -- GitLab From d4f9565ae598bd6b6ffbd8b4dfbf97a9e339da2d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:11 +0100 Subject: [PATCH 0890/2223] mm: convert do_swap_page()'s swapcache variable to a folio The 'swapcache' variable is used to track whether the page is from the 
swapcache or not. It can do this equally well by being the folio of the page rather than the page itself, and this saves a number of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 04f54abdf9d2d..1e114438f6064 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3724,8 +3724,8 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct folio *folio; - struct page *page = NULL, *swapcache; + struct folio *swapcache, *folio = NULL; + struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; bool exclusive = false; @@ -3768,11 +3768,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out; page = lookup_swap_cache(entry, vma, vmf->address); - swapcache = page; if (page) folio = page_folio(page); + swapcache = folio; - if (!page) { + if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { /* skip swapcache */ @@ -3805,12 +3805,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } else { page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); - swapcache = page; if (page) folio = page_folio(page); + swapcache = folio; } - if (!page) { + if (!folio) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. @@ -3862,7 +3862,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = ksm_might_need_to_copy(page, vma, vmf->address); if (unlikely(!page)) { ret = VM_FAULT_OOM; - page = swapcache; goto out_page; } folio = page_folio(page); @@ -3873,7 +3872,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * owner. Try removing the extra reference from the local LRU * pagevecs if required. 
*/ - if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache && + if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache && !folio_test_ksm(folio) && !folio_test_lru(folio)) lru_add_drain(); } @@ -3914,7 +3913,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE. */ exclusive = pte_swp_exclusive(vmf->orig_pte); - if (page != swapcache) { + if (folio != swapcache) { /* * We have a fresh page that is not exposed to the * swapcache -> certainly exclusive. @@ -3982,7 +3981,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->orig_pte = pte; /* ksm created a completely new copy */ - if (unlikely(page != swapcache && swapcache)) { + if (unlikely(folio != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address); folio_add_lru_vma(folio, vma); } else { @@ -3995,7 +3994,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); folio_unlock(folio); - if (page != swapcache && swapcache) { + if (folio != swapcache && swapcache) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check @@ -4004,8 +4003,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * so that the swap count won't change under a * parallel locked swapcache. 
*/ - unlock_page(swapcache); - put_page(swapcache); + folio_unlock(swapcache); + folio_put(swapcache); } if (vmf->flags & FAULT_FLAG_WRITE) { @@ -4029,9 +4028,9 @@ out_page: folio_unlock(folio); out_release: folio_put(folio); - if (page != swapcache && swapcache) { - unlock_page(swapcache); - put_page(swapcache); + if (folio != swapcache && swapcache) { + folio_unlock(swapcache); + folio_put(swapcache); } if (si) put_swap_device(si); -- GitLab From 6599591816f522c1cc8ec4eb5cea75738963756a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:12 +0100 Subject: [PATCH 0891/2223] memcg: convert mem_cgroup_swapin_charge_page() to mem_cgroup_swapin_charge_folio() All callers now have a folio, so pass it in here and remove an unnecessary call to page_folio(). Link: https://lkml.kernel.org/r/20220902194653.1739778-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- mm/memcontrol.c | 13 ++++++------- mm/memory.c | 2 +- mm/swap_state.c | 2 +- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 60545e4a1c034..ca0df42662ad1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -688,7 +688,7 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, return __mem_cgroup_charge(folio, mm, gfp); } -int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, +int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); @@ -1254,7 +1254,7 @@ static inline int mem_cgroup_charge(struct folio *folio, return 0; } -static inline int mem_cgroup_swapin_charge_page(struct page *page, +static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { return 0; diff --git a/mm/memcontrol.c 
b/mm/memcontrol.c index e804056422db0..621b4472c4094 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6844,21 +6844,20 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) } /** - * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin - * @page: page to charge + * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. + * @folio: folio to charge. * @mm: mm context of the victim * @gfp: reclaim mode - * @entry: swap entry for which the page is allocated + * @entry: swap entry for which the folio is allocated * - * This function charges a page allocated for swapin. Please call this before - * adding the page to the swapcache. + * This function charges a folio allocated for swapin. Please call this before + * adding the folio to the swapcache. * * Returns 0 on success. Otherwise, an error code is returned. */ -int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, +int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { - struct folio *folio = page_folio(page); struct mem_cgroup *memcg; unsigned short id; int ret; diff --git a/mm/memory.c b/mm/memory.c index 1e114438f6064..b36b177e0ea91 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3783,7 +3783,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) __folio_set_locked(folio); __folio_set_swapbacked(folio); - if (mem_cgroup_swapin_charge_page(page, + if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, GFP_KERNEL, entry)) { ret = VM_FAULT_OOM; diff --git a/mm/swap_state.c b/mm/swap_state.c index ea354efd37356..a7e0438902dd1 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -480,7 +480,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, __folio_set_locked(folio); __folio_set_swapbacked(folio); - if (mem_cgroup_swapin_charge_page(&folio->page, NULL, gfp_mask, entry)) + if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry)) goto 
fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ -- GitLab From 7a7256d5f512b6c17957df7f59cf5e281b3ddba3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:13 +0100 Subject: [PATCH 0892/2223] shmem: convert shmem_mfill_atomic_pte() to use a folio Assert that this is a single-page folio as there are several assumptions in here that it's exactly PAGE_SIZE bytes large. Saves several calls to compound_head() and removes the last caller of shmem_alloc_page(). Link: https://lkml.kernel.org/r/20220902194653.1739778-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 56cabf9bb947b..8754e2b4800a1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2374,12 +2374,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, } #ifdef CONFIG_USERFAULTFD -static struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return &shmem_alloc_folio(gfp, info, index)->page; -} - int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, @@ -2395,7 +2389,6 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); void *page_kaddr; struct folio *folio; - struct page *page; int ret; pgoff_t max_off; @@ -2414,53 +2407,53 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!*pagep) { ret = -ENOMEM; - page = shmem_alloc_page(gfp, info, pgoff); - if (!page) + folio = shmem_alloc_folio(gfp, info, pgoff); + if (!folio) goto out_unacct_blocks; if (!zeropage) { /* COPY */ - page_kaddr = kmap_atomic(page); + page_kaddr = kmap_local_folio(folio, 0); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, PAGE_SIZE); - kunmap_atomic(page_kaddr); + kunmap_local(page_kaddr); /* fallback to 
copy_from_user outside mmap_lock */ if (unlikely(ret)) { - *pagep = page; + *pagep = &folio->page; ret = -ENOENT; /* don't free the page */ goto out_unacct_blocks; } - flush_dcache_page(page); + flush_dcache_folio(folio); } else { /* ZEROPAGE */ - clear_user_highpage(page, dst_addr); + clear_user_highpage(&folio->page, dst_addr); } } else { - page = *pagep; + folio = page_folio(*pagep); + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); *pagep = NULL; } - VM_BUG_ON(PageLocked(page)); - VM_BUG_ON(PageSwapBacked(page)); - __SetPageLocked(page); - __SetPageSwapBacked(page); - __SetPageUptodate(page); + VM_BUG_ON(folio_test_locked(folio)); + VM_BUG_ON(folio_test_swapbacked(folio)); + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + __folio_mark_uptodate(folio); ret = -EFAULT; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(pgoff >= max_off)) goto out_release; - folio = page_folio(page); ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp & GFP_RECLAIM_MASK, dst_mm); if (ret) goto out_release; ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - page, true, wp_copy); + &folio->page, true, wp_copy); if (ret) goto out_delete_from_cache; @@ -2470,13 +2463,13 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); - unlock_page(page); + folio_unlock(folio); return 0; out_delete_from_cache: - delete_from_page_cache(page); + filemap_remove_folio(folio); out_release: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out_unacct_blocks: shmem_inode_unacct_blocks(inode, 1); return ret; -- GitLab From 0d698e257241436e01182508d93fc290987eb37d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:14 +0100 Subject: [PATCH 0893/2223] shmem: convert shmem_replace_page() to shmem_replace_folio() The caller has a folio, so convert the calling convention and rename the function. 
Link: https://lkml.kernel.org/r/20220902194653.1739778-19-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 8754e2b4800a1..2bb6f5cfdc111 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1608,7 +1608,7 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) return folio_zonenum(folio) > gfp_zone(gfp); } -static int shmem_replace_page(struct page **pagep, gfp_t gfp, +static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct folio *old, *new; @@ -1617,7 +1617,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, pgoff_t swap_index; int error; - old = page_folio(*pagep); + old = *foliop; entry = folio_swap_entry(old); swap_index = swp_offset(entry); swap_mapping = swap_address_space(entry); @@ -1666,7 +1666,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, old = new; } else { folio_add_lru(new); - *pagep = &new->page; + *foliop = new; } folio_clear_swapcache(old); @@ -1772,8 +1772,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, arch_swap_restore(swap, folio); if (shmem_should_replace_folio(folio, gfp)) { - error = shmem_replace_page(&page, gfp, info, index); - folio = page_folio(page); + error = shmem_replace_folio(&folio, gfp, info, index); if (error) goto failed; } -- GitLab From c9edc242811d4c4b939b283f4f40b89f9c5b3b5a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:15 +0100 Subject: [PATCH 0894/2223] swap: add swap_cache_get_folio() Convert lookup_swap_cache() into swap_cache_get_folio() and add a lookup_swap_cache() wrapper around it. 
[akpm@linux-foundation.org: add CONFIG_SWAP=n stub for swap_cache_get_folio()] Link: https://lkml.kernel.org/r/20220902194653.1739778-20-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.h | 8 ++++++++ mm/swap_state.c | 32 +++++++++++++++++++++----------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index 29e38f3d82d03..ccd8d9a9ad36d 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -39,6 +39,8 @@ void __delete_from_swap_cache(struct folio *folio, void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); +struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr); struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); @@ -99,6 +101,12 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc) return 0; } +static inline struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr) +{ + return NULL; +} + static inline struct page *lookup_swap_cache(swp_entry_t swp, struct vm_area_struct *vma, unsigned long addr) diff --git a/mm/swap_state.c b/mm/swap_state.c index a7e0438902dd1..b96bf4ec8b5b1 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -317,24 +317,24 @@ static inline bool swap_use_vma_readahead(void) } /* - * Lookup a swap entry in the swap cache. A found page will be returned + * Lookup a swap entry in the swap cache. A found folio will be returned * unlocked and with its refcount incremented - we rely on the kernel - * lock getting page table operations atomic even if we drop the page + * lock getting page table operations atomic even if we drop the folio * lock before returning. 
*/ -struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, - unsigned long addr) +struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr) { - struct page *page; + struct folio *folio; struct swap_info_struct *si; si = get_swap_device(entry); if (!si) return NULL; - page = find_get_page(swap_address_space(entry), swp_offset(entry)); + folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); put_swap_device(si); - if (page) { + if (folio) { bool vma_ra = swap_use_vma_readahead(); bool readahead; @@ -342,10 +342,10 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, * At the moment, we don't support PG_readahead for anon THP * so let's bail out rather than confusing the readahead stat. */ - if (unlikely(PageTransCompound(page))) - return page; + if (unlikely(folio_test_large(folio))) + return folio; - readahead = TestClearPageReadahead(page); + readahead = folio_test_clear_readahead(folio); if (vma && vma_ra) { unsigned long ra_val; int win, hits; @@ -366,7 +366,17 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, } } - return page; + return folio; +} + +struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, + unsigned long addr) +{ + struct folio *folio = swap_cache_get_folio(entry, vma, addr); + + if (!folio) + return NULL; + return folio_file_page(folio, swp_offset(entry)); } /** -- GitLab From 5739a81cf89f2bbbfff691439b8fcdf3c8d33f5d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:16 +0100 Subject: [PATCH 0895/2223] shmem: eliminate struct page from shmem_swapin_folio() Convert shmem_swapin() to return a folio and use swap_cache_get_folio(), removing all uses of struct page in this function. 
Link: https://lkml.kernel.org/r/20220902194653.1739778-21-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 2bb6f5cfdc111..b685acd9f1495 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1486,7 +1486,7 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) mpol_cond_put(vma->vm_policy); } -static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, +static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; @@ -1499,7 +1499,9 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, page = swap_cluster_readahead(swap, gfp, &vmf); shmem_pseudo_vma_destroy(&pvma); - return page; + if (!page) + return NULL; + return page_folio(page); } /* @@ -1721,7 +1723,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; - struct page *page; struct folio *folio = NULL; swp_entry_t swap; int error; @@ -1734,8 +1735,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, return -EIO; /* Look it up and read it in.. */ - page = lookup_swap_cache(swap, NULL, 0); - if (!page) { + folio = swap_cache_get_folio(swap, NULL, 0); + if (!folio) { /* Or update major stats only when swapin succeeds?? 
*/ if (fault_type) { *fault_type |= VM_FAULT_MAJOR; @@ -1743,13 +1744,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_memcg_event_mm(charge_mm, PGMAJFAULT); } /* Here we actually start the io */ - page = shmem_swapin(swap, gfp, info, index); - if (!page) { + folio = shmem_swapin(swap, gfp, info, index); + if (!folio) { error = -ENOMEM; goto failed; } } - folio = page_folio(page); /* We have to do this with folio locked to prevent races */ folio_lock(folio); -- GitLab From fc26babbc7d45a98607918d336744269bc59d7b5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:17 +0100 Subject: [PATCH 0896/2223] shmem: convert shmem_getpage_gfp() to shmem_get_folio_gfp() Add a shmem_getpage_gfp() wrapper for compatibility with current users. Link: https://lkml.kernel.org/r/20220902194653.1739778-22-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 70 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index b685acd9f1495..89536091928f2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -139,17 +139,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type); -static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, - struct vm_fault *vmf, vm_fault_t *fault_type); - -int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp) -{ - return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); -} static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -1595,7 +1584,7 @@ failed: /* * When a page is moved from swapcache to shmem filecache (either by the - * usual swapin of 
shmem_getpage_gfp(), or by the less common swapoff of + * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of * shmem_unuse_inode()), it may have been read in earlier from swap, in * ignorance of the mapping it belongs to. If that mapping has special * constraints (like the gma500 GEM driver, which requires RAM below 4GB), @@ -1812,7 +1801,7 @@ unlock: } /* - * shmem_getpage_gfp - find page in cache, or get from swap, or allocate + * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap @@ -1821,10 +1810,10 @@ unlock: * vma, vmf, and fault_type are only supplied by shmem_fault: * otherwise they are NULL. */ -static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t gfp, - struct vm_area_struct *vma, struct vm_fault *vmf, - vm_fault_t *fault_type) +static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, + struct folio **foliop, enum sgp_type sgp, gfp_t gfp, + struct vm_area_struct *vma, struct vm_fault *vmf, + vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); @@ -1864,7 +1853,7 @@ repeat: if (error == -EEXIST) goto repeat; - *pagep = &folio->page; + *foliop = folio; return error; } @@ -1874,7 +1863,7 @@ repeat: folio_mark_accessed(folio); if (folio_test_uptodate(folio)) goto out; - /* fallocated page */ + /* fallocated folio */ if (sgp != SGP_READ) goto clear; folio_unlock(folio); @@ -1882,10 +1871,10 @@ repeat: } /* - * SGP_READ: succeed on hole, with NULL page, letting caller zero. - * SGP_NOALLOC: fail on hole, with NULL page, letting caller fail. + * SGP_READ: succeed on hole, with NULL folio, letting caller zero. + * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. 
*/ - *pagep = NULL; + *foliop = NULL; if (sgp == SGP_READ) return 0; if (sgp == SGP_NOALLOC) @@ -1918,7 +1907,7 @@ alloc_nohuge: if (error != -ENOSPC) goto unlock; /* - * Try to reclaim some space by splitting a huge page + * Try to reclaim some space by splitting a large folio * beyond i_size on the filesystem. */ while (retry--) { @@ -1954,9 +1943,9 @@ alloc_nohuge: if (folio_test_pmd_mappable(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < - hindex + HPAGE_PMD_NR - 1) { + folio_next_index(folio) - 1) { /* - * Part of the huge page is beyond i_size: subject + * Part of the large folio is beyond i_size: subject * to shrink under memory pressure. */ spin_lock(&sbinfo->shrinklist_lock); @@ -1973,14 +1962,14 @@ alloc_nohuge: } /* - * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. */ if (sgp == SGP_FALLOC) sgp = SGP_WRITE; clear: /* - * Let SGP_WRITE caller clear ends if write does not fill page; - * but SGP_FALLOC on a page fallocated earlier must initialize + * Let SGP_WRITE caller clear ends if write does not fill folio; + * but SGP_FALLOC on a folio fallocated earlier must initialize * it now, lest undo on failure cancel our earlier guarantee. 
*/ if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { @@ -2006,7 +1995,7 @@ clear: goto unlock; } out: - *pagep = folio_page(folio, index - hindex); + *foliop = folio; return 0; /* @@ -2036,6 +2025,29 @@ unlock: return error; } +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, + gfp_t gfp, struct vm_area_struct *vma, + struct vm_fault *vmf, vm_fault_t *fault_type) +{ + struct folio *folio = NULL; + int ret = shmem_get_folio_gfp(inode, index, &folio, sgp, gfp, vma, + vmf, fault_type); + + if (folio) + *pagep = folio_file_page(folio, index); + else + *pagep = NULL; + return ret; +} + +int shmem_getpage(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp) +{ + return shmem_getpage_gfp(inode, index, pagep, sgp, + mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); +} + /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the -- GitLab From 68a541001a31856fb99614861de1c03109d2ea4d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:18 +0100 Subject: [PATCH 0897/2223] shmem: convert shmem_fault() to use shmem_get_folio_gfp() No particular advantage for this function, but necessary to remove shmem_getpage_gfp(). 
[hughd@google.com: fix crash] Link: https://lkml.kernel.org/r/7693a84-bdc2-27b5-2695-d0fe8566571f@google.com Link: https://lkml.kernel.org/r/20220902194653.1739778-23-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 89536091928f2..154432dc847b7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2065,6 +2065,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); + struct folio *folio = NULL; int err; vm_fault_t ret = VM_FAULT_LOCKED; @@ -2127,10 +2128,12 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) spin_unlock(&inode->i_lock); } - err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, + err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, gfp, vma, vmf, &ret); if (err) return vmf_error(err); + if (folio) + vmf->page = folio_file_page(folio, vmf->pgoff); return ret; } -- GitLab From a3a9c39704f4fec403ef173e62e069558b7eb85a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:19 +0100 Subject: [PATCH 0898/2223] shmem: convert shmem_read_mapping_page_gfp() to use shmem_get_folio_gfp() Saves a couple of calls to compound_head(). 
Link: https://lkml.kernel.org/r/20220902194653.1739778-24-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 154432dc847b7..c3e2a65a65fc2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4270,18 +4270,20 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, { #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; + struct folio *folio; struct page *page; int error; BUG_ON(!shmem_mapping(mapping)); - error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, + error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, gfp, NULL, NULL, NULL); if (error) return ERR_PTR(error); - unlock_page(page); + folio_unlock(folio); + page = folio_file_page(folio, index); if (PageHWPoison(page)) { - put_page(page); + folio_put(folio); return ERR_PTR(-EIO); } -- GitLab From 4e1fc793ad9892cec67b40c9f67583160e08f695 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:20 +0100 Subject: [PATCH 0899/2223] shmem: add shmem_get_folio() With no remaining callers of shmem_getpage_gfp(), add shmem_get_folio() and reimplement shmem_getpage() as a call to shmem_get_folio(). 
Link: https://lkml.kernel.org/r/20220902194653.1739778-25-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 2 ++ mm/shmem.c | 23 ++++++++++------------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index ff0b990de83d4..f4bd50b08a915 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -113,6 +113,8 @@ enum sgp_type { extern int shmem_getpage(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp); +int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, + enum sgp_type sgp); static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) diff --git a/mm/shmem.c b/mm/shmem.c index c3e2a65a65fc2..32afc8039e660 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2025,14 +2025,18 @@ unlock: return error; } -static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, - struct vm_fault *vmf, vm_fault_t *fault_type) +int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, + enum sgp_type sgp) +{ + return shmem_get_folio_gfp(inode, index, foliop, sgp, + mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); +} + +int shmem_getpage(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp) { struct folio *folio = NULL; - int ret = shmem_get_folio_gfp(inode, index, &folio, sgp, gfp, vma, - vmf, fault_type); + int ret = shmem_get_folio(inode, index, &folio, sgp); if (folio) *pagep = folio_file_page(folio, index); @@ -2041,13 +2045,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return ret; } -int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp) -{ - return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); -} - 
/* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the -- GitLab From a7f5862cc0624ca6b21da5a634ff232dc65776b5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:21 +0100 Subject: [PATCH 0900/2223] shmem: convert shmem_get_partial_folio() to use shmem_get_folio() Get rid of an unnecessary folio->page->folio conversion. Link: https://lkml.kernel.org/r/20220902194653.1739778-26-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 32afc8039e660..772a30593fcca 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -874,10 +874,9 @@ void shmem_unlock_mapping(struct address_space *mapping) static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) { struct folio *folio; - struct page *page; /* - * At first avoid shmem_getpage(,,,SGP_READ): that fails + * At first avoid shmem_get_folio(,,,SGP_READ): that fails * beyond i_size, and reports fallocated pages as holes. */ folio = __filemap_get_folio(inode->i_mapping, index, @@ -888,9 +887,9 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) * But read a page back from swap if any of it is within i_size * (although in some cases this is just a waste of time). */ - page = NULL; - shmem_getpage(inode, index, &page, SGP_READ); - return page ? page_folio(page) : NULL; + folio = NULL; + shmem_get_folio(inode, index, &folio, SGP_READ); + return folio; } /* -- GitLab From eff1f906c2dcd83ce7cbd38d2b853d2c49027f39 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:22 +0100 Subject: [PATCH 0901/2223] shmem: convert shmem_write_begin() to use shmem_get_folio() Use a folio throughout this function, saving a couple of calls to compound_head(). 
Link: https://lkml.kernel.org/r/20220902194653.1739778-27-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 772a30593fcca..c69b53602a1d8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2498,6 +2498,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; + struct folio *folio; int ret = 0; /* i_rwsem is held by caller */ @@ -2509,14 +2510,15 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - ret = shmem_getpage(inode, index, pagep, SGP_WRITE); + ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); if (ret) return ret; + *pagep = folio_file_page(folio, index); if (PageHWPoison(*pagep)) { - unlock_page(*pagep); - put_page(*pagep); + folio_unlock(folio); + folio_put(folio); *pagep = NULL; return -EIO; } -- GitLab From 4601e2fc8b57840660ce1a1ee98aea873fa15eee Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:23 +0100 Subject: [PATCH 0902/2223] shmem: convert shmem_file_read_iter() to use shmem_get_folio() Use a folio throughout, saving five calls to compound_head(). 
Link: https://lkml.kernel.org/r/20220902194653.1739778-28-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index c69b53602a1d8..0f81193128470 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2577,6 +2577,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) offset = *ppos & ~PAGE_MASK; for (;;) { + struct folio *folio = NULL; struct page *page = NULL; pgoff_t end_index; unsigned long nr, ret; @@ -2591,17 +2592,18 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) break; } - error = shmem_getpage(inode, index, &page, SGP_READ); + error = shmem_get_folio(inode, index, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; break; } - if (page) { - unlock_page(page); + if (folio) { + folio_unlock(folio); + page = folio_file_page(folio, index); if (PageHWPoison(page)) { - put_page(page); + folio_put(folio); error = -EIO; break; } @@ -2617,14 +2619,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (index == end_index) { nr = i_size & ~PAGE_MASK; if (nr <= offset) { - if (page) - put_page(page); + if (folio) + folio_put(folio); break; } } nr -= offset; - if (page) { + if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing @@ -2636,13 +2638,13 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * Mark the page accessed if we read the beginning. */ if (!offset) - mark_page_accessed(page); + folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... 
*/ ret = copy_page_to_iter(page, offset, nr, to); - put_page(page); + folio_put(folio); } else if (user_backed_iter(to)) { /* -- GitLab From b0802b22a97581608df3d2db2e705fe599777b18 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:24 +0100 Subject: [PATCH 0903/2223] shmem: convert shmem_fallocate() to use a folio Call shmem_get_folio() and use the folio APIs instead of the page APIs. Saves several calls to compound_head() and removes assumptions about the size of a large folio. Link: https://lkml.kernel.org/r/20220902194653.1739778-29-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 0f81193128470..c2016a7cfc296 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2787,7 +2787,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, info->fallocend = end; for (index = start; index < end; ) { - struct page *page; + struct folio *folio; /* * Good, the fallocate(2) manpage permits EINTR: we may have @@ -2798,10 +2798,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else - error = shmem_getpage(inode, index, &page, SGP_FALLOC); + error = shmem_get_folio(inode, index, &folio, + SGP_FALLOC); if (error) { info->fallocend = undo_fallocend; - /* Remove the !PageUptodate pages we added */ + /* Remove the !uptodate folios we added */ if (index > start) { shmem_undo_range(inode, (loff_t)start << PAGE_SHIFT, @@ -2810,37 +2811,34 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, goto undone; } - index++; /* * Here is a more important optimization than it appears: - * a second SGP_FALLOC on the same huge page will clear it, - * making it PageUptodate and un-undoable if we fail later. 
+ * a second SGP_FALLOC on the same large folio will clear it, + * making it uptodate and un-undoable if we fail later. */ - if (PageTransCompound(page)) { - index = round_up(index, HPAGE_PMD_NR); - /* Beware 32-bit wraparound */ - if (!index) - index--; - } + index = folio_next_index(folio); + /* Beware 32-bit wraparound */ + if (!index) + index--; /* * Inform shmem_writepage() how far we have reached. * No need for lock or barrier: we have the page lock. */ - if (!PageUptodate(page)) + if (!folio_test_uptodate(folio)) shmem_falloc.nr_falloced += index - shmem_falloc.next; shmem_falloc.next = index; /* - * If !PageUptodate, leave it that way so that freeable pages + * If !uptodate, leave it that way so that freeable folios * can be recognized if we need to rollback on error later. - * But set_page_dirty so that memory pressure will swap rather - * than free the pages we are allocating (and SGP_CACHE pages + * But mark it dirty so that memory pressure will swap rather + * than free the folios we are allocating (and SGP_CACHE folios * might still be clean: we now need to mark those dirty too). */ - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); cond_resched(); } -- GitLab From 7ad0414bded6e8678840368be5cc72b9957a4478 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:25 +0100 Subject: [PATCH 0904/2223] shmem: convert shmem_symlink() to use a folio While symlinks will always be < PAGE_SIZE, using the folio APIs gets rid of unnecessary calls to compound_head(). 
Link: https://lkml.kernel.org/r/20220902194653.1739778-30-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index c2016a7cfc296..4948ceffcc9fd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3093,7 +3093,7 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, int error; int len; struct inode *inode; - struct page *page; + struct folio *folio; len = strlen(symname) + 1; if (len > PAGE_SIZE) @@ -3121,18 +3121,18 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, inode->i_op = &shmem_short_symlink_operations; } else { inode_nohighmem(inode); - error = shmem_getpage(inode, 0, &page, SGP_WRITE); + error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); if (error) { iput(inode); return error; } inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; - memcpy(page_address(page), symname, len); - SetPageUptodate(page); - set_page_dirty(page); - unlock_page(page); - put_page(page); + memcpy(folio_address(folio), symname, len); + folio_mark_uptodate(folio); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); } dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); -- GitLab From e4b57722d0e6be8820039a7d506378640aee5073 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:26 +0100 Subject: [PATCH 0905/2223] shmem: convert shmem_get_link() to use a folio Symlinks will never use a large folio, but using the folio API removes a lot of unnecessary folio->page->folio conversions. 
Link: https://lkml.kernel.org/r/20220902194653.1739778-31-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 4948ceffcc9fd..e6e934adeed7f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3143,40 +3143,41 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, static void shmem_put_link(void *arg) { - mark_page_accessed(arg); - put_page(arg); + folio_mark_accessed(arg); + folio_put(arg); } static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct page *page = NULL; + struct folio *folio = NULL; int error; + if (!dentry) { - page = find_get_page(inode->i_mapping, 0); - if (!page) + folio = filemap_get_folio(inode->i_mapping, 0); + if (!folio) return ERR_PTR(-ECHILD); - if (PageHWPoison(page) || - !PageUptodate(page)) { - put_page(page); + if (PageHWPoison(&folio->page) || + !folio_test_uptodate(folio)) { + folio_put(folio); return ERR_PTR(-ECHILD); } } else { - error = shmem_getpage(inode, 0, &page, SGP_READ); + error = shmem_get_folio(inode, 0, &folio, SGP_READ); if (error) return ERR_PTR(error); - if (!page) + if (!folio) return ERR_PTR(-ECHILD); - if (PageHWPoison(page)) { - unlock_page(page); - put_page(page); + if (PageHWPoison(&folio->page)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-ECHILD); } - unlock_page(page); + folio_unlock(folio); } - set_delayed_call(done, shmem_put_link, page); - return page_address(page); + set_delayed_call(done, shmem_put_link, folio); + return folio_address(folio); } #ifdef CONFIG_TMPFS_XATTR -- GitLab From 7459c149ae9ca7d6f241b3a3764aa81b9c405a0e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:27 +0100 Subject: [PATCH 0906/2223] khugepaged: call shmem_get_folio() shmem_getpage() is being removed, so call its 
replacement and find the precise page ourselves. Link: https://lkml.kernel.org/r/20220902194653.1739778-32-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/khugepaged.c | 7 +++++-- mm/shmem.c | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1e59fe7bfae36..57af2c841b410 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1647,13 +1647,16 @@ static int collapse_file(struct mm_struct *mm, struct file *file, } if (xa_is_value(page) || !PageUptodate(page)) { + struct folio *folio; + xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ - if (shmem_getpage(mapping->host, index, &page, - SGP_NOALLOC)) { + if (shmem_get_folio(mapping->host, index, + &folio, SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } + page = folio_file_page(folio, index); } else if (trylock_page(page)) { get_page(page); xas_unlock_irq(&xas); diff --git a/mm/shmem.c b/mm/shmem.c index e6e934adeed7f..909149b25d98b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3158,7 +3158,7 @@ static const char *shmem_get_link(struct dentry *dentry, folio = filemap_get_folio(inode->i_mapping, 0); if (!folio) return ERR_PTR(-ECHILD); - if (PageHWPoison(&folio->page) || + if (PageHWPoison(folio_page(folio, 0)) || !folio_test_uptodate(folio)) { folio_put(folio); return ERR_PTR(-ECHILD); @@ -3169,7 +3169,7 @@ static const char *shmem_get_link(struct dentry *dentry, return ERR_PTR(error); if (!folio) return ERR_PTR(-ECHILD); - if (PageHWPoison(&folio->page)) { + if (PageHWPoison(folio_page(folio, 0))) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-ECHILD); -- GitLab From 12acf4fbc4f78b24822317888b9406d56dc9ad2a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:28 +0100 Subject: [PATCH 0907/2223] userfaultfd: convert mcontinue_atomic_pte() to use a folio shmem_getpage() is being replaced by shmem_get_folio() so use a folio throughout this function. 
Saves several calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-33-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 7327b2573f7c2..9c035be2148bb 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -243,20 +243,22 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, { struct inode *inode = file_inode(dst_vma->vm_file); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + struct folio *folio; struct page *page; int ret; - ret = shmem_getpage(inode, pgoff, &page, SGP_NOALLOC); - /* Our caller expects us to return -EFAULT if we failed to find page. */ + ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC); + /* Our caller expects us to return -EFAULT if we failed to find folio */ if (ret == -ENOENT) ret = -EFAULT; if (ret) goto out; - if (!page) { + if (!folio) { ret = -EFAULT; goto out; } + page = folio_file_page(folio, pgoff); if (PageHWPoison(page)) { ret = -EIO; goto out_release; @@ -267,13 +269,13 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, if (ret) goto out_release; - unlock_page(page); + folio_unlock(folio); ret = 0; out: return ret; out_release: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } -- GitLab From 923e2f0e7c30db5c1ee5d680050ab781e6c114fb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:29 +0100 Subject: [PATCH 0908/2223] shmem: remove shmem_getpage() With all callers removed, remove this wrapper function. The flags are now mysteriously called SGP, but I think we can live with that. 
Link: https://lkml.kernel.org/r/20220902194653.1739778-34-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 4 +--- mm/shmem.c | 15 +-------------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f4bd50b08a915..f24071e3c826e 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -102,7 +102,7 @@ extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); -/* Flag allocation requirements to shmem_getpage */ +/* Flag allocation requirements to shmem_get_folio */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ @@ -111,8 +111,6 @@ enum sgp_type { SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; -extern int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp); int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp); diff --git a/mm/shmem.c b/mm/shmem.c index 909149b25d98b..3d0b729fcc5ec 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -179,7 +179,7 @@ static inline int shmem_reacct_size(unsigned long flags, /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. - * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, + * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. 
*/ static inline int shmem_acct_block(unsigned long flags, long pages) @@ -2031,19 +2031,6 @@ int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); } -int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp) -{ - struct folio *folio = NULL; - int ret = shmem_get_folio(inode, index, &folio, sgp); - - if (folio) - *pagep = folio_file_page(folio, index); - else - *pagep = NULL; - return ret; -} - /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the -- GitLab From 000085b9af9f3ca13dd672a753f815ac0cb45d0a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:30 +0100 Subject: [PATCH 0909/2223] swapfile: convert try_to_unuse() to use a folio Saves five calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-35-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index aafe739dc2a62..23cdbe8e47cfb 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2032,7 +2032,7 @@ static int try_to_unuse(unsigned int type) struct list_head *p; int retval = 0; struct swap_info_struct *si = swap_info[type]; - struct page *page; + struct folio *folio; swp_entry_t entry; unsigned int i; @@ -2082,21 +2082,21 @@ retry: (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); - page = find_get_page(swap_address_space(entry), i); - if (!page) + folio = filemap_get_folio(swap_address_space(entry), i); + if (!folio) continue; /* - * It is conceivable that a racing task removed this page from - * swap cache just before we acquired the page lock. 
The page + * It is conceivable that a racing task removed this folio from + * swap cache just before we acquired the page lock. The folio * might even be back in swap cache on another swap area. But - * that is okay, try_to_free_swap() only removes stale pages. + * that is okay, folio_free_swap() only removes stale folios. */ - lock_page(page); - wait_on_page_writeback(page); - try_to_free_swap(page); - unlock_page(page); - put_page(page); + folio_lock(folio); + folio_wait_writeback(folio); + folio_free_swap(folio); + folio_unlock(folio); + folio_put(folio); } /* -- GitLab From 2c3f6194b008b23e52a8e135bdd56b67fdaa55ca Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:31 +0100 Subject: [PATCH 0910/2223] swapfile: convert __try_to_reclaim_swap() to use a folio Saves five calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-36-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 23cdbe8e47cfb..e3e1bd3d20b17 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -132,27 +132,27 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); - struct page *page; + struct folio *folio; int ret = 0; - page = find_get_page(swap_address_space(entry), offset); - if (!page) + folio = filemap_get_folio(swap_address_space(entry), offset); + if (!folio) return 0; /* * When this function is called from scan_swap_map_slots() and it's - * called by vmscan.c at reclaiming pages. So, we hold a lock on a page, + * called by vmscan.c at reclaiming folios. So we hold a folio lock * here. We have to use trylock for avoiding deadlock. 
This is a special - * case and you should use try_to_free_swap() with explicit lock_page() + * case and you should use folio_free_swap() with explicit folio_lock() * in usual operations. */ - if (trylock_page(page)) { + if (folio_trylock(folio)) { if ((flags & TTRS_ANYWAY) || - ((flags & TTRS_UNMAPPED) && !page_mapped(page)) || - ((flags & TTRS_FULL) && mem_cgroup_swap_full(page))) - ret = try_to_free_swap(page); - unlock_page(page); + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(&folio->page))) + ret = folio_free_swap(folio); + folio_unlock(folio); } - put_page(page); + folio_put(folio); return ret; } -- GitLab From f102cd8b173e066179b472fb6e3b18e31a1cc394 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:32 +0100 Subject: [PATCH 0911/2223] swapfile: convert unuse_pte_range() to use a folio Delay fetching the precise page from the folio until we're in unuse_pte(). Saves many calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-37-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index e3e1bd3d20b17..3820b5ab64d94 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1758,8 +1758,9 @@ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) * force COW, vm_page_prot omits write permission from any private vma. 
*/ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, swp_entry_t entry, struct page *page) + unsigned long addr, swp_entry_t entry, struct folio *folio) { + struct page *page = folio_file_page(folio, swp_offset(entry)); struct page *swapcache; spinlock_t *ptl; pte_t *pte, new_pte; @@ -1831,17 +1832,18 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned int type) { - struct page *page; swp_entry_t entry; pte_t *pte; struct swap_info_struct *si; - unsigned long offset; int ret = 0; volatile unsigned char *swap_map; si = swap_info[type]; pte = pte_offset_map(pmd, addr); do { + struct folio *folio; + unsigned long offset; + if (!is_swap_pte(*pte)) continue; @@ -1852,8 +1854,9 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, offset = swp_offset(entry); pte_unmap(pte); swap_map = &si->swap_map[offset]; - page = lookup_swap_cache(entry, vma, addr); - if (!page) { + folio = swap_cache_get_folio(entry, vma, addr); + if (!folio) { + struct page *page; struct vm_fault vmf = { .vma = vma, .address = addr, @@ -1863,25 +1866,27 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); + if (page) + folio = page_folio(page); } - if (!page) { + if (!folio) { if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) goto try_next; return -ENOMEM; } - lock_page(page); - wait_on_page_writeback(page); - ret = unuse_pte(vma, pmd, addr, entry, page); + folio_lock(folio); + folio_wait_writeback(folio); + ret = unuse_pte(vma, pmd, addr, entry, folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } - try_to_free_swap(page); - unlock_page(page); - put_page(page); + folio_free_swap(folio); + folio_unlock(folio); + folio_put(folio); try_next: pte = pte_offset_map(pmd, addr); } while (pte++, addr += PAGE_SIZE, addr != end); -- GitLab From 
5a423081b2465d38baf2fcbbc19f77d211507061 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:33 +0100 Subject: [PATCH 0912/2223] mm: convert do_swap_page() to use swap_cache_get_folio() Saves a folio->page->folio conversion. Link: https://lkml.kernel.org/r/20220902194653.1739778-38-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index b36b177e0ea91..0018df3f0cc24 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3767,9 +3767,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!si)) goto out; - page = lookup_swap_cache(entry, vma, vmf->address); - if (page) - folio = page_folio(page); + folio = swap_cache_get_folio(entry, vma, vmf->address); + if (folio) + page = folio_file_page(folio, swp_offset(entry)); swapcache = folio; if (!folio) { -- GitLab From cb691e2f28bc63b1a872aa593dd542ee796e8364 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:34 +0100 Subject: [PATCH 0913/2223] mm: remove lookup_swap_cache() All callers have now been converted to swap_cache_get_folio(), so we can remove this wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-39-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- mm/swap.h | 10 ---------- mm/swap_state.c | 12 +----------- 3 files changed, 2 insertions(+), 22 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 621b4472c4094..9863fb5889729 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5569,7 +5569,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return NULL; /* - * Because lookup_swap_cache() updates some statistics counter, + * Because swap_cache_get_folio() updates some statistics counter, * we call find_get_page() with swapper_space directly. 
*/ page = find_get_page(swap_address_space(ent), swp_offset(ent)); diff --git a/mm/swap.h b/mm/swap.h index ccd8d9a9ad36d..cc08c459c6190 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -41,9 +41,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); -struct page *lookup_swap_cache(swp_entry_t entry, - struct vm_area_struct *vma, - unsigned long addr); struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index); struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, @@ -107,13 +104,6 @@ static inline struct folio *swap_cache_get_folio(swp_entry_t entry, return NULL; } -static inline struct page *lookup_swap_cache(swp_entry_t swp, - struct vm_area_struct *vma, - unsigned long addr) -{ - return NULL; -} - static inline struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) { diff --git a/mm/swap_state.c b/mm/swap_state.c index b96bf4ec8b5b1..4af135a7b53c4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -369,16 +369,6 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, return folio; } -struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, - unsigned long addr) -{ - struct folio *folio = swap_cache_get_folio(entry, vma, addr); - - if (!folio) - return NULL; - return folio_file_page(folio, swp_offset(entry)); -} - /** * find_get_incore_page - Find and get a page from the page or swap caches. * @mapping: The address_space to search. @@ -430,7 +420,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, int err; /* * First check the swap cache. Since this is normally - * called after lookup_swap_cache() failed, re-calling + * called after swap_cache_get_folio() failed, re-calling * that would confuse statistics. 
*/ si = get_swap_device(entry); -- GitLab From aedd74d4397a2b1a4882215b6169b47d139c0319 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:35 +0100 Subject: [PATCH 0914/2223] swap_state: convert free_swap_cache() to use a folio Saves several calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-40-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap_state.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 4af135a7b53c4..438d0676c5be2 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -272,16 +272,19 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, /* * If we are the only user, then try to free up the swap cache. * - * Its ok to check for PageSwapCache without the page lock + * Its ok to check the swapcache flag without the folio lock * here because we are going to recheck again inside - * try_to_free_swap() _with_ the lock. + * folio_free_swap() _with_ the lock. * - Marcelo */ void free_swap_cache(struct page *page) { - if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) { - try_to_free_swap(page); - unlock_page(page); + struct folio *folio = page_folio(page); + + if (folio_test_swapcache(folio) && !folio_mapped(folio) && + folio_trylock(folio)) { + folio_free_swap(folio); + folio_unlock(folio); } } -- GitLab From 71fa1a533d2e027a3df98fd065605bebab42d7bf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:36 +0100 Subject: [PATCH 0915/2223] swap: convert swap_writepage() to use a folio Removes many calls to compound_head(). 
Link: https://lkml.kernel.org/r/20220902194653.1739778-41-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page_io.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index fc6b3fb1f7c59..2af34dd8fa4db 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -180,29 +180,30 @@ bad_bmap: */ int swap_writepage(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); int ret = 0; - if (try_to_free_swap(page)) { - unlock_page(page); + if (folio_free_swap(folio)) { + folio_unlock(folio); goto out; } /* * Arch code may have to preserve more data than just the page * contents, e.g. memory tags. */ - ret = arch_prepare_to_swap(page); + ret = arch_prepare_to_swap(&folio->page); if (ret) { - set_page_dirty(page); - unlock_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); goto out; } - if (frontswap_store(page) == 0) { - set_page_writeback(page); - unlock_page(page); - end_page_writeback(page); + if (frontswap_store(&folio->page) == 0) { + folio_start_writeback(folio); + folio_unlock(folio); + folio_end_writeback(folio); goto out; } - ret = __swap_writepage(page, wbc); + ret = __swap_writepage(&folio->page, wbc); out: return ret; } -- GitLab From e4a2ed94908cc0104b8826ed8d831661ed1c3ea1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:37 +0100 Subject: [PATCH 0916/2223] mm: convert do_wp_page() to use a folio Saves many calls to compound_head(). 
Link: https://lkml.kernel.org/r/20220902194653.1739778-42-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 0018df3f0cc24..2f1397b7c77dc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3368,6 +3368,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; + struct folio *folio; VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE)); VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE)); @@ -3414,48 +3415,47 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(vmf->page)) { - struct page *page = vmf->page; - + folio = page_folio(vmf->page); + if (folio_test_anon(folio)) { /* * If the page is exclusive to this process we must reuse the * page without further checks. */ - if (PageAnonExclusive(page)) + if (PageAnonExclusive(vmf->page)) goto reuse; /* - * We have to verify under page lock: these early checks are - * just an optimization to avoid locking the page and freeing + * We have to verify under folio lock: these early checks are + * just an optimization to avoid locking the folio and freeing * the swapcache if there is little hope that we can reuse. * - * PageKsm() doesn't necessarily raise the page refcount. + * KSM doesn't necessarily raise the folio refcount. */ - if (PageKsm(page) || page_count(page) > 3) + if (folio_test_ksm(folio) || folio_ref_count(folio) > 3) goto copy; - if (!PageLRU(page)) + if (!folio_test_lru(folio)) /* * Note: We cannot easily detect+handle references from - * remote LRU pagevecs or references to PageLRU() pages. + * remote LRU pagevecs or references to LRU folios. 
*/ lru_add_drain(); - if (page_count(page) > 1 + PageSwapCache(page)) + if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio)) goto copy; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto copy; - if (PageSwapCache(page)) - try_to_free_swap(page); - if (PageKsm(page) || page_count(page) != 1) { - unlock_page(page); + if (folio_test_swapcache(folio)) + folio_free_swap(folio); + if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) { + folio_unlock(folio); goto copy; } /* - * Ok, we've got the only page reference from our mapping - * and the page is locked, it's dark out, and we're wearing + * Ok, we've got the only folio reference from our mapping + * and the folio is locked, it's dark out, and we're wearing * sunglasses. Hit it. */ - page_move_anon_rmap(page, vma); - unlock_page(page); + page_move_anon_rmap(vmf->page, vma); + folio_unlock(folio); reuse: if (unlikely(unshare)) { pte_unmap_unlock(vmf->pte, vmf->ptl); -- GitLab From 2fad3d14b9ebc8e42977bfb34a8165bb61a7c3f7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:38 +0100 Subject: [PATCH 0917/2223] huge_memory: convert do_huge_pmd_wp_page() to use a folio Removes many calls to compound_head(). Does not remove the assumption that a folio may not be larger than a PMD. 
Link: https://lkml.kernel.org/r/20220902194653.1739778-43-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 84bf1d5f6b7e8..1181e623bf5b7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1305,6 +1305,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; + struct folio *folio; struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t orig_pmd = vmf->orig_pmd; @@ -1326,46 +1327,48 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) } page = pmd_page(orig_pmd); + folio = page_folio(page); VM_BUG_ON_PAGE(!PageHead(page), page); /* Early check when only holding the PT lock. */ if (PageAnonExclusive(page)) goto reuse; - if (!trylock_page(page)) { - get_page(page); + if (!folio_trylock(folio)) { + folio_get(folio); spin_unlock(vmf->ptl); - lock_page(page); + folio_lock(folio); spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { spin_unlock(vmf->ptl); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return 0; } - put_page(page); + folio_put(folio); } /* Recheck after temporarily dropping the PT lock. */ if (PageAnonExclusive(page)) { - unlock_page(page); + folio_unlock(folio); goto reuse; } /* - * See do_wp_page(): we can only reuse the page exclusively if there are - * no additional references. Note that we always drain the LRU - * pagevecs immediately after adding a THP. + * See do_wp_page(): we can only reuse the folio exclusively if + * there are no additional references. Note that we always drain + * the LRU pagevecs immediately after adding a THP. 
*/ - if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page)) + if (folio_ref_count(folio) > + 1 + folio_test_swapcache(folio) * folio_nr_pages(folio)) goto unlock_fallback; - if (PageSwapCache(page)) - try_to_free_swap(page); - if (page_count(page) == 1) { + if (folio_test_swapcache(folio)) + folio_free_swap(folio); + if (folio_ref_count(folio) == 1) { pmd_t entry; page_move_anon_rmap(page, vma); - unlock_page(page); + folio_unlock(folio); reuse: if (unlikely(unshare)) { spin_unlock(vmf->ptl); @@ -1380,7 +1383,7 @@ reuse: } unlock_fallback: - unlock_page(page); + folio_unlock(folio); spin_unlock(vmf->ptl); fallback: __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); -- GitLab From 98b211d6415f9538b81e50da71f09d195ce2afe6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:39 +0100 Subject: [PATCH 0918/2223] madvise: convert madvise_free_pte_range() to use a folio Saves a lot of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-44-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/madvise.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index a3fc4cd32ed36..2baa93ca23109 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -601,6 +601,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *orig_pte, *pte, ptent; + struct folio *folio; struct page *page; int nr_swap = 0; unsigned long next; @@ -645,56 +646,56 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, page = vm_normal_page(vma, addr, ptent); if (!page || is_zone_device_page(page)) continue; + folio = page_folio(page); /* - * If pmd isn't transhuge but the page is THP and + * If pmd isn't transhuge but the folio is large and * is owned by only this process, split it and * deactivate all pages. 
*/ - if (PageTransCompound(page)) { - if (page_mapcount(page) != 1) + if (folio_test_large(folio)) { + if (folio_mapcount(folio) != 1) goto out; - get_page(page); - if (!trylock_page(page)) { - put_page(page); + folio_get(folio); + if (!folio_trylock(folio)) { + folio_put(folio); goto out; } pte_unmap_unlock(orig_pte, ptl); - if (split_huge_page(page)) { - unlock_page(page); - put_page(page); + if (split_folio(folio)) { + folio_unlock(folio); + folio_put(folio); orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); goto out; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte--; addr -= PAGE_SIZE; continue; } - VM_BUG_ON_PAGE(PageTransCompound(page), page); - - if (PageSwapCache(page) || PageDirty(page)) { - if (!trylock_page(page)) + if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { + if (!folio_trylock(folio)) continue; /* - * If page is shared with others, we couldn't clear - * PG_dirty of the page. + * If folio is shared with others, we mustn't clear + * the folio's dirty flag. 
*/ - if (page_mapcount(page) != 1) { - unlock_page(page); + if (folio_mapcount(folio) != 1) { + folio_unlock(folio); continue; } - if (PageSwapCache(page) && !try_to_free_swap(page)) { - unlock_page(page); + if (folio_test_swapcache(folio) && + !folio_free_swap(folio)) { + folio_unlock(folio); continue; } - ClearPageDirty(page); - unlock_page(page); + folio_clear_dirty(folio); + folio_unlock(folio); } if (pte_young(ptent) || pte_dirty(ptent)) { @@ -712,7 +713,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(mm, addr, pte, ptent); tlb_remove_tlb_entry(tlb, pte, addr); } - mark_page_lazyfree(page); + mark_page_lazyfree(&folio->page); } out: if (nr_swap) { -- GitLab From 5fcd079af9ed4e69cca0a2f77c6255d0eb8a8cca Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:40 +0100 Subject: [PATCH 0919/2223] uprobes: use folios more widely in __replace_page() Remove a few hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-45-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 401bc2d24ce06..70375c7c0c4b9 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -19,7 +19,7 @@ #include #include /* anon_vma_prepare */ #include /* set_pte_at_notify */ -#include /* try_to_free_swap */ +#include /* folio_free_swap */ #include /* user_enable_single_step */ #include /* notifier mechanism */ #include "../../mm/internal.h" /* munlock_vma_page */ @@ -154,8 +154,9 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { + struct folio *old_folio = page_folio(old_page); struct mm_struct *mm = vma->vm_mm; - 
DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0); + DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; struct mmu_notifier_range range; @@ -169,8 +170,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, return err; } - /* For try_to_free_swap() below */ - lock_page(old_page); + /* For folio_free_swap() below */ + folio_lock(old_folio); mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; @@ -186,7 +187,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); - if (!PageAnon(old_page)) { + if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(old_page)); inc_mm_counter(mm, MM_ANONPAGES); } @@ -198,15 +199,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, vma, false); - if (!page_mapped(old_page)) - try_to_free_swap(old_page); + if (!folio_mapped(old_folio)) + folio_free_swap(old_folio); page_vma_mapped_walk_done(&pvmw); - put_page(old_page); + folio_put(old_folio); err = 0; unlock: mmu_notifier_invalidate_range_end(&range); - unlock_page(old_page); + folio_unlock(old_folio); return err; } -- GitLab From b4e6f66e45b43aed0903731b6c0700573f88282a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:41 +0100 Subject: [PATCH 0920/2223] ksm: use a folio in replace_page() Replace three calls to compound_head() with one. 
Link: https://lkml.kernel.org/r/20220902194653.1739778-46-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/ksm.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index c3edb5836a441..c19fcca9bc03d 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1110,6 +1110,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage, pte_t orig_pte) { struct mm_struct *mm = vma->vm_mm; + struct folio *folio; pmd_t *pmd; pmd_t pmde; pte_t *ptep; @@ -1178,10 +1179,11 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); + folio = page_folio(page); page_remove_rmap(page, vma, false); - if (!page_mapped(page)) - try_to_free_swap(page); - put_page(page); + if (!folio_mapped(folio)) + folio_free_swap(folio); + folio_put(folio); pte_unmap_unlock(ptep, ptl); err = 0; -- GitLab From a160e5377b55bc5c1925a7456b656aabfc07261f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:42 +0100 Subject: [PATCH 0921/2223] mm: convert do_swap_page() to use folio_free_swap() Also convert should_try_to_free_swap() to use a folio. This removes a few calls to compound_head(). 
Link: https://lkml.kernel.org/r/20220902194653.1739778-47-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 2f1397b7c77dc..b8e4dae18ac15 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3641,14 +3641,14 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) return 0; } -static inline bool should_try_to_free_swap(struct page *page, +static inline bool should_try_to_free_swap(struct folio *folio, struct vm_area_struct *vma, unsigned int fault_flags) { - if (!PageSwapCache(page)) + if (!folio_test_swapcache(folio)) return false; - if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || - PageMlocked(page)) + if (mem_cgroup_swap_full(&folio->page) || (vma->vm_flags & VM_LOCKED) || + folio_test_mlocked(folio)) return true; /* * If we want to map a page that's in the swapcache writable, we @@ -3656,8 +3656,8 @@ static inline bool should_try_to_free_swap(struct page *page, * user. Try freeing the swapcache to get rid of the swapcache * reference only in case it's likely that we'll be the exlusive user. */ - return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) && - page_count(page) == 2; + return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && + folio_ref_count(folio) == 2; } static vm_fault_t pte_marker_clear(struct vm_fault *vmf) @@ -3949,8 +3949,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * yet. 
*/ swap_free(entry); - if (should_try_to_free_swap(page, vma, vmf->flags)) - try_to_free_swap(page); + if (should_try_to_free_swap(folio, vma, vmf->flags)) + folio_free_swap(folio); inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); -- GitLab From 9202d527b715f67bcdccbb9b712b65fe053f8109 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:43 +0100 Subject: [PATCH 0922/2223] memcg: convert mem_cgroup_swap_full() to take a folio All callers now have a folio, so convert the function to take a folio. Saves a couple of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-48-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- mm/memcontrol.c | 6 +++--- mm/memory.c | 2 +- mm/swapfile.c | 2 +- mm/vmscan.c | 3 +-- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 42cbef554de68..d8bd6401c3e7d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -692,7 +692,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p } extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); -extern bool mem_cgroup_swap_full(struct page *page); +extern bool mem_cgroup_swap_full(struct folio *folio); #else static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) { @@ -714,7 +714,7 @@ static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return get_nr_swap_pages(); } -static inline bool mem_cgroup_swap_full(struct page *page) +static inline bool mem_cgroup_swap_full(struct folio *folio) { return vm_swap_full(); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9863fb5889729..632402001bca1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7406,18 +7406,18 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return nr_swap_pages; } -bool 
mem_cgroup_swap_full(struct page *page) +bool mem_cgroup_swap_full(struct folio *folio) { struct mem_cgroup *memcg; - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (vm_swap_full()) return true; if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; - memcg = page_memcg(page); + memcg = folio_memcg(folio); if (!memcg) return false; diff --git a/mm/memory.c b/mm/memory.c index b8e4dae18ac15..2f1a6da7f1e65 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3647,7 +3647,7 @@ static inline bool should_try_to_free_swap(struct folio *folio, { if (!folio_test_swapcache(folio)) return false; - if (mem_cgroup_swap_full(&folio->page) || (vma->vm_flags & VM_LOCKED) || + if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) || folio_test_mlocked(folio)) return true; /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 3820b5ab64d94..4efcfe34e45b9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -148,7 +148,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, if (folio_trylock(folio)) { if ((flags & TTRS_ANYWAY) || ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || - ((flags & TTRS_FULL) && mem_cgroup_swap_full(&folio->page))) + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) ret = folio_free_swap(folio); folio_unlock(folio); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 1707e3bfcfe42..c5a4bff11da69 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2047,8 +2047,7 @@ activate_locked_split: activate_locked: /* Not a candidate for swapping, so reclaim swap space. 
*/ if (folio_test_swapcache(folio) && - (mem_cgroup_swap_full(&folio->page) || - folio_test_mlocked(folio))) + (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio))) folio_free_swap(folio); VM_BUG_ON_FOLIO(folio_test_active(folio), folio); if (!folio_test_mlocked(folio)) { -- GitLab From 3b344157c0c15b8f9588e3021dfb22ee25f4508a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:44 +0100 Subject: [PATCH 0923/2223] mm: remove try_to_free_swap() All callers have now been converted to folio_free_swap() and we can remove this wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-49-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 ------ mm/folio-compat.c | 7 ------- mm/memory.c | 2 +- 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index d8bd6401c3e7d..fc8d98660326f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -510,7 +510,6 @@ extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); -extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); @@ -595,11 +594,6 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline int try_to_free_swap(struct page *page) -{ - return 0; -} - static inline swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 06d47f00609b5..e1e23b4947d73 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -146,10 +146,3 @@ void putback_lru_page(struct page *page) { folio_putback_lru(page_folio(page)); } - -#ifdef CONFIG_SWAP -int try_to_free_swap(struct page 
*page) -{ - return folio_free_swap(page_folio(page)); -} -#endif diff --git a/mm/memory.c b/mm/memory.c index 2f1a6da7f1e65..6e568f190e7a8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3844,7 +3844,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (swapcache) { /* - * Make sure try_to_free_swap or swapoff did not release the + * Make sure folio_free_swap() or swapoff did not release the * swapcache from under us. The page pin, and pte_same test * below, are not enough to exclude that. Even if it is still * swapcache, we need to check that the page's swap has not -- GitLab From 595af4c9368aba88c45831ef80ed686b602fe3fe Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:45 +0100 Subject: [PATCH 0924/2223] rmap: convert page_move_anon_rmap() to use a folio Removes one call to compound_head() and a reference to page->mapping. Link: https://lkml.kernel.org/r/20220902194653.1739778-50-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 2ff17b9aabd9b..d44ff516a2089 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1099,22 +1099,20 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, */ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) { - struct anon_vma *anon_vma = vma->anon_vma; - struct page *subpage = page; - - page = compound_head(page); + void *anon_vma = vma->anon_vma; + struct folio *folio = page_folio(page); - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_VMA(!anon_vma, vma); - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + anon_vma += PAGE_MAPPING_ANON; /* * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written * simultaneously, so a concurrent reader (eg folio_referenced()'s * folio_test_anon()) will not see one without the other. 
*/ - WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); - SetPageAnonExclusive(subpage); + WRITE_ONCE(folio->mapping, anon_vma); + SetPageAnonExclusive(page); } /** -- GitLab From 682a71a1b6b363bff71440f4eca6498f827a839d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:46 +0100 Subject: [PATCH 0925/2223] migrate: convert __unmap_and_move() to use folios Removes a lot of calls to compound_head(). Also remove a VM_BUG_ON that can never trigger as the PageAnon bit is the bottom bit of page->mapping. Link: https://lkml.kernel.org/r/20220902194653.1739778-51-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/migrate.c | 75 ++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index eb594b0db8060..1ea149f14f849 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -993,17 +993,15 @@ out: return rc; } -static int __unmap_and_move(struct page *page, struct page *newpage, +static int __unmap_and_move(struct folio *src, struct folio *dst, int force, enum migrate_mode mode) { - struct folio *folio = page_folio(page); - struct folio *dst = page_folio(newpage); int rc = -EAGAIN; bool page_was_mapped = false; struct anon_vma *anon_vma = NULL; - bool is_lru = !__PageMovable(page); + bool is_lru = !__PageMovable(&src->page); - if (!trylock_page(page)) { + if (!folio_trylock(src)) { if (!force || mode == MIGRATE_ASYNC) goto out; @@ -1023,10 +1021,10 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (current->flags & PF_MEMALLOC) goto out; - lock_page(page); + folio_lock(src); } - if (PageWriteback(page)) { + if (folio_test_writeback(src)) { /* * Only in the case of a full synchronous migration is it * necessary to wait for PageWriteback. 
In the async case, @@ -1043,12 +1041,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } if (!force) goto out_unlock; - wait_on_page_writeback(page); + folio_wait_writeback(src); } /* - * By try_to_migrate(), page->mapcount goes down to 0 here. In this case, - * we cannot notice that anon_vma is freed while we migrates a page. + * By try_to_migrate(), src->mapcount goes down to 0 here. In this case, + * we cannot notice that anon_vma is freed while we migrate a page. * This get_anon_vma() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, @@ -1060,22 +1058,22 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * because that implies that the anon page is no longer mapped * (and cannot be remapped so long as we hold the page lock). */ - if (PageAnon(page) && !PageKsm(page)) - anon_vma = page_get_anon_vma(page); + if (folio_test_anon(src) && !folio_test_ksm(src)) + anon_vma = page_get_anon_vma(&src->page); /* * Block others from accessing the new page when we get around to * establishing additional references. We are usually the only one - * holding a reference to newpage at this point. We used to have a BUG - * here if trylock_page(newpage) fails, but would like to allow for - * cases where there might be a race with the previous use of newpage. + * holding a reference to dst at this point. We used to have a BUG + * here if folio_trylock(dst) fails, but would like to allow for + * cases where there might be a race with the previous use of dst. * This is much like races on refcount of oldpage: just don't BUG(). 
*/ - if (unlikely(!trylock_page(newpage))) + if (unlikely(!folio_trylock(dst))) goto out_unlock; if (unlikely(!is_lru)) { - rc = move_to_new_folio(dst, folio, mode); + rc = move_to_new_folio(dst, src, mode); goto out_unlock_both; } @@ -1083,7 +1081,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * Corner case handling: * 1. When a new swap-cache page is read into, it is added to the LRU * and treated as swapcache but it has no rmap yet. - * Calling try_to_unmap() against a page->mapping==NULL page will + * Calling try_to_unmap() against a src->mapping==NULL page will * trigger a BUG. So handle it here. * 2. An orphaned page (see truncate_cleanup_page) might have * fs-private metadata. The page can be picked up due to memory @@ -1091,57 +1089,56 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * invisible to the vm, so the page can not be migrated. So try to * free the metadata, so the page can be freed. */ - if (!page->mapping) { - VM_BUG_ON_PAGE(PageAnon(page), page); - if (page_has_private(page)) { - try_to_free_buffers(folio); + if (!src->mapping) { + if (folio_test_private(src)) { + try_to_free_buffers(src); goto out_unlock_both; } - } else if (page_mapped(page)) { + } else if (folio_mapped(src)) { /* Establish migration ptes */ - VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, - page); - try_to_migrate(folio, 0); + VM_BUG_ON_FOLIO(folio_test_anon(src) && + !folio_test_ksm(src) && !anon_vma, src); + try_to_migrate(src, 0); page_was_mapped = true; } - if (!page_mapped(page)) - rc = move_to_new_folio(dst, folio, mode); + if (!folio_mapped(src)) + rc = move_to_new_folio(dst, src, mode); /* - * When successful, push newpage to LRU immediately: so that if it + * When successful, push dst to LRU immediately: so that if it * turns out to be an mlocked page, remove_migration_ptes() will - * automatically build up the correct newpage->mlock_count for it. 
+ * automatically build up the correct dst->mlock_count for it. * * We would like to do something similar for the old page, when * unsuccessful, and other cases when a page has been temporarily * isolated from the unevictable LRU: but this case is the easiest. */ if (rc == MIGRATEPAGE_SUCCESS) { - lru_cache_add(newpage); + folio_add_lru(dst); if (page_was_mapped) lru_add_drain(); } if (page_was_mapped) - remove_migration_ptes(folio, - rc == MIGRATEPAGE_SUCCESS ? dst : folio, false); + remove_migration_ptes(src, + rc == MIGRATEPAGE_SUCCESS ? dst : src, false); out_unlock_both: - unlock_page(newpage); + folio_unlock(dst); out_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); - unlock_page(page); + folio_unlock(src); out: /* - * If migration is successful, decrease refcount of the newpage, + * If migration is successful, decrease refcount of dst, * which will not free the page because new page owner increased * refcounter. */ if (rc == MIGRATEPAGE_SUCCESS) - put_page(newpage); + folio_put(dst); return rc; } @@ -1157,6 +1154,7 @@ static int unmap_and_move(new_page_t get_new_page, enum migrate_reason reason, struct list_head *ret) { + struct folio *dst, *src = page_folio(page); int rc = MIGRATEPAGE_SUCCESS; struct page *newpage = NULL; @@ -1174,9 +1172,10 @@ static int unmap_and_move(new_page_t get_new_page, newpage = get_new_page(page, private); if (!newpage) return -ENOMEM; + dst = page_folio(newpage); newpage->private = 0; - rc = __unmap_and_move(page, newpage, force, mode); + rc = __unmap_and_move(src, dst, force, mode); if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(newpage, reason); -- GitLab From c33db29231ad242b0c381c60b1603f5e1dec7e46 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:47 +0100 Subject: [PATCH 0926/2223] migrate: convert unmap_and_move_huge_page() to use folios Saves several calls to compound_head() and removes a couple of uses of page->lru. 
Link: https://lkml.kernel.org/r/20220902194653.1739778-52-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/migrate.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 1ea149f14f849..c1c2d9d9032b9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1263,7 +1263,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (!hugepage_migration_supported(page_hstate(hpage))) return -ENOSYS; - if (page_count(hpage) == 1) { + if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ putback_active_hugepage(hpage); return MIGRATEPAGE_SUCCESS; @@ -1274,7 +1274,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOMEM; dst = page_folio(new_hpage); - if (!trylock_page(hpage)) { + if (!folio_trylock(src)) { if (!force) goto out; switch (mode) { @@ -1284,29 +1284,29 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, default: goto out; } - lock_page(hpage); + folio_lock(src); } /* * Check for pages which are in the process of being freed. Without - * page_mapping() set, hugetlbfs specific move page routine will not + * folio_mapping() set, hugetlbfs specific move page routine will not * be called and we could leak usage counts for subpools. */ - if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) { + if (hugetlb_page_subpool(hpage) && !folio_mapping(src)) { rc = -EBUSY; goto out_unlock; } - if (PageAnon(hpage)) - anon_vma = page_get_anon_vma(hpage); + if (folio_test_anon(src)) + anon_vma = page_get_anon_vma(&src->page); - if (unlikely(!trylock_page(new_hpage))) + if (unlikely(!folio_trylock(dst))) goto put_anon; - if (page_mapped(hpage)) { + if (folio_mapped(src)) { enum ttu_flags ttu = 0; - if (!PageAnon(hpage)) { + if (!folio_test_anon(src)) { /* * In shared mappings, try_to_unmap could potentially * call huge_pmd_unshare. 
Because of this, take @@ -1327,7 +1327,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, i_mmap_unlock_write(mapping); } - if (!page_mapped(hpage)) + if (!folio_mapped(src)) rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) @@ -1335,7 +1335,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc == MIGRATEPAGE_SUCCESS ? dst : src, false); unlock_put_anon: - unlock_page(new_hpage); + folio_unlock(dst); put_anon: if (anon_vma) @@ -1347,12 +1347,12 @@ put_anon: } out_unlock: - unlock_page(hpage); + folio_unlock(src); out: if (rc == MIGRATEPAGE_SUCCESS) putback_active_hugepage(hpage); else if (rc != -EAGAIN) - list_move_tail(&hpage->lru, ret); + list_move_tail(&src->lru, ret); /* * If migration was not successful and there's a freeing callback, use -- GitLab From 3e9a13daa61253e28a1c7d8f366931e0a58a2b5a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:48 +0100 Subject: [PATCH 0927/2223] huge_memory: convert split_huge_page_to_list() to use a folio Saves many calls to compound_head(). 
Link: https://lkml.kernel.org/r/20220902194653.1739778-53-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 49 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1181e623bf5b7..bb8266b099f54 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2622,27 +2622,26 @@ bool can_split_folio(struct folio *folio, int *pextra_pins) int split_huge_page_to_list(struct page *page, struct list_head *list) { struct folio *folio = page_folio(page); - struct page *head = &folio->page; - struct deferred_split *ds_queue = get_deferred_split_queue(head); - XA_STATE(xas, &head->mapping->i_pages, head->index); + struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page); + XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int extra_pins, ret; pgoff_t end; bool is_hzp; - VM_BUG_ON_PAGE(!PageLocked(head), head); - VM_BUG_ON_PAGE(!PageCompound(head), head); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); - is_hzp = is_huge_zero_page(head); - VM_WARN_ON_ONCE_PAGE(is_hzp, head); + is_hzp = is_huge_zero_page(&folio->page); + VM_WARN_ON_ONCE_FOLIO(is_hzp, folio); if (is_hzp) return -EBUSY; - if (PageWriteback(head)) + if (folio_test_writeback(folio)) return -EBUSY; - if (PageAnon(head)) { + if (folio_test_anon(folio)) { /* * The caller does not necessarily hold an mmap_lock that would * prevent the anon_vma disappearing so we first we take a @@ -2651,7 +2650,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * is taken to serialise against parallel split or collapse * operations. 
*/ - anon_vma = page_get_anon_vma(head); + anon_vma = page_get_anon_vma(&folio->page); if (!anon_vma) { ret = -EBUSY; goto out; @@ -2662,7 +2661,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } else { gfp_t gfp; - mapping = head->mapping; + mapping = folio->mapping; /* Truncated ? */ if (!mapping) { @@ -2679,7 +2678,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } - xas_split_alloc(&xas, head, compound_order(head), gfp); + xas_split_alloc(&xas, folio, folio_order(folio), gfp); if (xas_error(&xas)) { ret = xas_error(&xas); goto out; @@ -2693,7 +2692,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, * which cannot be nested inside the page tree lock. So note * end now: i_size itself may be changed at any moment, but - * head page lock is good enough to serialize the trimming. + * folio lock is good enough to serialize the trimming. */ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (shmem_mapping(mapping)) @@ -2709,38 +2708,38 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out_unlock; } - unmap_page(head); + unmap_page(&folio->page); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); if (mapping) { /* - * Check if the head page is present in page cache. - * We assume all tail are present too, if head is there. + * Check if the folio is present in page cache. + * We assume all tail are present too, if folio is there. 
*/ xas_lock(&xas); xas_reset(&xas); - if (xas_load(&xas) != head) + if (xas_load(&xas) != folio) goto fail; } /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); - if (page_ref_freeze(head, 1 + extra_pins)) { - if (!list_empty(page_deferred_list(head))) { + if (folio_ref_freeze(folio, 1 + extra_pins)) { + if (!list_empty(page_deferred_list(&folio->page))) { ds_queue->split_queue_len--; - list_del(page_deferred_list(head)); + list_del(page_deferred_list(&folio->page)); } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { - int nr = thp_nr_pages(head); + int nr = folio_nr_pages(folio); - xas_split(&xas, head, thp_order(head)); - if (PageSwapBacked(head)) { - __mod_lruvec_page_state(head, NR_SHMEM_THPS, + xas_split(&xas, folio, folio_order(folio)); + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else { - __mod_lruvec_page_state(head, NR_FILE_THPS, + __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } -- GitLab From 684555aacc90d70e6a4b96b3b238f1d9ea87408d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:49 +0100 Subject: [PATCH 0928/2223] huge_memory: convert unmap_page() to unmap_folio() Remove a folio->page->folio conversion. 
Link: https://lkml.kernel.org/r/20220902194653.1739778-54-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bb8266b099f54..22949ff6df131 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2355,13 +2355,12 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, } } -static void unmap_page(struct page *page) +static void unmap_folio(struct folio *folio) { - struct folio *folio = page_folio(page); enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC; - VM_BUG_ON_PAGE(!PageHead(page), page); + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); /* * Anon pages need migration entries to preserve them, but file @@ -2378,7 +2377,7 @@ static void remap_page(struct folio *folio, unsigned long nr) { int i = 0; - /* If unmap_page() uses try_to_migrate() on file, remove this check */ + /* If unmap_folio() uses try_to_migrate() on file, remove this check */ if (!folio_test_anon(folio)) return; for (;;) { @@ -2428,7 +2427,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * for example lock_page() which set PG_waiters. * * Note that for mapped sub-pages of an anonymous THP, - * PG_anon_exclusive has been cleared in unmap_page() and is stored in + * PG_anon_exclusive has been cleared in unmap_folio() and is stored in * the migration entry instead from where remap_page() will restore it. 
* We can still have PG_anon_exclusive set on effectively unmapped and * unreferenced sub-pages of an anonymous THP: we can simply drop @@ -2700,7 +2699,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } /* - * Racy check if we can split the page, before unmap_page() will + * Racy check if we can split the page, before unmap_folio() will * split PMDs */ if (!can_split_folio(folio, &extra_pins)) { @@ -2708,7 +2707,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out_unlock; } - unmap_page(&folio->page); + unmap_folio(folio); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); -- GitLab From 29eea9b5a9c9ecf21164a082a42bfabe06fdcb30 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:50 +0100 Subject: [PATCH 0929/2223] mm: convert page_get_anon_vma() to folio_get_anon_vma() With all callers now passing in a folio, rename the function and convert all callers. Removes a couple of calls to compound_head() and a reference to page->mapping. Link: https://lkml.kernel.org/r/20220902194653.1739778-55-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- mm/huge_memory.c | 2 +- mm/migrate.c | 6 +++--- mm/rmap.c | 14 +++++++------- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 72b2bcc37f73b..3d56e3712bb2a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -163,7 +163,7 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, unlink_anon_vmas(next); } -struct anon_vma *page_get_anon_vma(struct page *page); +struct anon_vma *folio_get_anon_vma(struct folio *folio); /* RMAP flags, currently only relevant for some anon rmap operations. 
*/ typedef int __bitwise rmap_t; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 22949ff6df131..36ef79b851958 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2649,7 +2649,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * is taken to serialise against parallel split or collapse * operations. */ - anon_vma = page_get_anon_vma(&folio->page); + anon_vma = folio_get_anon_vma(folio); if (!anon_vma) { ret = -EBUSY; goto out; diff --git a/mm/migrate.c b/mm/migrate.c index c1c2d9d9032b9..c228afba0963d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1052,14 +1052,14 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. * - * Only page_get_anon_vma() understands the subtleties of + * Only folio_get_anon_vma() understands the subtleties of * getting a hold on an anon_vma from outside one of its mms. * But if we cannot get anon_vma, then we won't need it anyway, * because that implies that the anon page is no longer mapped * (and cannot be remapped so long as we hold the page lock). */ if (folio_test_anon(src) && !folio_test_ksm(src)) - anon_vma = page_get_anon_vma(&src->page); + anon_vma = folio_get_anon_vma(src); /* * Block others from accessing the new page when we get around to @@ -1298,7 +1298,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, } if (folio_test_anon(src)) - anon_vma = page_get_anon_vma(&src->page); + anon_vma = folio_get_anon_vma(src); if (unlikely(!folio_trylock(dst))) goto put_anon; diff --git a/mm/rmap.c b/mm/rmap.c index d44ff516a2089..86511e633fcda 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -486,16 +486,16 @@ void __init anon_vma_init(void) * if there is a mapcount, we can dereference the anon_vma after observing * those. 
*/ -struct anon_vma *page_get_anon_vma(struct page *page) +struct anon_vma *folio_get_anon_vma(struct folio *folio) { struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; rcu_read_lock(); - anon_mapping = (unsigned long)READ_ONCE(page->mapping); + anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) goto out; - if (!page_mapped(page)) + if (!folio_mapped(folio)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); @@ -505,13 +505,13 @@ struct anon_vma *page_get_anon_vma(struct page *page) } /* - * If this page is still mapped, then its anon_vma cannot have been + * If this folio is still mapped, then its anon_vma cannot have been * freed. But if it has been unmapped, we have no security against the * anon_vma structure being freed and reused (for another anon_vma: * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() * above cannot corrupt). */ - if (!page_mapped(page)) { + if (!folio_mapped(folio)) { rcu_read_unlock(); put_anon_vma(anon_vma); return NULL; @@ -523,11 +523,11 @@ out: } /* - * Similar to page_get_anon_vma() except it locks the anon_vma. + * Similar to folio_get_anon_vma() except it locks the anon_vma. * * Its a little more complex as it tries to keep the fast path to a single * atomic op -- the trylock. If we fail the trylock, we fall back to getting a - * reference like with page_get_anon_vma() and then block on the mutex + * reference like with folio_get_anon_vma() and then block on the mutex * on !rwc->try_lock case. */ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, -- GitLab From 0c826c0b6a176b9ed5ace7106fd1770bb48f1898 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:51 +0100 Subject: [PATCH 0930/2223] rmap: remove page_unlock_anon_vma_read() This was simply an alias for anon_vma_unlock_read() since 2011. 
Link: https://lkml.kernel.org/r/20220902194653.1739778-56-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 5 ----- mm/memory-failure.c | 2 +- mm/rmap.c | 5 ----- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3d56e3712bb2a..ca3e4ba6c58c4 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -458,13 +458,8 @@ struct rmap_walk_control { void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc); void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc); - -/* - * Called by memory-failure.c to kill processes. - */ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, struct rmap_walk_control *rwc); -void page_unlock_anon_vma_read(struct anon_vma *anon_vma); #else /* !CONFIG_MMU */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e554f9f583ca9..145bb561ddb3a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -529,7 +529,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, } } read_unlock(&tasklist_lock); - page_unlock_anon_vma_read(av); + anon_vma_unlock_read(av); } /* diff --git a/mm/rmap.c b/mm/rmap.c index 86511e633fcda..0b9264e58d256 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -599,11 +599,6 @@ out: return anon_vma; } -void page_unlock_anon_vma_read(struct anon_vma *anon_vma) -{ - anon_vma_unlock_read(anon_vma); -} - #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* * Flush TLB entries for recently unmapped pages from remote CPUs. It is -- GitLab From 82e66bf76173a1525db9866455a7fdbc07b57297 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:52 +0100 Subject: [PATCH 0931/2223] uprobes: use new_folio in __replace_page() Saves several calls to compound_head(). 
Link: https://lkml.kernel.org/r/20220902194653.1739778-57-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 70375c7c0c4b9..e0a9b945e7bc0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -155,6 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { struct folio *old_folio = page_folio(old_page); + struct folio *new_folio; struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; @@ -164,8 +165,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm, - GFP_KERNEL); + new_folio = page_folio(new_page); + err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); if (err) return err; } @@ -180,9 +181,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, VM_BUG_ON_PAGE(addr != pvmw.address, old_page); if (new_page) { - get_page(new_page); + folio_get(new_folio); page_add_new_anon_rmap(new_page, vma, addr); - lru_cache_add_inactive_or_unevictable(new_page, vma); + folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); -- GitLab From 19672a9e4a75252871cba319f4e3b859b8fdf671 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:53 +0100 Subject: [PATCH 0932/2223] mm: convert lock_page_or_retry() to folio_lock_or_retry() Remove a call to compound_head() in each of the two callers. 
Link: https://lkml.kernel.org/r/20220902194653.1739778-58-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 9 +++------ mm/memory.c | 10 +++++----- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09de43e36a64b..32846b6306dbd 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -989,19 +989,16 @@ static inline int lock_page_killable(struct page *page) } /* - * lock_page_or_retry - Lock the page, unless this would block and the + * folio_lock_or_retry - Lock the folio, unless this would block and the * caller indicated that it can handle a retry. * * Return value and mmap_lock implications depend on flags; see * __folio_lock_or_retry(). */ -static inline bool lock_page_or_retry(struct page *page, struct mm_struct *mm, - unsigned int flags) +static inline bool folio_lock_or_retry(struct folio *folio, + struct mm_struct *mm, unsigned int flags) { - struct folio *folio; might_sleep(); - - folio = page_folio(page); return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags); } diff --git a/mm/memory.c b/mm/memory.c index 6e568f190e7a8..d671ad367d677 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3618,11 +3618,11 @@ EXPORT_SYMBOL(unmap_mapping_range); */ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct vm_area_struct *vma = vmf->vma; struct mmu_notifier_range range; - if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) + if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) return VM_FAULT_RETRY; mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, vma->vm_mm, vmf->address & PAGE_MASK, @@ -3632,10 +3632,10 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if 
(likely(pte_same(*vmf->pte, vmf->orig_pte))) - restore_exclusive_pte(vma, page, vmf->address, vmf->pte); + restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); - unlock_page(page); + folio_unlock(folio); mmu_notifier_invalidate_range_end(&range); return 0; @@ -3835,7 +3835,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_release; } - locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); + locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags); if (!locked) { ret |= VM_FAULT_RETRY; -- GitLab From 8eeda55fe08944421cf57f6185fe37b069829e7b Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 5 Sep 2022 10:09:18 +0800 Subject: [PATCH 0933/2223] mm/hugetlb.c: remove unnecessary initialization of local `err' Link: https://lkml.kernel.org/r/20220905020918.3552-1-zeming@nfschina.com Signed-off-by: Li zeming Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2ca4e8c3163ef..008955d8f411c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3718,7 +3718,7 @@ static ssize_t demote_store(struct kobject *kobj, unsigned long nr_available; nodemask_t nodes_allowed, *n_mask; struct hstate *h; - int err = 0; + int err; int nid; err = kstrtoul(buf, 10, &nr_demote); -- GitLab From c274cd5c9bf5ded4b3f2a4e99f76223c8f006051 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sun, 4 Sep 2022 22:36:06 +0800 Subject: [PATCH 0934/2223] mm/damon/sysfs: simplify the judgement whether kdamonds are busy It is unnecessary to get the number of the running kdamond to judge whether kdamonds are busy. Here we can use the damon_sysfs_kdamond_running() helper and return -EBUSY directly when finding a running kdamond. Meanwhile, merging with the judgement that a kdamond has current sysfs command callback request to make the code more clear. 
Link: https://lkml.kernel.org/r/1662302166-13216-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 7488e27c87c37..fe6c6870cf868 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2657,23 +2657,18 @@ static void damon_sysfs_kdamonds_rm_dirs(struct damon_sysfs_kdamonds *kdamonds) kdamonds->kdamonds_arr = NULL; } -static int damon_sysfs_nr_running_ctxs(struct damon_sysfs_kdamond **kdamonds, +static bool damon_sysfs_kdamonds_busy(struct damon_sysfs_kdamond **kdamonds, int nr_kdamonds) { - int nr_running_ctxs = 0; int i; for (i = 0; i < nr_kdamonds; i++) { - struct damon_ctx *ctx = kdamonds[i]->damon_ctx; - - if (!ctx) - continue; - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) - nr_running_ctxs++; - mutex_unlock(&ctx->kdamond_lock); + if (damon_sysfs_kdamond_running(kdamonds[i]) || + damon_sysfs_cmd_request.kdamond == kdamonds[i]) + return true; } - return nr_running_ctxs; + + return false; } static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds, @@ -2682,15 +2677,9 @@ static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds, struct damon_sysfs_kdamond **kdamonds_arr, *kdamond; int err, i; - if (damon_sysfs_nr_running_ctxs(kdamonds->kdamonds_arr, kdamonds->nr)) + if (damon_sysfs_kdamonds_busy(kdamonds->kdamonds_arr, kdamonds->nr)) return -EBUSY; - for (i = 0; i < kdamonds->nr; i++) { - if (damon_sysfs_cmd_request.kdamond == - kdamonds->kdamonds_arr[i]) - return -EBUSY; - } - damon_sysfs_kdamonds_rm_dirs(kdamonds); if (!nr_kdamonds) return 0; -- GitLab From 710bb68c2e3a24512e2d2bae470960d7488e97b1 Mon Sep 17 00:00:00 2001 From: Matthias Goergens Date: Mon, 5 Sep 2022 11:19:04 +0800 Subject: [PATCH 0935/2223] hugetlb_encode.h: fix undefined behaviour (34 << 26) Left-shifting past the size of 
your datatype is undefined behaviour in C. The literal 34 gets the type `int`, and that one is not big enough to be left shifted by 26 bits. An `unsigned` is long enough (on any machine that has at least 32 bits for their ints.) For uniformity, we mark all the literals as unsigned. But it's only really needed for HUGETLB_FLAG_ENCODE_16GB. Thanks to Randy Dunlap for an initial review and suggestion. Link: https://lkml.kernel.org/r/20220905031904.150925-1-matthias.goergens@gmail.com Signed-off-by: Matthias Goergens Acked-by: Randy Dunlap Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/uapi/asm-generic/hugetlb_encode.h | 26 +++++++++++----------- tools/include/asm-generic/hugetlb_encode.h | 26 +++++++++++----------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/include/uapi/asm-generic/hugetlb_encode.h b/include/uapi/asm-generic/hugetlb_encode.h index 4f3d5aaa11f53..de687009bfe53 100644 --- a/include/uapi/asm-generic/hugetlb_encode.h +++ b/include/uapi/asm-generic/hugetlb_encode.h @@ -20,18 +20,18 @@ #define HUGETLB_FLAG_ENCODE_SHIFT 26 #define HUGETLB_FLAG_ENCODE_MASK 0x3f -#define HUGETLB_FLAG_ENCODE_16KB (14 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_32MB (25 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_512MB (29 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_16GB (34 << 
HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16KB (14U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_64KB (16U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512KB (19U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1MB (20U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2MB (21U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_8MB (23U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16MB (24U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_32MB (25U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_256MB (28U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512MB (29U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1GB (30U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2GB (31U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16GB (34U << HUGETLB_FLAG_ENCODE_SHIFT) #endif /* _ASM_GENERIC_HUGETLB_ENCODE_H_ */ diff --git a/tools/include/asm-generic/hugetlb_encode.h b/tools/include/asm-generic/hugetlb_encode.h index 4f3d5aaa11f53..de687009bfe53 100644 --- a/tools/include/asm-generic/hugetlb_encode.h +++ b/tools/include/asm-generic/hugetlb_encode.h @@ -20,18 +20,18 @@ #define HUGETLB_FLAG_ENCODE_SHIFT 26 #define HUGETLB_FLAG_ENCODE_MASK 0x3f -#define HUGETLB_FLAG_ENCODE_16KB (14 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_32MB (25 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_512MB (29 << HUGETLB_FLAG_ENCODE_SHIFT) -#define 
HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16KB (14U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_64KB (16U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512KB (19U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1MB (20U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2MB (21U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_8MB (23U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16MB (24U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_32MB (25U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_256MB (28U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512MB (29U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1GB (30U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2GB (31U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16GB (34U << HUGETLB_FLAG_ENCODE_SHIFT) #endif /* _ASM_GENERIC_HUGETLB_ENCODE_H_ */ -- GitLab From b05f41a1aa56fd646f2aa048ee446b6a2edb80d3 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 5 Sep 2022 14:45:57 -0700 Subject: [PATCH 0936/2223] filemap: convert filemap_range_has_writeback() to use folios Removes 3 calls to compound_head(). 
Link: https://lkml.kernel.org/r/20220905214557.868606-1-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/filemap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 68bd70fe71d59..aab125d423b8f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -632,22 +632,23 @@ bool filemap_range_has_writeback(struct address_space *mapping, { XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; - struct page *page; + struct folio *folio; if (end_byte < start_byte) return false; rcu_read_lock(); - xas_for_each(&xas, page, max) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, max) { + if (xas_retry(&xas, folio)) continue; - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - if (PageDirty(page) || PageLocked(page) || PageWriteback(page)) + if (folio_test_dirty(folio) || folio_test_locked(folio) || + folio_test_writeback(folio)) break; } rcu_read_unlock(); - return page != NULL; + return folio != NULL; } EXPORT_SYMBOL_GPL(filemap_range_has_writeback); -- GitLab From ca77f290cff1dfa095d71ae16cc7cda8ee6df495 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:16 +0200 Subject: [PATCH 0937/2223] kasan: check KASAN_NO_FREE_META in __kasan_metadata_size Patch series "kasan: switch tag-based modes to stack ring from per-object metadata", v3. This series makes the tag-based KASAN modes use a ring buffer for storing stack depot handles for alloc/free stack traces for slab objects instead of per-object metadata. This ring buffer is referred to as the stack ring. On each alloc/free of a slab object, the tagged address of the object and the current stack trace are recorded in the stack ring. On each bug report, if the accessed address belongs to a slab object, the stack ring is scanned for matching entries. 
The newest entries are used to print the alloc/free stack traces in the report: one entry for alloc and one for free. The advantages of this approach over storing stack trace handles in per-object metadata with the tag-based KASAN modes: - Allows finding relevant stack traces for use-after-free bugs without using quarantine for freed memory. (Currently, if the object was reallocated multiple times, the report contains the latest alloc/free stack traces, not necessarily the ones relevant to the buggy allocation.) - Allows better identification and marking of use-after-free bugs, effectively making the CONFIG_KASAN_TAGS_IDENTIFY functionality always-on. - Has fixed memory overhead. The disadvantage: - If the affected object was allocated/freed long before the bug happened and the stack trace events were purged from the stack ring, the report will have no stack traces. Discussion ========== The proposed implementation of the stack ring uses a single ring buffer for the whole kernel. This might lead to contention due to atomic accesses to the ring buffer index on multicore systems. At this point, it is unknown whether the performance impact from this contention would be significant compared to the slowdown introduced by collecting stack traces, given the planned changes to the latter (see the section below). For now, the proposed implementation is deemed to be good enough, but this might need to be revisited once the stack collection becomes faster. A considered alternative is to keep a separate ring buffer for each CPU and then iterate over all of them when printing a bug report. This approach requires somehow figuring out which of the stack rings has the freshest stack traces for an object if multiple stack rings have them. Further plans ============= This series is a part of an effort to make KASAN stack trace collection suitable for production. This requires stack trace collection to be fast and memory-bounded. The planned steps are: 1.
Speed up stack trace collection (potentially, by using SCS; patches on-hold until steps #2 and #3 are completed). 2. Keep stack trace handles in the stack ring (this series). 3. Add a memory-bounded mode to stack depot or provide an alternative memory-bounded stack storage. 4. Potentially, implement stack trace collection sampling to minimize the performance impact. This patch (of 34): __kasan_metadata_size() calculates the size of the redzone for objects in a slab cache. When accounting for presence of kasan_free_meta in the redzone, this function only compares free_meta_offset with 0. But free_meta_offset could also be equal to KASAN_NO_FREE_META, which indicates that kasan_free_meta is not present at all. Add a comparison with KASAN_NO_FREE_META into __kasan_metadata_size(). Link: https://lkml.kernel.org/r/cover.1662411799.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/c7b316d30d90e5947eb8280f4dc78856a49298cf.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 69f583855c8be..f6a6c7d0d8b8f 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -224,8 +224,9 @@ size_t __kasan_metadata_size(struct kmem_cache *cache) return 0; return (cache->kasan_info.alloc_meta_offset ? sizeof(struct kasan_alloc_meta) : 0) + - (cache->kasan_info.free_meta_offset ? - sizeof(struct kasan_free_meta) : 0); + ((cache->kasan_info.free_meta_offset && + cache->kasan_info.free_meta_offset != KASAN_NO_FREE_META) ? 
+ sizeof(struct kasan_free_meta) : 0); } struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, -- GitLab From c249f9af85ee006976c0fae584daf947cc959931 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:17 +0200 Subject: [PATCH 0938/2223] kasan: rename kasan_set_*_info to kasan_save_*_info Rename set_alloc_info() and kasan_set_free_info() to save_alloc_info() and kasan_save_free_info(). The new names make more sense. Link: https://lkml.kernel.org/r/9f04777a15cb9d96bf00331da98e021d732fe1c9.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 8 ++++---- mm/kasan/generic.c | 2 +- mm/kasan/kasan.h | 2 +- mm/kasan/tags.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index f6a6c7d0d8b8f..90b6cadd2dac0 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -365,7 +365,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, return false; if (kasan_stack_collection_enabled()) - kasan_set_free_info(cache, object, tag); + kasan_save_free_info(cache, object, tag); return kasan_quarantine_put(cache, object); } @@ -424,7 +424,7 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) } } -static void set_alloc_info(struct kmem_cache *cache, void *object, +static void save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags, bool is_kmalloc) { struct kasan_alloc_meta *alloc_meta; @@ -468,7 +468,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, /* Save alloc info (if possible) for non-kmalloc() allocations. 
*/ if (kasan_stack_collection_enabled()) - set_alloc_info(cache, (void *)object, flags, false); + save_alloc_info(cache, (void *)object, flags, false); return tagged_object; } @@ -514,7 +514,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, * This also rewrites the alloc info when called from kasan_krealloc(). */ if (kasan_stack_collection_enabled()) - set_alloc_info(cache, (void *)object, flags, true); + save_alloc_info(cache, (void *)object, flags, true); /* Keep the tag that was set by kasan_slab_alloc(). */ return (void *)object; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 437fcc7e77cf2..03a3770cfeaec 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -358,7 +358,7 @@ void kasan_record_aux_stack_noalloc(void *addr) return __kasan_record_aux_stack(addr, false); } -void kasan_set_free_info(struct kmem_cache *cache, +void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { struct kasan_free_meta *free_meta; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 01c03e45acd42..bf16a74dc0276 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -285,7 +285,7 @@ struct slab *kasan_addr_to_slab(const void *addr); depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); -void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag); +void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag); struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, void *object, u8 tag); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 8f48b9502a177..b453a353bc862 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -17,7 +17,7 @@ #include "kasan.h" -void kasan_set_free_info(struct kmem_cache *cache, +void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { struct kasan_alloc_meta *alloc_meta; -- GitLab From 196894a6e20273d78479bdf76eec3a741e72d31c Mon Sep 17 00:00:00 2001 From: Andrey 
Konovalov Date: Mon, 5 Sep 2022 23:05:18 +0200 Subject: [PATCH 0939/2223] kasan: move is_kmalloc check out of save_alloc_info Move kasan_info.is_kmalloc check out of save_alloc_info(). This is a preparatory change that simplifies the following patches in this series. Link: https://lkml.kernel.org/r/df89f1915b788f9a10319905af6d0202a3b30c30.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 90b6cadd2dac0..6a75237ed308d 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -424,15 +424,10 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) } } -static void save_alloc_info(struct kmem_cache *cache, void *object, - gfp_t flags, bool is_kmalloc) +static void save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { struct kasan_alloc_meta *alloc_meta; - /* Don't save alloc info for kmalloc caches in kasan_slab_alloc(). */ - if (cache->kasan_info.is_kmalloc && !is_kmalloc) - return; - alloc_meta = kasan_get_alloc_meta(cache, object); if (alloc_meta) kasan_set_track(&alloc_meta->alloc_track, flags); @@ -467,8 +462,8 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, kasan_unpoison(tagged_object, cache->object_size, init); /* Save alloc info (if possible) for non-kmalloc() allocations. */ - if (kasan_stack_collection_enabled()) - save_alloc_info(cache, (void *)object, flags, false); + if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc) + save_alloc_info(cache, (void *)object, flags); return tagged_object; } @@ -513,8 +508,8 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, * Save alloc info (if possible) for kmalloc() allocations. 
* This also rewrites the alloc info when called from kasan_krealloc(). */ - if (kasan_stack_collection_enabled()) - save_alloc_info(cache, (void *)object, flags, true); + if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc) + save_alloc_info(cache, (void *)object, flags); /* Keep the tag that was set by kasan_slab_alloc(). */ return (void *)object; -- GitLab From ccf643e6dacf33ec618bd64e10eb0347173ad482 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:19 +0200 Subject: [PATCH 0940/2223] kasan: split save_alloc_info implementations Provide standalone implementations of save_alloc_info() for the Generic and tag-based modes. For now, the implementations are the same, but they will diverge later in the series. Link: https://lkml.kernel.org/r/77f1a078489c1e859aedb5403f772e5e1f7410a0.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 13 ++----------- mm/kasan/generic.c | 9 +++++++++ mm/kasan/kasan.h | 1 + mm/kasan/tags.c | 9 +++++++++ 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6a75237ed308d..93e64e1b44131 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -424,15 +424,6 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) } } -static void save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) -{ - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - kasan_set_track(&alloc_meta->alloc_track, flags); -} - void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags, bool init) { @@ -463,7 +454,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, /* Save alloc info (if possible) for non-kmalloc() allocations. 
*/ if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc) - save_alloc_info(cache, (void *)object, flags); + kasan_save_alloc_info(cache, (void *)object, flags); return tagged_object; } @@ -509,7 +500,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, * This also rewrites the alloc info when called from kasan_krealloc(). */ if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc) - save_alloc_info(cache, (void *)object, flags); + kasan_save_alloc_info(cache, (void *)object, flags); /* Keep the tag that was set by kasan_slab_alloc(). */ return (void *)object; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 03a3770cfeaec..98c451a3b01f8 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -358,6 +358,15 @@ void kasan_record_aux_stack_noalloc(void *addr) return __kasan_record_aux_stack(addr, false); } +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + kasan_set_track(&alloc_meta->alloc_track, flags); +} + void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index bf16a74dc0276..d401fb770f67f 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -285,6 +285,7 @@ struct slab *kasan_addr_to_slab(const void *addr); depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag); struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, void *object, u8 tag); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index b453a353bc862..1ba3c8399f72c 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -17,6 +17,15 @@ #include "kasan.h" +void kasan_save_alloc_info(struct 
kmem_cache *cache, void *object, gfp_t flags) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + kasan_set_track(&alloc_meta->alloc_track, flags); +} + void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { -- GitLab From 687c85afa67a635dae683cf0ab6012e76333065b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:20 +0200 Subject: [PATCH 0941/2223] kasan: drop CONFIG_KASAN_TAGS_IDENTIFY Drop CONFIG_KASAN_TAGS_IDENTIFY and related code to simplify making changes to the reporting code. The dropped functionality will be restored in the following patches in this series. Link: https://lkml.kernel.org/r/4c66ba98eb237e9ed9312c19d423bbcf4ecf88f8.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- lib/Kconfig.kasan | 8 -------- mm/kasan/kasan.h | 12 +----------- mm/kasan/report_tags.c | 28 ---------------------------- mm/kasan/tags.c | 21 ++------------------- 4 files changed, 3 insertions(+), 66 deletions(-) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index f0973da583e04..ca09b1cf8ee9d 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -167,14 +167,6 @@ config KASAN_STACK as well, as it adds inline-style instrumentation that is run unconditionally. -config KASAN_TAGS_IDENTIFY - bool "Memory corruption type identification" - depends on KASAN_SW_TAGS || KASAN_HW_TAGS - help - Enables best-effort identification of the bug types (use-after-free - or out-of-bounds) at the cost of increased memory consumption. - Only applicable for the tag-based KASAN modes. 
- config KASAN_VMALLOC bool "Check accesses to vmalloc allocations" depends on HAVE_ARCH_KASAN_VMALLOC diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index d401fb770f67f..15c718782c1fa 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -169,23 +169,13 @@ struct kasan_track { depot_stack_handle_t stack; }; -#if defined(CONFIG_KASAN_TAGS_IDENTIFY) && defined(CONFIG_KASAN_SW_TAGS) -#define KASAN_NR_FREE_STACKS 5 -#else -#define KASAN_NR_FREE_STACKS 1 -#endif - struct kasan_alloc_meta { struct kasan_track alloc_track; /* Generic mode stores free track in kasan_free_meta. */ #ifdef CONFIG_KASAN_GENERIC depot_stack_handle_t aux_stack[2]; #else - struct kasan_track free_track[KASAN_NR_FREE_STACKS]; -#endif -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - u8 free_pointer_tag[KASAN_NR_FREE_STACKS]; - u8 free_track_idx; + struct kasan_track free_track; #endif }; diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index e25d2166e813d..35cf3cae4aa45 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -5,37 +5,9 @@ */ #include "kasan.h" -#include "../slab.h" const char *kasan_get_bug_type(struct kasan_report_info *info) { -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - struct kasan_alloc_meta *alloc_meta; - struct kmem_cache *cache; - struct slab *slab; - const void *addr; - void *object; - u8 tag; - int i; - - tag = get_tag(info->access_addr); - addr = kasan_reset_tag(info->access_addr); - slab = kasan_addr_to_slab(addr); - if (slab) { - cache = slab->slab_cache; - object = nearest_obj(cache, slab, (void *)addr); - alloc_meta = kasan_get_alloc_meta(cache, object); - - if (alloc_meta) { - for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { - if (alloc_meta->free_pointer_tag[i] == tag) - return "use-after-free"; - } - } - return "out-of-bounds"; - } -#endif - /* * If access_size is a negative number, then it has reason to be * defined as out-of-bounds bug type. 
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 1ba3c8399f72c..e0e5de8ce834d 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -30,39 +30,22 @@ void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { struct kasan_alloc_meta *alloc_meta; - u8 idx = 0; alloc_meta = kasan_get_alloc_meta(cache, object); if (!alloc_meta) return; -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - idx = alloc_meta->free_track_idx; - alloc_meta->free_pointer_tag[idx] = tag; - alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS; -#endif - - kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT); + kasan_set_track(&alloc_meta->free_track, GFP_NOWAIT); } struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, void *object, u8 tag) { struct kasan_alloc_meta *alloc_meta; - int i = 0; alloc_meta = kasan_get_alloc_meta(cache, object); if (!alloc_meta) return NULL; -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { - if (alloc_meta->free_pointer_tag[i] == tag) - break; - } - if (i == KASAN_NR_FREE_STACKS) - i = alloc_meta->free_track_idx; -#endif - - return &alloc_meta->free_track[i]; + return &alloc_meta->free_track; } -- GitLab From 88f29765ae3b00f8b9362f299f6140cd9b988f75 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:21 +0200 Subject: [PATCH 0942/2223] kasan: introduce kasan_print_aux_stacks Add a kasan_print_aux_stacks() helper that prints the auxiliary stack traces for the Generic mode. This change hides references to alloc_meta from the common reporting code. This is desired as only the Generic mode will be using per-object metadata after this series. 
Link: https://lkml.kernel.org/r/67c7a9ea6615533762b1f8ccc267cd7f9bafb749.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 6 ++++++ mm/kasan/report.c | 15 +-------------- mm/kasan/report_generic.c | 20 ++++++++++++++++++++ 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 15c718782c1fa..30ff341b6d35e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -266,6 +266,12 @@ void kasan_print_address_stack_frame(const void *addr); static inline void kasan_print_address_stack_frame(const void *addr) { } #endif +#ifdef CONFIG_KASAN_GENERIC +void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object); +#else +static inline void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) { } +#endif + bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index fe3f606b3a986..cd9f5c7fc6db1 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -270,20 +270,7 @@ static void describe_object_stacks(struct kmem_cache *cache, void *object, pr_err("\n"); } -#ifdef CONFIG_KASAN_GENERIC - if (!alloc_meta) - return; - if (alloc_meta->aux_stack[0]) { - pr_err("Last potentially related work creation:\n"); - stack_depot_print(alloc_meta->aux_stack[0]); - pr_err("\n"); - } - if (alloc_meta->aux_stack[1]) { - pr_err("Second to last potentially related work creation:\n"); - stack_depot_print(alloc_meta->aux_stack[1]); - pr_err("\n"); - } -#endif + kasan_print_aux_stacks(cache, object); } static void describe_object(struct kmem_cache *cache, void *object, diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 
6689fb9a919b1..348dc207d4623 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -132,6 +132,26 @@ void kasan_metadata_fetch_row(char *buffer, void *row) memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); } +void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return; + + if (alloc_meta->aux_stack[0]) { + pr_err("Last potentially related work creation:\n"); + stack_depot_print(alloc_meta->aux_stack[0]); + pr_err("\n"); + } + if (alloc_meta->aux_stack[1]) { + pr_err("Second to last potentially related work creation:\n"); + stack_depot_print(alloc_meta->aux_stack[1]); + pr_err("\n"); + } +} + #ifdef CONFIG_KASAN_STACK static bool __must_check tokenize_frame_descr(const char **frame_descr, char *token, size_t max_tok_len, -- GitLab From f3647cbfe5a34af1a22f2627dda5fb078a47f0d3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:22 +0200 Subject: [PATCH 0943/2223] kasan: introduce kasan_get_alloc_track Add a kasan_get_alloc_track() helper that fetches alloc_track for a slab object and use this helper in the common reporting code. For now, the implementations of this helper are the same for the Generic and tag-based modes, but they will diverge later in the series. This change hides references to alloc_meta from the common reporting code. This is desired as only the Generic mode will be using per-object metadata after this series. 
Link: https://lkml.kernel.org/r/0c365a35f4a833fff46f9d42c3212b32f7166556.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/generic.c | 14 +++++++++++++- mm/kasan/kasan.h | 4 +++- mm/kasan/report.c | 8 ++++---- mm/kasan/tags.c | 14 +++++++++++++- 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 98c451a3b01f8..f212b9ae57b59 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -381,8 +381,20 @@ void kasan_save_free_info(struct kmem_cache *cache, *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK; } +struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, + void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return NULL; + + return &alloc_meta->alloc_track; +} + struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) + void *object, u8 tag) { if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK) return NULL; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 30ff341b6d35e..b65a51349c51b 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -283,8 +283,10 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag); +struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, + void *object); struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag); + void *object, u8 tag); #if defined(CONFIG_KASAN_GENERIC) && \ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) diff --git a/mm/kasan/report.c 
b/mm/kasan/report.c index cd9f5c7fc6db1..5d225d7d9c4c7 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -255,12 +255,12 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, static void describe_object_stacks(struct kmem_cache *cache, void *object, const void *addr, u8 tag) { - struct kasan_alloc_meta *alloc_meta; + struct kasan_track *alloc_track; struct kasan_track *free_track; - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) { - print_track(&alloc_meta->alloc_track, "Allocated"); + alloc_track = kasan_get_alloc_track(cache, object); + if (alloc_track) { + print_track(alloc_track, "Allocated"); pr_err("\n"); } diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index e0e5de8ce834d..7b1fc8e7c99c9 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -38,8 +38,20 @@ void kasan_save_free_info(struct kmem_cache *cache, kasan_set_track(&alloc_meta->free_track, GFP_NOWAIT); } +struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, + void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return NULL; + + return &alloc_meta->alloc_track; +} + struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) + void *object, u8 tag) { struct kasan_alloc_meta *alloc_meta; -- GitLab From 836daba099472baaa8b6a57772e8bb2d55f1f9d7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:23 +0200 Subject: [PATCH 0944/2223] kasan: introduce kasan_init_object_meta Add a kasan_init_object_meta() helper that initializes metadata for a slab object and use it in the common code. For now, the implementations of this helper are the same for the Generic and tag-based modes, but they will diverge later in the series. This change hides references to alloc_meta from the common code. This is desired as only the Generic mode will be using per-object metadata after this series. 
Link: https://lkml.kernel.org/r/47c12938fc7f8105e7aaa592527c0e9d3c81fc37.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 10 +++------- mm/kasan/generic.c | 9 +++++++++ mm/kasan/kasan.h | 2 ++ mm/kasan/tags.c | 9 +++++++++ 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 93e64e1b44131..18107675a7fe0 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -313,13 +313,9 @@ static inline u8 assign_tag(struct kmem_cache *cache, void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, const void *object) { - struct kasan_alloc_meta *alloc_meta; - - if (kasan_stack_collection_enabled()) { - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - __memset(alloc_meta, 0, sizeof(*alloc_meta)); - } + /* Initialize per-object metadata if it is present. 
*/ + if (kasan_stack_collection_enabled()) + kasan_init_object_meta(cache, object); /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */ object = set_tag(object, assign_tag(cache, object, true)); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index f212b9ae57b59..5462ddbc21e68 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -328,6 +328,15 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); +void kasan_init_object_meta(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + __memset(alloc_meta, 0, sizeof(*alloc_meta)); +} + static void __kasan_record_aux_stack(void *addr, bool can_alloc) { struct slab *slab = kasan_addr_to_slab(addr); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index b65a51349c51b..2c8c3cce7bc64 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -279,6 +279,8 @@ void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report struct page *kasan_addr_to_page(const void *addr); struct slab *kasan_addr_to_slab(const void *addr); +void kasan_init_object_meta(struct kmem_cache *cache, const void *object); + depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 7b1fc8e7c99c9..2e200969a4b84 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -17,6 +17,15 @@ #include "kasan.h" +void kasan_init_object_meta(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + __memset(alloc_meta, 0, sizeof(*alloc_meta)); +} + void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { struct kasan_alloc_meta *alloc_meta; -- GitLab From 
74984e79071aafd528f03b8418657c05011b94f3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:24 +0200 Subject: [PATCH 0945/2223] kasan: clear metadata functions for tag-based modes Remove implementations of the metadata-related functions for the tag-based modes. The following patches in the series will provide alternative implementations. As of this patch, the tag-based modes no longer collect alloc and free stack traces. This functionality will be restored later in the series. Link: https://lkml.kernel.org/r/470fbe5d15e8015092e76e395de354be18ccceab.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/tags.c | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 2e200969a4b84..f11c89505c778 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -19,54 +19,25 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - __memset(alloc_meta, 0, sizeof(*alloc_meta)); } void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - kasan_set_track(&alloc_meta->alloc_track, flags); } void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return; - - kasan_set_track(&alloc_meta->free_track, GFP_NOWAIT); } struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, void *object) { - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if 
(!alloc_meta) - return NULL; - - return &alloc_meta->alloc_track; + return NULL; } struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, void *object, u8 tag) { - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return NULL; - - return &alloc_meta->free_track; + return NULL; } -- GitLab From 2f3568017268fc34eb0b6b4b3163c0f2e619fde6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:25 +0200 Subject: [PATCH 0946/2223] kasan: move kasan_get_*_meta to generic.c Move the implementations of kasan_get_alloc/free_meta() to generic.c, as the common KASAN code does not use these functions anymore. Also drop kasan_reset_tag() from the implementation, as the Generic mode does not tag pointers. Link: https://lkml.kernel.org/r/ffcfc0ad654d78a2ef4ca054c943ddb4e5ca477b.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 19 ------------------- mm/kasan/generic.c | 17 +++++++++++++++++ mm/kasan/kasan.h | 14 +++++++------- 3 files changed, 24 insertions(+), 26 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 18107675a7fe0..19ddc0ed0e7bd 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -229,25 +229,6 @@ size_t __kasan_metadata_size(struct kmem_cache *cache) sizeof(struct kasan_free_meta) : 0); } -struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, - const void *object) -{ - if (!cache->kasan_info.alloc_meta_offset) - return NULL; - return kasan_reset_tag(object) + cache->kasan_info.alloc_meta_offset; -} - -#ifdef CONFIG_KASAN_GENERIC -struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, - const void *object) -{ - BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); - if (cache->kasan_info.free_meta_offset == 
KASAN_NO_FREE_META) - return NULL; - return kasan_reset_tag(object) + cache->kasan_info.free_meta_offset; -} -#endif - void __kasan_poison_slab(struct slab *slab) { struct page *page = slab_page(slab); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 5462ddbc21e68..fa654cb96a0dc 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -328,6 +328,23 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); +struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, + const void *object) +{ + if (!cache->kasan_info.alloc_meta_offset) + return NULL; + return (void *)object + cache->kasan_info.alloc_meta_offset; +} + +struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, + const void *object) +{ + BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); + if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META) + return NULL; + return (void *)object + cache->kasan_info.free_meta_offset; +} + void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { struct kasan_alloc_meta *alloc_meta; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 2c8c3cce7bc64..fdd577f3eb9d7 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -209,13 +209,6 @@ struct kunit_kasan_status { }; #endif -struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, - const void *object); -#ifdef CONFIG_KASAN_GENERIC -struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, - const void *object); -#endif - #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) static inline const void *kasan_shadow_to_mem(const void *shadow_addr) @@ -281,6 +274,13 @@ struct slab *kasan_addr_to_slab(const void *addr); void kasan_init_object_meta(struct kmem_cache *cache, const void *object); +#ifdef CONFIG_KASAN_GENERIC +struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, + const void *object); +struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, + const 
void *object); +#endif + depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); -- GitLab From 284f8590a1dfbe1c33b50bf6e8f8dc714e61bfd3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:26 +0200 Subject: [PATCH 0947/2223] kasan: introduce kasan_requires_meta Add a kasan_requires_meta() helper that indicates whether the enabled KASAN mode requires per-object metadata and use this helper in the common code. Also hide kasan_init_object_meta() under CONFIG_KASAN_GENERIC ifdef check, as Generic is the only mode that uses per-object metadata. To allow for a potential future change that makes Generic KASAN support the kasan.stacktrace command-line parameter, let kasan_requires_meta() return kasan_stack_collection_enabled() instead of simply returning true. Link: https://lkml.kernel.org/r/cf837e9996246aaaeebf704ccf8ec26a34fcf64f.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 13 +++++-------- mm/kasan/kasan.h | 33 +++++++++++++++++++++++++++++---- mm/kasan/tags.c | 4 ---- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 19ddc0ed0e7bd..d0300954d76be 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -88,13 +88,10 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) } #endif /* CONFIG_KASAN_STACK */ -/* - * Only allow cache merging when stack collection is disabled and no metadata - * is present. - */ +/* Only allow cache merging when no per-object metadata is present. 
*/ slab_flags_t __kasan_never_merge(void) { - if (kasan_stack_collection_enabled()) + if (kasan_requires_meta()) return SLAB_KASAN; return 0; } @@ -152,7 +149,7 @@ void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, */ *flags |= SLAB_KASAN; - if (!kasan_stack_collection_enabled()) + if (!kasan_requires_meta()) return; ok_size = *size; @@ -220,7 +217,7 @@ void __kasan_cache_create_kmalloc(struct kmem_cache *cache) size_t __kasan_metadata_size(struct kmem_cache *cache) { - if (!kasan_stack_collection_enabled()) + if (!kasan_requires_meta()) return 0; return (cache->kasan_info.alloc_meta_offset ? sizeof(struct kasan_alloc_meta) : 0) + @@ -295,7 +292,7 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, const void *object) { /* Initialize per-object metadata if it is present. */ - if (kasan_stack_collection_enabled()) + if (kasan_requires_meta()) kasan_init_object_meta(cache, object); /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */ diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index fdd577f3eb9d7..1736abd661b6a 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -43,7 +43,7 @@ static inline bool kasan_sync_fault_possible(void) return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM; } -#else +#else /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_stack_collection_enabled(void) { @@ -60,7 +60,31 @@ static inline bool kasan_sync_fault_possible(void) return true; } -#endif +#endif /* CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_GENERIC + +/* Generic KASAN uses per-object metadata to store stack traces. */ +static inline bool kasan_requires_meta(void) +{ + /* + * Technically, Generic KASAN always collects stack traces right now. + * However, let's use kasan_stack_collection_enabled() in case the + * kasan.stacktrace command-line argument is changed to affect + * Generic KASAN. 
+ */ + return kasan_stack_collection_enabled(); +} + +#else /* CONFIG_KASAN_GENERIC */ + +/* Tag-based KASAN modes do not use per-object metadata. */ +static inline bool kasan_requires_meta(void) +{ + return false; +} + +#endif /* CONFIG_KASAN_GENERIC */ #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) @@ -272,13 +296,14 @@ void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report struct page *kasan_addr_to_page(const void *addr); struct slab *kasan_addr_to_slab(const void *addr); -void kasan_init_object_meta(struct kmem_cache *cache, const void *object); - #ifdef CONFIG_KASAN_GENERIC +void kasan_init_object_meta(struct kmem_cache *cache, const void *object); struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, const void *object); struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, const void *object); +#else +static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { } #endif depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index f11c89505c778..4f24669085e92 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -17,10 +17,6 @@ #include "kasan.h" -void kasan_init_object_meta(struct kmem_cache *cache, const void *object) -{ -} - void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { } -- GitLab From 5935143d118569cdbccbae182763d2b451120c40 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:27 +0200 Subject: [PATCH 0948/2223] kasan: introduce kasan_init_cache_meta Add a kasan_init_cache_meta() helper that initializes metadata-related cache parameters and use this helper in the common KASAN code. Put the implementation of this new helper into generic.c, as only the Generic mode uses per-object metadata. 
Link: https://lkml.kernel.org/r/a6d7ea01876eb36472c9879f7b23f1b24766276e.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 80 ++-------------------------------------------- mm/kasan/generic.c | 79 +++++++++++++++++++++++++++++++++++++++++++++ mm/kasan/kasan.h | 2 ++ 3 files changed, 83 insertions(+), 78 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index d0300954d76be..b6a74fe5e740d 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -118,28 +118,9 @@ void __kasan_poison_pages(struct page *page, unsigned int order, bool init) KASAN_PAGE_FREE, init); } -/* - * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. - * For larger allocations larger redzones are used. - */ -static inline unsigned int optimal_redzone(unsigned int object_size) -{ - return - object_size <= 64 - 16 ? 16 : - object_size <= 128 - 32 ? 32 : - object_size <= 512 - 64 ? 64 : - object_size <= 4096 - 128 ? 128 : - object_size <= (1 << 14) - 256 ? 256 : - object_size <= (1 << 15) - 512 ? 512 : - object_size <= (1 << 16) - 1024 ? 1024 : 2048; -} - void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) { - unsigned int ok_size; - unsigned int optimal_size; - /* * SLAB_KASAN is used to mark caches as ones that are sanitized by * KASAN. Currently this flag is used in two places: @@ -149,65 +130,8 @@ void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, */ *flags |= SLAB_KASAN; - if (!kasan_requires_meta()) - return; - - ok_size = *size; - - /* Add alloc meta into redzone. */ - cache->kasan_info.alloc_meta_offset = *size; - *size += sizeof(struct kasan_alloc_meta); - - /* - * If alloc meta doesn't fit, don't add it. 
- * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal - * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for - * larger sizes. - */ - if (*size > KMALLOC_MAX_SIZE) { - cache->kasan_info.alloc_meta_offset = 0; - *size = ok_size; - /* Continue, since free meta might still fit. */ - } - - /* Only the generic mode uses free meta or flexible redzones. */ - if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { - cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; - return; - } - - /* - * Add free meta into redzone when it's not possible to store - * it in the object. This is the case when: - * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can - * be touched after it was freed, or - * 2. Object has a constructor, which means it's expected to - * retain its content until the next allocation, or - * 3. Object is too small. - * Otherwise cache->kasan_info.free_meta_offset = 0 is implied. - */ - if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor || - cache->object_size < sizeof(struct kasan_free_meta)) { - ok_size = *size; - - cache->kasan_info.free_meta_offset = *size; - *size += sizeof(struct kasan_free_meta); - - /* If free meta doesn't fit, don't add it. */ - if (*size > KMALLOC_MAX_SIZE) { - cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; - *size = ok_size; - } - } - - /* Calculate size with optimal redzone. */ - optimal_size = cache->object_size + optimal_redzone(cache->object_size); - /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */ - if (optimal_size > KMALLOC_MAX_SIZE) - optimal_size = KMALLOC_MAX_SIZE; - /* Use optimal size if the size with added metas is not large enough. 
*/ - if (*size < optimal_size) - *size = optimal_size; + if (kasan_requires_meta()) + kasan_init_cache_meta(cache, size); } void __kasan_cache_create_kmalloc(struct kmem_cache *cache) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index fa654cb96a0dc..73aea784040a2 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -328,6 +328,85 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); +/* + * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. + * For larger allocations larger redzones are used. + */ +static inline unsigned int optimal_redzone(unsigned int object_size) +{ + return + object_size <= 64 - 16 ? 16 : + object_size <= 128 - 32 ? 32 : + object_size <= 512 - 64 ? 64 : + object_size <= 4096 - 128 ? 128 : + object_size <= (1 << 14) - 256 ? 256 : + object_size <= (1 << 15) - 512 ? 512 : + object_size <= (1 << 16) - 1024 ? 1024 : 2048; +} + +void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) +{ + unsigned int ok_size; + unsigned int optimal_size; + + ok_size = *size; + + /* Add alloc meta into redzone. */ + cache->kasan_info.alloc_meta_offset = *size; + *size += sizeof(struct kasan_alloc_meta); + + /* + * If alloc meta doesn't fit, don't add it. + * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal + * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for + * larger sizes. + */ + if (*size > KMALLOC_MAX_SIZE) { + cache->kasan_info.alloc_meta_offset = 0; + *size = ok_size; + /* Continue, since free meta might still fit. */ + } + + /* Only the generic mode uses free meta or flexible redzones. */ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { + cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; + return; + } + + /* + * Add free meta into redzone when it's not possible to store + * it in the object. This is the case when: + * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can + * be touched after it was freed, or + * 2. 
Object has a constructor, which means it's expected to + * retain its content until the next allocation, or + * 3. Object is too small. + * Otherwise cache->kasan_info.free_meta_offset = 0 is implied. + */ + if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor || + cache->object_size < sizeof(struct kasan_free_meta)) { + ok_size = *size; + + cache->kasan_info.free_meta_offset = *size; + *size += sizeof(struct kasan_free_meta); + + /* If free meta doesn't fit, don't add it. */ + if (*size > KMALLOC_MAX_SIZE) { + cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; + *size = ok_size; + } + } + + /* Calculate size with optimal redzone. */ + optimal_size = cache->object_size + optimal_redzone(cache->object_size); + /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */ + if (optimal_size > KMALLOC_MAX_SIZE) + optimal_size = KMALLOC_MAX_SIZE; + /* Use optimal size if the size with added metas is not large enough. */ + if (*size < optimal_size) + *size = optimal_size; +} + struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, const void *object) { diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 1736abd661b6a..6da35370ba37f 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -297,12 +297,14 @@ struct page *kasan_addr_to_page(const void *addr); struct slab *kasan_addr_to_slab(const void *addr); #ifdef CONFIG_KASAN_GENERIC +void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size); void kasan_init_object_meta(struct kmem_cache *cache, const void *object); struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, const void *object); struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, const void *object); #else +static inline void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) { } static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { } #endif -- GitLab From 02856beb2d801423f88f2e8cb2eed0d6f14a4f92 Mon Sep 17 00:00:00 2001 From: 
Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:28 +0200 Subject: [PATCH 0949/2223] kasan: drop CONFIG_KASAN_GENERIC check from kasan_init_cache_meta As kasan_init_cache_meta() is only defined for the Generic mode, it does not require the CONFIG_KASAN_GENERIC check. Link: https://lkml.kernel.org/r/211f8f2b213aa91e9148ca63342990b491c4917a.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/generic.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 73aea784040a2..5125fad76f70a 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -367,12 +367,6 @@ void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) /* Continue, since free meta might still fit. */ } - /* Only the generic mode uses free meta or flexible redzones. */ - if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { - cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; - return; - } - /* * Add free meta into redzone when it's not possible to store * it in the object. This is the case when: -- GitLab From f372bde922e2ced8e0b5a928887b4cf587cc4453 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:29 +0200 Subject: [PATCH 0950/2223] kasan: only define kasan_metadata_size for Generic mode KASAN provides a helper for calculating the size of per-object metadata stored in the redzone. As now only the Generic mode uses per-object metadata, only define kasan_metadata_size() for this mode. 
Link: https://lkml.kernel.org/r/8f81d4938b80446bc72538a08217009f328a3e23.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 17 ++++++++--------- mm/kasan/common.c | 11 ----------- mm/kasan/generic.c | 11 +++++++++++ 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b092277bf48d6..027df75995731 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -150,14 +150,6 @@ static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) __kasan_cache_create_kmalloc(cache); } -size_t __kasan_metadata_size(struct kmem_cache *cache); -static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache) -{ - if (kasan_enabled()) - return __kasan_metadata_size(cache); - return 0; -} - void __kasan_poison_slab(struct slab *slab); static __always_inline void kasan_poison_slab(struct slab *slab) { @@ -282,7 +274,6 @@ static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) {} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} -static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} @@ -333,6 +324,8 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #ifdef CONFIG_KASAN_GENERIC +size_t kasan_metadata_size(struct kmem_cache *cache); + void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); @@ -340,6 +333,12 @@ void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ +/* Tag-based KASAN modes do not use 
per-object metadata. */ +static inline size_t kasan_metadata_size(struct kmem_cache *cache) +{ + return 0; +} + static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b6a74fe5e740d..7c79c560315d3 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -139,17 +139,6 @@ void __kasan_cache_create_kmalloc(struct kmem_cache *cache) cache->kasan_info.is_kmalloc = true; } -size_t __kasan_metadata_size(struct kmem_cache *cache) -{ - if (!kasan_requires_meta()) - return 0; - return (cache->kasan_info.alloc_meta_offset ? - sizeof(struct kasan_alloc_meta) : 0) + - ((cache->kasan_info.free_meta_offset && - cache->kasan_info.free_meta_offset != KASAN_NO_FREE_META) ? - sizeof(struct kasan_free_meta) : 0); -} - void __kasan_poison_slab(struct slab *slab) { struct page *page = slab_page(slab); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 5125fad76f70a..806ab92032c3b 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -427,6 +427,17 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object) __memset(alloc_meta, 0, sizeof(*alloc_meta)); } +size_t kasan_metadata_size(struct kmem_cache *cache) +{ + if (!kasan_requires_meta()) + return 0; + return (cache->kasan_info.alloc_meta_offset ? + sizeof(struct kasan_alloc_meta) : 0) + + ((cache->kasan_info.free_meta_offset && + cache->kasan_info.free_meta_offset != KASAN_NO_FREE_META) ? 
+ sizeof(struct kasan_free_meta) : 0); +} + static void __kasan_record_aux_stack(void *addr, bool can_alloc) { struct slab *slab = kasan_addr_to_slab(addr); -- GitLab From 3b7f8813e9ecf7fe91f2f8dc3b581a111cd374a5 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:30 +0200 Subject: [PATCH 0951/2223] kasan: only define kasan_never_merge for Generic mode KASAN prevents merging of slab caches whose objects have per-object metadata stored in redzones. As now only the Generic mode uses per-object metadata, define kasan_never_merge() only for this mode. Link: https://lkml.kernel.org/r/81ed01f29ff3443580b7e2fe362a8b47b1e8006d.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 18 ++++++------------ mm/kasan/common.c | 8 -------- mm/kasan/generic.c | 8 ++++++++ 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 027df75995731..9743d4b3a9185 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -103,14 +103,6 @@ struct kasan_cache { bool is_kmalloc; }; -slab_flags_t __kasan_never_merge(void); -static __always_inline slab_flags_t kasan_never_merge(void) -{ - if (kasan_enabled()) - return __kasan_never_merge(); - return 0; -} - void __kasan_unpoison_range(const void *addr, size_t size); static __always_inline void kasan_unpoison_range(const void *addr, size_t size) { @@ -261,10 +253,6 @@ static __always_inline bool kasan_check_byte(const void *addr) #else /* CONFIG_KASAN */ -static inline slab_flags_t kasan_never_merge(void) -{ - return 0; -} static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} @@ -325,6 +313,7 @@ static inline void 
kasan_unpoison_task_stack(struct task_struct *task) {} #ifdef CONFIG_KASAN_GENERIC size_t kasan_metadata_size(struct kmem_cache *cache); +slab_flags_t kasan_never_merge(void); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); @@ -338,6 +327,11 @@ static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } +/* And thus nothing prevents cache merging. */ +static inline slab_flags_t kasan_never_merge(void) +{ + return 0; +} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 7c79c560315d3..c2690e9380303 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -88,14 +88,6 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) } #endif /* CONFIG_KASAN_STACK */ -/* Only allow cache merging when no per-object metadata is present. */ -slab_flags_t __kasan_never_merge(void) -{ - if (kasan_requires_meta()) - return SLAB_KASAN; - return 0; -} - void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { u8 tag; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 806ab92032c3b..25333bf3c99f2 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -328,6 +328,14 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); +/* Only allow cache merging when no per-object metadata is present. */ +slab_flags_t kasan_never_merge(void) +{ + if (!kasan_requires_meta()) + return 0; + return SLAB_KASAN; +} + /* * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. * For larger allocations larger redzones are used. 
-- GitLab From 26f21f3ac76df6cf3b447e8231f8754991165475 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:31 +0200 Subject: [PATCH 0952/2223] kasan: only define metadata offsets for Generic mode Hide the definitions of alloc_meta_offset and free_meta_offset under an ifdef CONFIG_KASAN_GENERIC check, as these fields are now only used when the Generic mode is enabled. Link: https://lkml.kernel.org/r/d4bafa0534facafd1a23c465a94261e64f366493.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 9743d4b3a9185..a212c2e3f32de 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -98,8 +98,10 @@ static inline bool kasan_has_integrated_init(void) #ifdef CONFIG_KASAN struct kasan_cache { +#ifdef CONFIG_KASAN_GENERIC int alloc_meta_offset; int free_meta_offset; +#endif bool is_kmalloc; }; -- GitLab From be95e13fcc6ded156c65ece01486d9cc33d22dc8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:32 +0200 Subject: [PATCH 0953/2223] kasan: only define metadata structs for Generic mode Hide the definitions of kasan_alloc_meta and kasan_free_meta under an ifdef CONFIG_KASAN_GENERIC check, as these structures are now only used when the Generic mode is enabled. 
Link: https://lkml.kernel.org/r/8d2aabff8c227c444a3f62edf87d5630beb77640.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 6da35370ba37f..cae60e4d88426 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -193,14 +193,12 @@ struct kasan_track { depot_stack_handle_t stack; }; +#ifdef CONFIG_KASAN_GENERIC + struct kasan_alloc_meta { struct kasan_track alloc_track; - /* Generic mode stores free track in kasan_free_meta. */ -#ifdef CONFIG_KASAN_GENERIC + /* Free track is stored in kasan_free_meta. */ depot_stack_handle_t aux_stack[2]; -#else - struct kasan_track free_track; -#endif }; struct qlist_node { @@ -219,12 +217,12 @@ struct qlist_node { * After that, slab allocator stores the freelist pointer in the object. */ struct kasan_free_meta { -#ifdef CONFIG_KASAN_GENERIC struct qlist_node quarantine_link; struct kasan_track free_track; -#endif }; +#endif /* CONFIG_KASAN_GENERIC */ + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) /* Used in KUnit-compatible KASAN tests. */ struct kunit_kasan_status { -- GitLab From 682ed08924407b719fa0b1123a26971748d76ace Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:33 +0200 Subject: [PATCH 0954/2223] kasan: only define kasan_cache_create for Generic mode Right now, kasan_cache_create() assigns SLAB_KASAN for all KASAN modes and then sets up metadata-related cache parameters for the Generic mode. SLAB_KASAN is used in two places: 1. In slab_ksize() to account for per-object metadata when calculating the size of the accessible memory within the object. 2. In slab_common.c via kasan_never_merge() to prevent merging of caches with per-object metadata. 
Both cases are only relevant when per-object metadata is present, which is only the case with the Generic mode. Thus, assign SLAB_KASAN and define kasan_cache_create() only for the Generic mode. Also update the SLAB_KASAN-related comment. Link: https://lkml.kernel.org/r/61faa2aa1906e2d02c97d00ddf99ce8911dda095.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 18 ++++++------------ include/linux/slab.h | 2 +- mm/kasan/common.c | 16 ---------------- mm/kasan/generic.c | 17 ++++++++++++++++- 4 files changed, 23 insertions(+), 30 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index a212c2e3f32de..d811b3d7d2a15 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -128,15 +128,6 @@ static __always_inline void kasan_unpoison_pages(struct page *page, __kasan_unpoison_pages(page, order, init); } -void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, - slab_flags_t *flags); -static __always_inline void kasan_cache_create(struct kmem_cache *cache, - unsigned int *size, slab_flags_t *flags) -{ - if (kasan_enabled()) - __kasan_cache_create(cache, size, flags); -} - void __kasan_cache_create_kmalloc(struct kmem_cache *cache); static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) { @@ -260,9 +251,6 @@ static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} static inline void kasan_unpoison_pages(struct page *page, unsigned int order, bool init) {} -static inline void kasan_cache_create(struct kmem_cache *cache, - unsigned int *size, - slab_flags_t *flags) {} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, 
@@ -316,6 +304,8 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} size_t kasan_metadata_size(struct kmem_cache *cache); slab_flags_t kasan_never_merge(void); +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); @@ -334,6 +324,10 @@ static inline slab_flags_t kasan_never_merge(void) { return 0; } +/* And no cache-related metadata initialization is required. */ +static inline void kasan_cache_create(struct kmem_cache *cache, + unsigned int *size, + slab_flags_t *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} diff --git a/include/linux/slab.h b/include/linux/slab.h index 352e3f082acce..617a39f7db466 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -106,7 +106,7 @@ # define SLAB_ACCOUNT 0 #endif -#ifdef CONFIG_KASAN +#ifdef CONFIG_KASAN_GENERIC #define SLAB_KASAN ((slab_flags_t __force)0x08000000U) #else #define SLAB_KASAN 0 diff --git a/mm/kasan/common.c b/mm/kasan/common.c index c2690e9380303..8efa631909514 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -110,22 +110,6 @@ void __kasan_poison_pages(struct page *page, unsigned int order, bool init) KASAN_PAGE_FREE, init); } -void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, - slab_flags_t *flags) -{ - /* - * SLAB_KASAN is used to mark caches as ones that are sanitized by - * KASAN. Currently this flag is used in two places: - * 1. In slab_ksize() when calculating the size of the accessible - * memory within the object. - * 2. In slab_common.c to prevent merging of sanitized caches. 
- */ - *flags |= SLAB_KASAN; - - if (kasan_requires_meta()) - kasan_init_cache_meta(cache, size); -} - void __kasan_cache_create_kmalloc(struct kmem_cache *cache) { cache->kasan_info.is_kmalloc = true; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 25333bf3c99f2..f6bef347de870 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -352,11 +352,26 @@ static inline unsigned int optimal_redzone(unsigned int object_size) object_size <= (1 << 16) - 1024 ? 1024 : 2048; } -void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags) { unsigned int ok_size; unsigned int optimal_size; + if (!kasan_requires_meta()) + return; + + /* + * SLAB_KASAN is used to mark caches that are sanitized by KASAN + * and that thus have per-object metadata. + * Currently this flag is used in two places: + * 1. In slab_ksize() to account for per-object metadata when + * calculating the size of the accessible memory within the object. + * 2. In slab_common.c via kasan_never_merge() to prevent merging of + * caches with per-object metadata. + */ + *flags |= SLAB_KASAN; + ok_size = *size; /* Add alloc meta into redzone. */ -- GitLab From 6b07434980a1926780cb5c5644fb198fb9c3997b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:34 +0200 Subject: [PATCH 0955/2223] kasan: pass tagged pointers to kasan_save_alloc/free_info Pass tagged pointers to kasan_save_alloc/free_info(). This is a preparatory patch to simplify other changes in the series. 
Link: https://lkml.kernel.org/r/d5bc48cfcf0dca8269dc3ed863047e4d4d2030f1.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 6 ++---- mm/kasan/generic.c | 3 +-- mm/kasan/kasan.h | 2 +- mm/kasan/tags.c | 3 +-- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 8efa631909514..f8e16a2421978 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -193,13 +193,11 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip, bool quarantine, bool init) { - u8 tag; void *tagged_object; if (!kasan_arch_is_ready()) return false; - tag = get_tag(object); tagged_object = object; object = kasan_reset_tag(object); @@ -228,7 +226,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, return false; if (kasan_stack_collection_enabled()) - kasan_save_free_info(cache, object, tag); + kasan_save_free_info(cache, tagged_object); return kasan_quarantine_put(cache, object); } @@ -317,7 +315,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, /* Save alloc info (if possible) for non-kmalloc() allocations. 
*/ if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc) - kasan_save_alloc_info(cache, (void *)object, flags); + kasan_save_alloc_info(cache, tagged_object, flags); return tagged_object; } diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index f6bef347de870..aff39af3c532a 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -500,8 +500,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) kasan_set_track(&alloc_meta->alloc_track, flags); } -void kasan_save_free_info(struct kmem_cache *cache, - void *object, u8 tag) +void kasan_save_free_info(struct kmem_cache *cache, void *object) { struct kasan_free_meta *free_meta; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cae60e4d88426..cca49ab029f1c 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -309,7 +309,7 @@ static inline void kasan_init_object_meta(struct kmem_cache *cache, const void * depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); -void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag); +void kasan_save_free_info(struct kmem_cache *cache, void *object); struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, void *object); struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 4f24669085e92..fd11d10a4ffc6 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -21,8 +21,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { } -void kasan_save_free_info(struct kmem_cache *cache, - void *object, u8 tag) +void kasan_save_free_info(struct kmem_cache *cache, void *object) { } -- GitLab From b89933e9a54d3e7c4da081bc0b986341b62cdab6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:35 +0200 Subject: [PATCH 0956/2223] kasan: move 
kasan_get_alloc/free_track definitions Move the definitions of kasan_get_alloc/free_track() to report_*.c, as they belong with the other reporting code. Link: https://lkml.kernel.org/r/0cb15423956889b3905a0174b58782633bbbd72e.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/generic.c | 21 --------------------- mm/kasan/report_generic.c | 21 +++++++++++++++++++++ mm/kasan/report_tags.c | 12 ++++++++++++ mm/kasan/tags.c | 12 ------------ 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index aff39af3c532a..d8b5590f9484b 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -512,24 +512,3 @@ void kasan_save_free_info(struct kmem_cache *cache, void *object) /* The object was freed and has free track set. */ *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK; } - -struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, - void *object) -{ - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return NULL; - - return &alloc_meta->alloc_track; -} - -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) -{ - if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK) - return NULL; - /* Free meta must be present with KASAN_SLAB_FREETRACK. 
*/ - return &kasan_get_free_meta(cache, object)->free_track; -} diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 348dc207d4623..74d21786ef091 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -127,6 +127,27 @@ const char *kasan_get_bug_type(struct kasan_report_info *info) return get_wild_bug_type(info); } +struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, + void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return NULL; + + return &alloc_meta->alloc_track; +} + +struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, + void *object, u8 tag) +{ + if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK) + return NULL; + /* Free meta must be present with KASAN_SLAB_FREETRACK. */ + return &kasan_get_free_meta(cache, object)->free_track; +} + void kasan_metadata_fetch_row(char *buffer, void *row) { memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 35cf3cae4aa45..79b6497d8a81b 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -21,3 +21,15 @@ const char *kasan_get_bug_type(struct kasan_report_info *info) return "invalid-access"; } + +struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, + void *object) +{ + return NULL; +} + +struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, + void *object, u8 tag) +{ + return NULL; +} diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index fd11d10a4ffc6..39a0481e5228c 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -24,15 +24,3 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) void kasan_save_free_info(struct kmem_cache *cache, void *object) { } - -struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, - void *object) -{ - return NULL; -} - -struct kasan_track *kasan_get_free_track(struct kmem_cache 
*cache, - void *object, u8 tag) -{ - return NULL; -} -- GitLab From 9ef08d265e3f02b3266a46f684de5741724bd7f8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:36 +0200 Subject: [PATCH 0957/2223] kasan: cosmetic changes in report.c Do a few non-functional style fixes for the code in report.c. Link: https://lkml.kernel.org/r/b728eae71f3ea505a885449724de21cf3f476a7b.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/report.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 5d225d7d9c4c7..83f420a28c0b2 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -200,25 +200,22 @@ static void print_error_description(struct kasan_report_info *info) static void print_track(struct kasan_track *track, const char *prefix) { pr_err("%s by task %u:\n", prefix, track->pid); - if (track->stack) { + if (track->stack) stack_depot_print(track->stack); - } else { + else pr_err("(stack is not available)\n"); - } } struct page *kasan_addr_to_page(const void *addr) { - if ((addr >= (void *)PAGE_OFFSET) && - (addr < high_memory)) + if ((addr >= (void *)PAGE_OFFSET) && (addr < high_memory)) return virt_to_head_page(addr); return NULL; } struct slab *kasan_addr_to_slab(const void *addr) { - if ((addr >= (void *)PAGE_OFFSET) && - (addr < high_memory)) + if ((addr >= (void *)PAGE_OFFSET) && (addr < high_memory)) return virt_to_slab(addr); return NULL; } -- GitLab From 2c9fb1fd1dd0b17cf8f48935a9c3ecea066f10e8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:37 +0200 Subject: [PATCH 0958/2223] kasan: use virt_addr_valid in kasan_addr_to_page/slab Instead of open-coding the validity checks for addr in kasan_addr_to_page/slab(), use the virt_addr_valid() helper. 
Link: https://lkml.kernel.org/r/c22a4850d74d7430f8a6c08216fd55c2860a2b9e.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/report.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 83f420a28c0b2..570f9419b90cc 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -208,14 +208,14 @@ static void print_track(struct kasan_track *track, const char *prefix) struct page *kasan_addr_to_page(const void *addr) { - if ((addr >= (void *)PAGE_OFFSET) && (addr < high_memory)) + if (virt_addr_valid(addr)) return virt_to_head_page(addr); return NULL; } struct slab *kasan_addr_to_slab(const void *addr) { - if ((addr >= (void *)PAGE_OFFSET) && (addr < high_memory)) + if (virt_addr_valid(addr)) return virt_to_slab(addr); return NULL; } -- GitLab From 0f282f15dcc479b1f70ef4c2324db8a6df670fcb Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:38 +0200 Subject: [PATCH 0959/2223] kasan: use kasan_addr_to_slab in print_address_description Use the kasan_addr_to_slab() helper in print_address_description() instead of separately invoking PageSlab() and page_slab(). 
Link: https://lkml.kernel.org/r/8b744fbf8c3c7fc5d34329ec70b60ee5c8dba66c.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 7 +++++++ mm/kasan/report.c | 11 ++--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index f8e16a2421978..50f4338b477f2 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -30,6 +30,13 @@ #include "kasan.h" #include "../slab.h" +struct slab *kasan_addr_to_slab(const void *addr) +{ + if (virt_addr_valid(addr)) + return virt_to_slab(addr); + return NULL; +} + depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) { unsigned long entries[KASAN_STACK_DEPTH]; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 570f9419b90cc..cd31b3b89ca15 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -213,13 +213,6 @@ struct page *kasan_addr_to_page(const void *addr) return NULL; } -struct slab *kasan_addr_to_slab(const void *addr) -{ - if (virt_addr_valid(addr)) - return virt_to_slab(addr); - return NULL; -} - static void describe_object_addr(struct kmem_cache *cache, void *object, const void *addr) { @@ -297,12 +290,12 @@ static inline bool init_task_stack_addr(const void *addr) static void print_address_description(void *addr, u8 tag) { struct page *page = kasan_addr_to_page(addr); + struct slab *slab = kasan_addr_to_slab(addr); dump_stack_lvl(KERN_ERR); pr_err("\n"); - if (page && PageSlab(page)) { - struct slab *slab = page_slab(page); + if (slab) { struct kmem_cache *cache = slab->slab_cache; void *object = nearest_obj(cache, slab, addr); -- GitLab From 559756e8a2e153f0f2ddf29c0ed9ac7b88345fb6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:39 +0200 Subject: [PATCH 0960/2223] kasan: make kasan_addr_to_page static As kasan_addr_to_page() is 
only used in report.c, rename it to addr_to_page() and make it static. Link: https://lkml.kernel.org/r/66c1267200fe0c16e2ac8847a9315fda041918cb.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 1 - mm/kasan/report.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cca49ab029f1c..4fddfdb08abf1 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -291,7 +291,6 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type); -struct page *kasan_addr_to_page(const void *addr); struct slab *kasan_addr_to_slab(const void *addr); #ifdef CONFIG_KASAN_GENERIC diff --git a/mm/kasan/report.c b/mm/kasan/report.c index cd31b3b89ca15..ac526c10ebff7 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -206,7 +206,7 @@ static void print_track(struct kasan_track *track, const char *prefix) pr_err("(stack is not available)\n"); } -struct page *kasan_addr_to_page(const void *addr) +static inline struct page *addr_to_page(const void *addr) { if (virt_addr_valid(addr)) return virt_to_head_page(addr); @@ -289,7 +289,7 @@ static inline bool init_task_stack_addr(const void *addr) static void print_address_description(void *addr, u8 tag) { - struct page *page = kasan_addr_to_page(addr); + struct page *page = addr_to_page(addr); struct slab *slab = kasan_addr_to_slab(addr); dump_stack_lvl(KERN_ERR); -- GitLab From a794898a0e17c1c563fcce614efbd3644d48fa2e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:40 +0200 Subject: [PATCH 0961/2223] kasan: simplify print_report To simplify reading the implementation of print_report(), remove the tagged_addr variable and rename untagged_addr 
to addr. Link: https://lkml.kernel.org/r/f64f5f1093b3c06896bf0f850c5d9e661313fcb2.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/report.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ac526c10ebff7..dc38ada86f85d 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -397,17 +397,16 @@ static void print_memory_metadata(const void *addr) static void print_report(struct kasan_report_info *info) { - void *tagged_addr = info->access_addr; - void *untagged_addr = kasan_reset_tag(tagged_addr); - u8 tag = get_tag(tagged_addr); + void *addr = kasan_reset_tag(info->access_addr); + u8 tag = get_tag(info->access_addr); print_error_description(info); - if (addr_has_metadata(untagged_addr)) + if (addr_has_metadata(addr)) kasan_print_tags(tag, info->first_bad_addr); pr_err("\n"); - if (addr_has_metadata(untagged_addr)) { - print_address_description(untagged_addr, tag); + if (addr_has_metadata(addr)) { + print_address_description(addr, tag); print_memory_metadata(info->first_bad_addr); } else { dump_stack_lvl(KERN_ERR); -- GitLab From 015b109f1f7a799a51def6be37a53b650c4a8fda Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:41 +0200 Subject: [PATCH 0962/2223] kasan: introduce complete_report_info Introduce a complete_report_info() function that fills in the first_bad_addr field of kasan_report_info instead of doing it in kasan_report_*(). This function will be extended in the next patch. 
Link: https://lkml.kernel.org/r/8eb1a9bd01f5d31eab4524da54a101b8720b469e.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 5 ++++- mm/kasan/report.c | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 4fddfdb08abf1..7e07115873d3b 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -153,12 +153,15 @@ enum kasan_report_type { }; struct kasan_report_info { + /* Filled in by kasan_report_*(). */ enum kasan_report_type type; void *access_addr; - void *first_bad_addr; size_t access_size; bool is_write; unsigned long ip; + + /* Filled in by the common reporting code. */ + void *first_bad_addr; }; /* Do not change the struct layout: compiler ABI. */ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index dc38ada86f85d..0c2e7a58095d9 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -413,6 +413,17 @@ static void print_report(struct kasan_report_info *info) } } +static void complete_report_info(struct kasan_report_info *info) +{ + void *addr = kasan_reset_tag(info->access_addr); + + if (info->type == KASAN_REPORT_ACCESS) + info->first_bad_addr = kasan_find_first_bad_addr( + info->access_addr, info->access_size); + else + info->first_bad_addr = addr; +} + void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type) { unsigned long flags; @@ -430,11 +441,12 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty info.type = type; info.access_addr = ptr; - info.first_bad_addr = kasan_reset_tag(ptr); info.access_size = 0; info.is_write = false; info.ip = ip; + complete_report_info(&info); + print_report(&info); end_report(&flags, ptr); @@ -463,11 +475,12 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, info.type = 
KASAN_REPORT_ACCESS; info.access_addr = ptr; - info.first_bad_addr = kasan_find_first_bad_addr(ptr, size); info.access_size = size; info.is_write = is_write; info.ip = ip; + complete_report_info(&info); + print_report(&info); end_report(&irq_flags, ptr); -- GitLab From 7fae3dd08e3e88491f06e22e648913e3f8cf30f0 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:42 +0200 Subject: [PATCH 0963/2223] kasan: fill in cache and object in complete_report_info Add cache and object fields to kasan_report_info and fill them in in complete_report_info() instead of fetching them in the middle of the report printing code. This allows the reporting code to get access to the object information before starting printing the report. One of the following patches uses this information to determine the bug type with the tag-based modes. Link: https://lkml.kernel.org/r/23264572cb2cbb8f0efbb51509b6757eb3cc1fc9.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 2 ++ mm/kasan/report.c | 21 +++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 7e07115873d3b..b8fa1e50f3d48 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -162,6 +162,8 @@ struct kasan_report_info { /* Filled in by the common reporting code. */ void *first_bad_addr; + struct kmem_cache *cache; + void *object; }; /* Do not change the struct layout: compiler ABI. 
*/ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 0c2e7a58095d9..763de8e68887f 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -287,19 +287,16 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } -static void print_address_description(void *addr, u8 tag) +static void print_address_description(void *addr, u8 tag, + struct kasan_report_info *info) { struct page *page = addr_to_page(addr); - struct slab *slab = kasan_addr_to_slab(addr); dump_stack_lvl(KERN_ERR); pr_err("\n"); - if (slab) { - struct kmem_cache *cache = slab->slab_cache; - void *object = nearest_obj(cache, slab, addr); - - describe_object(cache, object, addr, tag); + if (info->cache && info->object) { + describe_object(info->cache, info->object, addr, tag); pr_err("\n"); } @@ -406,7 +403,7 @@ static void print_report(struct kasan_report_info *info) pr_err("\n"); if (addr_has_metadata(addr)) { - print_address_description(addr, tag); + print_address_description(addr, tag, info); print_memory_metadata(info->first_bad_addr); } else { dump_stack_lvl(KERN_ERR); @@ -416,12 +413,20 @@ static void print_report(struct kasan_report_info *info) static void complete_report_info(struct kasan_report_info *info) { void *addr = kasan_reset_tag(info->access_addr); + struct slab *slab; if (info->type == KASAN_REPORT_ACCESS) info->first_bad_addr = kasan_find_first_bad_addr( info->access_addr, info->access_size); else info->first_bad_addr = addr; + + slab = kasan_addr_to_slab(addr); + if (slab) { + info->cache = slab->slab_cache; + info->object = nearest_obj(info->cache, slab, addr); + } else + info->cache = info->object = NULL; } void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type) -- GitLab From 92a38eacd6412bb09f98245ba5b3aa89e3dd6656 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:43 +0200 Subject: [PATCH 0964/2223] kasan: rework function arguments in report.c Pass a pointer to 
kasan_report_info to describe_object() and describe_object_stacks(), instead of passing the structure's fields. The untagged pointer and the tag are still passed as separate arguments to some of the functions to avoid duplicating the untagging logic. This is a preparatory change for the next patch. Link: https://lkml.kernel.org/r/2e0cdb91524ab528a3c2b12b6d8bcb69512fc4af.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/report.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 763de8e68887f..ec018f8499920 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -213,8 +213,8 @@ static inline struct page *addr_to_page(const void *addr) return NULL; } -static void describe_object_addr(struct kmem_cache *cache, void *object, - const void *addr) +static void describe_object_addr(const void *addr, struct kmem_cache *cache, + void *object) { unsigned long access_addr = (unsigned long)addr; unsigned long object_addr = (unsigned long)object; @@ -242,33 +242,32 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, (void *)(object_addr + cache->object_size)); } -static void describe_object_stacks(struct kmem_cache *cache, void *object, - const void *addr, u8 tag) +static void describe_object_stacks(u8 tag, struct kasan_report_info *info) { struct kasan_track *alloc_track; struct kasan_track *free_track; - alloc_track = kasan_get_alloc_track(cache, object); + alloc_track = kasan_get_alloc_track(info->cache, info->object); if (alloc_track) { print_track(alloc_track, "Allocated"); pr_err("\n"); } - free_track = kasan_get_free_track(cache, object, tag); + free_track = kasan_get_free_track(info->cache, info->object, tag); if (free_track) { print_track(free_track, "Freed"); 
pr_err("\n"); } - kasan_print_aux_stacks(cache, object); + kasan_print_aux_stacks(info->cache, info->object); } -static void describe_object(struct kmem_cache *cache, void *object, - const void *addr, u8 tag) +static void describe_object(const void *addr, u8 tag, + struct kasan_report_info *info) { if (kasan_stack_collection_enabled()) - describe_object_stacks(cache, object, addr, tag); - describe_object_addr(cache, object, addr); + describe_object_stacks(tag, info); + describe_object_addr(addr, info->cache, info->object); } static inline bool kernel_or_module_addr(const void *addr) @@ -296,7 +295,7 @@ static void print_address_description(void *addr, u8 tag, pr_err("\n"); if (info->cache && info->object) { - describe_object(info->cache, info->object, addr, tag); + describe_object(addr, tag, info); pr_err("\n"); } -- GitLab From 59e6e098d1c156f7c449af903c3b48a5470f6120 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:44 +0200 Subject: [PATCH 0965/2223] kasan: introduce kasan_complete_mode_report_info Add bug_type and alloc/free_track fields to kasan_report_info and add a kasan_complete_mode_report_info() function that fills in these fields. This function is implemented differently for different KASAN modes. Change the reporting code to use the filled in fields instead of invoking kasan_get_bug_type() and kasan_get_alloc/free_track(). For the Generic mode, kasan_complete_mode_report_info() invokes these functions instead. For the tag-based modes, only the bug_type field is filled in; alloc/free_track are handled in the next patch. Using a single function that fills in these fields is required for the tag-based modes, as the values for all three fields are determined in a single procedure implemented in the following patch. 
Link: https://lkml.kernel.org/r/8432b861054fa8d0cee79a8877dedeaf3b677ca8.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 33 +++++++++++++++++---------------- mm/kasan/report.c | 30 ++++++++++++++---------------- mm/kasan/report_generic.c | 32 +++++++++++++++++--------------- mm/kasan/report_tags.c | 13 +++---------- 4 files changed, 51 insertions(+), 57 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index b8fa1e50f3d48..7df107dc400ac 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -146,6 +146,13 @@ static inline bool kasan_requires_meta(void) #define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE) #define META_ROWS_AROUND_ADDR 2 +#define KASAN_STACK_DEPTH 64 + +struct kasan_track { + u32 pid; + depot_stack_handle_t stack; +}; + enum kasan_report_type { KASAN_REPORT_ACCESS, KASAN_REPORT_INVALID_FREE, @@ -164,6 +171,11 @@ struct kasan_report_info { void *first_bad_addr; struct kmem_cache *cache; void *object; + + /* Filled in by the mode-specific reporting code. */ + const char *bug_type; + struct kasan_track alloc_track; + struct kasan_track free_track; }; /* Do not change the struct layout: compiler ABI. */ @@ -189,14 +201,7 @@ struct kasan_global { #endif }; -/* Structures for keeping alloc and free tracks. */ - -#define KASAN_STACK_DEPTH 64 - -struct kasan_track { - u32 pid; - depot_stack_handle_t stack; -}; +/* Structures for keeping alloc and free meta. 
*/ #ifdef CONFIG_KASAN_GENERIC @@ -270,16 +275,16 @@ static inline bool addr_has_metadata(const void *addr) #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ +void *kasan_find_first_bad_addr(void *addr, size_t size); +void kasan_complete_mode_report_info(struct kasan_report_info *info); +void kasan_metadata_fetch_row(char *buffer, void *row); + #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) void kasan_print_tags(u8 addr_tag, const void *addr); #else static inline void kasan_print_tags(u8 addr_tag, const void *addr) { } #endif -void *kasan_find_first_bad_addr(void *addr, size_t size); -const char *kasan_get_bug_type(struct kasan_report_info *info); -void kasan_metadata_fetch_row(char *buffer, void *row); - #if defined(CONFIG_KASAN_STACK) void kasan_print_address_stack_frame(const void *addr); #else @@ -314,10 +319,6 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); void kasan_save_free_info(struct kmem_cache *cache, void *object); -struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, - void *object); -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag); #if defined(CONFIG_KASAN_GENERIC) && \ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ec018f8499920..39e8e5a80b829 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -185,8 +185,7 @@ static void print_error_description(struct kasan_report_info *info) return; } - pr_err("BUG: KASAN: %s in %pS\n", - kasan_get_bug_type(info), (void *)info->ip); + pr_err("BUG: KASAN: %s in %pS\n", info->bug_type, (void *)info->ip); if (info->access_size) pr_err("%s of size %zu at addr %px by task %s/%d\n", info->is_write ? 
"Write" : "Read", info->access_size, @@ -242,31 +241,25 @@ static void describe_object_addr(const void *addr, struct kmem_cache *cache, (void *)(object_addr + cache->object_size)); } -static void describe_object_stacks(u8 tag, struct kasan_report_info *info) +static void describe_object_stacks(struct kasan_report_info *info) { - struct kasan_track *alloc_track; - struct kasan_track *free_track; - - alloc_track = kasan_get_alloc_track(info->cache, info->object); - if (alloc_track) { - print_track(alloc_track, "Allocated"); + if (info->alloc_track.stack) { + print_track(&info->alloc_track, "Allocated"); pr_err("\n"); } - free_track = kasan_get_free_track(info->cache, info->object, tag); - if (free_track) { - print_track(free_track, "Freed"); + if (info->free_track.stack) { + print_track(&info->free_track, "Freed"); pr_err("\n"); } kasan_print_aux_stacks(info->cache, info->object); } -static void describe_object(const void *addr, u8 tag, - struct kasan_report_info *info) +static void describe_object(const void *addr, struct kasan_report_info *info) { if (kasan_stack_collection_enabled()) - describe_object_stacks(tag, info); + describe_object_stacks(info); describe_object_addr(addr, info->cache, info->object); } @@ -295,7 +288,7 @@ static void print_address_description(void *addr, u8 tag, pr_err("\n"); if (info->cache && info->object) { - describe_object(addr, tag, info); + describe_object(addr, info); pr_err("\n"); } @@ -426,6 +419,9 @@ static void complete_report_info(struct kasan_report_info *info) info->object = nearest_obj(info->cache, slab, addr); } else info->cache = info->object = NULL; + + /* Fill in mode-specific report info fields. 
*/ + kasan_complete_mode_report_info(info); } void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type) @@ -443,6 +439,7 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty start_report(&flags, true); + memset(&info, 0, sizeof(info)); info.type = type; info.access_addr = ptr; info.access_size = 0; @@ -477,6 +474,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, start_report(&irq_flags, true); + memset(&info, 0, sizeof(info)); info.type = KASAN_REPORT_ACCESS; info.access_addr = ptr; info.access_size = size; diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 74d21786ef091..087c1d8c81456 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -109,7 +109,7 @@ static const char *get_wild_bug_type(struct kasan_report_info *info) return bug_type; } -const char *kasan_get_bug_type(struct kasan_report_info *info) +static const char *get_bug_type(struct kasan_report_info *info) { /* * If access_size is a negative number, then it has reason to be @@ -127,25 +127,27 @@ const char *kasan_get_bug_type(struct kasan_report_info *info) return get_wild_bug_type(info); } -struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, - void *object) +void kasan_complete_mode_report_info(struct kasan_report_info *info) { struct kasan_alloc_meta *alloc_meta; + struct kasan_free_meta *free_meta; - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return NULL; + info->bug_type = get_bug_type(info); - return &alloc_meta->alloc_track; -} + if (!info->cache || !info->object) + return; -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) -{ - if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK) - return NULL; - /* Free meta must be present with KASAN_SLAB_FREETRACK. 
*/ - return &kasan_get_free_meta(cache, object)->free_track; + alloc_meta = kasan_get_alloc_meta(info->cache, info->object); + if (alloc_meta) + memcpy(&info->alloc_track, &alloc_meta->alloc_track, + sizeof(info->alloc_track)); + + if (*(u8 *)kasan_mem_to_shadow(info->object) == KASAN_SLAB_FREETRACK) { + /* Free meta must be present with KASAN_SLAB_FREETRACK. */ + free_meta = kasan_get_free_meta(info->cache, info->object); + memcpy(&info->free_track, &free_meta->free_track, + sizeof(info->free_track)); + } } void kasan_metadata_fetch_row(char *buffer, void *row) diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 79b6497d8a81b..5cbac2cdb177b 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -6,7 +6,7 @@ #include "kasan.h" -const char *kasan_get_bug_type(struct kasan_report_info *info) +static const char *get_bug_type(struct kasan_report_info *info) { /* * If access_size is a negative number, then it has reason to be @@ -22,14 +22,7 @@ const char *kasan_get_bug_type(struct kasan_report_info *info) return "invalid-access"; } -struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, - void *object) +void kasan_complete_mode_report_info(struct kasan_report_info *info) { - return NULL; -} - -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) -{ - return NULL; + info->bug_type = get_bug_type(info); } -- GitLab From 7bc0584e5d2a687c0855a1b3dec9a6d6857d757b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:45 +0200 Subject: [PATCH 0966/2223] kasan: implement stack ring for tag-based modes Implement storing stack depot handles for alloc/free stack traces for slab objects for the tag-based KASAN modes in a ring buffer. This ring buffer is referred to as the stack ring. On each alloc/free of a slab object, the tagged address of the object and the current stack trace are recorded in the stack ring. 
On each bug report, if the accessed address belongs to a slab object, the stack ring is scanned for matching entries. The newest entries are used to print the alloc/free stack traces in the report: one entry for alloc and one for free. The number of entries in the stack ring is fixed in this patch, but one of the following patches adds a command-line argument to control it. [andreyknvl@google.com: initialize read-write lock in stack ring] Link: https://lkml.kernel.org/r/576182d194e27531e8090bad809e4136953895f4.1663700262.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/692de14b6b6a1bc817fd55e4ad92fc1f83c1ab59.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 21 +++++++++++++ mm/kasan/report_tags.c | 71 ++++++++++++++++++++++++++++++++++++++++++ mm/kasan/tags.c | 52 +++++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 7df107dc400ac..cfff81139d67e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -2,6 +2,7 @@ #ifndef __MM_KASAN_KASAN_H #define __MM_KASAN_KASAN_H +#include #include #include #include @@ -233,6 +234,26 @@ struct kasan_free_meta { #endif /* CONFIG_KASAN_GENERIC */ +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + +struct kasan_stack_ring_entry { + void *ptr; + size_t size; + u32 pid; + depot_stack_handle_t stack; + bool is_free; +}; + +#define KASAN_STACK_RING_SIZE (32 << 10) + +struct kasan_stack_ring { + rwlock_t lock; + atomic64_t pos; + struct kasan_stack_ring_entry entries[KASAN_STACK_RING_SIZE]; +}; + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) /* Used in KUnit-compatible KASAN tests. 
*/ struct kunit_kasan_status { diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 5cbac2cdb177b..1b78136542bb6 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -4,8 +4,12 @@ * Copyright (c) 2020 Google, Inc. */ +#include + #include "kasan.h" +extern struct kasan_stack_ring stack_ring; + static const char *get_bug_type(struct kasan_report_info *info) { /* @@ -24,5 +28,72 @@ static const char *get_bug_type(struct kasan_report_info *info) void kasan_complete_mode_report_info(struct kasan_report_info *info) { + unsigned long flags; + u64 pos; + struct kasan_stack_ring_entry *entry; + void *ptr; + u32 pid; + depot_stack_handle_t stack; + bool is_free; + bool alloc_found = false, free_found = false; + info->bug_type = get_bug_type(info); + + if (!info->cache || !info->object) + return; + } + + write_lock_irqsave(&stack_ring.lock, flags); + + pos = atomic64_read(&stack_ring.pos); + + /* + * The loop below tries to find stack ring entries relevant to the + * buggy object. This is a best-effort process. + * + * First, another object with the same tag can be allocated in place of + * the buggy object. Also, since the number of entries is limited, the + * entries relevant to the buggy object can be overwritten. + */ + + for (u64 i = pos - 1; i != pos - 1 - KASAN_STACK_RING_SIZE; i--) { + if (alloc_found && free_found) + break; + + entry = &stack_ring.entries[i % KASAN_STACK_RING_SIZE]; + + /* Paired with smp_store_release() in save_stack_info(). */ + ptr = (void *)smp_load_acquire(&entry->ptr); + + if (kasan_reset_tag(ptr) != info->object || + get_tag(ptr) != get_tag(info->access_addr)) + continue; + + pid = READ_ONCE(entry->pid); + stack = READ_ONCE(entry->stack); + is_free = READ_ONCE(entry->is_free); + + if (is_free) { + /* + * Second free of the same object. + * Give up on trying to find the alloc entry. 
+ */ + if (free_found) + break; + + info->free_track.pid = pid; + info->free_track.stack = stack; + free_found = true; + } else { + /* Second alloc of the same object. Give up. */ + if (alloc_found) + break; + + info->alloc_track.pid = pid; + info->alloc_track.stack = stack; + alloc_found = true; + } + } + + write_unlock_irqrestore(&stack_ring.lock, flags); } diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 39a0481e5228c..a0524e037f499 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -6,6 +6,7 @@ * Copyright (c) 2020 Google, Inc. */ +#include #include #include #include @@ -16,11 +17,62 @@ #include #include "kasan.h" +#include "../slab.h" + +/* Non-zero, as initial pointer values are 0. */ +#define STACK_RING_BUSY_PTR ((void *)1) + +struct kasan_stack_ring stack_ring = { + .lock = __RW_LOCK_UNLOCKED(stack_ring.lock) +}; + +static void save_stack_info(struct kmem_cache *cache, void *object, + gfp_t gfp_flags, bool is_free) +{ + unsigned long flags; + depot_stack_handle_t stack; + u64 pos; + struct kasan_stack_ring_entry *entry; + void *old_ptr; + + stack = kasan_save_stack(gfp_flags, true); + + /* + * Prevent save_stack_info() from modifying stack ring + * when kasan_complete_mode_report_info() is walking it. + */ + read_lock_irqsave(&stack_ring.lock, flags); + +next: + pos = atomic64_fetch_add(1, &stack_ring.pos); + entry = &stack_ring.entries[pos % KASAN_STACK_RING_SIZE]; + + /* Detect stack ring entry slots that are being written to. */ + old_ptr = READ_ONCE(entry->ptr); + if (old_ptr == STACK_RING_BUSY_PTR) + goto next; /* Busy slot. */ + if (!try_cmpxchg(&entry->ptr, &old_ptr, STACK_RING_BUSY_PTR)) + goto next; /* Busy slot. */ + + WRITE_ONCE(entry->size, cache->object_size); + WRITE_ONCE(entry->pid, current->pid); + WRITE_ONCE(entry->stack, stack); + WRITE_ONCE(entry->is_free, is_free); + + /* + * Paired with smp_load_acquire() in kasan_complete_mode_report_info(). 
+ */ + smp_store_release(&entry->ptr, (s64)object); + + read_unlock_irqrestore(&stack_ring.lock, flags); +} void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { + save_stack_info(cache, object, flags, false); } void kasan_save_free_info(struct kmem_cache *cache, void *object) { + save_stack_info(cache, object, GFP_NOWAIT, true); } -- GitLab From 7ebfce33125100e3f0c5e059845a019a1401433d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:46 +0200 Subject: [PATCH 0967/2223] kasan: support kasan.stacktrace for SW_TAGS Add support for the kasan.stacktrace command-line argument for Software Tag-Based KASAN. The following patch adds a command-line argument for selecting the stack ring size, and, as the stack ring is supported by both the Software and the Hardware Tag-Based KASAN modes, it is natural that both of them have support for kasan.stacktrace too. Link: https://lkml.kernel.org/r/3b43059103faa7f8796017847b7d674b658f11b5.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 15 ++++++----- mm/kasan/hw_tags.c | 39 +--------------------------- mm/kasan/kasan.h | 36 +++++++++++++++++--------- mm/kasan/sw_tags.c | 5 +++- mm/kasan/tags.c | 43 +++++++++++++++++++++++++++++++ 5 files changed, 81 insertions(+), 57 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 1772fd457fed9..7bd38c1810185 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -111,9 +111,15 @@ parameter can be used to control panic and reporting behaviour: report or also panic the kernel (default: ``report``). The panic happens even if ``kasan_multi_shot`` is enabled. 
-Hardware Tag-Based KASAN mode (see the section about various modes below) is -intended for use in production as a security mitigation. Therefore, it supports -additional boot parameters that allow disabling KASAN or controlling features: +Software and Hardware Tag-Based KASAN modes (see the section about various +modes below) support disabling stack trace collection: + +- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack + traces collection (default: ``on``). + +Hardware Tag-Based KASAN mode is intended for use in production as a security +mitigation. Therefore, it supports additional boot parameters that allow +disabling KASAN altogether or controlling its features: - ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``). @@ -132,9 +138,6 @@ additional boot parameters that allow disabling KASAN or controlling features: - ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc allocations (default: ``on``). -- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack - traces collection (default: ``on``). - Error reports ~~~~~~~~~~~~~ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 9ad8eff71b28d..b22c4f461cb0b 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -38,16 +38,9 @@ enum kasan_arg_vmalloc { KASAN_ARG_VMALLOC_ON, }; -enum kasan_arg_stacktrace { - KASAN_ARG_STACKTRACE_DEFAULT, - KASAN_ARG_STACKTRACE_OFF, - KASAN_ARG_STACKTRACE_ON, -}; - static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata; -static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; /* * Whether KASAN is enabled at all. @@ -66,9 +59,6 @@ EXPORT_SYMBOL_GPL(kasan_mode); /* Whether to enable vmalloc tagging. */ DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc); -/* Whether to collect alloc/free stack traces. 
*/ -DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); - /* kasan=off/on */ static int __init early_kasan_flag(char *arg) { @@ -122,23 +112,6 @@ static int __init early_kasan_flag_vmalloc(char *arg) } early_param("kasan.vmalloc", early_kasan_flag_vmalloc); -/* kasan.stacktrace=off/on */ -static int __init early_kasan_flag_stacktrace(char *arg) -{ - if (!arg) - return -EINVAL; - - if (!strcmp(arg, "off")) - kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF; - else if (!strcmp(arg, "on")) - kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON; - else - return -EINVAL; - - return 0; -} -early_param("kasan.stacktrace", early_kasan_flag_stacktrace); - static inline const char *kasan_mode_info(void) { if (kasan_mode == KASAN_MODE_ASYNC) @@ -213,17 +186,7 @@ void __init kasan_init_hw_tags(void) break; } - switch (kasan_arg_stacktrace) { - case KASAN_ARG_STACKTRACE_DEFAULT: - /* Default is specified by kasan_flag_stacktrace definition. */ - break; - case KASAN_ARG_STACKTRACE_OFF: - static_branch_disable(&kasan_flag_stacktrace); - break; - case KASAN_ARG_STACKTRACE_ON: - static_branch_enable(&kasan_flag_stacktrace); - break; - } + kasan_init_tags(); /* KASAN is now initialized, enable it. 
*/ static_branch_enable(&kasan_flag_enabled); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cfff81139d67e..447baf1a7a2e4 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -8,13 +8,31 @@ #include #include -#ifdef CONFIG_KASAN_HW_TAGS +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) #include + +DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); + +static inline bool kasan_stack_collection_enabled(void) +{ + return static_branch_unlikely(&kasan_flag_stacktrace); +} + +#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + +static inline bool kasan_stack_collection_enabled(void) +{ + return true; +} + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_HW_TAGS + #include "../slab.h" DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc); -DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); enum kasan_mode { KASAN_MODE_SYNC, @@ -29,11 +47,6 @@ static inline bool kasan_vmalloc_enabled(void) return static_branch_likely(&kasan_flag_vmalloc); } -static inline bool kasan_stack_collection_enabled(void) -{ - return static_branch_unlikely(&kasan_flag_stacktrace); -} - static inline bool kasan_async_fault_possible(void) { return kasan_mode == KASAN_MODE_ASYNC || kasan_mode == KASAN_MODE_ASYMM; @@ -46,11 +59,6 @@ static inline bool kasan_sync_fault_possible(void) #else /* CONFIG_KASAN_HW_TAGS */ -static inline bool kasan_stack_collection_enabled(void) -{ - return true; -} - static inline bool kasan_async_fault_possible(void) { return false; @@ -410,6 +418,10 @@ static inline void kasan_enable_tagging(void) { } #endif /* CONFIG_KASAN_HW_TAGS */ +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) +void __init kasan_init_tags(void); +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + #if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_force_async_fault(void); diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 77f13f391b577..a3afaf2ad1b11 100644 --- a/mm/kasan/sw_tags.c 
+++ b/mm/kasan/sw_tags.c @@ -42,7 +42,10 @@ void __init kasan_init_sw_tags(void) for_each_possible_cpu(cpu) per_cpu(prng_state, cpu) = (u32)get_cycles(); - pr_info("KernelAddressSanitizer initialized (sw-tags)\n"); + kasan_init_tags(); + + pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n", + kasan_stack_collection_enabled() ? "on" : "off"); } /* diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index a0524e037f499..dd929ab166fb4 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -19,6 +19,17 @@ #include "kasan.h" #include "../slab.h" +enum kasan_arg_stacktrace { + KASAN_ARG_STACKTRACE_DEFAULT, + KASAN_ARG_STACKTRACE_OFF, + KASAN_ARG_STACKTRACE_ON, +}; + +static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; + +/* Whether to collect alloc/free stack traces. */ +DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); + /* Non-zero, as initial pointer values are 0. */ #define STACK_RING_BUSY_PTR ((void *)1) @@ -26,6 +37,38 @@ struct kasan_stack_ring stack_ring = { .lock = __RW_LOCK_UNLOCKED(stack_ring.lock) }; +/* kasan.stacktrace=off/on */ +static int __init early_kasan_flag_stacktrace(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "off")) + kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF; + else if (!strcmp(arg, "on")) + kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON; + else + return -EINVAL; + + return 0; +} +early_param("kasan.stacktrace", early_kasan_flag_stacktrace); + +void __init kasan_init_tags(void) +{ + switch (kasan_arg_stacktrace) { + case KASAN_ARG_STACKTRACE_DEFAULT: + /* Default is specified by kasan_flag_stacktrace definition. 
*/ + break; + case KASAN_ARG_STACKTRACE_OFF: + static_branch_disable(&kasan_flag_stacktrace); + break; + case KASAN_ARG_STACKTRACE_ON: + static_branch_enable(&kasan_flag_stacktrace); + break; + } +} + static void save_stack_info(struct kmem_cache *cache, void *object, gfp_t gfp_flags, bool is_free) { -- GitLab From 80b92bfe3bb75aa6688f58af9df356757a46f659 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:47 +0200 Subject: [PATCH 0968/2223] kasan: dynamically allocate stack ring entries Instead of using a large static array, allocate the stack ring dynamically via memblock_alloc(). The size of the stack ring is controlled by a new kasan.stack_ring_size command-line parameter. When kasan.stack_ring_size is not provided, the default value of 32 << 10 is used. When the stack trace collection is disabled via kasan.stacktrace=off, the stack ring is not allocated. Link: https://lkml.kernel.org/r/03b82ab60db53427e9818e0b0c1971baa10c3cbc.1662411800.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 4 +++- mm/kasan/kasan.h | 5 ++--- mm/kasan/report_tags.c | 4 ++-- mm/kasan/tags.c | 25 ++++++++++++++++++++++++- 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 7bd38c1810185..5c93ab9150494 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -112,10 +112,12 @@ parameter can be used to control panic and reporting behaviour: if ``kasan_multi_shot`` is enabled. 
Software and Hardware Tag-Based KASAN modes (see the section about various -modes below) support disabling stack trace collection: +modes below) support altering stack trace collection behavior: - ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack traces collection (default: ``on``). +- ``kasan.stack_ring_size=<number of entries>`` specifies the number of entries + in the stack ring (default: ``32768``). Hardware Tag-Based KASAN mode is intended for use in production as a security mitigation. Therefore, it supports additional boot parameters that allow diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 447baf1a7a2e4..abbcc1b0eec50 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -252,12 +252,11 @@ struct kasan_stack_ring_entry { bool is_free; }; -#define KASAN_STACK_RING_SIZE (32 << 10) - struct kasan_stack_ring { rwlock_t lock; + size_t size; atomic64_t pos; - struct kasan_stack_ring_entry entries[KASAN_STACK_RING_SIZE]; + struct kasan_stack_ring_entry *entries; }; #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 1b78136542bb6..57f7355377f14 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -56,11 +56,11 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) * entries relevant to the buggy object can be overwritten. */ - for (u64 i = pos - 1; i != pos - 1 - KASAN_STACK_RING_SIZE; i--) { + for (u64 i = pos - 1; i != pos - 1 - stack_ring.size; i--) { if (alloc_found && free_found) break; - entry = &stack_ring.entries[i % KASAN_STACK_RING_SIZE]; + entry = &stack_ring.entries[i % stack_ring.size]; /* Paired with smp_store_release() in save_stack_info().
*/ ptr = (void *)smp_load_acquire(&entry->ptr); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index dd929ab166fb4..67a222586846e 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -19,6 +20,8 @@ #include "kasan.h" #include "../slab.h" +#define KASAN_STACK_RING_SIZE_DEFAULT (32 << 10) + enum kasan_arg_stacktrace { KASAN_ARG_STACKTRACE_DEFAULT, KASAN_ARG_STACKTRACE_OFF, @@ -54,6 +57,16 @@ static int __init early_kasan_flag_stacktrace(char *arg) } early_param("kasan.stacktrace", early_kasan_flag_stacktrace); +/* kasan.stack_ring_size= */ +static int __init early_kasan_flag_stack_ring_size(char *arg) +{ + if (!arg) + return -EINVAL; + + return kstrtoul(arg, 0, &stack_ring.size); +} +early_param("kasan.stack_ring_size", early_kasan_flag_stack_ring_size); + void __init kasan_init_tags(void) { switch (kasan_arg_stacktrace) { @@ -67,6 +80,16 @@ void __init kasan_init_tags(void) static_branch_enable(&kasan_flag_stacktrace); break; } + + if (kasan_stack_collection_enabled()) { + if (!stack_ring.size) + stack_ring.size = KASAN_STACK_RING_SIZE_DEFAULT; + stack_ring.entries = memblock_alloc( + sizeof(stack_ring.entries[0]) * stack_ring.size, + SMP_CACHE_BYTES); + if (WARN_ON(!stack_ring.entries)) + static_branch_disable(&kasan_flag_stacktrace); + } } static void save_stack_info(struct kmem_cache *cache, void *object, @@ -88,7 +111,7 @@ static void save_stack_info(struct kmem_cache *cache, void *object, next: pos = atomic64_fetch_add(1, &stack_ring.pos); - entry = &stack_ring.entries[pos % KASAN_STACK_RING_SIZE]; + entry = &stack_ring.entries[pos % stack_ring.size]; /* Detect stack ring entry slots that are being written to. 
*/ old_ptr = READ_ONCE(entry->ptr); -- GitLab From 1f538e1f2d294cf8a9486fb1a7d4d4f0d16e2b01 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:48 +0200 Subject: [PATCH 0969/2223] kasan: better identify bug types for tag-based modes Identify the bug type for the tag-based modes based on the stack trace entries found in the stack ring. If a free entry is found first (meaning that it was added last), mark the bug as use-after-free. If an alloc entry is found first, mark the bug as slab-out-of-bounds. Otherwise, assign the common bug type. This change returns the functionality of the previously dropped CONFIG_KASAN_TAGS_IDENTIFY. Link: https://lkml.kernel.org/r/13ce7fa07d9d995caedd1439dfae4d51401842f2.1662411800.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/report_tags.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 57f7355377f14..d3510424d29be 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -10,7 +10,7 @@ extern struct kasan_stack_ring stack_ring; -static const char *get_bug_type(struct kasan_report_info *info) +static const char *get_common_bug_type(struct kasan_report_info *info) { /* * If access_size is a negative number, then it has reason to be @@ -37,9 +37,8 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) bool is_free; bool alloc_found = false, free_found = false; - info->bug_type = get_bug_type(info); - - if (!info->cache || !info->object) + if (!info->cache || !info->object) { + info->bug_type = get_common_bug_type(info); return; } @@ -84,6 +83,13 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) info->free_track.pid = pid; info->free_track.stack = stack; free_found = true; + + /* + * If a
free entry is found first, the bug is likely + * a use-after-free. + */ + if (!info->bug_type) + info->bug_type = "use-after-free"; } else { /* Second alloc of the same object. Give up. */ if (alloc_found) @@ -92,8 +98,19 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) info->alloc_track.pid = pid; info->alloc_track.stack = stack; alloc_found = true; + + /* + * If an alloc entry is found first, the bug is likely + * an out-of-bounds. + */ + if (!info->bug_type) + info->bug_type = "slab-out-of-bounds"; } } write_unlock_irqrestore(&stack_ring.lock, flags); + + /* Assign the common bug type if no entries were found. */ + if (!info->bug_type) + info->bug_type = get_common_bug_type(info); } -- GitLab From 34b592ce5cc2dbd7d94812bff12ec32d3ec6f65c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:49 +0200 Subject: [PATCH 0970/2223] kasan: add another use-after-free test Add a new use-after-free test that checks that KASAN detects use-after-free when another object was allocated in the same slot. This test is mainly relevant for the tag-based modes, which do not use quarantine. Once [1] is resolved, this test can be extended to check that the stack traces in the report point to the proper kmalloc/kfree calls. [1] https://bugzilla.kernel.org/show_bug.cgi?id=212203 Link: https://lkml.kernel.org/r/0659cfa15809dd38faa02bc0a59d0b5dbbd81211.1662411800.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- lib/test_kasan.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 58c1b01ccfe20..505f77ffad279 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -612,6 +612,29 @@ again: kfree(ptr2); } +/* + * Check that KASAN detects use-after-free when another object was allocated in + * the same slot. 
Relevant for the tag-based modes, which do not use quarantine. + */ +static void kmalloc_uaf3(struct kunit *test) +{ + char *ptr1, *ptr2; + size_t size = 100; + + /* This test is specifically crafted for tag-based modes. */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + ptr1 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + kfree(ptr1); + + ptr2 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + kfree(ptr2); + + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]); +} + static void kfree_via_page(struct kunit *test) { char *ptr; @@ -1382,6 +1405,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmalloc_uaf), KUNIT_CASE(kmalloc_uaf_memset), KUNIT_CASE(kmalloc_uaf2), + KUNIT_CASE(kmalloc_uaf3), KUNIT_CASE(kfree_via_page), KUNIT_CASE(kfree_via_phys), KUNIT_CASE(kmem_cache_oob), -- GitLab From f7e01ab828fd4bf6d25b1f143a3994241e8572bf Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 6 Sep 2022 00:18:36 +0200 Subject: [PATCH 0971/2223] kasan: move tests to mm/kasan/ Move KASAN tests to mm/kasan/ to keep the test code alongside the implementation. 
Link: https://lkml.kernel.org/r/676398f0aeecd47d2f8e3369ea0e95563f641a36.1662416260.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- MAINTAINERS | 1 - lib/Makefile | 5 ----- mm/kasan/Makefile | 8 ++++++++ lib/test_kasan.c => mm/kasan/kasan_test.c | 2 +- lib/test_kasan_module.c => mm/kasan/kasan_test_module.c | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) rename lib/test_kasan.c => mm/kasan/kasan_test.c (99%) rename lib/test_kasan_module.c => mm/kasan/kasan_test_module.c (99%) diff --git a/MAINTAINERS b/MAINTAINERS index c66b63ad83d84..6f1033f3c1eda 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10938,7 +10938,6 @@ F: arch/*/include/asm/*kasan.h F: arch/*/mm/kasan_init* F: include/linux/kasan*.h F: lib/Kconfig.kasan -F: lib/test_kasan*.c F: mm/kasan/ F: scripts/Makefile.kasan diff --git a/lib/Makefile b/lib/Makefile index 6dc0d6f8e57d7..d7d94102991b3 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -65,11 +65,6 @@ obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_SIPHASH) += test_siphash.o obj-$(CONFIG_HASH_KUNIT_TEST) += test_hash.o obj-$(CONFIG_TEST_IDA) += test_ida.o -obj-$(CONFIG_KASAN_KUNIT_TEST) += test_kasan.o -CFLAGS_test_kasan.o += -fno-builtin -CFLAGS_test_kasan.o += $(call cc-disable-warning, vla) -obj-$(CONFIG_KASAN_MODULE_TEST) += test_kasan_module.o -CFLAGS_test_kasan_module.o += -fno-builtin obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o CFLAGS_test_ubsan.o += $(call cc-disable-warning, vla) UBSAN_SANITIZE_test_ubsan.o := y diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 1f84df9c302e7..d4837bff3b60f 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -35,7 +35,15 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) -fno-builtin $(call 
cc-disable-warning, vla) + +CFLAGS_kasan_test.o := $(CFLAGS_KASAN_TEST) +CFLAGS_kasan_test_module.o := $(CFLAGS_KASAN_TEST) + obj-y := common.o report.o obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o + +obj-$(CONFIG_KASAN_KUNIT_TEST) += kasan_test.o +obj-$(CONFIG_KASAN_MODULE_TEST) += kasan_test_module.o diff --git a/lib/test_kasan.c b/mm/kasan/kasan_test.c similarity index 99% rename from lib/test_kasan.c rename to mm/kasan/kasan_test.c index 505f77ffad279..f25692def7813 100644 --- a/lib/test_kasan.c +++ b/mm/kasan/kasan_test.c @@ -25,7 +25,7 @@ #include -#include "../mm/kasan/kasan.h" +#include "kasan.h" #define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE) diff --git a/lib/test_kasan_module.c b/mm/kasan/kasan_test_module.c similarity index 99% rename from lib/test_kasan_module.c rename to mm/kasan/kasan_test_module.c index b112cbc835e90..e4ca82dc2c16d 100644 --- a/lib/test_kasan_module.c +++ b/mm/kasan/kasan_test_module.c @@ -13,7 +13,7 @@ #include #include -#include "../mm/kasan/kasan.h" +#include "kasan.h" static noinline void __init copy_user_test(void) { -- GitLab From dcc579663f607392ade99a2301278239e819f57e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Sun, 11 Sep 2022 01:25:30 +0200 Subject: [PATCH 0972/2223] kasan: better invalid/double-free report header Update the report header for invalid- and double-free bugs to contain the address being freed: BUG: KASAN: invalid-free in kfree+0x280/0x2a8 Free of addr ffff00000beac001 by task kunit_try_catch/99 Link: https://lkml.kernel.org/r/fce40f8dbd160972fe01a1ff39d0c426c310e4b7.1662852281.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- 
mm/kasan/report.c | 23 ++++++++++++++++------- mm/kasan/report_generic.c | 3 ++- mm/kasan/report_tags.c | 2 +- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 39e8e5a80b829..df3602062bfd6 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -175,17 +175,14 @@ static void end_report(unsigned long *flags, void *addr) static void print_error_description(struct kasan_report_info *info) { - if (info->type == KASAN_REPORT_INVALID_FREE) { - pr_err("BUG: KASAN: invalid-free in %pS\n", (void *)info->ip); - return; - } + pr_err("BUG: KASAN: %s in %pS\n", info->bug_type, (void *)info->ip); - if (info->type == KASAN_REPORT_DOUBLE_FREE) { - pr_err("BUG: KASAN: double-free in %pS\n", (void *)info->ip); + if (info->type != KASAN_REPORT_ACCESS) { + pr_err("Free of addr %px by task %s/%d\n", + info->access_addr, current->comm, task_pid_nr(current)); return; } - pr_err("BUG: KASAN: %s in %pS\n", info->bug_type, (void *)info->ip); if (info->access_size) pr_err("%s of size %zu at addr %px by task %s/%d\n", info->is_write ? "Write" : "Read", info->access_size, @@ -420,6 +417,18 @@ static void complete_report_info(struct kasan_report_info *info) } else info->cache = info->object = NULL; + switch (info->type) { + case KASAN_REPORT_INVALID_FREE: + info->bug_type = "invalid-free"; + break; + case KASAN_REPORT_DOUBLE_FREE: + info->bug_type = "double-free"; + break; + default: + /* bug_type filled in by kasan_complete_mode_report_info. */ + break; + } + /* Fill in mode-specific report info fields. 
*/ kasan_complete_mode_report_info(info); } diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 087c1d8c81456..043c94b046054 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -132,7 +132,8 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) struct kasan_alloc_meta *alloc_meta; struct kasan_free_meta *free_meta; - info->bug_type = get_bug_type(info); + if (!info->bug_type) + info->bug_type = get_bug_type(info); if (!info->cache || !info->object) return; diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index d3510424d29be..ecede06ef374a 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -37,7 +37,7 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) bool is_free; bool alloc_found = false, free_found = false; - if (!info->cache || !info->object) { + if ((!info->cache || !info->object) && !info->bug_type) { info->bug_type = get_common_bug_type(info); return; } -- GitLab From 6a760f58c792b6f7411f886271bb03f697464433 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Penttil=C3=A4?= Date: Fri, 26 Aug 2022 08:06:31 +0300 Subject: [PATCH 0973/2223] mm/hmm/test: use char dev with struct device to get device node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HMM selftests use an in-kernel pseudo device to emulate device memory. The pseudo device registers a major device range for two or four pseudo device instances. User space has a script that reads /proc/devices in order to find the assigned major number, and sends that to mknod(1), once for each node. Change this to properly use cdev and struct device APIs. Delete the /proc/devices parsing from the user-space test script, now that it is unnecessary. Also, delete an unused field in struct dmirror_device: devmem. 
Link: https://lkml.kernel.org/r/20220826050631.25771-1-mpenttil@redhat.com Signed-off-by: Mika Penttilä Reviewed-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Cc: Alistair Popple Cc: Ralph Campbell Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/test_hmm.c | 13 ++++++++++--- tools/testing/selftests/vm/test_hmm.sh | 10 ---------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e3965cafd27cf..6a33f6b1b4651 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -107,8 +107,8 @@ struct dmirror_chunk { */ struct dmirror_device { struct cdev cdevice; - struct hmm_devmem *devmem; unsigned int zone_device_type; + struct device device; unsigned int devmem_capacity; unsigned int devmem_count; @@ -1390,7 +1390,14 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id) cdev_init(&mdevice->cdevice, &dmirror_fops); mdevice->cdevice.owner = THIS_MODULE; - ret = cdev_add(&mdevice->cdevice, dev, 1); + device_initialize(&mdevice->device); + mdevice->device.devt = dev; + + ret = dev_set_name(&mdevice->device, "hmm_dmirror%u", id); + if (ret) + return ret; + + ret = cdev_device_add(&mdevice->cdevice, &mdevice->device); if (ret) return ret; @@ -1416,7 +1423,7 @@ static void dmirror_device_remove(struct dmirror_device *mdevice) kfree(mdevice->devmem_chunks); } - cdev_del(&mdevice->cdevice); + cdev_device_del(&mdevice->cdevice, &mdevice->device); } static int __init hmm_dmirror_init(void) diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh index 539c9371e592a..46e19b5d648d6 100755 --- a/tools/testing/selftests/vm/test_hmm.sh +++ b/tools/testing/selftests/vm/test_hmm.sh @@ -52,21 +52,11 @@ load_driver() usage fi fi - if [ $? 
== 0 ]; then - major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices) - mknod /dev/hmm_dmirror0 c $major 0 - mknod /dev/hmm_dmirror1 c $major 1 - if [ $# -eq 2 ]; then - mknod /dev/hmm_dmirror2 c $major 2 - mknod /dev/hmm_dmirror3 c $major 3 - fi - fi } unload_driver() { modprobe -r $DRIVER > /dev/null 2>&1 - rm -f /dev/hmm_dmirror? } run_smoke() -- GitLab From 36001cba4f728e7fa2a58bc69fece22eaeef5cca Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 6 Sep 2022 23:18:47 +0800 Subject: [PATCH 0974/2223] mm/damon/core: iterate the regions list from current point in damon_set_regions() We iterate the whole regions list every time to get the first/last regions intersecting with the specific range in damon_set_regions(), in order to add new region or resize existing regions to fit in the specific range. Actually, it is unnecessary to iterate the new added regions and the front regions that have been checked. Just iterate the regions list from the current point using list_for_each_entry_from() every time to improve performance. 
The kunit tests passed: [PASSED] damon_test_apply_three_regions1 [PASSED] damon_test_apply_three_regions2 [PASSED] damon_test_apply_three_regions3 [PASSED] damon_test_apply_three_regions4 Link: https://lkml.kernel.org/r/1662477527-13003-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 8 ++++++++ mm/damon/core.c | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 7b1f4a4882308..d54acec048d6f 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -463,9 +463,17 @@ static inline struct damon_region *damon_last_region(struct damon_target *t) return list_last_entry(&t->regions_list, struct damon_region, list); } +static inline struct damon_region *damon_first_region(struct damon_target *t) +{ + return list_first_entry(&t->regions_list, struct damon_region, list); +} + #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) +#define damon_for_each_region_from(r, t) \ + list_for_each_entry_from(r, &t->regions_list, list) + #define damon_for_each_region_safe(r, next, t) \ list_for_each_entry_safe(r, next, &t->regions_list, list) diff --git a/mm/damon/core.c b/mm/damon/core.c index 9964b9d007686..5e00c04ceef04 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -195,6 +195,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, damon_destroy_region(r, t); } + r = damon_first_region(t); /* Add new regions or resize existing regions to fit in the ranges */ for (i = 0; i < nr_ranges; i++) { struct damon_region *first = NULL, *last, *newr; @@ -202,7 +203,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, range = &ranges[i]; /* Get the first/last regions intersecting with the range */ - damon_for_each_region(r, t) { + damon_for_each_region_from(r, t) { if (damon_intersect(r, range)) { if (!first) first = r; -- 
GitLab From 61768a1b37c664faf028d925e6b7825768afcc00 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Wed, 7 Sep 2022 16:41:16 +0800 Subject: [PATCH 0975/2223] mm/damon: simplify damon_ctx check in damon_sysfs_before_terminate In damon_sysfs_before_terminate(), it needs to check whether ctx->ops.id supports 'DAMON_OPS_VADDR' or 'DAMON_OPS_FVADDR', there we can use damon_target_has_pid() instead. Link: https://lkml.kernel.org/r/20220907084116.62053-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index fe6c6870cf868..1719bb3531e30 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2309,7 +2309,7 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - if (ctx->ops.id != DAMON_OPS_VADDR && ctx->ops.id != DAMON_OPS_FVADDR) + if (!damon_target_has_pid(ctx)) return; mutex_lock(&ctx->kdamond_lock); -- GitLab From 0bba9af03d55d2cc1aa7616a8b9e522ceb49d180 Mon Sep 17 00:00:00 2001 From: Zhenhua Huang Date: Wed, 7 Sep 2022 16:01:13 +0800 Subject: [PATCH 0976/2223] mm/page_owner.c: remove redundant drain_all_pages Remove an expensive and unnecessary operation as PCP pages are safely skipped when reading page owner. PCP pages can be skipped because PAGE_EXT_OWNER_ALLOCATED is cleared. With draining PCP pages, these pages are moved to buddy list so they can be identified as buddy pages and skipped quickly. Although it improved efficiency of PFN walker, the drain is guaranteed to be expensive, and that cost is unlikely to be offset by a slight increase in efficiency when skipping free pages.
PAGE_EXT_OWNER_ALLOCATED is cleared in the page owner reset path below: free_unref_page -> free_unref_page_prepare -> free_pcp_prepare -> free_pages_prepare which do page owner reset -> free_unref_page_commit which add pages into pcp list Link: https://lkml.kernel.org/r/1662704326-15899-1-git-send-email-quic_zhenhuah@quicinc.com Link: https://lkml.kernel.org/r/1662633204-10044-1-git-send-email-quic_zhenhuah@quicinc.com Link: https://lkml.kernel.org/r/1662537673-9392-1-git-send-email-quic_zhenhuah@quicinc.com Signed-off-by: Zhenhua Huang Acked-by: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 90023f938c19f..54f3e039fb483 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -524,8 +524,6 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) pfn++; - drain_all_pages(NULL); - /* Find an allocated page */ for (; pfn < max_pfn; pfn++) { /* -- GitLab From 4f9bc69ac5ce34071a9a51343bc81ca76cb2e3f1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 14:08:42 +0800 Subject: [PATCH 0977/2223] mm: reuse pageblock_start/end_pfn() macro Move pageblock_start_pfn/pageblock_end_pfn() into pageblock-flags.h, then they could be used somewhere else, not only in compaction, also use ALIGN_DOWN() instead of round_down() to be pair with ALIGN(), which should be same for pageblock usage. 
Link: https://lkml.kernel.org/r/20220907060844.126891-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 2 ++ mm/compaction.c | 2 -- mm/memblock.c | 2 +- mm/page_alloc.c | 13 ++++++------- mm/page_isolation.c | 11 +++++------ mm/page_owner.c | 4 ++-- 6 files changed, 16 insertions(+), 18 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 83c7248053a1e..a09b7fe6bbf8e 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -53,6 +53,8 @@ extern unsigned int pageblock_order; #endif /* CONFIG_HUGETLB_PAGE */ #define pageblock_nr_pages (1UL << pageblock_order) +#define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) +#define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages) /* Forward declaration */ struct page; diff --git a/mm/compaction.c b/mm/compaction.c index 262c4676b32c1..9cbe8562b63ac 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -52,8 +52,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order)) #define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order)) -#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order) -#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order) /* * Page order with-respect-to which proactive compaction diff --git a/mm/memblock.c b/mm/memblock.c index b5d3026979fcc..46fe7575f03c6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2000,7 +2000,7 @@ static void __init free_unused_memmap(void) * presume that there are no holes in the memory map inside * a pageblock */ - start = round_down(start, pageblock_nr_pages); + start = pageblock_start_pfn(start); /* * If we had a previous bank, and there is a space diff --git a/mm/page_alloc.c 
b/mm/page_alloc.c index 44f3c93643161..1637db90472ed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -544,7 +544,7 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) #ifdef CONFIG_SPARSEMEM pfn &= (PAGES_PER_SECTION-1); #else - pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); + pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn); #endif /* CONFIG_SPARSEMEM */ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; } @@ -1857,7 +1857,7 @@ void set_zone_contiguous(struct zone *zone) unsigned long block_start_pfn = zone->zone_start_pfn; unsigned long block_end_pfn; - block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(block_start_pfn); for (; block_start_pfn < zone_end_pfn(zone); block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { @@ -2653,8 +2653,8 @@ int move_freepages_block(struct zone *zone, struct page *page, *num_movable = 0; pfn = page_to_pfn(page); - start_pfn = pfn & ~(pageblock_nr_pages - 1); - end_pfn = start_pfn + pageblock_nr_pages - 1; + start_pfn = pageblock_start_pfn(pfn); + end_pfn = pageblock_end_pfn(pfn) - 1; /* Do not cross zone boundaries */ if (!zone_spans_pfn(zone, start_pfn)) @@ -6934,9 +6934,8 @@ static void __init init_unavailable_range(unsigned long spfn, u64 pgcnt = 0; for (pfn = spfn; pfn < epfn; pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) - + pageblock_nr_pages - 1; + if (!pfn_valid(pageblock_start_pfn(pfn))) { + pfn = pageblock_end_pfn(pfn) - 1; continue; } __init_single_page(pfn_to_page(pfn), pfn, zone, node); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index eb3a68ca92ad9..5819cb9c62f37 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -37,8 +37,8 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e struct zone *zone = page_zone(page); unsigned long pfn; - 
VM_BUG_ON(ALIGN_DOWN(start_pfn, pageblock_nr_pages) != - ALIGN_DOWN(end_pfn - 1, pageblock_nr_pages)); + VM_BUG_ON(pageblock_start_pfn(start_pfn) != + pageblock_start_pfn(end_pfn - 1)); if (is_migrate_cma_page(page)) { /* @@ -172,7 +172,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ * to avoid redundant checks. */ check_unmovable_start = max(page_to_pfn(page), start_pfn); - check_unmovable_end = min(ALIGN(page_to_pfn(page) + 1, pageblock_nr_pages), + check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)), end_pfn); unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, @@ -532,7 +532,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long pfn; struct page *page; /* isolation is done at page block granularity */ - unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages); + unsigned long isolate_start = pageblock_start_pfn(start_pfn); unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); int ret; bool skip_isolation = false; @@ -579,10 +579,9 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, { unsigned long pfn; struct page *page; - unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages); + unsigned long isolate_start = pageblock_start_pfn(start_pfn); unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); - for (pfn = isolate_start; pfn < isolate_end; pfn += pageblock_nr_pages) { diff --git a/mm/page_owner.c b/mm/page_owner.c index 54f3e039fb483..2d27f532df4c1 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -297,7 +297,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, continue; } - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); pageblock_mt = get_pageblock_migratetype(page); @@ -635,7 +635,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) continue; } 
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); for (; pfn < block_end_pfn; pfn++) { -- GitLab From 5f7fa13fa858c17580ed513bd5e0a4b36d68fdd6 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 14:08:43 +0800 Subject: [PATCH 0978/2223] mm: add pageblock_align() macro Add pageblock_align() macro and use it to simplify code. Link: https://lkml.kernel.org/r/20220907060844.126891-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 1 + mm/memblock.c | 4 ++-- mm/page_isolation.c | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index a09b7fe6bbf8e..293c76630fa8b 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -53,6 +53,7 @@ extern unsigned int pageblock_order; #endif /* CONFIG_HUGETLB_PAGE */ #define pageblock_nr_pages (1UL << pageblock_order) +#define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) #define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) #define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages) diff --git a/mm/memblock.c b/mm/memblock.c index 46fe7575f03c6..511d4783dcf1d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2014,12 +2014,12 @@ static void __init free_unused_memmap(void) * presume that there are no holes in the memory map inside * a pageblock */ - prev_end = ALIGN(end, pageblock_nr_pages); + prev_end = pageblock_align(end); } #ifdef CONFIG_SPARSEMEM if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) { - prev_end = ALIGN(end, pageblock_nr_pages); + prev_end = pageblock_align(end); free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION)); } #endif diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 
5819cb9c62f37..fa82faa07dafb 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -533,7 +533,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, struct page *page; /* isolation is done at page block granularity */ unsigned long isolate_start = pageblock_start_pfn(start_pfn); - unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); + unsigned long isolate_end = pageblock_align(end_pfn); int ret; bool skip_isolation = false; @@ -580,7 +580,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long pfn; struct page *page; unsigned long isolate_start = pageblock_start_pfn(start_pfn); - unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); + unsigned long isolate_end = pageblock_align(end_pfn); for (pfn = isolate_start; pfn < isolate_end; -- GitLab From ee0913c4719610204315a0d8a35122c6233249e0 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 14:08:44 +0800 Subject: [PATCH 0979/2223] mm: add pageblock_aligned() macro Add pageblock_aligned() and use it to simplify code. 
Link: https://lkml.kernel.org/r/20220907060844.126891-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport Cc: David Hildenbrand Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 1 + mm/compaction.c | 8 ++++---- mm/memory_hotplug.c | 6 ++---- mm/page_alloc.c | 17 +++++++---------- mm/page_isolation.c | 2 +- 5 files changed, 15 insertions(+), 19 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 293c76630fa8b..5f1ae07d724b8 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -54,6 +54,7 @@ extern unsigned int pageblock_order; #define pageblock_nr_pages (1UL << pageblock_order) #define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) +#define pageblock_aligned(pfn) IS_ALIGNED((pfn), pageblock_nr_pages) #define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) #define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages) diff --git a/mm/compaction.c b/mm/compaction.c index 9cbe8562b63ac..e2a9615f5fded 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -402,7 +402,7 @@ static bool test_and_set_skip(struct compact_control *cc, struct page *page, if (cc->ignore_skip_hint) return false; - if (!IS_ALIGNED(pfn, pageblock_nr_pages)) + if (!pageblock_aligned(pfn)) return false; skip = get_pageblock_skip(page); @@ -884,7 +884,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * COMPACT_CLUSTER_MAX at a time so the second call must * not falsely conclude that the block should be skipped. */ - if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) { + if (!valid_page && pageblock_aligned(low_pfn)) { if (!isolation_suitable(cc, page)) { low_pfn = end_pfn; page = NULL; @@ -1937,7 +1937,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) * before making it "skip" so other compaction instances do * not scan the same block. 
*/ - if (IS_ALIGNED(low_pfn, pageblock_nr_pages) && + if (pageblock_aligned(low_pfn) && !fast_find_block && !isolation_suitable(cc, page)) continue; @@ -2123,7 +2123,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) * migration source is unmovable/reclaimable but it's not worth * special casing. */ - if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) + if (!pageblock_aligned(cc->migrate_pfn)) return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9ae1f98548b10..fd40f7e9f1763 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1085,8 +1085,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, * of the physical memory space for vmemmaps. That space is pageblock * aligned. */ - if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(pfn, pageblock_nr_pages) || + if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(pfn) || !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; @@ -1806,8 +1805,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, * of the physical memory space for vmemmaps. That space is pageblock * aligned. 
*/ - if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(start_pfn, pageblock_nr_pages) || + if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(start_pfn) || !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1637db90472ed..0002ded4ab0e4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1892,15 +1892,14 @@ static void __init deferred_free_range(unsigned long pfn, page = pfn_to_page(pfn); /* Free a large naturally-aligned chunk if possible */ - if (nr_pages == pageblock_nr_pages && - (pfn & (pageblock_nr_pages - 1)) == 0) { + if (nr_pages == pageblock_nr_pages && pageblock_aligned(pfn)) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); __free_pages_core(page, pageblock_order); return; } for (i = 0; i < nr_pages; i++, page++, pfn++) { - if ((pfn & (pageblock_nr_pages - 1)) == 0) + if (pageblock_aligned(pfn)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); __free_pages_core(page, 0); } @@ -1928,7 +1927,7 @@ static inline void __init pgdat_init_report_one_done(void) */ static inline bool __init deferred_pfn_valid(unsigned long pfn) { - if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) + if (pageblock_aligned(pfn) && !pfn_valid(pfn)) return false; return true; } @@ -1940,14 +1939,13 @@ static inline bool __init deferred_pfn_valid(unsigned long pfn) static void __init deferred_free_pages(unsigned long pfn, unsigned long end_pfn) { - unsigned long nr_pgmask = pageblock_nr_pages - 1; unsigned long nr_free = 0; for (; pfn < end_pfn; pfn++) { if (!deferred_pfn_valid(pfn)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 0; - } else if (!(pfn & nr_pgmask)) { + } else if (pageblock_aligned(pfn)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; } else { @@ -1967,7 +1965,6 @@ static unsigned long __init deferred_init_pages(struct zone *zone, unsigned long pfn, unsigned long end_pfn) { - unsigned long nr_pgmask = pageblock_nr_pages - 1; int nid = zone_to_nid(zone); unsigned long 
nr_pages = 0; int zid = zone_idx(zone); @@ -1977,7 +1974,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone, if (!deferred_pfn_valid(pfn)) { page = NULL; continue; - } else if (!page || !(pfn & nr_pgmask)) { + } else if (!page || pageblock_aligned(pfn)) { page = pfn_to_page(pfn); } else { page++; @@ -6759,7 +6756,7 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone * such that unmovable allocations won't be scattered all * over the place during system boot. */ - if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + if (pageblock_aligned(pfn)) { set_pageblock_migratetype(page, migratetype); cond_resched(); } @@ -6802,7 +6799,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, * Please note that MEMINIT_HOTPLUG path doesn't clear memmap * because this is done early in section_activate() */ - if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + if (pageblock_aligned(pfn)) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index fa82faa07dafb..04141a9bea704 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -312,7 +312,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, struct zone *zone; int ret; - VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages)); + VM_BUG_ON(!pageblock_aligned(boundary_pfn)); if (isolate_before) isolate_pageblock = boundary_pfn - pageblock_nr_pages; -- GitLab From fc5dfebc8055426299739dd1a7828af9638c94fb Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 16:26:43 +0800 Subject: [PATCH 0980/2223] memblock tests: add new pageblock related macro Add new pageblock_start_pfn() and pageblock_align() macro which are needed by memblock tests. 
Link: https://lkml.kernel.org/r/20220907082643.186979-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Mike Rapoport Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/memblock/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/memblock/linux/mmzone.h b/tools/testing/memblock/linux/mmzone.h index 7c2eb5c9bb54d..e65f89b12f1cd 100644 --- a/tools/testing/memblock/linux/mmzone.h +++ b/tools/testing/memblock/linux/mmzone.h @@ -22,6 +22,8 @@ enum zone_type { #define pageblock_order (MAX_ORDER - 1) #define pageblock_nr_pages BIT(pageblock_order) +#define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) +#define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) struct zone { atomic_long_t managed_pages; -- GitLab From 410f8e82689e1e66044fea51ef852054a09502b7 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 7 Sep 2022 04:35:35 +0000 Subject: [PATCH 0981/2223] memcg: extract memcg_vmstats from struct mem_cgroup Patch series "memcg: reduce memory overhead of memory cgroups". Currently a lot of memory is wasted to maintain the vmevents for memory cgroups as we have multiple arrays of size NR_VM_EVENT_ITEMS which can be as large as 110. However memcg code uses only a small portion of those entries. This patch series eliminates this overhead by removing the unneeded vmevent entries from memory cgroup data structures. This patch (of 3): This is a preparatory patch to reduce the memory overhead of memory cgroup. The struct memcg_vmstats is the largest object embedded into the struct mem_cgroup. This patch extracts struct memcg_vmstats from struct mem_cgroup to ease the following patches in reducing the size of struct memcg_vmstats.
Link: https://lkml.kernel.org/r/20220907043537.3457014-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20220907043537.3457014-2-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 37 +++---------------------- mm/memcontrol.c | 57 ++++++++++++++++++++++++++++++++------ 2 files changed, 52 insertions(+), 42 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ca0df42662ad1..dc7d40e575d5f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -80,29 +80,8 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, }; -struct memcg_vmstats_percpu { - /* Local (CPU and cgroup) page state & events */ - long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; - - /* Delta calculation for lockless upward propagation */ - long state_prev[MEMCG_NR_STAT]; - unsigned long events_prev[NR_VM_EVENT_ITEMS]; - - /* Cgroup1: threshold notifications & softlimit tree updates */ - unsigned long nr_page_events; - unsigned long targets[MEM_CGROUP_NTARGETS]; -}; - -struct memcg_vmstats { - /* Aggregated (CPU and subtree) page state & events */ - long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; - - /* Pending child counts during tree propagation */ - long state_pending[MEMCG_NR_STAT]; - unsigned long events_pending[NR_VM_EVENT_ITEMS]; -}; +struct memcg_vmstats_percpu; +struct memcg_vmstats; struct mem_cgroup_reclaim_iter { struct mem_cgroup *position; @@ -298,7 +277,7 @@ struct mem_cgroup { CACHELINE_PADDING(_pad1_); /* memory.stat */ - struct memcg_vmstats vmstats; + struct memcg_vmstats *vmstats; /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; @@ -1001,15 +980,7 @@ static inline void mod_memcg_page_state(struct page *page, rcu_read_unlock(); } -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = 
READ_ONCE(memcg->vmstats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} +unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 632402001bca1..0a44a733bb03c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -669,6 +669,40 @@ static void flush_memcg_stats_dwork(struct work_struct *w) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } +struct memcg_vmstats_percpu { + /* Local (CPU and cgroup) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[MEMCG_NR_STAT]; + unsigned long events_prev[NR_VM_EVENT_ITEMS]; + + /* Cgroup1: threshold notifications & softlimit tree updates */ + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; +}; + +struct memcg_vmstats { + /* Aggregated (CPU and subtree) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + + /* Pending child counts during tree propagation */ + long state_pending[MEMCG_NR_STAT]; + unsigned long events_pending[NR_VM_EVENT_ITEMS]; +}; + +unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = READ_ONCE(memcg->vmstats->state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + /** * __mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup @@ -827,7 +861,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, static unsigned long memcg_events(struct mem_cgroup *memcg, int event) { - return READ_ONCE(memcg->vmstats.events[event]); + return READ_ONCE(memcg->vmstats->events[event]); } static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) @@ -5170,6 +5204,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) 
for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); + kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); kfree(memcg); } @@ -5199,6 +5234,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; } + memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL); + if (!memcg->vmstats) + goto fail; + memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, GFP_KERNEL_ACCOUNT); if (!memcg->vmstats_percpu) @@ -5418,9 +5457,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) * below us. We're in a per-cpu loop here and this is * a global counter, so the first cycle will get them. */ - delta = memcg->vmstats.state_pending[i]; + delta = memcg->vmstats->state_pending[i]; if (delta) - memcg->vmstats.state_pending[i] = 0; + memcg->vmstats->state_pending[i] = 0; /* Add CPU changes on this level since the last flush */ v = READ_ONCE(statc->state[i]); @@ -5433,15 +5472,15 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) continue; /* Aggregate counts on this level and propagate upwards */ - memcg->vmstats.state[i] += delta; + memcg->vmstats->state[i] += delta; if (parent) - parent->vmstats.state_pending[i] += delta; + parent->vmstats->state_pending[i] += delta; } for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { - delta = memcg->vmstats.events_pending[i]; + delta = memcg->vmstats->events_pending[i]; if (delta) - memcg->vmstats.events_pending[i] = 0; + memcg->vmstats->events_pending[i] = 0; v = READ_ONCE(statc->events[i]); if (v != statc->events_prev[i]) { @@ -5452,9 +5491,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (!delta) continue; - memcg->vmstats.events[i] += delta; + memcg->vmstats->events[i] += delta; if (parent) - parent->vmstats.events_pending[i] += delta; + parent->vmstats->events_pending[i] += delta; } for_each_node_state(nid, N_MEMORY) { -- GitLab From d396def5d86dbeb4ceb4a9dca92611ce206dc66a Mon Sep 17 00:00:00 2001 
From: Shakeel Butt Date: Wed, 7 Sep 2022 04:35:36 +0000 Subject: [PATCH 0982/2223] memcg: rearrange code This is a preparatory patch for easing the review of the follow up patch which will reduce the memory overhead of memory cgroups. Link: https://lkml.kernel.org/r/20220907043537.3457014-3-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0a44a733bb03c..78fd7cfb4f929 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -669,6 +669,29 @@ static void flush_memcg_stats_dwork(struct work_struct *w) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } +/* Subset of vm_event_item to report for memcg event stats */ +static const unsigned int memcg_vm_event_stat[] = { + PGSCAN_KSWAPD, + PGSCAN_DIRECT, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGFAULT, + PGMAJFAULT, + PGREFILL, + PGACTIVATE, + PGDEACTIVATE, + PGLAZYFREE, + PGLAZYFREED, +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + ZSWPIN, + ZSWPOUT, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + THP_FAULT_ALLOC, + THP_COLLAPSE_ALLOC, +#endif +}; + struct memcg_vmstats_percpu { /* Local (CPU and cgroup) page state & events */ long state[MEMCG_NR_STAT]; @@ -1501,29 +1524,6 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, return memcg_page_state(memcg, item) * memcg_page_state_unit(item); } -/* Subset of vm_event_item to report for memcg event stats */ -static const unsigned int memcg_vm_event_stat[] = { - PGSCAN_KSWAPD, - PGSCAN_DIRECT, - PGSTEAL_KSWAPD, - PGSTEAL_DIRECT, - PGFAULT, - PGMAJFAULT, - PGREFILL, - PGACTIVATE, - PGDEACTIVATE, - PGLAZYFREE, - PGLAZYFREED, -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) - ZSWPIN, - ZSWPOUT, -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - 
THP_FAULT_ALLOC, - THP_COLLAPSE_ALLOC, -#endif -}; - static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) { struct seq_buf s; -- GitLab From 8278f1c7b4920105f2f30a8df9b8212b378101d2 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 7 Sep 2022 04:35:37 +0000 Subject: [PATCH 0983/2223] memcg: reduce size of memcg vmstats structures The struct memcg_vmstats and struct memcg_vmstats_percpu contains two arrays each for events of size NR_VM_EVENT_ITEMS which can be as large as 110. However the memcg v1 only uses 4 of those while memcg v2 uses 15. The union of both is 17. On a 64 bit system, we are wasting approximately ((110 - 17) * 8 * 2) * (nr_cpus + 1) bytes which is significant on large machines. This patch reduces the size of the given structures by adding one indirection and only stores array of events which are actually used by the memcg code. With this patch, the size of memcg_vmstats has reduced from 2544 bytes to 1056 bytes while the size of memcg_vmstats_percpu has reduced from 2568 bytes to 1080 bytes. 
[akpm@linux-foundation.org: fix memcg_events_local() array index, per Shakeel] Link: https://lkml.kernel.org/r/CALvZod70Mvxr+Nzb6k0yiU2RFYjTD=0NFhKK-Eyp+5ejd1PSFw@mail.gmail.com Link: https://lkml.kernel.org/r/20220907043537.3457014-4-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 54 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 78fd7cfb4f929..1f204a2620543 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -671,6 +671,8 @@ static void flush_memcg_stats_dwork(struct work_struct *w) /* Subset of vm_event_item to report for memcg event stats */ static const unsigned int memcg_vm_event_stat[] = { + PGPGIN, + PGPGOUT, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSTEAL_KSWAPD, @@ -692,14 +694,30 @@ static const unsigned int memcg_vm_event_stat[] = { #endif }; +#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) +static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; + +static void init_memcg_events(void) +{ + int i; + + for (i = 0; i < NR_MEMCG_EVENTS; ++i) + mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; +} + +static inline int memcg_events_index(enum vm_event_item idx) +{ + return mem_cgroup_events_index[idx] - 1; +} + struct memcg_vmstats_percpu { /* Local (CPU and cgroup) page state & events */ long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; + unsigned long events[NR_MEMCG_EVENTS]; /* Delta calculation for lockless upward propagation */ long state_prev[MEMCG_NR_STAT]; - unsigned long events_prev[NR_VM_EVENT_ITEMS]; + unsigned long events_prev[NR_MEMCG_EVENTS]; /* Cgroup1: threshold notifications & softlimit tree updates */ unsigned long nr_page_events; @@ -709,11 +727,11 @@ struct memcg_vmstats_percpu { struct memcg_vmstats { /* Aggregated (CPU and subtree) page state & events */ long 
state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; + unsigned long events[NR_MEMCG_EVENTS]; /* Pending child counts during tree propagation */ long state_pending[MEMCG_NR_STAT]; - unsigned long events_pending[NR_VM_EVENT_ITEMS]; + unsigned long events_pending[NR_MEMCG_EVENTS]; }; unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) @@ -873,27 +891,37 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { - if (mem_cgroup_disabled()) + int index = memcg_events_index(idx); + + if (mem_cgroup_disabled() || index < 0) return; memcg_stats_lock(); - __this_cpu_add(memcg->vmstats_percpu->events[idx], count); + __this_cpu_add(memcg->vmstats_percpu->events[index], count); memcg_rstat_updated(memcg, count); memcg_stats_unlock(); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) { - return READ_ONCE(memcg->vmstats->events[event]); + int index = memcg_events_index(event); + + if (index < 0) + return 0; + return READ_ONCE(memcg->vmstats->events[index]); } static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { long x = 0; int cpu; + int index = memcg_events_index(event); + + if (index < 0) + return 0; for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_percpu->events[event], cpu); + x += per_cpu(memcg->vmstats_percpu->events[index], cpu); return x; } @@ -1564,10 +1592,15 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) memcg_events(memcg, PGSTEAL_KSWAPD) + memcg_events(memcg, PGSTEAL_DIRECT)); - for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) + for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { + if (memcg_vm_event_stat[i] == PGPGIN || + memcg_vm_event_stat[i] == PGPGOUT) + continue; + seq_buf_printf(&s, "%s %lu\n", vm_event_name(memcg_vm_event_stat[i]), memcg_events(memcg, memcg_vm_event_stat[i])); + } /* The above should easily fit into 
one page */ WARN_ON_ONCE(seq_buf_has_overflowed(&s)); @@ -5309,6 +5342,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { + init_memcg_events(); page_counter_init(&memcg->memory, NULL); page_counter_init(&memcg->swap, NULL); page_counter_init(&memcg->kmem, NULL); @@ -5477,7 +5511,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) parent->vmstats->state_pending[i] += delta; } - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { + for (i = 0; i < NR_MEMCG_EVENTS; i++) { delta = memcg->vmstats->events_pending[i]; if (delta) memcg->vmstats->events_pending[i] = 0; -- GitLab From 4e07acdda7fc23f5c4666e54961ef972a1195ffd Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 6 Sep 2022 17:35:30 +0800 Subject: [PATCH 0984/2223] mm/hwpoison: add __init/__exit annotations to module init/exit funcs Add missing __init/__exit annotations to module init/exit funcs. 
Link: https://lkml.kernel.org/r/20220906093530.243262-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Reviewed-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/hwpoison-inject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 65e242b5a4327..d0548e382b6ba 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -63,13 +63,13 @@ static int hwpoison_unpoison(void *data, u64 val) DEFINE_DEBUGFS_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); -static void pfn_inject_exit(void) +static void __exit pfn_inject_exit(void) { hwpoison_filter_enable = 0; debugfs_remove_recursive(hwpoison_dir); } -static int pfn_inject_init(void) +static int __init pfn_inject_init(void) { hwpoison_dir = debugfs_create_dir("hwpoison", NULL); -- GitLab From 679d7f69d60bbd124542e620b745c17643cdf680 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 6 Sep 2022 15:53:12 +0800 Subject: [PATCH 0985/2223] mm/rodata_test: use PAGE_ALIGNED() helper Use PAGE_ALIGNED() helper instead of open-coding operation, no functional changes here. 
Link: https://lkml.kernel.org/r/20220906075312.166595-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Signed-off-by: Andrew Morton --- mm/rodata_test.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 2613371945b7e..6d783436951f3 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -9,13 +9,13 @@ #include #include +#include #include static const int rodata_test_data = 0xC3; void rodata_test(void) { - unsigned long start, end; int zero = 0; /* test 1: read the value */ @@ -39,13 +39,11 @@ void rodata_test(void) } /* test 4: check if the rodata section is PAGE_SIZE aligned */ - start = (unsigned long)__start_rodata; - end = (unsigned long)__end_rodata; - if (start & (PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(__start_rodata)) { pr_err("start of .rodata is not page size aligned\n"); return; } - if (end & (PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(__end_rodata)) { pr_err("end of .rodata is not page size aligned\n"); return; } -- GitLab From f5a79d7c0c87c8d88bb5e3f3c898258fdf1b3b05 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Thu, 8 Sep 2022 19:14:43 +0000 Subject: [PATCH 0986/2223] mm/damon: introduce struct damos_access_pattern damon_new_scheme() has too many parameters, so introduce struct damos_access_pattern to simplify it. In addition, we can't use a bpf trace kprobe that has more than 5 parameters.
Link: https://lkml.kernel.org/r/20220908191443.129534-1-sj@kernel.org Signed-off-by: Yajun Deng Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 37 ++++++++++++++++++---------------- mm/damon/core.c | 31 ++++++++++++++--------------- mm/damon/dbgfs.c | 27 +++++++++++++++---------- mm/damon/lru_sort.c | 46 ++++++++++++++++++++++++++----------------- mm/damon/reclaim.c | 23 +++++++++++++--------- mm/damon/sysfs.c | 17 +++++++++++----- 6 files changed, 106 insertions(+), 75 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index d54acec048d6f..90f20675da22a 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -216,13 +216,26 @@ struct damos_stat { }; /** - * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * struct damos_access_pattern - Target access pattern of the given scheme. * @min_sz_region: Minimum size of target regions. * @max_sz_region: Maximum size of target regions. * @min_nr_accesses: Minimum ``->nr_accesses`` of target regions. * @max_nr_accesses: Maximum ``->nr_accesses`` of target regions. * @min_age_region: Minimum age of target regions. * @max_age_region: Maximum age of target regions. + */ +struct damos_access_pattern { + unsigned long min_sz_region; + unsigned long max_sz_region; + unsigned int min_nr_accesses; + unsigned int max_nr_accesses; + unsigned int min_age_region; + unsigned int max_age_region; +}; + +/** + * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * @pattern: Access pattern of target regions. * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. @@ -230,10 +243,8 @@ struct damos_stat { * @list: List head for siblings. 
* * For each aggregation interval, DAMON finds regions which fit in the - * condition (&min_sz_region, &max_sz_region, &min_nr_accesses, - * &max_nr_accesses, &min_age_region, &max_age_region) and applies &action to - * those. To avoid consuming too much CPU time or IO resources for the - * &action, &quota is used. + * &pattern and applies &action to those. To avoid consuming too much + * CPU time or IO resources for the &action, &quota is used. * * To do the work only when needed, schemes can be activated for specific * system situations using &wmarks. If all schemes that registered to the @@ -248,12 +259,7 @@ struct damos_stat { * &action is applied. */ struct damos { - unsigned long min_sz_region; - unsigned long max_sz_region; - unsigned int min_nr_accesses; - unsigned int max_nr_accesses; - unsigned int min_age_region; - unsigned int max_age_region; + struct damos_access_pattern pattern; enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; @@ -509,12 +515,9 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges); -struct damos *damon_new_scheme( - unsigned long min_sz_region, unsigned long max_sz_region, - unsigned int min_nr_accesses, unsigned int max_nr_accesses, - unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action, struct damos_quota *quota, - struct damos_watermarks *wmarks); +struct damos *damon_new_scheme(struct damos_access_pattern *pattern, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); diff --git a/mm/damon/core.c b/mm/damon/core.c index 5e00c04ceef04..bae41990f4227 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -231,24 +231,21 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, return 0;
} -struct damos *damon_new_scheme( - unsigned long min_sz_region, unsigned long max_sz_region, - unsigned int min_nr_accesses, unsigned int max_nr_accesses, - unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action, struct damos_quota *quota, - struct damos_watermarks *wmarks) +struct damos *damon_new_scheme(struct damos_access_pattern *pattern, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks) { struct damos *scheme; scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); if (!scheme) return NULL; - scheme->min_sz_region = min_sz_region; - scheme->max_sz_region = max_sz_region; - scheme->min_nr_accesses = min_nr_accesses; - scheme->max_nr_accesses = max_nr_accesses; - scheme->min_age_region = min_age_region; - scheme->max_age_region = max_age_region; + scheme->pattern.min_sz_region = pattern->min_sz_region; + scheme->pattern.max_sz_region = pattern->max_sz_region; + scheme->pattern.min_nr_accesses = pattern->min_nr_accesses; + scheme->pattern.max_nr_accesses = pattern->max_nr_accesses; + scheme->pattern.min_age_region = pattern->min_age_region; + scheme->pattern.max_age_region = pattern->max_age_region; scheme->action = action; scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); @@ -667,10 +664,12 @@ static bool __damos_valid_target(struct damon_region *r, struct damos *s) unsigned long sz; sz = r->ar.end - r->ar.start; - return s->min_sz_region <= sz && sz <= s->max_sz_region && - s->min_nr_accesses <= r->nr_accesses && - r->nr_accesses <= s->max_nr_accesses && - s->min_age_region <= r->age && r->age <= s->max_age_region; + return s->pattern.min_sz_region <= sz && + sz <= s->pattern.max_sz_region && + s->pattern.min_nr_accesses <= r->nr_accesses && + r->nr_accesses <= s->pattern.max_nr_accesses && + s->pattern.min_age_region <= r->age && + r->age <= s->pattern.max_age_region; } static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, diff --git a/mm/damon/dbgfs.c 
b/mm/damon/dbgfs.c index 652a94deafe35..1422037cedd2b 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -131,9 +131,12 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", - s->min_sz_region, s->max_sz_region, - s->min_nr_accesses, s->max_nr_accesses, - s->min_age_region, s->max_age_region, + s->pattern.min_sz_region, + s->pattern.max_sz_region, + s->pattern.min_nr_accesses, + s->pattern.max_nr_accesses, + s->pattern.min_age_region, + s->pattern.max_age_region, damos_action_to_dbgfs_scheme_action(s->action), s->quota.ms, s->quota.sz, s->quota.reset_interval, @@ -221,8 +224,6 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, struct damos *scheme, **schemes; const int max_nr_schemes = 256; int pos = 0, parsed, ret; - unsigned long min_sz, max_sz; - unsigned int min_nr_a, max_nr_a, min_age, max_age; unsigned int action_input; enum damos_action action; @@ -233,13 +234,18 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, *nr_schemes = 0; while (pos < len && *nr_schemes < max_nr_schemes) { + struct damos_access_pattern pattern = {}; struct damos_quota quota = {}; struct damos_watermarks wmarks; ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n", - &min_sz, &max_sz, &min_nr_a, &max_nr_a, - &min_age, &max_age, &action_input, &quota.ms, + &pattern.min_sz_region, &pattern.max_sz_region, + &pattern.min_nr_accesses, + &pattern.max_nr_accesses, + &pattern.min_age_region, + &pattern.max_age_region, + &action_input, &quota.ms, &quota.sz, &quota.reset_interval, &quota.weight_sz, &quota.weight_nr_accesses, &quota.weight_age, &wmarks.metric, @@ -251,7 +257,9 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, if ((int)action < 0) goto fail; - if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age) + if
(pattern.min_sz_region > pattern.max_sz_region || + pattern.min_nr_accesses > pattern.max_nr_accesses || + pattern.min_age_region > pattern.max_age_region) goto fail; if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || @@ -259,8 +267,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, goto fail; pos += parsed; - scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, - min_age, max_age, action, &quota, &wmarks); + scheme = damon_new_scheme(&pattern, action, &quota, &wmarks); if (!scheme) goto fail; diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 9de6f00a71c5d..0184ed4828b7e 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -293,6 +293,17 @@ static bool get_monitoring_region(unsigned long *start, unsigned long *end) /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { + struct damos_access_pattern pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* and accessed for more than the threshold */ + .min_nr_accesses = hot_thres, + .max_nr_accesses = UINT_MAX, + /* no matter its age */ + .min_age_region = 0, + .max_age_region = UINT_MAX, + }; struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = wmarks_interval, @@ -313,26 +324,31 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) .weight_nr_accesses = 1, .weight_age = 0, }; - struct damos *scheme = damon_new_scheme( - /* Find regions having PAGE_SIZE or larger size */ - PAGE_SIZE, ULONG_MAX, - /* and accessed for more than the threshold */ - hot_thres, UINT_MAX, - /* no matter its age */ - 0, UINT_MAX, + + return damon_new_scheme( + &pattern, /* prioritize those on LRU lists, as soon as found */ DAMOS_LRU_PRIO, /* under the quota. */ &quota, /* (De)activate this according to the watermarks.
*/ &wmarks); - - return scheme; } /* Create a DAMON-based operation scheme for cold memory regions */ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) { + struct damos_access_pattern pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* and not accessed at all */ + .min_nr_accesses = 0, + .max_nr_accesses = 0, + /* for min_age or more micro-seconds */ + .min_age_region = cold_thres, + .max_age_region = UINT_MAX, + }; struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = wmarks_interval, @@ -354,21 +370,15 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) .weight_nr_accesses = 0, .weight_age = 1, }; - struct damos *scheme = damon_new_scheme( - /* Find regions having PAGE_SIZE or larger size */ - PAGE_SIZE, ULONG_MAX, - /* and not accessed at all */ - 0, 0, - /* for cold_thres or more micro-seconds, and */ - cold_thres, UINT_MAX, + + return damon_new_scheme( + &pattern, /* mark those as not accessed, as soon as found */ DAMOS_LRU_DEPRIO, /* under the quota. */ &quota, /* (De)activate this according to the watermarks.
*/ &wmarks); - - return scheme; } static int damon_lru_sort_apply_parameters(void) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index a7faf51b4bd4a..5aeca0b9e88ec 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -264,6 +264,17 @@ static bool get_monitoring_region(unsigned long *start, unsigned long *end) static struct damos *damon_reclaim_new_scheme(void) { + struct damos_access_pattern pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* and not accessed at all */ + .min_nr_accesses = 0, + .max_nr_accesses = 0, + /* for min_age or more micro-seconds */ + .min_age_region = min_age / aggr_interval, + .max_age_region = UINT_MAX, + }; struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = wmarks_interval, @@ -284,21 +295,15 @@ static struct damos *damon_reclaim_new_scheme(void) .weight_nr_accesses = 0, .weight_age = 1 }; - struct damos *scheme = damon_new_scheme( - /* Find regions having PAGE_SIZE or larger size */ - PAGE_SIZE, ULONG_MAX, - /* and not accessed at all */ - 0, 0, - /* for min_age or more micro-seconds, and */ - min_age / aggr_interval, UINT_MAX, + + return damon_new_scheme( + &pattern, /* page out those, as soon as found */ DAMOS_PAGEOUT, /* under the quota. */ &quota, /* (De)activate this according to the watermarks.
*/ &wmarks); - - return scheme; } static int damon_reclaim_apply_parameters(void) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 1719bb3531e30..9fcf7bae41eb9 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2259,11 +2259,20 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_scheme *sysfs_scheme) { - struct damon_sysfs_access_pattern *pattern = + struct damon_sysfs_access_pattern *access_pattern = sysfs_scheme->access_pattern; struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + + struct damos_access_pattern pattern = { + .min_sz_region = access_pattern->sz->min, + .max_sz_region = access_pattern->sz->max, + .min_nr_accesses = access_pattern->nr_accesses->min, + .max_nr_accesses = access_pattern->nr_accesses->max, + .min_age_region = access_pattern->age->min, + .max_age_region = access_pattern->age->max, + }; struct damos_quota quota = { .ms = sysfs_quotas->ms, .sz = sysfs_quotas->sz, @@ -2280,10 +2289,8 @@ static struct damos *damon_sysfs_mk_scheme( .low = sysfs_wmarks->low, }; - return damon_new_scheme(pattern->sz->min, pattern->sz->max, - pattern->nr_accesses->min, pattern->nr_accesses->max, - pattern->age->min, pattern->age->max, - sysfs_scheme->action, &quota, &wmarks); + return damon_new_scheme(&pattern, sysfs_scheme->action, &quota, + &wmarks); } static int damon_sysfs_set_schemes(struct damon_ctx *ctx, -- GitLab From 5934ec1362b235c4341807c28f79b6a596ce1b40 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 8 Sep 2022 11:13:17 +0800 Subject: [PATCH 0987/2223] mm/damon/vaddr: add a comment for 'default' case in damon_va_apply_scheme() The switch case 'DAMOS_STAT' and switch case 'default' have the same return value in damon_va_apply_scheme(), and the 'default' case is for DAMOS actions that are not supported by 'vaddr'.
It might make sense to add a comment here. [akpm@linux-foundation.org: fx comment grammar] Link: https://lkml.kernel.org/r/1662606797-23534-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 349b44d699e2a..c2c08c1b316bd 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -658,6 +658,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, case DAMOS_STAT: return 0; default: + /* + * DAMOS actions that are not yet supported by 'vaddr'. + */ return 0; } -- GitLab From 36f05cab0a2c97bda288c3b6a557ec5fb8d9bba6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 9 Sep 2022 09:00:31 -0400 Subject: [PATCH 0988/2223] tmpfs: add support for an i_version counter NFSv4 mandates a change attribute to avoid problems with timestamp granularity, which Linux implements using the i_version counter. This is particularly important when the underlying filesystem is fast. Give tmpfs an i_version counter. Since it doesn't have to be persistent, we can just turn on SB_I_VERSION and sprinkle some inode_inc_iversion calls in the right places. Also, while there is no formal spec for xattrs, most implementations update the ctime on setxattr. Fix shmem_xattr_handler_set to update the ctime and bump the i_version appropriately. 
Link: https://lkml.kernel.org/r/20220909130031.15477-1-jlayton@kernel.org Signed-off-by: Jeff Layton Cc: Chuck Lever Cc: Alexander Viro Cc: Hugh Dickins Signed-off-by: Andrew Morton --- fs/posix_acl.c | 3 +++ mm/shmem.c | 31 ++++++++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 5af33800743e4..efb88a5e59f9e 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -24,6 +24,7 @@ #include #include #include +#include static struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -1073,6 +1074,8 @@ int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode, } inode->i_ctime = current_time(inode); + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); set_cached_acl(inode, type, acl); return 0; } diff --git a/mm/shmem.c b/mm/shmem.c index 3d0b729fcc5ec..275899bacbeaf 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "swap.h" static struct vfsmount *shm_mnt; @@ -1030,6 +1031,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { shmem_undo_range(inode, lstart, lend, false); inode->i_ctime = inode->i_mtime = current_time(inode); + inode_inc_iversion(inode); } EXPORT_SYMBOL_GPL(shmem_truncate_range); @@ -1074,6 +1076,8 @@ static int shmem_setattr(struct user_namespace *mnt_userns, struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int error; + bool update_mtime = false; + bool update_ctime = true; error = setattr_prepare(&init_user_ns, dentry, attr); if (error) @@ -1094,7 +1098,9 @@ static int shmem_setattr(struct user_namespace *mnt_userns, if (error) return error; i_size_write(inode, newsize); - inode->i_ctime = inode->i_mtime = current_time(inode); + update_mtime = true; + } else { + update_ctime = false; } if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); @@ -1114,6 +1120,12 @@ static int shmem_setattr(struct user_namespace 
*mnt_userns, setattr_copy(&init_user_ns, inode, attr); if (attr->ia_valid & ATTR_MODE) error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + if (!error && update_ctime) { + inode->i_ctime = current_time(inode); + if (update_mtime) + inode->i_mtime = inode->i_ctime; + inode_inc_iversion(inode); + } return error; } @@ -2890,6 +2902,7 @@ shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir, error = 0; dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); + inode_inc_iversion(dir); d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ } @@ -2965,6 +2978,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_inc_iversion(dir); inc_nlink(inode); ihold(inode); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ @@ -2982,6 +2996,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) dir->i_size -= BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_inc_iversion(dir); drop_nlink(inode); dput(dentry); /* Undo the count from "create" - this does all the work */ return 0; @@ -3071,6 +3086,8 @@ static int shmem_rename2(struct user_namespace *mnt_userns, old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = new_dir->i_mtime = inode->i_ctime = current_time(old_dir); + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); return 0; } @@ -3123,6 +3140,7 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, } dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); + inode_inc_iversion(dir); d_instantiate(dentry, inode); dget(dentry); return 0; @@ -3194,6 +3212,7 @@ static int shmem_fileattr_set(struct user_namespace *mnt_userns, shmem_set_inode_flags(inode, info->fsflags); inode->i_ctime = 
current_time(inode); + inode_inc_iversion(inode); return 0; } @@ -3257,9 +3276,15 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(inode); + int err; name = xattr_full_name(handler, name); - return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); + err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); + if (!err) { + inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); + } + return err; } static const struct xattr_handler shmem_security_xattr_handler = { @@ -3722,7 +3747,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; - sb->s_flags |= SB_NOSEC; + sb->s_flags |= SB_NOSEC | SB_I_VERSION; #else sb->s_flags |= SB_NOUSER; #endif -- GitLab From ade38b8ca5ceeeb72e8d01357f3dcde7c87570cc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:55 +0000 Subject: [PATCH 0989/2223] selftest/damon: add a test for duplicate context dirs creation Patch series "mm/damon: minor fixes and cleanups". This patchset contains minor fixes and cleanups for DAMON including - selftest for a bug we found before (Patch 1), - fix of region holes in vaddr corner case and a kunit test for it (Patches 2 and 3), and - documents/Kconfig updates for title wordsmithing (Patch 4) and more aggressive DAMON debugfs interface deprecation announcement (Patches 5-7). This patch (of 7): Commit d26f60703606 ("mm/damon/dbgfs: avoid duplicate context directory creation") fixes a bug which could result in memory leak and DAMON disablement. This commit adds a selftest for verifying the fix and avoid regression. 
Link: https://lkml.kernel.org/r/20220909202901.57977-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220909202901.57977-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 1 + .../debugfs_duplicate_context_creation.sh | 27 +++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 0470c5f3e6906..a1fa2eff8192f 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -6,6 +6,7 @@ TEST_GEN_FILES += huge_count_read_write TEST_FILES = _chk_dependency.sh _debugfs_common.sh TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh +TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += sysfs.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh new file mode 100644 index 0000000000000..4a76e37ef16b1 --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test duplicated context creation +# ================================ + +if ! echo foo > "$DBGFS/mk_contexts" +then + echo "context creation failed" + exit 1 +fi + +if echo foo > "$DBGFS/mk_contexts" +then + echo "duplicate context creation success" + exit 1 +fi + +if ! 
echo foo > "$DBGFS/rm_contexts" +then + echo "context deletion failed" + exit 1 +fi + +exit 0 -- GitLab From 9c950c22833cfd9887da7679534e5c6deb44b008 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:56 +0000 Subject: [PATCH 0990/2223] mm/damon/core: avoid holes in newly set monitoring target ranges When there are two or more non-contiguous regions intersecting with given new ranges, 'damon_set_regions()' does not fill the holes. This commit makes the function to fill the holes with newly created regions. [sj@kernel.org: handle error from 'damon_fill_regions_holes()'] Link: https://lkml.kernel.org/r/20220913215420.57761-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220909202901.57977-3-sj@kernel.org Fixes: 3f49584b262c ("mm/damon: implement primitives for the virtual memory address spaces") Signed-off-by: SeongJae Park Reported-by: Yun Levi Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index bae41990f4227..5ad31d2feae40 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -168,6 +168,30 @@ static bool damon_intersect(struct damon_region *r, return !(r->ar.end <= re->start || re->end <= r->ar.start); } +/* + * Fill holes in regions with new regions. + */ +static int damon_fill_regions_holes(struct damon_region *first, + struct damon_region *last, struct damon_target *t) +{ + struct damon_region *r = first; + + damon_for_each_region_from(r, t) { + struct damon_region *next, *newr; + + if (r == last) + break; + next = damon_next_region(r); + if (r->ar.end != next->ar.start) { + newr = damon_new_region(r->ar.end, next->ar.start); + if (!newr) + return -ENOMEM; + damon_insert_region(newr, r, next, t); + } + } + return 0; +} + /* * damon_set_regions() - Set regions of a target for given address ranges. * @t: the given target. 
@@ -184,6 +208,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, { struct damon_region *r, *next; unsigned int i; + int err; /* Remove regions which are not in the new ranges */ damon_for_each_region_safe(r, next, t) { @@ -226,6 +251,11 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, first->ar.start = ALIGN_DOWN(range->start, DAMON_MIN_REGION); last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); + + /* fill possible holes in the range */ + err = damon_fill_regions_holes(first, last, t); + if (err) + return err; } } return 0; -- GitLab From 62f409560eb235ad9c2c9dbe1a3a57801431da5a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:57 +0000 Subject: [PATCH 0991/2223] mm/damon/core-test: test damon_set_regions Preceding commit fixes a bug in 'damon_set_regions()', which allows holes in the new monitoring target ranges. This commit adds a kunit test case for the problem to avoid any regression. Link: https://lkml.kernel.org/r/20220909202901.57977-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 45db79d28fdc3..3db9b73687562 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -267,6 +267,28 @@ static void damon_test_ops_registration(struct kunit *test) KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); } +static void damon_test_set_regions(struct kunit *test) +{ + struct damon_target *t = damon_new_target(); + struct damon_region *r1 = damon_new_region(4, 16); + struct damon_region *r2 = damon_new_region(24, 32); + struct damon_addr_range range = {.start = 8, .end = 28}; + unsigned long expects[] = {8, 16, 16, 24, 24, 28}; + int expect_idx = 0; + struct damon_region *r; + + damon_add_region(r1, t); + damon_add_region(r2, t); 
+ damon_set_regions(t, &range, 1); + + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3); + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]); + KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]); + } + damon_destroy_target(t); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -276,6 +298,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_merge_regions_of), KUNIT_CASE(damon_test_split_regions_of), KUNIT_CASE(damon_test_ops_registration), + KUNIT_CASE(damon_test_set_regions), {}, }; -- GitLab From 0ff11f103f5d9daf14dddf05de9b12611eaf3fc1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:58 +0000 Subject: [PATCH 0992/2223] Docs/admin-guide/mm/damon: rename the title of the document The title of the DAMON document for admin-guide, 'Monitoring Data Accesses', could confuse readers in some ways. First of all, DAMON is not the only single way for data access monitoring. And the document is for not only the data access monitoring but also data access pattern based memory management optimizations (DAMOS). This commit updates the title to 'DAMON: Data Access MONitor', which more explicitly explains what the document describes. Link: https://lkml.kernel.org/r/20220909202901.57977-5-sj@kernel.org Fixes: c4ba6014aec3 ("Documentation: add documents for DAMON") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/index.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst index 05500042f7776..33d37bb2fb4e5 100644 --- a/Documentation/admin-guide/mm/damon/index.rst +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -1,8 +1,8 @@ .. 
SPDX-License-Identifier: GPL-2.0 -======================== -Monitoring Data Accesses -======================== +========================== +DAMON: Data Access MONitor +========================== :doc:`DAMON ` allows light-weight data access monitoring. Using DAMON, users can analyze the memory access patterns of their systems and -- GitLab From e8600ce2d2e6ad1df4d0717beb362ee4cd39aaa3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:59 +0000 Subject: [PATCH 0993/2223] mm/damon/Kconfig: notify debugfs deprecation plan Commit b18402726bd1 ("Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface") announced the DAMON debugfs interface deprecation plan, but it is not so aggressively announced. As the deprecation time is coming, this commit makes the announce more easy to be found by adding the note to the config menu of DAMON debugfs interface. Link: https://lkml.kernel.org/r/20220909202901.57977-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 66265e3a9c659..7821fcb3f2586 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -68,6 +68,9 @@ config DAMON_DBGFS If unsure, say N. + This will be removed after >5.15.y LTS kernel is released, so users + should move to the sysfs interface (DAMON_SYSFS). + config DAMON_DBGFS_KUNIT_TEST bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS depends on DAMON_DBGFS && KUNIT=y -- GitLab From 04cc7e4bf7c4bdff24b62432d2beafdde60cb72b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:29:00 +0000 Subject: [PATCH 0994/2223] Docs/admin-guide/mm/damon/start: mention the dependency as sysfs instead of debugfs 'Getting Started' document of DAMON says DAMON user-space tool, damo[1], is using DAMON debugfs interface, and therefore it needs to ensure debugfs is mounted. 
However, the latest version of the tool is using DAMON sysfs interface. Moreover, DAMON debugfs interface is going to be deprecated as announced by commit b18402726bd1 ("Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface"). This commit therefore updates the document to tell readers about the DAMON sysfs interface dependency instead, and no longer mentions the debugfs interface, which will be deprecated. [1] https://github.com/awslabs/damo Link: https://lkml.kernel.org/r/20220909202901.57977-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/start.rst | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index 4d5ca2c46288a..9f88afc734da4 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -29,16 +29,9 @@ called DAMON Operator (DAMO). It is available at https://github.com/awslabs/damo. The examples below assume that ``damo`` is on your ``$PATH``. It's not mandatory, though. -Because DAMO is using the debugfs interface (refer to :doc:`usage` for the -detail) of DAMON, you should ensure debugfs is mounted. Mount it manually as -below:: - - # mount -t debugfs none /sys/kernel/debug/ - -or append the following line to your ``/etc/fstab`` file so that your system -can automatically mount debugfs upon booting:: - - debugfs /sys/kernel/debug debugfs defaults 0 0 +Because DAMO is using the sysfs interface (refer to :doc:`usage` for the +detail) of DAMON, you should ensure :doc:`sysfs ` is +mounted. 
Recording Data Access Patterns -- GitLab From f1f3afd59d78db163f6655394980290c1bdf9eab Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:29:01 +0000 Subject: [PATCH 0995/2223] Docs/admin-guide/mm/damon/usage: note DAMON debugfs interface deprecation plan Commit b18402726bd1 ("Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface") announced the DAMON debugfs interface deprecation plan, but it is not so aggressively announced. As the deprecation time is coming, this commit makes the announce more easy to be found by adding the note at the beginning of the DAMON debugfs interface usage document. Link: https://lkml.kernel.org/r/20220909202901.57977-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index d52f572a90298..c050b882ddc1c 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -393,6 +393,11 @@ the files as above. Above is only for an example. debugfs Interface ================= +.. note:: + + DAMON debugfs interface will be removed after next LTS kernel is released, so + users should move to the :ref:`sysfs interface `. + DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``, ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and ``rm_contexts`` under its debugfs directory, ``/damon/``. -- GitLab From 85a34107eba913a2cb7c7c47c49f50073bfb67dd Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 9 Sep 2022 16:39:47 +0800 Subject: [PATCH 0996/2223] mm/shuffle: convert module_param_call to module_param_cb module_param_call is now completely consistent with module_param_cb, so there is no need to keep two macros. 
Convert module_param_call to module_param_cb since former is obsolete and latter is more kernel-ish. Link: https://lkml.kernel.org/r/20220909083947.3595610-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Dan Williams Cc: Kefeng Wang Cc: Liu Shixin Cc: Paul Russel Signed-off-by: Andrew Morton --- mm/shuffle.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/mm/shuffle.c b/mm/shuffle.c index c13c33b247e87..fb1393b8b3a9d 100644 --- a/mm/shuffle.c +++ b/mm/shuffle.c @@ -12,23 +12,22 @@ DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key); static bool shuffle_param; -static int shuffle_show(char *buffer, const struct kernel_param *kp) -{ - return sprintf(buffer, "%c\n", shuffle_param ? 'Y' : 'N'); -} -static __meminit int shuffle_store(const char *val, +static __meminit int shuffle_param_set(const char *val, const struct kernel_param *kp) { - int rc = param_set_bool(val, kp); - - if (rc < 0) - return rc; - if (shuffle_param) + if (param_set_bool(val, kp)) + return -EINVAL; + if (*(bool *)kp->arg) static_branch_enable(&page_alloc_shuffle_key); return 0; } -module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400); + +static const struct kernel_param_ops shuffle_param_ops = { + .set = shuffle_param_set, + .get = param_get_bool, +}; +module_param_cb(shuffle, &shuffle_param_ops, &shuffle_param, 0400); /* * For two pages to be swapped in the shuffle, they must be free (on a -- GitLab From 671f2fa8a2b2d15940d80be4a2baf22758724647 Mon Sep 17 00:00:00 2001 From: Alexey Romanov Date: Fri, 9 Sep 2022 11:37:22 +0300 Subject: [PATCH 0997/2223] zsmalloc: use correct types in _first_obj_offset functions Since commit ffedd09fa9b0 ("zsmalloc: Stop using slab fields in struct page") we are using page->page_type (unsigned int) field instead of page->units (int) as first object offset in a subpage of zspage. 
So get_first_obj_offset() and set_first_obj_offset() functions should work with unsigned int type. Link: https://lkml.kernel.org/r/20220909083722.85024-1-avromanov@sberdevices.ru Fixes: ffedd09fa9b0 ("zsmalloc: Stop using slab fields in struct page") Signed-off-by: Alexey Romanov Reviewed-by: Sergey Senozhatsky Cc: Alexey Romanov Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 12eb11e709393..525758713a553 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -472,12 +472,12 @@ static inline struct page *get_first_page(struct zspage *zspage) return first_page; } -static inline int get_first_obj_offset(struct page *page) +static inline unsigned int get_first_obj_offset(struct page *page) { return page->page_type; } -static inline void set_first_obj_offset(struct page *page, int offset) +static inline void set_first_obj_offset(struct page *page, unsigned int offset) { page->page_type = offset; } @@ -1592,7 +1592,7 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, static unsigned long find_alloced_obj(struct size_class *class, struct page *page, int *obj_idx) { - int offset = 0; + unsigned int offset; int index = *obj_idx; unsigned long handle = 0; void *addr = kmap_atomic(page); @@ -1846,7 +1846,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, struct zspage *zspage; struct page *dummy; void *s_addr, *d_addr, *addr; - int offset; + unsigned int offset; unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; -- GitLab From 6b1964e685544b8f8ba6780c10a6b38c2b1282a5 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 9 Sep 2022 16:31:40 +0800 Subject: [PATCH 0998/2223] mm: kfence: convert to DEFINE_SEQ_ATTRIBUTE Use DEFINE_SEQ_ATTRIBUTE helper macro to simplify the code. 
Link: https://lkml.kernel.org/r/20220909083140.3592919-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: Marco Elver Tested-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/kfence/core.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 8c08ae2101d7a..26de62a516652 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -719,24 +719,13 @@ static int show_object(struct seq_file *seq, void *v) return 0; } -static const struct seq_operations object_seqops = { +static const struct seq_operations objects_sops = { .start = start_object, .next = next_object, .stop = stop_object, .show = show_object, }; - -static int open_objects(struct inode *inode, struct file *file) -{ - return seq_open(file, &object_seqops); -} - -static const struct file_operations objects_fops = { - .open = open_objects, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(objects); static int __init kfence_debugfs_init(void) { -- GitLab From 0d83b2d89dbfad17b62d4e7fb8f0b0525ba1a204 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 9 Sep 2022 21:36:06 +0000 Subject: [PATCH 0999/2223] mm/damon: remove duplicate get_monitoring_region() definitions In lru_sort.c and reclaim.c, they are all defining get_monitoring_region() function, there is no need to define it separately. As 'get_monitoring_region()' is not a 'static' function anymore, we try to use a prefix to distinguish with other functions, so there rename it to 'damon_find_biggest_system_ram'. 
Link: https://lkml.kernel.org/r/20220909213606.136221-1-sj@kernel.org Signed-off-by: Xin Hao Signed-off-by: SeongJae Park Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/core.c | 40 ++++++++++++++++++++++++++++++++++++++++ mm/damon/lru_sort.c | 37 ++----------------------------------- mm/damon/reclaim.c | 37 ++----------------------------------- 4 files changed, 46 insertions(+), 70 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 90f20675da22a..016b6c9c03d62 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -549,6 +549,8 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx) int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); +bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end); + #endif /* CONFIG_DAMON */ #endif /* _DAMON_H */ diff --git a/mm/damon/core.c b/mm/damon/core.c index 5ad31d2feae40..2437c61b0bc0b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1245,4 +1245,44 @@ static int kdamond_fn(void *data) return 0; } +/* + * struct damon_system_ram_region - System RAM resource address region of + * [@start, @end). + * @start: Start address of the region (inclusive). + * @end: End address of the region (exclusive). + */ +struct damon_system_ram_region { + unsigned long start; + unsigned long end; +}; + +static int walk_system_ram(struct resource *res, void *arg) +{ + struct damon_system_ram_region *a = arg; + + if (a->end - a->start < resource_size(res)) { + a->start = res->start; + a->end = res->end; + } + return 0; +} + +/* + * Find biggest 'System RAM' resource and store its start and end address in + * @start and @end, respectively. If no System RAM is found, returns false. 
+ */ +bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) + +{ + struct damon_system_ram_region arg = {}; + + walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); + if (arg.end <= arg.start) + return false; + + *start = arg.start; + *end = arg.end; + return true; +} + #include "core-test.h" diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 0184ed4828b7e..8415e18fcf0ef 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -257,39 +257,6 @@ module_param(nr_cold_quota_exceeds, ulong, 0400); static struct damon_ctx *ctx; static struct damon_target *target; -struct damon_lru_sort_ram_walk_arg { - unsigned long start; - unsigned long end; -}; - -static int walk_system_ram(struct resource *res, void *arg) -{ - struct damon_lru_sort_ram_walk_arg *a = arg; - - if (a->end - a->start < resource_size(res)) { - a->start = res->start; - a->end = res->end; - } - return 0; -} - -/* - * Find biggest 'System RAM' resource and store its start and end address in - * @start and @end, respectively. If no System RAM is found, returns false. 
- */ -static bool get_monitoring_region(unsigned long *start, unsigned long *end) -{ - struct damon_lru_sort_ram_walk_arg arg = {}; - - walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); - if (arg.end <= arg.start) - return false; - - *start = arg.start; - *end = arg.end; - return true; -} - /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { @@ -414,8 +381,8 @@ static int damon_lru_sort_apply_parameters(void) if (monitor_region_start > monitor_region_end) return -EINVAL; if (!monitor_region_start && !monitor_region_end && - !get_monitoring_region(&monitor_region_start, - &monitor_region_end)) + !damon_find_biggest_system_ram(&monitor_region_start, + &monitor_region_end)) return -EINVAL; addr_range.start = monitor_region_start; addr_range.end = monitor_region_end; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 5aeca0b9e88ec..fe7bc0c55ecb3 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -229,39 +229,6 @@ module_param(nr_quota_exceeds, ulong, 0400); static struct damon_ctx *ctx; static struct damon_target *target; -struct damon_reclaim_ram_walk_arg { - unsigned long start; - unsigned long end; -}; - -static int walk_system_ram(struct resource *res, void *arg) -{ - struct damon_reclaim_ram_walk_arg *a = arg; - - if (a->end - a->start < resource_size(res)) { - a->start = res->start; - a->end = res->end; - } - return 0; -} - -/* - * Find biggest 'System RAM' resource and store its start and end address in - * @start and @end, respectively. If no System RAM is found, returns false. 
- */ -static bool get_monitoring_region(unsigned long *start, unsigned long *end) -{ - struct damon_reclaim_ram_walk_arg arg = {}; - - walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); - if (arg.end <= arg.start) - return false; - - *start = arg.start; - *end = arg.end; - return true; -} - static struct damos *damon_reclaim_new_scheme(void) { struct damos_access_pattern pattern = { @@ -328,8 +295,8 @@ static int damon_reclaim_apply_parameters(void) if (monitor_region_start > monitor_region_end) return -EINVAL; if (!monitor_region_start && !monitor_region_end && - !get_monitoring_region(&monitor_region_start, - &monitor_region_end)) + !damon_find_biggest_system_ram(&monitor_region_start, + &monitor_region_end)) return -EINVAL; addr_range.start = monitor_region_start; addr_range.end = monitor_region_end; -- GitLab From 14455eabd8404a503dc8e80cd8ce185e96a94b22 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Fri, 9 Sep 2022 07:31:09 +0000 Subject: [PATCH 1000/2223] mm: use nth_page instead of mem_map_offset mem_map_next To handle the discontiguous case, mem_map_next() has a parameter named `offset`. As a function caller, one would be confused why "get next entry" needs a parameter named "offset". The other drawback of mem_map_next() is that the callers must take care of the map between parameter "iter" and "offset", otherwise we may get an hole or duplication during iteration. So we use nth_page instead of mem_map_next. And replace mem_map_offset with nth_page() per Matthew's comments. 
Link: https://lkml.kernel.org/r/1662708669-9395-1-git-send-email-lic121@chinatelecom.cn Signed-off-by: Cheng Li Fixes: 69d177c2fc70 ("hugetlbfs: handle pages higher order than MAX_ORDER") Reviewed-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 29 +++++++++++++++++------------ mm/internal.h | 28 ---------------------------- mm/memory.c | 21 ++++++++++----------- 3 files changed, 27 insertions(+), 51 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 008955d8f411c..6af123374e980 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1306,12 +1306,13 @@ static void __destroy_compound_gigantic_page(struct page *page, { int i; int nr_pages = 1 << order; - struct page *p = page + 1; + struct page *p; atomic_set(compound_mapcount_ptr(page), 0); atomic_set(compound_pincount_ptr(page), 0); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + for (i = 1; i < nr_pages; i++) { + p = nth_page(page, i); p->mapping = NULL; clear_compound_head(p); if (!demote) @@ -1532,7 +1533,7 @@ static void add_hugetlb_page(struct hstate *h, struct page *page, static void __update_and_free_page(struct hstate *h, struct page *page) { int i; - struct page *subpage = page; + struct page *subpage; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; @@ -1563,8 +1564,8 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (unlikely(PageHWPoison(page))) hugetlb_clear_page_hwpoison(page); - for (i = 0; i < pages_per_huge_page(h); - i++, subpage = mem_map_next(subpage, page, i)) { + for (i = 0; i < pages_per_huge_page(h); i++) { + subpage = nth_page(page, i); subpage->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_private | @@ -1771,13 +1772,15 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, { int i, j; int nr_pages = 1 << order; - struct page *p = page + 1; + struct page *p; /* we rely on 
prep_new_huge_page to set the destructor */ set_compound_order(page, order); __ClearPageReserved(page); __SetPageHead(page); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + for (i = 1; i < nr_pages; i++) { + p = nth_page(page, i); + /* * For gigantic hugepages allocated through bootmem at * boot, it's safer to be consistent with the not-gigantic @@ -1824,14 +1827,16 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, out_error: /* undo tail page modifications made above */ - p = page + 1; - for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) { + for (j = 1; j < i; j++) { + p = nth_page(page, j); clear_compound_head(p); set_page_refcounted(p); } /* need to clear PG_reserved on remaining tail pages */ - for (; j < nr_pages; j++, p = mem_map_next(p, page, j)) + for (; j < nr_pages; j++) { + p = nth_page(page, j); __ClearPageReserved(p); + } set_compound_order(page, 0); #ifdef CONFIG_64BIT page[1].compound_nr = 0; @@ -6128,7 +6133,7 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, for (nr = 0; nr < refs; nr++) { if (likely(pages)) - pages[nr] = mem_map_offset(page, nr); + pages[nr] = nth_page(page, nr); if (vmas) vmas[nr] = vma; } @@ -6292,7 +6297,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT); if (pages || vmas) - record_subpages_vmas(mem_map_offset(page, pfn_offset), + record_subpages_vmas(nth_page(page, pfn_offset), vma, refs, likely(pages) ? pages + i : NULL, vmas ? vmas + i : NULL); diff --git a/mm/internal.h b/mm/internal.h index 0f106a3982e73..e497ab14c9842 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -638,34 +638,6 @@ static inline void vunmap_range_noflush(unsigned long start, unsigned long end) } #endif /* !CONFIG_MMU */ -/* - * Return the mem_map entry representing the 'offset' subpage within - * the maximally aligned gigantic page 'base'. 
Handle any discontiguity - * in the mem_map at MAX_ORDER_NR_PAGES boundaries. - */ -static inline struct page *mem_map_offset(struct page *base, int offset) -{ - if (unlikely(offset >= MAX_ORDER_NR_PAGES)) - return nth_page(base, offset); - return base + offset; -} - -/* - * Iterator over all subpages within the maximally aligned gigantic - * page 'base'. Handle any discontiguity in the mem_map. - */ -static inline struct page *mem_map_next(struct page *iter, - struct page *base, int offset) -{ - if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) { - unsigned long pfn = page_to_pfn(base) + offset; - if (!pfn_valid(pfn)) - return NULL; - return pfn_to_page(pfn); - } - return iter + 1; -} - /* Memory initialisation debug and verification */ enum mminit_level { MMINIT_WARNING, diff --git a/mm/memory.c b/mm/memory.c index d671ad367d677..c01c12500169d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5690,11 +5690,11 @@ static void clear_gigantic_page(struct page *page, unsigned int pages_per_huge_page) { int i; - struct page *p = page; + struct page *p; might_sleep(); - for (i = 0; i < pages_per_huge_page; - i++, p = mem_map_next(p, page, i)) { + for (i = 0; i < pages_per_huge_page; i++) { + p = nth_page(page, i); cond_resched(); clear_user_highpage(p, addr + i * PAGE_SIZE); } @@ -5730,13 +5730,12 @@ static void copy_user_gigantic_page(struct page *dst, struct page *src, struct page *dst_base = dst; struct page *src_base = src; - for (i = 0; i < pages_per_huge_page; ) { + for (i = 0; i < pages_per_huge_page; i++) { + dst = nth_page(dst_base, i); + src = nth_page(src_base, i); + cond_resched(); copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); } } @@ -5783,10 +5782,10 @@ long copy_huge_page_from_user(struct page *dst_page, void *page_kaddr; unsigned long i, rc = 0; unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; - struct page *subpage = dst_page; + struct page 
*subpage; - for (i = 0; i < pages_per_huge_page; - i++, subpage = mem_map_next(subpage, dst_page, i)) { + for (i = 0; i < pages_per_huge_page; i++) { + subpage = nth_page(dst_page, i); if (allow_pagefault) page_kaddr = kmap(subpage); else -- GitLab From 13cc378403a83e70430ae9bad53fd65199f21fe1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 9 Sep 2022 10:57:11 +0800 Subject: [PATCH 1001/2223] writeback: remove unused macro DIRTY_FULL_SCOPE It's introduced but never used. Remove it. Link: https://lkml.kernel.org/r/20220909025711.32012-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Jan Kara Acked-by: Jens Axboe Cc: Bart Van Assche Cc: David Howells Cc: Matthew Wilcox Cc: NeilBrown Cc: Vlastimil Babka Cc: zhanglianjie Signed-off-by: Andrew Morton --- include/linux/writeback.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3f045f6d6c4f0..06f9291b6fd51 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -17,20 +17,12 @@ struct bio; DECLARE_PER_CPU(int, dirty_throttle_leaks); /* - * The 1/4 region under the global dirty thresh is for smooth dirty throttling: - * - * (thresh - thresh/DIRTY_FULL_SCOPE, thresh) - * - * Further beyond, all dirtier tasks will enter a loop waiting (possibly long - * time) for the dirty pages to drop, unless written enough pages. - * * The global dirty threshold is normally equal to the global dirty limit, * except when the system suddenly allocates a lot of anonymous memory and * knocks down the global dirty threshold quickly, in which case the global * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. 
*/ #define DIRTY_SCOPE 8 -#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) struct backing_dev_info; -- GitLab From f4981502088f8ea704beeedf3470e1d53bc2e46c Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 9 Sep 2022 10:16:53 +0800 Subject: [PATCH 1002/2223] mm/huge_memory: prevent THP_ZERO_PAGE_ALLOC increased twice A user who reads THP_ZERO_PAGE_ALLOC may be more concerned about the huge zero pages that are really allocated for thp. It is misleading to increase THP_ZERO_PAGE_ALLOC twice if two threads call get_huge_zero_page concurrently. Don't increase the value if the huge page is not really used. Update Documentation/admin-guide/mm/transhuge.rst to suit. Link: https://lkml.kernel.org/r/20220909021653.3371879-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Cc: Alexander Potapenko Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Kefeng Wang Cc: "Kirill A. Shutemov" Cc: Mike Kravetz Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 7 +++---- mm/huge_memory.c | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index c9c37f16eef88..8e3418ec4503e 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -366,10 +366,9 @@ thp_split_pmd page table entry. thp_zero_page_alloc - is incremented every time a huge zero page is - successfully allocated. It includes allocations which where - dropped due race with other allocation. Note, it doesn't count - every map of the huge zero page, only its allocation. + is incremented every time a huge zero page used for thp is + successfully allocated. Note, it doesn't count every map of + the huge zero page, only its allocation. 
thp_zero_page_alloc_failed is incremented if kernel fails to allocate diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 36ef79b851958..4938defe4e732 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -163,7 +163,6 @@ retry: count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); return false; } - count_vm_event(THP_ZERO_PAGE_ALLOC); preempt_disable(); if (cmpxchg(&huge_zero_page, NULL, zero_page)) { preempt_enable(); @@ -175,6 +174,7 @@ retry: /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); preempt_enable(); + count_vm_event(THP_ZERO_PAGE_ALLOC); return true; } -- GitLab From a17a8b3b3e6b08a9cd3b2134789843323d998bed Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 8 Sep 2022 16:19:32 +0800 Subject: [PATCH 1003/2223] mm/damon/sysfs: change few functions execute order There's no need to run container_of() as early as we do. The compiler figures this out, but the resulting code is more readable. Link: https://lkml.kernel.org/r/20220908081932.77370-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 9fcf7bae41eb9..d27dad5affec7 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1031,8 +1031,7 @@ static ssize_t nr_schemes_show(struct kobject *kobj, static ssize_t nr_schemes_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_schemes *schemes = container_of(kobj, - struct damon_sysfs_schemes, kobj); + struct damon_sysfs_schemes *schemes; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -1040,6 +1039,8 @@ static ssize_t nr_schemes_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + schemes = container_of(kobj, struct damon_sysfs_schemes, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = 
damon_sysfs_schemes_add_dirs(schemes, nr); @@ -1237,8 +1238,7 @@ static ssize_t nr_regions_show(struct kobject *kobj, static ssize_t nr_regions_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_regions *regions = container_of(kobj, - struct damon_sysfs_regions, kobj); + struct damon_sysfs_regions *regions; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -1246,6 +1246,8 @@ static ssize_t nr_regions_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + regions = container_of(kobj, struct damon_sysfs_regions, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_regions_add_dirs(regions, nr); @@ -1440,8 +1442,7 @@ static ssize_t nr_targets_show(struct kobject *kobj, static ssize_t nr_targets_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_targets *targets = container_of(kobj, - struct damon_sysfs_targets, kobj); + struct damon_sysfs_targets *targets; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -1449,6 +1450,8 @@ static ssize_t nr_targets_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + targets = container_of(kobj, struct damon_sysfs_targets, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_targets_add_dirs(targets, nr); @@ -1962,8 +1965,7 @@ static ssize_t nr_contexts_show(struct kobject *kobj, static ssize_t nr_contexts_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_contexts *contexts = container_of(kobj, - struct damon_sysfs_contexts, kobj); + struct damon_sysfs_contexts *contexts; int nr, err; err = kstrtoint(buf, 0, &nr); @@ -1973,6 +1975,7 @@ static ssize_t nr_contexts_store(struct kobject *kobj, if (nr < 0 || 1 < nr) return -EINVAL; + contexts = container_of(kobj, struct damon_sysfs_contexts, kobj); if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_contexts_add_dirs(contexts, 
nr); @@ -2737,8 +2740,7 @@ static ssize_t nr_kdamonds_show(struct kobject *kobj, static ssize_t nr_kdamonds_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_kdamonds *kdamonds = container_of(kobj, - struct damon_sysfs_kdamonds, kobj); + struct damon_sysfs_kdamonds *kdamonds; int nr, err; err = kstrtoint(buf, 0, &nr); @@ -2747,6 +2749,8 @@ static ssize_t nr_kdamonds_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + kdamonds = container_of(kobj, struct damon_sysfs_kdamonds, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_kdamonds_add_dirs(kdamonds, nr); -- GitLab From e7fcac4cd2674fe6849c6ac8a51a7fc878a5e436 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Mon, 12 Sep 2022 23:11:53 +0800 Subject: [PATCH 1004/2223] mm/damon/sysfs: use the wrapper directly to check if the kdamond is running We can use the 'damon_sysfs_kdamond_running()' wrapper directly to check if the kdamond is running in 'damon_sysfs_turn_damon_on()'. 
Link: https://lkml.kernel.org/r/1662995513-24489-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index d27dad5affec7..da01befae8bd4 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2465,8 +2465,7 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) struct damon_ctx *ctx; int err; - if (kdamond->damon_ctx && - damon_sysfs_ctx_running(kdamond->damon_ctx)) + if (damon_sysfs_kdamond_running(kdamond)) return -EBUSY; if (damon_sysfs_cmd_request.kdamond == kdamond) return -EBUSY; -- GitLab From a18709442869e55c42969142d5abf6beb776dbba Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Mon, 12 Sep 2022 22:39:03 +0800 Subject: [PATCH 1005/2223] mm/damon: improve damon_new_region strategy Kdamond is implemented as a periodical split-merge pattern, which will create and destroy regions possibly at high frequency (hundreds or even thousands of times per second), depending on the number of regions and aggregation period. In that case, kmalloc and kfree could bring speed and space overheads, which can be improved by using a private kmem cache.
[set_pte_at@outlook.com: creating kmem cache for damon regions by KMEM_CACHE()] Link: https://lkml.kernel.org/r/Message-ID: Link: https://lkml.kernel.org/r/TYCP286MB2323DA1894FA55BB9CF90978CA449@TYCP286MB2323.JPNP286.PROD.OUTLOOK.COM Signed-off-by: Dawei Li Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 2437c61b0bc0b..c9ec2de845b32 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -29,6 +29,8 @@ static bool running_exclusive_ctxs; static DEFINE_MUTEX(damon_ops_lock); static struct damon_operations damon_registered_ops[NR_DAMON_OPS]; +static struct kmem_cache *damon_region_cache __ro_after_init; + /* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */ static bool __damon_is_registered_ops(enum damon_ops_id id) { @@ -119,7 +121,7 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) { struct damon_region *region; - region = kmalloc(sizeof(*region), GFP_KERNEL); + region = kmem_cache_alloc(damon_region_cache, GFP_KERNEL); if (!region) return NULL; @@ -148,7 +150,7 @@ static void damon_del_region(struct damon_region *r, struct damon_target *t) static void damon_free_region(struct damon_region *r) { - kfree(r); + kmem_cache_free(damon_region_cache, r); } void damon_destroy_region(struct damon_region *r, struct damon_target *t) @@ -1285,4 +1287,17 @@ bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) return true; } +static int __init damon_init(void) +{ + damon_region_cache = KMEM_CACHE(damon_region, 0); + if (unlikely(!damon_region_cache)) { + pr_err("creating damon_region_cache fails\n"); + return -ENOMEM; + } + + return 0; +} + +subsys_initcall(damon_init); + #include "core-test.h" -- GitLab From f635725c3905e755a8c3e2dc8cab7fcd0d38977f Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 13 Sep 2022 00:27:44 +0900 Subject: 
[PATCH 1006/2223] zram: do not waste zram_table_entry flags bits zram_table_entry::flags stores object size in the lower bits and zram pageflags in the upper bits. However, for some reason, we use 24 lower bits, while maximum zram object size is PAGE_SIZE, which requires PAGE_SHIFT bits (up to 16 on arm64). This wastes 24 - PAGE_SHIFT bits that we can use for additional zram pageflags instead. Also add a BUILD_BUG_ON() to alert us should we run out of bits in zram_table_entry::flags. Link: https://lkml.kernel.org/r/20220912152744.527438-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Reviewed-by: Brian Geffon Acked-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 2 ++ drivers/block/zram/zram_drv.h | 15 +++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 607f4634c27da..eb021db21ddfb 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2131,6 +2131,8 @@ static int __init zram_init(void) { int ret; + BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG); + ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare", zcomp_cpu_up_prepare, zcomp_cpu_dead); if (ret < 0) diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 80c3b43b4828f..a2bda53020fdd 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -30,16 +30,15 @@ /* - * The lower ZRAM_FLAG_SHIFT bits of table.flags is for - * object size (excluding header), the higher bits is for - * zram_pageflags. + * ZRAM is mainly used for memory efficiency so we want to keep memory + * footprint small and thus squeeze size and zram pageflags into a flags + * member. The lower ZRAM_FLAG_SHIFT bits is for object size (excluding + * header), which cannot be larger than PAGE_SIZE (requiring PAGE_SHIFT + * bits), the higher bits are for zram_pageflags. 
* - * zram is mainly used for memory efficiency so we want to keep memory - * footprint small so we can squeeze size and flags into a field. - * The lower ZRAM_FLAG_SHIFT bits is for object size (excluding header), - * the higher bits is for zram_pageflags. + * We use BUILD_BUG_ON() to make sure that zram pageflags don't overflow. */ -#define ZRAM_FLAG_SHIFT 24 +#define ZRAM_FLAG_SHIFT (PAGE_SHIFT + 1) /* Flags for zram pages (table[page_no].flags) */ enum zram_pageflags { -- GitLab From f9bceb2f4114fe9a9725c922f9f1500d173d4763 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 14 Sep 2022 14:20:33 +0900 Subject: [PATCH 1007/2223] zram: keep comments within 80-columns limit Several trivial fixups (that I should have spotted during review). Link: https://lkml.kernel.org/r/20220914052033.838050-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index eb021db21ddfb..43eeef2b9fbe2 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -329,8 +329,8 @@ static ssize_t idle_store(struct device *dev, if (!sysfs_streq(buf, "all")) { /* - * If it did not parse as 'all' try to treat it as an integer when - * we have memory tracking enabled. + * If it did not parse as 'all' try to treat it as an integer + * when we have memory tracking enabled. */ u64 age_sec; @@ -345,7 +345,10 @@ static ssize_t idle_store(struct device *dev, if (!init_done(zram)) goto out_unlock; - /* A cutoff_time of 0 marks everything as idle, this is the "all" behavior */ + /* + * A cutoff_time of 0 marks everything as idle, this is the + * "all" behavior. 
+ */ mark_idle(zram, cutoff_time); rv = len; @@ -1416,11 +1419,11 @@ compress_again: if (comp_len != PAGE_SIZE) goto compress_again; /* - * If the page is not compressible, you need to acquire the lock and - * execute the code below. The zcomp_stream_get() call is needed to - * disable the cpu hotplug and grab the zstrm buffer back. - * It is necessary that the dereferencing of the zstrm variable below - * occurs correctly. + * If the page is not compressible, you need to acquire the + * lock and execute the code below. The zcomp_stream_get() + * call is needed to disable the cpu hotplug and grab the + * zstrm buffer back. It is necessary that the dereferencing + * of the zstrm variable below occurs correctly. */ zstrm = zcomp_stream_get(zram->comp); } -- GitLab From 3791bc7bf1034dcce89541e54630d0307cc199fb Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Sun, 11 Sep 2022 08:59:17 +0800 Subject: [PATCH 1008/2223] mm/damon: simplify scheme create in damon_lru_sort_apply_parameters In damon_lru_sort_apply_parameters(), we can use damon_set_schemes() to replace the way of creating the first 'scheme' in original code, this makes the code look cleaner. 
Link: https://lkml.kernel.org/r/20220911005917.835-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 8415e18fcf0ef..307ba71adcfa9 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -350,7 +350,7 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) static int damon_lru_sort_apply_parameters(void) { - struct damos *scheme, *next_scheme; + struct damos *scheme; struct damon_addr_range addr_range; unsigned int hot_thres, cold_thres; int err = 0; @@ -360,17 +360,15 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; - /* free previously set schemes */ - damon_for_each_scheme_safe(scheme, next_scheme, ctx) - damon_destroy_scheme(scheme); - /* aggr_interval / sample_interval is the maximum nr_accesses */ hot_thres = aggr_interval / sample_interval * hot_thres_access_freq / 1000; scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!scheme) return -ENOMEM; - damon_add_scheme(ctx, scheme); + err = damon_set_schemes(ctx, &scheme, 1); + if (err) + return err; cold_thres = cold_min_age / aggr_interval; scheme = damon_lru_sort_new_cold_scheme(cold_thres); -- GitLab From f82e70e26b505cd8a1d5c670dc5038a938708d4a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:28 +0000 Subject: [PATCH 1009/2223] mm/damon/paddr: make supported DAMOS actions of paddr clear Patch series "mm/damon: cleanup code". DAMON code was not so clean from the beginning, but it has been too much nowadays, especially due to the duplicates in DAMON_RECLAIM and DAMON_LRU_SORT. This patchset cleans some of the mess. 
This patch (of 22): The 'switch-case' statement in 'damon_va_apply_scheme()' function provides a 'case' for every supported DAMOS action while all not-yet-supported DAMOS actions fall through the 'default' case, and comments it so that people can easily know which actions are supported. Its counterpart in 'paddr', 'damon_pa_apply_scheme()', however, doesn't. This commit makes the 'paddr' side function follow the pattern of 'vaddr' for better readability and consistency. Link: https://lkml.kernel.org/r/20220913174449.50645-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220913174449.50645-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 6b0d9e6aa6770..219127cb49e2e 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -275,7 +275,10 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, return damon_pa_mark_accessed(r); case DAMOS_LRU_DEPRIO: return damon_pa_deactivate_pages(r); + case DAMOS_STAT: + break; default: + /* DAMOS actions that not yet supported by 'paddr'. */ break; } return 0; -- GitLab From 8193321ac90d525b33815c77faae7d2d12042c03 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:29 +0000 Subject: [PATCH 1010/2223] mm/damon/paddr: deduplicate damon_pa_{mark_accessed,deactivate_pages}() The bodies of damon_pa_{mark_accessed,deactivate_pages}() contain duplicates. This commit factors out the common part to a separate function and removes the duplicates.
Link: https://lkml.kernel.org/r/20220913174449.50645-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 219127cb49e2e..1ada62db68b13 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -232,7 +232,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r) return applied * PAGE_SIZE; } -static unsigned long damon_pa_mark_accessed(struct damon_region *r) +static inline unsigned long damon_pa_mark_accessed_or_deactivate( + struct damon_region *r, bool mark_accessed) { unsigned long addr, applied = 0; @@ -241,27 +242,24 @@ static unsigned long damon_pa_mark_accessed(struct damon_region *r) if (!page) continue; - mark_page_accessed(page); + if (mark_accessed) + mark_page_accessed(page); + else + deactivate_page(page); put_page(page); applied++; } return applied * PAGE_SIZE; } -static unsigned long damon_pa_deactivate_pages(struct damon_region *r) +static unsigned long damon_pa_mark_accessed(struct damon_region *r) { - unsigned long addr, applied = 0; - - for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { - struct page *page = damon_get_page(PHYS_PFN(addr)); + return damon_pa_mark_accessed_or_deactivate(r, true); +} - if (!page) - continue; - deactivate_page(page); - put_page(page); - applied++; - } - return applied * PAGE_SIZE; +static unsigned long damon_pa_deactivate_pages(struct damon_region *r) +{ + return damon_pa_mark_accessed_or_deactivate(r, false); } static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, -- GitLab From 02f17037fc6e38ca1c00ac87a112372a3867ba45 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:30 +0000 Subject: [PATCH 1011/2223] mm/damon/core: copy struct-to-struct instead of field-to-field in damon_new_scheme() The function for new 'struct damos' creation, 'damon_new_scheme()', copies each field of the struct 
one by one, though it could simply be copied via struct to struct. This commit replaces the unnecessarily verbose field-to-field copies with struct-to-struct copies to make code simple and short. Link: https://lkml.kernel.org/r/20220913174449.50645-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index c9ec2de845b32..a564f83e9efe7 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -272,22 +272,13 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); if (!scheme) return NULL; - scheme->pattern.min_sz_region = pattern->min_sz_region; - scheme->pattern.max_sz_region = pattern->max_sz_region; - scheme->pattern.min_nr_accesses = pattern->min_nr_accesses; - scheme->pattern.max_nr_accesses = pattern->max_nr_accesses; - scheme->pattern.min_age_region = pattern->min_age_region; - scheme->pattern.max_age_region = pattern->max_age_region; + scheme->pattern = *pattern; scheme->action = action; scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); - scheme->quota.ms = quota->ms; - scheme->quota.sz = quota->sz; - scheme->quota.reset_interval = quota->reset_interval; - scheme->quota.weight_sz = quota->weight_sz; - scheme->quota.weight_nr_accesses = quota->weight_nr_accesses; - scheme->quota.weight_age = quota->weight_age; + scheme->quota = *quota; + /* caller might not zero-initialized the private fileds */ scheme->quota.total_charged_sz = 0; scheme->quota.total_charged_ns = 0; scheme->quota.esz = 0; @@ -296,11 +287,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme->quota.charge_target_from = NULL; scheme->quota.charge_addr_from = 0; - scheme->wmarks.metric = wmarks->metric; - scheme->wmarks.interval = wmarks->interval; - scheme->wmarks.high = wmarks->high; - scheme->wmarks.mid = wmarks->mid; -
scheme->wmarks.low = wmarks->low; + scheme->wmarks = *wmarks; scheme->wmarks.activated = true; return scheme; -- GitLab From 70e0c1d1bf945328915f52f7132b2d6ee8f25d46 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:31 +0000 Subject: [PATCH 1012/2223] mm/damon/core: factor out 'damos_quota' private fileds initialization The 'struct damos' creation function, 'damon_new_scheme()', does initialization of private fields of 'struct damos_quota' in it. As it is verbose and makes the function unnecessarily long, this commit factors it out to a separate function. Link: https://lkml.kernel.org/r/20220913174449.50645-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index a564f83e9efe7..6d9f4c2dee35c 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -263,6 +263,19 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, return 0; } +/* initialize private fields of damos_quota and return the pointer */ +static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota) +{ + quota->total_charged_sz = 0; + quota->total_charged_ns = 0; + quota->esz = 0; + quota->charged_sz = 0; + quota->charged_from = 0; + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + return quota; +} + struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, struct damos_quota *quota, struct damos_watermarks *wmarks) @@ -277,15 +290,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); - scheme->quota = *quota; - /* caller might not zero-initialized the private fileds */ - scheme->quota.total_charged_sz = 0; - scheme->quota.total_charged_ns = 0; - scheme->quota.esz = 0; - scheme->quota.charged_sz = 0; - scheme->quota.charged_from = 0; -
scheme->quota.charge_target_from = NULL; - scheme->quota.charge_addr_from = 0; + scheme->quota = *(damos_quota_init_priv(quota)); scheme->wmarks = *wmarks; scheme->wmarks.activated = true; -- GitLab From cbeaa77b044938cfe91818821ece6b0b1511e967 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:32 +0000 Subject: [PATCH 1013/2223] mm/damon/core: use a dedicated struct for monitoring attributes DAMON monitoring attributes are directly defined as fields of 'struct damon_ctx'. This makes 'struct damon_ctx' a little long and complicated. This commit defines and uses a struct, 'struct damon_attrs', which is dedicated for only the monitoring attributes to make the purpose of the five values clearer and simplify 'struct damon_ctx'. Link: https://lkml.kernel.org/r/20220913174449.50645-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 30 ++++++++++++++++++++---------- mm/damon/core.c | 34 +++++++++++++++++----------------- mm/damon/dbgfs.c | 6 +++--- mm/damon/ops-common.c | 4 ++-- mm/damon/vaddr.c | 4 ++-- 5 files changed, 44 insertions(+), 34 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 016b6c9c03d62..2ceee8b07726b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -389,13 +389,15 @@ struct damon_callback { }; /** - * struct damon_ctx - Represents a context for each monitoring. This is the - * main interface that allows users to set the attributes and get the results - * of the monitoring. + * struct damon_attrs - Monitoring attributes for accuracy/overhead control. * * @sample_interval: The time between access samplings. * @aggr_interval: The time between monitor results aggregations. * @ops_update_interval: The time between monitoring operations updates. + * @min_nr_regions: The minimum number of adaptive monitoring + * regions. + * @max_nr_regions: The maximum number of adaptive monitoring + * regions. 
* * For each @sample_interval, DAMON checks whether each region is accessed or * not. It aggregates and keeps the access information (number of accesses to @@ -405,7 +407,21 @@ struct damon_callback { * @ops_update_interval. All time intervals are in micro-seconds. * Please refer to &struct damon_operations and &struct damon_callback for more * detail. + */ +struct damon_attrs { + unsigned long sample_interval; + unsigned long aggr_interval; + unsigned long ops_update_interval; + unsigned long min_nr_regions; + unsigned long max_nr_regions; +}; + +/** + * struct damon_ctx - Represents a context for each monitoring. This is the + * main interface that allows users to set the attributes and get the results + * of the monitoring. * + * @attrs: Monitoring attributes for accuracy/overhead control. * @kdamond: Kernel thread who does the monitoring. * @kdamond_lock: Mutex for the synchronizations with @kdamond. * @@ -427,15 +443,11 @@ struct damon_callback { * @ops: Set of monitoring operations for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @min_nr_regions: The minimum number of adaptive monitoring regions. - * @max_nr_regions: The maximum number of adaptive monitoring regions. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. 
*/ struct damon_ctx { - unsigned long sample_interval; - unsigned long aggr_interval; - unsigned long ops_update_interval; + struct damon_attrs attrs; /* private: internal use only */ struct timespec64 last_aggregation; @@ -448,8 +460,6 @@ struct damon_ctx { struct damon_operations ops; struct damon_callback callback; - unsigned long min_nr_regions; - unsigned long max_nr_regions; struct list_head adaptive_targets; struct list_head schemes; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 6d9f4c2dee35c..bbd4c2d991dda 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -382,17 +382,17 @@ struct damon_ctx *damon_new_ctx(void) if (!ctx) return NULL; - ctx->sample_interval = 5 * 1000; - ctx->aggr_interval = 100 * 1000; - ctx->ops_update_interval = 60 * 1000 * 1000; + ctx->attrs.sample_interval = 5 * 1000; + ctx->attrs.aggr_interval = 100 * 1000; + ctx->attrs.ops_update_interval = 60 * 1000 * 1000; ktime_get_coarse_ts64(&ctx->last_aggregation); ctx->last_ops_update = ctx->last_aggregation; mutex_init(&ctx->kdamond_lock); - ctx->min_nr_regions = 10; - ctx->max_nr_regions = 1000; + ctx->attrs.min_nr_regions = 10; + ctx->attrs.max_nr_regions = 1000; INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); @@ -448,11 +448,11 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, if (min_nr_reg > max_nr_reg) return -EINVAL; - ctx->sample_interval = sample_int; - ctx->aggr_interval = aggr_int; - ctx->ops_update_interval = ops_upd_int; - ctx->min_nr_regions = min_nr_reg; - ctx->max_nr_regions = max_nr_reg; + ctx->attrs.sample_interval = sample_int; + ctx->attrs.aggr_interval = aggr_int; + ctx->attrs.ops_update_interval = ops_upd_int; + ctx->attrs.min_nr_regions = min_nr_reg; + ctx->attrs.max_nr_regions = max_nr_reg; return 0; } @@ -507,8 +507,8 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) sz += r->ar.end - r->ar.start; } - if (ctx->min_nr_regions) - sz /= ctx->min_nr_regions; + if (ctx->attrs.min_nr_regions) + sz 
/= ctx->attrs.min_nr_regions; if (sz < DAMON_MIN_REGION) sz = DAMON_MIN_REGION; @@ -657,7 +657,7 @@ static bool damon_check_reset_time_interval(struct timespec64 *baseline, static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) { return damon_check_reset_time_interval(&ctx->last_aggregation, - ctx->aggr_interval); + ctx->attrs.aggr_interval); } /* @@ -1016,12 +1016,12 @@ static void kdamond_split_regions(struct damon_ctx *ctx) damon_for_each_target(t, ctx) nr_regions += damon_nr_regions(t); - if (nr_regions > ctx->max_nr_regions / 2) + if (nr_regions > ctx->attrs.max_nr_regions / 2) return; /* Maybe the middle of the region has different access frequency */ if (last_nr_regions == nr_regions && - nr_regions < ctx->max_nr_regions / 3) + nr_regions < ctx->attrs.max_nr_regions / 3) nr_subregions = 3; damon_for_each_target(t, ctx) @@ -1039,7 +1039,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx) static bool kdamond_need_update_operations(struct damon_ctx *ctx) { return damon_check_reset_time_interval(&ctx->last_ops_update, - ctx->ops_update_interval); + ctx->attrs.ops_update_interval); } /* @@ -1188,7 +1188,7 @@ static int kdamond_fn(void *data) continue; } - kdamond_usleep(ctx->sample_interval); + kdamond_usleep(ctx->attrs.sample_interval); if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 1422037cedd2b..74e7542af6d34 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -55,9 +55,9 @@ static ssize_t dbgfs_attrs_read(struct file *file, mutex_lock(&ctx->kdamond_lock); ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", - ctx->sample_interval, ctx->aggr_interval, - ctx->ops_update_interval, ctx->min_nr_regions, - ctx->max_nr_regions); + ctx->attrs.sample_interval, ctx->attrs.aggr_interval, + ctx->attrs.ops_update_interval, + ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions); mutex_unlock(&ctx->kdamond_lock); return simple_read_from_buffer(buf, 
count, ppos, kbuf, ret); diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index f599838b5f648..9310df72e1c54 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -99,10 +99,10 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, unsigned int age_weight = s->quota.weight_age; int hotness; - max_nr_accesses = c->aggr_interval / c->sample_interval; + max_nr_accesses = c->attrs.aggr_interval / c->attrs.sample_interval; freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; - age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; + age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000; for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; age_in_log++, age_in_sec >>= 1) ; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index c2c08c1b316bd..0eae47bd9ccbb 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -251,8 +251,8 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, for (i = 0; i < 3; i++) sz += regions[i].end - regions[i].start; - if (ctx->min_nr_regions) - sz /= ctx->min_nr_regions; + if (ctx->attrs.min_nr_regions) + sz /= ctx->attrs.min_nr_regions; if (sz < DAMON_MIN_REGION) sz = DAMON_MIN_REGION; -- GitLab From bead3b00088eb8016b32cafa7e0701b3283e68a4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:33 +0000 Subject: [PATCH 1014/2223] mm/damon/core: reduce parameters for damon_set_attrs() Number of parameters for 'damon_set_attrs()' is six. As it could be confusing and verbose, this commit reduces the number by receiving single pointer to a 'struct damon_attrs'. 
Link: https://lkml.kernel.org/r/20220913174449.50645-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 +--- mm/damon/core.c | 21 +++++---------------- mm/damon/dbgfs.c | 9 ++++++--- mm/damon/lru_sort.c | 10 ++++++++-- mm/damon/reclaim.c | 10 ++++++++-- mm/damon/sysfs.c | 12 ++++++++---- 6 files changed, 36 insertions(+), 30 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 2ceee8b07726b..c5dc0c77c7722 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -540,9 +540,7 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); -int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long ops_upd_int, - unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs); int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); diff --git a/mm/damon/core.c b/mm/damon/core.c index bbd4c2d991dda..29635a82cb691 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -428,32 +428,21 @@ void damon_destroy_ctx(struct damon_ctx *ctx) /** * damon_set_attrs() - Set attributes for the monitoring. * @ctx: monitoring context - * @sample_int: time interval between samplings - * @aggr_int: time interval between aggregations - * @ops_upd_int: time interval between monitoring operations updates - * @min_nr_reg: minimal number of regions - * @max_nr_reg: maximum number of regions + * @attrs: monitoring attributes * * This function should not be called while the kdamond is running. * Every time interval is in micro-seconds. * * Return: 0 on success, negative error code otherwise. 
*/ -int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long ops_upd_int, - unsigned long min_nr_reg, unsigned long max_nr_reg) +int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) { - if (min_nr_reg < 3) + if (attrs->min_nr_regions < 3) return -EINVAL; - if (min_nr_reg > max_nr_reg) + if (attrs->min_nr_regions > attrs->max_nr_regions) return -EINVAL; - ctx->attrs.sample_interval = sample_int; - ctx->attrs.aggr_interval = aggr_int; - ctx->attrs.ops_update_interval = ops_upd_int; - ctx->attrs.min_nr_regions = min_nr_reg; - ctx->attrs.max_nr_regions = max_nr_reg; - + ctx->attrs = *attrs; return 0; } diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 74e7542af6d34..c00eba4448d85 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -67,7 +67,7 @@ static ssize_t dbgfs_attrs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct damon_ctx *ctx = file->private_data; - unsigned long s, a, r, minr, maxr; + struct damon_attrs attrs; char *kbuf; ssize_t ret; @@ -76,7 +76,10 @@ static ssize_t dbgfs_attrs_write(struct file *file, return PTR_ERR(kbuf); if (sscanf(kbuf, "%lu %lu %lu %lu %lu", - &s, &a, &r, &minr, &maxr) != 5) { + &attrs.sample_interval, &attrs.aggr_interval, + &attrs.ops_update_interval, + &attrs.min_nr_regions, + &attrs.max_nr_regions) != 5) { ret = -EINVAL; goto out; } @@ -87,7 +90,7 @@ static ssize_t dbgfs_attrs_write(struct file *file, goto unlock_out; } - ret = damon_set_attrs(ctx, s, a, r, minr, maxr); + ret = damon_set_attrs(ctx, &attrs); if (!ret) ret = count; unlock_out: diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 307ba71adcfa9..6d5f83965276f 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -350,13 +350,19 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) static int damon_lru_sort_apply_parameters(void) { + struct damon_attrs attrs = { + .sample_interval = sample_interval, + .aggr_interval 
= aggr_interval, + .ops_update_interval = 0, + .min_nr_regions = min_nr_regions, + .max_nr_regions = max_nr_regions, + }; struct damos *scheme; struct damon_addr_range addr_range; unsigned int hot_thres, cold_thres; int err = 0; - err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, - min_nr_regions, max_nr_regions); + err = damon_set_attrs(ctx, &attrs); if (err) return err; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index fe7bc0c55ecb3..bc841efbab45e 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -275,12 +275,18 @@ static struct damos *damon_reclaim_new_scheme(void) static int damon_reclaim_apply_parameters(void) { + struct damon_attrs attrs = { + .sample_interval = sample_interval, + .aggr_interval = aggr_interval, + .ops_update_interval = 0, + .min_nr_regions = min_nr_regions, + .max_nr_regions = max_nr_regions, + }; struct damos *scheme; struct damon_addr_range addr_range; int err = 0; - err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, - min_nr_regions, max_nr_regions); + err = damon_set_attrs(ctx, &attrs); if (err) return err; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index da01befae8bd4..3dbf3804ec88b 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2130,10 +2130,14 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals; struct damon_sysfs_ul_range *sys_nr_regions = sys_attrs->nr_regions_range; - - return damon_set_attrs(ctx, sys_intervals->sample_us, - sys_intervals->aggr_us, sys_intervals->update_us, - sys_nr_regions->min, sys_nr_regions->max); + struct damon_attrs attrs = { + .sample_interval = sys_intervals->sample_us, + .aggr_interval = sys_intervals->aggr_us, + .ops_update_interval = sys_intervals->update_us, + .min_nr_regions = sys_nr_regions->min, + .max_nr_regions = sys_nr_regions->max, + }; + return damon_set_attrs(ctx, &attrs); } static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) -- GitLab From 
8c341ae3341188a0bcef02f05aca7345501ce697 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:34 +0000 Subject: [PATCH 1015/2223] mm/damon/reclaim: use 'struct damon_attrs' for storing parameters for it DAMON_RECLAIM receives monitoring attributes by parameters one by one to separate variables, and then combine those into 'struct damon_attrs'. This commit makes the module directly stores the parameter values to a static 'struct damon_attrs' variable and use it to simplify the code. Link: https://lkml.kernel.org/r/20220913174449.50645-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index bc841efbab45e..d35a00d8dde2d 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -129,14 +129,22 @@ module_param(wmarks_mid, ulong, 0600); static unsigned long wmarks_low __read_mostly = 200; module_param(wmarks_low, ulong, 0600); +static struct damon_attrs damon_reclaim_mon_attrs = { + .sample_interval = 5000, + .aggr_interval = 100000, + .ops_update_interval = 0, + .min_nr_regions = 10, + .max_nr_regions = 1000, +}; + /* * Sampling interval for the monitoring in microseconds. * * The sampling interval of DAMON for the cold memory monitoring. Please refer * to the DAMON documentation for more detail. 5 ms by default. */ -static unsigned long sample_interval __read_mostly = 5000; -module_param(sample_interval, ulong, 0600); +module_param_named(sample_interval, damon_reclaim_mon_attrs.sample_interval, + ulong, 0600); /* * Aggregation interval for the monitoring in microseconds. @@ -144,8 +152,8 @@ module_param(sample_interval, ulong, 0600); * The aggregation interval of DAMON for the cold memory monitoring. Please * refer to the DAMON documentation for more detail. 100 ms by default. 
*/ -static unsigned long aggr_interval __read_mostly = 100000; -module_param(aggr_interval, ulong, 0600); +module_param_named(aggr_interval, damon_reclaim_mon_attrs.aggr_interval, ulong, + 0600); /* * Minimum number of monitoring regions. @@ -155,8 +163,8 @@ module_param(aggr_interval, ulong, 0600); * But, setting this too high could result in increased monitoring overhead. * Please refer to the DAMON documentation for more detail. 10 by default. */ -static unsigned long min_nr_regions __read_mostly = 10; -module_param(min_nr_regions, ulong, 0600); +module_param_named(min_nr_regions, damon_reclaim_mon_attrs.min_nr_regions, + ulong, 0600); /* * Maximum number of monitoring regions. @@ -166,8 +174,8 @@ module_param(min_nr_regions, ulong, 0600); * However, setting this too low could result in bad monitoring quality. * Please refer to the DAMON documentation for more detail. 1000 by default. */ -static unsigned long max_nr_regions __read_mostly = 1000; -module_param(max_nr_regions, ulong, 0600); +module_param_named(max_nr_regions, damon_reclaim_mon_attrs.max_nr_regions, + ulong, 0600); /* * Start of the target memory region in physical address. 
@@ -239,7 +247,8 @@ static struct damos *damon_reclaim_new_scheme(void) .min_nr_accesses = 0, .max_nr_accesses = 0, /* for min_age or more micro-seconds */ - .min_age_region = min_age / aggr_interval, + .min_age_region = min_age / + damon_reclaim_mon_attrs.aggr_interval, .max_age_region = UINT_MAX, }; struct damos_watermarks wmarks = { @@ -275,18 +284,11 @@ static struct damos *damon_reclaim_new_scheme(void) static int damon_reclaim_apply_parameters(void) { - struct damon_attrs attrs = { - .sample_interval = sample_interval, - .aggr_interval = aggr_interval, - .ops_update_interval = 0, - .min_nr_regions = min_nr_regions, - .max_nr_regions = max_nr_regions, - }; struct damos *scheme; struct damon_addr_range addr_range; int err = 0; - err = damon_set_attrs(ctx, &attrs); + err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs); if (err) return err; -- GitLab From 135e128f8e48f30ea65e0ffad34dca37d2c8d171 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:35 +0000 Subject: [PATCH 1016/2223] mm/damon/lru_sort: use 'struct damon_attrs' for storing parameters for it DAMON_LRU_SORT receives monitoring attributes by parameters one by one to separate variables, and then combines those into 'struct damon_attrs'. This commit makes the module directly stores the parameter values to a static 'struct damon_attrs' variable and use it to simplify the code. 
Link: https://lkml.kernel.org/r/20220913174449.50645-9-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 6d5f83965276f..ade985b836527 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -127,14 +127,22 @@ module_param(wmarks_mid, ulong, 0600); static unsigned long wmarks_low __read_mostly = 50; module_param(wmarks_low, ulong, 0600); +static struct damon_attrs damon_lru_sort_mon_attrs = { + .sample_interval = 5000, + .aggr_interval = 100000, + .ops_update_interval = 0, + .min_nr_regions = 10, + .max_nr_regions = 1000, +}; + /* * Sampling interval for the monitoring in microseconds. * * The sampling interval of DAMON for the hot/cold memory monitoring. Please * refer to the DAMON documentation for more detail. 5 ms by default. */ -static unsigned long sample_interval __read_mostly = 5000; -module_param(sample_interval, ulong, 0600); +module_param_named(sample_interval, damon_lru_sort_mon_attrs.sample_interval, + ulong, 0600); /* * Aggregation interval for the monitoring in microseconds. @@ -142,8 +150,8 @@ module_param(sample_interval, ulong, 0600); * The aggregation interval of DAMON for the hot/cold memory monitoring. * Please refer to the DAMON documentation for more detail. 100 ms by default. */ -static unsigned long aggr_interval __read_mostly = 100000; -module_param(aggr_interval, ulong, 0600); +module_param_named(aggr_interval, damon_lru_sort_mon_attrs.aggr_interval, ulong, + 0600); /* * Minimum number of monitoring regions. @@ -153,8 +161,8 @@ module_param(aggr_interval, ulong, 0600); * But, setting this too high could result in increased monitoring overhead. * Please refer to the DAMON documentation for more detail. 10 by default. 
*/ -static unsigned long min_nr_regions __read_mostly = 10; -module_param(min_nr_regions, ulong, 0600); +module_param_named(min_nr_regions, damon_lru_sort_mon_attrs.min_nr_regions, + ulong, 0600); /* * Maximum number of monitoring regions. @@ -164,8 +172,8 @@ module_param(min_nr_regions, ulong, 0600); * However, setting this too low could result in bad monitoring quality. * Please refer to the DAMON documentation for more detail. 1000 by default. */ -static unsigned long max_nr_regions __read_mostly = 1000; -module_param(max_nr_regions, ulong, 0600); +module_param_named(max_nr_regions, damon_lru_sort_mon_attrs.max_nr_regions, + ulong, 0600); /* * Start of the target memory region in physical address. @@ -350,25 +358,19 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) static int damon_lru_sort_apply_parameters(void) { - struct damon_attrs attrs = { - .sample_interval = sample_interval, - .aggr_interval = aggr_interval, - .ops_update_interval = 0, - .min_nr_regions = min_nr_regions, - .max_nr_regions = max_nr_regions, - }; struct damos *scheme; struct damon_addr_range addr_range; unsigned int hot_thres, cold_thres; int err = 0; - err = damon_set_attrs(ctx, &attrs); + err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs); if (err) return err; /* aggr_interval / sample_interval is the maximum nr_accesses */ - hot_thres = aggr_interval / sample_interval * hot_thres_access_freq / - 1000; + hot_thres = damon_lru_sort_mon_attrs.aggr_interval / + damon_lru_sort_mon_attrs.sample_interval * + hot_thres_access_freq / 1000; scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!scheme) return -ENOMEM; @@ -376,7 +378,7 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; - cold_thres = cold_min_age / aggr_interval; + cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; scheme = damon_lru_sort_new_cold_scheme(cold_thres); if (!scheme) return -ENOMEM; -- GitLab From b3c28d886329d8df66679f72f3f3c81c0dd21e88 Mon Sep 
17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:36 +0000 Subject: [PATCH 1017/2223] mm/damon: implement a monitoring attributes module parameters generator macro DAMON_RECLAIM and DAMON_LRU_SORT have module parameters for monitoring attributes that have the same names. This commit implements a macro for generating such module parameters so that we can reuse it later. Link: https://lkml.kernel.org/r/20220913174449.50645-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 mm/damon/modules-common.h diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h new file mode 100644 index 0000000000000..0abd0636bc649 --- /dev/null +++ b/mm/damon/modules-common.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for DAMON Modules + * + * Author: SeongJae Park + */ + +#include + +#define DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(attrs) \ + module_param_named(sample_interval, attrs.sample_interval, \ + ulong, 0600); \ + module_param_named(aggr_interval, attrs.aggr_interval, ulong, \ + 0600); \ + module_param_named(min_nr_regions, attrs.min_nr_regions, ulong, \ + 0600); \ + module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ + 0600); -- GitLab From 95f7c05d73fc6d9cfe43fb18b2f16b21eb55b5bf Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:37 +0000 Subject: [PATCH 1018/2223] mm/damon/lru_sort: use monitoring attributes parameters generator macro This commit makes DAMON_LRU_SORT to generate the module parameters for DAMON monitoring attributes using the generator macro to simplify the code and reduce duplicates.
Link: https://lkml.kernel.org/r/20220913174449.50645-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 47 +++++---------------------------------------- 1 file changed, 5 insertions(+), 42 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index ade985b836527..e95626acee6f9 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -13,6 +13,8 @@ #include #include +#include "modules-common.h" + #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif @@ -128,52 +130,13 @@ static unsigned long wmarks_low __read_mostly = 50; module_param(wmarks_low, ulong, 0600); static struct damon_attrs damon_lru_sort_mon_attrs = { - .sample_interval = 5000, - .aggr_interval = 100000, + .sample_interval = 5000, /* 5 ms */ + .aggr_interval = 100000, /* 100 ms */ .ops_update_interval = 0, .min_nr_regions = 10, .max_nr_regions = 1000, }; - -/* - * Sampling interval for the monitoring in microseconds. - * - * The sampling interval of DAMON for the hot/cold memory monitoring. Please - * refer to the DAMON documentation for more detail. 5 ms by default. - */ -module_param_named(sample_interval, damon_lru_sort_mon_attrs.sample_interval, - ulong, 0600); - -/* - * Aggregation interval for the monitoring in microseconds. - * - * The aggregation interval of DAMON for the hot/cold memory monitoring. - * Please refer to the DAMON documentation for more detail. 100 ms by default. - */ -module_param_named(aggr_interval, damon_lru_sort_mon_attrs.aggr_interval, ulong, - 0600); - -/* - * Minimum number of monitoring regions. - * - * The minimal number of monitoring regions of DAMON for the hot/cold memory - * monitoring. This can be used to set lower-bound of the monitoring quality. - * But, setting this too high could result in increased monitoring overhead. - * Please refer to the DAMON documentation for more detail. 10 by default. 
- */ -module_param_named(min_nr_regions, damon_lru_sort_mon_attrs.min_nr_regions, - ulong, 0600); - -/* - * Maximum number of monitoring regions. - * - * The maximum number of monitoring regions of DAMON for the hot/cold memory - * monitoring. This can be used to set upper-bound of the monitoring overhead. - * However, setting this too low could result in bad monitoring quality. - * Please refer to the DAMON documentation for more detail. 1000 by default. - */ -module_param_named(max_nr_regions, damon_lru_sort_mon_attrs.max_nr_regions, - ulong, 0600); +DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs); /* * Start of the target memory region in physical address. -- GitLab From fdfc119c17cfbc0aa26be6b070f49aa1584a7e08 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:38 +0000 Subject: [PATCH 1019/2223] mm/damon/reclaim: use monitoring attributes parameters generator macro This commit makes DAMON_RECLAIM to generate the module parameters for DAMON monitoring attributes using the generator macro to simplify the code and reduce duplicates. 
Link: https://lkml.kernel.org/r/20220913174449.50645-12-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 47 +++++----------------------------------------- 1 file changed, 5 insertions(+), 42 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index d35a00d8dde2d..48326bef20f51 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -13,6 +13,8 @@ #include #include +#include "modules-common.h" + #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif @@ -130,52 +132,13 @@ static unsigned long wmarks_low __read_mostly = 200; module_param(wmarks_low, ulong, 0600); static struct damon_attrs damon_reclaim_mon_attrs = { - .sample_interval = 5000, - .aggr_interval = 100000, + .sample_interval = 5000, /* 5 ms */ + .aggr_interval = 100000, /* 100 ms */ .ops_update_interval = 0, .min_nr_regions = 10, .max_nr_regions = 1000, }; - -/* - * Sampling interval for the monitoring in microseconds. - * - * The sampling interval of DAMON for the cold memory monitoring. Please refer - * to the DAMON documentation for more detail. 5 ms by default. - */ -module_param_named(sample_interval, damon_reclaim_mon_attrs.sample_interval, - ulong, 0600); - -/* - * Aggregation interval for the monitoring in microseconds. - * - * The aggregation interval of DAMON for the cold memory monitoring. Please - * refer to the DAMON documentation for more detail. 100 ms by default. - */ -module_param_named(aggr_interval, damon_reclaim_mon_attrs.aggr_interval, ulong, - 0600); - -/* - * Minimum number of monitoring regions. - * - * The minimal number of monitoring regions of DAMON for the cold memory - * monitoring. This can be used to set lower-bound of the monitoring quality. - * But, setting this too high could result in increased monitoring overhead. - * Please refer to the DAMON documentation for more detail. 10 by default. 
- */ -module_param_named(min_nr_regions, damon_reclaim_mon_attrs.min_nr_regions, - ulong, 0600); - -/* - * Maximum number of monitoring regions. - * - * The maximum number of monitoring regions of DAMON for the cold memory - * monitoring. This can be used to set upper-bound of the monitoring overhead. - * However, setting this too low could result in bad monitoring quality. - * Please refer to the DAMON documentation for more detail. 1000 by default. - */ -module_param_named(max_nr_regions, damon_reclaim_mon_attrs.max_nr_regions, - ulong, 0600); +DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_reclaim_mon_attrs); /* * Start of the target memory region in physical address. -- GitLab From b324ee36e9685689a55c1faee669cd7a1a42bae0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:39 +0000 Subject: [PATCH 1020/2223] mm/damon/modules-common: implement a watermarks module parameters generator macro DAMON_RECLAIM and DAMON_LRU_SORT have module parameters for watermarks that have the same names. This commit implements a macro for generating such module parameters so that we can reuse it later.
Link: https://lkml.kernel.org/r/20220913174449.50645-13-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 0abd0636bc649..1370590a37d18 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -16,3 +16,10 @@ 0600); \ module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ 0600); + +#define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ + module_param_named(wmarks_interval, wmarks->interval, ulong, \ + 0600); \ + module_param_named(wmarks_high, wmarks.high, ulong, 0600); \ + module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \ + module_param_named(wmarks_low, wmarks.lowulong, 0600); -- GitLab From 6517d2d97709e01c6758dcccc7a51e3731c8706f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:40 +0000 Subject: [PATCH 1021/2223] mm/damon/lru_sort: use watermarks parameters generator macro This commit makes DAMON_LRU_SORT to generate the module parameters for DAMOS watermarks using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-14-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 64 ++++++--------------------------------- mm/damon/modules-common.h | 4 +-- 2 files changed, 12 insertions(+), 56 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index e95626acee6f9..20760b39b50a4 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -90,44 +90,14 @@ module_param(quota_ms, ulong, 0600); static unsigned long quota_reset_interval_ms __read_mostly = 1000; module_param(quota_reset_interval_ms, ulong, 0600); -/* - * The watermarks check time interval in microseconds. - * - * Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is - * enabled but inactive due to its watermarks rule. 5 seconds by default. 
- */ -static unsigned long wmarks_interval __read_mostly = 5000000; -module_param(wmarks_interval, ulong, 0600); - -/* - * Free memory rate (per thousand) for the high watermark. - * - * If free memory of the system in bytes per thousand bytes is higher than - * this, DAMON_LRU_SORT becomes inactive, so it does nothing but periodically - * checks the watermarks. 200 (20%) by default. - */ -static unsigned long wmarks_high __read_mostly = 200; -module_param(wmarks_high, ulong, 0600); - -/* - * Free memory rate (per thousand) for the middle watermark. - * - * If free memory of the system in bytes per thousand bytes is between this and - * the low watermark, DAMON_LRU_SORT becomes active, so starts the monitoring - * and the LRU-lists sorting. 150 (15%) by default. - */ -static unsigned long wmarks_mid __read_mostly = 150; -module_param(wmarks_mid, ulong, 0600); - -/* - * Free memory rate (per thousand) for the low watermark. - * - * If free memory of the system in bytes per thousand bytes is lower than this, - * DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks - * the watermarks. 50 (5%) by default. 
- */ -static unsigned long wmarks_low __read_mostly = 50; -module_param(wmarks_low, ulong, 0600); +struct damos_watermarks damon_lru_sort_wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = 5000000, /* 5 seconds */ + .high = 200, /* 20 percent */ + .mid = 150, /* 15 percent */ + .low = 50, /* 5 percent */ +}; +DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_lru_sort_wmarks); static struct damon_attrs damon_lru_sort_mon_attrs = { .sample_interval = 5000, /* 5 ms */ @@ -242,13 +212,6 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) .min_age_region = 0, .max_age_region = UINT_MAX, }; - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_FREE_MEM_RATE, - .interval = wmarks_interval, - .high = wmarks_high, - .mid = wmarks_mid, - .low = wmarks_low, - }; struct damos_quota quota = { /* * Do not try LRU-lists sorting of hot pages for more than half @@ -270,7 +233,7 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) /* under the quota. */ &quota, /* (De)activate this according to the watermarks. */ - &wmarks); + &damon_lru_sort_wmarks); } /* Create a DAMON-based operation scheme for cold memory regions */ @@ -287,13 +250,6 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) .min_age_region = cold_thres, .max_age_region = UINT_MAX, }; - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_FREE_MEM_RATE, - .interval = wmarks_interval, - .high = wmarks_high, - .mid = wmarks_mid, - .low = wmarks_low, - }; struct damos_quota quota = { /* * Do not try LRU-lists sorting of cold pages for more than @@ -316,7 +272,7 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) /* under the quota. */ &quota, /* (De)activate this according to the watermarks.
*/ - &wmarks); + &damon_lru_sort_wmarks); } static int damon_lru_sort_apply_parameters(void) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 1370590a37d18..4c2ce84869d58 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -18,8 +18,8 @@ 0600); #define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ - module_param_named(wmarks_interval, wmarks->interval, ulong, \ + module_param_named(wmarks_interval, wmarks.interval, ulong, \ 0600); \ module_param_named(wmarks_high, wmarks.high, ulong, 0600); \ module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \ - module_param_named(wmarks_low, wmarks.lowulong, 0600); + module_param_named(wmarks_low, wmarks.low, ulong, 0600); -- GitLab From 34f47ea688bb6d1c6d04f8d72546a623bd8d59de Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:41 +0000 Subject: [PATCH 1022/2223] mm/damon/reclaim: use watermarks parameters generator macro This commit makes DAMON_RECLAIM to generate the module parameters for DAMOS watermarks using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-15-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 56 ++++++++-------------------------------------- 1 file changed, 9 insertions(+), 47 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 48326bef20f51..7f845f617dc56 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -91,45 +91,14 @@ module_param(quota_sz, ulong, 0600); static unsigned long quota_reset_interval_ms __read_mostly = 1000; module_param(quota_reset_interval_ms, ulong, 0600); -/* - * The watermarks check time interval in microseconds. - * - * Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is - * enabled but inactive due to its watermarks rule. 5 seconds by default. 
- */ -static unsigned long wmarks_interval __read_mostly = 5000000; -module_param(wmarks_interval, ulong, 0600); - -/* - * Free memory rate (per thousand) for the high watermark. - * - * If free memory of the system in bytes per thousand bytes is higher than - * this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically - * checks the watermarks. 500 (50%) by default. - */ -static unsigned long wmarks_high __read_mostly = 500; -module_param(wmarks_high, ulong, 0600); - -/* - * Free memory rate (per thousand) for the middle watermark. - * - * If free memory of the system in bytes per thousand bytes is between this and - * the low watermark, DAMON_RECLAIM becomes active, so starts the monitoring - * and the reclaiming. 400 (40%) by default. - */ -static unsigned long wmarks_mid __read_mostly = 400; -module_param(wmarks_mid, ulong, 0600); - -/* - * Free memory rate (per thousand) for the low watermark. - * - * If free memory of the system in bytes per thousand bytes is lower than this, - * DAMON_RECLAIM becomes inactive, so it does nothing but periodically checks - * the watermarks. In the case, the system falls back to the LRU-based page - * granularity reclamation logic. 200 (20%) by default. 
- */ -static unsigned long wmarks_low __read_mostly = 200; -module_param(wmarks_low, ulong, 0600); +struct damos_watermarks damon_reclaim_wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = 5000000, /* 5 seconds */ + .high = 500, /* 50 percent */ + .mid = 400, /* 40 percent */ + .low = 200, /* 20 percent */ +}; +DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_reclaim_wmarks); static struct damon_attrs damon_reclaim_mon_attrs = { .sample_interval = 5000, /* 5 ms */ @@ -214,13 +183,6 @@ static struct damos *damon_reclaim_new_scheme(void) damon_reclaim_mon_attrs.aggr_interval, .max_age_region = UINT_MAX, }; - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_FREE_MEM_RATE, - .interval = wmarks_interval, - .high = wmarks_high, - .mid = wmarks_mid, - .low = wmarks_low, - }; struct damos_quota quota = { /* * Do not try reclamation for more than quota_ms milliseconds @@ -242,7 +204,7 @@ static struct damos *damon_reclaim_new_scheme(void) /* under the quota. */ &quota, /* (De)activate this according to the watermarks. */ - &wmarks); + &damon_reclaim_wmarks); } static int damon_reclaim_apply_parameters(void) -- GitLab From 528ef2d996408d4b9cccf4b23a9976ab5e75cf39 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:42 +0000 Subject: [PATCH 1023/2223] mm/damon/modules-common: implement a stats parameters generator macro DAMON_RECLAIM and DAMON_LRU_SORT have module parameters for DAMOS statistics that have the same names. This commit implements a macro for generating such module parameters so that we can reuse it later. 
Link: https://lkml.kernel.org/r/20220913174449.50645-16-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 4c2ce84869d58..ed973e0770ae9 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -23,3 +23,15 @@ module_param_named(wmarks_high, wmarks.high, ulong, 0600); \ module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \ module_param_named(wmarks_low, wmarks.low, ulong, 0600); + +#define DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(stat, try_name, \ + succ_name, qt_exceed_name) \ + module_param_named(nr_##try_name, stat.nr_tried, ulong, 0400); \ + module_param_named(bytes_##try_name, stat.sz_tried, ulong, \ + 0400); \ + module_param_named(nr_##succ_name, stat.nr_applied, ulong, \ + 0400); \ + module_param_named(bytes_##succ_name, stat.sz_applied, ulong, \ + 0400); \ + module_param_named(qt_exceed_name, stat.qt_exceeds, ulong, \ + 0400); -- GitLab From b71f3ea83242890900bb0668201568df81244547 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:43 +0000 Subject: [PATCH 1024/2223] mm/damon/reclaim: use stat parameters generator This commit makes DAMON_RECLAIM to generate the module parameters for DAMOS statistics using the generator macro to simplify the code and reduce duplicates. 
Link: https://lkml.kernel.org/r/20220913174449.50645-17-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 41 +++++------------------------------------ 1 file changed, 5 insertions(+), 36 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 7f845f617dc56..1ef8353ac15af 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -136,35 +136,9 @@ module_param(monitor_region_end, ulong, 0600); static int kdamond_pid __read_mostly = -1; module_param(kdamond_pid, int, 0400); -/* - * Number of memory regions that tried to be reclaimed. - */ -static unsigned long nr_reclaim_tried_regions __read_mostly; -module_param(nr_reclaim_tried_regions, ulong, 0400); - -/* - * Total bytes of memory regions that tried to be reclaimed. - */ -static unsigned long bytes_reclaim_tried_regions __read_mostly; -module_param(bytes_reclaim_tried_regions, ulong, 0400); - -/* - * Number of memory regions that successfully be reclaimed. - */ -static unsigned long nr_reclaimed_regions __read_mostly; -module_param(nr_reclaimed_regions, ulong, 0400); - -/* - * Total bytes of memory regions that successfully be reclaimed. 
- */ -static unsigned long bytes_reclaimed_regions __read_mostly; -module_param(bytes_reclaimed_regions, ulong, 0400); - -/* - * Number of times that the time/space quota limits have exceeded - */ -static unsigned long nr_quota_exceeds __read_mostly; -module_param(nr_quota_exceeds, ulong, 0400); +static struct damos_stat damon_reclaim_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat, + reclaim_tried_regions, reclaimed_regions, quota_exceeds); static struct damon_ctx *ctx; static struct damon_target *target; @@ -318,13 +292,8 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c) struct damos *s; /* update the stats parameter */ - damon_for_each_scheme(s, c) { - nr_reclaim_tried_regions = s->stat.nr_tried; - bytes_reclaim_tried_regions = s->stat.sz_tried; - nr_reclaimed_regions = s->stat.nr_applied; - bytes_reclaimed_regions = s->stat.sz_applied; - nr_quota_exceeds = s->stat.qt_exceeds; - } + damon_for_each_scheme(s, c) + damon_reclaim_stat = s->stat; return damon_reclaim_handle_commit_inputs(); } -- GitLab From dd172fbf8f1d3befd0a22357a251d8d516354d5f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:44 +0000 Subject: [PATCH 1025/2223] mm/damon/lru_sort: use stat generator This commit makes DAMON_LRU_SORT to generate the module parameters for DAMOS statistics using the generator macro to simplify the code and reduce duplicates. 
Link: https://lkml.kernel.org/r/20220913174449.50645-18-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 83 +++++++-------------------------------------- 1 file changed, 12 insertions(+), 71 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 20760b39b50a4..13a752aed2720 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -135,65 +135,15 @@ module_param(monitor_region_end, ulong, 0600); static int kdamond_pid __read_mostly = -1; module_param(kdamond_pid, int, 0400); -/* - * Number of hot memory regions that tried to be LRU-sorted. - */ -static unsigned long nr_lru_sort_tried_hot_regions __read_mostly; -module_param(nr_lru_sort_tried_hot_regions, ulong, 0400); - -/* - * Total bytes of hot memory regions that tried to be LRU-sorted. - */ -static unsigned long bytes_lru_sort_tried_hot_regions __read_mostly; -module_param(bytes_lru_sort_tried_hot_regions, ulong, 0400); - -/* - * Number of hot memory regions that successfully be LRU-sorted. - */ -static unsigned long nr_lru_sorted_hot_regions __read_mostly; -module_param(nr_lru_sorted_hot_regions, ulong, 0400); - -/* - * Total bytes of hot memory regions that successfully be LRU-sorted. - */ -static unsigned long bytes_lru_sorted_hot_regions __read_mostly; -module_param(bytes_lru_sorted_hot_regions, ulong, 0400); - -/* - * Number of times that the time quota limit for hot regions have exceeded - */ -static unsigned long nr_hot_quota_exceeds __read_mostly; -module_param(nr_hot_quota_exceeds, ulong, 0400); +static struct damos_stat damon_lru_sort_hot_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_hot_stat, + lru_sort_tried_hot_regions, lru_sorted_hot_regions, + hot_quota_exceeds); -/* - * Number of cold memory regions that tried to be LRU-sorted. 
- */ -static unsigned long nr_lru_sort_tried_cold_regions __read_mostly; -module_param(nr_lru_sort_tried_cold_regions, ulong, 0400); - -/* - * Total bytes of cold memory regions that tried to be LRU-sorted. - */ -static unsigned long bytes_lru_sort_tried_cold_regions __read_mostly; -module_param(bytes_lru_sort_tried_cold_regions, ulong, 0400); - -/* - * Number of cold memory regions that successfully be LRU-sorted. - */ -static unsigned long nr_lru_sorted_cold_regions __read_mostly; -module_param(nr_lru_sorted_cold_regions, ulong, 0400); - -/* - * Total bytes of cold memory regions that successfully be LRU-sorted. - */ -static unsigned long bytes_lru_sorted_cold_regions __read_mostly; -module_param(bytes_lru_sorted_cold_regions, ulong, 0400); - -/* - * Number of times that the time quota limit for cold regions have exceeded - */ -static unsigned long nr_cold_quota_exceeds __read_mostly; -module_param(nr_cold_quota_exceeds, ulong, 0400); +static struct damos_stat damon_lru_sort_cold_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat, + lru_sort_tried_cold_regions, lru_sorted_cold_regions, + cold_quota_exceeds); static struct damon_ctx *ctx; static struct damon_target *target; @@ -397,19 +347,10 @@ static int damon_lru_sort_after_aggregation(struct damon_ctx *c) /* update the stats parameter */ damon_for_each_scheme(s, c) { - if (s->action == DAMOS_LRU_PRIO) { - nr_lru_sort_tried_hot_regions = s->stat.nr_tried; - bytes_lru_sort_tried_hot_regions = s->stat.sz_tried; - nr_lru_sorted_hot_regions = s->stat.nr_applied; - bytes_lru_sorted_hot_regions = s->stat.sz_applied; - nr_hot_quota_exceeds = s->stat.qt_exceeds; - } else if (s->action == DAMOS_LRU_DEPRIO) { - nr_lru_sort_tried_cold_regions = s->stat.nr_tried; - bytes_lru_sort_tried_cold_regions = s->stat.sz_tried; - nr_lru_sorted_cold_regions = s->stat.nr_applied; - bytes_lru_sorted_cold_regions = s->stat.sz_applied; - nr_cold_quota_exceeds = s->stat.qt_exceeds; - } + if (s->action == 
DAMOS_LRU_PRIO) + damon_lru_sort_hot_stat = s->stat; + else if (s->action == DAMOS_LRU_DEPRIO) + damon_lru_sort_cold_stat = s->stat; } return damon_lru_sort_handle_commit_inputs(); -- GitLab From 63e0f90bac0c772c14aecfe36783ab60795d05db Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:45 +0000 Subject: [PATCH 1026/2223] mm/damon/modules-common: implement a damos quota params generator DAMON_RECLAIM and DAMON_LRU_SORT have module parameters for DAMOS quotas that having same names. This commit implements a macro for generating such module parameters so that we can reuse later. Link: https://lkml.kernel.org/r/20220913174449.50645-19-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index ed973e0770ae9..3e99810b46899 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -17,6 +17,12 @@ module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ 0600); +#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \ + module_param_named(quota_ms, quota.ms, ulong, 0600); \ + module_param_named(quota_sz, quota.sz, ulong, 0600); \ + module_param_named(quota_reset_interval_ms, \ + quota.reset_interval, ulong, 0600); + #define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ module_param_named(wmarks_interval, wmarks.interval, ulong, \ 0600); \ @@ -33,5 +39,5 @@ 0400); \ module_param_named(bytes_##succ_name, stat.sz_applied, ulong, \ 0400); \ - module_param_named(qt_exceed_name, stat.qt_exceeds, ulong, \ + module_param_named(nr_##qt_exceed_name, stat.qt_exceeds, ulong, \ 0400); -- GitLab From 1f55402685d10aa336cf9b25e83b416e4fc0c153 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:46 +0000 Subject: [PATCH 1027/2223] mm/damon/modules-common: implement damos time quota params generator DAMON_LRU_SORT have module parameters for DAMOS time quota 
only but size quota. This commit implements a macro for generating the module parameters so that we can reuse later. Link: https://lkml.kernel.org/r/20220913174449.50645-20-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 3e99810b46899..5a4921851d326 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -17,12 +17,15 @@ module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ 0600); -#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \ +#define DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \ module_param_named(quota_ms, quota.ms, ulong, 0600); \ - module_param_named(quota_sz, quota.sz, ulong, 0600); \ module_param_named(quota_reset_interval_ms, \ quota.reset_interval, ulong, 0600); +#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \ + DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \ + module_param_named(quota_sz, quota.sz, ulong, 0600); + #define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ module_param_named(wmarks_interval, wmarks.interval, ulong, \ 0600); \ -- GitLab From a9d57c7369532cdcd3a834c3f0cc5ad6b2f0f1ff Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:47 +0000 Subject: [PATCH 1028/2223] mm/damon/reclaim: use the quota params generator macro This commit makes DAMON_RECLAIM to generate the module parameters for DAMOS quotas using the generator macro to simplify the code and reduce duplicates. 
Link: https://lkml.kernel.org/r/20220913174449.50645-21-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 64 +++++++++------------------------------------- 1 file changed, 12 insertions(+), 52 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 1ef8353ac15af..1acf808e16242 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -52,44 +52,17 @@ module_param(commit_inputs, bool, 0600); static unsigned long min_age __read_mostly = 120000000; module_param(min_age, ulong, 0600); -/* - * Limit of time for trying the reclamation in milliseconds. - * - * DAMON_RECLAIM tries to use only up to this time within a time window - * (quota_reset_interval_ms) for trying reclamation of cold pages. This can be - * used for limiting CPU consumption of DAMON_RECLAIM. If the value is zero, - * the limit is disabled. - * - * 10 ms by default. - */ -static unsigned long quota_ms __read_mostly = 10; -module_param(quota_ms, ulong, 0600); - -/* - * Limit of size of memory for the reclamation in bytes. - * - * DAMON_RECLAIM charges amount of memory which it tried to reclaim within a - * time window (quota_reset_interval_ms) and makes no more than this limit is - * tried. This can be used for limiting consumption of CPU and IO. If this - * value is zero, the limit is disabled. - * - * 128 MiB by default. - */ -static unsigned long quota_sz __read_mostly = 128 * 1024 * 1024; -module_param(quota_sz, ulong, 0600); - -/* - * The time/size quota charge reset interval in milliseconds. - * - * The charge reset interval for the quota of time (quota_ms) and size - * (quota_sz). That is, DAMON_RECLAIM does not try reclamation for more than - * quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms - * milliseconds. - * - * 1 second by default. 
- */ -static unsigned long quota_reset_interval_ms __read_mostly = 1000; -module_param(quota_reset_interval_ms, ulong, 0600); +static struct damos_quota damon_reclaim_quota = { + /* use up to 10 ms time, reclaim up to 128 MiB per 1 sec by default */ + .ms = 10, + .sz = 128 * 1024 * 1024, + .reset_interval = 1000, + /* Within the quota, page out older regions first. */ + .weight_sz = 0, + .weight_nr_accesses = 0, + .weight_age = 1 +}; +DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota); struct damos_watermarks damon_reclaim_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, @@ -157,26 +130,13 @@ static struct damos *damon_reclaim_new_scheme(void) damon_reclaim_mon_attrs.aggr_interval, .max_age_region = UINT_MAX, }; - struct damos_quota quota = { - /* - * Do not try reclamation for more than quota_ms milliseconds - * or quota_sz bytes within quota_reset_interval_ms. - */ - .ms = quota_ms, - .sz = quota_sz, - .reset_interval = quota_reset_interval_ms, - /* Within the quota, page out older regions first. */ - .weight_sz = 0, - .weight_nr_accesses = 0, - .weight_age = 1 - }; return damon_new_scheme( &pattern, /* page out those, as soon as found */ DAMOS_PAGEOUT, /* under the quota. */ - &quota, + &damon_reclaim_quota, /* (De)activate this according to the watermarks. */ &damon_reclaim_wmarks); } -- GitLab From 45b8212fc555d07ed78b9270283d61afbdee1df6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:48 +0000 Subject: [PATCH 1029/2223] mm/damon/lru_sort: use quotas param generator This commit makes DAMON_LRU_SORT to generate the module parameters for DAMOS quotas using the generator macro to simplify the code and reduce duplicates. 
Link: https://lkml.kernel.org/r/20220913174449.50645-22-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 70 ++++++++++++--------------------------------- 1 file changed, 19 insertions(+), 51 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 13a752aed2720..8d9c3d1fd6bef 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -65,30 +65,17 @@ module_param(hot_thres_access_freq, ulong, 0600); static unsigned long cold_min_age __read_mostly = 120000000; module_param(cold_min_age, ulong, 0600); -/* - * Limit of time for trying the LRU lists sorting in milliseconds. - * - * DAMON_LRU_SORT tries to use only up to this time within a time window - * (quota_reset_interval_ms) for trying LRU lists sorting. This can be used - * for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the - * limit is disabled. - * - * 10 ms by default. - */ -static unsigned long quota_ms __read_mostly = 10; -module_param(quota_ms, ulong, 0600); - -/* - * The time quota charge reset interval in milliseconds. - * - * The charge reset interval for the quota of time (quota_ms). That is, - * DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms - * milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds. - * - * 1 second by default. - */ -static unsigned long quota_reset_interval_ms __read_mostly = 1000; -module_param(quota_reset_interval_ms, ulong, 0600); +static struct damos_quota damon_lru_sort_quota = { + /* Use up to 10 ms per 1 sec, by default */ + .ms = 10, + .sz = 0, + .reset_interval = 1000, + /* Within the quota, mark hotter regions accessed first. 
*/ + .weight_sz = 0, + .weight_nr_accesses = 1, + .weight_age = 0, +}; +DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota); struct damos_watermarks damon_lru_sort_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, @@ -162,19 +149,10 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) .min_age_region = 0, .max_age_region = UINT_MAX, }; - struct damos_quota quota = { - /* - * Do not try LRU-lists sorting of hot pages for more than half - * of quota_ms milliseconds within quota_reset_interval_ms. - */ - .ms = quota_ms / 2, - .sz = 0, - .reset_interval = quota_reset_interval_ms, - /* Within the quota, mark hotter regions accessed first. */ - .weight_sz = 0, - .weight_nr_accesses = 1, - .weight_age = 0, - }; + struct damos_quota quota = damon_lru_sort_quota; + + /* Use half of total quota for hot pages sorting */ + quota.ms = quota.ms / 2; return damon_new_scheme( &pattern, @@ -200,20 +178,10 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) .min_age_region = cold_thres, .max_age_region = UINT_MAX, }; - struct damos_quota quota = { - /* - * Do not try LRU-lists sorting of cold pages for more than - * half of quota_ms milliseconds within - * quota_reset_interval_ms. - */ - .ms = quota_ms / 2, - .sz = 0, - .reset_interval = quota_reset_interval_ms, - /* Within the quota, mark colder regions not accessed first. */ - .weight_sz = 0, - .weight_nr_accesses = 0, - .weight_age = 1, - }; + struct damos_quota quota = damon_lru_sort_quota; + + /* Use half of total quota for cold pages sorting */ + quota.ms = quota.ms / 2; return damon_new_scheme( &pattern, -- GitLab From a62518ab1da4eb8bf0335c0e254b3e82e9ce222e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:49 +0000 Subject: [PATCH 1030/2223] mm/damon/lru_sort: deduplicate hot/cold schemes generators damon_lru_sort_new_{hot,cold}_scheme() have quite a lot of duplicates. 
This commit factors out the duplicate to a separate function and uses it for reducing the duplicate. Link: https://lkml.kernel.org/r/20220913174449.50645-23-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 8d9c3d1fd6bef..07a0908963fd0 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -135,6 +135,25 @@ DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat, static struct damon_ctx *ctx; static struct damon_target *target; +static struct damos *damon_lru_sort_new_scheme( + struct damos_access_pattern *pattern, enum damos_action action) +{ + struct damos_quota quota = damon_lru_sort_quota; + + /* Use half of total quota for hot/cold pages sorting */ + quota.ms = quota.ms / 2; + + return damon_new_scheme( + /* find the pattern, and */ + pattern, + /* (de)prioritize on LRU-lists */ + action, + /* under the quota. */ + &quota, + /* (De)activate this according to the watermarks. */ + &damon_lru_sort_wmarks); +} + /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { @@ -149,19 +168,8 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) .min_age_region = 0, .max_age_region = UINT_MAX, }; - struct damos_quota quota = damon_lru_sort_quota; - - /* Use half of total quota for hot pages sorting */ - quota.ms = quota.ms / 2; - return damon_new_scheme( - &pattern, - /* prioritize those on LRU lists, as soon as found */ - DAMOS_LRU_PRIO, - /* under the quota. */ - &quota, - /* (De)activate this according to the watermarks. 
*/ - &damon_lru_sort_wmarks); + return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_PRIO); } /* Create a DAMON-based operation scheme for cold memory regions */ @@ -178,19 +186,8 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) .min_age_region = cold_thres, .max_age_region = UINT_MAX, }; - struct damos_quota quota = damon_lru_sort_quota; - /* Use half of total quota for cold pages sorting */ - quota.ms = quota.ms / 2; - - return damon_new_scheme( - &pattern, - /* mark those as not accessed, as soon as found */ - DAMOS_LRU_DEPRIO, - /* under the quota. */ - &quota, - /* (De)activate this according to the watermarks. */ - &damon_lru_sort_wmarks); + return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); } static int damon_lru_sort_apply_parameters(void) -- GitLab From 8ef4d5caa66d62b3b87a14d01562fb487651df2e Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 13 Sep 2022 17:11:24 +0800 Subject: [PATCH 1031/2223] mm/damon: simplify the parameter passing for 'prepare_access_checks' Patch series "mm/damon: code simplifications and cleanups". This patchset contains some code simplifications and cleanups for DAMON. This patch (of 4): The parameter 'struct damon_ctx *ctx' isn't used in the functions __damon_{p,v}a_prepare_access_check(), so we can remove it and simplify the parameter passing. 
Link: https://lkml.kernel.org/r/1663060287-30201-1-git-send-email-kaixuxia@tencent.com Link: https://lkml.kernel.org/r/1663060287-30201-2-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 5 ++--- mm/damon/vaddr.c | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 1ada62db68b13..dfeebffe82f44 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -63,8 +63,7 @@ out: folio_put(folio); } -static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, - struct damon_region *r) +static void __damon_pa_prepare_access_check(struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -78,7 +77,7 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) - __damon_pa_prepare_access_check(ctx, r); + __damon_pa_prepare_access_check(r); } } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 0eae47bd9ccbb..3f84584f99826 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -397,8 +397,8 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) * Functions for the access checking of the regions */ -static void __damon_va_prepare_access_check(struct damon_ctx *ctx, - struct mm_struct *mm, struct damon_region *r) +static void __damon_va_prepare_access_check(struct mm_struct *mm, + struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -416,7 +416,7 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) - __damon_va_prepare_access_check(ctx, mm, r); + __damon_va_prepare_access_check(mm, r); mmput(mm); } } -- GitLab From f1c71c2825218dc8b35c04ab439fdf3d32778c7c Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 13 Sep 2022 17:11:25 +0800 Subject: [PATCH 1032/2223] mm/damon/sysfs: simplify the variable 'pid' assignment 
operation We can initialize the variable 'pid' with '-1' in pid_show() to simplify the variable assignment operation and make the code more readable. Link: https://lkml.kernel.org/r/1663060287-30201-3-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 3dbf3804ec88b..1fa0023f136eb 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2592,19 +2592,16 @@ static ssize_t pid_show(struct kobject *kobj, struct damon_sysfs_kdamond *kdamond = container_of(kobj, struct damon_sysfs_kdamond, kobj); struct damon_ctx *ctx; - int pid; + int pid = -1; if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; ctx = kdamond->damon_ctx; - if (!ctx) { - pid = -1; + if (!ctx) goto out; - } + mutex_lock(&ctx->kdamond_lock); - if (!ctx->kdamond) - pid = -1; - else + if (ctx->kdamond) pid = ctx->kdamond->pid; mutex_unlock(&ctx->kdamond_lock); out: -- GitLab From 29454cf6ab3c49bc5d3f443e1d1417feca3d0ce5 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 13 Sep 2022 17:11:26 +0800 Subject: [PATCH 1033/2223] mm/damon/core: simplify the kdamond stop mechanism by removing 'done' When the 'kdamond_wait_activation()' function or 'after_sampling()' or 'after_aggregation()' DAMON callbacks return an error, it is unnecessary to use bool 'done' to check if kdamond should be finished. This commit simplifies the kdamond stop mechanism by removing 'done' and break the while loop directly in the cases. 
Link: https://lkml.kernel.org/r/1663060287-30201-4-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 29635a82cb691..a843673c11cfc 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1152,30 +1152,25 @@ static int kdamond_fn(void *data) struct damon_region *r, *next; unsigned int max_nr_accesses = 0; unsigned long sz_limit = 0; - bool done = false; pr_debug("kdamond (%d) starts\n", current->pid); if (ctx->ops.init) ctx->ops.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) - done = true; + goto done; sz_limit = damon_region_sz_limit(ctx); - while (!kdamond_need_stop(ctx) && !done) { - if (kdamond_wait_activation(ctx)) { - done = true; - continue; - } + while (!kdamond_need_stop(ctx)) { + if (kdamond_wait_activation(ctx)) + break; if (ctx->ops.prepare_access_checks) ctx->ops.prepare_access_checks(ctx); if (ctx->callback.after_sampling && - ctx->callback.after_sampling(ctx)) { - done = true; - continue; - } + ctx->callback.after_sampling(ctx)) + break; kdamond_usleep(ctx->attrs.sample_interval); @@ -1187,10 +1182,8 @@ static int kdamond_fn(void *data) max_nr_accesses / 10, sz_limit); if (ctx->callback.after_aggregation && - ctx->callback.after_aggregation(ctx)) { - done = true; - continue; - } + ctx->callback.after_aggregation(ctx)) + break; kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); @@ -1204,6 +1197,7 @@ static int kdamond_fn(void *data) sz_limit = damon_region_sz_limit(ctx); } } +done: damon_for_each_target(t, ctx) { damon_for_each_region_safe(r, next, t) damon_destroy_region(r, t); -- GitLab From 4988fe69527c6e02066aeb454c2db4d6d51d317b Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Tue, 13 Sep 2022 15:13:58 +0800 Subject: [PATCH 1034/2223] mm/memcontrol: use kstrtobool for 
swapaccount param parsing Use kstrtobool which is more powerful to handle all kinds of parameters like 'Yy1Nn0' or [oO][NnFf] for "on" and "off". Link: https://lkml.kernel.org/r/20220913071358.1812206-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1f204a2620543..ac6440daf2086 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7507,10 +7507,10 @@ bool mem_cgroup_swap_full(struct folio *folio) static int __init setup_swap_account(char *s) { - if (!strcmp(s, "1")) - cgroup_memory_noswap = false; - else if (!strcmp(s, "0")) - cgroup_memory_noswap = true; + bool res; + + if (!kstrtobool(s, &res)) + cgroup_memory_noswap = !res; return 1; } __setup("swapaccount=", setup_swap_account); -- GitLab From a8368cd8e22531b3b248a2c869d71b668aeeb789 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 13 Sep 2022 15:20:48 -0700 Subject: [PATCH 1035/2223] mm/page_alloc.c: rename check_free_page() to free_page_is_bad() The name "check_free_page()" provides no information regarding its return value when the page is indeed found to be bad. Renaming it to "free_page_is_bad()" makes it clear that a `true' return value means the page was bad. And make it return a bool, not an int. 
[akpm@linux-foundation.org: don't use bool as int] Cc: Catalin Marinas Cc: ke.wang Cc: Matthew Wilcox Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/page_alloc.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0002ded4ab0e4..c48357c124ebc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1285,20 +1285,20 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) return bad_reason; } -static void check_free_page_bad(struct page *page) +static void free_page_is_bad_report(struct page *page) { bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); } -static inline int check_free_page(struct page *page) +static inline bool free_page_is_bad(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) - return 0; + return false; /* Something has gone sideways, find it */ - check_free_page_bad(page); - return 1; + free_page_is_bad_report(page); + return true; } static int free_tail_pages_check(struct page *head_page, struct page *page) @@ -1430,7 +1430,7 @@ static __always_inline bool free_pages_prepare(struct page *page, for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); - if (unlikely(check_free_page(page + i))) { + if (unlikely(free_page_is_bad(page + i))) { bad++; continue; } @@ -1441,8 +1441,8 @@ static __always_inline bool free_pages_prepare(struct page *page, page->mapping = NULL; if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); - if (check_free) - bad += check_free_page(page); + if (check_free && free_page_is_bad(page)) + bad++; if (bad) return false; @@ -1504,7 +1504,7 @@ static bool free_pcp_prepare(struct page *page, unsigned int order) static bool bulkfree_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) - return check_free_page(page); + return free_page_is_bad(page); else return false; } @@ -1525,7 +1525,7 
@@ static bool free_pcp_prepare(struct page *page, unsigned int order) static bool bulkfree_pcp_prepare(struct page *page) { - return check_free_page(page); + return free_page_is_bad(page); } #endif /* CONFIG_DEBUG_VM */ -- GitLab From d452289fcd68f13f4067f0ddd78a5d948cb7d9ea Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 13 Sep 2022 15:30:38 -0700 Subject: [PATCH 1036/2223] mm/page_alloc.c: document bulkfree_pcp_prepare() return value Cc: Catalin Marinas Cc: ke.wang Cc: Matthew Wilcox Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c48357c124ebc..4e8ea824e7653 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1501,6 +1501,7 @@ static bool free_pcp_prepare(struct page *page, unsigned int order) return free_pages_prepare(page, order, true, FPI_NONE); } +/* return true if this page has an inappropriate state */ static bool bulkfree_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) -- GitLab From aaa31e058dd82453c89302c9331945894ff555a6 Mon Sep 17 00:00:00 2001 From: ze zuo Date: Tue, 13 Sep 2022 01:55:05 +0000 Subject: [PATCH 1037/2223] mm/mempolicy: use PAGE_ALIGN instead of open-coding it Replace the simple calculation with PAGE_ALIGN. 
Link: https://lkml.kernel.org/r/20220913015505.1998958-1-zuoze1@huawei.com Signed-off-by: ze zuo Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/mempolicy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 143e2eaaa6ec5..a937eaec5b68d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1270,7 +1270,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; - len = (len + PAGE_SIZE - 1) & PAGE_MASK; + len = PAGE_ALIGN(len); end = start + len; if (end < start) @@ -1507,7 +1507,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le if (home_node >= MAX_NUMNODES || !node_online(home_node)) return -EINVAL; - len = (len + PAGE_SIZE - 1) & PAGE_MASK; + len = PAGE_ALIGN(len); end = start + len; if (end < start) -- GitLab From b958d4d08fbfe938af24ea06ebbf839b48fa18a9 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 14 Sep 2022 15:26:02 +0800 Subject: [PATCH 1038/2223] mm: hugetlb: simplify per-node sysfs creation and removal Patch series "simplify handling of per-node sysfs creation and removal", v4. This patch (of 2): The following commit offloaded per-node sysfs creation and removal to a kworker and did not say why it is needed. And it also said "I don't know that this is absolutely required". It seems like the author was not sure as well. Since it only complicates the code, this patch will revert the changes to simplify the code. 39da08cb074c ("hugetlb: offload per node attribute registrations") We could use memory hotplug notifier to do per-node sysfs creation and removal instead of inserting those operations to node registration and unregistration. Then, it can reduce the code coupling between node.c and hugetlb.c. Also, it can simplify the code.
Link: https://lkml.kernel.org/r/20220914072603.60293-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220914072603.60293-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Mike Kravetz Acked-by: David Hildenbrand Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Muchun Song Cc: Oscar Salvador Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- drivers/base/node.c | 139 +------------------------------------------ include/linux/node.h | 24 ++------ mm/hugetlb.c | 35 +++++++---- 3 files changed, 30 insertions(+), 168 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index eb0f43784c2b3..ed391cb09999b 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -587,64 +587,9 @@ static const struct attribute_group *node_dev_groups[] = { NULL }; -#ifdef CONFIG_HUGETLBFS -/* - * hugetlbfs per node attributes registration interface: - * When/if hugetlb[fs] subsystem initializes [sometime after this module], - * it will register its per node attributes for all online nodes with - * memory. It will also call register_hugetlbfs_with_node(), below, to - * register its attribute registration functions with this node driver. - * Once these hooks have been initialized, the node driver will call into - * the hugetlb module to [un]register attributes for hot-plugged nodes. 
- */ -static node_registration_func_t __hugetlb_register_node; -static node_registration_func_t __hugetlb_unregister_node; - -static inline bool hugetlb_register_node(struct node *node) -{ - if (__hugetlb_register_node && - node_state(node->dev.id, N_MEMORY)) { - __hugetlb_register_node(node); - return true; - } - return false; -} - -static inline void hugetlb_unregister_node(struct node *node) -{ - if (__hugetlb_unregister_node) - __hugetlb_unregister_node(node); -} - -void register_hugetlbfs_with_node(node_registration_func_t doregister, - node_registration_func_t unregister) -{ - __hugetlb_register_node = doregister; - __hugetlb_unregister_node = unregister; -} -#else -static inline void hugetlb_register_node(struct node *node) {} - -static inline void hugetlb_unregister_node(struct node *node) {} -#endif - static void node_device_release(struct device *dev) { - struct node *node = to_node(dev); - -#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) - /* - * We schedule the work only when a memory section is - * onlined/offlined on this node. When we come here, - * all the memory on this node has been offlined, - * so we won't enqueue new work to this work. - * - * The work is using node->node_work, so we should - * flush work before freeing the memory. 
- */ - flush_work(&node->node_work); -#endif - kfree(node); + kfree(to_node(dev)); } /* @@ -665,11 +610,9 @@ static int register_node(struct node *node, int num) if (error) put_device(&node->dev); - else { - hugetlb_register_node(node); - + else compaction_register_node(node); - } + return error; } @@ -683,7 +626,6 @@ static int register_node(struct node *node, int num) void unregister_node(struct node *node) { compaction_unregister_node(node); - hugetlb_unregister_node(node); /* no-op, if memoryless node */ node_remove_accesses(node); node_remove_caches(node); device_unregister(&node->dev); @@ -905,74 +847,8 @@ void register_memory_blocks_under_node(int nid, unsigned long start_pfn, (void *)&nid, func); return; } - -#ifdef CONFIG_HUGETLBFS -/* - * Handle per node hstate attribute [un]registration on transistions - * to/from memoryless state. - */ -static void node_hugetlb_work(struct work_struct *work) -{ - struct node *node = container_of(work, struct node, node_work); - - /* - * We only get here when a node transitions to/from memoryless state. - * We can detect which transition occurred by examining whether the - * node has memory now. hugetlb_register_node() already check this - * so we try to register the attributes. If that fails, then the - * node has transitioned to memoryless, try to unregister the - * attributes. - */ - if (!hugetlb_register_node(node)) - hugetlb_unregister_node(node); -} - -static void init_node_hugetlb_work(int nid) -{ - INIT_WORK(&node_devices[nid]->node_work, node_hugetlb_work); -} - -static int node_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct memory_notify *mnb = arg; - int nid = mnb->status_change_nid; - - switch (action) { - case MEM_ONLINE: - case MEM_OFFLINE: - /* - * offload per node hstate [un]registration to a work thread - * when transitioning to/from memoryless state. 
- */ - if (nid != NUMA_NO_NODE) - schedule_work(&node_devices[nid]->node_work); - break; - - case MEM_GOING_ONLINE: - case MEM_GOING_OFFLINE: - case MEM_CANCEL_ONLINE: - case MEM_CANCEL_OFFLINE: - default: - break; - } - - return NOTIFY_OK; -} -#endif /* CONFIG_HUGETLBFS */ #endif /* CONFIG_MEMORY_HOTPLUG */ -#if !defined(CONFIG_MEMORY_HOTPLUG) || !defined(CONFIG_HUGETLBFS) -static inline int node_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - return NOTIFY_OK; -} - -static void init_node_hugetlb_work(int nid) { } - -#endif - int __register_one_node(int nid) { int error; @@ -991,8 +867,6 @@ int __register_one_node(int nid) } INIT_LIST_HEAD(&node_devices[nid]->access_list); - /* initialize work queue for memory hot plug */ - init_node_hugetlb_work(nid); node_init_caches(nid); return error; @@ -1063,13 +937,8 @@ static const struct attribute_group *cpu_root_attr_groups[] = { NULL, }; -#define NODE_CALLBACK_PRI 2 /* lower than SLAB */ void __init node_dev_init(void) { - static struct notifier_block node_memory_callback_nb = { - .notifier_call = node_memory_callback, - .priority = NODE_CALLBACK_PRI, - }; int ret, i; BUILD_BUG_ON(ARRAY_SIZE(node_state_attr) != NR_NODE_STATES); @@ -1079,8 +948,6 @@ void __init node_dev_init(void) if (ret) panic("%s() failed to register subsystem: %d\n", __func__, ret); - register_hotmemory_notifier(&node_memory_callback_nb); - /* * Create all node devices, which will properly link the node * to applicable memory block devices and already created cpu devices. diff --git a/include/linux/node.h b/include/linux/node.h index 9ec680dd607f7..427a5975cf405 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -2,15 +2,15 @@ /* * include/linux/node.h - generic node definition * - * This is mainly for topological representation. We define the - * basic 'struct node' here, which can be embedded in per-arch + * This is mainly for topological representation. 
We define the + * basic 'struct node' here, which can be embedded in per-arch * definitions of processors. * * Basic handling of the devices is done in drivers/base/node.c - * and system devices are handled in drivers/base/sys.c. + * and system devices are handled in drivers/base/sys.c. * * Nodes are exported via driverfs in the class/node/devices/ - * directory. + * directory. */ #ifndef _LINUX_NODE_H_ #define _LINUX_NODE_H_ @@ -18,7 +18,6 @@ #include #include #include -#include /** * struct node_hmem_attrs - heterogeneous memory performance attributes @@ -84,10 +83,6 @@ static inline void node_set_perf_attrs(unsigned int nid, struct node { struct device dev; struct list_head access_list; - -#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) - struct work_struct node_work; -#endif #ifdef CONFIG_HMEM_REPORTING struct list_head cache_attrs; struct device *cache_dev; @@ -96,7 +91,6 @@ struct node { struct memory_block; extern struct node *node_devices[]; -typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA) void register_memory_blocks_under_node(int nid, unsigned long start_pfn, @@ -144,11 +138,6 @@ extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); extern int register_memory_node_under_compute_node(unsigned int mem_nid, unsigned int cpu_nid, unsigned access); - -#ifdef CONFIG_HUGETLBFS -extern void register_hugetlbfs_with_node(node_registration_func_t doregister, - node_registration_func_t unregister); -#endif #else static inline void node_dev_init(void) { @@ -176,11 +165,6 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk) { } - -static inline void register_hugetlbfs_with_node(node_registration_func_t reg, - node_registration_func_t unreg) -{ -} #endif #define to_node(device) container_of(device, struct node, dev) diff --git a/mm/hugetlb.c 
b/mm/hugetlb.c index 6af123374e980..397f2988c37f5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -4000,6 +4001,23 @@ static void hugetlb_register_node(struct node *node) } } +static int __meminit hugetlb_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mnb = arg; + int nid = mnb->status_change_nid; + + if (nid == NUMA_NO_NODE) + return NOTIFY_DONE; + + if (action == MEM_GOING_ONLINE) + hugetlb_register_node(node_devices[nid]); + else if (action == MEM_CANCEL_ONLINE || action == MEM_OFFLINE) + hugetlb_unregister_node(node_devices[nid]); + + return NOTIFY_OK; +} + /* * hugetlb init time: register hstate attributes for all registered node * devices of nodes that have memory. All on-line nodes should have @@ -4009,18 +4027,11 @@ static void __init hugetlb_register_all_nodes(void) { int nid; - for_each_node_state(nid, N_MEMORY) { - struct node *node = node_devices[nid]; - if (node->dev.id == nid) - hugetlb_register_node(node); - } - - /* - * Let the node device driver know we're here so it can - * [un]register hstate attributes on node hotplug. - */ - register_hugetlbfs_with_node(hugetlb_register_node, - hugetlb_unregister_node); + get_online_mems(); + hotplug_memory_notifier(hugetlb_memory_callback, 0); + for_each_node_state(nid, N_MEMORY) + hugetlb_register_node(node_devices[nid]); + put_online_mems(); } #else /* !CONFIG_NUMA */ -- GitLab From a4a00b451ef5e1deb959088e25e248f4ee399792 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 14 Sep 2022 15:26:03 +0800 Subject: [PATCH 1039/2223] mm: hugetlb: eliminate memory-less nodes handling The memory-notify-based approach aims to handle memory-less nodes, however, it just adds the complexity of code as pointed out by David in thread [1]. The handling of memory-less nodes is introduced by commit 4faf8d950ec4 ("hugetlb: handle memory hot-plug events").
>From its commit message, we cannot find any necessity of handling this case. So, we can simply register/unregister sysfs entries in register_node/unregister_node to simplify the code. BTW, the hotplug callback was added because in hugetlb_register_all_nodes() we register sysfs nodes only for N_MEMORY nodes, seeing commit 9b5e5d0fdc91, which said it was a preparation for handling memory-less nodes via memory hotplug. Since we want to remove memory hotplug, so make sure we only register per-node sysfs for online (N_ONLINE) nodes in hugetlb_register_all_nodes(). https://lore.kernel.org/linux-mm/60933ffc-b850-976c-78a0-0ee6e0ea9ef0@redhat.com/ [1] Link: https://lkml.kernel.org/r/20220914072603.60293-3-songmuchun@bytedance.com Suggested-by: David Hildenbrand Signed-off-by: Muchun Song Acked-by: David Hildenbrand Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Mike Kravetz Cc: Oscar Salvador Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- drivers/base/node.c | 8 +++-- include/linux/hugetlb.h | 14 +++++++++ mm/hugetlb.c | 70 +++++++++++++++++------------------------ 3 files changed, 49 insertions(+), 43 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index ed391cb09999b..80b1e91b96081 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -20,6 +20,7 @@ #include #include #include +#include static struct bus_type node_subsys = { .name = "node", @@ -608,10 +609,12 @@ static int register_node(struct node *node, int num) node->dev.groups = node_dev_groups; error = device_register(&node->dev); - if (error) + if (error) { put_device(&node->dev); - else + } else { + hugetlb_register_node(node); compaction_register_node(node); + } return error; } @@ -625,6 +628,7 @@ static int register_node(struct node *node, int num) */ void unregister_node(struct node *node) { + hugetlb_unregister_node(node); compaction_unregister_node(node); node_remove_accesses(node); node_remove_caches(node); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index
57e72954a482e..6d7f397540602 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -16,6 +16,7 @@ struct ctl_table; struct user_struct; struct mmu_gather; +struct node; #ifndef is_hugepd typedef struct { unsigned long pd; } hugepd_t; @@ -935,6 +936,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, } #endif +#ifdef CONFIG_NUMA +void hugetlb_register_node(struct node *node); +void hugetlb_unregister_node(struct node *node); +#endif + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; @@ -1109,6 +1115,14 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { } + +static inline void hugetlb_register_node(struct node *node) +{ +} + +static inline void hugetlb_unregister_node(struct node *node) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 397f2988c37f5..0b1ab5af939e6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3871,24 +3871,8 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, return 0; } -static void __init hugetlb_sysfs_init(void) -{ - struct hstate *h; - int err; - - hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); - if (!hugepages_kobj) - return; - - for_each_hstate(h) { - err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, - hstate_kobjs, &hstate_attr_group); - if (err) - pr_err("HugeTLB: Unable to add hstate %s", h->name); - } -} - #ifdef CONFIG_NUMA +static bool hugetlb_sysfs_initialized __ro_after_init; /* * node_hstate/s - associate per node hstate attributes, via their kobjects, @@ -3944,7 +3928,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) * Unregister hstate attributes from a single node device. * No-op if no hstate attributes attached. 
*/ -static void hugetlb_unregister_node(struct node *node) +void hugetlb_unregister_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; @@ -3974,12 +3958,15 @@ static void hugetlb_unregister_node(struct node *node) * Register hstate attributes for a single node device. * No-op if attributes already registered. */ -static void hugetlb_register_node(struct node *node) +void hugetlb_register_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; int err; + if (!hugetlb_sysfs_initialized) + return; + if (nhs->hugepages_kobj) return; /* already allocated */ @@ -4001,23 +3988,6 @@ static void hugetlb_register_node(struct node *node) } } -static int __meminit hugetlb_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct memory_notify *mnb = arg; - int nid = mnb->status_change_nid; - - if (nid == NUMA_NO_NODE) - return NOTIFY_DONE; - - if (action == MEM_GOING_ONLINE) - hugetlb_register_node(node_devices[nid]); - else if (action == MEM_CANCEL_ONLINE || action == MEM_OFFLINE) - hugetlb_unregister_node(node_devices[nid]); - - return NOTIFY_OK; -} - /* * hugetlb init time: register hstate attributes for all registered node * devices of nodes that have memory. 
All on-line nodes should have @@ -4027,11 +3997,8 @@ static void __init hugetlb_register_all_nodes(void) { int nid; - get_online_mems(); - hotplug_memory_notifier(hugetlb_memory_callback, 0); - for_each_node_state(nid, N_MEMORY) + for_each_online_node(nid) hugetlb_register_node(node_devices[nid]); - put_online_mems(); } #else /* !CONFIG_NUMA */ @@ -4055,6 +4022,28 @@ static inline __init void hugetlb_cma_check(void) } #endif +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, + hstate_kobjs, &hstate_attr_group); + if (err) + pr_err("HugeTLB: Unable to add hstate %s", h->name); + } + +#ifdef CONFIG_NUMA + hugetlb_sysfs_initialized = true; +#endif + hugetlb_register_all_nodes(); +} + static int __init hugetlb_init(void) { int i; @@ -4109,7 +4098,6 @@ static int __init hugetlb_init(void) report_hugepages(); hugetlb_sysfs_init(); - hugetlb_register_all_nodes(); hugetlb_cgroup_file_init(); #ifdef CONFIG_SMP -- GitLab From c195c3215741746b1eb7ab7980b926ddc37a4be3 Mon Sep 17 00:00:00 2001 From: Ke Sun Date: Wed, 14 Sep 2022 10:17:38 +0800 Subject: [PATCH 1040/2223] mm/filemap: make folio_put_wait_locked static It's only used in mm/filemap.c, since commit ("mm/migrate.c: rework migration_entry_wait() to not take a pageref"). Make it static. 
Link: https://lkml.kernel.org/r/20220914021738.3228011-1-sunke@kylinos.cn Signed-off-by: Ke Sun Reported-by: k2ci Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 - mm/filemap.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 32846b6306dbd..23125ab87ded2 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1039,7 +1039,6 @@ static inline int wait_on_page_locked_killable(struct page *page) return folio_wait_locked_killable(page_folio(page)); } -int folio_put_wait_locked(struct folio *folio, int state); void wait_on_page_writeback(struct page *page); void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); diff --git a/mm/filemap.c b/mm/filemap.c index aab125d423b8f..f27c93a581ab4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1460,7 +1460,7 @@ EXPORT_SYMBOL(folio_wait_bit_killable); * * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal. */ -int folio_put_wait_locked(struct folio *folio, int state) +static int folio_put_wait_locked(struct folio *folio, int state) { return folio_wait_bit_common(folio, PG_locked, state, DROP); } -- GitLab From 3259914f8cab1bab3fe691a90ac3c47411cb0aba Mon Sep 17 00:00:00 2001 From: XU pengfei Date: Wed, 14 Sep 2022 09:21:14 +0800 Subject: [PATCH 1041/2223] mm/hugetlb: remove unnecessary 'NULL' values from pointer Pointer variables allocate memory first, and then judge. There is no need to initialize the assignment. 
Link: https://lkml.kernel.org/r/20220914012113.6271-1-xupengfei@nfschina.com Signed-off-by: XU pengfei Reviewed-by: Muchun Song Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0b1ab5af939e6..d4347ae337fb5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -258,7 +258,7 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) static struct file_region * get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) { - struct file_region *nrg = NULL; + struct file_region *nrg; VM_BUG_ON(resv->region_cache_count <= 0); @@ -340,7 +340,7 @@ static bool has_same_uncharge_info(struct file_region *rg, static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) { - struct file_region *nrg = NULL, *prg = NULL; + struct file_region *nrg, *prg; prg = list_prev_entry(rg, link); if (&prg->link != &resv->regions && prg->to == rg->from && -- GitLab From 188a39725ad7ded2d13e752a1a620152b0750175 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:02 -0700 Subject: [PATCH 1042/2223] hugetlbfs: revert use i_mmap_rwsem to address page fault/truncate race Patch series "hugetlb: Use new vma lock for huge pmd sharing synchronization", v2. hugetlb fault scalability regressions have recently been reported [1]. This is not the first such report, as regressions were also noted when commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization") was added [2] in v5.7. At that time, a proposal to address the regression was suggested [3] but went nowhere. The regression and benefit of this patch series is not evident when using the vm_scalability benchmark reported in [2] on a recent kernel. 
Results from running, "./usemem -n 48 --prealloc --prefault -O -U 3448054972" 48 sample Avg next-20220913 next-20220913 next-20220913 unmodified revert i_mmap_sema locking vma sema locking, this series ----------------------------------------------------------------------------- 498150 KB/s 501934 KB/s 504793 KB/s The recent regression report [1] notes page fault and fork latency of shared hugetlb mappings. To measure this, I created two simple programs: 1) map a shared hugetlb area, write fault all pages, unmap area Do this in a continuous loop to measure faults per second 2) map a shared hugetlb area, write fault a few pages, fork and exit Do this in a continuous loop to measure forks per second These programs were run on a 48 CPU VM with 320GB memory. The shared mapping size was 250GB. For comparison, a single instance of the program was run. Then, multiple instances were run in parallel to introduce lock contention. Changing the locking scheme results in a significant performance benefit. test instances unmodified revert vma -------------------------------------------------------------------------- faults per sec 1 393043 395680 389932 faults per sec 24 71405 81191 79048 forks per sec 1 2802 2747 2725 forks per sec 24 439 536 500 Combined faults 24 1621 68070 53662 Combined forks 24 358 67 142 Combined test is when running both faulting program and forking program simultaneously. Patches 1 and 2 of this series revert c0d0381ade79 and 87bf91d39bb5 which depends on c0d0381ade79. Acquisition of i_mmap_rwsem is still required in the fault path to establish pmd sharing, so this is moved back to huge_pmd_share. With c0d0381ade79 reverted, this race is exposed: Faulting thread Unsharing thread ... ... ptep = huge_pte_offset() or ptep = huge_pte_alloc() ... i_mmap_lock_write lock page table ptep invalid <------------------------ huge_pmd_unshare() Could be in a previously unlock_page_table sharing process or worse i_mmap_unlock_write ... 
ptl = huge_pte_lock(ptep) get/update pte set_pte_at(pte, ptep) Reverting 87bf91d39bb5 exposes races in page fault/file truncation. When the new vma lock is put to use in patch 8, this will handle the fault/file truncation races. This is explained in patch 9 where code associated with these races is cleaned up. Patches 3 - 5 restructure existing code in preparation for using the new vma lock (rw semaphore) for pmd sharing synchronization. The idea is that this semaphore will be held in read mode for the duration of fault processing, and held in write mode for unmap operations which may call huge_pmd_unshare. Acquiring i_mmap_rwsem is also still required to synchronize huge pmd sharing. However it is only required in the fault path when setting up sharing, and will be acquired in huge_pmd_share(). Patch 6 adds the new vma lock and all supporting routines, but does not actually change code to use the new lock. Patch 7 refactors code in preparation for using the new lock. And, patch 8 finally adds code to make use of this new vma lock. Unfortunately, the fault code and truncate/hole punch code would naturally take locks in the opposite order which could lead to deadlock. Since the performance of page faults is more important, the truncation/hole punch code is modified to back out and take locks in the correct order if necessary. [1] https://lore.kernel.org/linux-mm/43faf292-245b-5db5-cce9-369d8fb6bd21@infradead.org/ [2] https://lore.kernel.org/lkml/20200622005551.GK5535@shao2-debian/ [3] https://lore.kernel.org/linux-mm/20200706202615.32111-1-mike.kravetz@oracle.com/ This patch (of 9): Commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization") added code to take i_mmap_rwsem in read mode for the duration of fault processing. The use of i_mmap_rwsem to prevent fault/truncate races depends on this. However, this has been shown to cause performance/scaling issues. As a result, that code will be reverted. 
Since the use of i_mmap_rwsem to address page fault/truncate races depends on this, it must also be reverted. In a subsequent patch, code will be added to detect the fault/truncate race and back out operations as required. Link: https://lkml.kernel.org/r/20220914221810.95771-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20220914221810.95771-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 30 +++++++++--------------------- mm/hugetlb.c | 22 +++++++++++----------- 2 files changed, 20 insertions(+), 32 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f7a5b5124d8a9..a32031e751d14 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -419,9 +419,10 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve * maps and global counts. Page faults can not race with truncation - * in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents - * page faults in the truncated range by checking i_size. i_size is - * modified while holding i_mmap_rwsem. + * in this routine. hugetlb_no_page() prevents page faults in the + * truncated range. It checks i_size before allocation, and again after + * with the page table lock for the page held. The same lock must be + * acquired to unmap a page. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages.
* Only when releasing a page is the associated region/reserve map @@ -451,16 +452,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, u32 hash = 0; index = folio->index; - if (!truncate_op) { - /* - * Only need to hold the fault mutex in the - * hole punch case. This prevents races with - * page faults. Races are not possible in the - * case of truncation. - */ - hash = hugetlb_fault_mutex_hash(mapping, index); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - } + hash = hugetlb_fault_mutex_hash(mapping, index); + mutex_lock(&hugetlb_fault_mutex_table[hash]); /* * If folio is mapped, it was faulted in after being @@ -504,8 +497,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, } folio_unlock(folio); - if (!truncate_op) - mutex_unlock(&hugetlb_fault_mutex_table[hash]); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); } folio_batch_release(&fbatch); cond_resched(); @@ -543,8 +535,8 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) BUG_ON(offset & ~huge_page_mask(h)); pgoff = offset >> PAGE_SHIFT; - i_mmap_lock_write(mapping); i_size_write(inode, offset); + i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0, ZAP_FLAG_DROP_MARKER); @@ -703,11 +695,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, /* addr is the offset within the file (zero based) */ addr = index * hpage_size; - /* - * fault mutex taken here, protects against fault path - * and hole punch. inode_lock previously taken protects - * against truncation. 
- */ + /* mutex taken here, fault path and hole punch */ hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d4347ae337fb5..14afb5b67dd42 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5560,17 +5560,15 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, } /* - * We can not race with truncation due to holding i_mmap_rwsem. - * i_size is modified when holding i_mmap_rwsem, so check here - * once for faults beyond end of file. + * Use page lock to guard against racing truncation + * before we get page_table_lock. */ - size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) - goto out; - new_page = false; page = find_lock_page(mapping, idx); if (!page) { + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto out; /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { ret = hugetlb_handle_userfault(vma, mapping, idx, @@ -5666,6 +5664,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, } ptl = huge_pte_lock(h, mm, ptep); + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto backout; + ret = 0; /* If pte changed from under us, retry */ if (!pte_same(huge_ptep_get(ptep), old_pte)) @@ -5774,10 +5776,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold - * until finished with ptep. This serves two purposes: - * 1) It prevents huge_pmd_unshare from being called elsewhere - * and making the ptep no longer valid. - * 2) It synchronizes us with i_size modifications during truncation. + * until finished with ptep. This prevents huge_pmd_unshare from + * being called elsewhere and making the ptep no longer valid. * * ptep could have already be assigned via huge_pte_offset. 
That * is OK, as huge_pte_alloc will return the same value unless -- GitLab From 3a47c54f09c4c89128d8f67d49296b1c25b317d0 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:03 -0700 Subject: [PATCH 1043/2223] hugetlbfs: revert use i_mmap_rwsem for more pmd sharing synchronization Commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization") added code to take i_mmap_rwsem in read mode for the duration of fault processing. However, this has been shown to cause performance/scaling issues. Revert the code and go back to only taking the semaphore in huge_pmd_share during the fault path. Keep the code that takes i_mmap_rwsem in write mode before calling try_to_unmap as this is required if huge_pmd_unshare is called. NOTE: Reverting this code does expose the following race condition. Faulting thread Unsharing thread ... ... ptep = huge_pte_offset() or ptep = huge_pte_alloc() ... i_mmap_lock_write lock page table ptep invalid <------------------------ huge_pmd_unshare() Could be in a previously unlock_page_table sharing process or worse i_mmap_unlock_write ... ptl = huge_pte_lock(ptep) get/update pte set_pte_at(pte, ptep) It is unknown if the above race was ever experienced by a user. It was discovered via code inspection when initially addressed. In subsequent patches, a new synchronization mechanism will be added to coordinate pmd sharing and eliminate this race. Link: https://lkml.kernel.org/r/20220914221810.95771-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 -- mm/hugetlb.c | 77 +++++++------------------------------------- mm/rmap.c | 8 +---- mm/userfaultfd.c | 11 ++----- 4 files changed, 15 insertions(+), 83 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a32031e751d14..dfb735a91bbbd 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -467,9 +467,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, if (unlikely(folio_mapped(folio))) { BUG_ON(truncate_op); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_lock_write(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vmdelete_list(&mapping->i_mmap, index * pages_per_huge_page(h), (index + 1) * pages_per_huge_page(h), diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 14afb5b67dd42..8283706bd81d6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4770,7 +4770,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct hstate *h = hstate_vma(src_vma); unsigned long sz = huge_page_size(h); unsigned long npages = pages_per_huge_page(h); - struct address_space *mapping = src_vma->vm_file->f_mapping; struct mmu_notifier_range range; unsigned long last_addr_mask; int ret = 0; @@ -4782,14 +4781,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, mmu_notifier_invalidate_range_start(&range); mmap_assert_write_locked(src); raw_write_seqcount_begin(&src->write_protect_seq); - } else { - /* - * For shared mappings i_mmap_rwsem must be held to call - * huge_pte_alloc, otherwise the returned ptep could go - * away if part of a shared pmd and another thread calls - * huge_pmd_unshare. 
- */ - i_mmap_lock_read(mapping); } last_addr_mask = hugetlb_mask_last_page(h); @@ -4936,8 +4927,6 @@ again: if (cow) { raw_write_seqcount_end(&src->write_protect_seq); mmu_notifier_invalidate_range_end(&range); - } else { - i_mmap_unlock_read(mapping); } return ret; @@ -5346,29 +5335,8 @@ retry_avoidcopy: * may get SIGKILLed if it later faults. */ if (outside_reserve) { - struct address_space *mapping = vma->vm_file->f_mapping; - pgoff_t idx; - u32 hash; - put_page(old_page); - /* - * Drop hugetlb_fault_mutex and i_mmap_rwsem before - * unmapping. unmapping needs to hold i_mmap_rwsem - * in write mode. Dropping i_mmap_rwsem in read mode - * here is OK as COW mappings do not interact with - * PMD sharing. - * - * Reacquire both after unmap operation. - */ - idx = vma_hugecache_offset(h, vma, haddr); - hash = hugetlb_fault_mutex_hash(mapping, idx); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); - unmap_ref_private(mm, vma, old_page, haddr); - - i_mmap_lock_read(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); spin_lock(ptl); ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && @@ -5523,9 +5491,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, */ hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); ret = handle_userfault(&vmf, reason); - i_mmap_lock_read(mapping); mutex_lock(&hugetlb_fault_mutex_table[hash]); return ret; @@ -5760,11 +5726,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { - /* - * Since we hold no locks, ptep could be stale. That is - * OK as we are only making decisions based on content and - * not actually modifying content here. 
- */ entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { migration_entry_wait_huge(vma, ptep); @@ -5772,31 +5733,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); + } else { + ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); + if (!ptep) + return VM_FAULT_OOM; } - /* - * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold - * until finished with ptep. This prevents huge_pmd_unshare from - * being called elsewhere and making the ptep no longer valid. - * - * ptep could have already be assigned via huge_pte_offset. That - * is OK, as huge_pte_alloc will return the same value unless - * something has changed. - */ mapping = vma->vm_file->f_mapping; - i_mmap_lock_read(mapping); - ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); - if (!ptep) { - i_mmap_unlock_read(mapping); - return VM_FAULT_OOM; - } + idx = vma_hugecache_offset(h, vma, haddr); /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - idx = vma_hugecache_offset(h, vma, haddr); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -5861,7 +5811,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(pagecache_page); } mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); return handle_userfault(&vmf, VM_UFFD_WP); } @@ -5905,7 +5854,6 @@ out_ptl: } out_mutex: mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); /* * Generally it's safe to hold refcount during waiting page lock. 
But * here we just wait to defer the next page fault to avoid busy loop and @@ -6745,12 +6693,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the - * code much cleaner. - * - * This routine must be called with i_mmap_rwsem held in at least read mode if - * sharing is possible. For hugetlbfs, this prevents removal of any page - * table entries associated with the address space. This is important as we - * are setting up sharing based on existing page table entries (mappings). + * code much cleaner. pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_rwsem section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing. */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) @@ -6764,7 +6710,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *pte; spinlock_t *ptl; - i_mmap_assert_locked(mapping); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -6794,6 +6740,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); + i_mmap_unlock_read(mapping); return pte; } @@ -6804,7 +6751,7 @@ out: * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * - * Called with page table lock held and i_mmap_rwsem held in write mode. + * Called with page table lock held. 
* * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user diff --git a/mm/rmap.c b/mm/rmap.c index 0b9264e58d256..2a08647a61fca 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -23,10 +23,9 @@ * inode->i_rwsem (while writing or truncating, not reading or faulting) * mm->mmap_lock * mapping->invalidate_lock (in filemap_fault) - * page->flags PG_locked (lock_page) * (see hugetlbfs below) + * page->flags PG_locked (lock_page) * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) * mapping->i_mmap_rwsem - * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * anon_vma->rwsem * mm->page_table_lock or pte_lock * swap_lock (in swap_duplicate, swap_info_get) @@ -45,11 +44,6 @@ * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock - * - * * hugetlbfs PageHuge() pages take locks in this order: - * mapping->i_mmap_rwsem - * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) - * page->flags PG_locked (lock_page) */ #include diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9c035be2148bb..0fdbd2c05587d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -379,14 +379,10 @@ retry: BUG_ON(dst_addr >= dst_start + len); /* - * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. - * i_mmap_rwsem ensures the dst_pte remains valid even - * in the case of shared pmds. fault mutex prevents - * races with other faulting threads. + * Serialize via hugetlb_fault_mutex. 
*/ - mapping = dst_vma->vm_file->f_mapping; - i_mmap_lock_read(mapping); idx = linear_page_index(dst_vma, dst_addr); + mapping = dst_vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -394,7 +390,6 @@ retry: dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); if (!dst_pte) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); goto out_unlock; } @@ -402,7 +397,6 @@ retry: !huge_pte_none_mostly(huge_ptep_get(dst_pte))) { err = -EEXIST; mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); goto out_unlock; } @@ -411,7 +405,6 @@ retry: wp_copy); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); cond_resched(); -- GitLab From 7e1813d48dd30e6c6f235f6661d1bc108fcab528 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:04 -0700 Subject: [PATCH 1044/2223] hugetlb: rename remove_huge_page to hugetlb_delete_from_page_cache remove_huge_page removes a hugetlb page from the page cache. Change to hugetlb_delete_from_page_cache as it is a more descriptive name. huge_add_to_page_cache is global in scope, but only deals with hugetlb pages. For consistency and clarity, rename to hugetlb_add_to_page_cache. Link: https://lkml.kernel.org/r/20220914221810.95771-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 21 ++++++++++----------- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 8 ++++---- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dfb735a91bbbd..edd69cc43ca5d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -364,7 +364,7 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, return -EINVAL; } -static void remove_huge_page(struct page *page) +static void hugetlb_delete_from_page_cache(struct page *page) { ClearPageDirty(page); ClearPageUptodate(page); @@ -478,15 +478,14 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, folio_lock(folio); /* * We must free the huge page and remove from page - * cache (remove_huge_page) BEFORE removing the - * region/reserve map (hugetlb_unreserve_pages). In - * rare out of memory conditions, removal of the - * region/reserve map could fail. Correspondingly, - * the subpool and global reserve usage count can need - * to be adjusted. + * cache BEFORE removing the region/reserve map + * (hugetlb_unreserve_pages). In rare out of memory + * conditions, removal of the region/reserve map could + * fail. Correspondingly, the subpool and global + * reserve usage count can need to be adjusted. 
*/ VM_BUG_ON(HPageRestoreReserve(&folio->page)); - remove_huge_page(&folio->page); + hugetlb_delete_from_page_cache(&folio->page); freed++; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, @@ -723,7 +722,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, } clear_huge_page(page, addr, pages_per_huge_page(h)); __SetPageUptodate(page); - error = huge_add_to_page_cache(page, mapping, index); + error = hugetlb_add_to_page_cache(page, mapping, index); if (unlikely(error)) { restore_reserve_on_error(h, &pseudo_vma, addr, page); put_page(page); @@ -735,7 +734,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, SetHPageMigratable(page); /* - * unlock_page because locked by huge_add_to_page_cache() + * unlock_page because locked by hugetlb_add_to_page_cache() * put_page() due to reference from alloc_huge_page() */ unlock_page(page); @@ -980,7 +979,7 @@ static int hugetlbfs_error_remove_page(struct address_space *mapping, struct inode *inode = mapping->host; pgoff_t index = page->index; - remove_huge_page(page); + hugetlb_delete_from_page_cache(page); if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1))) hugetlb_fix_reserve_counts(inode); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6d7f397540602..4893d6d070998 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -666,7 +666,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); -int huge_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8283706bd81d6..accb166791c77 
100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5430,7 +5430,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, return page != NULL; } -int huge_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx) { struct folio *folio = page_folio(page); @@ -5569,7 +5569,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, new_page = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = huge_add_to_page_cache(page, mapping, idx); + int err = hugetlb_add_to_page_cache(page, mapping, idx); if (err) { /* * err can't be -EEXIST which implies someone @@ -5981,11 +5981,11 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, /* * Serialization between remove_inode_hugepages() and - * huge_add_to_page_cache() below happens through the + * hugetlb_add_to_page_cache() below happens through the * hugetlb_fault_mutex_table that here must be hold by * the caller. */ - ret = huge_add_to_page_cache(page, mapping, idx); + ret = hugetlb_add_to_page_cache(page, mapping, idx); if (ret) goto out_release_nounlock; page_in_pagecache = true; -- GitLab From c86272287bc65cb3d698a95c19651265e9f287cd Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:05 -0700 Subject: [PATCH 1045/2223] hugetlb: create remove_inode_single_folio to remove single file folio Create the new routine remove_inode_single_folio that will remove a single folio from a file. This is refactored code from remove_inode_hugepages. It checks for the uncommon case in which the folio is still mapped and unmaps. No functional change. This refactoring will be put to use and expanded upon in subsequent patches. Link: https://lkml.kernel.org/r/20220914221810.95771-5-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A.
Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 105 ++++++++++++++++++++++++++----------------- 1 file changed, 63 insertions(+), 42 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index edd69cc43ca5d..7112a9a9f54df 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -411,6 +411,60 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, } } +/* + * Called with hugetlb fault mutex held. + * Returns true if page was actually removed, false otherwise. + */ +static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, + struct address_space *mapping, + struct folio *folio, pgoff_t index, + bool truncate_op) +{ + bool ret = false; + + /* + * If folio is mapped, it was faulted in after being + * unmapped in caller. Unmap (again) while holding + * the fault mutex. The mutex will prevent faults + * until we finish removing the folio. + */ + if (unlikely(folio_mapped(folio))) { + i_mmap_lock_write(mapping); + hugetlb_vmdelete_list(&mapping->i_mmap, + index * pages_per_huge_page(h), + (index + 1) * pages_per_huge_page(h), + ZAP_FLAG_DROP_MARKER); + i_mmap_unlock_write(mapping); + } + + folio_lock(folio); + /* + * After locking page, make sure mapping is the same. + * We could have raced with page fault populate and + * backout code. + */ + if (folio_mapping(folio) == mapping) { + /* + * We must remove the folio from page cache before removing + * the region/ reserve map (hugetlb_unreserve_pages). In + * rare out of memory conditions, removal of the region/reserve + * map could fail. Correspondingly, the subpool and global + * reserve usage count can need to be adjusted. 
+ */ + VM_BUG_ON(HPageRestoreReserve(&folio->page)); + hugetlb_delete_from_page_cache(&folio->page); + ret = true; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages(inode, index, + index + 1, 1))) + hugetlb_fix_reserve_counts(inode); + } + } + + folio_unlock(folio); + return ret; +} + /* * remove_inode_hugepages handles two distinct cases: truncation and hole * punch. There are subtle differences in operation for each case. @@ -418,11 +472,10 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve - * maps and global counts. Page faults can not race with truncation - * in this routine. hugetlb_no_page() prevents page faults in the - * truncated range. It checks i_size before allocation, and again after - * with the page table lock for the page held. The same lock must be - * acquired to unmap a page. + * maps and global counts. Page faults can race with truncation. + * During faults, hugetlb_no_page() checks i_size before page allocation, + * and again after obtaining page table lock. It will 'back out' + * allocations in the truncated range. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserve map @@ -456,44 +509,12 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, mutex_lock(&hugetlb_fault_mutex_table[hash]); /* - * If folio is mapped, it was faulted in after being - * unmapped in caller. Unmap (again) now after taking - * the fault mutex. The mutex will prevent faults - * until we finish removing the folio. - * - * This race can only happen in the hole punch case. - * Getting here in a truncate operation is a bug. + * Remove folio that was part of folio_batch. 
*/ - if (unlikely(folio_mapped(folio))) { - BUG_ON(truncate_op); - - i_mmap_lock_write(mapping); - hugetlb_vmdelete_list(&mapping->i_mmap, - index * pages_per_huge_page(h), - (index + 1) * pages_per_huge_page(h), - ZAP_FLAG_DROP_MARKER); - i_mmap_unlock_write(mapping); - } - - folio_lock(folio); - /* - * We must free the huge page and remove from page - * cache BEFORE removing the region/reserve map - * (hugetlb_unreserve_pages). In rare out of memory - * conditions, removal of the region/reserve map could - * fail. Correspondingly, the subpool and global - * reserve usage count can need to be adjusted. - */ - VM_BUG_ON(HPageRestoreReserve(&folio->page)); - hugetlb_delete_from_page_cache(&folio->page); - freed++; - if (!truncate_op) { - if (unlikely(hugetlb_unreserve_pages(inode, - index, index + 1, 1))) - hugetlb_fix_reserve_counts(inode); - } - - folio_unlock(folio); + if (remove_inode_single_folio(h, inode, mapping, folio, + index, truncate_op)) + freed++; + mutex_unlock(&hugetlb_fault_mutex_table[hash]); } folio_batch_release(&fbatch); -- GitLab From 12710fd696343a0d6c318bdad22fa7809af7859b Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:06 -0700 Subject: [PATCH 1046/2223] hugetlb: rename vma_shareable() and refactor code Rename the routine vma_shareable to vma_addr_pmd_shareable as it is checking a specific address within the vma. Refactor code to check if an aligned range is shareable as this will be needed in a subsequent patch. Link: https://lkml.kernel.org/r/20220914221810.95771-6-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- mm/hugetlb.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index accb166791c77..482f7f357f753 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6640,26 +6640,33 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, return saddr; } -static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) +static bool __vma_aligned_range_pmd_shareable(struct vm_area_struct *vma, + unsigned long start, unsigned long end) { - unsigned long base = addr & PUD_MASK; - unsigned long end = base + PUD_SIZE; - /* * check on proper vm_flags and page table alignment */ - if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end)) + if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, start, end)) return true; return false; } +static bool vma_addr_pmd_shareable(struct vm_area_struct *vma, + unsigned long addr) +{ + unsigned long start = addr & PUD_MASK; + unsigned long end = start + PUD_SIZE; + + return __vma_aligned_range_pmd_shareable(vma, start, end); +} + bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { #ifdef CONFIG_USERFAULTFD if (uffd_disable_huge_pmd_share(vma)) return false; #endif - return vma_shareable(vma, addr); + return vma_addr_pmd_shareable(vma, addr); } /* -- GitLab From 8d9bfb2608145cf3e408428c224099e1585471af Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:07 -0700 Subject: [PATCH 1047/2223] hugetlb: add vma based lock for pmd sharing Allocate a new hugetlb_vma_lock structure and hang off vm_private_data for synchronization use by vmas that could be involved in pmd sharing. This data structure contains a rw semaphore that is the primary tool used for synchronization. 
This new structure is ref counted, so that it can exist when NOT attached to a vma. This is only helpful in resolving lock ordering issues where code may need to obtain the vma_lock while there are no guarantees the vma will not go away. By obtaining a ref on the structure, it can be guaranteed that at least the rw semaphore will not go away. Only add infrastructure for the new lock here. Actual use will be added in subsequent patches. [mike.kravetz@oracle.com: fix build issue for missing hugetlb_vma_lock_release] Link: https://lkml.kernel.org/r/YyNUtA1vRASOE4+M@monkey Link: https://lkml.kernel.org/r/20220914221810.95771-7-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 43 ++++++++- kernel/fork.c | 6 +- mm/hugetlb.c | 207 ++++++++++++++++++++++++++++++++++++---- mm/rmap.c | 8 +- 4 files changed, 240 insertions(+), 24 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 4893d6d070998..7b70aa9317292 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -115,6 +115,12 @@ struct file_region { #endif }; +struct hugetlb_vma_lock { + struct kref refs; + struct rw_semaphore rw_sema; + struct vm_area_struct *vma; +}; + extern struct resv_map *resv_map_alloc(void); void resv_map_release(struct kref *ref); @@ -127,7 +133,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, long min_hpages); void hugepage_put_subpool(struct hugepage_subpool *spool); -void reset_vma_resv_huge_pages(struct vm_area_struct *vma); +void hugetlb_dup_vma_private(struct vm_area_struct *vma); void clear_vma_resv_huge_pages(struct vm_area_struct *vma); int
hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *, @@ -215,6 +221,14 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags); +void hugetlb_vma_lock_read(struct vm_area_struct *vma); +void hugetlb_vma_unlock_read(struct vm_area_struct *vma); +void hugetlb_vma_lock_write(struct vm_area_struct *vma); +void hugetlb_vma_unlock_write(struct vm_area_struct *vma); +int hugetlb_vma_trylock_write(struct vm_area_struct *vma); +void hugetlb_vma_assert_locked(struct vm_area_struct *vma); +void hugetlb_vma_lock_release(struct kref *kref); + int pmd_huge(pmd_t pmd); int pud_huge(pud_t pud); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, @@ -226,7 +240,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ -static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma) { } @@ -337,6 +351,31 @@ static inline int prepare_hugepage_range(struct file *file, return -EINVAL; } +static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma) +{ +} + +static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma) +{ + return 1; +} + +static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma) +{ +} + static inline int pmd_huge(pmd_t pmd) { return 0; diff --git a/kernel/fork.c b/kernel/fork.c index 50460330306a8..3d788f759e5f1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -674,12 +674,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, } /* - * Clear 
hugetlb-related page reserves for children. This only - * affects MAP_PRIVATE mappings. Faults generated by the child - * are not guaranteed to succeed, even if read-only + * Copy/update hugetlb private vma information. */ if (is_vm_hugetlb_page(tmp)) - reset_vma_resv_huge_pages(tmp); + hugetlb_dup_vma_private(tmp); /* Link the vma into the MT */ mas.index = tmp->vm_start; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 482f7f357f753..f44b79998ac2d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -91,6 +91,8 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); +static void hugetlb_vma_lock_free(struct vm_area_struct *vma); +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static inline bool subpool_is_free(struct hugepage_subpool *spool) { @@ -859,7 +861,7 @@ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) * faults in a MAP_PRIVATE mapping. Only the process that called mmap() * is guaranteed to have their future faults succeed. * - * With the exception of reset_vma_resv_huge_pages() which is called at fork(), + * With the exception of hugetlb_dup_vma_private() which is called at fork(), * the reserve counters are updated with the hugetlb_lock held. It is safe * to reset the VMA at fork() time as it is not in use yet and there is no * chance of the global counters getting corrupted as a result of the values. @@ -1006,12 +1008,20 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } -/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ -void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +void hugetlb_dup_vma_private(struct vm_area_struct *vma) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + /* + * Clear vm_private_data + * - For MAP_PRIVATE mappings, this is the reserve map which does + * not apply to children. 
Faults generated by the children are + * not guaranteed to succeed, even if read-only. + * - For shared mappings this is a per-vma semaphore that may be + * allocated in a subsequent call to hugetlb_vm_op_open. + */ + vma->vm_private_data = (void *)0; if (!(vma->vm_flags & VM_MAYSHARE)) - vma->vm_private_data = (void *)0; + return; } /* @@ -1042,7 +1052,7 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) kref_put(&reservations->refs, resv_map_release); } - reset_vma_resv_huge_pages(vma); + hugetlb_dup_vma_private(vma); } /* Returns true if the VMA has associated reserve pages */ @@ -4623,16 +4633,21 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) resv_map_dup_hugetlb_cgroup_uncharge_info(resv); kref_get(&resv->refs); } + + hugetlb_vma_lock_alloc(vma); } static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); - struct resv_map *resv = vma_resv_map(vma); + struct resv_map *resv; struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve, start, end; long gbl_reserve; + hugetlb_vma_lock_free(vma); + + resv = vma_resv_map(vma); if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return; @@ -6439,6 +6454,11 @@ bool hugetlb_reserve_pages(struct inode *inode, return false; } + /* + * vma specific semaphore used for pmd sharing synchronization + */ + hugetlb_vma_lock_alloc(vma); + /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page @@ -6462,12 +6482,11 @@ bool hugetlb_reserve_pages(struct inode *inode, resv_map = inode_resv_map(inode); chg = region_chg(resv_map, from, to, &regions_needed); - } else { /* Private mapping.
*/ resv_map = resv_map_alloc(); if (!resv_map) - return false; + goto out_err; chg = to - from; @@ -6562,6 +6581,7 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: + hugetlb_vma_lock_free(vma); if (!vma || vma->vm_flags & VM_MAYSHARE) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. @@ -6641,14 +6661,34 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, } static bool __vma_aligned_range_pmd_shareable(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + bool check_vma_lock) { +#ifdef CONFIG_USERFAULTFD + if (uffd_disable_huge_pmd_share(vma)) + return false; +#endif /* * check on proper vm_flags and page table alignment */ - if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, start, end)) - return true; - return false; + if (!(vma->vm_flags & VM_MAYSHARE)) + return false; + if (check_vma_lock && !vma->vm_private_data) + return false; + if (!range_in_vma(vma, start, end)) + return false; + return true; +} + +static bool vma_pmd_shareable(struct vm_area_struct *vma) +{ + unsigned long start = ALIGN(vma->vm_start, PUD_SIZE), + end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); + + if (start >= end) + return false; + + return __vma_aligned_range_pmd_shareable(vma, start, end, false); } static bool vma_addr_pmd_shareable(struct vm_area_struct *vma, @@ -6657,15 +6697,11 @@ static bool vma_addr_pmd_shareable(struct vm_area_struct *vma, unsigned long start = addr & PUD_MASK; unsigned long end = start + PUD_SIZE; - return __vma_aligned_range_pmd_shareable(vma, start, end); + return __vma_aligned_range_pmd_shareable(vma, start, end, true); } bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { -#ifdef CONFIG_USERFAULTFD - if (uffd_disable_huge_pmd_share(vma)) - return false; -#endif return vma_addr_pmd_shareable(vma, addr); } @@ -6696,6 +6732,130 @@ void 
adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, *end = ALIGN(*end, PUD_SIZE); } +static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) && + vma->vm_private_data; +} + +void hugetlb_vma_lock_read(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_read(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_unlock_read(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + up_read(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_lock_write(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_write(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_unlock_write(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + up_write(&vma_lock->rw_sema); + } +} + +int hugetlb_vma_trylock_write(struct vm_area_struct *vma) +{ + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (!__vma_shareable_flags_pmd(vma)) + return 1; + + return down_write_trylock(&vma_lock->rw_sema); +} + +void hugetlb_vma_assert_locked(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + lockdep_assert_held(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_lock_release(struct kref *kref) +{ + struct hugetlb_vma_lock *vma_lock = container_of(kref, + struct hugetlb_vma_lock, refs); + + kfree(vma_lock); +} + +static void hugetlb_vma_lock_free(struct vm_area_struct *vma) +{ + /* + * Only present in sharable vmas. See comment in + * __unmap_hugepage_range_final about how VM_SHARED could + * be set without VM_MAYSHARE. As a result, we need to + * check if either is set in the free path. 
+ */ + if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED))) + return; + + if (vma->vm_private_data) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + /* + * vma_lock structure may or not be released, but it + * certainly will no longer be attached to vma so clear + * pointer. + */ + vma_lock->vma = NULL; + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); + vma->vm_private_data = NULL; + } +} + +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ + struct hugetlb_vma_lock *vma_lock; + + /* Only establish in (flags) sharable vmas */ + if (!vma || !(vma->vm_flags & VM_MAYSHARE)) + return; + + /* Should never get here with non-NULL vm_private_data */ + if (vma->vm_private_data) + return; + + /* Check size/alignment for pmd sharing possible */ + if (!vma_pmd_shareable(vma)) + return; + + vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); + if (!vma_lock) + /* + * If we can not allocate structure, then vma can not + * participate in pmd sharing. + */ + return; + + kref_init(&vma_lock->refs); + init_rwsem(&vma_lock->rw_sema); + vma_lock->vma = vma; + vma->vm_private_data = vma_lock; +} + /* * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. 
While this is not necessary for the @@ -6782,6 +6942,19 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, } #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ + +void hugetlb_vma_lock_release(struct kref *kref) +{ +} + +static void hugetlb_vma_lock_free(struct vm_area_struct *vma) +{ +} + +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ +} + pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { diff --git a/mm/rmap.c b/mm/rmap.c index 2a08647a61fca..0e179c823e0ac 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -24,7 +24,7 @@ * mm->mmap_lock * mapping->invalidate_lock (in filemap_fault) * page->flags PG_locked (lock_page) - * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) + * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) * mapping->i_mmap_rwsem * anon_vma->rwsem * mm->page_table_lock or pte_lock @@ -44,6 +44,12 @@ * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock + * + * hugetlbfs PageHuge() take locks in this order: + * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) + * vma_lock (hugetlb specific lock for pmd_sharing) + * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing) + * page->flags PG_locked (lock_page) */ #include -- GitLab From 378397ccb8e5a695a42e819df545ccd28641b683 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:08 -0700 Subject: [PATCH 1048/2223] hugetlb: create hugetlb_unmap_file_folio to unmap single file folio Create the new routine hugetlb_unmap_file_folio that will unmap a single file folio. This is refactored code from hugetlb_vmdelete_list. It is modified to do locking within the routine itself and check whether the page is mapped within a specific vma before unmapping. This refactoring will be put to use and expanded upon in a subsequent patch adding vma specific locking. 
Link: https://lkml.kernel.org/r/20220914221810.95771-8-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 123 +++++++++++++++++++++++++++++++++---------- 1 file changed, 94 insertions(+), 29 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7112a9a9f54df..3bb1772fce2f7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -371,6 +371,94 @@ static void hugetlb_delete_from_page_cache(struct page *page) delete_from_page_cache(page); } +/* + * Called with i_mmap_rwsem held for inode based vma maps. This makes + * sure vma (and vm_mm) will not go away. We also hold the hugetlb fault + * mutex for the page in the mapping. So, we can not race with page being + * faulted into the vma. + */ +static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, + unsigned long addr, struct page *page) +{ + pte_t *ptep, pte; + + ptep = huge_pte_offset(vma->vm_mm, addr, + huge_page_size(hstate_vma(vma))); + + if (!ptep) + return false; + + pte = huge_ptep_get(ptep); + if (huge_pte_none(pte) || !pte_present(pte)) + return false; + + if (pte_page(pte) == page) + return true; + + return false; +} + +/* + * Can vma_offset_start/vma_offset_end overflow on 32-bit arches? + * No, because the interval tree returns us only those vmas + * which overlap the truncated area starting at pgoff, + * and no vma on a 32-bit arch can span beyond the 4GB. 
+ */ +static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start) +{ + if (vma->vm_pgoff < start) + return (start - vma->vm_pgoff) << PAGE_SHIFT; + else + return 0; +} + +static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end) +{ + unsigned long t_end; + + if (!end) + return vma->vm_end; + + t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start; + if (t_end > vma->vm_end) + t_end = vma->vm_end; + return t_end; +} + +/* + * Called with hugetlb fault mutex held. Therefore, no more mappings to + * this folio can be created while executing the routine. + */ +static void hugetlb_unmap_file_folio(struct hstate *h, + struct address_space *mapping, + struct folio *folio, pgoff_t index) +{ + struct rb_root_cached *root = &mapping->i_mmap; + struct page *page = &folio->page; + struct vm_area_struct *vma; + unsigned long v_start; + unsigned long v_end; + pgoff_t start, end; + + start = index * pages_per_huge_page(h); + end = (index + 1) * pages_per_huge_page(h); + + i_mmap_lock_write(mapping); + + vma_interval_tree_foreach(vma, root, start, end - 1) { + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); + + if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) + continue; + + unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, + NULL, ZAP_FLAG_DROP_MARKER); + } + + i_mmap_unlock_write(mapping); +} + static void hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, zap_flags_t zap_flags) @@ -383,30 +471,13 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * an inclusive "last". */ vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) { - unsigned long v_offset; + unsigned long v_start; unsigned long v_end; - /* - * Can the expression below overflow on 32-bit arches? 
- * No, because the interval tree returns us only those vmas - * which overlap the truncated area starting at pgoff, - * and no vma on a 32-bit arch can span beyond the 4GB. - */ - if (vma->vm_pgoff < start) - v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; - else - v_offset = 0; - - if (!end) - v_end = vma->vm_end; - else { - v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) - + vma->vm_start; - if (v_end > vma->vm_end) - v_end = vma->vm_end; - } + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); - unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, + unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, NULL, zap_flags); } } @@ -428,14 +499,8 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, * the fault mutex. The mutex will prevent faults * until we finish removing the folio. */ - if (unlikely(folio_mapped(folio))) { - i_mmap_lock_write(mapping); - hugetlb_vmdelete_list(&mapping->i_mmap, - index * pages_per_huge_page(h), - (index + 1) * pages_per_huge_page(h), - ZAP_FLAG_DROP_MARKER); - i_mmap_unlock_write(mapping); - } + if (unlikely(folio_mapped(folio))) + hugetlb_unmap_file_folio(h, mapping, folio, index); folio_lock(folio); /* -- GitLab From 40549ba8f8e0ed1f8b235979563f619e9aa34fdf Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:09 -0700 Subject: [PATCH 1049/2223] hugetlb: use new vma_lock for pmd sharing synchronization The new hugetlb vma lock is used to address this race: Faulting thread Unsharing thread ... ... ptep = huge_pte_offset() or ptep = huge_pte_alloc() ... i_mmap_lock_write lock page table ptep invalid <------------------------ huge_pmd_unshare() Could be in a previously unlock_page_table sharing process or worse i_mmap_unlock_write ... The vma_lock is used as follows: - During fault processing. The lock is acquired in read mode before doing a page table lock and allocation (huge_pte_alloc). 
The lock is held until code is finished with the page table entry (ptep). - The lock must be held in write mode whenever huge_pmd_unshare is called. Lock ordering issues come into play when unmapping a page from all vmas mapping the page. The i_mmap_rwsem must be held to search for the vmas, and the vma lock must be held before calling unmap which will call huge_pmd_unshare. This is done today in: - try_to_migrate_one and try_to_unmap_one for page migration and memory error handling. In these routines we 'try' to obtain the vma lock and fail to unmap if unsuccessful. Calling routines already deal with the failure of unmapping. - hugetlb_vmdelete_list for truncation and hole punch. This routine also tries to acquire the vma lock. If it fails, it skips the unmapping. However, we can not have file truncation or hole punch fail because of contention. After hugetlb_vmdelete_list, truncation and hole punch call remove_inode_hugepages. remove_inode_hugepages checks for mapped pages and calls hugetlb_unmap_file_folio to unmap them. hugetlb_unmap_file_folio is designed to drop locks and reacquire in the correct order to guarantee unmap success. Link: https://lkml.kernel.org/r/20220914221810.95771-9-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A.
Shutemov" Cc: Miaohe Lin Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 66 +++++++++++++++++++++++++++- mm/hugetlb.c | 102 +++++++++++++++++++++++++++++++++++++++---- mm/memory.c | 2 + mm/rmap.c | 100 +++++++++++++++++++++++++++--------------- mm/userfaultfd.c | 9 +++- 5 files changed, 233 insertions(+), 46 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3bb1772fce2f7..009ae539b9b24 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -434,6 +434,7 @@ static void hugetlb_unmap_file_folio(struct hstate *h, struct folio *folio, pgoff_t index) { struct rb_root_cached *root = &mapping->i_mmap; + struct hugetlb_vma_lock *vma_lock; struct page *page = &folio->page; struct vm_area_struct *vma; unsigned long v_start; @@ -444,7 +445,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h, end = (index + 1) * pages_per_huge_page(h); i_mmap_lock_write(mapping); - +retry: + vma_lock = NULL; vma_interval_tree_foreach(vma, root, start, end - 1) { v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); @@ -452,11 +454,63 @@ static void hugetlb_unmap_file_folio(struct hstate *h, if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) continue; + if (!hugetlb_vma_trylock_write(vma)) { + vma_lock = vma->vm_private_data; + /* + * If we can not get vma lock, we need to drop + * immap_sema and take locks in order. First, + * take a ref on the vma_lock structure so that + * we can be guaranteed it will not go away when + * dropping immap_sema. + */ + kref_get(&vma_lock->refs); + break; + } + unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, NULL, ZAP_FLAG_DROP_MARKER); + hugetlb_vma_unlock_write(vma); } i_mmap_unlock_write(mapping); + + if (vma_lock) { + /* + * Wait on vma_lock. We know it is still valid as we have + * a reference. 
We must 'open code' vma locking as we do + * not know if vma_lock is still attached to vma. + */ + down_write(&vma_lock->rw_sema); + i_mmap_lock_write(mapping); + + vma = vma_lock->vma; + if (!vma) { + /* + * If lock is no longer attached to vma, then just + * unlock, drop our reference and retry looking for + * other vmas. + */ + up_write(&vma_lock->rw_sema); + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); + goto retry; + } + + /* + * vma_lock is still attached to vma. Check to see if vma + * still maps page and if so, unmap. + */ + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); + if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) + unmap_hugepage_range(vma, vma->vm_start + v_start, + v_end, NULL, + ZAP_FLAG_DROP_MARKER); + + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); + hugetlb_vma_unlock_write(vma); + + goto retry; + } } static void @@ -474,11 +528,21 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, unsigned long v_start; unsigned long v_end; + if (!hugetlb_vma_trylock_write(vma)) + continue; + v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, NULL, zap_flags); + + /* + * Note that vma lock only exists for shared/non-private + * vmas. Therefore, lock is not held when calling + * unmap_hugepage_range for private vmas. + */ + hugetlb_vma_unlock_write(vma); } } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f44b79998ac2d..d78504959df76 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4796,6 +4796,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, mmu_notifier_invalidate_range_start(&range); mmap_assert_write_locked(src); raw_write_seqcount_begin(&src->write_protect_seq); + } else { + /* + * For shared mappings the vma lock must be held before + * calling huge_pte_offset in the src vma. 
Otherwise, the + * returned ptep could go away if part of a shared pmd and + * another thread calls huge_pmd_unshare. + */ + hugetlb_vma_lock_read(src_vma); } last_addr_mask = hugetlb_mask_last_page(h); @@ -4942,6 +4950,8 @@ again: if (cow) { raw_write_seqcount_end(&src->write_protect_seq); mmu_notifier_invalidate_range_end(&range); + } else { + hugetlb_vma_unlock_read(src_vma); } return ret; @@ -5000,6 +5010,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); /* Prevent race with file truncation */ + hugetlb_vma_lock_write(vma); i_mmap_lock_write(mapping); for (; old_addr < old_end; old_addr += sz, new_addr += sz) { src_pte = huge_pte_offset(mm, old_addr, sz); @@ -5031,6 +5042,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, flush_tlb_range(vma, old_end - len, old_end); mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); + hugetlb_vma_unlock_write(vma); return len + old_addr - old_end; } @@ -5350,8 +5362,29 @@ retry_avoidcopy: * may get SIGKILLed if it later faults. */ if (outside_reserve) { + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t idx; + u32 hash; + put_page(old_page); + /* + * Drop hugetlb_fault_mutex and vma_lock before + * unmapping. unmapping needs to hold vma_lock + * in write mode. Dropping vma_lock in read mode + * here is OK as COW mappings do not interact with + * PMD sharing. + * + * Reacquire both after unmap operation. 
+ */ + idx = vma_hugecache_offset(h, vma, haddr); + hash = hugetlb_fault_mutex_hash(mapping, idx); + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + unmap_ref_private(mm, vma, old_page, haddr); + + mutex_lock(&hugetlb_fault_mutex_table[hash]); + hugetlb_vma_lock_read(vma); spin_lock(ptl); ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && @@ -5500,14 +5533,16 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, }; /* - * hugetlb_fault_mutex and i_mmap_rwsem must be + * vma_lock and hugetlb_fault_mutex must be * dropped before handling userfault. Reacquire * after handling fault to make calling code simpler. */ + hugetlb_vma_unlock_read(vma); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, reason); mutex_lock(&hugetlb_fault_mutex_table[hash]); + hugetlb_vma_lock_read(vma); return ret; } @@ -5741,6 +5776,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { + /* + * Since we hold no locks, ptep could be stale. That is + * OK as we are only making decisions based on content and + * not actually modifying content here. + */ entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { migration_entry_wait_huge(vma, ptep); @@ -5748,23 +5788,35 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); - } else { - ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); - if (!ptep) - return VM_FAULT_OOM; } - mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, haddr); - /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. 
*/ + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, haddr); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); + /* + * Acquire vma lock before calling huge_pte_alloc and hold + * until finished with ptep. This prevents huge_pmd_unshare from + * being called elsewhere and making the ptep no longer valid. + * + * ptep could have already be assigned via huge_pte_offset. That + * is OK, as huge_pte_alloc will return the same value unless + * something has changed. + */ + hugetlb_vma_lock_read(vma); + ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); + if (!ptep) { + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + return VM_FAULT_OOM; + } + entry = huge_ptep_get(ptep); /* PTE markers should be handled the same way as none pte */ if (huge_pte_none_mostly(entry)) { @@ -5825,6 +5877,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unlock_page(pagecache_page); put_page(pagecache_page); } + hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return handle_userfault(&vmf, VM_UFFD_WP); } @@ -5868,6 +5921,7 @@ out_ptl: put_page(pagecache_page); } out_mutex: + hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* * Generally it's safe to hold refcount during waiting page lock. 
But @@ -6330,8 +6384,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); - last_addr_mask = hugetlb_mask_last_page(h); + hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); + last_addr_mask = hugetlb_mask_last_page(h); for (; address < end; address += psize) { spinlock_t *ptl; ptep = huge_pte_offset(mm, address, psize); @@ -6430,6 +6485,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * See Documentation/mm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); return pages << h->order; @@ -6931,6 +6987,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); + hugetlb_vma_assert_locked(vma); BUG_ON(page_count(virt_to_page(ptep)) == 0); if (page_count(virt_to_page(ptep)) == 1) return 0; @@ -6943,6 +7000,31 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ +void hugetlb_vma_lock_read(struct vm_area_struct *vma) +{ +} + +void hugetlb_vma_unlock_read(struct vm_area_struct *vma) +{ +} + +void hugetlb_vma_lock_write(struct vm_area_struct *vma) +{ +} + +void hugetlb_vma_unlock_write(struct vm_area_struct *vma) +{ +} + +int hugetlb_vma_trylock_write(struct vm_area_struct *vma) +{ + return 1; +} + +void hugetlb_vma_assert_locked(struct vm_area_struct *vma) +{ +} + void hugetlb_vma_lock_release(struct kref *kref) { } @@ -7325,6 +7407,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, start, end); mmu_notifier_invalidate_range_start(&range); + hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); for (address = start; address < end; address += PUD_SIZE) { ptep = 
huge_pte_offset(mm, address, sz); @@ -7336,6 +7419,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) } flush_hugetlb_tlb_range(vma, start, end); i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); /* * No need to call mmu_notifier_invalidate_range(), see * Documentation/mm/mmu_notifier.rst. diff --git a/mm/memory.c b/mm/memory.c index c01c12500169d..b3ed17219d772 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1684,10 +1684,12 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) { zap_flags_t zap_flags = details ? details->zap_flags : 0; + hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); __unmap_hugepage_range_final(tlb, vma, start, end, NULL, zap_flags); i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); } } else unmap_page_range(tlb, vma, start, end, details); diff --git a/mm/rmap.c b/mm/rmap.c index 0e179c823e0ac..b6743c2b8b5f4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1551,24 +1551,39 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. + * + * We also must hold hugetlb vma_lock in write mode. + * Lock order dictates acquiring vma_lock BEFORE + * i_mmap_rwsem. We can only try lock here and fail + * if unsuccessful. */ - VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED)); - if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) { - flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, range.start, - range.end); - - /* - * The ref count of the PMD page was dropped - * which is part of the way map counting - * is done for shared PMDs. Return 'true' - * here. When there is no other sharing, - * huge_pmd_unshare returns false and we will - * unmap the actual page and drop map count - * to zero. 
- */ - page_vma_mapped_walk_done(&pvmw); - break; + if (!anon) { + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + if (!hugetlb_vma_trylock_write(vma)) { + page_vma_mapped_walk_done(&pvmw); + ret = false; + break; + } + if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + hugetlb_vma_unlock_write(vma); + flush_tlb_range(vma, + range.start, range.end); + mmu_notifier_invalidate_range(mm, + range.start, range.end); + /* + * The ref count of the PMD page was + * dropped which is part of the way map + * counting is done for shared PMDs. + * Return 'true' here. When there is + * no other sharing, huge_pmd_unshare + * returns false and we will unmap the + * actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } + hugetlb_vma_unlock_write(vma); } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { @@ -1926,26 +1941,41 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. + * + * We also must hold hugetlb vma_lock in write mode. + * Lock order dictates acquiring vma_lock BEFORE + * i_mmap_rwsem. We can only try lock here and + * fail if unsuccessful. */ - VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED)); - if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) { - flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, range.start, - range.end); - - /* - * The ref count of the PMD page was dropped - * which is part of the way map counting - * is done for shared PMDs. Return 'true' - * here. When there is no other sharing, - * huge_pmd_unshare returns false and we will - * unmap the actual page and drop map count - * to zero. 
- */ - page_vma_mapped_walk_done(&pvmw); - break; + if (!anon) { + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + if (!hugetlb_vma_trylock_write(vma)) { + page_vma_mapped_walk_done(&pvmw); + ret = false; + break; + } + if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + hugetlb_vma_unlock_write(vma); + flush_tlb_range(vma, + range.start, range.end); + mmu_notifier_invalidate_range(mm, + range.start, range.end); + + /* + * The ref count of the PMD page was + * dropped which is part of the way map + * counting is done for shared PMDs. + * Return 'true' here. When there is + * no other sharing, huge_pmd_unshare + * returns false and we will unmap the + * actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } + hugetlb_vma_unlock_write(vma); } - /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0fdbd2c05587d..e24e8a47ce8a2 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -379,16 +379,21 @@ retry: BUG_ON(dst_addr >= dst_start + len); /* - * Serialize via hugetlb_fault_mutex. + * Serialize via vma_lock and hugetlb_fault_mutex. + * vma_lock ensures the dst_pte remains valid even + * in the case of shared pmds. fault mutex prevents + * races with other faulting threads. 
*/ idx = linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); + hugetlb_vma_lock_read(dst_vma); err = -ENOMEM; dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); if (!dst_pte) { + hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out_unlock; } @@ -396,6 +401,7 @@ retry: if (mode != MCOPY_ATOMIC_CONTINUE && !huge_pte_none_mostly(huge_ptep_get(dst_pte))) { err = -EEXIST; + hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out_unlock; } @@ -404,6 +410,7 @@ retry: dst_addr, src_addr, mode, &page, wp_copy); + hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); cond_resched(); -- GitLab From fa27759af4a6d7494c986c44695b13bcd6eaf46b Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:10 -0700 Subject: [PATCH 1050/2223] hugetlb: clean up code checking for fault/truncation races With the new hugetlb vma lock in place, it can also be used to handle page fault races with file truncation. The lock is taken at the beginning of the code fault path in read mode. During truncation, it is taken in write mode for each vma which has the file mapped. The file's size (i_size) is modified before taking the vma lock to unmap. How are races handled? The page fault code checks i_size early in processing after taking the vma lock. If the fault is beyond i_size, the fault is aborted. If the fault is not beyond i_size the fault will continue and a new page will be added to the file. It could be that truncation code modifies i_size after the check in fault code. That is OK, as truncation code will soon remove the page. The truncation code will wait until the fault is finished, as it must obtain the vma lock in write mode. This patch cleans up/removes late checks in the fault paths that try to back out pages racing with truncation. 
As noted above, we just let the truncation code remove the pages. [mike.kravetz@oracle.com: fix reserve_alloc set but not used compiler warning] Link: https://lkml.kernel.org/r/Yyj7HsJWfHDoU24U@monkey Link: https://lkml.kernel.org/r/20220914221810.95771-10-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Miaohe Lin Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 31 ++++++++++++------------------- mm/hugetlb.c | 24 +++--------------------- 2 files changed, 15 insertions(+), 40 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 009ae539b9b24..ed57a029eab09 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -568,26 +568,19 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, folio_lock(folio); /* - * After locking page, make sure mapping is the same. - * We could have raced with page fault populate and - * backout code. + * We must remove the folio from page cache before removing + * the region/ reserve map (hugetlb_unreserve_pages). In + * rare out of memory conditions, removal of the region/reserve + * map could fail. Correspondingly, the subpool and global + * reserve usage count can need to be adjusted. */ - if (folio_mapping(folio) == mapping) { - /* - * We must remove the folio from page cache before removing - * the region/ reserve map (hugetlb_unreserve_pages). In - * rare out of memory conditions, removal of the region/reserve - * map could fail. Correspondingly, the subpool and global - * reserve usage count can need to be adjusted. 
- */ - VM_BUG_ON(HPageRestoreReserve(&folio->page)); - hugetlb_delete_from_page_cache(&folio->page); - ret = true; - if (!truncate_op) { - if (unlikely(hugetlb_unreserve_pages(inode, index, - index + 1, 1))) - hugetlb_fix_reserve_counts(inode); - } + VM_BUG_ON(HPageRestoreReserve(&folio->page)); + hugetlb_delete_from_page_cache(&folio->page); + ret = true; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages(inode, index, + index + 1, 1))) + hugetlb_fix_reserve_counts(inode); } folio_unlock(folio); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d78504959df76..b0e39045a7a86 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5680,10 +5680,6 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, } ptl = huge_pte_lock(h, mm, ptep); - size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) - goto backout; - ret = 0; /* If pte changed from under us, retry */ if (!pte_same(huge_ptep_get(ptep), old_pte)) @@ -5727,10 +5723,10 @@ out: backout: spin_unlock(ptl); backout_unlocked: - unlock_page(page); - /* restore reserve for newly allocated pages not in page cache */ if (new_page && !new_pagecache_page) restore_reserve_on_error(h, vma, haddr, page); + + unlock_page(page); put_page(page); goto out; } @@ -6062,26 +6058,12 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, ptl = huge_pte_lock(h, dst_mm, dst_pte); - /* - * Recheck the i_size after holding PT lock to make sure not - * to leave any page mapped (as page_mapped()) beyond the end - * of the i_size (remove_inode_hugepages() is strict about - * enforcing that). If we bail out here, we'll also leave a - * page in the radix tree in the vm_shared case beyond the end - * of the i_size, but remove_inode_hugepages() will take care - * of it as soon as we drop the hugetlb_fault_mutex_table. 
- */ - size = i_size_read(mapping->host) >> huge_page_shift(h); - ret = -EFAULT; - if (idx >= size) - goto out_release_unlock; - - ret = -EEXIST; /* * We allow to overwrite a pte marker: consider when both MISSING|WP * registered, we firstly wr-protect a none pte which has no page cache * page backing it, then access the page. */ + ret = -EEXIST; if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; -- GitLab From e41e614f6a3e3d0d21874a785d3a67d353e282da Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 15 Sep 2022 17:03:35 +0200 Subject: [PATCH 1051/2223] x86: add missing include to sparsemem.h Patch series "Add KernelMemorySanitizer infrastructure", v7. KernelMemorySanitizer (KMSAN) is a detector of errors related to uses of uninitialized memory. It relies on compile-time Clang instrumentation (similar to MSan in the userspace [1]) and tracks the state of every bit of kernel memory, being able to report an error if uninitialized value is used in a condition, dereferenced, or escapes to userspace, USB or DMA. KMSAN has reported more than 300 bugs in the past few years (recently fixed bugs: [2]), most of them with the help of syzkaller. Such bugs keep getting introduced into the kernel despite new compiler warnings and other analyses (the 6.0 cycle already resulted in several KMSAN-reported bugs, e.g. [3]). Mitigations like total stack and heap initialization are unfortunately very far from being deployable. The proposed patchset contains KMSAN runtime implementation together with small changes to other subsystems needed to make KMSAN work. The latter changes fall into several categories: 1. 
Changes and refactorings of existing code required to add KMSAN: - [01/43] x86: add missing include to sparsemem.h - [02/43] stackdepot: reserve 5 extra bits in depot_stack_handle_t - [03/43] instrumented.h: allow instrumenting both sides of copy_from_user() - [04/43] x86: asm: instrument usercopy in get_user() and __put_user_size() - [05/43] asm-generic: instrument usercopy in cacheflush.h - [10/43] libnvdimm/pfn_dev: increase MAX_STRUCT_PAGE_SIZE 2. KMSAN-related declarations in generic code, KMSAN runtime library, docs and configs: - [06/43] kmsan: add ReST documentation - [07/43] kmsan: introduce __no_sanitize_memory and __no_kmsan_checks - [09/43] x86: kmsan: pgtable: reduce vmalloc space - [11/43] kmsan: add KMSAN runtime core - [13/43] MAINTAINERS: add entry for KMSAN - [24/43] kmsan: add tests for KMSAN - [31/43] objtool: kmsan: list KMSAN API functions as uaccess-safe - [35/43] x86: kmsan: use __msan_ string functions where possible - [43/43] x86: kmsan: enable KMSAN builds for x86 3. Adding hooks from different subsystems to notify KMSAN about memory state changes: - [14/43] mm: kmsan: maintain KMSAN metadata for page - [15/43] mm: kmsan: call KMSAN hooks from SLUB code - [16/43] kmsan: handle task creation and exiting - [17/43] init: kmsan: call KMSAN initialization routines - [18/43] instrumented.h: add KMSAN support - [19/43] kmsan: add iomap support - [20/43] Input: libps2: mark data received in __ps2_command() as initialized - [21/43] dma: kmsan: unpoison DMA mappings - [34/43] x86: kmsan: handle open-coded assembly in lib/iomem.c - [36/43] x86: kmsan: sync metadata pages on page fault 4. 
Changes that prevent false reports by explicitly initializing memory, disabling optimized code that may trick KMSAN, selectively skipping instrumentation: - [08/43] kmsan: mark noinstr as __no_sanitize_memory - [12/43] kmsan: disable instrumentation of unsupported common kernel code - [22/43] virtio: kmsan: check/unpoison scatterlist in vring_map_one_sg() - [23/43] kmsan: handle memory sent to/from USB - [25/43] kmsan: disable strscpy() optimization under KMSAN - [26/43] crypto: kmsan: disable accelerated configs under KMSAN - [27/43] kmsan: disable physical page merging in biovec - [28/43] block: kmsan: skip bio block merging logic for KMSAN - [29/43] kcov: kmsan: unpoison area->list in kcov_remote_area_put() - [30/43] security: kmsan: fix interoperability with auto-initialization - [32/43] x86: kmsan: disable instrumentation of unsupported code - [33/43] x86: kmsan: skip shadow checks in __switch_to() - [37/43] x86: kasan: kmsan: support CONFIG_GENERIC_CSUM on x86, enable it for KASAN/KMSAN - [38/43] x86: fs: kmsan: disable CONFIG_DCACHE_WORD_ACCESS - [39/43] x86: kmsan: don't instrument stack walking functions - [40/43] entry: kmsan: introduce kmsan_unpoison_entry_regs() 5. Fixes for bugs detected with CONFIG_KMSAN_CHECK_PARAM_RETVAL: - [41/43] bpf: kmsan: initialize BPF registers with zeroes - [42/43] mm: fs: initialize fsdata passed to write_begin/write_end interface This patchset allows one to boot and run a defconfig+KMSAN kernel on a QEMU without known false positives. It however doesn't guarantee there are no false positives in drivers of certain devices or less tested subsystems, although KMSAN is actively tested on syzbot with a large config. By default, KMSAN enforces conservative checks of most kernel function parameters passed by value (via CONFIG_KMSAN_CHECK_PARAM_RETVAL, which maps to the -fsanitize-memory-param-retval compiler flag). 
As discussed in [4] and [5], passing uninitialized values as function parameters is considered undefined behavior, therefore KMSAN now reports such cases as errors. Several newly added patches fix known manifestations of these errors. This patch (of 43): Including sparsemem.h from other files (e.g. transitively via asm/pgtable_64_types.h) results in compilation errors due to unknown types: sparsemem.h:34:32: error: unknown type name 'phys_addr_t' extern int phys_to_target_node(phys_addr_t start); ^ sparsemem.h:36:39: error: unknown type name 'u64' extern int memory_add_physaddr_to_nid(u64 start); ^ Fix these errors by including linux/types.h from sparsemem.h This is required for the upcoming KMSAN patches. Link: https://lkml.kernel.org/r/20220915150417.722975-1-glider@google.com Link: https://lkml.kernel.org/r/20220915150417.722975-2-glider@google.com Signed-off-by: Dmitry Vyukov Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Cc: Andrey Konovalov Cc: Eric Biggers Signed-off-by: Andrew Morton --- arch/x86/include/asm/sparsemem.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 6a9ccc1b2be5d..64df897c0ee30 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_SPARSEMEM_H #define _ASM_X86_SPARSEMEM_H +#include + #ifdef CONFIG_SPARSEMEM /* * generic non-linear memory support: -- GitLab From 83a4f1ef45a90d740bc6edf6a2533b14a3e5d183 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:36 +0200 Subject: [PATCH 1052/2223] stackdepot: reserve 5 extra bits in depot_stack_handle_t Some users (currently only KMSAN) may want to use spare bits in depot_stack_handle_t. Let them do so by adding @extra_bits to __stack_depot_save() to store arbitrary flags, and providing stack_depot_get_extra_bits() to retrieve those flags. Also adapt KASAN to the new prototype by passing extra_bits=0, as KASAN does not intend to store additional information in the stack handle. Link: https://lkml.kernel.org/r/20220915150417.722975-3-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 8 ++++++++ lib/stackdepot.c | 29 ++++++++++++++++++++++++----- mm/kasan/common.c | 2 +- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index bc2797955de90..9ca7798d7a318 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -14,9 +14,15 @@ #include typedef u32 depot_stack_handle_t; +/* + * Number of bits in the handle that stack depot doesn't use. Users may store + * information in them. + */ +#define STACK_DEPOT_EXTRA_BITS 5 depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, + unsigned int extra_bits, gfp_t gfp_flags, bool can_alloc); /* @@ -59,6 +65,8 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); + int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); diff --git a/lib/stackdepot.c b/lib/stackdepot.c index e73fda23388d8..79e894cf84064 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -43,7 +43,8 @@ #define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \ STACK_ALLOC_ALIGN) #define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \ - STACK_ALLOC_NULL_PROTECTION_BITS - STACK_ALLOC_OFFSET_BITS) + STACK_ALLOC_NULL_PROTECTION_BITS - \ + STACK_ALLOC_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS) #define STACK_ALLOC_SLABS_CAP 8192 #define STACK_ALLOC_MAX_SLABS \ (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? 
\ @@ -56,6 +57,7 @@ union handle_parts { u32 slabindex : STACK_ALLOC_INDEX_BITS; u32 offset : STACK_ALLOC_OFFSET_BITS; u32 valid : STACK_ALLOC_NULL_PROTECTION_BITS; + u32 extra : STACK_DEPOT_EXTRA_BITS; }; }; @@ -77,6 +79,14 @@ static int next_slab_inited; static size_t depot_offset; static DEFINE_RAW_SPINLOCK(depot_lock); +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) +{ + union handle_parts parts = { .handle = handle }; + + return parts.extra; +} +EXPORT_SYMBOL(stack_depot_get_extra_bits); + static bool init_stack_slab(void **prealloc) { if (!*prealloc) @@ -140,6 +150,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) stack->handle.slabindex = depot_index; stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; stack->handle.valid = 1; + stack->handle.extra = 0; memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); depot_offset += required_size; @@ -382,6 +393,7 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch); * * @entries: Pointer to storage array * @nr_entries: Size of the storage array + * @extra_bits: Flags to store in unused bits of depot_stack_handle_t * @alloc_flags: Allocation gfp flags * @can_alloc: Allocate stack slabs (increased chance of failure if false) * @@ -393,6 +405,10 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch); * If the stack trace in @entries is from an interrupt, only the portion up to * interrupt entry is saved. * + * Additional opaque flags can be passed in @extra_bits, stored in the unused + * bits of the stack handle, and retrieved using stack_depot_get_extra_bits() + * without calling stack_depot_fetch(). + * * Context: Any context, but setting @can_alloc to %false is required if * alloc_pages() cannot be used from the current context. 
Currently * this is the case from contexts where neither %GFP_ATOMIC nor @@ -402,10 +418,11 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch); */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, + unsigned int extra_bits, gfp_t alloc_flags, bool can_alloc) { struct stack_record *found = NULL, **bucket; - depot_stack_handle_t retval = 0; + union handle_parts retval = { .handle = 0 }; struct page *page = NULL; void *prealloc = NULL; unsigned long flags; @@ -489,9 +506,11 @@ exit: free_pages((unsigned long)prealloc, STACK_ALLOC_ORDER); } if (found) - retval = found->handle.handle; + retval.handle = found->handle.handle; fast_exit: - return retval; + retval.extra = extra_bits; + + return retval.handle; } EXPORT_SYMBOL_GPL(__stack_depot_save); @@ -511,6 +530,6 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags) { - return __stack_depot_save(entries, nr_entries, alloc_flags, true); + return __stack_depot_save(entries, nr_entries, 0, alloc_flags, true); } EXPORT_SYMBOL_GPL(stack_depot_save); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 50f4338b477f2..833bf2cfd2a39 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -43,7 +43,7 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); - return __stack_depot_save(entries, nr_entries, flags, can_alloc); + return __stack_depot_save(entries, nr_entries, 0, flags, can_alloc); } void kasan_set_track(struct kasan_track *track, gfp_t flags) -- GitLab From 33b75c1d884e81ec97525e0a6fdcb187adf273f4 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:37 +0200 Subject: [PATCH 1053/2223] instrumented.h: allow instrumenting both sides of copy_from_user() Introduce instrument_copy_from_user_before() and instrument_copy_from_user_after() hooks to be invoked before and after the call to copy_from_user(). 
KASAN and KCSAN will be only using instrument_copy_from_user_before(), but for KMSAN we'll need to insert code after copy_from_user(). Link: https://lkml.kernel.org/r/20220915150417.722975-4-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/s390/lib/uaccess.c | 3 ++- include/linux/instrumented.h | 21 +++++++++++++++++++-- include/linux/uaccess.h | 19 ++++++++++++++----- lib/iov_iter.c | 9 ++++++--- lib/usercopy.c | 3 ++- 5 files changed, 43 insertions(+), 12 deletions(-) diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index d7b3b193d1088..58033dfcb6d45 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -81,8 +81,9 @@ unsigned long _copy_from_user_key(void *to, const void __user *from, might_fault(); if (!should_fail_usercopy()) { - instrument_copy_from_user(to, from, n); + instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user_key(to, from, n, key); + instrument_copy_from_user_after(to, from, n, res); } if (unlikely(res)) memset(to + (n - res), 0, res); diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index 42faebbaa202a..ee8f7d17d34f5 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -120,7 +120,7 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n) } /** - * 
instrument_copy_from_user - instrument writes of copy_from_user + * instrument_copy_from_user_before - add instrumentation before copy_from_user * * Instrument writes to kernel memory, that are due to copy_from_user (and * variants). The instrumentation should be inserted before the accesses. @@ -130,10 +130,27 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n) * @n number of bytes to copy */ static __always_inline void -instrument_copy_from_user(const void *to, const void __user *from, unsigned long n) +instrument_copy_from_user_before(const void *to, const void __user *from, unsigned long n) { kasan_check_write(to, n); kcsan_check_write(to, n); } +/** + * instrument_copy_from_user_after - add instrumentation after copy_from_user + * + * Instrument writes to kernel memory, that are due to copy_from_user (and + * variants). The instrumentation should be inserted after the accesses. + * + * @to destination address + * @from source address + * @n number of bytes to copy + * @left number of bytes not copied (as returned by copy_from_user) + */ +static __always_inline void +instrument_copy_from_user_after(const void *to, const void __user *from, + unsigned long n, unsigned long left) +{ +} + #endif /* _LINUX_INSTRUMENTED_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 47e5d374c7ebe..afb18f198843b 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -58,20 +58,28 @@ static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { - instrument_copy_from_user(to, from, n); + unsigned long res; + + instrument_copy_from_user_before(to, from, n); check_object_size(to, n, false); - return raw_copy_from_user(to, from, n); + res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); + return res; } static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long 
n) { + unsigned long res; + might_fault(); + instrument_copy_from_user_before(to, from, n); if (should_fail_usercopy()) return n; - instrument_copy_from_user(to, from, n); check_object_size(to, n, false); - return raw_copy_from_user(to, from, n); + res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); + return res; } /** @@ -115,8 +123,9 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { - instrument_copy_from_user(to, from, n); + instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); } if (unlikely(res)) memset(to + (n - res), 0, res); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 4b7fce72e3e52..c3ca28ca68a65 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -174,13 +174,16 @@ static int copyout(void __user *to, const void *from, size_t n) static int copyin(void *to, const void __user *from, size_t n) { + size_t res = n; + if (should_fail_usercopy()) return n; if (access_ok(from, n)) { - instrument_copy_from_user(to, from, n); - n = raw_copy_from_user(to, from, n); + instrument_copy_from_user_before(to, from, n); + res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); } - return n; + return res; } static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe, diff --git a/lib/usercopy.c b/lib/usercopy.c index 7413dd300516e..1505a52f23a01 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -12,8 +12,9 @@ unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { - instrument_copy_from_user(to, from, n); + instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); } if 
(unlikely(res)) memset(to + (n - res), 0, res); -- GitLab From 888f84a6da4d17e453058169fa7b235fff34f5bf Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:38 +0200 Subject: [PATCH 1054/2223] x86: asm: instrument usercopy in get_user() and put_user() Use hooks from instrumented.h to notify bug detection tools about usercopy events in variations of get_user() and put_user(). Link: https://lkml.kernel.org/r/20220915150417.722975-5-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/include/asm/uaccess.h | 22 +++++++++++++++------- include/linux/instrumented.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 913e593a3b45f..c1b8982899eca 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -5,6 +5,7 @@ * User space memory access functions */ #include +#include #include #include #include @@ -103,6 +104,7 @@ extern int __get_user_bad(void); : "=a" (__ret_gu), "=r" (__val_gu), \ ASM_CALL_CONSTRAINT \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ + instrument_get_user(__val_gu); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ }) @@ -192,9 +194,11 @@ extern void __put_user_nocheck_8(void); int __ret_pu; \ 
void __user *__ptr_pu; \ register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX); \ - __chk_user_ptr(ptr); \ - __ptr_pu = (ptr); \ - __val_pu = (x); \ + __typeof__(*(ptr)) __x = (x); /* eval x once */ \ + __typeof__(ptr) __ptr = (ptr); /* eval ptr once */ \ + __chk_user_ptr(__ptr); \ + __ptr_pu = __ptr; \ + __val_pu = __x; \ asm volatile("call __" #fn "_%P[size]" \ : "=c" (__ret_pu), \ ASM_CALL_CONSTRAINT \ @@ -202,6 +206,7 @@ extern void __put_user_nocheck_8(void); "r" (__val_pu), \ [size] "i" (sizeof(*(ptr))) \ :"ebx"); \ + instrument_put_user(__x, __ptr, sizeof(*(ptr))); \ __builtin_expect(__ret_pu, 0); \ }) @@ -248,23 +253,25 @@ extern void __put_user_nocheck_8(void); #define __put_user_size(x, ptr, size, label) \ do { \ + __typeof__(*(ptr)) __x = (x); /* eval x once */ \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ - __put_user_goto(x, ptr, "b", "iq", label); \ + __put_user_goto(__x, ptr, "b", "iq", label); \ break; \ case 2: \ - __put_user_goto(x, ptr, "w", "ir", label); \ + __put_user_goto(__x, ptr, "w", "ir", label); \ break; \ case 4: \ - __put_user_goto(x, ptr, "l", "ir", label); \ + __put_user_goto(__x, ptr, "l", "ir", label); \ break; \ case 8: \ - __put_user_goto_u64(x, ptr, label); \ + __put_user_goto_u64(__x, ptr, label); \ break; \ default: \ __put_user_bad(); \ } \ + instrument_put_user(__x, ptr, size); \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT @@ -305,6 +312,7 @@ do { \ default: \ (x) = __get_user_bad(); \ } \ + instrument_get_user(x); \ } while (0) #define __get_user_asm(x, addr, itype, ltype, label) \ diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index ee8f7d17d34f5..9f1dba8f717b0 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -153,4 +153,32 @@ instrument_copy_from_user_after(const void *to, const void __user *from, { } +/** + * instrument_get_user() - add instrumentation to get_user()-like macros + * + * get_user() and friends are fragile, so it may depend on the 
implementation + * whether the instrumentation happens before or after the data is copied from + * the userspace. + * + * @to destination variable, may not be address-taken + */ +#define instrument_get_user(to) \ +({ \ +}) + +/** + * instrument_put_user() - add instrumentation to put_user()-like macros + * + * put_user() and friends are fragile, so it may depend on the implementation + * whether the instrumentation happens before or after the data is copied to + * the userspace. + * + * @from source address + * @ptr userspace pointer to copy to + * @size number of bytes to copy + */ +#define instrument_put_user(from, ptr, size) \ +({ \ +}) + #endif /* _LINUX_INSTRUMENTED_H */ -- GitLab From 2b420aaf80408fd45d86ce983819813d43ac210f Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:39 +0200 Subject: [PATCH 1055/2223] asm-generic: instrument usercopy in cacheflush.h Notify memory tools about usercopy events in copy_to_user_page() and copy_from_user_page(). Link: https://lkml.kernel.org/r/20220915150417.722975-6-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S.
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/asm-generic/cacheflush.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index 4f07afacbc239..f46258d1a080f 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -2,6 +2,8 @@ #ifndef _ASM_GENERIC_CACHEFLUSH_H #define _ASM_GENERIC_CACHEFLUSH_H +#include + struct mm_struct; struct vm_area_struct; struct page; @@ -105,14 +107,22 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end) #ifndef copy_to_user_page #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ + instrument_copy_to_user((void __user *)dst, src, len); \ memcpy(dst, src, len); \ flush_icache_user_page(vma, page, vaddr, len); \ } while (0) #endif + #ifndef copy_from_user_page -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ + do { \ + instrument_copy_from_user_before(dst, (void __user *)src, \ + len); \ + memcpy(dst, src, len); \ + instrument_copy_from_user_after(dst, (void __user *)src, len, \ + 0); \ + } while (0) #endif #endif /* _ASM_GENERIC_CACHEFLUSH_H */ -- GitLab From 93858ae70cf4fb2ec75ae2f1e495b85b26614883 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:40 +0200 Subject: [PATCH 1056/2223] kmsan: add ReST documentation Add Documentation/dev-tools/kmsan.rst and reference it in the dev-tools index. 
Link: https://lkml.kernel.org/r/20220915150417.722975-7-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/dev-tools/index.rst | 1 + Documentation/dev-tools/kmsan.rst | 427 ++++++++++++++++++++++++++++++ 2 files changed, 428 insertions(+) create mode 100644 Documentation/dev-tools/kmsan.rst diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst index 4621eac290f46..6b0663075dc04 100644 --- a/Documentation/dev-tools/index.rst +++ b/Documentation/dev-tools/index.rst @@ -24,6 +24,7 @@ Documentation/dev-tools/testing-overview.rst kcov gcov kasan + kmsan ubsan kmemleak kcsan diff --git a/Documentation/dev-tools/kmsan.rst b/Documentation/dev-tools/kmsan.rst new file mode 100644 index 0000000000000..2a53a801198cb --- /dev/null +++ b/Documentation/dev-tools/kmsan.rst @@ -0,0 +1,427 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. Copyright (C) 2022, Google LLC. + +=================================== +The Kernel Memory Sanitizer (KMSAN) +=================================== + +KMSAN is a dynamic error detector aimed at finding uses of uninitialized +values. It is based on compiler instrumentation, and is quite similar to the +userspace `MemorySanitizer tool`_. 
+ +An important note is that KMSAN is not intended for production use, because it +drastically increases kernel memory footprint and slows the whole system down. + +Usage +===== + +Building the kernel +------------------- + +In order to build a kernel with KMSAN you will need a fresh Clang (14.0.6+). +Please refer to `LLVM documentation`_ for the instructions on how to build Clang. + +Now configure and build the kernel with CONFIG_KMSAN enabled. + +Example report +-------------- + +Here is an example of a KMSAN report:: + + ===================================================== + BUG: KMSAN: uninit-value in test_uninit_kmsan_check_memory+0x1be/0x380 [kmsan_test] + test_uninit_kmsan_check_memory+0x1be/0x380 mm/kmsan/kmsan_test.c:273 + kunit_run_case_internal lib/kunit/test.c:333 + kunit_try_run_case+0x206/0x420 lib/kunit/test.c:374 + kunit_generic_run_threadfn_adapter+0x6d/0xc0 lib/kunit/try-catch.c:28 + kthread+0x721/0x850 kernel/kthread.c:327 + ret_from_fork+0x1f/0x30 ??:? + + Uninit was stored to memory at: + do_uninit_local_array+0xfa/0x110 mm/kmsan/kmsan_test.c:260 + test_uninit_kmsan_check_memory+0x1a2/0x380 mm/kmsan/kmsan_test.c:271 + kunit_run_case_internal lib/kunit/test.c:333 + kunit_try_run_case+0x206/0x420 lib/kunit/test.c:374 + kunit_generic_run_threadfn_adapter+0x6d/0xc0 lib/kunit/try-catch.c:28 + kthread+0x721/0x850 kernel/kthread.c:327 + ret_from_fork+0x1f/0x30 ??:? + + Local variable uninit created at: + do_uninit_local_array+0x4a/0x110 mm/kmsan/kmsan_test.c:256 + test_uninit_kmsan_check_memory+0x1a2/0x380 mm/kmsan/kmsan_test.c:271 + + Bytes 4-7 of 8 are uninitialized + Memory access of size 8 starts at ffff888083fe3da0 + + CPU: 0 PID: 6731 Comm: kunit_try_catch Tainted: G B E 5.16.0-rc3+ #104 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 + ===================================================== + +The report says that the local variable ``uninit`` was created uninitialized in +``do_uninit_local_array()``. 
The third stack trace corresponds to the place +where this variable was created. + +The first stack trace shows where the uninit value was used (in +``test_uninit_kmsan_check_memory()``). The tool shows the bytes which were left +uninitialized in the local variable, as well as the stack where the value was +copied to another memory location before use. + +A use of uninitialized value ``v`` is reported by KMSAN in the following cases: + - in a condition, e.g. ``if (v) { ... }``; + - in an indexing or pointer dereferencing, e.g. ``array[v]`` or ``*v``; + - when it is copied to userspace or hardware, e.g. ``copy_to_user(..., &v, ...)``; + - when it is passed as an argument to a function, and + ``CONFIG_KMSAN_CHECK_PARAM_RETVAL`` is enabled (see below). + +The mentioned cases (apart from copying data to userspace or hardware, which is +a security issue) are considered undefined behavior from the C11 Standard point +of view. + +Disabling the instrumentation +----------------------------- + +A function can be marked with ``__no_kmsan_checks``. Doing so makes KMSAN +ignore uninitialized values in that function and mark its output as initialized. +As a result, the user will not get KMSAN reports related to that function. + +Another function attribute supported by KMSAN is ``__no_sanitize_memory``. +Applying this attribute to a function will result in KMSAN not instrumenting +it, which can be helpful if we do not want the compiler to interfere with some +low-level code (e.g. that marked with ``noinstr`` which implicitly adds +``__no_sanitize_memory``). + +This however comes at a cost: stack allocations from such functions will have +incorrect shadow/origin values, likely leading to false positives. Functions +called from non-instrumented code may also receive incorrect metadata for their +parameters. + +As a rule of thumb, avoid using ``__no_sanitize_memory`` explicitly. + +It is also possible to disable KMSAN for a single file (e.g. 
main.o):: + + KMSAN_SANITIZE_main.o := n + +or for the whole directory:: + + KMSAN_SANITIZE := n + +in the Makefile. Think of this as applying ``__no_sanitize_memory`` to every +function in the file or directory. Most users won't need KMSAN_SANITIZE, unless +their code gets broken by KMSAN (e.g. runs at early boot time). + +Support +======= + +In order for KMSAN to work the kernel must be built with Clang, which so far is +the only compiler that has KMSAN support. The kernel instrumentation pass is +based on the userspace `MemorySanitizer tool`_. + +The runtime library only supports x86_64 at the moment. + +How KMSAN works +=============== + +KMSAN shadow memory +------------------- + +KMSAN associates a metadata byte (also called shadow byte) with every byte of +kernel memory. A bit in the shadow byte is set iff the corresponding bit of the +kernel memory byte is uninitialized. Marking the memory uninitialized (i.e. +setting its shadow bytes to ``0xff``) is called poisoning, marking it +initialized (setting the shadow bytes to ``0x00``) is called unpoisoning. + +When a new variable is allocated on the stack, it is poisoned by default by +instrumentation code inserted by the compiler (unless it is a stack variable +that is immediately initialized). Any new heap allocation done without +``__GFP_ZERO`` is also poisoned. + +Compiler instrumentation also tracks the shadow values as they are used along +the code. When needed, instrumentation code invokes the runtime library in +``mm/kmsan/`` to persist shadow values. + +The shadow value of a basic or compound type is an array of bytes of the same +length. When a constant value is written into memory, that memory is unpoisoned. +When a value is read from memory, its shadow memory is also obtained and +propagated into all the operations which use that value. 
For every instruction +that takes one or more values the compiler generates code that calculates the +shadow of the result depending on those values and their shadows. + +Example:: + + int a = 0xff; // i.e. 0x000000ff + int b; + int c = a | b; + +In this case the shadow of ``a`` is ``0``, shadow of ``b`` is ``0xffffffff``, +shadow of ``c`` is ``0xffffff00``. This means that the upper three bytes of +``c`` are uninitialized, while the lower byte is initialized. + +Origin tracking +--------------- + +Every four bytes of kernel memory also have a so-called origin mapped to them. +This origin describes the point in program execution at which the uninitialized +value was created. Every origin is associated with either the full allocation +stack (for heap-allocated memory), or the function containing the uninitialized +variable (for locals). + +When an uninitialized variable is allocated on stack or heap, a new origin +value is created, and that variable's origin is filled with that value. When a +value is read from memory, its origin is also read and kept together with the +shadow. For every instruction that takes one or more values, the origin of the +result is one of the origins corresponding to any of the uninitialized inputs. +If a poisoned value is written into memory, its origin is written to the +corresponding storage as well. + +Example 1:: + + int a = 42; + int b; + int c = a + b; + +In this case the origin of ``b`` is generated upon function entry, and is +stored to the origin of ``c`` right before the addition result is written into +memory. + +Several variables may share the same origin address, if they are stored in the +same four-byte chunk. In this case every write to either variable updates the +origin for all of them. We have to sacrifice precision in this case, because +storing origins for individual bits (and even bytes) would be too costly. 
+ +Example 2:: + + int combine(short a, short b) { + union ret_t { + int i; + short s[2]; + } ret; + ret.s[0] = a; + ret.s[1] = b; + return ret.i; + } + +If ``a`` is initialized and ``b`` is not, the shadow of the result would be +0xffff0000, and the origin of the result would be the origin of ``b``. +``ret.s[0]`` would have the same origin, but it will never be used, because +that variable is initialized. + +If both function arguments are uninitialized, only the origin of the second +argument is preserved. + +Origin chaining +~~~~~~~~~~~~~~~ + +To ease debugging, KMSAN creates a new origin for every store of an +uninitialized value to memory. The new origin references both its creation stack +and the previous origin the value had. This may cause increased memory +consumption, so we limit the length of origin chains in the runtime. + +Clang instrumentation API +------------------------- + +Clang instrumentation pass inserts calls to functions defined in +``mm/kmsan/instrumentation.c`` into the kernel code. + +Shadow manipulation +~~~~~~~~~~~~~~~~~~~ + +For every memory access the compiler emits a call to a function that returns a +pair of pointers to the shadow and origin addresses of the given memory:: + + typedef struct { + void *shadow, *origin; + } shadow_origin_ptr_t + + shadow_origin_ptr_t __msan_metadata_ptr_for_load_{1,2,4,8}(void *addr) + shadow_origin_ptr_t __msan_metadata_ptr_for_store_{1,2,4,8}(void *addr) + shadow_origin_ptr_t __msan_metadata_ptr_for_load_n(void *addr, uintptr_t size) + shadow_origin_ptr_t __msan_metadata_ptr_for_store_n(void *addr, uintptr_t size) + +The function name depends on the memory access size. + +The compiler makes sure that for every loaded value its shadow and origin +values are read from memory. When a value is stored to memory, its shadow and +origin are also stored using the metadata pointers.
+ +Handling locals +~~~~~~~~~~~~~~~ + +A special function is used to create a new origin value for a local variable and +set the origin of that variable to that value:: + + void __msan_poison_alloca(void *addr, uintptr_t size, char *descr) + +Access to per-task data +~~~~~~~~~~~~~~~~~~~~~~~ + +At the beginning of every instrumented function KMSAN inserts a call to +``__msan_get_context_state()``:: + + kmsan_context_state *__msan_get_context_state(void) + +``kmsan_context_state`` is declared in ``include/linux/kmsan.h``:: + + struct kmsan_context_state { + char param_tls[KMSAN_PARAM_SIZE]; + char retval_tls[KMSAN_RETVAL_SIZE]; + char va_arg_tls[KMSAN_PARAM_SIZE]; + char va_arg_origin_tls[KMSAN_PARAM_SIZE]; + u64 va_arg_overflow_size_tls; + char param_origin_tls[KMSAN_PARAM_SIZE]; + depot_stack_handle_t retval_origin_tls; + }; + +This structure is used by KMSAN to pass parameter shadows and origins between +instrumented functions (unless the parameters are checked immediately by +``CONFIG_KMSAN_CHECK_PARAM_RETVAL``). + +Passing uninitialized values to functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Clang's MemorySanitizer instrumentation has an option, +``-fsanitize-memory-param-retval``, which makes the compiler check function +parameters passed by value, as well as function return values. + +The option is controlled by ``CONFIG_KMSAN_CHECK_PARAM_RETVAL``, which is +enabled by default to let KMSAN report uninitialized values earlier. +Please refer to the `LKML discussion`_ for more details. + +Because of the way the checks are implemented in LLVM (they are only applied to +parameters marked as ``noundef``), not all parameters are guaranteed to be +checked, so we cannot give up the metadata storage in ``kmsan_context_state``. + +String functions +~~~~~~~~~~~~~~~~ + +The compiler replaces calls to ``memcpy()``/``memmove()``/``memset()`` with the +following functions. 
These functions are also called when data structures are +initialized or copied, making sure shadow and origin values are copied alongside +with the data:: + + void *__msan_memcpy(void *dst, void *src, uintptr_t n) + void *__msan_memmove(void *dst, void *src, uintptr_t n) + void *__msan_memset(void *dst, int c, uintptr_t n) + +Error reporting +~~~~~~~~~~~~~~~ + +For each use of a value the compiler emits a shadow check that calls +``__msan_warning()`` in the case that value is poisoned:: + + void __msan_warning(u32 origin) + +``__msan_warning()`` causes KMSAN runtime to print an error report. + +Inline assembly instrumentation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +KMSAN instruments every inline assembly output with a call to:: + + void __msan_instrument_asm_store(void *addr, uintptr_t size) + +, which unpoisons the memory region. + +This approach may mask certain errors, but it also helps to avoid a lot of +false positives in bitwise operations, atomics etc. + +Sometimes the pointers passed into inline assembly do not point to valid memory. +In such cases they are ignored at runtime. + + +Runtime library +--------------- + +The code is located in ``mm/kmsan/``. + +Per-task KMSAN state +~~~~~~~~~~~~~~~~~~~~ + +Every task_struct has an associated KMSAN task state that holds the KMSAN +context (see above) and a per-task flag disallowing KMSAN reports:: + + struct kmsan_ctx { + ... + bool allow_reporting; + struct kmsan_context_state cstate; + ... + } + + struct task_struct { + ... + struct kmsan_ctx kmsan_ctx; + ... + } + +KMSAN contexts +~~~~~~~~~~~~~~ + +When running in a kernel task context, KMSAN uses ``current->kmsan_ctx.cstate`` +to hold the metadata for function parameters and return values.
+ +But in the case the kernel is running in the interrupt, softirq or NMI context, +where ``current`` is unavailable, KMSAN switches to per-cpu interrupt state:: + + DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); + +Metadata allocation +~~~~~~~~~~~~~~~~~~~ + +There are several places in the kernel for which the metadata is stored. + +1. Each ``struct page`` instance contains two pointers to its shadow and +origin pages:: + + struct page { + ... + struct page *shadow, *origin; + ... + }; + +At boot-time, the kernel allocates shadow and origin pages for every available +kernel page. This is done quite late, when the kernel address space is already +fragmented, so normal data pages may arbitrarily interleave with the metadata +pages. + +This means that in general for two contiguous memory pages their shadow/origin +pages may not be contiguous. Consequently, if a memory access crosses the +boundary of a memory block, accesses to shadow/origin memory may potentially +corrupt other pages or read incorrect values from them. + +In practice, contiguous memory pages returned by the same ``alloc_pages()`` +call will have contiguous metadata, whereas if these pages belong to two +different allocations their metadata pages can be fragmented. + +For the kernel data (``.data``, ``.bss`` etc.) and percpu memory regions +there also are no guarantees on metadata contiguity. + +In the case ``__msan_metadata_ptr_for_XXX_YYY()`` hits the border between two +pages with non-contiguous metadata, it returns pointers to fake shadow/origin regions:: + + char dummy_load_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); + char dummy_store_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); + +``dummy_load_page`` is zero-initialized, so reads from it always yield zeroes. +All stores to ``dummy_store_page`` are ignored. + +2. For vmalloc memory and modules, there is a direct mapping between the memory +range, its shadow and origin. 
KMSAN reduces the vmalloc area by 3/4, making only +the first quarter available to ``vmalloc()``. The second quarter of the vmalloc +area contains shadow memory for the first quarter, the third one holds the +origins. A small part of the fourth quarter contains shadow and origins for the +kernel modules. Please refer to ``arch/x86/include/asm/pgtable_64_types.h`` for +more details. + +When an array of pages is mapped into a contiguous virtual memory space, their +shadow and origin pages are similarly mapped into contiguous regions. + +References +========== + +E. Stepanov, K. Serebryany. `MemorySanitizer: fast detector of uninitialized +memory use in C++ +`_. +In Proceedings of CGO 2015. + +.. _MemorySanitizer tool: https://clang.llvm.org/docs/MemorySanitizer.html +.. _LLVM documentation: https://llvm.org/docs/GettingStarted.html +.. _LKML discussion: https://lore.kernel.org/all/20220614144853.3693273-1-glider@google.com/ -- GitLab From 9b448bc25b776daab3215393c3ce6953dd3bb8ad Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:41 +0200 Subject: [PATCH 1057/2223] kmsan: introduce __no_sanitize_memory and __no_kmsan_checks __no_sanitize_memory is a function attribute that instructs KMSAN to skip a function during instrumentation. This is needed to e.g. implement the noinstr functions. __no_kmsan_checks is a function attribute that makes KMSAN ignore the uninitialized values coming from the function's inputs, and initialize the function's outputs. Functions marked with this attribute can't be inlined into functions not marked with it, and vice versa. This behavior is overridden by __always_inline. __SANITIZE_MEMORY__ is a macro that's defined iff the file is instrumented with KMSAN. This is not the same as CONFIG_KMSAN, which is defined for every file. 
Link: https://lkml.kernel.org/r/20220915150417.722975-8-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 23 +++++++++++++++++++++++ include/linux/compiler-gcc.h | 6 ++++++ 2 files changed, 29 insertions(+) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index c84fec767445d..4fa0cc4cbd2c8 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -51,6 +51,29 @@ #define __no_sanitize_undefined #endif +#if __has_feature(memory_sanitizer) +#define __SANITIZE_MEMORY__ +/* + * Unlike other sanitizers, KMSAN still inserts code into functions marked with + * no_sanitize("kernel-memory"). Using disable_sanitizer_instrumentation + * provides the behavior consistent with other __no_sanitize_ attributes, + * guaranteeing that __no_sanitize_memory functions remain uninstrumented. + */ +#define __no_sanitize_memory __disable_sanitizer_instrumentation + +/* + * The __no_kmsan_checks attribute ensures that a function does not produce + * false positive reports by: + * - initializing all local variables and memory stores in this function; + * - skipping all shadow checks; + * - passing initialized arguments to this function's callees. 
+ */ +#define __no_kmsan_checks __attribute__((no_sanitize("kernel-memory"))) +#else +#define __no_sanitize_memory +#define __no_kmsan_checks +#endif + /* * Support for __has_feature(coverage_sanitizer) was added in Clang 13 together * with no_sanitize("coverage"). Prior versions of Clang support coverage diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 9b157b71036f1..f55a37efdb974 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -114,6 +114,12 @@ #define __SANITIZE_ADDRESS__ #endif +/* + * GCC does not support KMSAN. + */ +#define __no_sanitize_memory +#define __no_kmsan_checks + /* * Turn individual warnings and errors on and off locally, depending * on version. -- GitLab From 5de0ce85f5a4d2883eae6f48eb015bc5dfbd91e9 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:42 +0200 Subject: [PATCH 1058/2223] kmsan: mark noinstr as __no_sanitize_memory noinstr functions should never be instrumented, so make KMSAN skip them by applying the __no_sanitize_memory attribute. Link: https://lkml.kernel.org/r/20220915150417.722975-9-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/compiler_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 4f2a819fd60a3..015207a6e2bf5 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -229,7 +229,8 @@ struct ftrace_likely_data { /* Section for code which can't be instrumented at all */ #define noinstr \ noinline notrace __attribute((__section__(".noinstr.text"))) \ - __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage + __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage \ + __no_sanitize_memory #endif /* __KERNEL__ */ -- GitLab From 1a167ddd3c561b21a76187a81530a167e3522261 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:43 +0200 Subject: [PATCH 1059/2223] x86: kmsan: pgtable: reduce vmalloc space KMSAN is going to use 3/4 of existing vmalloc space to hold the metadata, therefore we lower VMALLOC_END to make sure vmalloc() doesn't allocate past the first 1/4. Link: https://lkml.kernel.org/r/20220915150417.722975-10-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable_64_types.h | 47 ++++++++++++++++++++++++- arch/x86/mm/init_64.c | 2 +- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 70e360a2e5fb7..04f36063ad546 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -139,7 +139,52 @@ extern unsigned int ptrs_per_p4d; # define VMEMMAP_START __VMEMMAP_BASE_L4 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ -#define VMALLOC_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1) +/* + * End of the region for which vmalloc page tables are pre-allocated. + * For non-KMSAN builds, this is the same as VMALLOC_END. + * For KMSAN builds, VMALLOC_START..VMEMORY_END is 4 times bigger than + * VMALLOC_START..VMALLOC_END (see below). + */ +#define VMEMORY_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1) + +#ifndef CONFIG_KMSAN +#define VMALLOC_END VMEMORY_END +#else +/* + * In KMSAN builds vmalloc area is four times smaller, and the remaining 3/4 + * are used to keep the metadata for virtual pages. The memory formerly + * belonging to vmalloc area is now laid out as follows: + * + * 1st quarter: VMALLOC_START to VMALLOC_END - new vmalloc area + * 2nd quarter: KMSAN_VMALLOC_SHADOW_START to + * VMALLOC_END+KMSAN_VMALLOC_SHADOW_OFFSET - vmalloc area shadow + * 3rd quarter: KMSAN_VMALLOC_ORIGIN_START to + * VMALLOC_END+KMSAN_VMALLOC_ORIGIN_OFFSET - vmalloc area origins + * 4th quarter: KMSAN_MODULES_SHADOW_START to KMSAN_MODULES_ORIGIN_START + * - shadow for modules, + * KMSAN_MODULES_ORIGIN_START to + * KMSAN_MODULES_ORIGIN_START + MODULES_LEN - origins for modules. 
+ */ +#define VMALLOC_QUARTER_SIZE ((VMALLOC_SIZE_TB << 40) >> 2) +#define VMALLOC_END (VMALLOC_START + VMALLOC_QUARTER_SIZE - 1) + +/* + * vmalloc metadata addresses are calculated by adding shadow/origin offsets + * to vmalloc address. + */ +#define KMSAN_VMALLOC_SHADOW_OFFSET VMALLOC_QUARTER_SIZE +#define KMSAN_VMALLOC_ORIGIN_OFFSET (VMALLOC_QUARTER_SIZE << 1) + +#define KMSAN_VMALLOC_SHADOW_START (VMALLOC_START + KMSAN_VMALLOC_SHADOW_OFFSET) +#define KMSAN_VMALLOC_ORIGIN_START (VMALLOC_START + KMSAN_VMALLOC_ORIGIN_OFFSET) + +/* + * The shadow/origin for modules are placed one by one in the last 1/4 of + * vmalloc space. + */ +#define KMSAN_MODULES_SHADOW_START (VMALLOC_END + KMSAN_VMALLOC_ORIGIN_OFFSET + 1) +#define KMSAN_MODULES_ORIGIN_START (KMSAN_MODULES_SHADOW_START + MODULES_LEN) +#endif /* CONFIG_KMSAN */ #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) /* The module sections ends with the start of the fixmap */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0fe690ebc269b..39b6bfcaa0ed4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1287,7 +1287,7 @@ static void __init preallocate_vmalloc_pages(void) unsigned long addr; const char *lvl; - for (addr = VMALLOC_START; addr <= VMALLOC_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + for (addr = VMALLOC_START; addr <= VMEMORY_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) { pgd_t *pgd = pgd_offset_k(addr); p4d_t *p4d; pud_t *pud; -- GitLab From 6e9f05dc66f951e8812c84a3ef148b601e3f8f45 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:44 +0200 Subject: [PATCH 1060/2223] libnvdimm/pfn_dev: increase MAX_STRUCT_PAGE_SIZE KMSAN adds extra metadata fields to struct page, so it does not fit into 64 bytes anymore. This change leads to increased memory consumption of the nvdimm driver, regardless of whether the kernel is built with KMSAN or not. 
Link: https://lkml.kernel.org/r/20220915150417.722975-11-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/nvdimm/nd.h | 2 +- drivers/nvdimm/pfn_devs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index ec5219680092d..85ca5b4da3cf3 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -652,7 +652,7 @@ void devm_namespace_disable(struct device *dev, struct nd_namespace_common *ndns); #if IS_ENABLED(CONFIG_ND_CLAIM) /* max struct page size independent of kernel config */ -#define MAX_STRUCT_PAGE_SIZE 64 +#define MAX_STRUCT_PAGE_SIZE 128 int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap); #else static inline int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 0e92ab4b32833..61af072ac98f9 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -787,7 +787,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) * when populating the vmemmap. This *should* be equal to * PMD_SIZE for most architectures. * - * Also make sure size of struct page is less than 64. We + * Also make sure size of struct page is less than 128. 
We * want to make sure we use large enough size here so that * we don't have a dynamic reserve space depending on * struct page size. But we also want to make sure we notice -- GitLab From f80be4571b19b9fd8dd1528cd2a2f123aff51f70 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:45 +0200 Subject: [PATCH 1061/2223] kmsan: add KMSAN runtime core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For each memory location KernelMemorySanitizer maintains two types of metadata: 1. The so-called shadow of that location - a byte:byte mapping describing whether or not individual bits of memory are initialized (shadow is 0) or not (shadow is 1). 2. The origins of that location - a 4-byte:4-byte mapping containing 4-byte IDs of the stack traces where uninitialized values were created. Each struct page now contains pointers to two struct pages holding KMSAN metadata (shadow and origins) for the original struct page. Utility routines in mm/kmsan/core.c and mm/kmsan/shadow.c handle the metadata creation, addressing, copying and checking. mm/kmsan/report.c performs error reporting in the cases an uninitialized value is used in a way that leads to undefined behavior. KMSAN compiler instrumentation is responsible for tracking the metadata along with the kernel memory. mm/kmsan/instrumentation.c provides the implementation for instrumentation hooks that are called from files compiled with -fsanitize=kernel-memory. To aid parameter passing (also done at instrumentation level), each task_struct now contains a struct kmsan_ctx used to track the metadata of function parameters and return values for that task. Finally, this patch provides CONFIG_KMSAN that enables KMSAN, and declares CFLAGS_KMSAN, which are applied to files compiled with KMSAN. The KMSAN_SANITIZE:=n Makefile directive can be used to completely disable KMSAN instrumentation for certain files.
Similarly, KMSAN_ENABLE_CHECKS:=n disables KMSAN checks and makes newly created stack memory initialized. Users can also use functions from include/linux/kmsan-checks.h to mark certain memory regions as uninitialized or initialized (this is called "poisoning" and "unpoisoning") or check that a particular region is initialized. Link: https://lkml.kernel.org/r/20220915150417.722975-12-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Makefile | 1 + include/linux/kmsan-checks.h | 64 +++++ include/linux/kmsan_types.h | 35 +++ include/linux/mm_types.h | 12 + include/linux/sched.h | 5 + lib/Kconfig.debug | 1 + lib/Kconfig.kmsan | 50 ++++ mm/Makefile | 1 + mm/kmsan/Makefile | 23 ++ mm/kmsan/core.c | 440 +++++++++++++++++++++++++++++++++++ mm/kmsan/hooks.c | 66 ++++++ mm/kmsan/instrumentation.c | 307 ++++++++++++++++++++++++ mm/kmsan/kmsan.h | 204 ++++++++++++++++ mm/kmsan/report.c | 219 +++++++++++++++++ mm/kmsan/shadow.c | 147 ++++++++++++ scripts/Makefile.kmsan | 8 + scripts/Makefile.lib | 9 + 17 files changed, 1592 insertions(+) create mode 100644 include/linux/kmsan-checks.h create mode 100644 include/linux/kmsan_types.h create mode 100644 lib/Kconfig.kmsan create mode 100644 mm/kmsan/Makefile create mode 100644 mm/kmsan/core.c create mode 100644 mm/kmsan/hooks.c create mode 100644 
mm/kmsan/instrumentation.c create mode 100644 mm/kmsan/kmsan.h create mode 100644 mm/kmsan/report.c create mode 100644 mm/kmsan/shadow.c create mode 100644 scripts/Makefile.kmsan diff --git a/Makefile b/Makefile index 952d354069a43..c9f37d25ea634 100644 --- a/Makefile +++ b/Makefile @@ -1015,6 +1015,7 @@ include-y := scripts/Makefile.extrawarn include-$(CONFIG_DEBUG_INFO) += scripts/Makefile.debug include-$(CONFIG_KASAN) += scripts/Makefile.kasan include-$(CONFIG_KCSAN) += scripts/Makefile.kcsan +include-$(CONFIG_KMSAN) += scripts/Makefile.kmsan include-$(CONFIG_UBSAN) += scripts/Makefile.ubsan include-$(CONFIG_KCOV) += scripts/Makefile.kcov include-$(CONFIG_RANDSTRUCT) += scripts/Makefile.randstruct diff --git a/include/linux/kmsan-checks.h b/include/linux/kmsan-checks.h new file mode 100644 index 0000000000000..a6522a0c28df9 --- /dev/null +++ b/include/linux/kmsan-checks.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KMSAN checks to be used for one-off annotations in subsystems. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#ifndef _LINUX_KMSAN_CHECKS_H +#define _LINUX_KMSAN_CHECKS_H + +#include + +#ifdef CONFIG_KMSAN + +/** + * kmsan_poison_memory() - Mark the memory range as uninitialized. + * @address: address to start with. + * @size: size of buffer to poison. + * @flags: GFP flags for allocations done by this function. + * + * Until other data is written to this range, KMSAN will treat it as + * uninitialized. Error reports for this memory will reference the call site of + * kmsan_poison_memory() as origin. + */ +void kmsan_poison_memory(const void *address, size_t size, gfp_t flags); + +/** + * kmsan_unpoison_memory() - Mark the memory range as initialized. + * @address: address to start with. + * @size: size of buffer to unpoison. + * + * Until other data is written to this range, KMSAN will treat it as + * initialized. 
+ */ +void kmsan_unpoison_memory(const void *address, size_t size); + +/** + * kmsan_check_memory() - Check the memory range for being initialized. + * @address: address to start with. + * @size: size of buffer to check. + * + * If any piece of the given range is marked as uninitialized, KMSAN will report + * an error. + */ +void kmsan_check_memory(const void *address, size_t size); + +#else + +static inline void kmsan_poison_memory(const void *address, size_t size, + gfp_t flags) +{ +} +static inline void kmsan_unpoison_memory(const void *address, size_t size) +{ +} +static inline void kmsan_check_memory(const void *address, size_t size) +{ +} + +#endif + +#endif /* _LINUX_KMSAN_CHECKS_H */ diff --git a/include/linux/kmsan_types.h b/include/linux/kmsan_types.h new file mode 100644 index 0000000000000..8bfa6c98176d4 --- /dev/null +++ b/include/linux/kmsan_types.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A minimal header declaring types added by KMSAN to existing kernel structs. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ +#ifndef _LINUX_KMSAN_TYPES_H +#define _LINUX_KMSAN_TYPES_H + +/* These constants are defined in the MSan LLVM instrumentation pass. 
*/ +#define KMSAN_RETVAL_SIZE 800 +#define KMSAN_PARAM_SIZE 800 + +struct kmsan_context_state { + char param_tls[KMSAN_PARAM_SIZE]; + char retval_tls[KMSAN_RETVAL_SIZE]; + char va_arg_tls[KMSAN_PARAM_SIZE]; + char va_arg_origin_tls[KMSAN_PARAM_SIZE]; + u64 va_arg_overflow_size_tls; + char param_origin_tls[KMSAN_PARAM_SIZE]; + u32 retval_origin_tls; +}; + +#undef KMSAN_PARAM_SIZE +#undef KMSAN_RETVAL_SIZE + +struct kmsan_ctx { + struct kmsan_context_state cstate; + int kmsan_in_runtime; + bool allow_reporting; +}; + +#endif /* _LINUX_KMSAN_TYPES_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5c87d0f292a23..500e536796ca4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -224,6 +224,18 @@ struct page { not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ +#ifdef CONFIG_KMSAN + /* + * KMSAN metadata for this page: + * - shadow page: every bit indicates whether the corresponding + * bit of the original page is initialized (0) or not (1); + * - origin page: every 4 bytes contain an id of the stack trace + * where the uninitialized value was created. 
+ */ + struct page *kmsan_shadow; + struct page *kmsan_origin; +#endif + #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index fbac3c19fe354..88a043f7235eb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -1362,6 +1363,10 @@ struct task_struct { #endif #endif +#ifdef CONFIG_KMSAN + struct kmsan_ctx kmsan_ctx; +#endif + #if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; #endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 6d1544d9201e4..0129bee7de010 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -970,6 +970,7 @@ config DEBUG_STACKOVERFLOW source "lib/Kconfig.kasan" source "lib/Kconfig.kfence" +source "lib/Kconfig.kmsan" endmenu # "Memory Debugging" diff --git a/lib/Kconfig.kmsan b/lib/Kconfig.kmsan new file mode 100644 index 0000000000000..5b19dbd34d76e --- /dev/null +++ b/lib/Kconfig.kmsan @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: GPL-2.0-only +config HAVE_ARCH_KMSAN + bool + +config HAVE_KMSAN_COMPILER + # Clang versions <14.0.0 also support -fsanitize=kernel-memory, but not + # all the features necessary to build the kernel with KMSAN. + depends on CC_IS_CLANG && CLANG_VERSION >= 140000 + def_bool $(cc-option,-fsanitize=kernel-memory -mllvm -msan-disable-checks=1) + +config KMSAN + bool "KMSAN: detector of uninitialized values use" + depends on HAVE_ARCH_KMSAN && HAVE_KMSAN_COMPILER + depends on SLUB && DEBUG_KERNEL && !KASAN && !KCSAN + select STACKDEPOT + select STACKDEPOT_ALWAYS_INIT + help + KernelMemorySanitizer (KMSAN) is a dynamic detector of uses of + uninitialized values in the kernel. It is based on compiler + instrumentation provided by Clang and thus requires Clang to build. + + An important note is that KMSAN is not intended for production use, + because it drastically increases kernel memory footprint and slows + the whole system down. 
+ + See <file:Documentation/dev-tools/kmsan.rst> for more details. + +if KMSAN + +config HAVE_KMSAN_PARAM_RETVAL + # -fsanitize-memory-param-retval is supported only by Clang >= 14. + depends on HAVE_KMSAN_COMPILER + def_bool $(cc-option,-fsanitize=kernel-memory -fsanitize-memory-param-retval) + +config KMSAN_CHECK_PARAM_RETVAL + bool "Check for uninitialized values passed to and returned from functions" + default y + depends on HAVE_KMSAN_PARAM_RETVAL + help + If the compiler supports -fsanitize-memory-param-retval, KMSAN will + eagerly check every function parameter passed by value and every + function return value. + + Disabling KMSAN_CHECK_PARAM_RETVAL will result in tracking shadow for + function parameters and return values across function borders. This + is a more relaxed mode, but it generates more instrumentation code and + may potentially report errors in corner cases when non-instrumented + functions call instrumented ones. + +endif diff --git a/mm/Makefile b/mm/Makefile index a731d1decbb12..cc23b00525848 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -89,6 +89,7 @@ obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_KFENCE) += kfence/ +obj-$(CONFIG_KMSAN) += kmsan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile new file mode 100644 index 0000000000000..550ad8625e4f9 --- /dev/null +++ b/mm/kmsan/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for KernelMemorySanitizer (KMSAN). +# +# +obj-y := core.o instrumentation.o hooks.o report.o shadow.o + +KMSAN_SANITIZE := n +KCOV_INSTRUMENT := n +UBSAN_SANITIZE := n + +# Disable instrumentation of KMSAN runtime with other tools. 
+CC_FLAGS_KMSAN_RUNTIME := -fno-stack-protector +CC_FLAGS_KMSAN_RUNTIME += $(call cc-option,-fno-conserve-stack) +CC_FLAGS_KMSAN_RUNTIME += -DDISABLE_BRANCH_PROFILING + +CFLAGS_REMOVE.o = $(CC_FLAGS_FTRACE) + +CFLAGS_core.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_hooks.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_instrumentation.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_report.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_shadow.o := $(CC_FLAGS_KMSAN_RUNTIME) diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c new file mode 100644 index 0000000000000..5330138fda5bc --- /dev/null +++ b/mm/kmsan/core.c @@ -0,0 +1,440 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN runtime library. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../slab.h" +#include "kmsan.h" + +bool kmsan_enabled __read_mostly; + +/* + * Per-CPU KMSAN context to be used in interrupts, where current->kmsan is + * unavaliable. 
+ */ +DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); + +void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags, + unsigned int poison_flags) +{ + u32 extra_bits = + kmsan_extra_bits(/*depth*/ 0, poison_flags & KMSAN_POISON_FREE); + bool checked = poison_flags & KMSAN_POISON_CHECK; + depot_stack_handle_t handle; + + handle = kmsan_save_stack_with_flags(flags, extra_bits); + kmsan_internal_set_shadow_origin(address, size, -1, handle, checked); +} + +void kmsan_internal_unpoison_memory(void *address, size_t size, bool checked) +{ + kmsan_internal_set_shadow_origin(address, size, 0, 0, checked); +} + +depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, + unsigned int extra) +{ + unsigned long entries[KMSAN_STACK_DEPTH]; + unsigned int nr_entries; + + nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0); + + /* Don't sleep (see might_sleep_if() in __alloc_pages_nodemask()). */ + flags &= ~__GFP_DIRECT_RECLAIM; + + return __stack_depot_save(entries, nr_entries, extra, flags, true); +} + +/* Copy the metadata following the memmove() behavior. */ +void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n) +{ + depot_stack_handle_t old_origin = 0, new_origin = 0; + int src_slots, dst_slots, i, iter, step, skip_bits; + depot_stack_handle_t *origin_src, *origin_dst; + void *shadow_src, *shadow_dst; + u32 *align_shadow_src, shadow; + bool backwards; + + shadow_dst = kmsan_get_metadata(dst, KMSAN_META_SHADOW); + if (!shadow_dst) + return; + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(dst, n)); + + shadow_src = kmsan_get_metadata(src, KMSAN_META_SHADOW); + if (!shadow_src) { + /* + * @src is untracked: zero out destination shadow, ignore the + * origins, we're done. 
+ */ + __memset(shadow_dst, 0, n); + return; + } + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(src, n)); + + __memmove(shadow_dst, shadow_src, n); + + origin_dst = kmsan_get_metadata(dst, KMSAN_META_ORIGIN); + origin_src = kmsan_get_metadata(src, KMSAN_META_ORIGIN); + KMSAN_WARN_ON(!origin_dst || !origin_src); + src_slots = (ALIGN((u64)src + n, KMSAN_ORIGIN_SIZE) - + ALIGN_DOWN((u64)src, KMSAN_ORIGIN_SIZE)) / + KMSAN_ORIGIN_SIZE; + dst_slots = (ALIGN((u64)dst + n, KMSAN_ORIGIN_SIZE) - + ALIGN_DOWN((u64)dst, KMSAN_ORIGIN_SIZE)) / + KMSAN_ORIGIN_SIZE; + KMSAN_WARN_ON((src_slots < 1) || (dst_slots < 1)); + KMSAN_WARN_ON((src_slots - dst_slots > 1) || + (dst_slots - src_slots < -1)); + + backwards = dst > src; + i = backwards ? min(src_slots, dst_slots) - 1 : 0; + iter = backwards ? -1 : 1; + + align_shadow_src = + (u32 *)ALIGN_DOWN((u64)shadow_src, KMSAN_ORIGIN_SIZE); + for (step = 0; step < min(src_slots, dst_slots); step++, i += iter) { + KMSAN_WARN_ON(i < 0); + shadow = align_shadow_src[i]; + if (i == 0) { + /* + * If @src isn't aligned on KMSAN_ORIGIN_SIZE, don't + * look at the first @src % KMSAN_ORIGIN_SIZE bytes + * of the first shadow slot. + */ + skip_bits = ((u64)src % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow >> skip_bits) << skip_bits; + } + if (i == src_slots - 1) { + /* + * If @src + n isn't aligned on + * KMSAN_ORIGIN_SIZE, don't look at the last + * (@src + n) % KMSAN_ORIGIN_SIZE bytes of the + * last shadow slot. + */ + skip_bits = (((u64)src + n) % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow << skip_bits) >> skip_bits; + } + /* + * Overwrite the origin only if the corresponding + * shadow is nonempty. + */ + if (origin_src[i] && (origin_src[i] != old_origin) && shadow) { + old_origin = origin_src[i]; + new_origin = kmsan_internal_chain_origin(old_origin); + /* + * kmsan_internal_chain_origin() may return + * NULL, but we don't want to lose the previous + * origin value. 
+ */ + if (!new_origin) + new_origin = old_origin; + } + if (shadow) + origin_dst[i] = new_origin; + else + origin_dst[i] = 0; + } + /* + * If dst_slots is greater than src_slots (i.e. + * dst_slots == src_slots + 1), there is an extra origin slot at the + * beginning or end of the destination buffer, for which we take the + * origin from the previous slot. + * This is only done if the part of the source shadow corresponding to + * slot is non-zero. + * + * E.g. if we copy 8 aligned bytes that are marked as uninitialized + * and have origins o111 and o222, to an unaligned buffer with offset 1, + * these two origins are copied to three origin slots, so one of them + * needs to be duplicated, depending on the copy direction (@backwards) + * + * src shadow: |uuuu|uuuu|....| + * src origin: |o111|o222|....| + * + * backwards = 0: + * dst shadow: |.uuu|uuuu|u...| + * dst origin: |....|o111|o222| - fill the empty slot with o111 + * backwards = 1: + * dst shadow: |.uuu|uuuu|u...| + * dst origin: |o111|o222|....| - fill the empty slot with o222 + */ + if (src_slots < dst_slots) { + if (backwards) { + shadow = align_shadow_src[src_slots - 1]; + skip_bits = (((u64)dst + n) % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow << skip_bits) >> skip_bits; + if (shadow) + /* src_slots > 0, therefore dst_slots is at least 2 */ + origin_dst[dst_slots - 1] = + origin_dst[dst_slots - 2]; + } else { + shadow = align_shadow_src[0]; + skip_bits = ((u64)dst % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow >> skip_bits) << skip_bits; + if (shadow) + origin_dst[0] = origin_dst[1]; + } + } +} + +depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) +{ + unsigned long entries[3]; + u32 extra_bits; + int depth; + bool uaf; + + if (!id) + return id; + /* + * Make sure we have enough spare bits in @id to hold the UAF bit and + * the chain depth. 
+ */ + BUILD_BUG_ON( + (1 << STACK_DEPOT_EXTRA_BITS) <= (KMSAN_MAX_ORIGIN_DEPTH << 1)); + + extra_bits = stack_depot_get_extra_bits(id); + depth = kmsan_depth_from_eb(extra_bits); + uaf = kmsan_uaf_from_eb(extra_bits); + + /* + * Stop chaining origins once the depth reached KMSAN_MAX_ORIGIN_DEPTH. + * This mostly happens in the case structures with uninitialized padding + * are copied around many times. Origin chains for such structures are + * usually periodic, and it does not make sense to fully store them. + */ + if (depth == KMSAN_MAX_ORIGIN_DEPTH) + return id; + + depth++; + extra_bits = kmsan_extra_bits(depth, uaf); + + entries[0] = KMSAN_CHAIN_MAGIC_ORIGIN; + entries[1] = kmsan_save_stack_with_flags(GFP_ATOMIC, 0); + entries[2] = id; + /* + * @entries is a local var in non-instrumented code, so KMSAN does not + * know it is initialized. Explicitly unpoison it to avoid false + * positives when __stack_depot_save() passes it to instrumented code. + */ + kmsan_internal_unpoison_memory(entries, sizeof(entries), false); + return __stack_depot_save(entries, ARRAY_SIZE(entries), extra_bits, + GFP_ATOMIC, true); +} + +void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, + u32 origin, bool checked) +{ + u64 address = (u64)addr; + void *shadow_start; + u32 *origin_start; + size_t pad = 0; + + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size)); + shadow_start = kmsan_get_metadata(addr, KMSAN_META_SHADOW); + if (!shadow_start) { + /* + * kmsan_metadata_is_contiguous() is true, so either all shadow + * and origin pages are NULL, or all are non-NULL. 
+ */ + if (checked) { + pr_err("%s: not memsetting %ld bytes starting at %px, because the shadow is NULL\n", + __func__, size, addr); + KMSAN_WARN_ON(true); + } + return; + } + __memset(shadow_start, b, size); + + if (!IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) { + pad = address % KMSAN_ORIGIN_SIZE; + address -= pad; + size += pad; + } + size = ALIGN(size, KMSAN_ORIGIN_SIZE); + origin_start = + (u32 *)kmsan_get_metadata((void *)address, KMSAN_META_ORIGIN); + + for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++) + origin_start[i] = origin; +} + +struct page *kmsan_vmalloc_to_page_or_null(void *vaddr) +{ + struct page *page; + + if (!kmsan_internal_is_vmalloc_addr(vaddr) && + !kmsan_internal_is_module_addr(vaddr)) + return NULL; + page = vmalloc_to_page(vaddr); + if (pfn_valid(page_to_pfn(page))) + return page; + else + return NULL; +} + +void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr, + int reason) +{ + depot_stack_handle_t cur_origin = 0, new_origin = 0; + unsigned long addr64 = (unsigned long)addr; + depot_stack_handle_t *origin = NULL; + unsigned char *shadow = NULL; + int cur_off_start = -1; + int chunk_size; + size_t pos = 0; + + if (!size) + return; + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size)); + while (pos < size) { + chunk_size = min(size - pos, + PAGE_SIZE - ((addr64 + pos) % PAGE_SIZE)); + shadow = kmsan_get_metadata((void *)(addr64 + pos), + KMSAN_META_SHADOW); + if (!shadow) { + /* + * This page is untracked. If there were uninitialized + * bytes before, report them. + */ + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, + cur_off_start, pos - 1, user_addr, + reason); + kmsan_leave_runtime(); + } + cur_origin = 0; + cur_off_start = -1; + pos += chunk_size; + continue; + } + for (int i = 0; i < chunk_size; i++) { + if (!shadow[i]) { + /* + * This byte is unpoisoned. If there were + * poisoned bytes before, report them. 
+ */ + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, + cur_off_start, pos + i - 1, + user_addr, reason); + kmsan_leave_runtime(); + } + cur_origin = 0; + cur_off_start = -1; + continue; + } + origin = kmsan_get_metadata((void *)(addr64 + pos + i), + KMSAN_META_ORIGIN); + KMSAN_WARN_ON(!origin); + new_origin = *origin; + /* + * Encountered new origin - report the previous + * uninitialized range. + */ + if (cur_origin != new_origin) { + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, + cur_off_start, pos + i - 1, + user_addr, reason); + kmsan_leave_runtime(); + } + cur_origin = new_origin; + cur_off_start = pos + i; + } + } + pos += chunk_size; + } + KMSAN_WARN_ON(pos != size); + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1, + user_addr, reason); + kmsan_leave_runtime(); + } +} + +bool kmsan_metadata_is_contiguous(void *addr, size_t size) +{ + char *cur_shadow = NULL, *next_shadow = NULL, *cur_origin = NULL, + *next_origin = NULL; + u64 cur_addr = (u64)addr, next_addr = cur_addr + PAGE_SIZE; + depot_stack_handle_t *origin_p; + bool all_untracked = false; + + if (!size) + return true; + + /* The whole range belongs to the same page. 
*/ + if (ALIGN_DOWN(cur_addr + size - 1, PAGE_SIZE) == + ALIGN_DOWN(cur_addr, PAGE_SIZE)) + return true; + + cur_shadow = kmsan_get_metadata((void *)cur_addr, /*is_origin*/ false); + if (!cur_shadow) + all_untracked = true; + cur_origin = kmsan_get_metadata((void *)cur_addr, /*is_origin*/ true); + if (all_untracked && cur_origin) + goto report; + + for (; next_addr < (u64)addr + size; + cur_addr = next_addr, cur_shadow = next_shadow, + cur_origin = next_origin, next_addr += PAGE_SIZE) { + next_shadow = kmsan_get_metadata((void *)next_addr, false); + next_origin = kmsan_get_metadata((void *)next_addr, true); + if (all_untracked) { + if (next_shadow || next_origin) + goto report; + if (!next_shadow && !next_origin) + continue; + } + if (((u64)cur_shadow == ((u64)next_shadow - PAGE_SIZE)) && + ((u64)cur_origin == ((u64)next_origin - PAGE_SIZE))) + continue; + goto report; + } + return true; + +report: + pr_err("%s: attempting to access two shadow page ranges.\n", __func__); + pr_err("Access of size %ld at %px.\n", size, addr); + pr_err("Addresses belonging to different ranges: %px and %px\n", + (void *)cur_addr, (void *)next_addr); + pr_err("page[0].shadow: %px, page[1].shadow: %px\n", cur_shadow, + next_shadow); + pr_err("page[0].origin: %px, page[1].origin: %px\n", cur_origin, + next_origin); + origin_p = kmsan_get_metadata(addr, KMSAN_META_ORIGIN); + if (origin_p) { + pr_err("Origin: %08x\n", *origin_p); + kmsan_print_origin(*origin_p); + } else { + pr_err("Origin: unavailable\n"); + } + return false; +} diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c new file mode 100644 index 0000000000000..4ac62fa67a02a --- /dev/null +++ b/mm/kmsan/hooks.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN hooks for kernel subsystems. + * + * These functions handle creation of KMSAN metadata for memory allocations. 
+ * + * Copyright (C) 2018-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "../slab.h" +#include "kmsan.h" + +/* + * Instrumented functions shouldn't be called under + * kmsan_enter_runtime()/kmsan_leave_runtime(), because this will lead to + * skipping effects of functions like memset() inside instrumented code. + */ + +/* Functions from kmsan-checks.h follow. */ +void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + /* The users may want to poison/unpoison random memory. */ + kmsan_internal_poison_memory((void *)address, size, flags, + KMSAN_POISON_NOCHECK); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(kmsan_poison_memory); + +void kmsan_unpoison_memory(const void *address, size_t size) +{ + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ua_flags = user_access_save(); + kmsan_enter_runtime(); + /* The users may want to poison/unpoison random memory. */ + kmsan_internal_unpoison_memory((void *)address, size, + KMSAN_POISON_NOCHECK); + kmsan_leave_runtime(); + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(kmsan_unpoison_memory); + +void kmsan_check_memory(const void *addr, size_t size) +{ + if (!kmsan_enabled) + return; + return kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0, + REASON_ANY); +} +EXPORT_SYMBOL(kmsan_check_memory); diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c new file mode 100644 index 0000000000000..280d154132684 --- /dev/null +++ b/mm/kmsan/instrumentation.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN compiler API. + * + * This file implements __msan_XXX hooks that Clang inserts into the code + * compiled with -fsanitize=kernel-memory. + * See Documentation/dev-tools/kmsan.rst for more information on how KMSAN + * instrumentation works. 
+ * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include "kmsan.h" +#include +#include +#include + +static inline bool is_bad_asm_addr(void *addr, uintptr_t size, bool is_store) +{ + if ((u64)addr < TASK_SIZE) + return true; + if (!kmsan_get_metadata(addr, KMSAN_META_SHADOW)) + return true; + return false; +} + +static inline struct shadow_origin_ptr +get_shadow_origin_ptr(void *addr, u64 size, bool store) +{ + unsigned long ua_flags = user_access_save(); + struct shadow_origin_ptr ret; + + ret = kmsan_get_shadow_origin_ptr(addr, size, store); + user_access_restore(ua_flags); + return ret; +} + +/* Get shadow and origin pointers for a memory load with non-standard size. */ +struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr, + uintptr_t size) +{ + return get_shadow_origin_ptr(addr, size, /*store*/ false); +} +EXPORT_SYMBOL(__msan_metadata_ptr_for_load_n); + +/* Get shadow and origin pointers for a memory store with non-standard size. */ +struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr, + uintptr_t size) +{ + return get_shadow_origin_ptr(addr, size, /*store*/ true); +} +EXPORT_SYMBOL(__msan_metadata_ptr_for_store_n); + +/* + * Declare functions that obtain shadow/origin pointers for loads and stores + * with fixed size. + */ +#define DECLARE_METADATA_PTR_GETTER(size) \ + struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \ + void *addr) \ + { \ + return get_shadow_origin_ptr(addr, size, /*store*/ false); \ + } \ + EXPORT_SYMBOL(__msan_metadata_ptr_for_load_##size); \ + struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \ + void *addr) \ + { \ + return get_shadow_origin_ptr(addr, size, /*store*/ true); \ + } \ + EXPORT_SYMBOL(__msan_metadata_ptr_for_store_##size) + +DECLARE_METADATA_PTR_GETTER(1); +DECLARE_METADATA_PTR_GETTER(2); +DECLARE_METADATA_PTR_GETTER(4); +DECLARE_METADATA_PTR_GETTER(8); + +/* + * Handle a memory store performed by inline assembly. 
KMSAN conservatively + * attempts to unpoison the outputs of asm() directives to prevent false + * positives caused by missed stores. + */ +void __msan_instrument_asm_store(void *addr, uintptr_t size) +{ + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ua_flags = user_access_save(); + /* + * Most of the accesses are below 32 bytes. The two exceptions so far + * are clwb() (64 bytes) and FPU state (512 bytes). + * It's unlikely that the assembly will touch more than 512 bytes. + */ + if (size > 512) { + WARN_ONCE(1, "assembly store size too big: %ld\n", size); + size = 8; + } + if (is_bad_asm_addr(addr, size, /*is_store*/ true)) { + user_access_restore(ua_flags); + return; + } + kmsan_enter_runtime(); + /* Unpoisoning the memory on best effort. */ + kmsan_internal_unpoison_memory(addr, size, /*checked*/ false); + kmsan_leave_runtime(); + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(__msan_instrument_asm_store); + +/* + * KMSAN instrumentation pass replaces LLVM memcpy, memmove and memset + * intrinsics with calls to respective __msan_ functions. We use + * get_param0_metadata() and set_retval_metadata() to store the shadow/origin + * values for the destination argument of these functions and use them for the + * functions' return values. + */ +static inline void get_param0_metadata(u64 *shadow, + depot_stack_handle_t *origin) +{ + struct kmsan_ctx *ctx = kmsan_get_context(); + + *shadow = *(u64 *)(ctx->cstate.param_tls); + *origin = ctx->cstate.param_origin_tls[0]; +} + +static inline void set_retval_metadata(u64 shadow, depot_stack_handle_t origin) +{ + struct kmsan_ctx *ctx = kmsan_get_context(); + + *(u64 *)(ctx->cstate.retval_tls) = shadow; + ctx->cstate.retval_origin_tls = origin; +} + +/* Handle llvm.memmove intrinsic. 
*/ +void *__msan_memmove(void *dst, const void *src, uintptr_t n) +{ + depot_stack_handle_t origin; + void *result; + u64 shadow; + + get_param0_metadata(&shadow, &origin); + result = __memmove(dst, src, n); + if (!n) + /* Some people call memmove() with zero length. */ + return result; + if (!kmsan_enabled || kmsan_in_runtime()) + return result; + + kmsan_enter_runtime(); + kmsan_internal_memmove_metadata(dst, (void *)src, n); + kmsan_leave_runtime(); + + set_retval_metadata(shadow, origin); + return result; +} +EXPORT_SYMBOL(__msan_memmove); + +/* Handle llvm.memcpy intrinsic. */ +void *__msan_memcpy(void *dst, const void *src, uintptr_t n) +{ + depot_stack_handle_t origin; + void *result; + u64 shadow; + + get_param0_metadata(&shadow, &origin); + result = __memcpy(dst, src, n); + if (!n) + /* Some people call memcpy() with zero length. */ + return result; + + if (!kmsan_enabled || kmsan_in_runtime()) + return result; + + kmsan_enter_runtime(); + /* Using memmove instead of memcpy doesn't affect correctness. */ + kmsan_internal_memmove_metadata(dst, (void *)src, n); + kmsan_leave_runtime(); + + set_retval_metadata(shadow, origin); + return result; +} +EXPORT_SYMBOL(__msan_memcpy); + +/* Handle llvm.memset intrinsic. */ +void *__msan_memset(void *dst, int c, uintptr_t n) +{ + depot_stack_handle_t origin; + void *result; + u64 shadow; + + get_param0_metadata(&shadow, &origin); + result = __memset(dst, c, n); + if (!kmsan_enabled || kmsan_in_runtime()) + return result; + + kmsan_enter_runtime(); + /* + * Clang doesn't pass parameter metadata here, so it is impossible to + * use shadow of @c to set up the shadow for @dst. + */ + kmsan_internal_unpoison_memory(dst, n, /*checked*/ false); + kmsan_leave_runtime(); + + set_retval_metadata(shadow, origin); + return result; +} +EXPORT_SYMBOL(__msan_memset); + +/* + * Create a new origin from an old one. This is done when storing an + * uninitialized value to memory. 
When reporting an error, KMSAN unrolls and + * prints the whole chain of stores that preceded the use of this value. + */ +depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin) +{ + depot_stack_handle_t ret = 0; + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return ret; + + ua_flags = user_access_save(); + + /* Creating new origins may allocate memory. */ + kmsan_enter_runtime(); + ret = kmsan_internal_chain_origin(origin); + kmsan_leave_runtime(); + user_access_restore(ua_flags); + return ret; +} +EXPORT_SYMBOL(__msan_chain_origin); + +/* Poison a local variable when entering a function. */ +void __msan_poison_alloca(void *address, uintptr_t size, char *descr) +{ + depot_stack_handle_t handle; + unsigned long entries[4]; + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ua_flags = user_access_save(); + entries[0] = KMSAN_ALLOCA_MAGIC_ORIGIN; + entries[1] = (u64)descr; + entries[2] = (u64)__builtin_return_address(0); + /* + * With frame pointers enabled, it is possible to quickly fetch the + * second frame of the caller stack without calling the unwinder. + * Without them, simply do not bother. + */ + if (IS_ENABLED(CONFIG_UNWINDER_FRAME_POINTER)) + entries[3] = (u64)__builtin_return_address(1); + else + entries[3] = 0; + + /* stack_depot_save() may allocate memory. */ + kmsan_enter_runtime(); + handle = stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC); + kmsan_leave_runtime(); + + kmsan_internal_set_shadow_origin(address, size, -1, handle, + /*checked*/ true); + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(__msan_poison_alloca); + +/* Unpoison a local variable. 
*/ +void __msan_unpoison_alloca(void *address, uintptr_t size) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + kmsan_enter_runtime(); + kmsan_internal_unpoison_memory(address, size, /*checked*/ true); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(__msan_unpoison_alloca); + +/* + * Report that an uninitialized value with the given origin was used in a way + * that constituted undefined behavior. + */ +void __msan_warning(u32 origin) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + kmsan_report(origin, /*address*/ 0, /*size*/ 0, + /*off_first*/ 0, /*off_last*/ 0, /*user_addr*/ 0, + REASON_ANY); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(__msan_warning); + +/* + * At the beginning of an instrumented function, obtain the pointer to + * `struct kmsan_context_state` holding the metadata for function parameters. + */ +struct kmsan_context_state *__msan_get_context_state(void) +{ + return &kmsan_get_context()->cstate; +} +EXPORT_SYMBOL(__msan_get_context_state); diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h new file mode 100644 index 0000000000000..97d48b45dba58 --- /dev/null +++ b/mm/kmsan/kmsan.h @@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Functions used by the KMSAN runtime. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#ifndef __MM_KMSAN_KMSAN_H +#define __MM_KMSAN_KMSAN_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#define KMSAN_ALLOCA_MAGIC_ORIGIN 0xabcd0100 +#define KMSAN_CHAIN_MAGIC_ORIGIN 0xabcd0200 + +#define KMSAN_POISON_NOCHECK 0x0 +#define KMSAN_POISON_CHECK 0x1 +#define KMSAN_POISON_FREE 0x2 + +#define KMSAN_ORIGIN_SIZE 4 +#define KMSAN_MAX_ORIGIN_DEPTH 7 + +#define KMSAN_STACK_DEPTH 64 + +#define KMSAN_META_SHADOW (false) +#define KMSAN_META_ORIGIN (true) + +extern bool kmsan_enabled; +extern int panic_on_kmsan; + +/* + * KMSAN performs a lot of consistency checks that are currently enabled by + * default. 
BUG_ON is normally discouraged in the kernel, unless used for + * debugging, but KMSAN itself is a debugging tool, so it makes little sense to + * recover if something goes wrong. + */ +#define KMSAN_WARN_ON(cond) \ + ({ \ + const bool __cond = WARN_ON(cond); \ + if (unlikely(__cond)) { \ + WRITE_ONCE(kmsan_enabled, false); \ + if (panic_on_kmsan) { \ + /* Can't call panic() here because */ \ + /* of uaccess checks. */ \ + BUG(); \ + } \ + } \ + __cond; \ + }) + +/* + * A pair of metadata pointers to be returned by the instrumentation functions. + */ +struct shadow_origin_ptr { + void *shadow, *origin; +}; + +struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *addr, u64 size, + bool store); +void *kmsan_get_metadata(void *addr, bool is_origin); + +enum kmsan_bug_reason { + REASON_ANY, + REASON_COPY_TO_USER, + REASON_SUBMIT_URB, +}; + +void kmsan_print_origin(depot_stack_handle_t origin); + +/** + * kmsan_report() - Report a use of uninitialized value. + * @origin: Stack ID of the uninitialized value. + * @address: Address at which the memory access happens. + * @size: Memory access size. + * @off_first: Offset (from @address) of the first byte to be reported. + * @off_last: Offset (from @address) of the last byte to be reported. + * @user_addr: When non-NULL, denotes the userspace address to which the kernel + * is leaking data. + * @reason: Error type from enum kmsan_bug_reason. + * + * kmsan_report() prints an error message for a consecutive group of bytes + * sharing the same origin. If an uninitialized value is used in a comparison, + * this function is called once without specifying the addresses. When checking + * a memory range, KMSAN may call kmsan_report() multiple times with the same + * @address, @size, @user_addr and @reason, but different @off_first and + * @off_last corresponding to different @origin values. 
+ */ +void kmsan_report(depot_stack_handle_t origin, void *address, int size, + int off_first, int off_last, const void *user_addr, + enum kmsan_bug_reason reason); + +DECLARE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); + +static __always_inline struct kmsan_ctx *kmsan_get_context(void) +{ + return in_task() ? &current->kmsan_ctx : raw_cpu_ptr(&kmsan_percpu_ctx); +} + +/* + * When a compiler hook or KMSAN runtime function is invoked, it may make a + * call to instrumented code and eventually call itself recursively. To avoid + * that, we guard the runtime entry regions with + * kmsan_enter_runtime()/kmsan_leave_runtime() and exit the hook if + * kmsan_in_runtime() is true. + * + * Non-runtime code may occasionally get executed in nested IRQs from the + * runtime code (e.g. when called via smp_call_function_single()). Because some + * KMSAN routines may take locks (e.g. for memory allocation), we conservatively + * bail out instead of calling them. To minimize the effect of this (potentially + * missing initialization events) kmsan_in_runtime() is not checked in + * non-blocking runtime functions. + */ +static __always_inline bool kmsan_in_runtime(void) +{ + if ((hardirq_count() >> HARDIRQ_SHIFT) > 1) + return true; + return kmsan_get_context()->kmsan_in_runtime; +} + +static __always_inline void kmsan_enter_runtime(void) +{ + struct kmsan_ctx *ctx; + + ctx = kmsan_get_context(); + KMSAN_WARN_ON(ctx->kmsan_in_runtime++); +} + +static __always_inline void kmsan_leave_runtime(void) +{ + struct kmsan_ctx *ctx = kmsan_get_context(); + + KMSAN_WARN_ON(--ctx->kmsan_in_runtime); +} + +depot_stack_handle_t kmsan_save_stack(void); +depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, + unsigned int extra_bits); + +/* + * Pack and unpack the origin chain depth and UAF flag to/from the extra bits + * provided by the stack depot. + * The UAF flag is stored in the lowest bit, followed by the depth in the upper + * bits. 
+ * set_dsh_extra_bits() is responsible for clamping the value. + */ +static __always_inline unsigned int kmsan_extra_bits(unsigned int depth, + bool uaf) +{ + return (depth << 1) | uaf; +} + +static __always_inline bool kmsan_uaf_from_eb(unsigned int extra_bits) +{ + return extra_bits & 1; +} + +static __always_inline unsigned int kmsan_depth_from_eb(unsigned int extra_bits) +{ + return extra_bits >> 1; +} + +/* + * kmsan_internal_ functions are supposed to be very simple and not require the + * kmsan_in_runtime() checks. + */ +void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n); +void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags, + unsigned int poison_flags); +void kmsan_internal_unpoison_memory(void *address, size_t size, bool checked); +void kmsan_internal_set_shadow_origin(void *address, size_t size, int b, + u32 origin, bool checked); +depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id); + +bool kmsan_metadata_is_contiguous(void *addr, size_t size); +void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr, + int reason); + +struct page *kmsan_vmalloc_to_page_or_null(void *vaddr); + +/* + * kmsan_internal_is_module_addr() and kmsan_internal_is_vmalloc_addr() are + * non-instrumented versions of is_module_address() and is_vmalloc_addr() that + * are safe to call from KMSAN runtime without recursion. + */ +static inline bool kmsan_internal_is_module_addr(void *vaddr) +{ + return ((u64)vaddr >= MODULES_VADDR) && ((u64)vaddr < MODULES_END); +} + +static inline bool kmsan_internal_is_vmalloc_addr(void *addr) +{ + return ((u64)addr >= VMALLOC_START) && ((u64)addr < VMALLOC_END); +} + +#endif /* __MM_KMSAN_KMSAN_H */ diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c new file mode 100644 index 0000000000000..02736ec757f2c --- /dev/null +++ b/mm/kmsan/report.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN error reporting routines. 
+ * + * Copyright (C) 2019-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include +#include +#include +#include +#include + +#include "kmsan.h" + +static DEFINE_RAW_SPINLOCK(kmsan_report_lock); +#define DESCR_SIZE 128 +/* Protected by kmsan_report_lock */ +static char report_local_descr[DESCR_SIZE]; +int panic_on_kmsan __read_mostly; + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "kmsan." +module_param_named(panic, panic_on_kmsan, int, 0); + +/* + * Skip internal KMSAN frames. + */ +static int get_stack_skipnr(const unsigned long stack_entries[], + int num_entries) +{ + int len, skip; + char buf[64]; + + for (skip = 0; skip < num_entries; ++skip) { + len = scnprintf(buf, sizeof(buf), "%ps", + (void *)stack_entries[skip]); + + /* Never show __msan_* or kmsan_* functions. */ + if ((strnstr(buf, "__msan_", len) == buf) || + (strnstr(buf, "kmsan_", len) == buf)) + continue; + + /* + * No match for runtime functions -- @skip entries to skip to + * get to first frame of interest. + */ + break; + } + + return skip; +} + +/* + * Currently the descriptions of locals generated by Clang look as follows: + * ----local_name@function_name + * We want to print only the name of the local, as other information in that + * description can be confusing. + * The meaningful part of the description is copied to a global buffer to avoid + * allocating memory. 
+ */ +static char *pretty_descr(char *descr) +{ + int pos = 0, len = strlen(descr); + + for (int i = 0; i < len; i++) { + if (descr[i] == '@') + break; + if (descr[i] == '-') + continue; + report_local_descr[pos] = descr[i]; + if (pos + 1 == DESCR_SIZE) + break; + pos++; + } + report_local_descr[pos] = 0; + return report_local_descr; +} + +void kmsan_print_origin(depot_stack_handle_t origin) +{ + unsigned long *entries = NULL, *chained_entries = NULL; + unsigned int nr_entries, chained_nr_entries, skipnr; + void *pc1 = NULL, *pc2 = NULL; + depot_stack_handle_t head; + unsigned long magic; + char *descr = NULL; + unsigned int depth; + + if (!origin) + return; + + while (true) { + nr_entries = stack_depot_fetch(origin, &entries); + depth = kmsan_depth_from_eb(stack_depot_get_extra_bits(origin)); + magic = nr_entries ? entries[0] : 0; + if ((nr_entries == 4) && (magic == KMSAN_ALLOCA_MAGIC_ORIGIN)) { + descr = (char *)entries[1]; + pc1 = (void *)entries[2]; + pc2 = (void *)entries[3]; + pr_err("Local variable %s created at:\n", + pretty_descr(descr)); + if (pc1) + pr_err(" %pSb\n", pc1); + if (pc2) + pr_err(" %pSb\n", pc2); + break; + } + if ((nr_entries == 3) && (magic == KMSAN_CHAIN_MAGIC_ORIGIN)) { + /* + * Origin chains deeper than KMSAN_MAX_ORIGIN_DEPTH are + * not stored, so the output may be incomplete. 
+ */ + if (depth == KMSAN_MAX_ORIGIN_DEPTH) + pr_err("\n\n"); + head = entries[1]; + origin = entries[2]; + pr_err("Uninit was stored to memory at:\n"); + chained_nr_entries = + stack_depot_fetch(head, &chained_entries); + kmsan_internal_unpoison_memory( + chained_entries, + chained_nr_entries * sizeof(*chained_entries), + /*checked*/ false); + skipnr = get_stack_skipnr(chained_entries, + chained_nr_entries); + stack_trace_print(chained_entries + skipnr, + chained_nr_entries - skipnr, 0); + pr_err("\n"); + continue; + } + pr_err("Uninit was created at:\n"); + if (nr_entries) { + skipnr = get_stack_skipnr(entries, nr_entries); + stack_trace_print(entries + skipnr, nr_entries - skipnr, + 0); + } else { + pr_err("(stack is not available)\n"); + } + break; + } +} + +void kmsan_report(depot_stack_handle_t origin, void *address, int size, + int off_first, int off_last, const void *user_addr, + enum kmsan_bug_reason reason) +{ + unsigned long stack_entries[KMSAN_STACK_DEPTH]; + int num_stack_entries, skipnr; + char *bug_type = NULL; + unsigned long ua_flags; + bool is_uaf; + + if (!kmsan_enabled) + return; + if (!current->kmsan_ctx.allow_reporting) + return; + if (!origin) + return; + + current->kmsan_ctx.allow_reporting = false; + ua_flags = user_access_save(); + raw_spin_lock(&kmsan_report_lock); + pr_err("=====================================================\n"); + is_uaf = kmsan_uaf_from_eb(stack_depot_get_extra_bits(origin)); + switch (reason) { + case REASON_ANY: + bug_type = is_uaf ? "use-after-free" : "uninit-value"; + break; + case REASON_COPY_TO_USER: + bug_type = is_uaf ? "kernel-infoleak-after-free" : + "kernel-infoleak"; + break; + case REASON_SUBMIT_URB: + bug_type = is_uaf ? 
"kernel-usb-infoleak-after-free" : + "kernel-usb-infoleak"; + break; + } + + num_stack_entries = + stack_trace_save(stack_entries, KMSAN_STACK_DEPTH, 1); + skipnr = get_stack_skipnr(stack_entries, num_stack_entries); + + pr_err("BUG: KMSAN: %s in %pSb\n", bug_type, + (void *)stack_entries[skipnr]); + stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, + 0); + pr_err("\n"); + + kmsan_print_origin(origin); + + if (size) { + pr_err("\n"); + if (off_first == off_last) + pr_err("Byte %d of %d is uninitialized\n", off_first, + size); + else + pr_err("Bytes %d-%d of %d are uninitialized\n", + off_first, off_last, size); + } + if (address) + pr_err("Memory access of size %d starts at %px\n", size, + address); + if (user_addr && reason == REASON_COPY_TO_USER) + pr_err("Data copied to user address %px\n", user_addr); + pr_err("\n"); + dump_stack_print_info(KERN_ERR); + pr_err("=====================================================\n"); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + raw_spin_unlock(&kmsan_report_lock); + if (panic_on_kmsan) + panic("kmsan.panic set ...\n"); + user_access_restore(ua_flags); + current->kmsan_ctx.allow_reporting = true; +} diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c new file mode 100644 index 0000000000000..acc5279acc3be --- /dev/null +++ b/mm/kmsan/shadow.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN shadow implementation. 
+ * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "kmsan.h" + +#define shadow_page_for(page) ((page)->kmsan_shadow) + +#define origin_page_for(page) ((page)->kmsan_origin) + +static void *shadow_ptr_for(struct page *page) +{ + return page_address(shadow_page_for(page)); +} + +static void *origin_ptr_for(struct page *page) +{ + return page_address(origin_page_for(page)); +} + +static bool page_has_metadata(struct page *page) +{ + return shadow_page_for(page) && origin_page_for(page); +} + +static void set_no_shadow_origin_page(struct page *page) +{ + shadow_page_for(page) = NULL; + origin_page_for(page) = NULL; +} + +/* + * Dummy load and store pages to be used when the real metadata is unavailable. + * There are separate pages for loads and stores, so that every load returns a + * zero, and every store doesn't affect other loads. + */ +static char dummy_load_page[PAGE_SIZE] __aligned(PAGE_SIZE); +static char dummy_store_page[PAGE_SIZE] __aligned(PAGE_SIZE); + +static unsigned long vmalloc_meta(void *addr, bool is_origin) +{ + unsigned long addr64 = (unsigned long)addr, off; + + KMSAN_WARN_ON(is_origin && !IS_ALIGNED(addr64, KMSAN_ORIGIN_SIZE)); + if (kmsan_internal_is_vmalloc_addr(addr)) { + off = addr64 - VMALLOC_START; + return off + (is_origin ? KMSAN_VMALLOC_ORIGIN_START : + KMSAN_VMALLOC_SHADOW_START); + } + if (kmsan_internal_is_module_addr(addr)) { + off = addr64 - MODULES_VADDR; + return off + (is_origin ? 
KMSAN_MODULES_ORIGIN_START : + KMSAN_MODULES_SHADOW_START); + } + return 0; +} + +static struct page *virt_to_page_or_null(void *vaddr) +{ + if (kmsan_virt_addr_valid(vaddr)) + return virt_to_page(vaddr); + else + return NULL; +} + +struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *address, u64 size, + bool store) +{ + struct shadow_origin_ptr ret; + void *shadow; + + /* + * Even if we redirect this memory access to the dummy page, it will + * go out of bounds. + */ + KMSAN_WARN_ON(size > PAGE_SIZE); + + if (!kmsan_enabled) + goto return_dummy; + + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(address, size)); + shadow = kmsan_get_metadata(address, KMSAN_META_SHADOW); + if (!shadow) + goto return_dummy; + + ret.shadow = shadow; + ret.origin = kmsan_get_metadata(address, KMSAN_META_ORIGIN); + return ret; + +return_dummy: + if (store) { + /* Ignore this store. */ + ret.shadow = dummy_store_page; + ret.origin = dummy_store_page; + } else { + /* This load will return zero. */ + ret.shadow = dummy_load_page; + ret.origin = dummy_load_page; + } + return ret; +} + +/* + * Obtain the shadow or origin pointer for the given address, or NULL if there's + * none. The caller must check the return value for being non-NULL if needed. + * The return value of this function should not depend on whether we're in the + * runtime or not. + */ +void *kmsan_get_metadata(void *address, bool is_origin) +{ + u64 addr = (u64)address, pad, off; + struct page *page; + + if (is_origin && !IS_ALIGNED(addr, KMSAN_ORIGIN_SIZE)) { + pad = addr % KMSAN_ORIGIN_SIZE; + addr -= pad; + } + address = (void *)addr; + if (kmsan_internal_is_vmalloc_addr(address) || + kmsan_internal_is_module_addr(address)) + return (void *)vmalloc_meta(address, is_origin); + + page = virt_to_page_or_null(address); + if (!page) + return NULL; + if (!page_has_metadata(page)) + return NULL; + off = addr % PAGE_SIZE; + + return (is_origin ? 
origin_ptr_for(page) : shadow_ptr_for(page)) + off; +} diff --git a/scripts/Makefile.kmsan b/scripts/Makefile.kmsan new file mode 100644 index 0000000000000..b5b0aa61322ec --- /dev/null +++ b/scripts/Makefile.kmsan @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +kmsan-cflags := -fsanitize=kernel-memory + +ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL +kmsan-cflags += -fsanitize-memory-param-retval +endif + +export CFLAGS_KMSAN := $(kmsan-cflags) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 3fb6a99e78c47..ac32429e93b73 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -157,6 +157,15 @@ _c_flags += $(if $(patsubst n%,, \ endif endif +ifeq ($(CONFIG_KMSAN),y) +_c_flags += $(if $(patsubst n%,, \ + $(KMSAN_SANITIZE_$(basetarget).o)$(KMSAN_SANITIZE)y), \ + $(CFLAGS_KMSAN)) +_c_flags += $(if $(patsubst n%,, \ + $(KMSAN_ENABLE_CHECKS_$(basetarget).o)$(KMSAN_ENABLE_CHECKS)y), \ + , -mllvm -msan-disable-checks=1) +endif + ifeq ($(CONFIG_UBSAN),y) _c_flags += $(if $(patsubst n%,, \ $(UBSAN_SANITIZE_$(basetarget).o)$(UBSAN_SANITIZE)$(CONFIG_UBSAN_SANITIZE_ALL)), \ -- GitLab From 79dbd006a6d6f51777ba4948046561b6d9270504 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:46 +0200 Subject: [PATCH 1062/2223] kmsan: disable instrumentation of unsupported common kernel code EFI stub cannot be linked with KMSAN runtime, so we disable instrumentation for it. Instrumenting kcov, stackdepot or lockdep leads to infinite recursion caused by instrumentation hooks calling instrumented code again. 
Link: https://lkml.kernel.org/r/20220915150417.722975-13-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/firmware/efi/libstub/Makefile | 1 + kernel/Makefile | 1 + kernel/locking/Makefile | 3 ++- lib/Makefile | 3 +++ 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index d0537573501e9..81432d0c904b1 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -46,6 +46,7 @@ GCOV_PROFILE := n # Sanitizer runtimes are unavailable and cannot be linked here. 
KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n UBSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y diff --git a/kernel/Makefile b/kernel/Makefile index 318789c728d32..d754e0be1176d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -38,6 +38,7 @@ KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n KCSAN_SANITIZE_kcov.o := n UBSAN_SANITIZE_kcov.o := n +KMSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector # Don't instrument error handlers diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index d51cabf28f382..ea925731fa40f 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -5,8 +5,9 @@ KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o -# Avoid recursion lockdep -> KCSAN -> ... -> lockdep. +# Avoid recursion lockdep -> sanitizer -> ... -> lockdep. KCSAN_SANITIZE_lockdep.o := n +KMSAN_SANITIZE_lockdep.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) diff --git a/lib/Makefile b/lib/Makefile index d7d94102991b3..42e185cdecd03 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -270,6 +270,9 @@ obj-$(CONFIG_POLYNOMIAL) += polynomial.o CFLAGS_stackdepot.o += -fno-builtin obj-$(CONFIG_STACKDEPOT) += stackdepot.o KASAN_SANITIZE_stackdepot.o := n +# In particular, instrumenting stackdepot.c with KMSAN will result in infinite +# recursion. +KMSAN_SANITIZE_stackdepot.o := n KCOV_INSTRUMENT_stackdepot.o := n obj-$(CONFIG_REF_TRACKER) += ref_tracker.o -- GitLab From d596b04f5967c75c196eb582fefba49488c57289 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:47 +0200 Subject: [PATCH 1063/2223] MAINTAINERS: add entry for KMSAN Add entry for KMSAN maintainers/reviewers. 
Link: https://lkml.kernel.org/r/20220915150417.722975-14-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- MAINTAINERS | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6f1033f3c1eda..3c7dfe9bb7129 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11371,6 +11371,19 @@ F: kernel/kmod.c F: lib/test_kmod.c F: tools/testing/selftests/kmod/ +KMSAN +M: Alexander Potapenko +R: Marco Elver +R: Dmitry Vyukov +L: kasan-dev@googlegroups.com +S: Maintained +F: Documentation/dev-tools/kmsan.rst +F: arch/*/include/asm/kmsan.h +F: include/linux/kmsan*.h +F: lib/Kconfig.kmsan +F: mm/kmsan/ +F: scripts/Makefile.kmsan + KPROBES M: Naveen N. 
Rao M: Anil S Keshavamurthy -- GitLab From b073d7f8aee4ebf05d10e3380df377b73120cf16 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:48 +0200 Subject: [PATCH 1064/2223] mm: kmsan: maintain KMSAN metadata for page operations Insert KMSAN hooks that make the necessary bookkeeping changes: - poison page shadow and origins in alloc_pages()/free_page(); - clear page shadow and origins in clear_page(), copy_user_highpage(); - copy page metadata in copy_highpage(), wp_page_copy(); - handle vmap()/vunmap()/iounmap(); Link: https://lkml.kernel.org/r/20220915150417.722975-15-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/include/asm/page_64.h | 7 ++ arch/x86/mm/ioremap.c | 3 + include/linux/highmem.h | 3 + include/linux/kmsan.h | 145 +++++++++++++++++++++++++++++++++ mm/internal.h | 6 ++ mm/kmsan/hooks.c | 86 +++++++++++++++++++ mm/kmsan/shadow.c | 113 +++++++++++++++++++++++++ mm/memory.c | 2 + mm/page_alloc.c | 11 +++ mm/vmalloc.c | 20 ++++- 10 files changed, 394 insertions(+), 2 deletions(-) create mode 100644 include/linux/kmsan.h diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index baa70451b8df5..198e03e59ca19 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -8,6 +8,8 @@ #include #include +#include + /* duplicated to the one in bootmem.h */ extern unsigned long max_pfn; extern unsigned long phys_base; @@ -47,6 +49,11 @@ void clear_page_erms(void *page); static inline void clear_page(void *page) { + /* + * Clean up KMSAN metadata for the page being cleared. The assembly call + * below clobbers @page, so we perform unpoisoning before it. 
+ */ + kmsan_unpoison_memory(page, PAGE_SIZE); alternative_call_2(clear_page_orig, clear_page_rep, X86_FEATURE_REP_GOOD, clear_page_erms, X86_FEATURE_ERMS, diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 1ad0228f8ceb9..78c5bc654cff5 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -479,6 +480,8 @@ void iounmap(volatile void __iomem *addr) return; } + kmsan_iounmap_page_range((unsigned long)addr, + (unsigned long)addr + get_vm_area_size(p)); memtype_free(p->phys_addr, p->phys_addr + get_vm_area_size(p)); /* Finally remove it */ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 25679035ca283..e9912da5441b4 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -311,6 +312,7 @@ static inline void copy_user_highpage(struct page *to, struct page *from, vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); + kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); } @@ -326,6 +328,7 @@ static inline void copy_highpage(struct page *to, struct page *from) vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_page(vto, vfrom); + kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); } diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h new file mode 100644 index 0000000000000..b36bf3db835ee --- /dev/null +++ b/include/linux/kmsan.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KMSAN API for subsystems. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ +#ifndef _LINUX_KMSAN_H +#define _LINUX_KMSAN_H + +#include +#include +#include + +struct page; + +#ifdef CONFIG_KMSAN + +/** + * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. 
+ * @page: struct page pointer returned by alloc_pages(). + * @order: order of allocated struct page. + * @flags: GFP flags used by alloc_pages(). + * + * KMSAN marks 1<<@order pages starting at @page as uninitialized, unless + * @flags contains __GFP_ZERO. + */ +void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags); + +/** + * kmsan_free_page() - Notify KMSAN about a free_pages() call. + * @page: struct page pointer passed to free_pages(). + * @order: order of deallocated struct page. + * + * KMSAN marks freed memory as uninitialized. + */ +void kmsan_free_page(struct page *page, unsigned int order); + +/** + * kmsan_copy_page_meta() - Copy KMSAN metadata between two pages. + * @dst: destination page. + * @src: source page. + * + * KMSAN copies the contents of metadata pages for @src into the metadata pages + * for @dst. If @dst has no associated metadata pages, nothing happens. + * If @src has no associated metadata pages, @dst metadata pages are unpoisoned. + */ +void kmsan_copy_page_meta(struct page *dst, struct page *src); + +/** + * kmsan_vmap_pages_range_noflush() - Notify KMSAN about a vmap. + * @start: start of vmapped range. + * @end: end of vmapped range. + * @prot: page protection flags used for vmap. + * @pages: array of pages. + * @page_shift: page_shift passed to vmap_range_noflush(). + * + * KMSAN maps shadow and origin pages of @pages into contiguous ranges in + * vmalloc metadata address range. + */ +void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift); + +/** + * kmsan_vunmap_range_noflush() - Notify KMSAN about a vunmap. + * @start: start of vunmapped range. + * @end: end of vunmapped range. + * + * KMSAN unmaps the contiguous metadata ranges created by + * kmsan_vmap_pages_range_noflush(). 
+ */ +void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end); + +/** + * kmsan_ioremap_page_range() - Notify KMSAN about a ioremap_page_range() call. + * @addr: range start. + * @end: range end. + * @phys_addr: physical range start. + * @prot: page protection flags used for ioremap_page_range(). + * @page_shift: page_shift argument passed to vmap_range_noflush(). + * + * KMSAN creates new metadata pages for the physical pages mapped into the + * virtual memory. + */ +void kmsan_ioremap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift); + +/** + * kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call. + * @start: range start. + * @end: range end. + * + * KMSAN unmaps the metadata pages for the given range and, unlike for + * vunmap_page_range(), also deallocates them. + */ +void kmsan_iounmap_page_range(unsigned long start, unsigned long end); + +#else + +static inline int kmsan_alloc_page(struct page *page, unsigned int order, + gfp_t flags) +{ + return 0; +} + +static inline void kmsan_free_page(struct page *page, unsigned int order) +{ +} + +static inline void kmsan_copy_page_meta(struct page *dst, struct page *src) +{ +} + +static inline void kmsan_vmap_pages_range_noflush(unsigned long start, + unsigned long end, + pgprot_t prot, + struct page **pages, + unsigned int page_shift) +{ +} + +static inline void kmsan_vunmap_range_noflush(unsigned long start, + unsigned long end) +{ +} + +static inline void kmsan_ioremap_page_range(unsigned long start, + unsigned long end, + phys_addr_t phys_addr, + pgprot_t prot, + unsigned int page_shift) +{ +} + +static inline void kmsan_iounmap_page_range(unsigned long start, + unsigned long end) +{ +} + +#endif + +#endif /* _LINUX_KMSAN_H */ diff --git a/mm/internal.h b/mm/internal.h index e497ab14c9842..fea3cba154844 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -818,8 +818,14 @@ int vmap_pages_range_noflush(unsigned 
long addr, unsigned long end, } #endif +int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift); + void vunmap_range_noflush(unsigned long start, unsigned long end); +void __vunmap_range_noflush(unsigned long start, unsigned long end); + int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 4ac62fa67a02a..040111bb9f6a3 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -26,6 +27,91 @@ * skipping effects of functions like memset() inside instrumented code. */ +static unsigned long vmalloc_shadow(unsigned long addr) +{ + return (unsigned long)kmsan_get_metadata((void *)addr, + KMSAN_META_SHADOW); +} + +static unsigned long vmalloc_origin(unsigned long addr) +{ + return (unsigned long)kmsan_get_metadata((void *)addr, + KMSAN_META_ORIGIN); +} + +void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end) +{ + __vunmap_range_noflush(vmalloc_shadow(start), vmalloc_shadow(end)); + __vunmap_range_noflush(vmalloc_origin(start), vmalloc_origin(end)); + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); +} + +/* + * This function creates new shadow/origin pages for the physical pages mapped + * into the virtual memory. If those physical pages already had shadow/origin, + * those are ignored. 
+ */ +void kmsan_ioremap_page_range(unsigned long start, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift) +{ + gfp_t gfp_mask = GFP_KERNEL | __GFP_ZERO; + struct page *shadow, *origin; + unsigned long off = 0; + int nr; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + nr = (end - start) / PAGE_SIZE; + kmsan_enter_runtime(); + for (int i = 0; i < nr; i++, off += PAGE_SIZE) { + shadow = alloc_pages(gfp_mask, 1); + origin = alloc_pages(gfp_mask, 1); + __vmap_pages_range_noflush( + vmalloc_shadow(start + off), + vmalloc_shadow(start + off + PAGE_SIZE), prot, &shadow, + PAGE_SHIFT); + __vmap_pages_range_noflush( + vmalloc_origin(start + off), + vmalloc_origin(start + off + PAGE_SIZE), prot, &origin, + PAGE_SHIFT); + } + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); + kmsan_leave_runtime(); +} + +void kmsan_iounmap_page_range(unsigned long start, unsigned long end) +{ + unsigned long v_shadow, v_origin; + struct page *shadow, *origin; + int nr; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + nr = (end - start) / PAGE_SIZE; + kmsan_enter_runtime(); + v_shadow = (unsigned long)vmalloc_shadow(start); + v_origin = (unsigned long)vmalloc_origin(start); + for (int i = 0; i < nr; + i++, v_shadow += PAGE_SIZE, v_origin += PAGE_SIZE) { + shadow = kmsan_vmalloc_to_page_or_null((void *)v_shadow); + origin = kmsan_vmalloc_to_page_or_null((void *)v_origin); + __vunmap_range_noflush(v_shadow, vmalloc_shadow(end)); + __vunmap_range_noflush(v_origin, vmalloc_origin(end)); + if (shadow) + __free_pages(shadow, 1); + if (origin) + __free_pages(origin, 1); + } + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); + kmsan_leave_runtime(); +} + /* Functions from kmsan-checks.h follow. 
*/ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) { diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index acc5279acc3be..8c81a059beea6 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -145,3 +145,116 @@ void *kmsan_get_metadata(void *address, bool is_origin) return (is_origin ? origin_ptr_for(page) : shadow_ptr_for(page)) + off; } + +void kmsan_copy_page_meta(struct page *dst, struct page *src) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + if (!dst || !page_has_metadata(dst)) + return; + if (!src || !page_has_metadata(src)) { + kmsan_internal_unpoison_memory(page_address(dst), PAGE_SIZE, + /*checked*/ false); + return; + } + + kmsan_enter_runtime(); + __memcpy(shadow_ptr_for(dst), shadow_ptr_for(src), PAGE_SIZE); + __memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE); + kmsan_leave_runtime(); +} + +void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) +{ + bool initialized = (flags & __GFP_ZERO) || !kmsan_enabled; + struct page *shadow, *origin; + depot_stack_handle_t handle; + int pages = 1 << order; + + if (!page) + return; + + shadow = shadow_page_for(page); + origin = origin_page_for(page); + + if (initialized) { + __memset(page_address(shadow), 0, PAGE_SIZE * pages); + __memset(page_address(origin), 0, PAGE_SIZE * pages); + return; + } + + /* Zero pages allocated by the runtime should also be initialized. */ + if (kmsan_in_runtime()) + return; + + __memset(page_address(shadow), -1, PAGE_SIZE * pages); + kmsan_enter_runtime(); + handle = kmsan_save_stack_with_flags(flags, /*extra_bits*/ 0); + kmsan_leave_runtime(); + /* + * Addresses are page-aligned, pages are contiguous, so it's ok + * to just fill the origin pages with @handle. 
+ */ + for (int i = 0; i < PAGE_SIZE * pages / sizeof(handle); i++) + ((depot_stack_handle_t *)page_address(origin))[i] = handle; +} + +void kmsan_free_page(struct page *page, unsigned int order) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + kmsan_internal_poison_memory(page_address(page), + PAGE_SIZE << compound_order(page), + GFP_KERNEL, + KMSAN_POISON_CHECK | KMSAN_POISON_FREE); + kmsan_leave_runtime(); +} + +void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) +{ + unsigned long shadow_start, origin_start, shadow_end, origin_end; + struct page **s_pages, **o_pages; + int nr, mapped; + + if (!kmsan_enabled) + return; + + shadow_start = vmalloc_meta((void *)start, KMSAN_META_SHADOW); + shadow_end = vmalloc_meta((void *)end, KMSAN_META_SHADOW); + if (!shadow_start) + return; + + nr = (end - start) / PAGE_SIZE; + s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL); + o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL); + if (!s_pages || !o_pages) + goto ret; + for (int i = 0; i < nr; i++) { + s_pages[i] = shadow_page_for(pages[i]); + o_pages[i] = origin_page_for(pages[i]); + } + prot = __pgprot(pgprot_val(prot) | _PAGE_NX); + prot = PAGE_KERNEL; + + origin_start = vmalloc_meta((void *)start, KMSAN_META_ORIGIN); + origin_end = vmalloc_meta((void *)end, KMSAN_META_ORIGIN); + kmsan_enter_runtime(); + mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot, + s_pages, page_shift); + KMSAN_WARN_ON(mapped); + mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot, + o_pages, page_shift); + KMSAN_WARN_ON(mapped); + kmsan_leave_runtime(); + flush_tlb_kernel_range(shadow_start, shadow_end); + flush_tlb_kernel_range(origin_start, origin_end); + flush_cache_vmap(shadow_start, shadow_end); + flush_cache_vmap(origin_start, origin_end); + +ret: + kfree(s_pages); + kfree(o_pages); +} diff --git a/mm/memory.c b/mm/memory.c index 
b3ed17219d772..118e5f023597c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -3136,6 +3137,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) delayacct_wpcopy_end(); return 0; } + kmsan_copy_page_meta(new_page, old_page); } if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e8ea824e7653..1db1ac74ef142 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1400,6 +1401,7 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(PageTail(page), page); trace_mm_page_free(page, order); + kmsan_free_page(page, order); if (unlikely(PageHWPoison(page)) && !order) { /* @@ -3808,6 +3810,14 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, /* * Allocate a page from the given zone. Use pcplists for order-0 allocations. */ + +/* + * Do not instrument rmqueue() with KMSAN. This function may call + * __msan_poison_alloca() through a call to set_pfnblock_flags_mask(). + * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it + * may call rmqueue() again, which will result in a deadlock. 
+ */ +__no_sanitize_memory static inline struct page *rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, @@ -5560,6 +5570,7 @@ out: } trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); + kmsan_alloc_page(page, order, alloc_gfp); return page; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a991b909866f2..ccaa461998f3c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -320,6 +320,9 @@ int ioremap_page_range(unsigned long addr, unsigned long end, err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), ioremap_max_page_shift); flush_cache_vmap(addr, end); + if (!err) + kmsan_ioremap_page_range(addr, end, phys_addr, prot, + ioremap_max_page_shift); return err; } @@ -416,7 +419,7 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, * * This is an internal function only. Do not use outside mm/. */ -void vunmap_range_noflush(unsigned long start, unsigned long end) +void __vunmap_range_noflush(unsigned long start, unsigned long end) { unsigned long next; pgd_t *pgd; @@ -438,6 +441,12 @@ void vunmap_range_noflush(unsigned long start, unsigned long end) arch_sync_kernel_mappings(start, end); } +void vunmap_range_noflush(unsigned long start, unsigned long end) +{ + kmsan_vunmap_range_noflush(start, end); + __vunmap_range_noflush(start, end); +} + /** * vunmap_range - unmap kernel virtual addresses * @addr: start of the VM area to unmap @@ -575,7 +584,7 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, * * This is an internal function only. Do not use outside mm/. 
*/ -int vmap_pages_range_noflush(unsigned long addr, unsigned long end, +int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { unsigned int i, nr = (end - addr) >> PAGE_SHIFT; @@ -601,6 +610,13 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, return 0; } +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); +} + /** * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map -- GitLab From 68ef169a1dd20df5cfa5a161b7304ad9fdd14c36 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:49 +0200 Subject: [PATCH 1065/2223] mm: kmsan: call KMSAN hooks from SLUB code In order to report uninitialized memory coming from heap allocations KMSAN has to poison them unless they're created with __GFP_ZERO. It's handy that we need KMSAN hooks in the places where init_on_alloc/init_on_free initialization is performed. In addition, we apply __no_kmsan_checks to get_freepointer_safe() to suppress reports when accessing freelist pointers that reside in freed objects. Link: https://lkml.kernel.org/r/20220915150417.722975-16-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 57 ++++++++++++++++++++++++++++++++ mm/kmsan/hooks.c | 76 +++++++++++++++++++++++++++++++++++++++++++ mm/slab.h | 1 + mm/slub.c | 17 ++++++++++ 4 files changed, 151 insertions(+) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index b36bf3db835ee..5c4e0079054e6 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -14,6 +14,7 @@ #include struct page; +struct kmem_cache; #ifdef CONFIG_KMSAN @@ -48,6 +49,44 @@ void kmsan_free_page(struct page *page, unsigned int order); */ void kmsan_copy_page_meta(struct page *dst, struct page *src); +/** + * kmsan_slab_alloc() - Notify KMSAN about a slab allocation. + * @s: slab cache the object belongs to. + * @object: object pointer. + * @flags: GFP flags passed to the allocator. + * + * Depending on cache flags and GFP flags, KMSAN sets up the metadata of the + * newly created object, marking it as initialized or uninitialized. + */ +void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); + +/** + * kmsan_slab_free() - Notify KMSAN about a slab deallocation. + * @s: slab cache the object belongs to. + * @object: object pointer. + * + * KMSAN marks the freed object as uninitialized. + */ +void kmsan_slab_free(struct kmem_cache *s, void *object); + +/** + * kmsan_kmalloc_large() - Notify KMSAN about a large slab allocation. + * @ptr: object pointer. + * @size: object size. + * @flags: GFP flags passed to the allocator. + * + * Similar to kmsan_slab_alloc(), but for large allocations. + */ +void kmsan_kmalloc_large(const void *ptr, size_t size, gfp_t flags); + +/** + * kmsan_kfree_large() - Notify KMSAN about a large slab deallocation. + * @ptr: object pointer. + * + * Similar to kmsan_slab_free(), but for large allocations. 
+ */ +void kmsan_kfree_large(const void *ptr); + /** * kmsan_map_kernel_range_noflush() - Notify KMSAN about a vmap. * @start: start of vmapped range. @@ -114,6 +153,24 @@ static inline void kmsan_copy_page_meta(struct page *dst, struct page *src) { } +static inline void kmsan_slab_alloc(struct kmem_cache *s, void *object, + gfp_t flags) +{ +} + +static inline void kmsan_slab_free(struct kmem_cache *s, void *object) +{ +} + +static inline void kmsan_kmalloc_large(const void *ptr, size_t size, + gfp_t flags) +{ +} + +static inline void kmsan_kfree_large(const void *ptr) +{ +} + static inline void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 040111bb9f6a3..000703c563a4d 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -27,6 +27,82 @@ * skipping effects of functions like memset() inside instrumented code. */ +void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) +{ + if (unlikely(object == NULL)) + return; + if (!kmsan_enabled || kmsan_in_runtime()) + return; + /* + * There's a ctor or this is an RCU cache - do nothing. The memory + * status hasn't changed since last use. + */ + if (s->ctor || (s->flags & SLAB_TYPESAFE_BY_RCU)) + return; + + kmsan_enter_runtime(); + if (flags & __GFP_ZERO) + kmsan_internal_unpoison_memory(object, s->object_size, + KMSAN_POISON_CHECK); + else + kmsan_internal_poison_memory(object, s->object_size, flags, + KMSAN_POISON_CHECK); + kmsan_leave_runtime(); +} + +void kmsan_slab_free(struct kmem_cache *s, void *object) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + /* RCU slabs could be legally used after free within the RCU period */ + if (unlikely(s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))) + return; + /* + * If there's a constructor, freed memory must remain in the same state + * until the next allocation. We cannot save its state to detect + * use-after-free bugs, instead we just keep it unpoisoned. 
+ */ + if (s->ctor) + return; + kmsan_enter_runtime(); + kmsan_internal_poison_memory(object, s->object_size, GFP_KERNEL, + KMSAN_POISON_CHECK | KMSAN_POISON_FREE); + kmsan_leave_runtime(); +} + +void kmsan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) +{ + if (unlikely(ptr == NULL)) + return; + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + if (flags & __GFP_ZERO) + kmsan_internal_unpoison_memory((void *)ptr, size, + /*checked*/ true); + else + kmsan_internal_poison_memory((void *)ptr, size, flags, + KMSAN_POISON_CHECK); + kmsan_leave_runtime(); +} + +void kmsan_kfree_large(const void *ptr) +{ + struct page *page; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + page = virt_to_head_page((void *)ptr); + KMSAN_WARN_ON(ptr != page_address(page)); + kmsan_internal_poison_memory((void *)ptr, + PAGE_SIZE << compound_order(page), + GFP_KERNEL, + KMSAN_POISON_CHECK | KMSAN_POISON_FREE); + kmsan_leave_runtime(); +} + static unsigned long vmalloc_shadow(unsigned long addr) { return (unsigned long)kmsan_get_metadata((void *)addr, diff --git a/mm/slab.h b/mm/slab.h index 4ec82bec15ecd..9d0afd2985df7 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -729,6 +729,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, memset(p[i], 0, s->object_size); kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, flags); + kmsan_slab_alloc(s, p[i], flags); } memcg_slab_post_alloc_hook(s, objcg, flags, size, p); diff --git a/mm/slub.c b/mm/slub.c index 6953c3367bc20..ce8310e131b34 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -359,6 +360,17 @@ static void prefetch_freepointer(const struct kmem_cache *s, void *object) prefetchw(object + s->offset); } +/* + * When running under KMSAN, get_freepointer_safe() may return an uninitialized + * pointer value in the case the current thread loses the race for the next + * memory chunk in 
the freelist. In that case this_cpu_cmpxchg_double() in + * slab_alloc_node() will fail, so the uninitialized value won't be used, but + * KMSAN will still check all arguments of cmpxchg because of imperfect + * handling of inline assembly. + * To work around this problem, we apply __no_kmsan_checks to ensure that + * get_freepointer_safe() returns initialized memory. + */ +__no_kmsan_checks static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { unsigned long freepointer_addr; @@ -1709,6 +1721,7 @@ static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) ptr = kasan_kmalloc_large(ptr, size, flags); /* As ptr might get tagged, call kmemleak hook after KASAN. */ kmemleak_alloc(ptr, size, 1, flags); + kmsan_kmalloc_large(ptr, size, flags); return ptr; } @@ -1716,12 +1729,14 @@ static __always_inline void kfree_hook(void *x) { kmemleak_free(x); kasan_kfree_large(x); + kmsan_kfree_large(x); } static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x, bool init) { kmemleak_free_recursive(x, s->flags); + kmsan_slab_free(s, x); debug_check_no_locks_freed(x, s->object_size); @@ -5941,6 +5956,7 @@ static char *create_unique_id(struct kmem_cache *s) p += sprintf(p, "%07u", s->size); BUG_ON(p > name + ID_STR_LENGTH - 1); + kmsan_unpoison_memory(name, p - name); return name; } @@ -6042,6 +6058,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) al->name = name; al->next = alias_list; alias_list = al; + kmsan_unpoison_memory(al, sizeof(*al)); return 0; } -- GitLab From 50b5e49ca694a60f84a2a12d62b6cb6ec8e3649f Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:50 +0200 Subject: [PATCH 1066/2223] kmsan: handle task creation and exiting Tell KMSAN that a new task is created, so the tool creates a backing metadata structure for that task. 
Link: https://lkml.kernel.org/r/20220915150417.722975-17-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 21 +++++++++++++++++++++ kernel/exit.c | 2 ++ kernel/fork.c | 2 ++ mm/kmsan/core.c | 10 ++++++++++ mm/kmsan/hooks.c | 17 +++++++++++++++++ mm/kmsan/kmsan.h | 2 ++ 6 files changed, 54 insertions(+) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 5c4e0079054e6..354aee6f7b1a2 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -15,9 +15,22 @@ struct page; struct kmem_cache; +struct task_struct; #ifdef CONFIG_KMSAN +/** + * kmsan_task_create() - Initialize KMSAN state for the task. + * @task: task to initialize. + */ +void kmsan_task_create(struct task_struct *task); + +/** + * kmsan_task_exit() - Notify KMSAN that a task has exited. + * @task: task about to finish. + */ +void kmsan_task_exit(struct task_struct *task); + /** * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. * @page: struct page pointer returned by alloc_pages(). 
@@ -139,6 +152,14 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); #else +static inline void kmsan_task_create(struct task_struct *task) +{ +} + +static inline void kmsan_task_exit(struct task_struct *task) +{ +} + static inline int kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) { diff --git a/kernel/exit.c b/kernel/exit.c index 98a33bd7c25c5..1899d73bdfb72 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -742,6 +743,7 @@ void __noreturn do_exit(long code) WARN_ON(tsk->plug); kcov_task_exit(tsk); + kmsan_task_exit(tsk); coredump_task_exit(tsk); ptrace_event(PTRACE_EVENT_EXIT, code); diff --git a/kernel/fork.c b/kernel/fork.c index 3d788f759e5f1..3c2f8601b2b82 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -1023,6 +1024,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->worker_private = NULL; kcov_task_init(tsk); + kmsan_task_create(tsk); kmap_local_fork(tsk); #ifdef CONFIG_FAULT_INJECTION diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 5330138fda5bc..112dce135c7f6 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -37,6 +37,16 @@ bool kmsan_enabled __read_mostly; */ DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); +void kmsan_internal_task_create(struct task_struct *task) +{ + struct kmsan_ctx *ctx = &task->kmsan_ctx; + struct thread_info *info = current_thread_info(); + + __memset(ctx, 0, sizeof(*ctx)); + ctx->allow_reporting = true; + kmsan_internal_unpoison_memory(info, sizeof(*info), false); +} + void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags, unsigned int poison_flags) { diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 000703c563a4d..6f3e64b0b61f8 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -27,6 +27,23 @@ * skipping effects of functions like memset() 
inside instrumented code. */ +void kmsan_task_create(struct task_struct *task) +{ + kmsan_enter_runtime(); + kmsan_internal_task_create(task); + kmsan_leave_runtime(); +} + +void kmsan_task_exit(struct task_struct *task) +{ + struct kmsan_ctx *ctx = &task->kmsan_ctx; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ctx->allow_reporting = false; +} + void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) { if (unlikely(object == NULL)) diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h index 97d48b45dba58..77ee068c04ae9 100644 --- a/mm/kmsan/kmsan.h +++ b/mm/kmsan/kmsan.h @@ -180,6 +180,8 @@ void kmsan_internal_set_shadow_origin(void *address, size_t size, int b, u32 origin, bool checked); depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id); +void kmsan_internal_task_create(struct task_struct *task); + bool kmsan_metadata_is_contiguous(void *addr, size_t size); void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr, int reason); -- GitLab From 3c206509826094e85ead0b056f484db96829248d Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:51 +0200 Subject: [PATCH 1067/2223] init: kmsan: call KMSAN initialization routines kmsan_init_shadow() scans the mappings created at boot time and creates metadata pages for those mappings. When the memblock allocator returns pages to pagealloc, we reserve 2/3 of those pages and use them as metadata for the remaining 1/3. Once KMSAN starts, every page allocated by pagealloc has its associated shadow and origin pages. kmsan_initialize() initializes the bookkeeping for init_task and enables KMSAN. 
Link: https://lkml.kernel.org/r/20220915150417.722975-18-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 36 +++++++ init/main.c | 3 + mm/kmsan/Makefile | 3 +- mm/kmsan/init.c | 235 ++++++++++++++++++++++++++++++++++++++++++ mm/kmsan/kmsan.h | 3 + mm/kmsan/shadow.c | 34 ++++++ mm/page_alloc.c | 4 + 7 files changed, 317 insertions(+), 1 deletion(-) create mode 100644 mm/kmsan/init.c diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 354aee6f7b1a2..e00de976ee438 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -31,6 +31,28 @@ void kmsan_task_create(struct task_struct *task); */ void kmsan_task_exit(struct task_struct *task); +/** + * kmsan_init_shadow() - Initialize KMSAN shadow at boot time. + * + * Allocate and initialize KMSAN metadata for early allocations. + */ +void __init kmsan_init_shadow(void); + +/** + * kmsan_init_runtime() - Initialize KMSAN state and enable KMSAN. + */ +void __init kmsan_init_runtime(void); + +/** + * kmsan_memblock_free_pages() - handle freeing of memblock pages. + * @page: struct page to free. + * @order: order of @page. + * + * Freed pages are either returned to buddy allocator or held back to be used + * as metadata pages. 
+ */ +bool __init kmsan_memblock_free_pages(struct page *page, unsigned int order); + /** * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. * @page: struct page pointer returned by alloc_pages(). @@ -152,6 +174,20 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); #else +static inline void kmsan_init_shadow(void) +{ +} + +static inline void kmsan_init_runtime(void) +{ +} + +static inline bool kmsan_memblock_free_pages(struct page *page, + unsigned int order) +{ + return true; +} + static inline void kmsan_task_create(struct task_struct *task) { } diff --git a/init/main.c b/init/main.c index eebe0cad4e378..93b000f2de8d7 100644 --- a/init/main.c +++ b/init/main.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -837,6 +838,7 @@ static void __init mm_init(void) init_mem_debugging_and_hardening(); kfence_alloc_pool(); report_meminit(); + kmsan_init_shadow(); stack_depot_early_init(); mem_init(); mem_init_print_info(); @@ -857,6 +859,7 @@ static void __init mm_init(void) init_espfix_bsp(); /* Should be run after espfix64 is set up. */ pti_init(); + kmsan_init_runtime(); } #ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile index 550ad8625e4f9..401acb1a491ce 100644 --- a/mm/kmsan/Makefile +++ b/mm/kmsan/Makefile @@ -3,7 +3,7 @@ # Makefile for KernelMemorySanitizer (KMSAN). 
# # -obj-y := core.o instrumentation.o hooks.o report.o shadow.o +obj-y := core.o instrumentation.o init.o hooks.o report.o shadow.o KMSAN_SANITIZE := n KCOV_INSTRUMENT := n @@ -18,6 +18,7 @@ CFLAGS_REMOVE.o = $(CC_FLAGS_FTRACE) CFLAGS_core.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_hooks.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_init.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_instrumentation.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_report.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_shadow.o := $(CC_FLAGS_KMSAN_RUNTIME) diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c new file mode 100644 index 0000000000000..7fb794242fad0 --- /dev/null +++ b/mm/kmsan/init.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN initialization routines. + * + * Copyright (C) 2017-2021 Google LLC + * Author: Alexander Potapenko + * + */ + +#include "kmsan.h" + +#include +#include +#include + +#include "../internal.h" + +#define NUM_FUTURE_RANGES 128 +struct start_end_pair { + u64 start, end; +}; + +static struct start_end_pair start_end_pairs[NUM_FUTURE_RANGES] __initdata; +static int future_index __initdata; + +/* + * Record a range of memory for which the metadata pages will be created once + * the page allocator becomes available. + */ +static void __init kmsan_record_future_shadow_range(void *start, void *end) +{ + u64 nstart = (u64)start, nend = (u64)end, cstart, cend; + bool merged = false; + + KMSAN_WARN_ON(future_index == NUM_FUTURE_RANGES); + KMSAN_WARN_ON((nstart >= nend) || !nstart || !nend); + nstart = ALIGN_DOWN(nstart, PAGE_SIZE); + nend = ALIGN(nend, PAGE_SIZE); + + /* + * Scan the existing ranges to see if any of them overlaps with + * [start, end). In that case, merge the two ranges instead of + * creating a new one. + * The number of ranges is less than 20, so there is no need to organize + * them into a more intelligent data structure. 
+ */ + for (int i = 0; i < future_index; i++) { + cstart = start_end_pairs[i].start; + cend = start_end_pairs[i].end; + if ((cstart < nstart && cend < nstart) || + (cstart > nend && cend > nend)) + /* ranges are disjoint - do not merge */ + continue; + start_end_pairs[i].start = min(nstart, cstart); + start_end_pairs[i].end = max(nend, cend); + merged = true; + break; + } + if (merged) + return; + start_end_pairs[future_index].start = nstart; + start_end_pairs[future_index].end = nend; + future_index++; +} + +/* + * Initialize the shadow for existing mappings during kernel initialization. + * These include kernel text/data sections, NODE_DATA and future ranges + * registered while creating other data (e.g. percpu). + * + * Allocations via memblock can be only done before slab is initialized. + */ +void __init kmsan_init_shadow(void) +{ + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + phys_addr_t p_start, p_end; + u64 loop; + int nid; + + for_each_reserved_mem_range(loop, &p_start, &p_end) + kmsan_record_future_shadow_range(phys_to_virt(p_start), + phys_to_virt(p_end)); + /* Allocate shadow for .data */ + kmsan_record_future_shadow_range(_sdata, _edata); + + for_each_online_node(nid) + kmsan_record_future_shadow_range( + NODE_DATA(nid), (char *)NODE_DATA(nid) + nd_size); + + for (int i = 0; i < future_index; i++) + kmsan_init_alloc_meta_for_range( + (void *)start_end_pairs[i].start, + (void *)start_end_pairs[i].end); +} + +struct metadata_page_pair { + struct page *shadow, *origin; +}; +static struct metadata_page_pair held_back[MAX_ORDER] __initdata; + +/* + * Eager metadata allocation. When the memblock allocator is freeing pages to + * pagealloc, we use 2/3 of them as metadata for the remaining 1/3. 
+ * We store the pointers to the returned blocks of pages in held_back[] grouped + * by their order: when kmsan_memblock_free_pages() is called for the first + * time with a certain order, it is reserved as a shadow block, for the second + * time - as an origin block. On the third time the incoming block receives its + * shadow and origin ranges from the previously saved shadow and origin blocks, + * after which held_back[order] can be used again. + * + * At the very end there may be leftover blocks in held_back[]. They are + * collected later by kmsan_memblock_discard(). + */ +bool kmsan_memblock_free_pages(struct page *page, unsigned int order) +{ + struct page *shadow, *origin; + + if (!held_back[order].shadow) { + held_back[order].shadow = page; + return false; + } + if (!held_back[order].origin) { + held_back[order].origin = page; + return false; + } + shadow = held_back[order].shadow; + origin = held_back[order].origin; + kmsan_setup_meta(page, shadow, origin, order); + + held_back[order].shadow = NULL; + held_back[order].origin = NULL; + return true; +} + +#define MAX_BLOCKS 8 +struct smallstack { + struct page *items[MAX_BLOCKS]; + int index; + int order; +}; + +static struct smallstack collect = { + .index = 0, + .order = MAX_ORDER, +}; + +static void smallstack_push(struct smallstack *stack, struct page *pages) +{ + KMSAN_WARN_ON(stack->index == MAX_BLOCKS); + stack->items[stack->index] = pages; + stack->index++; +} +#undef MAX_BLOCKS + +static struct page *smallstack_pop(struct smallstack *stack) +{ + struct page *ret; + + KMSAN_WARN_ON(stack->index == 0); + stack->index--; + ret = stack->items[stack->index]; + stack->items[stack->index] = NULL; + return ret; +} + +static void do_collection(void) +{ + struct page *page, *shadow, *origin; + + while (collect.index >= 3) { + page = smallstack_pop(&collect); + shadow = smallstack_pop(&collect); + origin = smallstack_pop(&collect); + kmsan_setup_meta(page, shadow, origin, collect.order); + 
__free_pages_core(page, collect.order); + } +} + +static void collect_split(void) +{ + struct smallstack tmp = { + .order = collect.order - 1, + .index = 0, + }; + struct page *page; + + if (!collect.order) + return; + while (collect.index) { + page = smallstack_pop(&collect); + smallstack_push(&tmp, &page[0]); + smallstack_push(&tmp, &page[1 << tmp.order]); + } + __memcpy(&collect, &tmp, sizeof(tmp)); +} + +/* + * Memblock is about to go away. Split the page blocks left over in held_back[] + * and return 1/3 of that memory to the system. + */ +static void kmsan_memblock_discard(void) +{ + /* + * For each order=N: + * - push held_back[N].shadow and .origin to @collect; + * - while there are >= 3 elements in @collect, do garbage collection: + * - pop 3 ranges from @collect; + * - use two of them as shadow and origin for the third one; + * - repeat; + * - split each remaining element from @collect into 2 ranges of + * order=N-1, + * - repeat. + */ + collect.order = MAX_ORDER - 1; + for (int i = MAX_ORDER - 1; i >= 0; i--) { + if (held_back[i].shadow) + smallstack_push(&collect, held_back[i].shadow); + if (held_back[i].origin) + smallstack_push(&collect, held_back[i].origin); + held_back[i].shadow = NULL; + held_back[i].origin = NULL; + do_collection(); + collect_split(); + } +} + +void __init kmsan_init_runtime(void) +{ + /* Assuming current is init_task */ + kmsan_internal_task_create(current); + kmsan_memblock_discard(); + pr_info("Starting KernelMemorySanitizer\n"); + pr_info("ATTENTION: KMSAN is a debugging tool! 
Do not use it on production machines!\n"); + kmsan_enabled = true; +} diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h index 77ee068c04ae9..7019c46d33a74 100644 --- a/mm/kmsan/kmsan.h +++ b/mm/kmsan/kmsan.h @@ -67,6 +67,7 @@ struct shadow_origin_ptr { struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *addr, u64 size, bool store); void *kmsan_get_metadata(void *addr, bool is_origin); +void __init kmsan_init_alloc_meta_for_range(void *start, void *end); enum kmsan_bug_reason { REASON_ANY, @@ -187,6 +188,8 @@ void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr, int reason); struct page *kmsan_vmalloc_to_page_or_null(void *vaddr); +void kmsan_setup_meta(struct page *page, struct page *shadow, + struct page *origin, int order); /* * kmsan_internal_is_module_addr() and kmsan_internal_is_vmalloc_addr() are diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 8c81a059beea6..6e90a806a7045 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -258,3 +258,37 @@ ret: kfree(s_pages); kfree(o_pages); } + +/* Allocate metadata for pages allocated at boot time. 
*/ +void __init kmsan_init_alloc_meta_for_range(void *start, void *end) +{ + struct page *shadow_p, *origin_p; + void *shadow, *origin; + struct page *page; + u64 size; + + start = (void *)ALIGN_DOWN((u64)start, PAGE_SIZE); + size = ALIGN((u64)end - (u64)start, PAGE_SIZE); + shadow = memblock_alloc(size, PAGE_SIZE); + origin = memblock_alloc(size, PAGE_SIZE); + for (u64 addr = 0; addr < size; addr += PAGE_SIZE) { + page = virt_to_page_or_null((char *)start + addr); + shadow_p = virt_to_page_or_null((char *)shadow + addr); + set_no_shadow_origin_page(shadow_p); + shadow_page_for(page) = shadow_p; + origin_p = virt_to_page_or_null((char *)origin + addr); + set_no_shadow_origin_page(origin_p); + origin_page_for(page) = origin_p; + } +} + +void kmsan_setup_meta(struct page *page, struct page *shadow, + struct page *origin, int order) +{ + for (int i = 0; i < (1 << order); i++) { + set_no_shadow_origin_page(&shadow[i]); + set_no_shadow_origin_page(&origin[i]); + shadow_page_for(&page[i]) = &shadow[i]; + origin_page_for(&page[i]) = &origin[i]; + } +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1db1ac74ef142..118462ae68004 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1809,6 +1809,10 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, { if (early_page_uninitialised(pfn)) return; + if (!kmsan_memblock_free_pages(page, order)) { + /* KMSAN will take care of these pages. */ + return; + } __free_pages_core(page, order); } -- GitLab From 75cf0290271bf6dae9dee982aef15242dadf97e4 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:52 +0200 Subject: [PATCH 1068/2223] instrumented.h: add KMSAN support To avoid false positives, KMSAN needs to unpoison the data copied from the userspace. To detect infoleaks - check the memory buffer passed to copy_to_user(). 
Link: https://lkml.kernel.org/r/20220915150417.722975-19-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/instrumented.h | 18 ++++++++++++----- include/linux/kmsan-checks.h | 19 ++++++++++++++++++ mm/kmsan/hooks.c | 38 ++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 5 deletions(-) diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index 9f1dba8f717b0..501fa84867494 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -2,7 +2,7 @@ /* * This header provides generic wrappers for memory access instrumentation that - * the compiler cannot emit for: KASAN, KCSAN. + * the compiler cannot emit for: KASAN, KCSAN, KMSAN. 
*/ #ifndef _LINUX_INSTRUMENTED_H #define _LINUX_INSTRUMENTED_H @@ -10,6 +10,7 @@ #include #include #include +#include #include /** @@ -117,6 +118,7 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n) { kasan_check_read(from, n); kcsan_check_read(from, n); + kmsan_copy_to_user(to, from, n, 0); } /** @@ -151,6 +153,7 @@ static __always_inline void instrument_copy_from_user_after(const void *to, const void __user *from, unsigned long n, unsigned long left) { + kmsan_unpoison_memory(to, n - left); } /** @@ -162,10 +165,14 @@ instrument_copy_from_user_after(const void *to, const void __user *from, * * @to destination variable, may not be address-taken */ -#define instrument_get_user(to) \ -({ \ +#define instrument_get_user(to) \ +({ \ + u64 __tmp = (u64)(to); \ + kmsan_unpoison_memory(&__tmp, sizeof(__tmp)); \ + to = __tmp; \ }) + /** * instrument_put_user() - add instrumentation to put_user()-like macros * @@ -177,8 +184,9 @@ instrument_copy_from_user_after(const void *to, const void __user *from, * @ptr userspace pointer to copy to * @size number of bytes to copy */ -#define instrument_put_user(from, ptr, size) \ -({ \ +#define instrument_put_user(from, ptr, size) \ +({ \ + kmsan_copy_to_user(ptr, &from, sizeof(from), 0); \ }) #endif /* _LINUX_INSTRUMENTED_H */ diff --git a/include/linux/kmsan-checks.h b/include/linux/kmsan-checks.h index a6522a0c28df9..c4cae333deec5 100644 --- a/include/linux/kmsan-checks.h +++ b/include/linux/kmsan-checks.h @@ -46,6 +46,21 @@ void kmsan_unpoison_memory(const void *address, size_t size); */ void kmsan_check_memory(const void *address, size_t size); +/** + * kmsan_copy_to_user() - Notify KMSAN about a data transfer to userspace. + * @to: destination address in the userspace. + * @from: source address in the kernel. + * @to_copy: number of bytes to copy. + * @left: number of bytes not copied. 
+ * + * If this is a real userspace data transfer, KMSAN checks the bytes that were + * actually copied to ensure there was no information leak. If @to belongs to + * the kernel space (which is possible for compat syscalls), KMSAN just copies + * the metadata. + */ +void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, + size_t left); + #else static inline void kmsan_poison_memory(const void *address, size_t size, @@ -58,6 +73,10 @@ static inline void kmsan_unpoison_memory(const void *address, size_t size) static inline void kmsan_check_memory(const void *address, size_t size) { } +static inline void kmsan_copy_to_user(void __user *to, const void *from, + size_t to_copy, size_t left) +{ +} #endif diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 6f3e64b0b61f8..5c0eb25d984d7 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -205,6 +205,44 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end) kmsan_leave_runtime(); } +void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, + size_t left) +{ + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + /* + * At this point we've copied the memory already. It's hard to check it + * before copying, as the size of actually copied buffer is unknown. + */ + + /* copy_to_user() may copy zero bytes. No need to check. */ + if (!to_copy) + return; + /* Or maybe copy_to_user() failed to copy anything. */ + if (to_copy <= left) + return; + + ua_flags = user_access_save(); + if ((u64)to < TASK_SIZE) { + /* This is a user memory access, check it. */ + kmsan_internal_check_memory((void *)from, to_copy - left, to, + REASON_COPY_TO_USER); + } else { + /* Otherwise this is a kernel memory access. This happens when a + * compat syscall passes an argument allocated on the kernel + * stack to a real syscall. + * Don't check anything, just copy the shadow of the copied + * bytes. 
+ */ + kmsan_internal_memmove_metadata((void *)to, (void *)from, + to_copy - left); + } + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(kmsan_copy_to_user); + /* Functions from kmsan-checks.h follow. */ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) { -- GitLab From a28a4d4723c11fe5fd3e725f5eb1b3472e80fe12 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:53 +0200 Subject: [PATCH 1069/2223] kmsan: add iomap support Functions from lib/iomap.c interact with hardware, so KMSAN must ensure that: - every read function returns an initialized value - every write function checks values before sending them to hardware. Link: https://lkml.kernel.org/r/20220915150417.722975-20-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/iomap.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/lib/iomap.c b/lib/iomap.c index fbaa3e8f19d6c..4f8b31baa5752 100644 --- a/lib/iomap.c +++ b/lib/iomap.c @@ -6,6 +6,7 @@ */ #include #include +#include #include @@ -70,26 +71,35 @@ static void bad_io_access(unsigned long port, const char *access) #define mmio_read64be(addr) swab64(readq(addr)) #endif +/* + * Here and below, we apply __no_kmsan_checks to functions reading data from + * hardware, to ensure that KMSAN marks their return values as initialized. + */ +__no_kmsan_checks unsigned int ioread8(const void __iomem *addr) { IO_COND(addr, return inb(port), return readb(addr)); return 0xff; } +__no_kmsan_checks unsigned int ioread16(const void __iomem *addr) { IO_COND(addr, return inw(port), return readw(addr)); return 0xffff; } +__no_kmsan_checks unsigned int ioread16be(const void __iomem *addr) { IO_COND(addr, return pio_read16be(port), return mmio_read16be(addr)); return 0xffff; } +__no_kmsan_checks unsigned int ioread32(const void __iomem *addr) { IO_COND(addr, return inl(port), return readl(addr)); return 0xffffffff; } +__no_kmsan_checks unsigned int ioread32be(const void __iomem *addr) { IO_COND(addr, return pio_read32be(port), return mmio_read32be(addr)); @@ -142,18 +152,21 @@ static u64 pio_read64be_hi_lo(unsigned long port) return lo | (hi << 32); } +__no_kmsan_checks u64 ioread64_lo_hi(const void __iomem *addr) { IO_COND(addr, return pio_read64_lo_hi(port), return readq(addr)); return 0xffffffffffffffffULL; } +__no_kmsan_checks u64 ioread64_hi_lo(const void __iomem *addr) { IO_COND(addr, return pio_read64_hi_lo(port), return readq(addr)); return 0xffffffffffffffffULL; } +__no_kmsan_checks u64 ioread64be_lo_hi(const void __iomem *addr) { 
IO_COND(addr, return pio_read64be_lo_hi(port), @@ -161,6 +174,7 @@ u64 ioread64be_lo_hi(const void __iomem *addr) return 0xffffffffffffffffULL; } +__no_kmsan_checks u64 ioread64be_hi_lo(const void __iomem *addr) { IO_COND(addr, return pio_read64be_hi_lo(port), @@ -188,22 +202,32 @@ EXPORT_SYMBOL(ioread64be_hi_lo); void iowrite8(u8 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, outb(val,port), writeb(val, addr)); } void iowrite16(u16 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, outw(val,port), writew(val, addr)); } void iowrite16be(u16 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write16be(val,port), mmio_write16be(val, addr)); } void iowrite32(u32 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, outl(val,port), writel(val, addr)); } void iowrite32be(u32 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write32be(val,port), mmio_write32be(val, addr)); } EXPORT_SYMBOL(iowrite8); @@ -239,24 +263,32 @@ static void pio_write64be_hi_lo(u64 val, unsigned long port) void iowrite64_lo_hi(u64 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write64_lo_hi(val, port), writeq(val, addr)); } void iowrite64_hi_lo(u64 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. 
*/ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write64_hi_lo(val, port), writeq(val, addr)); } void iowrite64be_lo_hi(u64 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write64be_lo_hi(val, port), mmio_write64be(val, addr)); } void iowrite64be_hi_lo(u64 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write64be_hi_lo(val, port), mmio_write64be(val, addr)); } @@ -328,14 +360,20 @@ static inline void mmio_outsl(void __iomem *addr, const u32 *src, int count) void ioread8_rep(const void __iomem *addr, void *dst, unsigned long count) { IO_COND(addr, insb(port,dst,count), mmio_insb(addr, dst, count)); + /* KMSAN must treat values read from devices as initialized. */ + kmsan_unpoison_memory(dst, count); } void ioread16_rep(const void __iomem *addr, void *dst, unsigned long count) { IO_COND(addr, insw(port,dst,count), mmio_insw(addr, dst, count)); + /* KMSAN must treat values read from devices as initialized. */ + kmsan_unpoison_memory(dst, count * 2); } void ioread32_rep(const void __iomem *addr, void *dst, unsigned long count) { IO_COND(addr, insl(port,dst,count), mmio_insl(addr, dst, count)); + /* KMSAN must treat values read from devices as initialized. */ + kmsan_unpoison_memory(dst, count * 4); } EXPORT_SYMBOL(ioread8_rep); EXPORT_SYMBOL(ioread16_rep); @@ -343,14 +381,20 @@ EXPORT_SYMBOL(ioread32_rep); void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(src, count); IO_COND(addr, outsb(port, src, count), mmio_outsb(addr, src, count)); } void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count) { + /* Make sure uninitialized memory isn't copied to devices. 
*/ + kmsan_check_memory(src, count * 2); IO_COND(addr, outsw(port, src, count), mmio_outsw(addr, src, count)); } void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(src, count * 4); IO_COND(addr, outsl(port, src,count), mmio_outsl(addr, src, count)); } EXPORT_SYMBOL(iowrite8_rep); -- GitLab From 38317724f6a85572af373229a27e214d5282ddf8 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:54 +0200 Subject: [PATCH 1070/2223] input: libps2: mark data received in __ps2_command() as initialized KMSAN does not know that the device initializes certain bytes in ps2dev->cmdbuf. Call kmsan_unpoison_memory() to explicitly mark them as initialized. Link: https://lkml.kernel.org/r/20220915150417.722975-21-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/input/serio/libps2.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c index 250e213cc80c6..3e19344eda93c 100644 --- a/drivers/input/serio/libps2.c +++ b/drivers/input/serio/libps2.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -294,9 +295,11 @@ int __ps2_command(struct ps2dev *ps2dev, u8 *param, unsigned int command) serio_pause_rx(ps2dev->serio); - if (param) + if (param) { for (i = 0; i < receive; i++) param[i] = ps2dev->cmdbuf[(receive - 1) - i]; + kmsan_unpoison_memory(param, receive); + } if (ps2dev->cmdcnt && (command != PS2_CMD_RESET_BAT || ps2dev->cmdcnt != 1)) { -- GitLab From 7ade4f10779cb46f5c29ced9b7a41f68501cf0ed Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:55 +0200 Subject: [PATCH 1071/2223] dma: kmsan: unpoison DMA mappings KMSAN doesn't know about DMA memory writes performed by devices. We unpoison such memory when it's mapped to avoid false positive reports. Link: https://lkml.kernel.org/r/20220915150417.722975-22-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 41 ++++++++++++++++++++++++++++++ kernel/dma/mapping.c | 10 +++++--- mm/kmsan/hooks.c | 59 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 3 deletions(-) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index e00de976ee438..dac296da45c55 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -9,6 +9,7 @@ #ifndef _LINUX_KMSAN_H #define _LINUX_KMSAN_H +#include #include #include #include @@ -16,6 +17,7 @@ struct page; struct kmem_cache; struct task_struct; +struct scatterlist; #ifdef CONFIG_KMSAN @@ -172,6 +174,35 @@ void kmsan_ioremap_page_range(unsigned long addr, unsigned long end, */ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); +/** + * kmsan_handle_dma() - Handle a DMA data transfer. + * @page: first page of the buffer. + * @offset: offset of the buffer within the first page. + * @size: buffer size. + * @dir: one of possible dma_data_direction values. + * + * Depending on @dir, KMSAN: + * * checks the buffer, if it is copied to device; + * * initializes the buffer, if it is copied from device; + * * does both, if this is a DMA_BIDIRECTIONAL transfer. + */ +void kmsan_handle_dma(struct page *page, size_t offset, size_t size, + enum dma_data_direction dir); + +/** + * kmsan_handle_dma_sg() - Handle a DMA transfer using scatterlist. + * @sg: scatterlist holding DMA buffers. + * @nents: number of scatterlist entries. + * @dir: one of possible dma_data_direction values. + * + * Depending on @dir, KMSAN: + * * checks the buffers in the scatterlist, if they are copied to device; + * * initializes the buffers, if they are copied from device; + * * does both, if this is a DMA_BIDIRECTIONAL transfer.
+ */ +void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, + enum dma_data_direction dir); + #else static inline void kmsan_init_shadow(void) @@ -254,6 +285,16 @@ static inline void kmsan_iounmap_page_range(unsigned long start, { } +static inline void kmsan_handle_dma(struct page *page, size_t offset, + size_t size, enum dma_data_direction dir) +{ +} + +static inline void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 49cbf3e33de71..a8400aa9bcd4e 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -156,6 +157,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); + kmsan_handle_dma(page, offset, size, dir); debug_dma_map_page(dev, page, offset, size, dir, addr, attrs); return addr; @@ -194,11 +196,13 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, else ents = ops->map_sg(dev, sg, nents, dir, attrs); - if (ents > 0) + if (ents > 0) { + kmsan_handle_dma_sg(sg, nents, dir); debug_dma_map_sg(dev, sg, nents, ents, dir, attrs); - else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && - ents != -EIO && ents != -EREMOTEIO)) + } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && + ents != -EIO && ents != -EREMOTEIO)) { return -EIO; + } return ents; } diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 5c0eb25d984d7..563c09443a37a 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -10,10 +10,12 @@ */ #include +#include #include #include #include #include +#include #include #include @@ -243,6 +245,63 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, } EXPORT_SYMBOL(kmsan_copy_to_user); +static void 
kmsan_handle_dma_page(const void *addr, size_t size, + enum dma_data_direction dir) +{ + switch (dir) { + case DMA_BIDIRECTIONAL: + kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0, + REASON_ANY); + kmsan_internal_unpoison_memory((void *)addr, size, + /*checked*/ false); + break; + case DMA_TO_DEVICE: + kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0, + REASON_ANY); + break; + case DMA_FROM_DEVICE: + kmsan_internal_unpoison_memory((void *)addr, size, + /*checked*/ false); + break; + case DMA_NONE: + break; + } +} + +/* Helper function to handle DMA data transfers. */ +void kmsan_handle_dma(struct page *page, size_t offset, size_t size, + enum dma_data_direction dir) +{ + u64 page_offset, to_go, addr; + + if (PageHighMem(page)) + return; + addr = (u64)page_address(page) + offset; + /* + * The kernel may occasionally give us adjacent DMA pages not belonging + * to the same allocation. Process them separately to avoid triggering + * internal KMSAN checks. + */ + while (size > 0) { + page_offset = addr % PAGE_SIZE; + to_go = min(PAGE_SIZE - page_offset, (u64)size); + kmsan_handle_dma_page((void *)addr, to_go, dir); + addr += to_go; + size -= to_go; + } +} + +void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ + struct scatterlist *item; + int i; + + for_each_sg(sg, item, nents, i) + kmsan_handle_dma(sg_page(item), item->offset, item->length, + dir); +} + /* Functions from kmsan-checks.h follow. */ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) { -- GitLab From 88938359e2dfe1f5f5840268b98935948db8fbd9 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:56 +0200 Subject: [PATCH 1072/2223] virtio: kmsan: check/unpoison scatterlist in vring_map_one_sg() If vring doesn't use the DMA API, KMSAN is unable to tell whether the memory is initialized by hardware. 
Explicitly call kmsan_handle_dma() from vring_map_one_sg() in this case to prevent false positives. Link: https://lkml.kernel.org/r/20220915150417.722975-23-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Michael S. Tsirkin Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/virtio/virtio_ring.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 4620e9d79dde8..8974c34b40fda 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -352,8 +353,15 @@ static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg, enum dma_data_direction direction) { - if (!vq->use_dma_api) + if (!vq->use_dma_api) { + /* + * If DMA is not used, KMSAN doesn't know that the scatterlist + * is initialized by the hardware. Explicitly check/unpoison it + * depending on the direction. 
+ */ + kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction); return (dma_addr_t)sg_phys(sg); + } /* * We can't use dma_map_sg, because we don't use scatterlists in -- GitLab From 553a80188a5d7164d2b0688b06bf3fe297023bfe Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:57 +0200 Subject: [PATCH 1073/2223] kmsan: handle memory sent to/from USB Depending on the value of is_out, kmsan_handle_urb() either marks the data copied to the kernel from a USB device as initialized, or checks that the data sent to the device is initialized. Link: https://lkml.kernel.org/r/20220915150417.722975-24-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/usb/core/urb.c | 2 ++ include/linux/kmsan.h | 15 +++++++++++++++ mm/kmsan/hooks.c | 16 ++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c index 33d62d7e3929f..9f3c54032556e 100644 --- a/drivers/usb/core/urb.c +++ b/drivers/usb/core/urb.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -426,6 +427,7 @@ int usb_submit_urb(struct urb *urb, gfp_t mem_flags) URB_SETUP_MAP_SINGLE | URB_SETUP_MAP_LOCAL | URB_DMA_SG_COMBINED); urb->transfer_flags |= (is_out ?
URB_DIR_OUT : URB_DIR_IN); + kmsan_handle_urb(urb, is_out); if (xfertype != USB_ENDPOINT_XFER_CONTROL && dev->state < USB_STATE_CONFIGURED) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index dac296da45c55..c473e0e21683c 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -18,6 +18,7 @@ struct page; struct kmem_cache; struct task_struct; struct scatterlist; +struct urb; #ifdef CONFIG_KMSAN @@ -203,6 +204,16 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size, void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, enum dma_data_direction dir); +/** + * kmsan_handle_urb() - Handle a USB data transfer. + * @urb: struct urb pointer. + * @is_out: data transfer direction (true means output to hardware). + * + * If @is_out is true, KMSAN checks the transfer buffer of @urb. Otherwise, + * KMSAN initializes the transfer buffer. + */ +void kmsan_handle_urb(const struct urb *urb, bool is_out); + #else static inline void kmsan_init_shadow(void) @@ -295,6 +306,10 @@ static inline void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, { } +static inline void kmsan_handle_urb(const struct urb *urb, bool is_out) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 563c09443a37a..79d7e73e2cfd8 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "../internal.h" #include "../slab.h" @@ -245,6 +246,21 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, } EXPORT_SYMBOL(kmsan_copy_to_user); +/* Helper function to check an URB. 
*/ +void kmsan_handle_urb(const struct urb *urb, bool is_out) +{ + if (!urb) + return; + if (is_out) + kmsan_internal_check_memory(urb->transfer_buffer, + urb->transfer_buffer_length, + /*user_addr*/ 0, REASON_SUBMIT_URB); + else + kmsan_internal_unpoison_memory(urb->transfer_buffer, + urb->transfer_buffer_length, + /*checked*/ false); +} + static void kmsan_handle_dma_page(const void *addr, size_t size, enum dma_data_direction dir) { -- GitLab From 8ed691b02ade8f755f34aa1fa8beff8ce4f81f6d Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:58 +0200 Subject: [PATCH 1074/2223] kmsan: add tests for KMSAN The testing module triggers KMSAN warnings in different cases and checks that the errors are properly reported, using console probes to capture the tool's output. Link: https://lkml.kernel.org/r/20220915150417.722975-25-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/Kconfig.kmsan | 12 + mm/kmsan/Makefile | 4 + mm/kmsan/kmsan_test.c | 581 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 597 insertions(+) create mode 100644 mm/kmsan/kmsan_test.c diff --git a/lib/Kconfig.kmsan b/lib/Kconfig.kmsan index 5b19dbd34d76e..b2489dd6503fa 100644 --- a/lib/Kconfig.kmsan +++ b/lib/Kconfig.kmsan @@ -47,4 +47,16 @@ config KMSAN_CHECK_PARAM_RETVAL may potentially report errors in corner cases when non-instrumented functions call instrumented ones. +config KMSAN_KUNIT_TEST + tristate "KMSAN integration test suite" if !KUNIT_ALL_TESTS + default KUNIT_ALL_TESTS + depends on TRACEPOINTS && KUNIT + help + Test suite for KMSAN, testing various error detection scenarios, + and checking that reports are correctly output to console. + + Say Y here if you want the test to be built into the kernel and run + during boot; say M if you want the test to build as a module; say N + if you are unsure. + endif diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile index 401acb1a491ce..98eab2856626f 100644 --- a/mm/kmsan/Makefile +++ b/mm/kmsan/Makefile @@ -22,3 +22,7 @@ CFLAGS_init.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_instrumentation.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_report.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_shadow.o := $(CC_FLAGS_KMSAN_RUNTIME) + +obj-$(CONFIG_KMSAN_KUNIT_TEST) += kmsan_test.o +KMSAN_SANITIZE_kmsan_test.o := y +CFLAGS_kmsan_test.o += $(call cc-disable-warning, uninitialized) diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c new file mode 100644 index 0000000000000..9a29ea2dbfb9b --- /dev/null +++ b/mm/kmsan/kmsan_test.c @@ -0,0 +1,581 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test cases for KMSAN. + * For each test case checks the presence (or absence) of generated reports. 
+ * Relies on 'console' tracepoint to capture reports as they appear in the + * kernel log. + * + * Copyright (C) 2021-2022, Google LLC. + * Author: Alexander Potapenko + * + */ + +#include +#include "kmsan.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_PER_CPU(int, per_cpu_var); + +/* Report as observed from console. */ +static struct { + spinlock_t lock; + bool available; + bool ignore; /* Stop console output collection. */ + char header[256]; +} observed = { + .lock = __SPIN_LOCK_UNLOCKED(observed.lock), +}; + +/* Probe for console output: obtains observed lines of interest. */ +static void probe_console(void *ignore, const char *buf, size_t len) +{ + unsigned long flags; + + if (observed.ignore) + return; + spin_lock_irqsave(&observed.lock, flags); + + if (strnstr(buf, "BUG: KMSAN: ", len)) { + /* + * This is a KMSAN report related to the test. + * + * The provided @buf is not NUL-terminated; copy no more than + * @len bytes and let strscpy() add the missing NUL-terminator. + */ + strscpy(observed.header, buf, + min(len + 1, sizeof(observed.header))); + WRITE_ONCE(observed.available, true); + observed.ignore = true; + } + spin_unlock_irqrestore(&observed.lock, flags); +} + +/* Check if a report related to the test exists. */ +static bool report_available(void) +{ + return READ_ONCE(observed.available); +} + +/* Information we expect in a report. */ +struct expect_report { + const char *error_type; /* Error type. */ + /* + * Kernel symbol from the error header, or NULL if no report is + * expected. + */ + const char *symbol; +}; + +/* Check observed report matches information in @r. */ +static bool report_matches(const struct expect_report *r) +{ + typeof(observed.header) expected_header; + unsigned long flags; + bool ret = false; + const char *end; + char *cur; + + /* Double-checked locking.
*/ + if (!report_available() || !r->symbol) + return (!report_available() && !r->symbol); + + /* Generate expected report contents. */ + + /* Title */ + cur = expected_header; + end = &expected_header[sizeof(expected_header) - 1]; + + cur += scnprintf(cur, end - cur, "BUG: KMSAN: %s", r->error_type); + + scnprintf(cur, end - cur, " in %s", r->symbol); + /* The exact offset won't match, remove it; also strip module name. */ + cur = strchr(expected_header, '+'); + if (cur) + *cur = '\0'; + + spin_lock_irqsave(&observed.lock, flags); + if (!report_available()) + goto out; /* A new report is being captured. */ + + /* Finally match expected output to what we actually observed. */ + ret = strstr(observed.header, expected_header); +out: + spin_unlock_irqrestore(&observed.lock, flags); + + return ret; +} + +/* ===== Test cases ===== */ + +/* Prevent replacing branch with select in LLVM. */ +static noinline void check_true(char *arg) +{ + pr_info("%s is true\n", arg); +} + +static noinline void check_false(char *arg) +{ + pr_info("%s is false\n", arg); +} + +#define USE(x) \ + do { \ + if (x) \ + check_true(#x); \ + else \ + check_false(#x); \ + } while (0) + +#define EXPECTATION_ETYPE_FN(e, reason, fn) \ + struct expect_report e = { \ + .error_type = reason, \ + .symbol = fn, \ + } + +#define EXPECTATION_NO_REPORT(e) EXPECTATION_ETYPE_FN(e, NULL, NULL) +#define EXPECTATION_UNINIT_VALUE_FN(e, fn) \ + EXPECTATION_ETYPE_FN(e, "uninit-value", fn) +#define EXPECTATION_UNINIT_VALUE(e) EXPECTATION_UNINIT_VALUE_FN(e, __func__) +#define EXPECTATION_USE_AFTER_FREE(e) \ + EXPECTATION_ETYPE_FN(e, "use-after-free", __func__) + +/* Test case: ensure that kmalloc() returns uninitialized memory. 
*/ +static void test_uninit_kmalloc(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + int *ptr; + + kunit_info(test, "uninitialized kmalloc test (UMR report)\n"); + ptr = kmalloc(sizeof(*ptr), GFP_KERNEL); + USE(*ptr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that kmalloc'ed memory becomes initialized after memset(). + */ +static void test_init_kmalloc(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + int *ptr; + + kunit_info(test, "initialized kmalloc test (no reports)\n"); + ptr = kmalloc(sizeof(*ptr), GFP_KERNEL); + memset(ptr, 0, sizeof(*ptr)); + USE(*ptr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that kzalloc() returns initialized memory. */ +static void test_init_kzalloc(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + int *ptr; + + kunit_info(test, "initialized kzalloc test (no reports)\n"); + ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); + USE(*ptr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that local variables are uninitialized by default. */ +static void test_uninit_stack_var(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + volatile int cond; + + kunit_info(test, "uninitialized stack variable (UMR report)\n"); + USE(cond); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that local variables with initializers are initialized. 
*/ +static void test_init_stack_var(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + volatile int cond = 1; + + kunit_info(test, "initialized stack variable (no reports)\n"); + USE(cond); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static noinline void two_param_fn_2(int arg1, int arg2) +{ + USE(arg1); + USE(arg2); +} + +static noinline void one_param_fn(int arg) +{ + two_param_fn_2(arg, arg); + USE(arg); +} + +static noinline void two_param_fn(int arg1, int arg2) +{ + int init = 0; + + one_param_fn(init); + USE(arg1); + USE(arg2); +} + +static void test_params(struct kunit *test) +{ +#ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL + /* + * With eager param/retval checking enabled, KMSAN will report an error + * before the call to two_param_fn(). + */ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_params"); +#else + EXPECTATION_UNINIT_VALUE_FN(expect, "two_param_fn"); +#endif + volatile int uninit, init = 1; + + kunit_info(test, + "uninit passed through a function parameter (UMR report)\n"); + two_param_fn(uninit, init); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static int signed_sum3(int a, int b, int c) +{ + return a + b + c; +} + +/* + * Test case: ensure that uninitialized values are tracked through function + * arguments. + */ +static void test_uninit_multiple_params(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + volatile char b = 3, c; + volatile int a; + + kunit_info(test, "uninitialized local passed to fn (UMR report)\n"); + USE(signed_sum3(a, b, c)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Helper function to make an array uninitialized. */ +static noinline void do_uninit_local_array(char *array, int start, int stop) +{ + volatile char uninit; + + for (int i = start; i < stop; i++) + array[i] = uninit; +} + +/* + * Test case: ensure kmsan_check_memory() reports an error when checking + * uninitialized memory. 
+ */ +static void test_uninit_kmsan_check_memory(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_uninit_kmsan_check_memory"); + volatile char local_array[8]; + + kunit_info( + test, + "kmsan_check_memory() called on uninit local (UMR report)\n"); + do_uninit_local_array((char *)local_array, 5, 7); + + kmsan_check_memory((char *)local_array, 8); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: check that a virtual memory range created with vmap() from + * initialized pages is still considered as initialized. + */ +static void test_init_kmsan_vmap_vunmap(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + const int npages = 2; + struct page **pages; + void *vbuf; + + kunit_info(test, "pages initialized via vmap (no reports)\n"); + + pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); + for (int i = 0; i < npages; i++) + pages[i] = alloc_page(GFP_KERNEL); + vbuf = vmap(pages, npages, VM_MAP, PAGE_KERNEL); + memset(vbuf, 0xfe, npages * PAGE_SIZE); + for (int i = 0; i < npages; i++) + kmsan_check_memory(page_address(pages[i]), PAGE_SIZE); + + if (vbuf) + vunmap(vbuf); + for (int i = 0; i < npages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kfree(pages); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memset() can initialize a buffer allocated via + * vmalloc(). + */ +static void test_init_vmalloc(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + int npages = 8; + char *buf; + + kunit_info(test, "vmalloc buffer can be initialized (no reports)\n"); + buf = vmalloc(PAGE_SIZE * npages); + buf[0] = 1; + memset(buf, 0xfe, PAGE_SIZE * npages); + USE(buf[0]); + for (int i = 0; i < npages; i++) + kmsan_check_memory(&buf[PAGE_SIZE * i], PAGE_SIZE); + vfree(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that use-after-free reporting works. 
*/ +static void test_uaf(struct kunit *test) +{ + EXPECTATION_USE_AFTER_FREE(expect); + volatile int value; + volatile int *var; + + kunit_info(test, "use-after-free in kmalloc-ed buffer (UMR report)\n"); + var = kmalloc(80, GFP_KERNEL); + var[3] = 0xfeedface; + kfree((int *)var); + /* Copy the invalid value before checking it. */ + value = var[3]; + USE(value); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that uninitialized values are propagated through per-CPU + * memory. + */ +static void test_percpu_propagate(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + volatile int uninit, check; + + kunit_info(test, + "uninit local stored to per_cpu memory (UMR report)\n"); + + this_cpu_write(per_cpu_var, uninit); + check = this_cpu_read(per_cpu_var); + USE(check); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that passing uninitialized values to printk() leads to an + * error report. + */ +static void test_printk(struct kunit *test) +{ +#ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL + /* + * With eager param/retval checking enabled, KMSAN will report an error + * before the call to pr_info(). + */ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_printk"); +#else + EXPECTATION_UNINIT_VALUE_FN(expect, "number"); +#endif + volatile int uninit; + + kunit_info(test, "uninit local passed to pr_info() (UMR report)\n"); + pr_info("%px contains %d\n", &uninit, uninit); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memcpy() correctly copies uninitialized values between + * aligned `src` and `dst`. 
+ */ +static void test_memcpy_aligned_to_aligned(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_aligned_to_aligned"); + volatile int uninit_src; + volatile int dst = 0; + + kunit_info( + test, + "memcpy()ing aligned uninit src to aligned dst (UMR report)\n"); + memcpy((void *)&dst, (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)&dst, sizeof(dst)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memcpy() correctly copies uninitialized values between + * aligned `src` and unaligned `dst`. + * + * Copying aligned 4-byte value to an unaligned one leads to touching two + * aligned 4-byte values. This test case checks that KMSAN correctly reports an + * error on the first of the two values. + */ +static void test_memcpy_aligned_to_unaligned(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_aligned_to_unaligned"); + volatile int uninit_src; + volatile char dst[8] = { 0 }; + + kunit_info( + test, + "memcpy()ing aligned uninit src to unaligned dst (UMR report)\n"); + memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)dst, 4); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memcpy() correctly copies uninitialized values between + * aligned `src` and unaligned `dst`. + * + * Copying aligned 4-byte value to an unaligned one leads to touching two + * aligned 4-byte values. This test case checks that KMSAN correctly reports an + * error on the second of the two values. 
+ */ +static void test_memcpy_aligned_to_unaligned2(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, + "test_memcpy_aligned_to_unaligned2"); + volatile int uninit_src; + volatile char dst[8] = { 0 }; + + kunit_info( + test, + "memcpy()ing aligned uninit src to unaligned dst - part 2 (UMR report)\n"); + memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)&dst[4], sizeof(uninit_src)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static noinline void fibonacci(int *array, int size, int start) { + if (start < 2 || (start == size)) + return; + array[start] = array[start - 1] + array[start - 2]; + fibonacci(array, size, start + 1); +} + +static void test_long_origin_chain(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, + "test_long_origin_chain"); + /* (KMSAN_MAX_ORIGIN_DEPTH * 2) recursive calls to fibonacci(). */ + volatile int accum[KMSAN_MAX_ORIGIN_DEPTH * 2 + 2]; + int last = ARRAY_SIZE(accum) - 1; + + kunit_info( + test, + "origin chain exceeding KMSAN_MAX_ORIGIN_DEPTH (UMR report)\n"); + /* + * We do not set accum[1] to 0, so the uninitializedness will be carried + * over to accum[2..last]. 
+ */ + accum[0] = 1; + fibonacci((int *)accum, ARRAY_SIZE(accum), 2); + kmsan_check_memory((void *)&accum[last], sizeof(int)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static struct kunit_case kmsan_test_cases[] = { + KUNIT_CASE(test_uninit_kmalloc), + KUNIT_CASE(test_init_kmalloc), + KUNIT_CASE(test_init_kzalloc), + KUNIT_CASE(test_uninit_stack_var), + KUNIT_CASE(test_init_stack_var), + KUNIT_CASE(test_params), + KUNIT_CASE(test_uninit_multiple_params), + KUNIT_CASE(test_uninit_kmsan_check_memory), + KUNIT_CASE(test_init_kmsan_vmap_vunmap), + KUNIT_CASE(test_init_vmalloc), + KUNIT_CASE(test_uaf), + KUNIT_CASE(test_percpu_propagate), + KUNIT_CASE(test_printk), + KUNIT_CASE(test_memcpy_aligned_to_aligned), + KUNIT_CASE(test_memcpy_aligned_to_unaligned), + KUNIT_CASE(test_memcpy_aligned_to_unaligned2), + KUNIT_CASE(test_long_origin_chain), + {}, +}; + +/* ===== End test cases ===== */ + +static int test_init(struct kunit *test) +{ + unsigned long flags; + + spin_lock_irqsave(&observed.lock, flags); + observed.header[0] = '\0'; + observed.ignore = false; + observed.available = false; + spin_unlock_irqrestore(&observed.lock, flags); + + return 0; +} + +static void test_exit(struct kunit *test) +{ +} + +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +static int kmsan_suite_init(struct kunit_suite *suite) +{ + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. 
+ */ + for_each_kernel_tracepoint(register_tracepoints, NULL); + return 0; +} + +static void kmsan_suite_exit(struct kunit_suite *suite) +{ + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +static struct kunit_suite kmsan_test_suite = { + .name = "kmsan", + .test_cases = kmsan_test_cases, + .init = test_init, + .exit = test_exit, + .suite_init = kmsan_suite_init, + .suite_exit = kmsan_suite_exit, +}; +kunit_test_suites(&kmsan_test_suite); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Alexander Potapenko "); -- GitLab From 2de6f3bf75058e35eff04e6fab7ca41533bdb027 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:59 +0200 Subject: [PATCH 1075/2223] kmsan: disable strscpy() optimization under KMSAN Disable the efficient 8-byte reading under KMSAN to avoid false positives. Link: https://lkml.kernel.org/r/20220915150417.722975-26-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/string.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/string.c b/lib/string.c index 6f334420f6871..3371d26a0e390 100644 --- a/lib/string.c +++ b/lib/string.c @@ -197,6 +197,14 @@ ssize_t strscpy(char *dest, const char *src, size_t count) max = 0; #endif + /* + * read_word_at_a_time() below may read uninitialized bytes after the + * trailing zero and use them in comparisons. Disable this optimization + * under KMSAN to prevent false positive reports. + */ + if (IS_ENABLED(CONFIG_KMSAN)) + max = 0; + while (max >= sizeof(unsigned long)) { unsigned long c, data; -- GitLab From 440fed95ebc30420d1f7802c6578f95e18523140 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:00 +0200 Subject: [PATCH 1076/2223] crypto: kmsan: disable accelerated configs under KMSAN KMSAN is unable to understand when initialized values come from assembly. Disable accelerated configs in KMSAN builds to prevent false positive reports. Link: https://lkml.kernel.org/r/20220915150417.722975-27-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- crypto/Kconfig | 30 ++++++++++++++++++++++++++++++ drivers/net/Kconfig | 1 + 2 files changed, 31 insertions(+) diff --git a/crypto/Kconfig b/crypto/Kconfig index bb427a835e44a..182fb817ebb52 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -319,6 +319,7 @@ config CRYPTO_CURVE25519 config CRYPTO_CURVE25519_X86 tristate "x86_64 accelerated Curve25519 scalar multiplication library" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_LIB_CURVE25519_GENERIC select CRYPTO_ARCH_HAVE_LIB_CURVE25519 @@ -367,11 +368,13 @@ config CRYPTO_AEGIS128 config CRYPTO_AEGIS128_SIMD bool "Support SIMD acceleration for AEGIS-128" depends on CRYPTO_AEGIS128 && ((ARM || ARM64) && KERNEL_MODE_NEON) + depends on !KMSAN # avoid false positives from assembly default y config CRYPTO_AEGIS128_AESNI_SSE2 tristate "AEGIS-128 AEAD algorithm (x86_64 AESNI+SSE2 implementation)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_AEAD select CRYPTO_SIMD help @@ -517,6 +520,7 @@ config CRYPTO_NHPOLY1305 config CRYPTO_NHPOLY1305_SSE2 tristate "NHPoly1305 hash function (x86_64 SSE2 implementation)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_NHPOLY1305 help SSE2 optimized implementation of the hash function used by the @@ -525,6 +529,7 @@ config CRYPTO_NHPOLY1305_SSE2 config CRYPTO_NHPOLY1305_AVX2 tristate "NHPoly1305 hash function (x86_64 AVX2 implementation)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_NHPOLY1305 help AVX2 optimized implementation of the hash function used by the @@ -649,6 +654,7 @@ config CRYPTO_CRC32C config CRYPTO_CRC32C_INTEL tristate "CRC32c INTEL hardware acceleration" depends on 
X86 + depends on !KMSAN # avoid false positives from assembly select CRYPTO_HASH help In Intel processor with SSE4.2 supported, the processor will @@ -689,6 +695,7 @@ config CRYPTO_CRC32 config CRYPTO_CRC32_PCLMUL tristate "CRC32 PCLMULQDQ hardware acceleration" depends on X86 + depends on !KMSAN # avoid false positives from assembly select CRYPTO_HASH select CRC32 help @@ -748,6 +755,7 @@ config CRYPTO_BLAKE2B config CRYPTO_BLAKE2S_X86 bool "BLAKE2s digest algorithm (x86 accelerated version)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_LIB_BLAKE2S_GENERIC select CRYPTO_ARCH_HAVE_LIB_BLAKE2S @@ -762,6 +770,7 @@ config CRYPTO_CRCT10DIF config CRYPTO_CRCT10DIF_PCLMUL tristate "CRCT10DIF PCLMULQDQ hardware acceleration" depends on X86 && 64BIT && CRC_T10DIF + depends on !KMSAN # avoid false positives from assembly select CRYPTO_HASH help For x86_64 processors with SSE4.2 and PCLMULQDQ supported, @@ -831,6 +840,7 @@ config CRYPTO_POLY1305 config CRYPTO_POLY1305_X86_64 tristate "Poly1305 authenticator algorithm (x86_64/SSE2/AVX2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_LIB_POLY1305_GENERIC select CRYPTO_ARCH_HAVE_LIB_POLY1305 help @@ -920,6 +930,7 @@ config CRYPTO_SHA1 config CRYPTO_SHA1_SSSE3 tristate "SHA1 digest algorithm (SSSE3/AVX/AVX2/SHA-NI)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SHA1 select CRYPTO_HASH help @@ -931,6 +942,7 @@ config CRYPTO_SHA1_SSSE3 config CRYPTO_SHA256_SSSE3 tristate "SHA256 digest algorithm (SSSE3/AVX/AVX2/SHA-NI)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SHA256 select CRYPTO_HASH help @@ -943,6 +955,7 @@ config CRYPTO_SHA256_SSSE3 config CRYPTO_SHA512_SSSE3 tristate "SHA512 digest algorithm (SSSE3/AVX/AVX2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SHA512 select 
CRYPTO_HASH help @@ -1168,6 +1181,7 @@ config CRYPTO_WP512 config CRYPTO_GHASH_CLMUL_NI_INTEL tristate "GHASH hash function (CLMUL-NI accelerated)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_CRYPTD help This is the x86_64 CLMUL-NI accelerated implementation of @@ -1228,6 +1242,7 @@ config CRYPTO_AES_TI config CRYPTO_AES_NI_INTEL tristate "AES cipher algorithms (AES-NI)" depends on X86 + depends on !KMSAN # avoid false positives from assembly select CRYPTO_AEAD select CRYPTO_LIB_AES select CRYPTO_ALGAPI @@ -1369,6 +1384,7 @@ config CRYPTO_BLOWFISH_COMMON config CRYPTO_BLOWFISH_X86_64 tristate "Blowfish cipher algorithm (x86_64)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_BLOWFISH_COMMON imply CRYPTO_CTR @@ -1399,6 +1415,7 @@ config CRYPTO_CAMELLIA config CRYPTO_CAMELLIA_X86_64 tristate "Camellia cipher algorithm (x86_64)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER imply CRYPTO_CTR help @@ -1415,6 +1432,7 @@ config CRYPTO_CAMELLIA_X86_64 config CRYPTO_CAMELLIA_AESNI_AVX_X86_64 tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_CAMELLIA_X86_64 select CRYPTO_SIMD @@ -1433,6 +1451,7 @@ config CRYPTO_CAMELLIA_AESNI_AVX_X86_64 config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64 tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_CAMELLIA_AESNI_AVX_X86_64 help Camellia cipher algorithm module (x86_64/AES-NI/AVX2). 
@@ -1478,6 +1497,7 @@ config CRYPTO_CAST5 config CRYPTO_CAST5_AVX_X86_64 tristate "CAST5 (CAST-128) cipher algorithm (x86_64/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_CAST5 select CRYPTO_CAST_COMMON @@ -1501,6 +1521,7 @@ config CRYPTO_CAST6 config CRYPTO_CAST6_AVX_X86_64 tristate "CAST6 (CAST-256) cipher algorithm (x86_64/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_CAST6 select CRYPTO_CAST_COMMON @@ -1534,6 +1555,7 @@ config CRYPTO_DES_SPARC64 config CRYPTO_DES3_EDE_X86_64 tristate "Triple DES EDE cipher algorithm (x86-64)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_LIB_DES imply CRYPTO_CTR @@ -1604,6 +1626,7 @@ config CRYPTO_CHACHA20 config CRYPTO_CHACHA20_X86_64 tristate "ChaCha stream cipher algorithms (x86_64/SSSE3/AVX2/AVX-512VL)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_LIB_CHACHA_GENERIC select CRYPTO_ARCH_HAVE_LIB_CHACHA @@ -1674,6 +1697,7 @@ config CRYPTO_SERPENT config CRYPTO_SERPENT_SSE2_X86_64 tristate "Serpent cipher algorithm (x86_64/SSE2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_SERPENT select CRYPTO_SIMD @@ -1693,6 +1717,7 @@ config CRYPTO_SERPENT_SSE2_X86_64 config CRYPTO_SERPENT_SSE2_586 tristate "Serpent cipher algorithm (i586/SSE2)" depends on X86 && !64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_SERPENT select CRYPTO_SIMD @@ -1712,6 +1737,7 @@ config CRYPTO_SERPENT_SSE2_586 config CRYPTO_SERPENT_AVX_X86_64 tristate "Serpent cipher algorithm (x86_64/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_SERPENT select CRYPTO_SIMD @@ 
-1732,6 +1758,7 @@ config CRYPTO_SERPENT_AVX_X86_64 config CRYPTO_SERPENT_AVX2_X86_64 tristate "Serpent cipher algorithm (x86_64/AVX2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SERPENT_AVX_X86_64 help Serpent cipher algorithm, by Anderson, Biham & Knudsen. @@ -1876,6 +1903,7 @@ config CRYPTO_TWOFISH_586 config CRYPTO_TWOFISH_X86_64 tristate "Twofish cipher algorithm (x86_64)" depends on (X86 || UML_X86) && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_ALGAPI select CRYPTO_TWOFISH_COMMON imply CRYPTO_CTR @@ -1893,6 +1921,7 @@ config CRYPTO_TWOFISH_X86_64 config CRYPTO_TWOFISH_X86_64_3WAY tristate "Twofish cipher algorithm (x86_64, 3-way parallel)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_TWOFISH_COMMON select CRYPTO_TWOFISH_X86_64 @@ -1913,6 +1942,7 @@ config CRYPTO_TWOFISH_X86_64_3WAY config CRYPTO_TWOFISH_AVX_X86_64 tristate "Twofish cipher algorithm (x86_64/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_SIMD select CRYPTO_TWOFISH_COMMON diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 94c889802566a..2aaf02bfe6f7e 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -76,6 +76,7 @@ config WIREGUARD tristate "WireGuard secure network tunnel" depends on NET && INET depends on IPV6 || !IPV6 + depends on !KMSAN # KMSAN doesn't support the crypto configs below select NET_UDP_TUNNEL select DST_CACHE select CRYPTO -- GitLab From f630a5d0ca59a6e73b61e3f82c371dc230da99ff Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:01 +0200 Subject: [PATCH 1077/2223] kmsan: disable physical page merging in biovec KMSAN metadata for adjacent physical pages may not be adjacent, therefore accessing such pages together may lead to metadata corruption. 
We disable merging pages in biovec to prevent such corruptions. Link: https://lkml.kernel.org/r/20220915150417.722975-28-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- block/blk.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/blk.h b/block/blk.h index d7142c4d2fefb..af02b93c1dba5 100644 --- a/block/blk.h +++ b/block/blk.h @@ -88,6 +88,13 @@ static inline bool biovec_phys_mergeable(struct request_queue *q, phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset; phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset; + /* + * Merging adjacent physical pages may not work correctly under KMSAN + * if their metadata pages aren't adjacent. Just disable merging. + */ + if (IS_ENABLED(CONFIG_KMSAN)) + return false; + if (addr1 + vec1->bv_len != addr2) return false; if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page)) -- GitLab From 11b331f857b5fc3ff76bfec36c44a137a6b37de1 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:02 +0200 Subject: [PATCH 1078/2223] block: kmsan: skip bio block merging logic for KMSAN KMSAN doesn't allow treating adjacent memory pages as such, if they were allocated by different alloc_pages() calls. The block layer however does so: adjacent pages end up being used together. 
To prevent this, make page_is_mergeable() return false under KMSAN. Link: https://lkml.kernel.org/r/20220915150417.722975-29-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Eric Biggers Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- block/bio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/bio.c b/block/bio.c index 3d3a2678fea25..106ef14f28c2a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -869,6 +869,8 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); if (*same_page) return true; + else if (IS_ENABLED(CONFIG_KMSAN)) + return false; return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); } -- GitLab From 74d899098854b4e56cf9dc9d0245d4d40f5efcd4 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:03 +0200 Subject: [PATCH 1079/2223] kcov: kmsan: unpoison area->list in kcov_remote_area_put() KMSAN does not instrument kernel/kcov.c for performance reasons (with CONFIG_KCOV=y virtually every place in the kernel invokes kcov instrumentation). Therefore the tool may miss writes from kcov.c that initialize memory. When CONFIG_DEBUG_LIST is enabled, list pointers from kernel/kcov.c are passed to instrumented helpers in lib/list_debug.c, resulting in false positives. 
To work around these reports, we unpoison the contents of area->list after initializing it. Link: https://lkml.kernel.org/r/20220915150417.722975-30-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- kernel/kcov.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/kcov.c b/kernel/kcov.c index e19c84b02452e..e5cd09fd8a050 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -152,6 +153,12 @@ static void kcov_remote_area_put(struct kcov_remote_area *area, INIT_LIST_HEAD(&area->list); area->size = size; list_add(&area->list, &kcov_remote_areas); + /* + * KMSAN doesn't instrument this file, so it may not know area->list + * is initialized. Unpoison it explicitly to avoid reports in + * kcov_remote_area_get(). + */ + kmsan_unpoison_memory(&area->list, sizeof(area->list)); } static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) -- GitLab From 42eaa27d9e7aafb4049fc3a5b02005a917013e65 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:04 +0200 Subject: [PATCH 1080/2223] security: kmsan: fix interoperability with auto-initialization Heap and stack initialization is great, but not when we are trying to detect uses of uninitialized memory.
When the kernel is built with KMSAN, having kernel memory initialization enabled may introduce false negatives. We disable CONFIG_INIT_STACK_ALL_PATTERN and CONFIG_INIT_STACK_ALL_ZERO under CONFIG_KMSAN, making it impossible to auto-initialize stack variables in KMSAN builds. We also disable CONFIG_INIT_ON_ALLOC_DEFAULT_ON and CONFIG_INIT_ON_FREE_DEFAULT_ON to prevent accidental use of heap auto-initialization. We however still let the users enable heap auto-initialization at boot-time (by setting init_on_alloc=1 or init_on_free=1), in which case a warning is printed. Link: https://lkml.kernel.org/r/20220915150417.722975-31-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++++ security/Kconfig.hardening | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 118462ae68004..c7e9451c69fc7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -938,6 +938,10 @@ void init_mem_debugging_and_hardening(void) else static_branch_disable(&init_on_free); + if (IS_ENABLED(CONFIG_KMSAN) && + (_init_on_alloc_enabled_early || _init_on_free_enabled_early)) + pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n"); + #ifdef CONFIG_DEBUG_PAGEALLOC if (!debug_pagealloc_enabled()) return; diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening index bd2aabb2c60f9..2739a6776454e 100644 --- a/security/Kconfig.hardening +++ b/security/Kconfig.hardening @@ -106,6 +106,7 @@ choice config INIT_STACK_ALL_PATTERN bool "pattern-init everything (strongest)" depends on CC_HAS_AUTO_VAR_INIT_PATTERN + depends on !KMSAN help Initializes everything on the stack (including padding) with a specific debug value. This is intended to eliminate @@ -124,6 +125,7 @@ choice config INIT_STACK_ALL_ZERO bool "zero-init everything (strongest and safest)" depends on CC_HAS_AUTO_VAR_INIT_ZERO + depends on !KMSAN help Initializes everything on the stack (including padding) with a zero value. This is intended to eliminate all @@ -218,6 +220,7 @@ config STACKLEAK_RUNTIME_DISABLE config INIT_ON_ALLOC_DEFAULT_ON bool "Enable heap memory zeroing on allocation by default" + depends on !KMSAN help This has the effect of setting "init_on_alloc=1" on the kernel command line. This can be disabled with "init_on_alloc=0". 
@@ -230,6 +233,7 @@ config INIT_ON_ALLOC_DEFAULT_ON config INIT_ON_FREE_DEFAULT_ON bool "Enable heap memory zeroing on free by default" + depends on !KMSAN help This has the effect of setting "init_on_free=1" on the kernel command line. This can be disabled with "init_on_free=0". -- GitLab From 40b22c9df2c51c6ce459953f57c720b129332fbf Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:05 +0200 Subject: [PATCH 1081/2223] objtool: kmsan: list KMSAN API functions as uaccess-safe KMSAN inserts API function calls in a lot of places (function entries and exits, local variables, memory accesses), so they may get called from the uaccess regions as well. KMSAN API functions are used to update the metadata (shadow/origin pages) for kernel memory accesses. The metadata pages for kernel pointers are also located in the kernel memory, so touching them is not a problem. For userspace pointers, no metadata is allocated. If an API function is supposed to read or modify the metadata, it does so for kernel pointers and ignores userspace pointers. If an API function is supposed to return a pair of metadata pointers for the instrumentation to use (like all __msan_metadata_ptr_for_TYPE_SIZE() functions do), it returns the allocated metadata for kernel pointers and special dummy buffers residing in the kernel memory for userspace pointers. As a result, none of KMSAN API functions perform userspace accesses, but since they might be called from UACCESS regions they use user_access_save/restore(). 
Link: https://lkml.kernel.org/r/20220915150417.722975-32-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/objtool/check.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e55fdf952a3a1..7c048c11ce7da 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1062,6 +1062,26 @@ static const char *uaccess_safe_builtin[] = { "__sanitizer_cov_trace_cmp4", "__sanitizer_cov_trace_cmp8", "__sanitizer_cov_trace_switch", + /* KMSAN */ + "kmsan_copy_to_user", + "kmsan_report", + "kmsan_unpoison_entry_regs", + "kmsan_unpoison_memory", + "__msan_chain_origin", + "__msan_get_context_state", + "__msan_instrument_asm_store", + "__msan_metadata_ptr_for_load_1", + "__msan_metadata_ptr_for_load_2", + "__msan_metadata_ptr_for_load_4", + "__msan_metadata_ptr_for_load_8", + "__msan_metadata_ptr_for_load_n", + "__msan_metadata_ptr_for_store_1", + "__msan_metadata_ptr_for_store_2", + "__msan_metadata_ptr_for_store_4", + "__msan_metadata_ptr_for_store_8", + "__msan_metadata_ptr_for_store_n", + "__msan_poison_alloca", + "__msan_warning", /* UBSAN */ "ubsan_type_mismatch_common", "__ubsan_handle_type_mismatch", -- GitLab From 93324e6842148cfdb44d1437fb586b957ace1f8c Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 
17:04:06 +0200 Subject: [PATCH 1082/2223] x86: kmsan: disable instrumentation of unsupported code Instrumenting some files with KMSAN will result in kernel being unable to link, boot or crashing at runtime for various reasons (e.g. infinite recursion caused by instrumentation hooks calling instrumented code again). Completely omit KMSAN instrumentation in the following places: - arch/x86/boot and arch/x86/realmode/rm, as KMSAN doesn't work for i386; - arch/x86/entry/vdso, which isn't linked with KMSAN runtime; - three files in arch/x86/kernel - boot problems; - arch/x86/mm/cpu_entry_area.c - recursion. Link: https://lkml.kernel.org/r/20220915150417.722975-33-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/boot/Makefile | 1 + arch/x86/boot/compressed/Makefile | 1 + arch/x86/entry/vdso/Makefile | 3 +++ arch/x86/kernel/Makefile | 2 ++ arch/x86/kernel/cpu/Makefile | 1 + arch/x86/mm/Makefile | 2 ++ arch/x86/realmode/rm/Makefile | 1 + 7 files changed, 11 insertions(+) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index ffec8bb01ba8c..9860ca5979f8a 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -12,6 +12,7 @@ # Sanitizer runtimes are unavailable and cannot be linked for early boot code. 
KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Kernel does not boot with kcov instrumentation here. diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 35ce1a64068b7..3a261abb6d158 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -20,6 +20,7 @@ # Sanitizer runtimes are unavailable and cannot be linked for early boot code. KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 12f6c4d714cd6..ce4eb7e44e5b8 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -11,6 +11,9 @@ include $(srctree)/lib/vdso/Makefile # Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n +KMSAN_SANITIZE_vclock_gettime.o := n +KMSAN_SANITIZE_vgetcpu.o := n + UBSAN_SANITIZE := n KCSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a20a5ebfacd73..ac564c5d7b1f0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -33,6 +33,8 @@ KASAN_SANITIZE_sev.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. KCSAN_SANITIZE := n +KMSAN_SANITIZE_head$(BITS).o := n +KMSAN_SANITIZE_nmi.o := n # If instrumentation of this dir is enabled, boot hangs during first second. # Probably could be more selective here, but note that files related to irqs, diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 9661e3e802be5..f10a921ee7565 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -12,6 +12,7 @@ endif # If these files are instrumented, boot hangs during the first second. 
KCOV_INSTRUMENT_common.o := n KCOV_INSTRUMENT_perf_event.o := n +KMSAN_SANITIZE_common.o := n # As above, instrumenting secondary CPU boot code causes boot hangs. KCSAN_SANITIZE_common.o := n diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 829c1409ffbde..afb6f7187dad0 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -14,6 +14,8 @@ KASAN_SANITIZE_pgprot.o := n # Disable KCSAN entirely, because otherwise we get warnings that some functions # reference __initdata sections. KCSAN_SANITIZE := n +# Avoid recursion by not calling KMSAN hooks for CEA code. +KMSAN_SANITIZE_cpu_entry_area.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_mem_encrypt.o = -pg diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 83f1b6a56449f..f614009d3e4e2 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -10,6 +10,7 @@ # Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. -- GitLab From b11671b37f8f4761ff5a3d344553d65238309954 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:07 +0200 Subject: [PATCH 1083/2223] x86: kmsan: skip shadow checks in __switch_to() When instrumenting functions, KMSAN obtains the per-task state (mostly pointers to metadata for function arguments and return values) once per function at its beginning, using the `current` pointer. Every time the instrumented function calls another function, this state (`struct kmsan_context_state`) is updated with shadow/origin data of the passed and returned values. When `current` changes in the low-level arch code, instrumented code can not notice that, and will still refer to the old state, possibly corrupting it or using stale data. This may result in false positive reports. 
To deal with that, we need to apply __no_kmsan_checks to the functions performing context switching - this will result in skipping all KMSAN shadow checks and marking newly created values as initialized, preventing all false positive reports in those functions. False negatives are still possible, but we expect them to be rare and impersistent. Link: https://lkml.kernel.org/r/20220915150417.722975-34-glider@google.com Suggested-by: Marco Elver Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/kernel/process_64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 1962008fe7437..6b3418bff3261 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -553,6 +553,7 @@ void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32) * Kprobes not supported here. Set the probe on schedule instead. * Function graph tracer not supported too. 
*/ +__no_kmsan_checks __visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { -- GitLab From 9245ec01ce848eb5147e2e5030cf33ffbf1befff Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:08 +0200 Subject: [PATCH 1084/2223] x86: kmsan: handle open-coded assembly in lib/iomem.c KMSAN cannot intercept memory accesses within asm() statements. That's why we add kmsan_unpoison_memory() and kmsan_check_memory() to hint it how to handle memory copied from/to I/O memory. Link: https://lkml.kernel.org/r/20220915150417.722975-35-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/lib/iomem.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/lib/iomem.c b/arch/x86/lib/iomem.c index 3e2f33fc33de2..e0411a3774d49 100644 --- a/arch/x86/lib/iomem.c +++ b/arch/x86/lib/iomem.c @@ -1,6 +1,7 @@ #include #include #include +#include #define movs(type,to,from) \ asm volatile("movs" type:"=&D" (to), "=&S" (from):"0" (to), "1" (from):"memory") @@ -37,6 +38,8 @@ static void string_memcpy_fromio(void *to, const volatile void __iomem *from, si n-=2; } rep_movs(to, (const void *)from, n); + /* KMSAN must treat values read from devices as initialized. 
*/ + kmsan_unpoison_memory(to, n); } static void string_memcpy_toio(volatile void __iomem *to, const void *from, size_t n) @@ -44,6 +47,8 @@ static void string_memcpy_toio(volatile void __iomem *to, const void *from, size if (unlikely(!n)) return; + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(from, n); /* Align any unaligned destination IO */ if (unlikely(1 & (unsigned long)to)) { movs("b", to, from); -- GitLab From ff901d80fff6d65ada6f2a60a1f7d180ee2e0416 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:09 +0200 Subject: [PATCH 1085/2223] x86: kmsan: use __msan_ string functions where possible. Unless stated otherwise (by explicitly calling __memcpy(), __memset() or __memmove()) we want all string functions to call their __msan_ versions (e.g. __msan_memcpy() instead of memcpy()), so that shadow and origin values are updated accordingly. Bootloader must still use the default string functions to avoid crashes. Link: https://lkml.kernel.org/r/20220915150417.722975-36-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/include/asm/string_64.h | 23 +++++++++++++++++++++-- include/linux/fortify-string.h | 2 ++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 6e450827f677a..3b87d889b6e16 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -11,11 +11,23 @@ function. */ #define __HAVE_ARCH_MEMCPY 1 +#if defined(__SANITIZE_MEMORY__) +#undef memcpy +void *__msan_memcpy(void *dst, const void *src, size_t size); +#define memcpy __msan_memcpy +#else extern void *memcpy(void *to, const void *from, size_t len); +#endif extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET +#if defined(__SANITIZE_MEMORY__) +extern void *__msan_memset(void *s, int c, size_t n); +#undef memset +#define memset __msan_memset +#else void *memset(void *s, int c, size_t n); +#endif void *__memset(void *s, int c, size_t n); #define __HAVE_ARCH_MEMSET16 @@ -55,7 +67,13 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t n) } #define __HAVE_ARCH_MEMMOVE +#if defined(__SANITIZE_MEMORY__) +#undef memmove +void *__msan_memmove(void *dest, const void *src, size_t len); +#define memmove __msan_memmove +#else void *memmove(void *dest, const void *src, size_t count); +#endif void *__memmove(void *dest, const void *src, size_t count); int memcmp(const void *cs, const void *ct, size_t count); @@ -64,8 +82,7 @@ char *strcpy(char *dest, const char *src); char *strcat(char *dest, const char *src); int strcmp(const char *cs, const char *ct); -#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) - +#if (defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)) /* * For files that not instrumented (e.g. 
mm/slub.c) we * should use not instrumented version of mem* functions. @@ -73,7 +90,9 @@ int strcmp(const char *cs, const char *ct); #undef memcpy #define memcpy(dst, src, len) __memcpy(dst, src, len) +#undef memmove #define memmove(dst, src, len) __memmove(dst, src, len) +#undef memset #define memset(s, c, n) __memset(s, c, n) #ifndef __NO_FORTIFY diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 3b401fa0f3746..6c8a1a29d0b63 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -285,8 +285,10 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, * __builtin_object_size() must be captured here to avoid evaluating argument * side-effects further into the macro layers. */ +#ifndef CONFIG_KMSAN #define memset(p, c, s) __fortify_memset_chk(p, c, s, \ __builtin_object_size(p, 0), __builtin_object_size(p, 1)) +#endif /* * To make sure the compiler can enforce protection against buffer overflows, -- GitLab From 3f1e2c7a9099c1ed32c67f12cdf432ba782cf51f Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:10 +0200 Subject: [PATCH 1086/2223] x86: kmsan: sync metadata pages on page fault KMSAN assumes shadow and origin pages for every allocated page are accessible. For pages between [VMALLOC_START, VMALLOC_END] those metadata pages start at KMSAN_VMALLOC_SHADOW_START and KMSAN_VMALLOC_ORIGIN_START, therefore we must sync a bigger memory region. 
Link: https://lkml.kernel.org/r/20220915150417.722975-37-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/mm/fault.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fa71a5d12e872..d728791be8ace 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -260,7 +260,7 @@ static noinline int vmalloc_fault(unsigned long address) } NOKPROBE_SYMBOL(vmalloc_fault); -void arch_sync_kernel_mappings(unsigned long start, unsigned long end) +static void __arch_sync_kernel_mappings(unsigned long start, unsigned long end) { unsigned long addr; @@ -284,6 +284,27 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end) } } +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) +{ + __arch_sync_kernel_mappings(start, end); +#ifdef CONFIG_KMSAN + /* + * KMSAN maintains two additional metadata page mappings for the + * [VMALLOC_START, VMALLOC_END) range. These mappings start at + * KMSAN_VMALLOC_SHADOW_START and KMSAN_VMALLOC_ORIGIN_START and + * have to be synced together with the vmalloc memory mapping. 
+ */ + if (start >= VMALLOC_START && end < VMALLOC_END) { + __arch_sync_kernel_mappings( + start - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START, + end - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START); + __arch_sync_kernel_mappings( + start - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START, + end - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START); + } +#endif +} + static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; -- GitLab From d911c67e10b47eb1ace08dcf95ce98fe4d408c88 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:11 +0200 Subject: [PATCH 1087/2223] x86: kasan: kmsan: support CONFIG_GENERIC_CSUM on x86, enable it for KASAN/KMSAN This is needed to allow memory tools like KASAN and KMSAN see the memory accesses from the checksum code. Without CONFIG_GENERIC_CSUM the tools can't see memory accesses originating from handwritten assembly code. For KASAN it's a question of detecting more bugs, for KMSAN using the C implementation also helps avoid false positives originating from seemingly uninitialized checksum values. Link: https://lkml.kernel.org/r/20220915150417.722975-38-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 4 ++++ arch/x86/include/asm/checksum.h | 16 ++++++++++------ arch/x86/lib/Makefile | 2 ++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 674d694a665ef..eb158eb8a985f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -325,6 +325,10 @@ config GENERIC_ISA_DMA def_bool y depends on ISA_DMA_API +config GENERIC_CSUM + bool + default y if KMSAN || KASAN + config GENERIC_BUG def_bool y depends on BUG diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h index bca625a60186c..6df6ece8a28ec 100644 --- a/arch/x86/include/asm/checksum.h +++ b/arch/x86/include/asm/checksum.h @@ -1,9 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 -#define HAVE_CSUM_COPY_USER -#define _HAVE_ARCH_CSUM_AND_COPY -#ifdef CONFIG_X86_32 -# include +#ifdef CONFIG_GENERIC_CSUM +# include #else -# include +# define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 +# define HAVE_CSUM_COPY_USER +# define _HAVE_ARCH_CSUM_AND_COPY +# ifdef CONFIG_X86_32 +# include +# else +# include +# endif #endif diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f76747862bd2e..7ba5f61d72735 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -65,7 +65,9 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y) endif else obj-y += iomap_copy_64.o +ifneq ($(CONFIG_GENERIC_CSUM),y) lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o +endif lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o -- GitLab From 7cf8f44a5a1c0cb10a594996797e5a988cf0589d Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:12 +0200 Subject: [PATCH 1088/2223] x86: fs: kmsan: disable CONFIG_DCACHE_WORD_ACCESS 
dentry_string_cmp() calls read_word_at_a_time(), which might read uninitialized bytes to optimize string comparisons. Disabling CONFIG_DCACHE_WORD_ACCESS should prohibit this optimization, as well as (probably) similar ones. Link: https://lkml.kernel.org/r/20220915150417.722975-39-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Andrey Konovalov Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index eb158eb8a985f..edf7f12935d72 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -129,7 +129,9 @@ config X86 select CLKEVT_I8253 select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG - select DCACHE_WORD_ACCESS + # Word-size accesses may read uninitialized data past the trailing \0 + # in strings and cause false KMSAN reports. + select DCACHE_WORD_ACCESS if !KMSAN select DYNAMIC_SIGFRAME select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT -- GitLab From 37ad4ee8364255c73026a3c343403b5977fa7e79 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:13 +0200 Subject: [PATCH 1089/2223] x86: kmsan: don't instrument stack walking functions Upon function exit, KMSAN marks local variables as uninitialized. Further function calls may result in the compiler creating the stack frame where these local variables resided. 
This results in frame pointers being marked as uninitialized data, which is normally correct, because they are not stack-allocated. However stack unwinding functions are supposed to read and dereference the frame pointers, in which case KMSAN might be reporting uses of uninitialized values. To work around that, we mark update_stack_state(), unwind_next_frame() and show_trace_log_lvl() with __no_kmsan_checks, preventing all KMSAN reports inside those functions and making them return initialized values. Link: https://lkml.kernel.org/r/20220915150417.722975-40-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/kernel/dumpstack.c | 6 ++++++ arch/x86/kernel/unwind_frame.c | 11 +++++++++++ 2 files changed, 17 insertions(+) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index afae4dd774951..476eb504084e4 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -177,6 +177,12 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, } } +/* + * This function reads pointers from the stack and dereferences them. The + * pointers may not have their KMSAN shadow set up properly, which may result + * in false positive reports. Disable instrumentation to avoid those. 
+ */ +__no_kmsan_checks static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, const char *log_lvl) { diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 8e1c50c86e5db..d8ba93778ae32 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -183,6 +183,16 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) } #endif +/* + * While walking the stack, KMSAN may stomp on stale locals from other + * functions that were marked as uninitialized upon function exit, and + * now hold the call frame information for the current function (e.g. the frame + * pointer). Because KMSAN does not specifically mark call frames as + * initialized, false positive reports are possible. To prevent such reports, + * we mark the functions scanning the stack (here and below) with + * __no_kmsan_checks. + */ +__no_kmsan_checks static bool update_stack_state(struct unwind_state *state, unsigned long *next_bp) { @@ -250,6 +260,7 @@ static bool update_stack_state(struct unwind_state *state, return true; } +__no_kmsan_checks bool unwind_next_frame(struct unwind_state *state) { struct pt_regs *regs; -- GitLab From 6cae637fa26df867449c6bc20ea8bc693abe49b0 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:14 +0200 Subject: [PATCH 1090/2223] entry: kmsan: introduce kmsan_unpoison_entry_regs() struct pt_regs passed into IRQ entry code is set up by uninstrumented asm functions, therefore KMSAN may not notice the registers are initialized. kmsan_unpoison_entry_regs() unpoisons the contents of struct pt_regs, preventing potential false positives. Unlike kmsan_unpoison_memory(), it can be called under kmsan_in_runtime(), which is often the case in IRQ entry code. 
Link: https://lkml.kernel.org/r/20220915150417.722975-41-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 15 +++++++++++++++ kernel/entry/common.c | 5 +++++ mm/kmsan/hooks.c | 26 ++++++++++++++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index c473e0e21683c..e38ae3c346184 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -214,6 +214,17 @@ void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, */ void kmsan_handle_urb(const struct urb *urb, bool is_out); +/** + * kmsan_unpoison_entry_regs() - Handle pt_regs in low-level entry code. + * @regs: struct pt_regs pointer received from assembly code. + * + * KMSAN unpoisons the contents of the passed pt_regs, preventing potential + * false positive reports. Unlike kmsan_unpoison_memory(), + * kmsan_unpoison_entry_regs() can be called from the regions where + * kmsan_in_runtime() returns true, which is the case in early entry code. 
+ */ +void kmsan_unpoison_entry_regs(const struct pt_regs *regs); + #else static inline void kmsan_init_shadow(void) @@ -310,6 +321,10 @@ static inline void kmsan_handle_urb(const struct urb *urb, bool is_out) { } +static inline void kmsan_unpoison_entry_regs(const struct pt_regs *regs) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 063068a9ea9b3..846add8394c41 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,7 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs) user_exit_irqoff(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); instrumentation_end(); } @@ -352,6 +354,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) lockdep_hardirqs_off(CALLER_ADDR0); ct_irq_enter(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); instrumentation_end(); @@ -367,6 +370,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) */ lockdep_hardirqs_off(CALLER_ADDR0); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); rcu_irq_enter_check_tick(); trace_hardirqs_off_finish(); instrumentation_end(); @@ -452,6 +456,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) ct_nmi_enter(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); ftrace_nmi_enter(); instrumentation_end(); diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 79d7e73e2cfd8..35f6b6e6a908c 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -348,6 +348,32 @@ void kmsan_unpoison_memory(const void *address, size_t size) } EXPORT_SYMBOL(kmsan_unpoison_memory); +/* + * Version of kmsan_unpoison_memory() that can be called from within the KMSAN + * runtime. + * + * Non-instrumented IRQ entry functions receive struct pt_regs from assembly + * code. 
Those regs need to be unpoisoned, otherwise using them will result in + * false positives. + * Using kmsan_unpoison_memory() is not an option in entry code, because the + * return value of in_task() is inconsistent - as a result, certain calls to + * kmsan_unpoison_memory() are ignored. kmsan_unpoison_entry_regs() ensures that + * the registers are unpoisoned even if kmsan_in_runtime() is true in the early + * entry code. + */ +void kmsan_unpoison_entry_regs(const struct pt_regs *regs) +{ + unsigned long ua_flags; + + if (!kmsan_enabled) + return; + + ua_flags = user_access_save(); + kmsan_internal_unpoison_memory((void *)regs, sizeof(*regs), + KMSAN_POISON_NOCHECK); + user_access_restore(ua_flags); +} + void kmsan_check_memory(const void *addr, size_t size) { if (!kmsan_enabled) -- GitLab From a6a7aaba7f39ee439f3d42e4b5bfc6e7f762d126 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:15 +0200 Subject: [PATCH 1091/2223] bpf: kmsan: initialize BPF registers with zeroes When executing BPF programs, certain registers may get passed uninitialized to helper functions. E.g. when performing a JMP_CALL, registers BPF_R1-BPF_R5 are always passed to the helper, no matter how many of them are actually used. Passing uninitialized values as function parameters is technically undefined behavior, so we work around it by always initializing the registers. Link: https://lkml.kernel.org/r/20220915150417.722975-42-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1e10d088dbb7..547d139ab98af 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2002,7 +2002,7 @@ out: static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_EXT_REG]; \ + u64 regs[MAX_BPF_EXT_REG] = {}; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ -- GitLab From 1468c6f4558b1bcd92aa0400f2920f9dc7588402 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:16 +0200 Subject: [PATCH 1092/2223] mm: fs: initialize fsdata passed to write_begin/write_end interface Functions implementing the a_ops->write_end() interface accept the `void *fsdata` parameter that is supposed to be initialized by the corresponding a_ops->write_begin() (which accepts `void **fsdata`). However not all a_ops->write_begin() implementations initialize `fsdata` unconditionally, so it may get passed uninitialized to a_ops->write_end(), resulting in undefined behavior. Fix this by initializing fsdata with NULL before the call to write_begin(), rather than doing so in all possible a_ops implementations. This patch covers only the following cases found by running x86 KMSAN under syzkaller: - generic_perform_write() - cont_expand_zero() and generic_cont_expand_simple() - page_symlink() Other cases of passing uninitialized fsdata may persist in the codebase. 
Link: https://lkml.kernel.org/r/20220915150417.722975-43-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/buffer.c | 4 ++-- fs/namei.c | 2 +- mm/filemap.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index b4c9fff3ab6c9..b55252078e7bf 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2341,7 +2341,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; struct page *page; - void *fsdata; + void *fsdata = NULL; int err; err = inode_newsize_ok(inode, size); @@ -2367,7 +2367,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, const struct address_space_operations *aops = mapping->a_ops; unsigned int blocksize = i_blocksize(inode); struct page *page; - void *fsdata; + void *fsdata = NULL; pgoff_t index, curidx; loff_t curpos; unsigned zerofrom, offset, len; diff --git a/fs/namei.c b/fs/namei.c index 53b4bc094db23..076ae96ca0b14 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5088,7 +5088,7 @@ int page_symlink(struct inode *inode, const char *symname, int len) const struct address_space_operations *aops = mapping->a_ops; bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); struct page *page; - void *fsdata; 
+ void *fsdata = NULL; int err; unsigned int flags; diff --git a/mm/filemap.c b/mm/filemap.c index f27c93a581ab4..ec17bd1a3bb77 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3720,7 +3720,7 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) unsigned long offset; /* Offset into pagecache page */ unsigned long bytes; /* Bytes to write to page */ size_t copied; /* Bytes copied from user */ - void *fsdata; + void *fsdata = NULL; offset = (pos & (PAGE_SIZE - 1)); bytes = min_t(unsigned long, PAGE_SIZE - offset, -- GitLab From 4ca8cc8d1bbe582bfc7a4d80bd72cfd8d3d0e2e8 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:17 +0200 Subject: [PATCH 1093/2223] x86: kmsan: enable KMSAN builds for x86 Make KMSAN usable by adding the necessary Kconfig bits. Also declare x86-specific functions checking address validity in arch/x86/include/asm/kmsan.h. Link: https://lkml.kernel.org/r/20220915150417.722975-44-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 1 + arch/x86/include/asm/kmsan.h | 55 ++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 arch/x86/include/asm/kmsan.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index edf7f12935d72..dce94e84199d3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -169,6 +169,7 @@ config X86 select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KASAN_VMALLOC if X86_64 select HAVE_ARCH_KFENCE + select HAVE_ARCH_KMSAN if X86_64 select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h new file mode 100644 index 0000000000000..a790b865d0a68 --- /dev/null +++ b/arch/x86/include/asm/kmsan.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * x86 KMSAN support. + * + * Copyright (C) 2022, Google LLC + * Author: Alexander Potapenko + */ + +#ifndef _ASM_X86_KMSAN_H +#define _ASM_X86_KMSAN_H + +#ifndef MODULE + +#include +#include + +/* + * Taken from arch/x86/mm/physaddr.h to avoid using an instrumented version. + */ +static inline bool kmsan_phys_addr_valid(unsigned long addr) +{ + if (IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) + return !(addr >> boot_cpu_data.x86_phys_bits); + else + return true; +} + +/* + * Taken from arch/x86/mm/physaddr.c to avoid using an instrumented version. 
+ */ +static inline bool kmsan_virt_addr_valid(void *addr) +{ + unsigned long x = (unsigned long)addr; + unsigned long y = x - __START_KERNEL_map; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + if (unlikely(x > y)) { + x = y + phys_base; + + if (y >= KERNEL_IMAGE_SIZE) + return false; + } else { + x = y + (__START_KERNEL_map - PAGE_OFFSET); + + /* carry flag will be set if starting x was >= PAGE_OFFSET */ + if ((x > y) || !kmsan_phys_addr_valid(x)) + return false; + } + + return pfn_valid(x >> PAGE_SHIFT); +} + +#endif /* !MODULE */ + +#endif /* _ASM_X86_KMSAN_H */ -- GitLab From ce732a7520b093091c345cba1b84542d1abd83ed Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 28 Sep 2022 14:32:19 +0200 Subject: [PATCH 1094/2223] x86: kmsan: handle CPU entry area Among other data, CPU entry area holds exception stacks, so addresses from this area can be passed to kmsan_get_metadata(). This previously led to kmsan_get_metadata() returning NULL, which in turn resulted in a warning that triggered further attempts to call kmsan_get_metadata() in the exception context, which quickly exhausted the exception stack. This patch allocates shadow and origin for the CPU entry area on x86 and introduces arch_kmsan_get_meta_or_null(), which performs arch-specific metadata mapping. Link: https://lkml.kernel.org/r/20220928123219.1101883-1-glider@google.com Signed-off-by: Alexander Potapenko Fixes: 21d723a7c1409 ("kmsan: add KMSAN runtime core") Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + arch/x86/include/asm/kmsan.h | 32 ++++++++++++++++++++++++++++++++ arch/x86/mm/Makefile | 3 +++ arch/x86/mm/kmsan_shadow.c | 20 ++++++++++++++++++++ mm/kmsan/shadow.c | 6 +++++- 5 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 arch/x86/mm/kmsan_shadow.c diff --git a/MAINTAINERS b/MAINTAINERS index 3c7dfe9bb7129..456b07f028034 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11379,6 +11379,7 @@ L: kasan-dev@googlegroups.com S: Maintained F: Documentation/dev-tools/kmsan.rst F: arch/*/include/asm/kmsan.h +F: arch/*/mm/kmsan_* F: include/linux/kmsan*.h F: lib/Kconfig.kmsan F: mm/kmsan/ diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h index a790b865d0a68..8fa6ac0e2d766 100644 --- a/arch/x86/include/asm/kmsan.h +++ b/arch/x86/include/asm/kmsan.h @@ -11,9 +11,41 @@ #ifndef MODULE +#include #include #include +DECLARE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_shadow); +DECLARE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_origin); + +/* + * Functions below are declared in the header to make sure they are inlined. + * They all are called from kmsan_get_metadata() for every memory access in + * the kernel, so speed is important here. + */ + +/* + * Compute metadata addresses for the CPU entry area on x86. 
+ */ +static inline void *arch_kmsan_get_meta_or_null(void *addr, bool is_origin) +{ + unsigned long addr64 = (unsigned long)addr; + char *metadata_array; + unsigned long off; + int cpu; + + if ((addr64 < CPU_ENTRY_AREA_BASE) || + (addr64 >= (CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE))) + return NULL; + cpu = (addr64 - CPU_ENTRY_AREA_BASE) / CPU_ENTRY_AREA_SIZE; + off = addr64 - (unsigned long)get_cpu_entry_area(cpu); + if ((off < 0) || (off >= CPU_ENTRY_AREA_SIZE)) + return NULL; + metadata_array = is_origin ? cpu_entry_area_origin : + cpu_entry_area_shadow; + return &per_cpu(metadata_array[off], cpu); +} + /* * Taken from arch/x86/mm/physaddr.h to avoid using an instrumented version. */ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index afb6f7187dad0..c80febc44cd2f 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -46,6 +46,9 @@ obj-$(CONFIG_HIGHMEM) += highmem_32.o KASAN_SANITIZE_kasan_init_$(BITS).o := n obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o +KMSAN_SANITIZE_kmsan_shadow.o := n +obj-$(CONFIG_KMSAN) += kmsan_shadow.o + obj-$(CONFIG_MMIOTRACE) += mmiotrace.o mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/mm/kmsan_shadow.c b/arch/x86/mm/kmsan_shadow.c new file mode 100644 index 0000000000000..bee2ec4a3bfa8 --- /dev/null +++ b/arch/x86/mm/kmsan_shadow.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * x86-specific bits of KMSAN shadow implementation. + * + * Copyright (C) 2022 Google LLC + * Author: Alexander Potapenko + */ + +#include +#include + +/* + * Addresses within the CPU entry area (including e.g. exception stacks) do not + * have struct page entries corresponding to them, so they need separate + * handling. + * arch_kmsan_get_meta_or_null() (declared in the header) maps the addresses in + * CPU entry area to addresses in cpu_entry_area_shadow/cpu_entry_area_origin. 
+ */ +DEFINE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_shadow); +DEFINE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_origin); diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 6e90a806a7045..21e3e196ec3cf 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -126,6 +125,7 @@ void *kmsan_get_metadata(void *address, bool is_origin) { u64 addr = (u64)address, pad, off; struct page *page; + void *ret; if (is_origin && !IS_ALIGNED(addr, KMSAN_ORIGIN_SIZE)) { pad = addr % KMSAN_ORIGIN_SIZE; @@ -136,6 +136,10 @@ void *kmsan_get_metadata(void *address, bool is_origin) kmsan_internal_is_module_addr(address)) return (void *)vmalloc_meta(address, is_origin); + ret = arch_kmsan_get_meta_or_null(address, is_origin); + if (ret) + return ret; + page = virt_to_page_or_null(address); if (!page) return NULL; -- GitLab From 871f697b494b04f8d78cc090e49b416062d23a10 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 15 Sep 2022 22:22:36 +0800 Subject: [PATCH 1095/2223] mm/damon/sysfs: avoid call damon_target_has_pid() repeatedly In damon_sysfs_destroy_targets(), we call damon_target_has_pid() to check whether the 'ctx' include a valid pid, but there no need to call damon_target_has_pid() to check repeatedly, just need call it once. 
[xhao@linux.alibaba.com: more simplified code calls damon_target_has_pid()] Link: https://lkml.kernel.org/r/20220916133535.7428-1-xhao@linux.alibaba.com Link: https://lkml.kernel.org/r/20220915142237.92529-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 1fa0023f136eb..0cca1909bf67b 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2143,9 +2143,10 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) { struct damon_target *t, *next; + bool has_pid = damon_target_has_pid(ctx); damon_for_each_target_safe(t, next, ctx) { - if (damon_target_has_pid(ctx)) + if (has_pid) put_pid(t->pid); damon_destroy_target(t); } -- GitLab From a07b8eafa43fdbe1df33256b06d625c80829e557 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 15 Sep 2022 13:30:41 +0000 Subject: [PATCH 1096/2223] mm/damon: simplify scheme create in lru_sort.c In damon_lru_sort_new_hot_scheme() and damon_lru_sort_new_cold_scheme(), they have so much in common, so we can combine them into a single function, and we just need to distinguish their differences. 
[yangyingliang@huawei.com: change damon_lru_sort_stub_pattern to static] Link: https://lkml.kernel.org/r/20220917121228.1889699-1-yangyingliang@huawei.com Link: https://lkml.kernel.org/r/20220915133041.71819-1-sj@kernel.org Signed-off-by: Xin Hao Signed-off-by: SeongJae Park Signed-off-by: Yang Yingliang Suggested-by: SeongJae Park Reviewed-by: Xin Hao Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 07a0908963fd0..a91c1e364fc7b 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -132,6 +132,18 @@ DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat, lru_sort_tried_cold_regions, lru_sorted_cold_regions, cold_quota_exceeds); +static struct damos_access_pattern damon_lru_sort_stub_pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* no matter its access frequency */ + .min_nr_accesses = 0, + .max_nr_accesses = UINT_MAX, + /* no matter its age */ + .min_age_region = 0, + .max_age_region = UINT_MAX, +}; + static struct damon_ctx *ctx; static struct damon_target *target; @@ -157,36 +169,19 @@ static struct damos *damon_lru_sort_new_scheme( /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { - struct damos_access_pattern pattern = { - /* Find regions having PAGE_SIZE or larger size */ - .min_sz_region = PAGE_SIZE, - .max_sz_region = ULONG_MAX, - /* and accessed for more than the threshold */ - .min_nr_accesses = hot_thres, - .max_nr_accesses = UINT_MAX, - /* no matter its age */ - .min_age_region = 0, - .max_age_region = UINT_MAX, - }; + struct damos_access_pattern pattern = damon_lru_sort_stub_pattern; + pattern.min_nr_accesses = hot_thres; return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_PRIO); } /* Create a DAMON-based 
operation scheme for cold memory regions */ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) { - struct damos_access_pattern pattern = { - /* Find regions having PAGE_SIZE or larger size */ - .min_sz_region = PAGE_SIZE, - .max_sz_region = ULONG_MAX, - /* and not accessed at all */ - .min_nr_accesses = 0, - .max_nr_accesses = 0, - /* for min_age or more micro-seconds */ - .min_age_region = cold_thres, - .max_age_region = UINT_MAX, - }; + struct damos_access_pattern pattern = damon_lru_sort_stub_pattern; + pattern.max_nr_accesses = 0; + pattern.min_age_region = cold_thres; return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); } -- GitLab From 16bc1b0f0269b6110f6d25880b52947d354e2980 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 15 Sep 2022 19:33:41 +0800 Subject: [PATCH 1097/2223] mm/damon: use 'struct damon_target *' instead of 'void *' in target_valid() We could use 'struct damon_target *' directly instead of 'void *' in target_valid() operation to make code simple. 
Link: https://lkml.kernel.org/r/1663241621-13293-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- mm/damon/vaddr.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index c5dc0c77c7722..1dda8d0068e54 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -346,7 +346,7 @@ struct damon_operations { unsigned long (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); - bool (*target_valid)(void *target); + bool (*target_valid)(struct damon_target *t); void (*cleanup)(struct damon_ctx *context); }; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 3f84584f99826..f53c2ff2bcc8a 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -593,9 +593,8 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) * Functions for the target validity check and cleanup */ -static bool damon_va_target_valid(void *target) +static bool damon_va_target_valid(struct damon_target *t) { - struct damon_target *t = target; struct task_struct *task; task = damon_get_task_struct(t); -- GitLab From 81f8f57f853ed6fb8ae9bbc96e3aead10d6e248a Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 15 Sep 2022 10:10:23 +0800 Subject: [PATCH 1098/2223] mm/damon/reclaim: change damon_reclaim_wmarks to static damon_reclaim_wmarks is only used in reclaim.c now, change it to static. 
Link: https://lkml.kernel.org/r/20220915021024.4177940-1-yangyingliang@huawei.com Fixes: 89dd02d8abd1 ("mm/damon/reclaim: use watermarks parameters generator macro") Signed-off-by: Yang Yingliang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 1acf808e16242..039fa55e0ae9c 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -64,7 +64,7 @@ static struct damos_quota damon_reclaim_quota = { }; DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota); -struct damos_watermarks damon_reclaim_wmarks = { +static struct damos_watermarks damon_reclaim_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = 5000000, /* 5 seconds */ .high = 500, /* 50 percent */ -- GitLab From e47b082579f307d0367b1fb7ca3698fd9c73a88b Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 15 Sep 2022 10:10:24 +0800 Subject: [PATCH 1099/2223] mm/damon/lru_sort: change damon_lru_sort_wmarks to static damon_lru_sort_wmarks is only used in lru_sort.c now, change it to static. 
Link: https://lkml.kernel.org/r/20220915021024.4177940-2-yangyingliang@huawei.com Fixes: 189aa3d58206 ("mm/damon/lru_sort: use watermarks parameters generator macro") Signed-off-by: Yang Yingliang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index a91c1e364fc7b..4a40054ba03bf 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -77,7 +77,7 @@ static struct damos_quota damon_lru_sort_quota = { }; DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota); -struct damos_watermarks damon_lru_sort_wmarks = { +static struct damos_watermarks damon_lru_sort_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = 5000000, /* 5 seconds */ .high = 200, /* 20 percent */ -- GitLab From 1ea41595f606e21ba422c59dcdc637f9a9513f2e Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Thu, 15 Sep 2022 09:16:02 +0800 Subject: [PATCH 1100/2223] mm/secretmem: add __init annotation to secretmem_init() It's a fs_initcall entry, add __init annotation to it. Link: https://lkml.kernel.org/r/20220915011602.176967-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Cc: Mike Rapoport Signed-off-by: Andrew Morton --- mm/secretmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/secretmem.c b/mm/secretmem.c index 3f71540997958..6a44efb673b2c 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -276,7 +276,7 @@ static struct file_system_type secretmem_fs = { .kill_sb = kill_anon_super, }; -static int secretmem_init(void) +static int __init secretmem_init(void) { int ret = 0; -- GitLab From cc713520bdc1b84fc5394f6ac8649b93ad2c5dde Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Fri, 16 Sep 2022 23:20:35 +0800 Subject: [PATCH 1101/2223] mm/damon: return void from damon_set_schemes() There is no point in returning an int from damon_set_schemes(). 
It always returns 0 which is meaningless for the caller, so change it to return void directly. Link: https://lkml.kernel.org/r/1663341635-12675-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- mm/damon/core.c | 5 +---- mm/damon/dbgfs.c | 8 +++----- mm/damon/lru_sort.c | 4 +--- mm/damon/reclaim.c | 4 +--- 5 files changed, 7 insertions(+), 16 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 1dda8d0068e54..e7808a84675fb 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -541,7 +541,7 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs); -int damon_set_schemes(struct damon_ctx *ctx, +void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); bool damon_is_registered_ops(enum damon_ops_id id); diff --git a/mm/damon/core.c b/mm/damon/core.c index a843673c11cfc..9c80c6eb00c24 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -454,10 +454,8 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) * * This function should not be called while the kdamond of the context is * running. - * - * Return: 0 if success, or negative error code otherwise. 
*/ -int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, +void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes) { struct damos *s, *next; @@ -467,7 +465,6 @@ int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, damon_destroy_scheme(s); for (i = 0; i < nr_schemes; i++) damon_add_scheme(ctx, schemes[i]); - return 0; } /** diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index c00eba4448d85..6f0ae7d3ae39b 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -307,11 +307,9 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, goto unlock_out; } - ret = damon_set_schemes(ctx, schemes, nr_schemes); - if (!ret) { - ret = count; - nr_schemes = 0; - } + damon_set_schemes(ctx, schemes, nr_schemes); + ret = count; + nr_schemes = 0; unlock_out: mutex_unlock(&ctx->kdamond_lock); diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 4a40054ba03bf..d7eb72b41cb67 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -203,9 +203,7 @@ static int damon_lru_sort_apply_parameters(void) scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!scheme) return -ENOMEM; - err = damon_set_schemes(ctx, &scheme, 1); - if (err) - return err; + damon_set_schemes(ctx, &scheme, 1); cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; scheme = damon_lru_sort_new_cold_scheme(cold_thres); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 039fa55e0ae9c..3d59ab11b7b39 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -155,9 +155,7 @@ static int damon_reclaim_apply_parameters(void) scheme = damon_reclaim_new_scheme(); if (!scheme) return -ENOMEM; - err = damon_set_schemes(ctx, &scheme, 1); - if (err) - return err; + damon_set_schemes(ctx, &scheme, 1); if (monitor_region_start > monitor_region_end) return -EINVAL; -- GitLab From 3ae6d3e30a52a7af222f284d0bf5d424b4f2f365 Mon Sep 17 00:00:00 2001 From: Chih-En Lin Date: Fri, 16 Sep 2022 17:04:34 +0800 Subject: 
[PATCH 1102/2223] mm/page_table_check: fix typos Link: https://lkml.kernel.org/r/20220916090434.701194-1-shiyn.lin@gmail.com Signed-off-by: Chih-En Lin Acked-by: Pasha Tatashin Signed-off-by: Andrew Morton --- mm/page_table_check.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 903db62794d3c..433dbce13fe1d 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -53,7 +53,7 @@ static struct page_table_check *get_page_table_check(struct page_ext *page_ext) } /* - * An enty is removed from the page table, decrement the counters for that page + * An entry is removed from the page table, decrement the counters for that page * verify that it is of correct type and counters do not become negative. */ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, @@ -87,7 +87,7 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, } /* - * A new enty is added to the page table, increment the counters for that page + * A new entry is added to the page table, increment the counters for that page * verify that it is of correct type and is not being mapped with a different * type to a different process. */ -- GitLab From ce96fa6223ee851cb83118678f6e75f260852a80 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:42 +0800 Subject: [PATCH 1103/2223] mm/page_alloc: ensure kswapd doesn't accidentally go to sleep Patch series "A few cleanup patches for mm", v2. This series contains a few cleanup patches to remove the obsolete comments and functions, use helper macro to improve readability and so on. More details can be found in the respective changelogs. This patch (of 16): If ALLOC_KSWAPD is set, wake_all_kswapds() will be called to ensure kswapd doesn't accidentally go to sleep. But when reserve_flags is set, alloc_flags will be overwritten and ALLOC_KSWAPD is thus lost. 
Preserve the ALLOC_KSWAPD flag in alloc_flags to ensure kswapd won't go to sleep accidentally. Link: https://lkml.kernel.org/r/20220916072257.9639-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220916072257.9639-2-linmiaohe@huawei.com Fixes: 0a79cdad5eb2 ("mm: use alloc_flags to record if kswapd can wake") Signed-off-by: Miaohe Lin Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Anshuman Khandual Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c7e9451c69fc7..24caa11db8aee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5156,7 +5156,8 @@ retry: reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) - alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags); + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) | + (alloc_flags & ALLOC_KSWAPD); /* * Reset the nodemask and zonelist iterators if memory policies can be -- GitLab From b89f1735169b8ab54b6a03bf4823657ee4e30073 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:43 +0800 Subject: [PATCH 1104/2223] mm/page_alloc: make zone_pcp_update() static Since commit b92ca18e8ca5 ("mm/page_alloc: disassociate the pcp->high from pcp->batch"), zone_pcp_update() is only used in mm/page_alloc.c. Move zone_pcp_update() up to avoid forward declaration and then make it static. No functional change intended. 
Link: https://lkml.kernel.org/r/20220916072257.9639-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/internal.h | 1 - mm/page_alloc.c | 22 +++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index fea3cba154844..f46cd8a6694a8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -367,7 +367,6 @@ extern int user_min_free_kbytes; extern void free_unref_page(struct page *page, unsigned int order); extern void free_unref_page_list(struct list_head *list); -extern void zone_pcp_update(struct zone *zone, int cpu_online); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 24caa11db8aee..49efaf9963d1c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7242,6 +7242,17 @@ void __meminit setup_zone_pageset(struct zone *zone) zone_set_pageset_high_and_batch(zone, 0); } +/* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalculated. + */ +static void zone_pcp_update(struct zone *zone, int cpu_online) +{ + mutex_lock(&pcp_batch_high_lock); + zone_set_pageset_high_and_batch(zone, cpu_online); + mutex_unlock(&pcp_batch_high_lock); +} + /* * Allocate per cpu pagesets and initialize them. * Before this call only boot pagesets were available. @@ -9473,17 +9484,6 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages) } EXPORT_SYMBOL(free_contig_range); -/* - * The zone indicated has a new number of managed_pages; batch sizes and percpu - * page high values need to be recalculated. 
- */ -void zone_pcp_update(struct zone *zone, int cpu_online) -{ - mutex_lock(&pcp_batch_high_lock); - zone_set_pageset_high_and_batch(zone, cpu_online); - mutex_unlock(&pcp_batch_high_lock); -} - /* * Effectively disable pcplists for the zone by setting the high limit to 0 * and draining all cpus. A concurrent page freeing on another CPU that's about -- GitLab From 638a9ae97ab596f1f7b7522dad709e69cb5b4e9d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:44 +0800 Subject: [PATCH 1105/2223] mm: remove obsolete macro NR_PCP_ORDER_MASK and NR_PCP_ORDER_WIDTH Since commit 8b10b465d0e1 ("mm/page_alloc: free pages in a single pass during bulk free"), they're not used anymore. Remove them. Link: https://lkml.kernel.org/r/20220916072257.9639-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 7 ------- mm/page_alloc.c | 1 - 2 files changed, 8 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c69c081568227..3ff1e757d5aab 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -564,13 +564,6 @@ enum zone_watermarks { #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) -/* - * Shift to encode migratetype and order in the same integer, with order - * in the least significant bits. 
- */ -#define NR_PCP_ORDER_WIDTH 8 -#define NR_PCP_ORDER_MASK ((1<<NR_PCP_ORDER_WIDTH) - 1) - #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 49efaf9963d1c..fa3dd2e1d5664 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1584,7 +1584,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, order = pindex_to_order(pindex); nr_pages = 1 << order; - BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH)); do { int mt; -- GitLab From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:45 +0800 Subject: [PATCH 1106/2223] mm/page_alloc: remove obsolete comment in zone_statistics() Since commit 43c95bcc51e4 ("mm/page_alloc: reduce duration that IRQs are disabled for VM counters"), zone_statistics() is not called with interrupts disabled. Update the corresponding comment. Link: https://lkml.kernel.org/r/20220916072257.9639-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fa3dd2e1d5664..b94d3a42cb8bf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3671,8 +3671,6 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt) /* * Update NUMA hit/miss statistics - * - * Must be called with interrupts disabled. */ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, long nr_account) -- GitLab From 5749fcc5f04cef4091dea0c2ba6b5c5f5e05a549 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:46 +0800 Subject: [PATCH 1107/2223] mm/page_alloc: add __init annotations to init_mem_debugging_and_hardening() It's only called by mm_init(). Add __init annotations to it.
Link: https://lkml.kernel.org/r/20220916072257.9639-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/page_alloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a37c8a29c49ba..8bbcccbc55654 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3092,7 +3092,7 @@ extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); -extern void init_mem_debugging_and_hardening(void); +extern void __init init_mem_debugging_and_hardening(void); #ifdef CONFIG_PAGE_POISONING extern void __kernel_poison_pages(struct page *page, int numpages); extern void __kernel_unpoison_pages(struct page *page, int numpages); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b94d3a42cb8bf..21261f55dab1e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -903,7 +903,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, * order of appearance. So we need to first gather the full picture of what was * enabled, and then make decisions. */ -void init_mem_debugging_and_hardening(void) +void __init init_mem_debugging_and_hardening(void) { bool page_poisoning_requested = false; -- GitLab From 022e7fa0f73d7c90cf3d6bea3d4e4cc5df1e1087 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:47 +0800 Subject: [PATCH 1108/2223] mm/page_alloc: fix freeing static percpu memory The size of struct per_cpu_zonestat can be 0 on !SMP && !NUMA. In that case, zone->per_cpu_zonestats will always equal to boot_zonestats. But in zone_pcp_reset(), zone->per_cpu_zonestats is freed via free_percpu() directly without checking against boot_zonestats first. boot_zonestats will be released by free_percpu() unexpectedly. 
Link: https://lkml.kernel.org/r/20220916072257.9639-7-linmiaohe@huawei.com Fixes: 28f836b6777b ("mm/page_alloc: split per cpu page lists and zone stats") Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Anshuman Khandual Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 21261f55dab1e..43114e172592d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -9513,9 +9513,11 @@ void zone_pcp_reset(struct zone *zone) drain_zonestat(zone, pzstats); } free_percpu(zone->per_cpu_pageset); - free_percpu(zone->per_cpu_zonestats); zone->per_cpu_pageset = &boot_pageset; - zone->per_cpu_zonestats = &boot_zonestats; + if (zone->per_cpu_zonestats != &boot_zonestats) { + free_percpu(zone->per_cpu_zonestats); + zone->per_cpu_zonestats = &boot_zonestats; + } } } -- GitLab From 30e3b5d7c82f78c63c53197b5d8b99636bb60d56 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:48 +0800 Subject: [PATCH 1109/2223] mm: remove obsolete pgdat_is_empty() There's no caller. Remove it. 
Link: https://lkml.kernel.org/r/20220916072257.9639-8-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3ff1e757d5aab..4c8510f26b02b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1241,11 +1241,6 @@ static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) return pgdat->node_start_pfn + pgdat->node_spanned_pages; } -static inline bool pgdat_is_empty(pg_data_t *pgdat) -{ - return !pgdat->node_start_pfn && !pgdat->node_spanned_pages; -} - #include void build_all_zonelists(pg_data_t *pgdat); -- GitLab From b36184553d41c59e6712f9d4699aca24577fbd4a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:49 +0800 Subject: [PATCH 1110/2223] mm/page_alloc: add missing is_migrate_isolate() check in set_page_guard() In MIGRATE_ISOLATE case, zone freepage state shouldn't be modified as caller will take care of it. Add missing is_migrate_isolate() here to avoid possible unbalanced freepage state. This would happen if someone isolates the block, and then we face an MCE failure/soft-offline on a page within that block. 
__mod_zone_freepage_state() will be triggered via below call trace which already had been triggered back when block was isolated: take_page_off_buddy break_down_buddy_pages set_page_guard Link: https://lkml.kernel.org/r/20220916072257.9639-9-linmiaohe@huawei.com Fixes: 06be6ff3d2ec ("mm,hwpoison: rework soft offline for free pages") Signed-off-by: Miaohe Lin Reviewed-by: Oscar Salvador Reviewed-by: David Hildenbrand Cc: Anshuman Khandual Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 43114e172592d..c055b4cd37f01 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -873,7 +873,8 @@ static inline bool set_page_guard(struct zone *zone, struct page *page, INIT_LIST_HEAD(&page->buddy_list); set_page_private(page, order); /* Guard pages are not available for any usage */ - __mod_zone_freepage_state(zone, -(1 << order), migratetype); + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, -(1 << order), migratetype); return true; } -- GitLab From c035290424a9b7b64477752058b460d0ecc21987 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:50 +0800 Subject: [PATCH 1111/2223] mm/page_alloc: use local variable zone_idx directly Use local variable zone_idx directly since it holds the exact value of zone_idx(). No functional change intended. 
Link: https://lkml.kernel.org/r/20220916072257.9639-10-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c055b4cd37f01..ec865cfd0c3a7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6881,7 +6881,7 @@ void __ref memmap_init_zone_device(struct zone *zone, unsigned long start = jiffies; int nid = pgdat->node_id; - if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE)) + if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE)) return; /* -- GitLab From f774a6a6fd39e1b5677bdf71f6813b382faddeeb Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:51 +0800 Subject: [PATCH 1112/2223] mm, memory_hotplug: remove obsolete generic_free_nodedata() Commit 390511e1476e ("mm, memory_hotplug: drop arch_free_nodedata") drops the last caller of generic_free_nodedata(). Remove it too. Link: https://lkml.kernel.org/r/20220916072257.9639-11-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 51052969dbfe4..9fcbf57065957 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -43,11 +43,6 @@ extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); ({ \ memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \ }) -/* - * This definition is just for error path in node hotadd. - * For node hotremove, we have to replace this. 
- */ -#define generic_free_nodedata(pgdat) kfree(pgdat) extern pg_data_t *node_data[]; static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) @@ -63,9 +58,6 @@ static inline pg_data_t *generic_alloc_nodedata(int nid) BUG(); return NULL; } -static inline void generic_free_nodedata(pg_data_t *pgdat) -{ -} static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) { } -- GitLab From 6dc2c87a5a8878b657d08e34ca0e757d31273e12 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:52 +0800 Subject: [PATCH 1113/2223] mm/page_alloc: make boot_nodestats static It's only used in mm/page_alloc.c now. Make it static. Link: https://lkml.kernel.org/r/20220916072257.9639-12-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/internal.h | 2 -- mm/page_alloc.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index f46cd8a6694a8..6b7ef495b56d3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -836,8 +836,6 @@ int migrate_device_coherent_page(struct page *page); */ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); -DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); - extern bool mirrored_kernelcore; static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ec865cfd0c3a7..0f856b4ce3b0d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6579,7 +6579,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta #define BOOT_PAGESET_BATCH 1 static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); -DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); +static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void __build_all_zonelists(void *data) { -- GitLab 
From c940e0207a1c307fdab92b32d0522271036fc3ef Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:53 +0800 Subject: [PATCH 1114/2223] mm/page_alloc: use helper macro SZ_1{K,M} Use helper macro SZ_1K and SZ_1M to do the size conversion. Minor readability improvement. Link: https://lkml.kernel.org/r/20220916072257.9639-13-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/page_alloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0f856b4ce3b0d..3216477d9ba65 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7056,7 +7056,7 @@ static int zone_batchsize(struct zone *zone) * size is striking a balance between allocation latency * and zone lock contention. */ - batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE); + batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE); batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; @@ -8531,8 +8531,8 @@ void __init mem_init_print_info(void) #endif ")\n", K(nr_free_pages()), K(physpages), - codesize >> 10, datasize >> 10, rosize >> 10, - (init_data_size + init_code_size) >> 10, bss_size >> 10, + codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K, + (init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K, K(physpages - totalram_pages() - totalcma_pages), K(totalcma_pages) #ifdef CONFIG_HIGHMEM @@ -9057,8 +9057,8 @@ void *__init alloc_large_system_hash(const char *tablename, numentries -= arch_reserved_kernel_pages(); /* It isn't necessary when PAGE_SIZE >= 1MB */ - if (PAGE_SHIFT < 20) - numentries = round_up(numentries, (1<<20)/PAGE_SIZE); + if (PAGE_SIZE < SZ_1M) + numentries = round_up(numentries, SZ_1M / PAGE_SIZE); #if __BITS_PER_LONG > 32 if (!high_limit) { -- GitLab From dae37a5dccd104fc465241c42d9e17756ddebbc1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 
15:22:54 +0800 Subject: [PATCH 1115/2223] mm/page_alloc: init local variable buddy_pfn The local variable buddy_pfn could be passed to buddy_merge_likely() without initialization if the passed-in order is MAX_ORDER - 1. This looks buggy but buddy_pfn won't be used in this case as there's an order >= MAX_ORDER - 2 check. Init buddy_pfn to 0 anyway to avoid possible future misuse. Link: https://lkml.kernel.org/r/20220916072257.9639-14-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3216477d9ba65..4dc2fe575fc8d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1113,7 +1113,7 @@ static inline void __free_one_page(struct page *page, int migratetype, fpi_t fpi_flags) { struct capture_control *capc = task_capc(zone); - unsigned long buddy_pfn; + unsigned long buddy_pfn = 0; unsigned long combined_pfn; struct page *buddy; bool to_tail; -- GitLab From 896c4d52538df231c3847491acc4f2c23891fe6a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:55 +0800 Subject: [PATCH 1116/2223] mm/page_alloc: use costly_order in WARN_ON_ONCE_GFP() There's no need to check whether order > PAGE_ALLOC_COSTLY_ORDER again. Minor readability improvement. Link: https://lkml.kernel.org/r/20220916072257.9639-15-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4dc2fe575fc8d..23f839e1d89e3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5280,7 +5280,7 @@ nopage: * so that we can identify them and convert them to something * else.
*/ - WARN_ON_ONCE_GFP(order > PAGE_ALLOC_COSTLY_ORDER, gfp_mask); + WARN_ON_ONCE_GFP(costly_order, gfp_mask); /* * Help non-failing allocations by giving them access to memory -- GitLab From def76fd549c513bb90278a8d6d0fe3ef3faa20a7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:56 +0800 Subject: [PATCH 1117/2223] mm/page_alloc: remove obsolete gfpflags_normal_context() Since commit dacb5d8875cc ("tcp: fix page frag corruption on page fault"), there's no caller of gfpflags_normal_context(). Remove it as this helper is strictly tied to the sk page frag usage and there won't be other user in the future. [linmiaohe@huawei.com: fix htmldocs] Link: https://lkml.kernel.org/r/1bc55727-9b66-0e9e-c306-f10c4716ea89@huawei.com Link: https://lkml.kernel.org/r/20220916072257.9639-16-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- Documentation/core-api/mm-api.rst | 3 --- include/linux/gfp.h | 23 ----------------------- 2 files changed, 26 deletions(-) diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 1ebcc6c3fafe7..f5dde5bceaeaf 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -19,9 +19,6 @@ User Space Memory Access Memory Allocation Controls ========================== -.. kernel-doc:: include/linux/gfp.h - :internal: - .. kernel-doc:: include/linux/gfp_types.h :doc: Page mobility and placement hints diff --git a/include/linux/gfp.h b/include/linux/gfp.h index ea6cb9399152e..ef4aea3b356e7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -36,29 +36,6 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) return !!(gfp_flags & __GFP_DIRECT_RECLAIM); } -/** - * gfpflags_normal_context - is gfp_flags a normal sleepable context? 
- * @gfp_flags: gfp_flags to test - * - * Test whether @gfp_flags indicates that the allocation is from the - * %current context and allowed to sleep. - * - * An allocation being allowed to block doesn't mean it owns the %current - * context. When direct reclaim path tries to allocate memory, the - * allocation context is nested inside whatever %current was doing at the - * time of the original allocation. The nested allocation may be allowed - * to block but modifying anything %current owns can corrupt the outer - * context's expectations. - * - * %true result from this function indicates that the allocation context - * can sleep and use anything that's associated with %current. - */ -static inline bool gfpflags_normal_context(const gfp_t gfp_flags) -{ - return (gfp_flags & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC)) == - __GFP_DIRECT_RECLAIM; -} - #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else -- GitLab From c9b3637f8a5a4c869f78c26773c559669796212f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:57 +0800 Subject: [PATCH 1118/2223] mm/page_alloc: fix obsolete comment in deferred_pfn_valid() There are no architectures that can have holes in the memory map within a pageblock since commit 859a85ddf90e ("mm: remove pfn_valid_within() and CONFIG_HOLES_IN_ZONE"). Update the corresponding comment. Link: https://lkml.kernel.org/r/20220916072257.9639-17-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23f839e1d89e3..66f7778732fbd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1929,11 +1929,7 @@ static inline void __init pgdat_init_report_one_done(void) /* * Returns true if page needs to be initialized or freed to buddy allocator. 
* - * First we check if pfn is valid on architectures where it is possible to have - * holes within pageblock_nr_pages. On systems where it is not possible, this - * function is optimized out. - * - * Then, we check if a current large page is valid by only checking the validity + * We check if a current large page is valid by only checking the validity * of the head pfn. */ static inline bool __init deferred_pfn_valid(unsigned long pfn) -- GitLab From 2b21624fc23277553ef254b3ad02c37afa1c484d Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 16 Sep 2022 14:46:38 -0700 Subject: [PATCH 1119/2223] hugetlb: freeze allocated pages before creating hugetlb pages When creating hugetlb pages, the hugetlb code must first allocate contiguous pages from a low level allocator such as buddy, cma or memblock. The pages returned from these low level allocators are ref counted. This creates potential issues with other code taking speculative references on these pages before they can be transformed to a hugetlb page. This issue has been addressed with methods and code such as that provided in [1]. Recent discussions about vmemmap freeing [2] have indicated that it would be beneficial to freeze all sub pages, including the head page of pages returned from low level allocators before converting to a hugetlb page. This helps avoid races if we want to replace the page containing vmemmap for the head page. There have been proposals to change at least the buddy allocator to return frozen pages as described at [3]. If such a change is made, it can be employed by the hugetlb code. However, as mentioned above hugetlb uses several low level allocators so each would need to be modified to return frozen pages. For now, we can manually freeze the returned pages. This is done in two places: 1) alloc_buddy_huge_page, only the returned head page is ref counted. We freeze the head page, retrying once in the VERY rare case where there may be an inflated ref count. 
2) prep_compound_gigantic_page, for gigantic pages the current code freezes all pages except the head page. New code will simply freeze the head page as well. In a few other places, code checks for inflated ref counts on newly allocated hugetlb pages. With the modifications to freeze after allocating, this code can be removed. After hugetlb pages are freshly allocated, they are often added to the hugetlb free lists. Since these pages were previously ref counted, this was done via put_page() which would end up calling the hugetlb destructor: free_huge_page. With changes to freeze pages, we simply call free_huge_page directly to add the pages to the free list. In a few other places, freshly allocated hugetlb pages were immediately put into use, and the expectation was they were already ref counted. In these cases, we must manually ref count the page. [1] https://lore.kernel.org/linux-mm/20210622021423.154662-3-mike.kravetz@oracle.com/ [2] https://lore.kernel.org/linux-mm/20220802180309.19340-1-joao.m.martins@oracle.com/ [3] https://lore.kernel.org/linux-mm/20220809171854.3725722-1-willy@infradead.org/ [mike.kravetz@oracle.com: fix NULL pointer dereference] Link: https://lkml.kernel.org/r/20220921202702.106069-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20220916214638.155744-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Oscar Salvador Reviewed-by: Muchun Song Reviewed-by: Miaohe Lin Cc: Joao Martins Cc: Matthew Wilcox Cc: Michal Hocko Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/hugetlb.c | 102 +++++++++++++++++++-------------------------------- 1 file changed, 38 insertions(+), 64 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b0e39045a7a86..2182134216f09 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1787,9 +1787,8 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, /* we rely on prep_new_huge_page to set the destructor */ set_compound_order(page, order); - 
__ClearPageReserved(page); __SetPageHead(page); - for (i = 1; i < nr_pages; i++) { + for (i = 0; i < nr_pages; i++) { p = nth_page(page, i); /* @@ -1830,17 +1829,19 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, } else { VM_BUG_ON_PAGE(page_count(p), p); } - set_compound_head(p, page); + if (i != 0) + set_compound_head(p, page); } atomic_set(compound_mapcount_ptr(page), -1); atomic_set(compound_pincount_ptr(page), 0); return true; out_error: - /* undo tail page modifications made above */ - for (j = 1; j < i; j++) { + /* undo page modifications made above */ + for (j = 0; j < i; j++) { p = nth_page(page, j); - clear_compound_head(p); + if (j != 0) + clear_compound_head(p); set_page_refcounted(p); } /* need to clear PG_reserved on remaining tail pages */ @@ -1936,6 +1937,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int order = huge_page_order(h); struct page *page; bool alloc_try_hard = true; + bool retry = true; /* * By default we always try hard to allocate the page with @@ -1951,7 +1953,21 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); +retry: page = __alloc_pages(gfp_mask, order, nid, nmask); + + /* Freeze head page */ + if (page && !page_ref_freeze(page, 1)) { + __free_pages(page, order); + if (retry) { /* retry once */ + retry = false; + goto retry; + } + /* WOW! twice in a row. */ + pr_warn("HugeTLB head page unexpected inflated ref count\n"); + page = NULL; + } + if (page) __count_vm_event(HTLB_BUDDY_PGALLOC); else @@ -1979,6 +1995,9 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, /* * Common helper to allocate a fresh hugetlb page. All specific allocators * should use this function to get new hugetlb pages + * + * Note that returned page is 'frozen': ref count of head page and all tail + * pages is zero. 
*/ static struct page *alloc_fresh_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, @@ -2036,7 +2055,7 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, if (!page) return 0; - put_page(page); /* free it into the hugepage allocator */ + free_huge_page(page); /* free it into the hugepage allocator */ return 1; } @@ -2193,10 +2212,9 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) * Allocates a fresh surplus page from the page allocator. */ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask, bool zero_ref) + int nid, nodemask_t *nmask) { struct page *page = NULL; - bool retry = false; if (hstate_is_gigantic(h)) return NULL; @@ -2206,7 +2224,6 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, goto out_unlock; spin_unlock_irq(&hugetlb_lock); -retry: page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; @@ -2222,34 +2239,10 @@ retry: if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { SetHPageTemporary(page); spin_unlock_irq(&hugetlb_lock); - put_page(page); + free_huge_page(page); return NULL; } - if (zero_ref) { - /* - * Caller requires a page with zero ref count. - * We will drop ref count here. If someone else is holding - * a ref, the page will be freed when they drop it. Abuse - * temporary page flag to accomplish this. - */ - SetHPageTemporary(page); - if (!put_page_testzero(page)) { - /* - * Unexpected inflated ref count on freshly allocated - * huge. Retry once. 
- */ - pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n"); - spin_unlock_irq(&hugetlb_lock); - if (retry) - return NULL; - - retry = true; - goto retry; - } - ClearHPageTemporary(page); - } - h->surplus_huge_pages++; h->surplus_huge_pages_node[page_to_nid(page)]++; @@ -2271,6 +2264,9 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, if (!page) return NULL; + /* fresh huge pages are frozen */ + set_page_refcounted(page); + /* * We do not account these pages as surplus because they are only * temporary and will be released properly on the last reference @@ -2298,14 +2294,14 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, gfp_t gfp = gfp_mask | __GFP_NOWARN; gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false); + page = alloc_surplus_huge_page(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } if (!page) - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return page; } @@ -2375,7 +2371,7 @@ retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), - NUMA_NO_NODE, NULL, true); + NUMA_NO_NODE, NULL); if (!page) { alloc_ok = false; break; @@ -2737,7 +2733,6 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nid = page_to_nid(old_page); - bool alloc_retry = false; struct page *new_page; int ret = 0; @@ -2748,30 +2743,9 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, * the pool. This simplifies and let us do most of the processing * under the lock. 
*/ -alloc_retry: new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); if (!new_page) return -ENOMEM; - /* - * If all goes well, this page will be directly added to the free - * list in the pool. For this the ref count needs to be zero. - * Attempt to drop now, and retry once if needed. It is VERY - * unlikely there is another ref on the page. - * - * If someone else has a reference to the page, it will be freed - * when they drop their ref. Abuse temporary page flag to accomplish - * this. Retry once if there is an inflated ref count. - */ - SetHPageTemporary(new_page); - if (!put_page_testzero(new_page)) { - if (alloc_retry) - return -EBUSY; - - alloc_retry = true; - goto alloc_retry; - } - ClearHPageTemporary(new_page); - __prep_new_huge_page(h, new_page); retry: @@ -2951,6 +2925,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, } spin_lock_irq(&hugetlb_lock); list_add(&page->lru, &h->hugepage_activelist); + set_page_refcounted(page); /* Fall through */ } hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); @@ -3055,7 +3030,7 @@ static void __init gather_bootmem_prealloc(void) if (prep_compound_gigantic_page(page, huge_page_order(h))) { WARN_ON(PageReserved(page)); prep_new_huge_page(h, page, page_to_nid(page)); - put_page(page); /* add to the hugepage allocator */ + free_huge_page(page); /* add to the hugepage allocator */ } else { /* VERY unlikely inflated ref count on a tail page */ free_gigantic_page(page, huge_page_order(h)); @@ -3087,7 +3062,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) &node_states[N_MEMORY], NULL); if (!page) break; - put_page(page); /* free it into the hugepage allocator */ + free_huge_page(page); /* free it into the hugepage allocator */ } cond_resched(); } @@ -3478,9 +3453,8 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) else prep_compound_page(subpage, target_hstate->order); set_page_private(subpage, 0); - 
set_page_refcounted(subpage); prep_new_huge_page(target_hstate, subpage, nid); - put_page(subpage); + free_huge_page(subpage); } mutex_unlock(&target_hstate->resize_lock); -- GitLab From e3e486e634bfd652036292c3d66f9d388614ffe6 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sat, 17 Sep 2022 21:56:54 +0800 Subject: [PATCH 1120/2223] mm/damon: rename damon_pageout_score() to damon_cold_score() In the beginning there was only one damos_action, 'DAMOS_PAGEOUT', that needed the coldness score of a region for a scheme, and damon_pageout_score() was used to get it. But now other damos_action actions also need the coldness score, so rename the function to damon_cold_score() to make more sense. Link: https://lkml.kernel.org/r/1663423014-28907-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 2 +- mm/damon/ops-common.h | 2 +- mm/damon/paddr.c | 4 ++-- mm/damon/vaddr.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 9310df72e1c54..75409601f9349 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -130,7 +130,7 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, return hotness; } -int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, +int damon_cold_score(struct damon_ctx *c, struct damon_region *r, struct damos *s) { int hotness = damon_hot_score(c, r, s); diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index 52329ff361cd0..8d82d37222042 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -12,7 +12,7 @@ struct page *damon_get_page(unsigned long pfn); void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr); void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); -int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, +int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s); int damon_hot_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index dfeebffe82f44..e1a4315c4be6a 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -287,11 +287,11 @@ static int damon_pa_scheme_score(struct damon_ctx *context, { switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pageout_score(context, r, scheme); + return damon_cold_score(context, r, scheme); case DAMOS_LRU_PRIO: return damon_hot_score(context, r, scheme); case DAMOS_LRU_DEPRIO: - return damon_pageout_score(context, r, scheme); + return damon_cold_score(context, r, scheme); default: break; } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index f53c2ff2bcc8a..ea94e0b2c3113 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -673,7 +673,7 @@ static int damon_va_scheme_score(struct damon_ctx *context, switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pageout_score(context, r, scheme); + return damon_cold_score(context, r, scheme); default: break; } -- GitLab From a57ae9ef9e1a20b68ae841a8cab7aff3f000ed9d Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Sun, 18 Sep 2022 02:56:40 +0000 Subject: [PATCH 1121/2223] mm/page_alloc: update comments for rmqueue() Since commit 44042b449872 ("mm/page_alloc: allow high-order pages to be stored on the per-cpu lists"), the per-cpu page allocators (PCP) is not only for order-0 pages. Update the comments. Link: https://lkml.kernel.org/r/20220918025640.208586-1-ran.xiaokai@zte.com.cn Signed-off-by: Ran Xiaokai Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 66f7778732fbd..12b6184cbbed6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3810,7 +3810,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, } /* - * Allocate a page from the given zone. Use pcplists for order-0 allocations. 
+ * Allocate a page from the given zone. + * Use pcplists for THP or "cheap" high-order allocations. */ /* -- GitLab From 30b6242c49cd2a98def3bb2feee68d82a0e9686b Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Tue, 20 Sep 2022 16:35:30 +0000 Subject: [PATCH 1122/2223] mm/damon/sysfs: return 'err' value when call kstrtoul() failed We had better return the 'err' value when calling kstrtoul() fails, so the user will know why it really failed. This is a small change: just return the 'err' value on failure. Link: https://lkml.kernel.org/r/6329ebe0.050a0220.ec4bd.297cSMTPIN_ADDED_BROKEN@mx.google.com Suggested-by: SeongJae Park Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Reviewed-by: Xin Hao Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 46 ++++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 0cca1909bf67b..455215a5c0598 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -58,7 +58,7 @@ static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, err = kstrtoul(buf, 0, &min); if (err) - return -EINVAL; + return err; range->min = min; return count; @@ -83,7 +83,7 @@ static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, err = kstrtoul(buf, 0, &max); if (err) - return -EINVAL; + return err; range->max = max; return count; @@ -291,9 +291,7 @@ static ssize_t interval_us_store(struct kobject *kobj, struct damon_sysfs_watermarks, kobj); int err = kstrtoul(buf, 0, &watermarks->interval_us); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t high_show(struct kobject *kobj, @@ -312,9 +310,7 @@ static ssize_t high_store(struct kobject *kobj, struct damon_sysfs_watermarks, kobj); int err = kstrtoul(buf, 0, &watermarks->high); - if (err) - return -EINVAL; - return count; + return err ? 
err : count; } static ssize_t mid_show(struct kobject *kobj, @@ -333,9 +329,7 @@ static ssize_t mid_store(struct kobject *kobj, struct damon_sysfs_watermarks, kobj); int err = kstrtoul(buf, 0, &watermarks->mid); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t low_show(struct kobject *kobj, @@ -354,9 +348,7 @@ static ssize_t low_store(struct kobject *kobj, struct damon_sysfs_watermarks, kobj); int err = kstrtoul(buf, 0, &watermarks->low); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static void damon_sysfs_watermarks_release(struct kobject *kobj) @@ -437,9 +429,7 @@ static ssize_t sz_permil_store(struct kobject *kobj, struct damon_sysfs_weights, kobj); int err = kstrtouint(buf, 0, &weights->sz); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t nr_accesses_permil_show(struct kobject *kobj, @@ -458,9 +448,7 @@ static ssize_t nr_accesses_permil_store(struct kobject *kobj, struct damon_sysfs_weights, kobj); int err = kstrtouint(buf, 0, &weights->nr_accesses); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t age_permil_show(struct kobject *kobj, @@ -479,9 +467,7 @@ static ssize_t age_permil_store(struct kobject *kobj, struct damon_sysfs_weights, kobj); int err = kstrtouint(buf, 0, &weights->age); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static void damon_sysfs_weights_release(struct kobject *kobj) @@ -1111,9 +1097,7 @@ static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region, kobj); int err = kstrtoul(buf, 0, &region->start); - if (err) - return -EINVAL; - return count; + return err ? 
err : count; } static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -1132,9 +1116,7 @@ static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region, kobj); int err = kstrtoul(buf, 0, &region->end); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static void damon_sysfs_region_release(struct kobject *kobj) @@ -1528,7 +1510,7 @@ static ssize_t sample_us_store(struct kobject *kobj, int err = kstrtoul(buf, 0, &us); if (err) - return -EINVAL; + return err; intervals->sample_us = us; return count; @@ -1552,7 +1534,7 @@ static ssize_t aggr_us_store(struct kobject *kobj, struct kobj_attribute *attr, int err = kstrtoul(buf, 0, &us); if (err) - return -EINVAL; + return err; intervals->aggr_us = us; return count; @@ -1576,7 +1558,7 @@ static ssize_t update_us_store(struct kobject *kobj, int err = kstrtoul(buf, 0, &us); if (err) - return -EINVAL; + return err; intervals->update_us = us; return count; -- GitLab From 233f0b31bd9503ce2be7be0bde69c67287c8a741 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 20 Sep 2022 16:53:22 +0000 Subject: [PATCH 1123/2223] mm/damon: deduplicate damon_{reclaim,lru_sort}_apply_parameters() The bodies of damon_{reclaim,lru_sort}_apply_parameters() contain duplicates. This commit adds a common function damon_set_region_biggest_system_ram_default() to remove the duplicates. 
Link: https://lkml.kernel.org/r/6329f00d.a70a0220.9bb29.3678SMTPIN_ADDED_BROKEN@mx.google.com Signed-off-by: Kaixu Xia Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 ++- mm/damon/core.c | 35 ++++++++++++++++++++++++++++++++++- mm/damon/lru_sort.c | 13 +++---------- mm/damon/reclaim.c | 13 +++---------- 4 files changed, 42 insertions(+), 22 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index e7808a84675fb..ed5470f50babd 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -557,7 +557,8 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx) int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); -bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end); +int damon_set_region_biggest_system_ram_default(struct damon_target *t, + unsigned long *start, unsigned long *end); #endif /* CONFIG_DAMON */ diff --git a/mm/damon/core.c b/mm/damon/core.c index 9c80c6eb00c24..4de8c7c529794 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1245,7 +1245,8 @@ static int walk_system_ram(struct resource *res, void *arg) * Find biggest 'System RAM' resource and store its start and end address in * @start and @end, respectively. If no System RAM is found, returns false. */ -bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) +static bool damon_find_biggest_system_ram(unsigned long *start, + unsigned long *end) { struct damon_system_ram_region arg = {}; @@ -1259,6 +1260,38 @@ bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) return true; } +/** + * damon_set_region_biggest_system_ram_default() - Set the region of the given + * monitoring target as requested, or biggest 'System RAM'. + * @t: The monitoring target to set the region. + * @start: The pointer to the start address of the region. 
+ * @end: The pointer to the end address of the region. + * + * This function sets the region of @t as requested by @start and @end. If the + * values of @start and @end are zero, however, this function finds the biggest + * 'System RAM' resource and sets the region to cover the resource. In the + * latter case, this function saves the start and end addresses of the resource + * in @start and @end, respectively. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_set_region_biggest_system_ram_default(struct damon_target *t, + unsigned long *start, unsigned long *end) +{ + struct damon_addr_range addr_range; + + if (*start > *end) + return -EINVAL; + + if (!*start && !*end && + !damon_find_biggest_system_ram(start, end)) + return -EINVAL; + + addr_range.start = *start; + addr_range.end = *end; + return damon_set_regions(t, &addr_range, 1); +} + static int __init damon_init(void) { damon_region_cache = KMEM_CACHE(damon_region, 0); diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index d7eb72b41cb67..efbc2bda8b9cd 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -188,7 +188,6 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) static int damon_lru_sort_apply_parameters(void) { struct damos *scheme; - struct damon_addr_range addr_range; unsigned int hot_thres, cold_thres; int err = 0; @@ -211,15 +210,9 @@ static int damon_lru_sort_apply_parameters(void) return -ENOMEM; damon_add_scheme(ctx, scheme); - if (monitor_region_start > monitor_region_end) - return -EINVAL; - if (!monitor_region_start && !monitor_region_end && - !damon_find_biggest_system_ram(&monitor_region_start, - &monitor_region_end)) - return -EINVAL; - addr_range.start = monitor_region_start; - addr_range.end = monitor_region_end; - return damon_set_regions(target, &addr_range, 1); + return damon_set_region_biggest_system_ram_default(target, + &monitor_region_start, + &monitor_region_end); } static int damon_lru_sort_turn(bool on) 
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 3d59ab11b7b39..162c9b1ca00fd 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -144,7 +144,6 @@ static struct damos *damon_reclaim_new_scheme(void) static int damon_reclaim_apply_parameters(void) { struct damos *scheme; - struct damon_addr_range addr_range; int err = 0; err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs); @@ -157,15 +156,9 @@ static int damon_reclaim_apply_parameters(void) return -ENOMEM; damon_set_schemes(ctx, &scheme, 1); - if (monitor_region_start > monitor_region_end) - return -EINVAL; - if (!monitor_region_start && !monitor_region_end && - !damon_find_biggest_system_ram(&monitor_region_start, - &monitor_region_end)) - return -EINVAL; - addr_range.start = monitor_region_start; - addr_range.end = monitor_region_end; - return damon_set_regions(target, &addr_range, 1); + return damon_set_region_biggest_system_ram_default(target, + &monitor_region_start, + &monitor_region_end); } static int damon_reclaim_turn(bool on) -- GitLab From 2eb989195d9a361d13d66ffb8738847649e080ad Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Sep 2022 02:06:33 +0800 Subject: [PATCH 1124/2223] mm: memcontrol: use memcg_kmem_enabled in count_objcg_event Patch series "mm: memcontrol: cleanup and optimize for two accounting params", v2. This patch (of 2): There are currently two helpers for checking if cgroup kmem accounting is enabled: - mem_cgroup_kmem_disabled - memcg_kmem_enabled mem_cgroup_kmem_disabled is a simple helper that returns true if cgroup.memory=nokmem is specified, otherwise returns false. memcg_kmem_enabled is a bit different, it returns true if cgroup.memory=nokmem is not specified and there was at least one non-root memory control enabled cgroup ever created. This help improve performance when kmem accounting was not actually activated. And it's optimized with static branch. 
The usage of mem_cgroup_kmem_disabled is for sub-systems that need to preallocate data for kmem accounting since they could be initialized before kmem accounting is activated. But count_objcg_event doesn't need that, so using memcg_kmem_enabled is better here. Link: https://lkml.kernel.org/r/20220919180634.45958-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20220919180634.45958-2-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Muchun Song Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dc7d40e575d5f..ef479e5542536 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1778,7 +1778,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { struct mem_cgroup *memcg; - if (mem_cgroup_kmem_disabled()) + if (!memcg_kmem_enabled()) return; rcu_read_lock(); -- GitLab From c1b8fdae62e59904ecdfe4f50410575ea02fec18 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Sep 2022 02:06:34 +0800 Subject: [PATCH 1125/2223] mm: memcontrol: make cgroup_memory_noswap a static key cgroup_memory_noswap is used in many hot path, so make it a static key to lower the kernel overhead. 
Using 8G of ZRAM as SWAP, benchmark using `perf stat -d -d -d --repeat 100` with the following code snip in a non-root cgroup: #include #include #include #include #define MB 1024UL * 1024UL int main(int argc, char **argv){ void *p = mmap(NULL, 8000 * MB, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); memset(p, 0xff, 8000 * MB); madvise(p, 8000 * MB, MADV_PAGEOUT); memset(p, 0xff, 8000 * MB); return 0; } Before: 7,021.43 msec task-clock # 0.967 CPUs utilized ( +- 0.03% ) 4,010 context-switches # 573.853 /sec ( +- 0.01% ) 0 cpu-migrations # 0.000 /sec 2,052,057 page-faults # 293.661 K/sec ( +- 0.00% ) 12,616,546,027 cycles # 1.805 GHz ( +- 0.06% ) (39.92%) 156,823,666 stalled-cycles-frontend # 1.25% frontend cycles idle ( +- 0.10% ) (40.25%) 310,130,812 stalled-cycles-backend # 2.47% backend cycles idle ( +- 4.39% ) (40.73%) 18,692,516,591 instructions # 1.49 insn per cycle # 0.01 stalled cycles per insn ( +- 0.04% ) (40.75%) 4,907,447,976 branches # 702.283 M/sec ( +- 0.05% ) (40.30%) 13,002,578 branch-misses # 0.26% of all branches ( +- 0.08% ) (40.48%) 7,069,786,296 L1-dcache-loads # 1.012 G/sec ( +- 0.03% ) (40.32%) 649,385,847 L1-dcache-load-misses # 9.13% of all L1-dcache accesses ( +- 0.07% ) (40.10%) 1,485,448,688 L1-icache-loads # 212.576 M/sec ( +- 0.15% ) (39.49%) 31,628,457 L1-icache-load-misses # 2.13% of all L1-icache accesses ( +- 0.40% ) (39.57%) 6,667,311 dTLB-loads # 954.129 K/sec ( +- 0.21% ) (39.50%) 5,668,555 dTLB-load-misses # 86.40% of all dTLB cache accesses ( +- 0.12% ) (39.03%) 765 iTLB-loads # 109.476 /sec ( +- 21.81% ) (39.44%) 4,370,351 iTLB-load-misses # 214320.09% of all iTLB cache accesses ( +- 1.44% ) (39.86%) 149,207,254 L1-dcache-prefetches # 21.352 M/sec ( +- 0.13% ) (40.27%) 7.25869 +- 0.00203 seconds time elapsed ( +- 0.03% ) After: 6,576.16 msec task-clock # 0.953 CPUs utilized ( +- 0.10% ) 4,020 context-switches # 605.595 /sec ( +- 0.01% ) 0 cpu-migrations # 0.000 /sec 2,052,056 page-faults # 309.133 K/sec ( +- 
0.00% ) 11,967,619,180 cycles # 1.803 GHz ( +- 0.36% ) (38.76%) 161,259,240 stalled-cycles-frontend # 1.38% frontend cycles idle ( +- 0.27% ) (36.58%) 253,605,302 stalled-cycles-backend # 2.16% backend cycles idle ( +- 4.45% ) (34.78%) 19,328,171,892 instructions # 1.65 insn per cycle # 0.01 stalled cycles per insn ( +- 0.10% ) (31.46%) 5,213,967,902 branches # 785.461 M/sec ( +- 0.18% ) (30.68%) 12,385,170 branch-misses # 0.24% of all branches ( +- 0.26% ) (34.13%) 7,271,687,822 L1-dcache-loads # 1.095 G/sec ( +- 0.12% ) (35.29%) 649,873,045 L1-dcache-load-misses # 8.93% of all L1-dcache accesses ( +- 0.11% ) (41.41%) 1,950,037,608 L1-icache-loads # 293.764 M/sec ( +- 0.33% ) (43.11%) 31,365,566 L1-icache-load-misses # 1.62% of all L1-icache accesses ( +- 0.39% ) (45.89%) 6,767,809 dTLB-loads # 1.020 M/sec ( +- 0.47% ) (48.42%) 6,339,590 dTLB-load-misses # 95.43% of all dTLB cache accesses ( +- 0.50% ) (46.60%) 736 iTLB-loads # 110.875 /sec ( +- 1.79% ) (48.60%) 4,314,836 iTLB-load-misses # 518653.73% of all iTLB cache accesses ( +- 0.63% ) (42.91%) 144,950,156 L1-dcache-prefetches # 21.836 M/sec ( +- 0.37% ) (41.39%) 6.89935 +- 0.00703 seconds time elapsed ( +- 0.10% ) The performance is clearly better. There is no significant hotspot improvement according to perf report, as there are quite a few callers of memcg_swap_enabled and do_memsw_account (which calls memcg_swap_enabled). Many pieces of minor optimizations resulted in lower overhead for the branch predictor, and better performance. 
Link: https://lkml.kernel.org/r/20220919180634.45958-3-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Muchun Song Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/memcontrol.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ac6440daf2086..6b74bbdc26596 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -90,9 +90,18 @@ static bool cgroup_memory_nokmem __ro_after_init; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP -static bool cgroup_memory_noswap __ro_after_init; +static bool cgroup_memory_noswap __initdata; + +static DEFINE_STATIC_KEY_FALSE(memcg_swap_enabled_key); +static inline bool memcg_swap_enabled(void) +{ + return static_branch_likely(&memcg_swap_enabled_key); +} #else -#define cgroup_memory_noswap 1 +static inline bool memcg_swap_enabled(void) +{ + return false; +} #endif #ifdef CONFIG_CGROUP_WRITEBACK @@ -102,7 +111,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { - return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; + return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg_swap_enabled(); } #define THRESHOLDS_EVENTS_TARGET 128 @@ -7370,7 +7379,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); - if (!cgroup_memory_noswap && memcg != swap_memcg) { + if (memcg_swap_enabled() && memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) page_counter_charge(&swap_memcg->memsw, nr_entries); page_counter_uncharge(&memcg->memsw, nr_entries); @@ -7422,7 +7431,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); - if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) && + if 
(memcg_swap_enabled() && !mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); @@ -7454,7 +7463,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) { + if (memcg_swap_enabled() && !mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->swap, nr_pages); else @@ -7470,7 +7479,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { long nr_swap_pages = get_nr_swap_pages(); - if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (!memcg_swap_enabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, @@ -7487,7 +7496,7 @@ bool mem_cgroup_swap_full(struct folio *folio) if (vm_swap_full()) return true; - if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (!memcg_swap_enabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; memcg = folio_memcg(folio); @@ -7795,6 +7804,8 @@ static int __init mem_cgroup_swap_init(void) if (cgroup_memory_noswap) return 0; + static_branch_enable(&memcg_swap_enabled_key); + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) -- GitLab From 958f32ce832ba781ac20e11bb2d12a9352ea28fc Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 12:21:13 +0800 Subject: [PATCH 1126/2223] mm: hugetlb: fix UAF in hugetlb_handle_userfault The vma_lock and hugetlb_fault_mutex are dropped before handling userfault and reacquire them again after handle_userfault(), but reacquire the vma_lock could lead 
to UAF[1,2] due to the following race, hugetlb_fault hugetlb_no_page /*unlock vma_lock */ hugetlb_handle_userfault handle_userfault /* unlock mm->mmap_lock*/ vm_mmap_pgoff do_mmap mmap_region munmap_vma_range /* clean old vma */ /* lock vma_lock again <--- UAF */ /* unlock vma_lock */ Since the vma_lock will unlock immediately after hugetlb_handle_userfault(), let's drop the unneeded lock and unlock in hugetlb_handle_userfault() to fix the issue. [1] https://lore.kernel.org/linux-mm/000000000000d5e00a05e834962e@google.com/ [2] https://lore.kernel.org/linux-mm/20220921014457.1668-1-liuzixian4@huawei.com/ Link: https://lkml.kernel.org/r/20220923042113.137273-1-liushixin2@huawei.com Fixes: 1a1aad8a9b7b ("userfaultfd: hugetlbfs: add userfaultfd hugetlb hook") Signed-off-by: Liu Shixin Signed-off-by: Kefeng Wang Reported-by: syzbot+193f9cee8638750b23cf@syzkaller.appspotmail.com Reported-by: Liu Zixian Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: John Hubbard Cc: Muchun Song Cc: Sidhartha Kumar Cc: [4.14+] Signed-off-by: Andrew Morton --- mm/hugetlb.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2182134216f09..3c1316ad54b5f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5489,7 +5489,6 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, unsigned long addr, unsigned long reason) { - vm_fault_t ret; u32 hash; struct vm_fault vmf = { .vma = vma, @@ -5507,18 +5506,14 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, }; /* - * vma_lock and hugetlb_fault_mutex must be - * dropped before handling userfault. Reacquire - * after handling fault to make calling code simpler. + * vma_lock and hugetlb_fault_mutex must be dropped before handling + * userfault. Also mmap_lock could be dropped due to handling + * userfault, any vma operation should be careful from here. 
*/ hugetlb_vma_unlock_read(vma); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - ret = handle_userfault(&vmf, reason); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - hugetlb_vma_lock_read(vma); - - return ret; + return handle_userfault(&vmf, reason); } static vm_fault_t hugetlb_no_page(struct mm_struct *mm, @@ -5536,6 +5531,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); bool new_page, new_pagecache_page = false; + u32 hash = hugetlb_fault_mutex_hash(mapping, idx); /* * Currently, we are forced to kill the process in the event the @@ -5546,7 +5542,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", current->pid); - return ret; + goto out; } /* @@ -5560,12 +5556,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (idx >= size) goto out; /* Check for page in userfault range */ - if (userfaultfd_missing(vma)) { - ret = hugetlb_handle_userfault(vma, mapping, idx, + if (userfaultfd_missing(vma)) + return hugetlb_handle_userfault(vma, mapping, idx, flags, haddr, address, VM_UFFD_MISSING); - goto out; - } page = alloc_huge_page(vma, haddr, 0); if (IS_ERR(page)) { @@ -5631,10 +5625,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (userfaultfd_minor(vma)) { unlock_page(page); put_page(page); - ret = hugetlb_handle_userfault(vma, mapping, idx, + return hugetlb_handle_userfault(vma, mapping, idx, flags, haddr, address, VM_UFFD_MINOR); - goto out; } } @@ -5692,6 +5685,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, unlock_page(page); out: + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); return ret; backout: @@ -5789,11 +5784,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = huge_ptep_get(ptep); /* PTE markers 
should be handled the same way as none pte */ - if (huge_pte_none_mostly(entry)) { - ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, + if (huge_pte_none_mostly(entry)) + /* + * hugetlb_no_page will drop vma lock and hugetlb fault + * mutex internally, which make us return immediately. + */ + return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, entry, flags); - goto out_mutex; - } ret = 0; -- GitLab From 780a4b6fb865534fcb3aa9150942f3a719d11ce9 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:27:31 -0700 Subject: [PATCH 1127/2223] mm/khugepaged: check compound_order() in collapse_pte_mapped_thp() By the time we lock a page in collapse_pte_mapped_thp(), the page mapped by the address pushed onto the slot's .pte_mapped_thp[] array might have changed arbitrarily since we last looked at it. We revalidate that the page is still the head of a compound page, but we don't revalidate if the compound page is of order HPAGE_PMD_ORDER before applying rmap and page table updates. Since the kernel now supports large folios of arbitrary order, and since replacing page's pte mappings by a pmd mapping only makes sense for compound pages of order HPAGE_PMD_ORDER, revalidate that the compound order is indeed of order HPAGE_PMD_ORDER before proceeding. Link: https://lore.kernel.org/linux-mm/CAHbLzkon+2ky8v9ywGcsTUgXM_B35jt5NThYqQKXW2YV_GUacw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220922222731.1124481-1-zokeefe@google.com Signed-off-by: Zach O'Keefe Suggested-by: Yang Shi Reviewed-by: Yang Shi Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/khugepaged.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 57af2c841b410..40fd9f7b3ed3b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1399,6 +1399,9 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (!PageHead(hpage)) goto drop_hpage; + if (compound_order(hpage) != HPAGE_PMD_ORDER) + goto drop_hpage; + if (find_pmd_or_thp_or_none(mm, haddr, &pmd) != SCAN_SUCCEED) goto drop_hpage; -- GitLab From 0f3e2a2c4243695c5ac3fbccce18dc74c0250df6 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 11:46:50 -0700 Subject: [PATCH 1128/2223] mm/madvise: MADV_COLLAPSE return EAGAIN when page cannot be isolated MADV_COLLAPSE is a best-effort request that attempts to set an actionable errno value if the request cannot be fulfilled at the time. EAGAIN should be used to communicate that a resource was temporarily unavailable, but that the user may try again immediately. SCAN_DEL_PAGE_LRU is an internal result code used when a page cannot be isolated from its LRU list. Since this, like SCAN_PAGE_LRU, is likely a transitory state, make MADV_COLLAPSE return EAGAIN so that users know they may reattempt the operation. Another important scenario to consider is race with khugepaged. khugepaged might isolate a page while MADV_COLLAPSE is interested in it. Even though racing with khugepaged might mean that the memory has already been collapsed, signalling an errno that is non-intrinsic to that memory or arguments provided to madvise(2) lets the user know that future attempts might (and in this case likely would) succeed, and avoids false-negative assumptions by the user. 
Link: https://lkml.kernel.org/r/20220922184651.1016461-1-zokeefe@google.com Fixes: 7d8faaf15545 ("mm/madvise: introduce MADV_COLLAPSE sync hugepage collapse") Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/khugepaged.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 40fd9f7b3ed3b..b3ebe90a66d99 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2372,6 +2372,7 @@ static int madvise_collapse_errno(enum scan_result r) /* Resource temporary unavailable - trying again might succeed */ case SCAN_PAGE_LOCK: case SCAN_PAGE_LRU: + case SCAN_DEL_PAGE_LRU: return -EAGAIN; /* * Other: Trying again likely not to succeed / error intrinsic to @@ -2454,6 +2455,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, case SCAN_PAGE_LOCK: case SCAN_PAGE_COMPOUND: case SCAN_PAGE_LRU: + case SCAN_DEL_PAGE_LRU: last_fail = result; break; default: -- GitLab From 3505c8e62acfb62908ffd7d2d6c5971657596d1d Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 11:46:51 -0700 Subject: [PATCH 1129/2223] selftests/vm: retry on EAGAIN for MADV_COLLAPSE selftest MADV_COLLAPSE is a best-effort request that will set errno to an actionable value if the request cannot be performed. For example, if pages are not found on the LRU, or if they are currently locked by something else, MADV_COLLAPSE will fail and set errno to EAGAIN to inform callers that they may try again. Since the khugepaged selftest is the first public use of MADV_COLLAPSE, set a best practice of checking errno and retrying on EAGAIN. 
Link: https://lkml.kernel.org/r/20220922184651.1016461-2-zokeefe@google.com Fixes: 9330694de59f ("selftests/vm: add MADV_COLLAPSE collapse context to selftests") Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index b77b1e28cdb38..b55dc331af139 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -1,4 +1,5 @@ #define _GNU_SOURCE +#include #include #include #include @@ -477,6 +478,26 @@ static void fill_memory(int *p, unsigned long start, unsigned long end) p[i * page_size / sizeof(*p)] = i + 0xdead0000; } +/* + * MADV_COLLAPSE is a best-effort request and may fail if an internal + * resource is temporarily unavailable, in which case it will set errno to + * EAGAIN. In such a case, immediately reattempt the operation one more + * time. + */ +static int madvise_collapse_retry(void *p, unsigned long size) +{ + bool retry = true; + int ret; + +retry: + ret = madvise(p, size, MADV_COLLAPSE); + if (ret && errno == EAGAIN && retry) { + retry = false; + goto retry; + } + return ret; +} + /* * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with * validate_memory()'able contents. 
@@ -531,7 +552,7 @@ static void madvise_collapse(const char *msg, char *p, int nr_hpages, /* Clear VM_NOHUGEPAGE */ madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); - ret = madvise(p, nr_hpages * hpage_pmd_size, MADV_COLLAPSE); + ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); if (((bool)ret) == expect) fail("Fail: Bad return value"); else if (check_huge(p, nr_hpages) != expect) -- GitLab From 7c6c6cc4d3a213e7303ef06ff40f6193df01839c Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:37 -0700 Subject: [PATCH 1130/2223] mm/shmem: add flag to enforce shmem THP in hugepage_vma_check() Patch series "mm: add file/shmem support to MADV_COLLAPSE", v4. This series builds on top of the previous "mm: userspace hugepage collapse" series which introduced the MADV_COLLAPSE madvise mode and added support for private, anonymous mappings[2], by adding support for file and shmem backed memory to CONFIG_READ_ONLY_THP_FOR_FS=y kernels. File and shmem support have been added with effort to align with existing MADV_COLLAPSE semantics and policy decisions[3]. Collapse of shmem-backed memory ignores kernel-guiding directives and heuristics including all sysfs settings (transparent_hugepage/shmem_enabled), and tmpfs huge= mount options (shmem always supports large folios). Like anonymous mappings, on successful return of MADV_COLLAPSE on file/shmem memory, the contents of memory mapped by the addresses provided will be synchronously pmd-mapped THPs. This functionality unlocks two important uses: (1) Immediately back executable text by THPs. Current support provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large system which might impair services from serving at their full rated load after (re)starting. Tricks like mremap(2)'ing text onto anonymous memory to immediately realize iTLB performance prevents page sharing and demand paging, both of which increase steady state memory footprint. 
Now, we can have the best of both worlds: Peak upfront performance and lower RAM footprints. (2) userfaultfd-based live migration of virtual machines satisfy UFFD faults by fetching native-sized pages over the network (to avoid latency of transferring an entire hugepage). However, after guest memory has been fully copied to the new host, MADV_COLLAPSE can be used to immediately increase guest performance. khugepaged has received a small improvement by association and can now detect and collapse pte-mapped THPs. However, there is still work to be done along the file collapse path. Compound pages of arbitrary order still needs to be supported and THP collapse needs to be converted to using folios in general. Eventually, we'd like to move away from the read-only and executable-mapped constraints currently imposed on eligible files and support any inode claiming huge folio support. That said, I think the series as-is covers enough to claim that MADV_COLLAPSE supports file/shmem memory. Patches 1-3 Implement the guts of the series. Patch 4 Is a tracepoint for debugging. Patches 5-9 Refactor existing khugepaged selftests to work with new memory types + new collapse tests. Patch 10 Adds a userfaultfd selftest mode to mimic a functional test of UFFDIO_REGISTER_MODE_MINOR+MADV_COLLAPSE live migration. (v4 note: "userfaultfd shmem" selftest is failing as of Sep 22 mm-unstable) [1] https://lore.kernel.org/linux-mm/YyiK8YvVcrtZo0z3@google.com/ [2] https://lore.kernel.org/linux-mm/20220706235936.2197195-1-zokeefe@google.com/ [3] https://lore.kernel.org/linux-mm/YtBmhaiPHUTkJml8@google.com/ [4] https://lore.kernel.org/linux-mm/20220922222731.1124481-1-zokeefe@google.com/ [5] https://lore.kernel.org/linux-mm/20220922184651.1016461-1-zokeefe@google.com/ This patch (of 10): Extend 'mm/thp: add flag to enforce sysfs THP in hugepage_vma_check()' to shmem, allowing callers to ignore /sys/kernel/transparent_hugepage/shmem_enabled and tmpfs huge= mount. 
This is intended to be used by MADV_COLLAPSE, and the rationale is analogous to the anon/file case: MADV_COLLAPSE is not coupled to directives that advise the kernel's decisions on when THPs should be considered eligible. shmem/tmpfs always claims large folio support, regardless of sysfs or mount options. [shy828301@gmail.com: test shmem_huge_force explicitly] Link: https://lore.kernel.org/linux-mm/CAHbLzko3A5-TpS0BgBeKkx5cuOkWgLvWXQH=TdgW-baO4rPtdg@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220922224046.1143204-1-zokeefe@google.com Link: https://lkml.kernel.org/r/20220907144521.3115321-2-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-2-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 10 ++++++---- mm/huge_memory.c | 2 +- mm/shmem.c | 18 ++++++++++-------- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f24071e3c826e..d500ea967dc73 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -92,11 +92,13 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); -extern bool shmem_is_huge(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index); -static inline bool shmem_huge_enabled(struct vm_area_struct *vma) +extern bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force); +static inline bool shmem_huge_enabled(struct vm_area_struct *vma, + bool shmem_huge_force) { - return 
shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff); + return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff, + shmem_huge_force); } extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4938defe4e732..1cc4a5f4791e9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -119,7 +119,7 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, * own flags. */ if (!in_pf && shmem_file(vma->vm_file)) - return shmem_huge_enabled(vma); + return shmem_huge_enabled(vma, !enforce_sysfs); /* Enforce sysfs THP requirements as necessary */ if (enforce_sysfs && diff --git a/mm/shmem.c b/mm/shmem.c index 275899bacbeaf..cabe48d55a64d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -462,20 +462,22 @@ static bool shmem_confirm_swap(struct address_space *mapping, static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -bool shmem_is_huge(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index) +bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force) { loff_t i_size; if (!S_ISREG(inode->i_mode)) return false; - if (shmem_huge == SHMEM_HUGE_DENY) - return false; if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) return false; + if (shmem_huge_force) + return true; if (shmem_huge == SHMEM_HUGE_FORCE) return true; + if (shmem_huge == SHMEM_HUGE_DENY) + return false; switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: @@ -670,8 +672,8 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY -bool shmem_is_huge(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index) +bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force) { return false; } @@ -1058,7 +1060,7 @@ static int shmem_getattr(struct 
user_namespace *mnt_userns, STATX_ATTR_NODUMP); generic_fillattr(&init_user_ns, inode, stat); - if (shmem_is_huge(NULL, inode, 0)) + if (shmem_is_huge(NULL, inode, 0, false)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1900,7 +1902,7 @@ repeat: return 0; } - if (!shmem_is_huge(vma, inode, index)) + if (!shmem_is_huge(vma, inode, index, false)) goto alloc_nohuge; huge_gfp = vma_thp_gfp_mask(vma); -- GitLab From 58ac9a8993a13ebcbb0682ede0e3a158b4a41b28 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:38 -0700 Subject: [PATCH 1131/2223] mm/khugepaged: attempt to map file/shmem-backed pte-mapped THPs by pmds The main benefit of THPs are that they can be mapped at the pmd level, increasing the likelihood of TLB hit and spending less cycles in page table walks. pte-mapped hugepages - that is - hugepage-aligned compound pages of order HPAGE_PMD_ORDER mapped by ptes - although being contiguous in physical memory, don't have this advantage. In fact, one could argue they are detrimental to system performance overall since they occupy a precious hugepage-aligned/sized region of physical memory that could otherwise be used more effectively. Additionally, pte-mapped hugepages can be the cheapest memory to collapse for khugepaged since no new hugepage allocation or copying of memory contents is necessary - we only need to update the mapping page tables. In the anonymous collapse path, we are able to collapse pte-mapped hugepages (albeit, perhaps suboptimally), but the file/shmem path makes no effort when compound pages (of any order) are encountered. Identify pte-mapped hugepages in the file/shmem collapse path. The final step of which makes a racy check of the value of the pmd to ensure it maps a pte table. This should be fine, since races that result in false-positive (i.e. attempt collapse even though we shouldn't) will fail later in collapse_pte_mapped_thp() once we actually lock mmap_lock and reinspect the pmd value. 
Races that result in false-negatives (i.e. where we decide to not attempt collapse, but should have) shouldn't be an issue, since in the worst case, we do nothing - which is what we've done up to this point. We make a similar check in retract_page_tables(). If we do think we've found a pte-mapped hugepage in khugepaged context, attempt to update page tables mapping this hugepage. Note that these collapses still count towards the /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed counter, and if the pte-mapped hugepage was also mapped into multiple processes' address spaces, could be incremented for each page table update. Since we increment the counter when a pte-mapped hugepage is successfully added to the list of to-collapse pte-mapped THPs, it's possible that we never actually update the page table either. This is different from how file/shmem pages_collapsed accounting works today where only a successful page cache update is counted (it's also possible here that no page tables are actually changed). Though it incurs some slop, this is preferred to either not accounting for the event at all, or plumbing through data in struct mm_slot on whether to account for the collapse or not. Also note that work still needs to be done to support arbitrary compound pages, and that this should all be converted to using folios. [shy828301@gmail.com: Spelling mistake, update comment, and add Documentation] Link: https://lore.kernel.org/linux-mm/CAHbLzkpHwZxFzjfX9nxVoRhzup8WMjMfyL6Xiq8mZ9M-N3ombw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220907144521.3115321-3-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-3-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A.
Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 9 ++- include/trace/events/huge_memory.h | 1 + mm/khugepaged.c | 69 +++++++++++++++++++--- 3 files changed, 71 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 8e3418ec4503e..8ee78ec232ebc 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -191,7 +191,14 @@ allocation failure to throttle the next allocation attempt:: /sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs -The khugepaged progress can be seen in the number of pages collapsed:: +The khugepaged progress can be seen in the number of pages collapsed (note +that this counter may not be an exact count of the number of pages +collapsed, since "collapsed" could mean multiple things: (1) A PTE mapping +being replaced by a PMD mapping, or (2) All 4K physical pages replaced by +one 2M hugepage. Each may happen independently, or together, depending on +the type of memory and the failures that occur. 
As such, this value should +be interpreted roughly as a sign of progress, and counters in /proc/vmstat +consulted for more accurate accounting):: /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 55392bf30a034..fbbb25494d603 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -17,6 +17,7 @@ EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \ EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \ + EM( SCAN_PTE_MAPPED_HUGEPAGE, "pte_mapped_hugepage") \ EM( SCAN_PAGE_RO, "no_writable_page") \ EM( SCAN_LACK_REFERENCED_PAGE, "lack_referenced_page") \ EM( SCAN_PAGE_NULL, "page_null") \ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b3ebe90a66d99..b1e3f83c4eb2d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -35,6 +35,7 @@ enum scan_result { SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, + SCAN_PTE_MAPPED_HUGEPAGE, SCAN_PAGE_RO, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, @@ -1320,20 +1321,24 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) * Notify khugepaged that given addr of the mm is pte-mapped THP. Then * khugepaged should try to collapse the page table. 
*/ -static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm, +static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) { struct khugepaged_mm_slot *mm_slot; struct mm_slot *slot; + bool ret = false; VM_BUG_ON(addr & ~HPAGE_PMD_MASK); spin_lock(&khugepaged_mm_lock); slot = mm_slot_lookup(mm_slots_hash, mm); mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); - if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) + if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) { mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; + ret = true; + } spin_unlock(&khugepaged_mm_lock); + return ret; } static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, @@ -1370,9 +1375,16 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) pte_t *start_pte, *pte; pmd_t *pmd; spinlock_t *ptl; - int count = 0; + int count = 0, result = SCAN_FAIL; int i; + mmap_assert_write_locked(mm); + + /* Fast check before locking page if not PMD mapping PTE table */ + result = find_pmd_or_thp_or_none(mm, haddr, &pmd); + if (result != SCAN_SUCCEED) + return; + if (!vma || !vma->vm_file || !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) return; @@ -1726,9 +1738,16 @@ static int collapse_file(struct mm_struct *mm, struct file *file, /* * If file was truncated then extended, or hole-punched, before * we locked the first page, then a THP might be there already. + * This will be discovered on the first iteration. */ if (PageTransCompound(page)) { - result = SCAN_PAGE_COMPOUND; + struct page *head = compound_head(page); + + result = compound_order(head) == HPAGE_PMD_ORDER && + head->index == start + /* Maybe PMD-mapped */ + ? 
SCAN_PTE_MAPPED_HUGEPAGE + : SCAN_PAGE_COMPOUND; goto out_unlock; } @@ -1962,11 +1981,23 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, } /* - * XXX: khugepaged should compact smaller compound pages + * TODO: khugepaged should compact smaller compound pages * into a PMD sized page */ if (PageTransCompound(page)) { - result = SCAN_PAGE_COMPOUND; + struct page *head = compound_head(page); + + result = compound_order(head) == HPAGE_PMD_ORDER && + head->index == start + /* Maybe PMD-mapped */ + ? SCAN_PTE_MAPPED_HUGEPAGE + : SCAN_PAGE_COMPOUND; + /* + * For SCAN_PTE_MAPPED_HUGEPAGE, further processing + * by the caller won't touch the page cache, and so + * it's safe to skip LRU and refcount checks before + * returning. + */ break; } @@ -2026,6 +2057,12 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) { } + +static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ + return false; +} #endif static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, @@ -2118,8 +2155,26 @@ skip: &mmap_locked, cc); } - if (*result == SCAN_SUCCEED) + switch (*result) { + case SCAN_PTE_MAPPED_HUGEPAGE: { + pmd_t *pmd; + + *result = find_pmd_or_thp_or_none(mm, + khugepaged_scan.address, + &pmd); + if (*result != SCAN_SUCCEED) + break; + if (!khugepaged_add_pte_mapped_thp(mm, + khugepaged_scan.address)) + break; + } fallthrough; + case SCAN_SUCCEED: ++khugepaged_pages_collapsed; + break; + default: + break; + } + /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; progress += HPAGE_PMD_NR; -- GitLab From 34488399fa08faaf664743fa54b271eb6f9e1321 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:39 -0700 Subject: [PATCH 1132/2223] mm/madvise: add file and shmem support to MADV_COLLAPSE Add support for MADV_COLLAPSE to collapse shmem-backed and file-backed memory into 
THPs (requires CONFIG_READ_ONLY_THP_FOR_FS=y). On success, the backing memory will be a hugepage. For the memory range and process provided, the page tables will synchronously have a huge pmd installed, mapping the THP. Other mappings of the file extent mapped by the memory range may be added to a set of entries that khugepaged will later process and attempt to update their page tables to map the THP by a pmd. This functionality unlocks two important uses: (1) Immediately back executable text by THPs. Current support provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large system which might impair services from serving at their full rated load after (re)starting. Tricks like mremap(2)'ing text onto anonymous memory to immediately realize iTLB performance prevents page sharing and demand paging, both of which increase steady state memory footprint. Now, we can have the best of both worlds: Peak upfront performance and lower RAM footprints. (2) userfaultfd-based live migration of virtual machines satisfy UFFD faults by fetching native-sized pages over the network (to avoid latency of transferring an entire hugepage). However, after guest memory has been fully copied to the new host, MADV_COLLAPSE can be used to immediately increase guest performance. Since khugepaged is single threaded, this change now introduces the possibility of collapse contexts racing in the file collapse path. There are a few important places to consider: (1) hpage_collapse_scan_file(), when we xas_pause() and drop RCU. We could have the memory collapsed out from under us, but the next xas_for_each() iteration will correctly pick up the hugepage. The hugepage might not be up to date (insofar as copying of small page contents might not have completed - the page still may be locked), but regardless what small page index we were iterating over, we'll find the hugepage and identify it as a suitably aligned compound page of order HPAGE_PMD_ORDER.
In khugepaged path, we locklessly check the value of the pmd, and only add it to deferred collapse array if we find pmd mapping pte table. This is fine, since other values that could have raced in right afterwards denote failure, or that the memory was successfully collapsed, so we don't need further processing. In madvise path, we'll take mmap_lock() in write to serialize against page table updates and will know what to do based on the true value of the pmd: recheck all ptes if we point to a pte table, directly install the pmd, if the pmd has been cleared, but memory not yet faulted, or nothing at all if we find a huge pmd. It's worth putting emphasis on how we treat the none pmd here. If khugepaged has processed this mm's page tables already, it will have left the pmd cleared (ready for refault by the process). Depending on the VMA flags and sysfs settings, amount of RAM on the machine, and the current load, this could be a relatively common occurrence - and as such is one we'd like to handle successfully in MADV_COLLAPSE. When we see the none pmd in collapse_pte_mapped_thp(), we've locked mmap_lock in write and checked (a) hugepage_vma_check() to see if the backing memory is appropriate still, along with VMA sizing and appropriate hugepage alignment within the file, and (b) we've found a hugepage head of order HPAGE_PMD_ORDER at the offset in the file mapped by our hugepage-aligned virtual address. Even though the common case is likely a race with khugepaged, given these checks (regardless how we got here - we could be operating on a completely different file than originally checked in hpage_collapse_scan_file() for all we know) it should be safe to directly make the pmd a huge pmd pointing to this hugepage.
(2) collapse_file() is mostly serialized on the same file extent by lock sequence: | lock hugepage | lock mapping->i_pages | lock 1st page | unlock mapping->i_pages | | lock mapping->i_pages | page_ref_freeze(3) | xas_store(hugepage) | unlock mapping->i_pages | page_ref_unfreeze(1) | unlock 1st page V unlock hugepage Once a context (who already has their fresh hugepage locked) locks mapping->i_pages exclusively, it will hold said lock until it locks the first page, and it will hold that lock until after the hugepage has been added to the page cache (and will unlock the hugepage after page table update, though that isn't important here). A racing context that loses the race for mapping->i_pages will then lose the race to locking the first page. Here - depending on how far the other racing context has gotten - we might find the new hugepage (in which case we'll exit cleanly when we check PageTransCompound()), or we'll find the "old" 1st small page (in which case we'll exit cleanly when we discover unexpected refcount of 2 after isolate_lru_page()). This is assuming we are able to successfully lock the page we find - in shmem path, we could just fail the trylock and exit cleanly anyways. Failure path in collapse_file() is similar: once we hold lock on 1st small page, we are serialized against other collapse contexts. Before the 1st small page is unlocked, we add it back to the pagecache and unfreeze the refcount appropriately. Contexts who lost the race to the 1st small page will then find the same 1st small page with the correct refcount and will be able to proceed.
[zokeefe@google.com: don't check pmd value twice in collapse_pte_mapped_thp()] Link: https://lkml.kernel.org/r/20220927033854.477018-1-zokeefe@google.com [shy828301@gmail.com: Delete hugepage_vma_revalidate_anon(), remove check for multi-add in khugepaged_add_pte_mapped_thp()] Link: https://lore.kernel.org/linux-mm/CAHbLzkrtpM=ic7cYAHcqkubah5VTR8N5=k5RT8MTvv5rN1Y91w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220907144521.3115321-4-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-4-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 13 +- include/trace/events/huge_memory.h | 1 + kernel/events/uprobes.c | 2 +- mm/khugepaged.c | 245 ++++++++++++++++++++++------- 4 files changed, 198 insertions(+), 63 deletions(-) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 384f034ae947f..70162d707caf0 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -16,11 +16,13 @@ extern void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); #ifdef CONFIG_SHMEM -extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); +extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd); #else -static inline void collapse_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr) +static inline int collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr, bool install_pmd) { + return 0; } #endif @@ -46,9 +48,10 @@ static inline void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { } -static inline 
void collapse_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr) +static inline int collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr, bool install_pmd) { + return 0; } static inline void khugepaged_min_free_kbytes_update(void) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index fbbb25494d603..df33453b70fcf 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -11,6 +11,7 @@ EM( SCAN_FAIL, "failed") \ EM( SCAN_SUCCEED, "succeeded") \ EM( SCAN_PMD_NULL, "pmd_null") \ + EM( SCAN_PMD_NONE, "pmd_none") \ EM( SCAN_PMD_MAPPED, "page_pmd_mapped") \ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e0a9b945e7bc0..d9e357b7e17c9 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -555,7 +555,7 @@ put_old: /* try collapse pmd for compound page */ if (!ret && orig_page_huge) - collapse_pte_mapped_thp(mm, vaddr); + collapse_pte_mapped_thp(mm, vaddr, false); return ret; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b1e3f83c4eb2d..3bd6e2a741631 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -29,6 +29,7 @@ enum scan_result { SCAN_FAIL, SCAN_SUCCEED, SCAN_PMD_NULL, + SCAN_PMD_NONE, SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, @@ -821,6 +822,7 @@ static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node) */ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, + bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc) { @@ -845,8 +847,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, * hugepage_vma_check may return true for qualified file * vmas. 
*/ - if (!vma->anon_vma || !vma_is_anonymous(vma)) - return SCAN_VMA_CHECK; + if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) + return SCAN_PAGE_ANON; return SCAN_SUCCEED; } @@ -866,8 +868,8 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, /* See comments in pmd_none_or_trans_huge_or_clear_bad() */ barrier(); #endif - if (!pmd_present(pmde)) - return SCAN_PMD_NULL; + if (pmd_none(pmde)) + return SCAN_PMD_NONE; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; if (pmd_bad(pmde)) @@ -995,7 +997,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, goto out_nolock; mmap_read_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma, cc); + result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; @@ -1026,7 +1028,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, * handled by the anon_vma lock + PG_lock. */ mmap_write_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma, cc); + result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ @@ -1320,6 +1322,26 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) /* * Notify khugepaged that given addr of the mm is pte-mapped THP. Then * khugepaged should try to collapse the page table. + * + * Note that following race exists: + * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A, + * emptying the A's ->pte_mapped_thp[] array. + * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and + * retract_page_tables() finds a VMA in mm_struct A mapping the same extent + * (at virtual address X) and adds an entry (for X) into mm_struct A's + * ->pte-mapped_thp[] array. 
+ * (3) khugepaged calls khugepaged_collapse_scan_file() for mm_struct A at X, + * sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry + * (for X) into mm_struct A's ->pte-mapped_thp[] array. + * Thus, it's possible the same address is added multiple times for the same + * mm_struct. Should this happen, we'll simply attempt + * collapse_pte_mapped_thp() multiple times for the same address, under the same + * exclusive mmap_lock, and assuming the first call is successful, subsequent + * attempts will return quickly (without grabbing any additional locks) when + * a huge pmd is found in find_pmd_or_thp_or_none(). Since this is a cheap + * check, and since this is a rare occurrence, the cost of preventing this + * "multiple-add" is thought to be more expensive than just handling it, should + * it occur. */ static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) @@ -1341,6 +1363,27 @@ static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, return ret; } +/* hpage must be locked, and mmap_lock must be held in write */ +static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmdp, struct page *hpage) +{ + struct vm_fault vmf = { + .vma = vma, + .address = addr, + .flags = 0, + .pmd = pmdp, + }; + + VM_BUG_ON(!PageTransHuge(hpage)); + mmap_assert_write_locked(vma->vm_mm); + + if (do_set_pmd(&vmf, hpage)) + return SCAN_FAIL; + + get_page(hpage); + return SCAN_SUCCEED; +} + static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { @@ -1362,12 +1405,14 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v * * @mm: process address space where collapse happens * @addr: THP collapse address + * @install_pmd: If a huge PMD should be installed * * This function checks whether all the PTEs in the PMD are pointing to the * right THP. If so, retract the page table so the THP can refault in with - * as pmd-mapped. 
+ * as pmd-mapped. Possibly install a huge PMD mapping the THP. */ -void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) +int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd) { unsigned long haddr = addr & HPAGE_PMD_MASK; struct vm_area_struct *vma = vma_lookup(mm, haddr); @@ -1380,14 +1425,14 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) mmap_assert_write_locked(mm); - /* Fast check before locking page if not PMD mapping PTE table */ + /* Fast check before locking page if already PMD-mapped */ result = find_pmd_or_thp_or_none(mm, haddr, &pmd); - if (result != SCAN_SUCCEED) - return; + if (result == SCAN_PMD_MAPPED) + return result; if (!vma || !vma->vm_file || !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) - return; + return SCAN_VMA_CHECK; /* * If we are here, we've succeeded in replacing all the native pages @@ -1397,27 +1442,43 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) * analogously elide sysfs THP settings here. */ if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) - return; + return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ if (userfaultfd_wp(vma)) - return; + return SCAN_PTE_UFFD_WP; hpage = find_lock_page(vma->vm_file->f_mapping, linear_page_index(vma, haddr)); if (!hpage) - return; + return SCAN_PAGE_NULL; - if (!PageHead(hpage)) + if (!PageHead(hpage)) { + result = SCAN_FAIL; goto drop_hpage; + } - if (compound_order(hpage) != HPAGE_PMD_ORDER) + if (compound_order(hpage) != HPAGE_PMD_ORDER) { + result = SCAN_PAGE_COMPOUND; goto drop_hpage; + } - if (find_pmd_or_thp_or_none(mm, haddr, &pmd) != SCAN_SUCCEED) + switch (result) { + case SCAN_SUCCEED: + break; + case SCAN_PMD_NONE: + /* + * In MADV_COLLAPSE path, possible race with khugepaged where + * all pte entries have been removed and pmd cleared. If so, + * skip all the pte checks and just update the pmd mapping. 
+ */ + goto maybe_install_pmd; + default: goto drop_hpage; + } start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); + result = SCAN_FAIL; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; @@ -1429,8 +1490,10 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) continue; /* page swapped out, abort */ - if (!pte_present(*pte)) + if (!pte_present(*pte)) { + result = SCAN_PTE_NON_PRESENT; goto abort; + } page = vm_normal_page(vma, addr, *pte); if (WARN_ON_ONCE(page && is_zone_device_page(page))) @@ -1465,12 +1528,19 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count); } - /* step 4: collapse pmd */ + /* step 4: remove pte entries */ collapse_and_free_pmd(mm, vma, haddr, pmd); + +maybe_install_pmd: + /* step 5: install pmd entry */ + result = install_pmd + ? set_huge_pmd(vma, haddr, pmd, hpage) + : SCAN_SUCCEED; + drop_hpage: unlock_page(hpage); put_page(hpage); - return; + return result; abort: pte_unmap_unlock(start_pte, ptl); @@ -1493,22 +1563,29 @@ static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_sl goto out; for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) - collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]); + collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false); out: mm_slot->nr_pte_mapped_thp = 0; mmap_write_unlock(mm); } -static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) +static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, + struct mm_struct *target_mm, + unsigned long target_addr, struct page *hpage, + struct collapse_control *cc) { struct vm_area_struct *vma; - struct mm_struct *mm; - unsigned long addr; - pmd_t *pmd; + int target_result = SCAN_FAIL; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { + int result = SCAN_FAIL; + struct mm_struct *mm = NULL; + 
unsigned long addr = 0; + pmd_t *pmd; + bool is_target = false; + /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that * got written to. These VMAs are likely not worth investing @@ -1525,24 +1602,34 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * ptl. It has higher chance to recover THP for the VMA, but * has higher cost too. */ - if (vma->anon_vma) - continue; + if (vma->anon_vma) { + result = SCAN_PAGE_ANON; + goto next; + } addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (addr & ~HPAGE_PMD_MASK) - continue; - if (vma->vm_end < addr + HPAGE_PMD_SIZE) - continue; + if (addr & ~HPAGE_PMD_MASK || + vma->vm_end < addr + HPAGE_PMD_SIZE) { + result = SCAN_VMA_CHECK; + goto next; + } mm = vma->vm_mm; - if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) - continue; + is_target = mm == target_mm && addr == target_addr; + result = find_pmd_or_thp_or_none(mm, addr, &pmd); + if (result != SCAN_SUCCEED) + goto next; /* * We need exclusive mmap_lock to retract page table. * * We use trylock due to lock inversion: we need to acquire * mmap_lock while holding page lock. Fault path does it in * reverse order. Trylock is a way to avoid deadlock. + * + * Also, it's not MADV_COLLAPSE's job to collapse other + * mappings - let khugepaged take care of them later. */ - if (mmap_write_trylock(mm)) { + result = SCAN_PTE_MAPPED_HUGEPAGE; + if ((cc->is_khugepaged || is_target) && + mmap_write_trylock(mm)) { /* * When a vma is registered with uffd-wp, we can't * recycle the pmd pgtable because there can be pte @@ -1551,22 +1638,45 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * it'll always mapped in small page size for uffd-wp * registered ranges. 
*/ - if (!hpage_collapse_test_exit(mm) && - !userfaultfd_wp(vma)) - collapse_and_free_pmd(mm, vma, addr, pmd); + if (hpage_collapse_test_exit(mm)) { + result = SCAN_ANY_PROCESS; + goto unlock_next; + } + if (userfaultfd_wp(vma)) { + result = SCAN_PTE_UFFD_WP; + goto unlock_next; + } + collapse_and_free_pmd(mm, vma, addr, pmd); + if (!cc->is_khugepaged && is_target) + result = set_huge_pmd(vma, addr, pmd, hpage); + else + result = SCAN_SUCCEED; + +unlock_next: mmap_write_unlock(mm); - } else { - /* Try again later */ + goto next; + } + /* + * Calling context will handle target mm/addr. Otherwise, let + * khugepaged try again later. + */ + if (!is_target) { khugepaged_add_pte_mapped_thp(mm, addr); + continue; } +next: + if (is_target) + target_result = result; } i_mmap_unlock_write(mapping); + return target_result; } /** * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. * * @mm: process address space where collapse happens + * @addr: virtual collapse start address * @file: file that collapse on * @start: collapse start address * @cc: collapse context and scratchpad @@ -1586,8 +1696,9 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * + restore gaps in the page cache; * + unlock and free huge page; */ -static int collapse_file(struct mm_struct *mm, struct file *file, - pgoff_t start, struct collapse_control *cc) +static int collapse_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; struct page *hpage; @@ -1895,7 +2006,8 @@ xa_unlocked: /* * Remove pte page tables, so we can re-fault the page as huge. 
*/ - retract_page_tables(mapping, start); + result = retract_page_tables(mapping, start, mm, addr, hpage, + cc); unlock_page(hpage); hpage = NULL; } else { @@ -1951,8 +2063,9 @@ out: return result; } -static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, - pgoff_t start, struct collapse_control *cc) +static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { struct page *page = NULL; struct address_space *mapping = file->f_mapping; @@ -2040,7 +2153,7 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { - result = collapse_file(mm, file, start, cc); + result = collapse_file(mm, addr, file, start, cc); } } @@ -2048,8 +2161,9 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, return result; } #else -static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, - pgoff_t start, struct collapse_control *cc) +static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { BUILD_BUG(); } @@ -2145,8 +2259,9 @@ skip: khugepaged_scan.address); mmap_read_unlock(mm); - *result = khugepaged_scan_file(mm, file, pgoff, - cc); + *result = hpage_collapse_scan_file(mm, + khugepaged_scan.address, + file, pgoff, cc); mmap_locked = false; fput(file); } else { @@ -2453,10 +2568,6 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, *prev = vma; - /* TODO: Support file/shmem */ - if (!vma->anon_vma || !vma_is_anonymous(vma)) - return -EINVAL; - if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) return -EINVAL; @@ -2479,7 +2590,8 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, cond_resched(); mmap_read_lock(mm); mmap_locked = true; - result = hugepage_vma_revalidate(mm, addr, &vma, cc); 
+ result = hugepage_vma_revalidate(mm, addr, false, &vma, + cc); if (result != SCAN_SUCCEED) { last_fail = result; goto out_nolock; @@ -2489,16 +2601,35 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, } mmap_assert_locked(mm); memset(cc->node_load, 0, sizeof(cc->node_load)); - result = hpage_collapse_scan_pmd(mm, vma, addr, &mmap_locked, - cc); + if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { + struct file *file = get_file(vma->vm_file); + pgoff_t pgoff = linear_page_index(vma, addr); + + mmap_read_unlock(mm); + mmap_locked = false; + result = hpage_collapse_scan_file(mm, addr, file, pgoff, + cc); + fput(file); + } else { + result = hpage_collapse_scan_pmd(mm, vma, addr, + &mmap_locked, cc); + } if (!mmap_locked) *prev = NULL; /* Tell caller we dropped mmap_lock */ +handle_result: switch (result) { case SCAN_SUCCEED: case SCAN_PMD_MAPPED: ++thps; break; + case SCAN_PTE_MAPPED_HUGEPAGE: + BUG_ON(mmap_locked); + BUG_ON(*prev); + mmap_write_lock(mm); + result = collapse_pte_mapped_thp(mm, addr, true); + mmap_write_unlock(mm); + goto handle_result; /* Whitelisted set of results where continuing OK */ case SCAN_PMD_NULL: case SCAN_PTE_NON_PRESENT: -- GitLab From d41fd2016ed07a630da2817b76c98eeab7931e1e Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:40 -0700 Subject: [PATCH 1133/2223] mm/khugepaged: add tracepoint to hpage_collapse_scan_file() Add huge_memory:trace_mm_khugepaged_scan_file tracepoint to hpage_collapse_scan_file() analogously to hpage_collapse_scan_pmd(). While this change is targeted at debugging MADV_COLLAPSE pathway, the "mm_khugepaged" prefix is retained for symmetry with huge_memory:trace_mm_khugepaged_scan_pmd, which retains its legacy name to prevent changing kernel ABI as much as possible.
Link: https://lkml.kernel.org/r/20220907144521.3115321-5-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-5-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 34 ++++++++++++++++++++++++++++++ mm/khugepaged.c | 3 ++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index df33453b70fcf..935af49479173 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -169,5 +169,39 @@ TRACE_EVENT(mm_collapse_huge_page_swapin, __entry->ret) ); +TRACE_EVENT(mm_khugepaged_scan_file, + + TP_PROTO(struct mm_struct *mm, struct page *page, const char *filename, + int present, int swap, int result), + + TP_ARGS(mm, page, filename, present, swap, result), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __field(unsigned long, pfn) + __string(filename, filename) + __field(int, present) + __field(int, swap) + __field(int, result) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->pfn = page ? 
page_to_pfn(page) : -1; + __assign_str(filename, filename); + __entry->present = present; + __entry->swap = swap; + __entry->result = result; + ), + + TP_printk("mm=%p, scan_pfn=0x%lx, filename=%s, present=%d, swap=%d, result=%s", + __entry->mm, + __entry->pfn, + __get_str(filename), + __entry->present, + __entry->swap, + __print_symbolic(__entry->result, SCAN_STATUS)) +); + #endif /* __HUGE_MEMORY_H */ #include diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 3bd6e2a741631..c7699fabf302f 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2157,7 +2157,8 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, } } - /* TODO: tracepoints */ + trace_mm_khugepaged_scan_file(mm, page, file->f_path.dentry->d_iname, + present, swap, result); return result; } #else -- GitLab From c07c343cda8ef02985ac6583a2e5af892726f734 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:41 -0700 Subject: [PATCH 1134/2223] selftests/vm: dedup THP helpers These files: tools/testing/selftests/vm/vm_util.c tools/testing/selftests/vm/khugepaged.c Both contain logic to: 1) Determine hugepage size on current system 2) Read /proc/self/smaps to determine number of THPs at an address Refactor selftests/vm/khugepaged.c to use the vm_util common helpers and add it as a build dependency. Since selftests/vm/khugepaged.c is the largest user of check_huge(), change the signature of check_huge() to match selftests/vm/khugepaged.c's usage: take an expected number of hugepages, and return a bool indicating if the correct number of hugepages were found. Add a wrapper, check_huge_anon(), in anticipation of checking smaps for file and shmem hugepages. Update existing callsites to use the new pattern / function. Likewise, check_for_pattern() was duplicated, and it's a general enough helper to include in vm_util helpers as well.
Link: https://lkml.kernel.org/r/20220907144521.3115321-6-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-6-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Zi Yan Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/khugepaged.c | 64 ++----------------- tools/testing/selftests/vm/soft-dirty.c | 2 +- .../selftests/vm/split_huge_page_test.c | 12 ++-- tools/testing/selftests/vm/vm_util.c | 26 +++++--- tools/testing/selftests/vm/vm_util.h | 3 +- 6 files changed, 32 insertions(+), 76 deletions(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 4ae879f70f4c3..c9c0996c122b4 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -95,6 +95,7 @@ TEST_FILES += va_128TBswitch.sh include ../lib.mk +$(OUTPUT)/khugepaged: vm_util.c $(OUTPUT)/madv_populate: vm_util.c $(OUTPUT)/soft-dirty: vm_util.c $(OUTPUT)/split_huge_page_test: vm_util.c diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index b55dc331af139..235a64b4458c3 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -12,6 +12,8 @@ #include #include +#include "vm_util.h" + #ifndef MADV_PAGEOUT #define MADV_PAGEOUT 21 #endif @@ -352,64 +354,12 @@ static void save_settings(void) signal(SIGQUIT, restore_settings); } -#define MAX_LINE_LENGTH 500 - -static bool check_for_pattern(FILE *fp, char *pattern, char *buf) -{ - while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { - if (!strncmp(buf, pattern, strlen(pattern))) - return true; - } - return false; -} - static bool 
check_huge(void *addr, int nr_hpages) { - bool thp = false; - int ret; - FILE *fp; - char buffer[MAX_LINE_LENGTH]; - char addr_pattern[MAX_LINE_LENGTH]; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", - (unsigned long) addr); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); - exit(EXIT_FAILURE); - } - - - fp = fopen(PID_SMAPS, "r"); - if (!fp) { - printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); - exit(EXIT_FAILURE); - } - if (!check_for_pattern(fp, addr_pattern, buffer)) - goto err_out; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "AnonHugePages:%10ld kB", - nr_hpages * (hpage_pmd_size >> 10)); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); - exit(EXIT_FAILURE); - } - /* - * Fetch the AnonHugePages: in the same block and check whether it got - * the expected number of hugeepages next. - */ - if (!check_for_pattern(fp, "AnonHugePages:", buffer)) - goto err_out; - - if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) - goto err_out; - - thp = true; -err_out: - fclose(fp); - return thp; + return check_huge_anon(addr, nr_hpages, hpage_pmd_size); } - +#define MAX_LINE_LENGTH 500 static bool check_swap(void *addr, unsigned long size) { bool swap = false; @@ -431,7 +381,7 @@ static bool check_swap(void *addr, unsigned long size) printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); exit(EXIT_FAILURE); } - if (!check_for_pattern(fp, addr_pattern, buffer)) + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) goto err_out; ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", @@ -444,7 +394,7 @@ static bool check_swap(void *addr, unsigned long size) * Fetch the Swap: in the same block and check whether it got * the expected number of hugeepages next. 
*/ - if (!check_for_pattern(fp, "Swap:", buffer)) + if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer))) goto err_out; if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) @@ -1066,7 +1016,7 @@ int main(int argc, const char **argv) setbuf(stdout, NULL); page_size = getpagesize(); - hpage_pmd_size = read_num("hpage_pmd_size"); + hpage_pmd_size = read_pmd_pagesize(); hpage_pmd_nr = hpage_pmd_size / page_size; default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; diff --git a/tools/testing/selftests/vm/soft-dirty.c b/tools/testing/selftests/vm/soft-dirty.c index e3a43f5d4fa2b..21d8830c5f243 100644 --- a/tools/testing/selftests/vm/soft-dirty.c +++ b/tools/testing/selftests/vm/soft-dirty.c @@ -91,7 +91,7 @@ static void test_hugepage(int pagemap_fd, int pagesize) for (i = 0; i < hpage_len; i++) map[i] = (char)i; - if (check_huge(map)) { + if (check_huge_anon(map, 1, hpage_len)) { ksft_test_result_pass("Test %s huge page allocation\n", __func__); clear_softdirty(); diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c index 6aa2b8253aeda..76e1c36dd9e57 100644 --- a/tools/testing/selftests/vm/split_huge_page_test.c +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -92,7 +92,6 @@ void split_pmd_thp(void) { char *one_page; size_t len = 4 * pmd_pagesize; - uint64_t thp_size; size_t i; one_page = memalign(pmd_pagesize, len); @@ -107,8 +106,7 @@ void split_pmd_thp(void) for (i = 0; i < len; i++) one_page[i] = (char)i; - thp_size = check_huge(one_page); - if (!thp_size) { + if (!check_huge_anon(one_page, 1, pmd_pagesize)) { printf("No THP is allocated\n"); exit(EXIT_FAILURE); } @@ -124,9 +122,8 @@ void split_pmd_thp(void) } - thp_size = check_huge(one_page); - if (thp_size) { - printf("Still %ld kB AnonHugePages not split\n", thp_size); + if (check_huge_anon(one_page, 0, pmd_pagesize)) { + printf("Still AnonHugePages not split\n"); exit(EXIT_FAILURE); } @@ -172,8 +169,7 @@ void 
split_pte_mapped_thp(void) for (i = 0; i < len; i++) one_page[i] = (char)i; - thp_size = check_huge(one_page); - if (!thp_size) { + if (!check_huge_anon(one_page, 1, pmd_pagesize)) { printf("No THP is allocated\n"); exit(EXIT_FAILURE); } diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c index b58ab11a7a302..9dae51b8219f1 100644 --- a/tools/testing/selftests/vm/vm_util.c +++ b/tools/testing/selftests/vm/vm_util.c @@ -42,9 +42,9 @@ void clear_softdirty(void) ksft_exit_fail_msg("writing clear_refs failed\n"); } -static bool check_for_pattern(FILE *fp, const char *pattern, char *buf) +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len) { - while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { + while (fgets(buf, len, fp)) { if (!strncmp(buf, pattern, strlen(pattern))) return true; } @@ -72,9 +72,10 @@ uint64_t read_pmd_pagesize(void) return strtoul(buf, NULL, 10); } -uint64_t check_huge(void *addr) +bool __check_huge(void *addr, char *pattern, int nr_hpages, + uint64_t hpage_size) { - uint64_t thp = 0; + uint64_t thp = -1; int ret; FILE *fp; char buffer[MAX_LINE_LENGTH]; @@ -89,20 +90,27 @@ uint64_t check_huge(void *addr) if (!fp) ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH); - if (!check_for_pattern(fp, addr_pattern, buffer)) + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) goto err_out; /* - * Fetch the AnonHugePages: in the same block and check the number of + * Fetch the pattern in the same block and check the number of * hugepages. 
*/ - if (!check_for_pattern(fp, "AnonHugePages:", buffer)) + if (!check_for_pattern(fp, pattern, buffer, sizeof(buffer))) goto err_out; - if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) + snprintf(addr_pattern, MAX_LINE_LENGTH, "%s%%9ld kB", pattern); + + if (sscanf(buffer, addr_pattern, &thp) != 1) ksft_exit_fail_msg("Reading smap error\n"); err_out: fclose(fp); - return thp; + return thp == (nr_hpages * (hpage_size >> 10)); +} + +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size); } diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h index 2e512bd57ae14..8434ea0c95cd7 100644 --- a/tools/testing/selftests/vm/vm_util.h +++ b/tools/testing/selftests/vm/vm_util.h @@ -5,5 +5,6 @@ uint64_t pagemap_get_entry(int fd, char *start); bool pagemap_is_softdirty(int fd, char *start); void clear_softdirty(void); +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); uint64_t read_pmd_pagesize(void); -uint64_t check_huge(void *addr); +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); -- GitLab From 8e638707a3f1a82dccbdc9285980329644946d4f Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:42 -0700 Subject: [PATCH 1135/2223] selftests/vm: modularize thp collapse memory operations Modularize operations to setup, cleanup, fault, and check for huge pages, for a given memory type. This allows reusing existing tests with additional memory types by defining new memory operations. Following patches will add file and shmem memory types. Link: https://lkml.kernel.org/r/20220907144521.3115321-7-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-7-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 373 +++++++++++++----------- 1 file changed, 207 insertions(+), 166 deletions(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 235a64b4458c3..06ea6f18980e2 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -29,8 +29,16 @@ static int hpage_pmd_nr; #define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" #define PID_SMAPS "/proc/self/smaps" +struct mem_ops { + void *(*setup_area)(int nr_hpages); + void (*cleanup_area)(void *p, unsigned long size); + void (*fault)(void *p, unsigned long start, unsigned long end); + bool (*check_huge)(void *addr, int nr_hpages); +}; + struct collapse_context { - void (*collapse)(const char *msg, char *p, int nr_hpages, bool expect); + void (*collapse)(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect); bool enforce_pte_scan_limits; }; @@ -354,11 +362,6 @@ static void save_settings(void) signal(SIGQUIT, restore_settings); } -static bool check_huge(void *addr, int nr_hpages) -{ - return check_huge_anon(addr, nr_hpages, hpage_pmd_size); -} - #define MAX_LINE_LENGTH 500 static bool check_swap(void *addr, unsigned long size) { @@ -452,18 +455,33 @@ retry: * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with * validate_memory()'able contents. */ -static void *alloc_hpage(void) +static void *alloc_hpage(struct mem_ops *ops) { - void *p; + void *p = ops->setup_area(1); - p = alloc_mapping(1); + ops->fault(p, 0, hpage_pmd_size); + + /* + * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE. 
+ * The latter is ineligible for collapse by MADV_COLLAPSE + * while the former might cause MADV_COLLAPSE to race with + * khugepaged on low-load system (like a test machine), which + * would cause MADV_COLLAPSE to fail with EAGAIN. + */ printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p, 1)) - success("OK"); - else - fail("Fail"); + if (madvise_collapse_retry(p, hpage_pmd_size)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (!ops->check_huge(p, 1)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) { + perror("madvise(MADV_HUGEPAGE)"); + exit(EXIT_FAILURE); + } + success("OK"); return p; } @@ -480,18 +498,40 @@ static void validate_memory(int *p, unsigned long start, unsigned long end) } } -static void madvise_collapse(const char *msg, char *p, int nr_hpages, - bool expect) +static void *anon_setup_area(int nr_hpages) +{ + return alloc_mapping(nr_hpages); +} + +static void anon_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); +} + +static void anon_fault(void *p, unsigned long start, unsigned long end) +{ + fill_memory(p, start, end); +} + +static bool anon_check_huge(void *addr, int nr_hpages) +{ + return check_huge_anon(addr, nr_hpages, hpage_pmd_size); +} + +static struct mem_ops anon_ops = { + .setup_area = &anon_setup_area, + .cleanup_area = &anon_cleanup_area, + .fault = &anon_fault, + .check_huge = &anon_check_huge, +}; + +static void __madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) { int ret; struct settings settings = *current_settings(); printf("%s...", msg); - /* Sanity check */ - if (!check_huge(p, 0)) { - printf("Unexpected huge page\n"); - exit(EXIT_FAILURE); - } /* * Prevent khugepaged interference and tests that MADV_COLLAPSE @@ -505,7 +545,7 @@ static void madvise_collapse(const char *msg, char *p, int nr_hpages, ret = 
madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); if (((bool)ret) == expect) fail("Fail: Bad return value"); - else if (check_huge(p, nr_hpages) != expect) + else if (!ops->check_huge(p, expect ? nr_hpages : 0)) fail("Fail: check_huge()"); else success("OK"); @@ -513,14 +553,26 @@ static void madvise_collapse(const char *msg, char *p, int nr_hpages, pop_settings(); } +static void madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + /* Sanity check */ + if (!ops->check_huge(p, 0)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + __madvise_collapse(msg, p, nr_hpages, ops, expect); +} + #define TICK 500000 -static bool wait_for_scan(const char *msg, char *p, int nr_hpages) +static bool wait_for_scan(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops) { int full_scans; int timeout = 6; /* 3 seconds */ /* Sanity check */ - if (!check_huge(p, 0)) { + if (!ops->check_huge(p, 0)) { printf("Unexpected huge page\n"); exit(EXIT_FAILURE); } @@ -532,7 +584,7 @@ static bool wait_for_scan(const char *msg, char *p, int nr_hpages) printf("%s...", msg); while (timeout--) { - if (check_huge(p, nr_hpages)) + if (ops->check_huge(p, nr_hpages)) break; if (read_num("khugepaged/full_scans") >= full_scans) break; @@ -546,19 +598,20 @@ static bool wait_for_scan(const char *msg, char *p, int nr_hpages) } static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, - bool expect) + struct mem_ops *ops, bool expect) { - if (wait_for_scan(msg, p, nr_hpages)) { + if (wait_for_scan(msg, p, nr_hpages, ops)) { if (expect) fail("Timeout"); else success("OK"); return; - } else if (check_huge(p, nr_hpages) == expect) { + } + + if (ops->check_huge(p, expect ? 
nr_hpages : 0)) success("OK"); - } else { + else fail("Fail"); - } } static void alloc_at_fault(void) @@ -572,7 +625,7 @@ static void alloc_at_fault(void) p = alloc_mapping(1); *p = 1; printf("Allocate huge page on fault..."); - if (check_huge(p, 1)) + if (check_huge_anon(p, 1, hpage_pmd_size)) success("OK"); else fail("Fail"); @@ -581,49 +634,48 @@ static void alloc_at_fault(void) madvise(p, page_size, MADV_DONTNEED); printf("Split huge PMD on MADV_DONTNEED..."); - if (check_huge(p, 0)) + if (check_huge_anon(p, 0, hpage_pmd_size)) success("OK"); else fail("Fail"); munmap(p, hpage_pmd_size); } -static void collapse_full(struct collapse_context *c) +static void collapse_full(struct collapse_context *c, struct mem_ops *ops) { void *p; int nr_hpages = 4; unsigned long size = nr_hpages * hpage_pmd_size; - p = alloc_mapping(nr_hpages); - fill_memory(p, 0, size); + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages, - true); + ops, true); validate_memory(p, 0, size); - munmap(p, size); + ops->cleanup_area(p, size); } -static void collapse_empty(struct collapse_context *c) +static void collapse_empty(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(1); - c->collapse("Do not collapse empty PTE table", p, 1, false); - munmap(p, hpage_pmd_size); + p = ops->setup_area(1); + c->collapse("Do not collapse empty PTE table", p, 1, ops, false); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_single_pte_entry(struct collapse_context *c) +static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(1); - fill_memory(p, 0, page_size); + p = ops->setup_area(1); + ops->fault(p, 0, page_size); c->collapse("Collapse PTE table with single PTE entry present", p, - 1, true); - validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + 1, ops, true); + ops->cleanup_area(p, hpage_pmd_size); } -static void 
collapse_max_ptes_none(struct collapse_context *c) +static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_none = hpage_pmd_nr / 2; struct settings settings = *current_settings(); @@ -632,30 +684,30 @@ static void collapse_max_ptes_none(struct collapse_context *c) settings.khugepaged.max_ptes_none = max_ptes_none; push_settings(&settings); - p = alloc_mapping(1); + p = ops->setup_area(1); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, - !c->enforce_pte_scan_limits); + ops, !c->enforce_pte_scan_limits); validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); if (c->enforce_pte_scan_limits) { - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); - c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops, true); validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); } - - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); pop_settings(); } -static void collapse_swapin_single_pte(struct collapse_context *c) +static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(1); - fill_memory(p, 0, hpage_pmd_size); + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); printf("Swapout one page..."); if (madvise(p, page_size, MADV_PAGEOUT)) { @@ -669,20 +721,21 @@ static void collapse_swapin_single_pte(struct collapse_context *c) goto out; } - c->collapse("Collapse with swapping in single PTE entry", p, 1, true); + c->collapse("Collapse with swapping in single PTE entry", p, 1, ops, + true); validate_memory(p, 0, hpage_pmd_size); out: - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void 
collapse_max_ptes_swap(struct collapse_context *c) +static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); void *p; - p = alloc_mapping(1); + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); - fill_memory(p, 0, hpage_pmd_size); printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { perror("madvise(MADV_PAGEOUT)"); @@ -695,12 +748,12 @@ static void collapse_max_ptes_swap(struct collapse_context *c) goto out; } - c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, + c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops, !c->enforce_pte_scan_limits); validate_memory(p, 0, hpage_pmd_size); if (c->enforce_pte_scan_limits) { - fill_memory(p, 0, hpage_pmd_size); + ops->fault(p, 0, hpage_pmd_size); printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr); if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { @@ -715,63 +768,65 @@ static void collapse_max_ptes_swap(struct collapse_context *c) } c->collapse("Collapse with max_ptes_swap pages swapped out", p, - 1, true); + 1, ops, true); validate_memory(p, 0, hpage_pmd_size); } out: - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_single_pte_entry_compound(struct collapse_context *c) +static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_hpage(); + p = alloc_hpage(ops); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); printf("Split huge page leaving single PTE mapping compound page..."); madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); c->collapse("Collapse PTE table with single PTE mapping compound page", - p, 1, true); + p, 1, ops, true); validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); 
+ ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_full_of_compound(struct collapse_context *c) +static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_hpage(); + p = alloc_hpage(ops); printf("Split huge page leaving single PTE page table full of compound pages..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - c->collapse("Collapse PTE table full of compound pages", p, 1, true); + c->collapse("Collapse PTE table full of compound pages", p, 1, ops, + true); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_compound_extreme(struct collapse_context *c) +static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops) { void *p; int i; - p = alloc_mapping(1); + p = ops->setup_area(1); for (i = 0; i < hpage_pmd_nr; i++) { printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", i + 1, hpage_pmd_nr); madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(BASE_ADDR, 0, hpage_pmd_size); - if (!check_huge(BASE_ADDR, 1)) { + ops->fault(BASE_ADDR, 0, hpage_pmd_size); + if (!ops->check_huge(BASE_ADDR, 1)) { printf("Failed to allocate huge page\n"); exit(EXIT_FAILURE); } @@ -798,30 +853,30 @@ static void collapse_compound_extreme(struct collapse_context *c) } } - munmap(BASE_ADDR, hpage_pmd_size); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p, 0)) + ops->cleanup_area(BASE_ADDR, hpage_pmd_size); + ops->fault(p, 0, hpage_pmd_size); + if (!ops->check_huge(p, 1)) success("OK"); else fail("Fail"); c->collapse("Collapse PTE table full of different compound pages", p, 1, - true); + ops, true); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_fork(struct 
collapse_context *c) +static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) { int wstatus; void *p; - p = alloc_mapping(1); + p = ops->setup_area(1); printf("Allocate small page..."); - fill_memory(p, 0, page_size); - if (check_huge(p, 0)) + ops->fault(p, 0, page_size); + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); @@ -832,17 +887,17 @@ static void collapse_fork(struct collapse_context *c) skip_settings_restore = true; exit_status = 0; - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - fill_memory(p, page_size, 2 * page_size); + ops->fault(p, page_size, 2 * page_size); c->collapse("Collapse PTE table with single page shared with parent process", - p, 1, true); + p, 1, ops, true); validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); exit(exit_status); } @@ -850,27 +905,27 @@ static void collapse_fork(struct collapse_context *c) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has small page..."); - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_fork_compound(struct collapse_context *c) +static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops) { int wstatus; void *p; - p = alloc_hpage(); + p = alloc_hpage(ops); printf("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; exit_status = 0; - if (check_huge(p, 1)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -878,20 +933,20 @@ static void collapse_fork_compound(struct collapse_context *c) printf("Split huge page PMD in child process..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else 
fail("Fail"); - fill_memory(p, 0, page_size); + ops->fault(p, 0, page_size); write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); c->collapse("Collapse PTE table full of compound pages in child", - p, 1, true); + p, 1, ops, true); write_num("khugepaged/max_ptes_shared", current_settings()->khugepaged.max_ptes_shared); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); exit(exit_status); } @@ -899,59 +954,59 @@ static void collapse_fork_compound(struct collapse_context *c) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has huge page..."); - if (check_huge(p, 1)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_max_ptes_shared(struct collapse_context *c) +static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); int wstatus; void *p; - p = alloc_hpage(); + p = alloc_hpage(ops); printf("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; exit_status = 0; - if (check_huge(p, 1)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); printf("Trigger CoW on page %d of %d...", hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); - if (check_huge(p, 0)) + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); c->collapse("Maybe collapse with max_ptes_shared exceeded", p, - 1, !c->enforce_pte_scan_limits); + 1, ops, !c->enforce_pte_scan_limits); if (c->enforce_pte_scan_limits) { printf("Trigger CoW on page %d of %d...", hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * + ops->fault(p, 0, 
(hpage_pmd_nr - max_ptes_shared) * page_size); - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); c->collapse("Collapse with max_ptes_shared PTEs shared", - p, 1, true); + p, 1, ops, true); } validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); exit(exit_status); } @@ -959,42 +1014,28 @@ static void collapse_max_ptes_shared(struct collapse_context *c) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has huge page..."); - if (check_huge(p, 1)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void madvise_collapse_existing_thps(void) +static void madvise_collapse_existing_thps(struct collapse_context *c, + struct mem_ops *ops) { void *p; - int err; - p = alloc_mapping(1); - fill_memory(p, 0, hpage_pmd_size); + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + c->collapse("Collapse fully populated PTE table...", p, 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); - printf("Collapse fully populated PTE table..."); - /* - * Note that we don't set MADV_HUGEPAGE here, which - * also tests that VM_HUGEPAGE isn't required for - * MADV_COLLAPSE in "madvise" mode. - */ - err = madvise(p, hpage_pmd_size, MADV_COLLAPSE); - if (err == 0 && check_huge(p, 1)) { - success("OK"); - printf("Re-collapse PMD-mapped hugepage"); - err = madvise(p, hpage_pmd_size, MADV_COLLAPSE); - if (err == 0 && check_huge(p, 1)) - success("OK"); - else - fail("Fail"); - } else { - fail("Fail"); - } + /* c->collapse() will find a hugepage and complain - call directly. 
*/ + __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } int main(int argc, const char **argv) @@ -1034,37 +1075,37 @@ int main(int argc, const char **argv) c.collapse = &khugepaged_collapse; c.enforce_pte_scan_limits = true; - collapse_full(&c); - collapse_empty(&c); - collapse_single_pte_entry(&c); - collapse_max_ptes_none(&c); - collapse_swapin_single_pte(&c); - collapse_max_ptes_swap(&c); - collapse_single_pte_entry_compound(&c); - collapse_full_of_compound(&c); - collapse_compound_extreme(&c); - collapse_fork(&c); - collapse_fork_compound(&c); - collapse_max_ptes_shared(&c); + collapse_full(&c, &anon_ops); + collapse_empty(&c, &anon_ops); + collapse_single_pte_entry(&c, &anon_ops); + collapse_max_ptes_none(&c, &anon_ops); + collapse_swapin_single_pte(&c, &anon_ops); + collapse_max_ptes_swap(&c, &anon_ops); + collapse_single_pte_entry_compound(&c, &anon_ops); + collapse_full_of_compound(&c, &anon_ops); + collapse_compound_extreme(&c, &anon_ops); + collapse_fork(&c, &anon_ops); + collapse_fork_compound(&c, &anon_ops); + collapse_max_ptes_shared(&c, &anon_ops); } if (!strcmp(tests, "madvise") || !strcmp(tests, "all")) { printf("\n*** Testing context: madvise ***\n"); c.collapse = &madvise_collapse; c.enforce_pte_scan_limits = false; - collapse_full(&c); - collapse_empty(&c); - collapse_single_pte_entry(&c); - collapse_max_ptes_none(&c); - collapse_swapin_single_pte(&c); - collapse_max_ptes_swap(&c); - collapse_single_pte_entry_compound(&c); - collapse_full_of_compound(&c); - collapse_compound_extreme(&c); - collapse_fork(&c); - collapse_fork_compound(&c); - collapse_max_ptes_shared(&c); - madvise_collapse_existing_thps(); + collapse_full(&c, &anon_ops); + collapse_empty(&c, &anon_ops); + collapse_single_pte_entry(&c, &anon_ops); + collapse_max_ptes_none(&c, &anon_ops); + collapse_swapin_single_pte(&c, &anon_ops); + 
collapse_max_ptes_swap(&c, &anon_ops); + collapse_single_pte_entry_compound(&c, &anon_ops); + collapse_full_of_compound(&c, &anon_ops); + collapse_compound_extreme(&c, &anon_ops); + collapse_fork(&c, &anon_ops); + collapse_fork_compound(&c, &anon_ops); + collapse_max_ptes_shared(&c, &anon_ops); + madvise_collapse_existing_thps(&c, &anon_ops); } restore_settings(0); -- GitLab From 1b03d0d558a281f12f68e5917dfa781c3b94e074 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:43 -0700 Subject: [PATCH 1136/2223] selftests/vm: add thp collapse file and tmpfs testing Add memory operations for file-backed and tmpfs memory. Call existing tests with these new memory operations to test collapse functionality of khugepaged and MADV_COLLAPSE on file-backed and tmpfs memory. Not all tests are reusable; for example, collapse_swapin_single_pte() which checks swap usage. Refactor test arguments. Usage is now: Usage: ./khugepaged <test type> [dir] <test type> : <context>:<mem_type> <context> : [all|khugepaged|madvise] <mem_type> : [all|anon|file] "file,all" mem_type requires [dir] argument "file,all" mem_type requires kernel built with CONFIG_READ_ONLY_THP_FOR_FS=y if [dir] is a (sub)directory of a tmpfs mount, tmpfs must be mounted with huge=madvise option for khugepaged tests to work Refactor calling tests to make it clear what collapse context / memory operations they support, but only invoke tests requested by user. Also log what test is being run, and with what context / memory, to make test logs more human readable. A new test file is created and deleted for every test to ensure no pages remain in the page cache between tests (tests also may attempt to collapse different amount of memory). For file-backed memory where the file is stored on a block device, disable /sys/block/<device>/queue/read_ahead_kb so that pages don't find their way into the page cache without the tests faulting them in. Add file and shmem wrappers to vm_util to check for file and shmem hugepages in smaps.
[zokeefe@google.com: fix "add thp collapse file and tmpfs testing" for tmpfs] Link: https://lkml.kernel.org/r/20220913212517.3163701-1-zokeefe@google.com Link: https://lkml.kernel.org/r/20220907144521.3115321-8-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-8-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 475 +++++++++++++++++++++--- tools/testing/selftests/vm/vm_util.c | 10 + tools/testing/selftests/vm/vm_util.h | 2 + 3 files changed, 431 insertions(+), 56 deletions(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 06ea6f18980e2..08de6141c2afe 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -1,7 +1,9 @@ #define _GNU_SOURCE +#include #include #include #include +#include #include #include #include @@ -11,12 +13,21 @@ #include #include +#include +#include +#include +#include + +#include "linux/magic.h" #include "vm_util.h" #ifndef MADV_PAGEOUT #define MADV_PAGEOUT 21 #endif +#ifndef MADV_POPULATE_READ +#define MADV_POPULATE_READ 22 +#endif #ifndef MADV_COLLAPSE #define MADV_COLLAPSE 25 #endif @@ -28,20 +39,47 @@ static int hpage_pmd_nr; #define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" #define PID_SMAPS "/proc/self/smaps" +#define TEST_FILE "collapse_test_file" + +#define MAX_LINE_LENGTH 500 + +enum vma_type { + VMA_ANON, + VMA_FILE, + VMA_SHMEM, +}; struct mem_ops { void *(*setup_area)(int nr_hpages); void (*cleanup_area)(void *p, unsigned long size); void (*fault)(void *p, unsigned long start, unsigned long end); bool (*check_huge)(void *addr, int 
nr_hpages); + const char *name; }; +static struct mem_ops *file_ops; +static struct mem_ops *anon_ops; + struct collapse_context { void (*collapse)(const char *msg, char *p, int nr_hpages, struct mem_ops *ops, bool expect); bool enforce_pte_scan_limits; + const char *name; +}; + +static struct collapse_context *khugepaged_context; +static struct collapse_context *madvise_context; + +struct file_info { + const char *dir; + char path[PATH_MAX]; + enum vma_type type; + int fd; + char dev_queue_read_ahead_path[PATH_MAX]; }; +static struct file_info finfo; + enum thp_enabled { THP_ALWAYS, THP_MADVISE, @@ -107,6 +145,7 @@ struct settings { enum shmem_enabled shmem_enabled; bool use_zero_page; struct khugepaged_settings khugepaged; + unsigned long read_ahead_kb; }; static struct settings saved_settings; @@ -125,6 +164,11 @@ static void fail(const char *msg) exit_status++; } +static void skip(const char *msg) +{ + printf(" \e[33m%s\e[0m\n", msg); +} + static int read_file(const char *path, char *buf, size_t buflen) { int fd; @@ -152,13 +196,19 @@ static int write_file(const char *path, const char *buf, size_t buflen) ssize_t numwritten; fd = open(path, O_WRONLY); - if (fd == -1) + if (fd == -1) { + printf("open(%s)\n", path); + exit(EXIT_FAILURE); return 0; + } numwritten = write(fd, buf, buflen - 1); close(fd); - if (numwritten < 1) + if (numwritten < 1) { + printf("write(%s)\n", buf); + exit(EXIT_FAILURE); return 0; + } return (unsigned int) numwritten; } @@ -225,20 +275,11 @@ static void write_string(const char *name, const char *val) } } -static const unsigned long read_num(const char *name) +static const unsigned long _read_num(const char *path) { - char path[PATH_MAX]; char buf[21]; - int ret; - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - - ret = read_file(path, buf, sizeof(buf)); - if (ret < 0) { + if (read_file(path, buf, sizeof(buf)) < 0) { 
perror("read_file(read_num)"); exit(EXIT_FAILURE); } @@ -246,10 +287,9 @@ static const unsigned long read_num(const char *name) return strtoul(buf, NULL, 10); } -static void write_num(const char *name, unsigned long num) +static const unsigned long read_num(const char *name) { char path[PATH_MAX]; - char buf[21]; int ret; ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); @@ -257,6 +297,12 @@ static void write_num(const char *name, unsigned long num) printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); } + return _read_num(path); +} + +static void _write_num(const char *path, unsigned long num) +{ + char buf[21]; sprintf(buf, "%ld", num); if (!write_file(path, buf, strlen(buf) + 1)) { @@ -265,6 +311,19 @@ static void write_num(const char *name, unsigned long num) } } +static void write_num(const char *name, unsigned long num) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + _write_num(path, num); +} + static void write_settings(struct settings *settings) { struct khugepaged_settings *khugepaged = &settings->khugepaged; @@ -284,6 +343,10 @@ static void write_settings(struct settings *settings) write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); + + if (file_ops && finfo.type == VMA_FILE) + _write_num(finfo.dev_queue_read_ahead_path, + settings->read_ahead_kb); } #define MAX_SETTINGS_DEPTH 4 @@ -354,6 +417,10 @@ static void save_settings(void) .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), .pages_to_scan = read_num("khugepaged/pages_to_scan"), }; + if (file_ops && finfo.type == VMA_FILE) + saved_settings.read_ahead_kb = + _read_num(finfo.dev_queue_read_ahead_path); + success("OK"); signal(SIGTERM, restore_settings); @@ -362,7 +429,90 @@ static 
void save_settings(void) signal(SIGQUIT, restore_settings); } -#define MAX_LINE_LENGTH 500 +static void get_finfo(const char *dir) +{ + struct stat path_stat; + struct statfs fs; + char buf[1 << 10]; + char path[PATH_MAX]; + char *str, *end; + + finfo.dir = dir; + stat(finfo.dir, &path_stat); + if (!S_ISDIR(path_stat.st_mode)) { + printf("%s: Not a directory (%s)\n", __func__, finfo.dir); + exit(EXIT_FAILURE); + } + if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, + finfo.dir) >= sizeof(finfo.path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + if (statfs(finfo.dir, &fs)) { + perror("statfs()"); + exit(EXIT_FAILURE); + } + finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE; + if (finfo.type == VMA_SHMEM) + return; + + /* Find owning device's queue/read_ahead_kb control */ + if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + if (read_file(path, buf, sizeof(buf)) < 0) { + perror("read_file(read_num)"); + exit(EXIT_FAILURE); + } + if (strstr(buf, "DEVTYPE=disk")) { + /* Found it */ + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/dev/block/%d:%d/queue/read_ahead_kb", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + if (!strstr(buf, "DEVTYPE=partition")) { + printf("%s: Unknown device type: %s\n", __func__, path); + exit(EXIT_FAILURE); + } + /* + * Partition of block device - need to find actual device. + * Using naming convention that devnameN is partition of + * device devname. 
+ */ + str = strstr(buf, "DEVNAME="); + if (!str) { + printf("%s: Could not read: %s", __func__, path); + exit(EXIT_FAILURE); + } + str += 8; + end = str; + while (*end) { + if (isdigit(*end)) { + *end = '\0'; + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/block/%s/queue/read_ahead_kb", + str) >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + ++end; + } + printf("%s: Could not read: %s\n", __func__, path); + exit(EXIT_FAILURE); +} + static bool check_swap(void *addr, unsigned long size) { bool swap = false; @@ -518,11 +668,91 @@ static bool anon_check_huge(void *addr, int nr_hpages) return check_huge_anon(addr, nr_hpages, hpage_pmd_size); } -static struct mem_ops anon_ops = { +static void *file_setup_area(int nr_hpages) +{ + int fd; + void *p; + unsigned long size; + + unlink(finfo.path); /* Cleanup from previous failed tests */ + printf("Creating %s for collapse%s...", finfo.path, + finfo.type == VMA_SHMEM ? 
" (tmpfs)" : ""); + fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL, + 777); + if (fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + + size = nr_hpages * hpage_pmd_size; + p = alloc_mapping(nr_hpages); + fill_memory(p, 0, size); + write(fd, p, size); + close(fd); + munmap(p, size); + success("OK"); + + printf("Opening %s read only for collapse...", finfo.path); + finfo.fd = open(finfo.path, O_RDONLY, 777); + if (finfo.fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC, + MAP_PRIVATE, finfo.fd, 0); + if (p == MAP_FAILED || p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + + /* Drop page cache */ + write_file("/proc/sys/vm/drop_caches", "3", 2); + success("OK"); + return p; +} + +static void file_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); + unlink(finfo.path); +} + +static void file_fault(void *p, unsigned long start, unsigned long end) +{ + if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) { + perror("madvise(MADV_POPULATE_READ"); + exit(EXIT_FAILURE); + } +} + +static bool file_check_huge(void *addr, int nr_hpages) +{ + switch (finfo.type) { + case VMA_FILE: + return check_huge_file(addr, nr_hpages, hpage_pmd_size); + case VMA_SHMEM: + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); + default: + exit(EXIT_FAILURE); + return false; + } +} + +static struct mem_ops __anon_ops = { .setup_area = &anon_setup_area, .cleanup_area = &anon_cleanup_area, .fault = &anon_fault, .check_huge = &anon_check_huge, + .name = "anon", +}; + +static struct mem_ops __file_ops = { + .setup_area = &file_setup_area, + .cleanup_area = &file_cleanup_area, + .fault = &file_fault, + .check_huge = &file_check_huge, + .name = "file", }; static void __madvise_collapse(const char *msg, char *p, int nr_hpages, @@ -538,6 +768,7 @@ static void __madvise_collapse(const char *msg, char *p, int nr_hpages, * ignores 
/sys/kernel/mm/transparent_hugepage/enabled */ settings.thp_enabled = THP_NEVER; + settings.shmem_enabled = SHMEM_NEVER; push_settings(&settings); /* Clear VM_NOHUGEPAGE */ @@ -608,12 +839,37 @@ static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, return; } + /* + * For file and shmem memory, khugepaged only retracts pte entries after + * putting the new hugepage in the page cache. The hugepage must be + * subsequently refaulted to install the pmd mapping for the mm. + */ + if (ops != &__anon_ops) + ops->fault(p, 0, nr_hpages * hpage_pmd_size); + if (ops->check_huge(p, expect ? nr_hpages : 0)) success("OK"); else fail("Fail"); } +static struct collapse_context __khugepaged_context = { + .collapse = &khugepaged_collapse, + .enforce_pte_scan_limits = true, + .name = "khugepaged", +}; + +static struct collapse_context __madvise_context = { + .collapse = &madvise_collapse, + .enforce_pte_scan_limits = false, + .name = "madvise", +}; + +static bool is_tmpfs(struct mem_ops *ops) +{ + return ops == &__file_ops && finfo.type == VMA_SHMEM; +} + static void alloc_at_fault(void) { struct settings settings = *current_settings(); @@ -686,6 +942,13 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o p = ops->setup_area(1); + if (is_tmpfs(ops)) { + /* shmem pages always in the page cache */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, ops, !c->enforce_pte_scan_limits); @@ -698,6 +961,7 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); } +skip: ops->cleanup_area(p, hpage_pmd_size); pop_settings(); } @@ -781,6 +1045,13 @@ static void collapse_single_pte_entry_compound(struct collapse_context *c, struc p = alloc_hpage(ops); + if (is_tmpfs(ops)) { + /* MADV_DONTNEED won't evict tmpfs pages */ + 
printf("tmpfs..."); + skip("Skip"); + goto skip; + } + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); printf("Split huge page leaving single PTE mapping compound page..."); madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); @@ -792,6 +1063,7 @@ static void collapse_single_pte_entry_compound(struct collapse_context *c, struc c->collapse("Collapse PTE table with single PTE mapping compound page", p, 1, ops, true); validate_memory(p, 0, page_size); +skip: ops->cleanup_area(p, hpage_pmd_size); } @@ -1038,9 +1310,70 @@ static void madvise_collapse_existing_thps(struct collapse_context *c, ops->cleanup_area(p, hpage_pmd_size); } +static void usage(void) +{ + fprintf(stderr, "\nUsage: ./khugepaged [dir]\n\n"); + fprintf(stderr, "\t\t: :\n"); + fprintf(stderr, "\t\t: [all|khugepaged|madvise]\n"); + fprintf(stderr, "\t\t: [all|anon|file]\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); + fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); + fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); + fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n"); + exit(1); +} + +static void parse_test_type(int argc, const char **argv) +{ + char *buf; + const char *token; + + if (argc == 1) { + /* Backwards compatibility */ + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + anon_ops = &__anon_ops; + return; + } + + buf = strdup(argv[1]); + token = strsep(&buf, ":"); + + if (!strcmp(token, "all")) { + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + } else if (!strcmp(token, "khugepaged")) { + khugepaged_context = &__khugepaged_context; + } else if (!strcmp(token, "madvise")) { + madvise_context = &__madvise_context; + } else { + usage(); + } + + if (!buf) + usage(); + + if (!strcmp(buf, "all")) { + file_ops = &__file_ops; + 
anon_ops = &__anon_ops; + } else if (!strcmp(buf, "anon")) { + anon_ops = &__anon_ops; + } else if (!strcmp(buf, "file")) { + file_ops = &__file_ops; + } else { + usage(); + } + + if (!file_ops) + return; + + if (argc != 3) + usage(); +} + int main(int argc, const char **argv) { - struct collapse_context c; struct settings default_settings = { .thp_enabled = THP_MADVISE, .thp_defrag = THP_DEFRAG_ALWAYS, @@ -1051,8 +1384,20 @@ int main(int argc, const char **argv) .alloc_sleep_millisecs = 10, .scan_sleep_millisecs = 10, }, + /* + * When testing file-backed memory, the collapse path + * looks at how many pages are found in the page cache, not + * what pages are mapped. Disable read ahead optimization so + * pages don't find their way into the page cache unless + * we mem_ops->fault() them in. + */ + .read_ahead_kb = 0, }; - const char *tests = argc == 1 ? "all" : argv[1]; + + parse_test_type(argc, argv); + + if (file_ops) + get_finfo(argv[2]); setbuf(stdout, NULL); @@ -1070,43 +1415,61 @@ int main(int argc, const char **argv) alloc_at_fault(); - if (!strcmp(tests, "khugepaged") || !strcmp(tests, "all")) { - printf("\n*** Testing context: khugepaged ***\n"); - c.collapse = &khugepaged_collapse; - c.enforce_pte_scan_limits = true; - - collapse_full(&c, &anon_ops); - collapse_empty(&c, &anon_ops); - collapse_single_pte_entry(&c, &anon_ops); - collapse_max_ptes_none(&c, &anon_ops); - collapse_swapin_single_pte(&c, &anon_ops); - collapse_max_ptes_swap(&c, &anon_ops); - collapse_single_pte_entry_compound(&c, &anon_ops); - collapse_full_of_compound(&c, &anon_ops); - collapse_compound_extreme(&c, &anon_ops); - collapse_fork(&c, &anon_ops); - collapse_fork_compound(&c, &anon_ops); - collapse_max_ptes_shared(&c, &anon_ops); - } - if (!strcmp(tests, "madvise") || !strcmp(tests, "all")) { - printf("\n*** Testing context: madvise ***\n"); - c.collapse = &madvise_collapse; - c.enforce_pte_scan_limits = false; - - collapse_full(&c, &anon_ops); - collapse_empty(&c, &anon_ops); - 
collapse_single_pte_entry(&c, &anon_ops); - collapse_max_ptes_none(&c, &anon_ops); - collapse_swapin_single_pte(&c, &anon_ops); - collapse_max_ptes_swap(&c, &anon_ops); - collapse_single_pte_entry_compound(&c, &anon_ops); - collapse_full_of_compound(&c, &anon_ops); - collapse_compound_extreme(&c, &anon_ops); - collapse_fork(&c, &anon_ops); - collapse_fork_compound(&c, &anon_ops); - collapse_max_ptes_shared(&c, &anon_ops); - madvise_collapse_existing_thps(&c, &anon_ops); - } +#define TEST(t, c, o) do { \ + if (c && o) { \ + printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \ + t(c, o); \ + } \ + } while (0) + + TEST(collapse_full, khugepaged_context, anon_ops); + TEST(collapse_full, khugepaged_context, file_ops); + TEST(collapse_full, madvise_context, anon_ops); + TEST(collapse_full, madvise_context, file_ops); + + TEST(collapse_empty, khugepaged_context, anon_ops); + TEST(collapse_empty, madvise_context, anon_ops); + + TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry, madvise_context, anon_ops); + TEST(collapse_single_pte_entry, madvise_context, file_ops); + + TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_none, khugepaged_context, file_ops); + TEST(collapse_max_ptes_none, madvise_context, anon_ops); + TEST(collapse_max_ptes_none, madvise_context, file_ops); + + TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, file_ops); + + TEST(collapse_full_of_compound, khugepaged_context, anon_ops); + TEST(collapse_full_of_compound, khugepaged_context, file_ops); + TEST(collapse_full_of_compound, madvise_context, anon_ops); + TEST(collapse_full_of_compound, madvise_context, file_ops); + + 
TEST(collapse_compound_extreme, khugepaged_context, anon_ops); + TEST(collapse_compound_extreme, madvise_context, anon_ops); + + TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops); + TEST(collapse_swapin_single_pte, madvise_context, anon_ops); + + TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_swap, madvise_context, anon_ops); + + TEST(collapse_fork, khugepaged_context, anon_ops); + TEST(collapse_fork, madvise_context, anon_ops); + + TEST(collapse_fork_compound, khugepaged_context, anon_ops); + TEST(collapse_fork_compound, madvise_context, anon_ops); + + TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_shared, madvise_context, anon_ops); + + TEST(madvise_collapse_existing_thps, madvise_context, anon_ops); + TEST(madvise_collapse_existing_thps, madvise_context, file_ops); restore_settings(0); } diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c index 9dae51b8219f1..f11f8adda5218 100644 --- a/tools/testing/selftests/vm/vm_util.c +++ b/tools/testing/selftests/vm/vm_util.c @@ -114,3 +114,13 @@ bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size) { return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size); } + +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size); +} + +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size); +} diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h index 8434ea0c95cd7..5c35de454e08f 100644 --- a/tools/testing/selftests/vm/vm_util.h +++ b/tools/testing/selftests/vm/vm_util.h @@ -8,3 +8,5 @@ void clear_softdirty(void); bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); uint64_t read_pmd_pagesize(void); bool check_huge_anon(void *addr, int nr_hpages, 
uint64_t hpage_size); +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); -- GitLab From d0d35b6010a8bcc12b986f51d29cf3a8635cdbb4 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:44 -0700 Subject: [PATCH 1137/2223] selftests/vm: add thp collapse shmem testing Add memory operations for shmem (memfd) memory, and reuse existing tests with the new memory operations. Shmem tests can be called with "shmem" mem_type, and shmem tests are run with "all" mem_type as well. Link: https://lkml.kernel.org/r/20220907144521.3115321-9-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-9-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 57 ++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 08de6141c2afe..eabbd2710096c 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -59,6 +59,7 @@ struct mem_ops { static struct mem_ops *file_ops; static struct mem_ops *anon_ops; +static struct mem_ops *shmem_ops; struct collapse_context { void (*collapse)(const char *msg, char *p, int nr_hpages, @@ -739,6 +740,40 @@ static bool file_check_huge(void *addr, int nr_hpages) } } +static void *shmem_setup_area(int nr_hpages) +{ + void *p; + unsigned long size = nr_hpages * hpage_pmd_size; + + finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0); + if (finfo.fd < 0) { + perror("memfd_create()"); + exit(EXIT_FAILURE);
+ } + if (ftruncate(finfo.fd, size)) { + perror("ftruncate()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, + 0); + if (p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + return p; +} + +static void shmem_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); +} + +static bool shmem_check_huge(void *addr, int nr_hpages) +{ + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); +} + static struct mem_ops __anon_ops = { .setup_area = &anon_setup_area, .cleanup_area = &anon_cleanup_area, @@ -755,6 +790,14 @@ static struct mem_ops __file_ops = { .name = "file", }; +static struct mem_ops __shmem_ops = { + .setup_area = &shmem_setup_area, + .cleanup_area = &shmem_cleanup_area, + .fault = &anon_fault, + .check_huge = &shmem_check_huge, + .name = "shmem", +}; + static void __madvise_collapse(const char *msg, char *p, int nr_hpages, struct mem_ops *ops, bool expect) { @@ -1315,7 +1358,7 @@ static void usage(void) fprintf(stderr, "\nUsage: ./khugepaged [dir]\n\n"); fprintf(stderr, "\t\t: :\n"); fprintf(stderr, "\t\t: [all|khugepaged|madvise]\n"); - fprintf(stderr, "\t\t: [all|anon|file]\n"); + fprintf(stderr, "\t\t: [all|anon|file|shmem]\n"); fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); @@ -1357,10 +1400,13 @@ static void parse_test_type(int argc, const char **argv) if (!strcmp(buf, "all")) { file_ops = &__file_ops; anon_ops = &__anon_ops; + shmem_ops = &__shmem_ops; } else if (!strcmp(buf, "anon")) { anon_ops = &__anon_ops; } else if (!strcmp(buf, "file")) { file_ops = &__file_ops; + } else if (!strcmp(buf, "shmem")) { + shmem_ops = &__shmem_ops; } else { usage(); } @@ -1377,7 +1423,7 @@ int main(int argc, const char **argv) struct settings default_settings = { .thp_enabled = THP_MADVISE, .thp_defrag = 
THP_DEFRAG_ALWAYS, - .shmem_enabled = SHMEM_NEVER, + .shmem_enabled = SHMEM_ADVISE, .use_zero_page = 0, .khugepaged = { .defrag = 1, @@ -1424,16 +1470,20 @@ int main(int argc, const char **argv) TEST(collapse_full, khugepaged_context, anon_ops); TEST(collapse_full, khugepaged_context, file_ops); + TEST(collapse_full, khugepaged_context, shmem_ops); TEST(collapse_full, madvise_context, anon_ops); TEST(collapse_full, madvise_context, file_ops); + TEST(collapse_full, madvise_context, shmem_ops); TEST(collapse_empty, khugepaged_context, anon_ops); TEST(collapse_empty, madvise_context, anon_ops); TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); TEST(collapse_single_pte_entry, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops); TEST(collapse_single_pte_entry, madvise_context, anon_ops); TEST(collapse_single_pte_entry, madvise_context, file_ops); + TEST(collapse_single_pte_entry, madvise_context, shmem_ops); TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); TEST(collapse_max_ptes_none, khugepaged_context, file_ops); @@ -1447,8 +1497,10 @@ int main(int argc, const char **argv) TEST(collapse_full_of_compound, khugepaged_context, anon_ops); TEST(collapse_full_of_compound, khugepaged_context, file_ops); + TEST(collapse_full_of_compound, khugepaged_context, shmem_ops); TEST(collapse_full_of_compound, madvise_context, anon_ops); TEST(collapse_full_of_compound, madvise_context, file_ops); + TEST(collapse_full_of_compound, madvise_context, shmem_ops); TEST(collapse_compound_extreme, khugepaged_context, anon_ops); TEST(collapse_compound_extreme, madvise_context, anon_ops); @@ -1470,6 +1522,7 @@ int main(int argc, const char **argv) TEST(madvise_collapse_existing_thps, madvise_context, anon_ops); TEST(madvise_collapse_existing_thps, madvise_context, file_ops); + TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); restore_settings(0); } -- GitLab From 69d9428ce97f28eb1ba8acee552cf46014663d2b Mon Sep 
17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:45 -0700 Subject: [PATCH 1138/2223] selftests/vm: add file/shmem MADV_COLLAPSE selftest for cleared pmd This test tests that MADV_COLLAPSE acting on file/shmem memory for which (1) the file extent mapping by the memory is already a huge page in the page cache, and (2) the pmd mapping this memory in the target process is none. In practice, (1)+(2) is the state left over after khugepaged has successfully collapsed file/shmem memory for a target VMA, but the memory has not yet been refaulted. So, this test in-effect tests MADV_COLLAPSE racing with khugepaged to collapse the memory first. Link: https://lkml.kernel.org/r/20220907144521.3115321-10-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-10-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 30 +++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index eabbd2710096c..64126c8cd5612 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -1353,6 +1353,33 @@ static void madvise_collapse_existing_thps(struct collapse_context *c, ops->cleanup_area(p, hpage_pmd_size); } +/* + * Test race with khugepaged where page tables have been retracted and + * pmd cleared. 
+ */ +static void madvise_retracted_page_tables(struct collapse_context *c, + struct mem_ops *ops) +{ + void *p; + int nr_hpages = 1; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); + + /* Let khugepaged collapse and leave pmd cleared */ + if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages, + ops)) { + fail("Timeout"); + return; + } + success("OK"); + c->collapse("Install huge PMD from page cache", p, nr_hpages, ops, + true); + validate_memory(p, 0, size); + ops->cleanup_area(p, size); +} + static void usage(void) { fprintf(stderr, "\nUsage: ./khugepaged [dir]\n\n"); @@ -1524,5 +1551,8 @@ int main(int argc, const char **argv) TEST(madvise_collapse_existing_thps, madvise_context, file_ops); TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); + TEST(madvise_retracted_page_tables, madvise_context, file_ops); + TEST(madvise_retracted_page_tables, madvise_context, shmem_ops); + restore_settings(0); } -- GitLab From 0f633baac0f1716200bbccc6430b6006d103d7b9 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:46 -0700 Subject: [PATCH 1139/2223] selftests/vm: add selftest for MADV_COLLAPSE of uffd-minor memory Add :collapse mod to userfaultfd selftest. Currently this mod is only valid for "shmem" test type, but could be used for other test types. When provided, memory allocated by ->allocate_area() will be hugepage-aligned and enforced to be hugepage-sized. userfaultfd_minor_test, after the UFFD-registered mapping has been populated by the UFFD minor fault handler, attempts to MADV_COLLAPSE the UFFD-registered mapping to collapse the memory into a pmd-mapped THP.
This test is meant to be a functional test of what occurs during UFFD-driven live migration of VMs backed by huge tmpfs where, after a hugepage-sized region has been successfully migrated (in native page-sized chunks, to avoid the latency of fetching a hugepage over the network), we want to reclaim previous VM performance by remapping it at the PMD level. Link: https://lkml.kernel.org/r/20220907144521.3115321-11-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-11-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/userfaultfd.c | 171 ++++++++++++++++++----- 2 files changed, 134 insertions(+), 38 deletions(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index c9c0996c122b4..c687533374e6b 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -99,6 +99,7 @@ $(OUTPUT)/khugepaged: vm_util.c $(OUTPUT)/madv_populate: vm_util.c $(OUTPUT)/soft-dirty: vm_util.c $(OUTPUT)/split_huge_page_test: vm_util.c +$(OUTPUT)/userfaultfd: vm_util.c ifeq ($(MACHINE),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 7be709d9eed07..74babdbc02e56 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -61,10 +61,11 @@ #include #include "../kselftest.h" +#include "vm_util.h" #ifdef __NR_userfaultfd -static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; +static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size,
hpage_size; #define BOUNCE_RANDOM (1<<0) #define BOUNCE_RACINGFAULTS (1<<1) @@ -79,6 +80,8 @@ static int test_type; #define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) +#define BASE_PMD_ADDR ((void *)(1UL << 30)) + /* test using /dev/userfaultfd, instead of userfaultfd(2) */ static bool test_dev_userfaultfd; @@ -97,9 +100,10 @@ static int huge_fd; static unsigned long long *count_verify; static int uffd = -1; static int uffd_flags, finished, *pipefd; -static char *area_src, *area_src_alias, *area_dst, *area_dst_alias; +static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; static char *zeropage; pthread_attr_t attr; +static bool test_collapse; /* Userfaultfd test statistics */ struct uffd_stats { @@ -127,6 +131,8 @@ struct uffd_stats { #define swap(a, b) \ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) +#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1))) + const char *examples = "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" "./userfaultfd anon 100 99999\n\n" @@ -152,6 +158,8 @@ static void usage(void) "Supported mods:\n"); fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); + fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n" + "memory\n"); fprintf(stderr, "\nExample test mod usage:\n"); fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); @@ -229,12 +237,10 @@ static void anon_release_pages(char *rel_area) err("madvise(MADV_DONTNEED) failed"); } -static void anon_allocate_area(void **alloc_area) +static void anon_allocate_area(void **alloc_area, bool is_src) { *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (*alloc_area == MAP_FAILED) - err("mmap of anonymous memory failed"); } static void noop_alias_mapping(__u64 *start, size_t 
len, unsigned long offset) @@ -252,7 +258,7 @@ static void hugetlb_release_pages(char *rel_area) } } -static void hugetlb_allocate_area(void **alloc_area) +static void hugetlb_allocate_area(void **alloc_area, bool is_src) { void *area_alias = NULL; char **alloc_area_alias; @@ -262,7 +268,7 @@ static void hugetlb_allocate_area(void **alloc_area) nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | - (*alloc_area == area_src ? 0 : MAP_NORESERVE), + (is_src ? 0 : MAP_NORESERVE), -1, 0); else @@ -270,9 +276,9 @@ static void hugetlb_allocate_area(void **alloc_area) nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_SHARED | - (*alloc_area == area_src ? 0 : MAP_NORESERVE), + (is_src ? 0 : MAP_NORESERVE), huge_fd, - *alloc_area == area_src ? 0 : nr_pages * page_size); + is_src ? 0 : nr_pages * page_size); if (*alloc_area == MAP_FAILED) err("mmap of hugetlbfs file failed"); @@ -282,12 +288,12 @@ static void hugetlb_allocate_area(void **alloc_area) PROT_READ | PROT_WRITE, MAP_SHARED, huge_fd, - *alloc_area == area_src ? 0 : nr_pages * page_size); + is_src ? 0 : nr_pages * page_size); if (area_alias == MAP_FAILED) err("mmap of hugetlb file alias failed"); } - if (*alloc_area == area_src) { + if (is_src) { alloc_area_alias = &area_src_alias; } else { alloc_area_alias = &area_dst_alias; @@ -310,21 +316,36 @@ static void shmem_release_pages(char *rel_area) err("madvise(MADV_REMOVE) failed"); } -static void shmem_allocate_area(void **alloc_area) +static void shmem_allocate_area(void **alloc_area, bool is_src) { void *area_alias = NULL; - bool is_src = alloc_area == (void **)&area_src; - unsigned long offset = is_src ? 0 : nr_pages * page_size; + size_t bytes = nr_pages * page_size; + unsigned long offset = is_src ? 
0 : bytes; + char *p = NULL, *p_alias = NULL; + + if (test_collapse) { + p = BASE_PMD_ADDR; + if (!is_src) + /* src map + alias + interleaved hpages */ + p += 2 * (bytes + hpage_size); + p_alias = p; + p_alias += bytes; + p_alias += hpage_size; /* Prevent src/dst VMA merge */ + } - *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_SHARED, shm_fd, offset); + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + shm_fd, offset); if (*alloc_area == MAP_FAILED) err("mmap of memfd failed"); + if (test_collapse && *alloc_area != p) + err("mmap of memfd failed at %p", p); - area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_SHARED, shm_fd, offset); + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + shm_fd, offset); if (area_alias == MAP_FAILED) err("mmap of memfd alias failed"); + if (test_collapse && area_alias != p_alias) + err("mmap of anonymous memory failed at %p", p_alias); if (is_src) area_src_alias = area_alias; @@ -337,28 +358,39 @@ static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) *start = (unsigned long)area_dst_alias + offset; } +static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) +{ + if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) + err("Did not find expected %d number of hugepages", + expect_nr_hpages); +} + struct uffd_test_ops { - void (*allocate_area)(void **alloc_area); + void (*allocate_area)(void **alloc_area, bool is_src); void (*release_pages)(char *rel_area); void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); + void (*check_pmd_mapping)(void *p, int expect_nr_hpages); }; static struct uffd_test_ops anon_uffd_test_ops = { .allocate_area = anon_allocate_area, .release_pages = anon_release_pages, .alias_mapping = noop_alias_mapping, + .check_pmd_mapping = NULL, }; static struct uffd_test_ops shmem_uffd_test_ops = { .allocate_area = shmem_allocate_area, .release_pages = 
shmem_release_pages, .alias_mapping = shmem_alias_mapping, + .check_pmd_mapping = shmem_check_pmd_mapping, }; static struct uffd_test_ops hugetlb_uffd_test_ops = { .allocate_area = hugetlb_allocate_area, .release_pages = hugetlb_release_pages, .alias_mapping = hugetlb_alias_mapping, + .check_pmd_mapping = NULL, }; static struct uffd_test_ops *uffd_test_ops; @@ -478,6 +510,7 @@ static void uffd_test_ctx_clear(void) munmap_area((void **)&area_src_alias); munmap_area((void **)&area_dst); munmap_area((void **)&area_dst_alias); + munmap_area((void **)&area_remap); } static void uffd_test_ctx_init(uint64_t features) @@ -486,8 +519,8 @@ static void uffd_test_ctx_init(uint64_t features) uffd_test_ctx_clear(); - uffd_test_ops->allocate_area((void **)&area_src); - uffd_test_ops->allocate_area((void **)&area_dst); + uffd_test_ops->allocate_area((void **)&area_src, true); + uffd_test_ops->allocate_area((void **)&area_dst, false); userfaultfd_open(&features); @@ -804,6 +837,7 @@ static void *uffd_poll_thread(void *arg) err("remove failure"); break; case UFFD_EVENT_REMAP: + area_remap = area_dst; /* save for later unmap */ area_dst = (char *)(unsigned long)msg.arg.remap.to; break; } @@ -1256,13 +1290,30 @@ static int userfaultfd_sig_test(void) return userfaults != 0; } +void check_memory_contents(char *p) +{ + unsigned long i; + uint8_t expected_byte; + void *expected_page; + + if (posix_memalign(&expected_page, page_size, page_size)) + err("out of memory"); + + for (i = 0; i < nr_pages; ++i) { + expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); + memset(expected_page, expected_byte, page_size); + if (my_bcmp(expected_page, p + (i * page_size), page_size)) + err("unexpected page contents after minor fault"); + } + + free(expected_page); +} + static int userfaultfd_minor_test(void) { - struct uffdio_register uffdio_register; unsigned long p; + struct uffdio_register uffdio_register; pthread_t uffd_mon; - uint8_t expected_byte; - void *expected_page; char c; struct uffd_stats stats 
= { 0 }; @@ -1301,17 +1352,7 @@ static int userfaultfd_minor_test(void) * fault. uffd_poll_thread will resolve the fault by bit-flipping the * page's contents, and then issuing a CONTINUE ioctl. */ - - if (posix_memalign(&expected_page, page_size, page_size)) - err("out of memory"); - - for (p = 0; p < nr_pages; ++p) { - expected_byte = ~((uint8_t)(p % ((uint8_t)-1))); - memset(expected_page, expected_byte, page_size); - if (my_bcmp(expected_page, area_dst_alias + (p * page_size), - page_size)) - err("unexpected page contents after minor fault"); - } + check_memory_contents(area_dst_alias); if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) err("pipe write"); @@ -1320,6 +1361,23 @@ static int userfaultfd_minor_test(void) uffd_stats_report(&stats, 1); + if (test_collapse) { + printf("testing collapse of uffd memory into PMD-mapped THPs:"); + if (madvise(area_dst_alias, nr_pages * page_size, + MADV_COLLAPSE)) + err("madvise(MADV_COLLAPSE)"); + + uffd_test_ops->check_pmd_mapping(area_dst, + nr_pages * page_size / + hpage_size); + /* + * This won't cause uffd-fault - it purely just makes sure there + * was no corruption. 
+ */ + check_memory_contents(area_dst_alias); + printf(" done.\n"); + } + return stats.missing_faults != 0 || stats.minor_faults != nr_pages; } @@ -1656,6 +1714,8 @@ static void parse_test_type_arg(const char *raw_type) test_dev_userfaultfd = true; else if (!strcmp(token, "syscall")) test_dev_userfaultfd = false; + else if (!strcmp(token, "collapse")) + test_collapse = true; else err("unrecognized test mod '%s'", token); } @@ -1663,8 +1723,11 @@ static void parse_test_type_arg(const char *raw_type) if (!test_type) err("failed to parse test type argument: '%s'", raw_type); + if (test_collapse && test_type != TEST_SHMEM) + err("Unsupported test: %s", raw_type); + if (test_type == TEST_HUGETLB) - page_size = default_huge_page_size(); + page_size = hpage_size; else page_size = sysconf(_SC_PAGE_SIZE); @@ -1702,6 +1765,8 @@ static void sigalrm(int sig) int main(int argc, char **argv) { + size_t bytes; + if (argc < 4) usage(); @@ -1709,11 +1774,41 @@ int main(int argc, char **argv) err("failed to arm SIGALRM"); alarm(ALARM_INTERVAL_SECS); + hpage_size = default_huge_page_size(); parse_test_type_arg(argv[1]); + bytes = atol(argv[2]) * 1024 * 1024; + + if (test_collapse && bytes & (hpage_size - 1)) + err("MiB must be multiple of %lu if :collapse mod set", + hpage_size >> 20); nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size / - nr_cpus; + + if (test_collapse) { + /* nr_cpus must divide (bytes / page_size), otherwise, + * area allocations of (nr_pages * paze_size) won't be a + * multiple of hpage_size, even if bytes is a multiple of + * hpage_size. + * + * This means that nr_cpus must divide (N * (2 << (H-P)) + * where: + * bytes = hpage_size * N + * hpage_size = 2 << H + * page_size = 2 << P + * + * And we want to chose nr_cpus to be the largest value + * satisfying this constraint, not larger than the number + * of online CPUs. 
Unfortunately, prime factorization of + * N and nr_cpus may be arbitrary, so have to search for it. + * Instead, just use the highest power of 2 dividing both + * nr_cpus and (bytes / page_size). + */ + int x = factor_of_2(nr_cpus); + int y = factor_of_2(bytes / page_size); + + nr_cpus = x < y ? x : y; + } + nr_pages_per_cpu = bytes / page_size / nr_cpus; if (!nr_pages_per_cpu) { _err("invalid MiB"); usage(); -- GitLab From 6b91e5dfb3c7ef485587e7ab494dcb47bcdadce3 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Thu, 22 Sep 2022 19:09:35 +0800 Subject: [PATCH 1140/2223] mm: remove unused inline functions from include/linux/mm_inline.h Remove the following unused inline functions from mm_inline.h: 1. All uses of add_page_to_lru_list_tail() have been removed since commit 7a3dbfe8a52b ("mm/swap: convert lru_deactivate_file to a folio_batch"), and it can be replaced by lruvec_add_folio_tail(). 2. All uses of __clear_page_lru_flags() have been removed since commit 188e8caee968 ("mm/swap: convert __page_cache_release() to use a folio"), and it can be replaced by __folio_clear_lru_flags(). They are useless, so remove them. Link: https://lkml.kernel.org/r/20220922110935.1495099-1-cuigaosheng1@huawei.com Signed-off-by: Gaosheng Cui Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 4949eda9a9a2a..e8ed225d8f7ca 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -76,11 +76,6 @@ static __always_inline void __folio_clear_lru_flags(struct folio *folio) __folio_clear_unevictable(folio); } -static __always_inline void __clear_page_lru_flags(struct page *page) -{ - __folio_clear_lru_flags(page_folio(page)); -} - /** * folio_lru_list - Which LRU list should a folio be on? * @folio: The folio to test. 
@@ -348,12 +343,6 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) list_add_tail(&folio->lru, &lruvec->lists[lru]); } -static __always_inline void add_page_to_lru_list_tail(struct page *page, - struct lruvec *lruvec) -{ - lruvec_add_folio_tail(lruvec, page_folio(page)); -} - static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { -- GitLab From 8346d69d8bcb6c526a0d8bd126241dff41a60723 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 22 Sep 2022 10:19:29 +0800 Subject: [PATCH 1141/2223] mm/hugetlb: add available_huge_pages() func In hugetlb.c there are several places which compare the values of 'h->free_huge_pages' and 'h->resv_huge_pages', it looks a bit messy, so add a new available_huge_pages() function to do these. Link: https://lkml.kernel.org/r/20220922021929.98961-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton --- mm/hugetlb.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3c1316ad54b5f..8de5a6b5a172a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1191,6 +1191,11 @@ retry_cpuset: return NULL; } +static unsigned long available_huge_pages(struct hstate *h) +{ + return h->free_huge_pages - h->resv_huge_pages; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve, @@ -1207,12 +1212,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, * have no page reserves. This check ensures that reservations are * not "stolen". 
The child may still get SIGKILLed */ - if (!vma_has_reserves(vma, chg) && - h->free_huge_pages - h->resv_huge_pages == 0) + if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) goto err; /* If reserves cannot be used, ensure enough pages are in the pool */ - if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) + if (avoid_reserve && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); @@ -2124,7 +2128,7 @@ retry: if (!page_count(page)) { struct page *head = compound_head(page); struct hstate *h = page_hstate(head); - if (h->free_huge_pages - h->resv_huge_pages == 0) + if (!available_huge_pages(h)) goto out; /* @@ -2311,7 +2315,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { spin_lock_irq(&hugetlb_lock); - if (h->free_huge_pages - h->resv_huge_pages > 0) { + if (available_huge_pages(h)) { struct page *page; page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); -- GitLab From f7c5b1aab5ef18b0eb4136a33fc2c78b54e3e777 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 20 Sep 2022 09:22:05 +0800 Subject: [PATCH 1142/2223] mm/secretmem: remove reduntant return value The return value @ret is always 0, so remove it and return 0 directly. 
Link: https://lkml.kernel.org/r/20220920012205.246217-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Signed-off-by: Andrew Morton --- mm/secretmem.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/secretmem.c b/mm/secretmem.c index 6a44efb673b2c..04c3ac9448a18 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -278,10 +278,8 @@ static struct file_system_type secretmem_fs = { static int __init secretmem_init(void) { - int ret = 0; - if (!secretmem_enable) - return ret; + return 0; secretmem_mnt = kern_mount(&secretmem_fs); if (IS_ERR(secretmem_mnt)) @@ -290,6 +288,6 @@ static int __init secretmem_init(void) /* prevent secretmem mappings from ever getting PROT_EXEC */ secretmem_mnt->mnt_flags |= MNT_NOEXEC; - return ret; + return 0; } fs_initcall(secretmem_init); -- GitLab From c91bdc9358992856721ff77887202a7e80b7ab22 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:01 -0400 Subject: [PATCH 1143/2223] mm: memcontrol: don't allocate cgroup swap arrays when memcg is disabled Patch series "memcg swap fix & cleanups". This patch (of 4): Since commit 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control"), the cgroup swap arrays are used to track memory ownership at the time of swap readahead and swapoff, even if swap space *accounting* has been turned off by the user via swapaccount=0 (which sets cgroup_memory_noswap). However, the patch was overzealous: by simply dropping the cgroup_memory_noswap conditionals in the swapon, swapoff and uncharge path, it caused the cgroup arrays being allocated even when the memory controller as a whole is disabled. This is a waste of that memory. Restore mem_cgroup_disabled() checks, implied previously by cgroup_memory_noswap, in the swapon, swapoff, and swap_entry_free callbacks. 
Link: https://lkml.kernel.org/r/20220926135704.400818-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20220926135704.400818-2-hannes@cmpxchg.org Fixes: 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control") Signed-off-by: Johannes Weiner Reported-by: Hugh Dickins Reviewed-by: Shakeel Butt Acked-by: Hugh Dickins Acked-by: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 3 +++ mm/swap_cgroup.c | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6b74bbdc26596..9e3c010ca676c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7459,6 +7459,9 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; + if (mem_cgroup_disabled()) + return; + id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index 5a9442979a185..db6c4a26cf593 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -170,6 +170,9 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) unsigned long length; struct swap_cgroup_ctrl *ctrl; + if (mem_cgroup_disabled()) + return 0; + length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); array = vcalloc(length, sizeof(void *)); @@ -204,6 +207,9 @@ void swap_cgroup_swapoff(int type) unsigned long i, length; struct swap_cgroup_ctrl *ctrl; + if (mem_cgroup_disabled()) + return; + mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; map = ctrl->map; -- GitLab From b25806dcd3d5248833f7d2544ee29a701735159f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:02 -0400 Subject: [PATCH 1144/2223] mm: memcontrol: deprecate swapaccounting=0 mode The swapaccounting= commandline option already does very little today. 
To close a trivial containment failure case, the swap ownership tracking part of the swap controller has recently become mandatory (see commit 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control") for details), which makes up the majority of the work during swapout, swapin, and the swap slot map. The only thing left under this flag is the page_counter operations and the visibility of the swap control files in the first place, which are rather meager savings. There also aren't many scenarios, if any, where controlling the memory of a cgroup while allowing it unlimited access to a global swap space is a workable resource isolation strategy. On the other hand, there have been several bugs and confusion around the many possible swap controller states (cgroup1 vs cgroup2 behavior, memory accounting without swap accounting, memcg runtime disabled). This puts the maintenance overhead of retaining the toggle above its practical benefits. Deprecate it. Link: https://lkml.kernel.org/r/20220926135704.400818-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Shakeel Butt Reviewed-by: Shakeel Butt Cc: Hugh Dickins Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- .../admin-guide/kernel-parameters.txt | 6 --- mm/memcontrol.c | 50 ++++--------------- 2 files changed, 10 insertions(+), 46 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3b95f65bafe27..99a13f2be2ef6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6036,12 +6036,6 @@ This parameter controls use of the Protected Execution Facility on pSeries. 
- swapaccount= [KNL] - Format: [0|1] - Enable accounting of swap in memory resource - controller if no parameter or 1 is given or disable - it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst) - swiotlb= [ARM,IA-64,PPC,MIPS,X86] Format: { [,] | force | noforce } -- Number of I/O TLB slabs diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9e3c010ca676c..4be1b48b96596 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -88,22 +88,6 @@ static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ static bool cgroup_memory_nokmem __ro_after_init; -/* Whether the swap controller is active */ -#ifdef CONFIG_MEMCG_SWAP -static bool cgroup_memory_noswap __initdata; - -static DEFINE_STATIC_KEY_FALSE(memcg_swap_enabled_key); -static inline bool memcg_swap_enabled(void) -{ - return static_branch_likely(&memcg_swap_enabled_key); -} -#else -static inline bool memcg_swap_enabled(void) -{ - return false; -} -#endif - #ifdef CONFIG_CGROUP_WRITEBACK static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif @@ -111,7 +95,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { - return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg_swap_enabled(); + return !cgroup_subsys_on_dfl(memory_cgrp_subsys); } #define THRESHOLDS_EVENTS_TARGET 128 @@ -7379,7 +7363,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); - if (memcg_swap_enabled() && memcg != swap_memcg) { + if (memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) page_counter_charge(&swap_memcg->memsw, nr_entries); page_counter_uncharge(&memcg->memsw, nr_entries); @@ -7431,7 +7415,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); - if (memcg_swap_enabled() && !mem_cgroup_is_root(memcg) && + if 
(!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); @@ -7466,7 +7450,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (memcg_swap_enabled() && !mem_cgroup_is_root(memcg)) { + if (!mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->swap, nr_pages); else @@ -7482,7 +7466,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { long nr_swap_pages = get_nr_swap_pages(); - if (!memcg_swap_enabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (mem_cgroup_disabled() || do_memsw_account()) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, @@ -7499,7 +7483,7 @@ bool mem_cgroup_swap_full(struct folio *folio) if (vm_swap_full()) return true; - if (!memcg_swap_enabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (do_memsw_account()) return false; memcg = folio_memcg(folio); @@ -7519,10 +7503,9 @@ bool mem_cgroup_swap_full(struct folio *folio) static int __init setup_swap_account(char *s) { - bool res; - - if (!kstrtobool(s, &res)) - cgroup_memory_noswap = !res; + pr_warn_once("The swapaccount= commandline option is deprecated. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); return 1; } __setup("swapaccount=", setup_swap_account); @@ -7791,24 +7774,11 @@ static struct cftype zswap_files[] = { }; #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */ -/* - * If mem_cgroup_swap_init() is implemented as a subsys_initcall() - * instead of a core_initcall(), this could mean cgroup_memory_noswap still - * remains set to false even when memcg is disabled via "cgroup_disable=memory" - * boot parameter. 
This may result in premature OOPS inside - * mem_cgroup_get_nr_swap_pages() function in corner cases. - */ static int __init mem_cgroup_swap_init(void) { - /* No memory control -> no swap control */ if (mem_cgroup_disabled()) - cgroup_memory_noswap = true; - - if (cgroup_memory_noswap) return 0; - static_branch_enable(&memcg_swap_enabled_key); - WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) @@ -7816,6 +7786,6 @@ static int __init mem_cgroup_swap_init(void) #endif return 0; } -core_initcall(mem_cgroup_swap_init); +subsys_initcall(mem_cgroup_swap_init); #endif /* CONFIG_MEMCG_SWAP */ -- GitLab From b94c4e949c36e0e363515822ade0d8305e9a6ef2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:03 -0400 Subject: [PATCH 1145/2223] mm: memcontrol: use do_memsw_account() in a few more places It's slightly more descriptive and consistent with other places that distinguish cgroup1's combined memory+swap accounting scheme from cgroup2's dedicated swap accounting. 
Link: https://lkml.kernel.org/r/20220926135704.400818-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Hugh Dickins Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4be1b48b96596..76bb0a18a2f3d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1667,17 +1667,17 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { unsigned long max = READ_ONCE(memcg->memory.max); - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) { - if (mem_cgroup_swappiness(memcg)) - max += min(READ_ONCE(memcg->swap.max), - (unsigned long)total_swap_pages); - } else { /* v1 */ + if (do_memsw_account()) { if (mem_cgroup_swappiness(memcg)) { /* Calculate swap excess capacity from memsw limit */ unsigned long swap = READ_ONCE(memcg->memsw.max) - max; max += min(swap, (unsigned long)total_swap_pages); } + } else { + if (mem_cgroup_swappiness(memcg)) + max += min(READ_ONCE(memcg->swap.max), + (unsigned long)total_swap_pages); } return max; } @@ -7334,7 +7334,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (mem_cgroup_disabled()) return; - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (!do_memsw_account()) return; memcg = folio_memcg(folio); @@ -7399,7 +7399,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) struct mem_cgroup *memcg; unsigned short oldid; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (do_memsw_account()) return 0; memcg = folio_memcg(folio); @@ -7451,10 +7451,10 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) memcg = mem_cgroup_from_id(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) { - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->swap, nr_pages); - else + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); + else + 
page_counter_uncharge(&memcg->swap, nr_pages); } mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); mem_cgroup_id_put_many(memcg, nr_pages); -- GitLab From e55b9f96860f6c6026cff97966a740576285e07b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:04 -0400 Subject: [PATCH 1146/2223] mm: memcontrol: drop dead CONFIG_MEMCG_SWAP config symbol Since 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control"), CONFIG_MEMCG_SWAP hasn't been a user-visible config option anymore, it just means CONFIG_MEMCG && CONFIG_SWAP. Update the sites accordingly and drop the symbol. [ While touching the docs, remove two references to CONFIG_MEMCG_KMEM, which hasn't been a user-visible symbol for over half a decade. ] Link: https://lkml.kernel.org/r/20220926135704.400818-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Hugh Dickins Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v1/memory.rst | 4 +--- arch/mips/configs/db1xxx_defconfig | 1 - arch/mips/configs/generic_defconfig | 1 - arch/powerpc/configs/powernv_defconfig | 1 - arch/powerpc/configs/pseries_defconfig | 1 - arch/sh/configs/sdk7786_defconfig | 1 - arch/sh/configs/urquell_defconfig | 1 - include/linux/swap.h | 2 +- include/linux/swap_cgroup.h | 4 ++-- init/Kconfig | 5 ----- mm/Makefile | 4 +++- mm/memcontrol.c | 6 +++--- tools/testing/selftests/cgroup/config | 1 - 13 files changed, 10 insertions(+), 22 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 2cc502a75ef64..5b86245450bdc 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -299,7 +299,7 @@ Per-node-per-memcgroup LRU (cgroup's private LRU) is guarded by lruvec->lru_lock; PG_lru bit of page->flags is cleared before isolating a page from its LRU under lruvec->lru_lock. 
-2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) +2.7 Kernel Memory Extension ----------------------------------------------- With the Kernel memory extension, the Memory Controller is able to limit @@ -386,8 +386,6 @@ U != 0, K >= U: a. Enable CONFIG_CGROUPS b. Enable CONFIG_MEMCG -c. Enable CONFIG_MEMCG_SWAP (to use swap extension) -d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) 3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) ------------------------------------------------------------------- diff --git a/arch/mips/configs/db1xxx_defconfig b/arch/mips/configs/db1xxx_defconfig index b8bd663009969..83cbdecb27e6a 100644 --- a/arch/mips/configs/db1xxx_defconfig +++ b/arch/mips/configs/db1xxx_defconfig @@ -9,7 +9,6 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=16 CONFIG_CGROUPS=y CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_BLK_CGROUP=y CONFIG_CGROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y diff --git a/arch/mips/configs/generic_defconfig b/arch/mips/configs/generic_defconfig index 714169e411cf0..48e4e251779b6 100644 --- a/arch/mips/configs/generic_defconfig +++ b/arch/mips/configs/generic_defconfig @@ -3,7 +3,6 @@ CONFIG_NO_HZ_IDLE=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_BLK_CGROUP=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index 49f49c2639350..4acca52634048 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -17,7 +17,6 @@ CONFIG_LOG_CPU_MAX_BUF_SHIFT=13 CONFIG_NUMA_BALANCING=y CONFIG_CGROUPS=y CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index b571d084c148b..fead14ebb1fce 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -16,7 +16,6 @@ CONFIG_LOG_CPU_MAX_BUF_SHIFT=13 
CONFIG_NUMA_BALANCING=y CONFIG_CGROUPS=y CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig index a8662b6927ec7..97b7356639ed8 100644 --- a/arch/sh/configs/sdk7786_defconfig +++ b/arch/sh/configs/sdk7786_defconfig @@ -16,7 +16,6 @@ CONFIG_CPUSETS=y # CONFIG_PROC_PID_CPUSET is not set CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_MEMCG=y -CONFIG_CGROUP_MEMCG_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_RT_GROUP_SCHED=y CONFIG_BLK_CGROUP=y diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig index cb2f56468fe02..be478f3148f2d 100644 --- a/arch/sh/configs/urquell_defconfig +++ b/arch/sh/configs/urquell_defconfig @@ -14,7 +14,6 @@ CONFIG_CPUSETS=y # CONFIG_PROC_PID_CPUSET is not set CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_MEMCG=y -CONFIG_CGROUP_MEMCG_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_RT_GROUP_SCHED=y CONFIG_BLK_DEV_INITRD=y diff --git a/include/linux/swap.h b/include/linux/swap.h index fc8d98660326f..a18cf4b7c724c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -666,7 +666,7 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) cgroup_throttle_swaprate(&folio->page, gfp); } -#ifdef CONFIG_MEMCG_SWAP +#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index a12dd1c3966c9..ae73a87775b3a 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -4,7 +4,7 @@ #include -#ifdef CONFIG_MEMCG_SWAP +#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new); @@ -40,6 +40,6 @@ static inline void swap_cgroup_swapoff(int 
type) return; } -#endif /* CONFIG_MEMCG_SWAP */ +#endif #endif /* __LINUX_SWAP_CGROUP_H */ diff --git a/init/Kconfig b/init/Kconfig index 532362fcfe31f..7d86cf6b3012d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -958,11 +958,6 @@ config MEMCG help Provides control over the memory footprint of tasks in a cgroup. -config MEMCG_SWAP - bool - depends on MEMCG && SWAP - default y - config MEMCG_KMEM bool depends on MEMCG && !SLOB diff --git a/mm/Makefile b/mm/Makefile index cc23b00525848..8e105e5b3e293 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -98,7 +98,9 @@ obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o -obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o +ifdef CONFIG_SWAP +obj-$(CONFIG_MEMCG) += swap_cgroup.o +endif obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_GUP_TEST) += gup_test.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 76bb0a18a2f3d..61e05fc281fb9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3423,7 +3423,7 @@ void split_page_memcg(struct page *head, unsigned int nr) css_get_many(&memcg->css, nr - 1); } -#ifdef CONFIG_MEMCG_SWAP +#ifdef CONFIG_SWAP /** * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 
* @entry: swap entry to be moved @@ -7296,7 +7296,7 @@ static int __init mem_cgroup_init(void) } subsys_initcall(mem_cgroup_init); -#ifdef CONFIG_MEMCG_SWAP +#ifdef CONFIG_SWAP static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) { while (!refcount_inc_not_zero(&memcg->id.ref)) { @@ -7788,4 +7788,4 @@ static int __init mem_cgroup_swap_init(void) } subsys_initcall(mem_cgroup_swap_init); -#endif /* CONFIG_MEMCG_SWAP */ +#endif /* CONFIG_SWAP */ diff --git a/tools/testing/selftests/cgroup/config b/tools/testing/selftests/cgroup/config index 84fe884fad867..97d549ee894fa 100644 --- a/tools/testing/selftests/cgroup/config +++ b/tools/testing/selftests/cgroup/config @@ -4,5 +4,4 @@ CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_MEMCG=y CONFIG_MEMCG_KMEM=y -CONFIG_MEMCG_SWAP=y CONFIG_PAGE_COUNTER=y -- GitLab From 6e4a53ee7989c8a2b9fc3b14cd90f6e2d613ca76 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 3 Sep 2022 00:59:36 +0100 Subject: [PATCH 1147/2223] ocfs2: replace zero-length arrays with DECLARE_FLEX_ARRAY() helper Zero-length arrays are deprecated and we are moving towards adopting C99 flexible-array members, instead. So, replace zero-length array declarations in a couple of structures and unions with the new DECLARE_FLEX_ARRAY() helper macro. This helper allows for a flexible-array member in a union and as only member in a structure. Also, this addresses multiple warnings reported when building with Clang-15 and -Wzero-length-array. 
Lastly, this will also help memcpy (in a coming hardening update) execute proper bounds-checking on variable length object i_symlink at fs/ocfs2/namei.c:1973: fs/ocfs2/namei.c: 1973 memcpy((char *) fe->id2.i_symlink, symname, l); Link: https://github.com/KSPP/linux/issues/21 Link: https://github.com/KSPP/linux/issues/193 Link: https://github.com/KSPP/linux/issues/197 Link: https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html Link: https://lkml.kernel.org/r/YxKY6O2hmdwNh8r8@work Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/ocfs2_fs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 638d875eccc7d..7aebdbf5cc0a5 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -527,7 +527,7 @@ struct ocfs2_extent_block * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty. */ struct ocfs2_slot_map { -/*00*/ __le16 sm_slots[0]; +/*00*/ DECLARE_FLEX_ARRAY(__le16, sm_slots); /* * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize. @@ -548,7 +548,7 @@ struct ocfs2_extended_slot { * i_size. */ struct ocfs2_slot_map_extended { -/*00*/ struct ocfs2_extended_slot se_slots[0]; +/*00*/ DECLARE_FLEX_ARRAY(struct ocfs2_extended_slot, se_slots); /* * Actual size is i_size of the slot_map system file. 
It should * match s_max_slots * sizeof(struct ocfs2_extended_slot) @@ -727,7 +727,7 @@ struct ocfs2_dinode { struct ocfs2_extent_list i_list; struct ocfs2_truncate_log i_dealloc; struct ocfs2_inline_data i_data; - __u8 i_symlink[0]; + DECLARE_FLEX_ARRAY(__u8, i_symlink); } id2; /* Actual on-disk size is one block */ }; @@ -892,7 +892,7 @@ struct ocfs2_group_desc /*30*/ struct ocfs2_block_check bg_check; /* Error checking */ __le64 bg_reserved2; /*40*/ union { - __u8 bg_bitmap[0]; + DECLARE_FLEX_ARRAY(__u8, bg_bitmap); struct { /* * Block groups may be discontiguous when -- GitLab From 1c320cfa17701cbf98085b34ad6159e9f41e5268 Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Mon, 5 Sep 2022 14:16:56 +0800 Subject: [PATCH 1148/2223] fs/ocfs2/suballoc.h: fix spelling typo in comment Fix spelling typo in comment. Link: https://lkml.kernel.org/r/20220905061656.1829179-1-13667453960@163.com Signed-off-by: Jiangshan Yi Reported-by: k2ci Acked-by: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/suballoc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 5805a03d100ba..9c74eace3adc1 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -106,7 +106,7 @@ int ocfs2_claim_clusters(handle_t *handle, u32 *cluster_start, u32 *num_clusters); /* - * Use this variant of ocfs2_claim_clusters to specify a maxiumum + * Use this variant of ocfs2_claim_clusters to specify a maximum * number of clusters smaller than the allocation reserved. */ int __ocfs2_claim_clusters(handle_t *handle, -- GitLab From 8f824b4abd31c5ea32ae1d6725c47bdb247d18da Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Mon, 5 Sep 2022 10:10:34 +0800 Subject: [PATCH 1149/2223] init.h: fix spelling typo in comment Fix spelling typo in comment. 
Link: https://lkml.kernel.org/r/20220905021034.947701-1-13667453960@163.com Signed-off-by: Jiangshan Yi Reported-by: k2ci Signed-off-by: Andrew Morton --- include/linux/init.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/init.h b/include/linux/init.h index baf0b29a7010a..3254766ebbf23 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -134,7 +134,7 @@ static inline initcall_t initcall_from_entry(initcall_entry_t *entry) extern initcall_entry_t __con_initcall_start[], __con_initcall_end[]; -/* Used for contructor calls. */ +/* Used for constructor calls. */ typedef void (*ctor_fn_t)(void); struct file_system_type; -- GitLab From 5758478a3d3c42a78ee9ddc4b08db3e968a68058 Mon Sep 17 00:00:00 2001 From: Jingyu Wang Date: Fri, 9 Sep 2022 02:54:52 +0800 Subject: [PATCH 1150/2223] ipc: mqueue: remove unnecessary conditionals iput() already handles null and non-null parameters, so there is no need to use if(). Link: https://lkml.kernel.org/r/20220908185452.76590-1-jingyuwang_vip@163.com Signed-off-by: Jingyu Wang Acked-by: Roman Gushchin Signed-off-by: Andrew Morton --- ipc/mqueue.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index f98de32aeea17..9834104a5a31d 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -986,8 +986,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) out_unlock: inode_unlock(d_inode(mnt->mnt_root)); - if (inode) - iput(inode); + iput(inode); mnt_drop_write(mnt); out_name: putname(name); -- GitLab From 1179083ff07698b870da30a5aad34d44ed5dae10 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Fri, 9 Sep 2022 17:07:55 -0300 Subject: [PATCH 1151/2223] firmware: google: test spinlock on panic path to avoid lockups Currently the gsmi driver registers a panic notifier as well as reboot and die notifiers. 
The callbacks registered are called in an atomic and very limited context - for instance, panic disables preemption and local IRQs, and all secondary CPUs (not executing the panic path) are shut down. With that said, taking a spinlock in this scenario is a dangerous invitation for lockup scenarios. So, fix that by checking if the spinlock is free to acquire in the panic notifier callback - if not, bail out and avoid a potential hang. Link: https://lkml.kernel.org/r/20220909200755.189679-1-gpiccoli@igalia.com Fixes: 74c5b31c6618 ("driver: Google EFI SMI") Signed-off-by: Guilherme G. Piccoli Reviewed-by: Evan Green Cc: Ard Biesheuvel Cc: David Gow Cc: Greg Kroah-Hartman Cc: Julius Werner Cc: Petr Mladek Signed-off-by: Andrew Morton --- drivers/firmware/google/gsmi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/firmware/google/gsmi.c b/drivers/firmware/google/gsmi.c index adaa492c3d2df..4e2575dfeb908 100644 --- a/drivers/firmware/google/gsmi.c +++ b/drivers/firmware/google/gsmi.c @@ -681,6 +681,15 @@ static struct notifier_block gsmi_die_notifier = { static int gsmi_panic_callback(struct notifier_block *nb, unsigned long reason, void *arg) { + + /* + * Panic callbacks are executed with all other CPUs stopped, + * so we must not attempt to spin waiting for gsmi_dev.lock + * to be released. + */ + if (spin_is_locked(&gsmi_dev.lock)) + return NOTIFY_DONE; + gsmi_shutdown_reason(GSMI_SHUTDOWN_PANIC); return NOTIFY_DONE; } -- GitLab From 5ca14835dc429c09fefb290f60343fe266382760 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 9 Sep 2022 13:57:41 -0700 Subject: [PATCH 1152/2223] fs: uninline inode_maybe_inc_iversion() It has many callsites and is large.
text data bss dec hex filename 91796 15984 512 108292 1a704 mm/shmem.o-before 91180 15984 512 107676 1a49c mm/shmem.o-after Acked-by: Jeff Layton Cc: Chuck Lever Cc: Alexander Viro Cc: Hugh Dickins Signed-off-by: Andrew Morton --- fs/libfs.c | 46 ++++++++++++++++++++++++++++++++++++++++ include/linux/iversion.h | 46 +--------------------------------------- 2 files changed, 47 insertions(+), 45 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 31b0ddf01c31d..682d56345a1cf 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include /* sync_mapping_buffers */ #include @@ -1520,3 +1521,48 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) #endif } EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); + +/** + * inode_maybe_inc_iversion - increments i_version + * @inode: inode with the i_version that should be updated + * @force: increment the counter even if it's not necessary? + * + * Every time the inode is modified, the i_version field must be seen to have + * changed by any observer. + * + * If "force" is set or the QUERIED flag is set, then ensure that we increment + * the value, and clear the queried flag. + * + * In the common case where neither is set, then we can return "false" without + * updating i_version. + * + * If this function returns false, and no other metadata has changed, then we + * can avoid logging the metadata. + */ +bool inode_maybe_inc_iversion(struct inode *inode, bool force) +{ + u64 cur, new; + + /* + * The i_version field is not strictly ordered with any other inode + * information, but the legacy inode_inc_iversion code used a spinlock + * to serialize increments. + * + * Here, we add full memory barriers to ensure that any de-facto + * ordering with other info is preserved. 
+ * + * This barrier pairs with the barrier in inode_query_iversion() + */ + smp_mb(); + cur = inode_peek_iversion_raw(inode); + do { + /* If flag is clear then we needn't do anything */ + if (!force && !(cur & I_VERSION_QUERIED)) + return false; + + /* Since lowest bit is flag, add 2 to avoid it */ + new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); + return true; +} +EXPORT_SYMBOL(inode_maybe_inc_iversion); diff --git a/include/linux/iversion.h b/include/linux/iversion.h index eb5a158101693..e27bd4f55d840 100644 --- a/include/linux/iversion.h +++ b/include/linux/iversion.h @@ -172,51 +172,7 @@ inode_set_iversion_queried(struct inode *inode, u64 val) I_VERSION_QUERIED); } -/** - * inode_maybe_inc_iversion - increments i_version - * @inode: inode with the i_version that should be updated - * @force: increment the counter even if it's not necessary? - * - * Every time the inode is modified, the i_version field must be seen to have - * changed by any observer. - * - * If "force" is set or the QUERIED flag is set, then ensure that we increment - * the value, and clear the queried flag. - * - * In the common case where neither is set, then we can return "false" without - * updating i_version. - * - * If this function returns false, and no other metadata has changed, then we - * can avoid logging the metadata. - */ -static inline bool -inode_maybe_inc_iversion(struct inode *inode, bool force) -{ - u64 cur, new; - - /* - * The i_version field is not strictly ordered with any other inode - * information, but the legacy inode_inc_iversion code used a spinlock - * to serialize increments. - * - * Here, we add full memory barriers to ensure that any de-facto - * ordering with other info is preserved. 
- * - * This barrier pairs with the barrier in inode_query_iversion() - */ - smp_mb(); - cur = inode_peek_iversion_raw(inode); - do { - /* If flag is clear then we needn't do anything */ - if (!force && !(cur & I_VERSION_QUERIED)) - return false; - - /* Since lowest bit is flag, add 2 to avoid it */ - new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; - } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); - return true; -} - +bool inode_maybe_inc_iversion(struct inode *inode, bool force); /** * inode_inc_iversion - forcibly increment i_version -- GitLab From 7ec354baa2ad6dcf1b481a5a582293cec0eb2a67 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 9 Sep 2022 14:25:29 +0200 Subject: [PATCH 1153/2223] proc: make config PROC_CHILDREN depend on PROC_FS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 2e13ba54a268 ("fs, proc: introduce CONFIG_PROC_CHILDREN") introduces the config PROC_CHILDREN to configure kernels to provide the /proc/PID/task/TID/children file. When one deselects PROC_FS for kernel builds without /proc/, the config PROC_CHILDREN has no effect anymore, but is still visible in menuconfig. Add the dependency on PROC_FS to make the PROC_CHILDREN option disappear for kernel builds without /proc/. Link: https://lkml.kernel.org/r/20220909122529.1941-1-lukas.bulwahn@gmail.com Fixes: 2e13ba54a268 ("fs, proc: introduce CONFIG_PROC_CHILDREN") Signed-off-by: Lukas Bulwahn Cc: Iago López Galeiras Cc: Alexey Dobriyan Signed-off-by: Andrew Morton --- fs/proc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index c930001056f95..32b1116ae137c 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -92,6 +92,7 @@ config PROC_PAGE_MONITOR config PROC_CHILDREN bool "Include /proc//task//children file" + depends on PROC_FS default n help Provides a fast way to retrieve first level children pids of a task.
See -- GitLab From 83d87a4ddb3b4a42bb73b314b3d1acc3965a689f Mon Sep 17 00:00:00 2001 From: wuchi Date: Fri, 9 Sep 2022 18:10:25 +0800 Subject: [PATCH 1154/2223] relay: use kvcalloc to alloc page array in relay_alloc_page_array kvcalloc() is safer because it checks for integer overflow, and using it simplifies the allocation-size logic. Link: https://lkml.kernel.org/r/20220909101025.82955-1-wuchi.zero@gmail.com Signed-off-by: wuchi Cc: Christoph Hellwig Cc: Jens Axboe Signed-off-by: Andrew Morton --- kernel/relay.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/relay.c b/kernel/relay.c index 6a611e779e958..d7edc934c56d5 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -60,10 +60,7 @@ static const struct vm_operations_struct relay_file_mmap_ops = { */ static struct page **relay_alloc_page_array(unsigned int n_pages) { - const size_t pa_size = n_pages * sizeof(struct page *); - if (pa_size > PAGE_SIZE) - return vzalloc(pa_size); - return kzalloc(pa_size, GFP_KERNEL); + return kvcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); } /* -- GitLab From e77999c1d4d26df7a3fe83627d05898052437269 Mon Sep 17 00:00:00 2001 From: wangjianli Date: Thu, 8 Sep 2022 21:00:36 +0800 Subject: [PATCH 1155/2223] fs/ocfs2: fix repeated words in comments Delete the redundant word 'to'. Link: https://lkml.kernel.org/r/20220908130036.31149-1-wangjianli@cdjrlc.com Signed-off-by: wangjianli Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/refcounttree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 1358981e80a36..623db358b1efa 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2614,7 +2614,7 @@ static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, } /* - * Calculate out the start and number of virtual clusters we need to to CoW.
+ * Calculate out the start and number of virtual clusters we need to CoW. * * cpos is vitual start cluster position we want to do CoW in a * file and write_len is the cluster length. -- GitLab From 5d0ce3595ab75330a15cec914096efbbb8b41e4a Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Wed, 14 Sep 2022 03:25:37 +0800 Subject: [PATCH 1156/2223] percpu: add percpu_counter_add_local and percpu_counter_sub_local Patch series "/msg: mitigate the lock contention in ipc/msg", v6. Here are two patches to mitigate the lock contention in ipc/msg. The 1st patch is to add the new interface percpu_counter_add_local and percpu_counter_sub_local. The batch size in percpu_counter_add_batch should be very large in heavy writing and rare reading case. Add the "_local" version, and mostly it will do local adding, reduce the global updating and mitigate lock contention in writing. The 2nd patch is to use percpu_counter instead of atomic update in ipc/msg. The msg_bytes and msg_hdrs atomic counters are frequently updated when IPC msg queue is in heavy use, causing heavy cache bounce and overhead. Changing them to percpu_counter greatly improves the performance. Since there is one percpu struct per namespace, additional memory cost is minimal. The count is read only in the msgctl call, which is infrequent. So the need to sum up the counts in each CPU is infrequent. This patch (of 2): The batch size in percpu_counter_add_batch should be very large in heavy writing and rare reading case. Add the "_local" version, and mostly it will do local adding, reduce the global updating and mitigate lock contention in writing. Link: https://lkml.kernel.org/r/20220913192538.3023708-1-jiebin.sun@intel.com Link: https://lkml.kernel.org/r/20220913192538.3023708-2-jiebin.sun@intel.com Signed-off-by: Jiebin Sun Reviewed-by: Tim Chen Cc: Alexander Mikhalitsyn Cc: Alexey Gladkov Cc: Christoph Lameter Cc: Dennis Zhou Cc: "Eric W .
Biederman" Cc: Manfred Spraul Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Cc: Davidlohr Bueso Signed-off-by: Andrew Morton --- include/linux/percpu_counter.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 01861eebed79d..8ed5fba6d156f 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -15,6 +15,9 @@ #include #include +/* percpu_counter batch for local add or sub */ +#define PERCPU_COUNTER_LOCAL_BATCH INT_MAX + #ifdef CONFIG_SMP struct percpu_counter { @@ -56,6 +59,22 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) percpu_counter_add_batch(fbc, amount, percpu_counter_batch); } +/* + * With percpu_counter_add_local() and percpu_counter_sub_local(), counts + * are accumulated in local per cpu counter and not in fbc->count until + * local count overflows PERCPU_COUNTER_LOCAL_BATCH. This makes counter + * write efficient. + * But percpu_counter_sum(), instead of percpu_counter_read(), needs to be + * used to add up the counts from each CPU to account for all the local + * counts. So percpu_counter_add_local() and percpu_counter_sub_local() + * should be used when a counter is updated frequently and read rarely. 
+ */ +static inline void +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH); +} + static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { s64 ret = __percpu_counter_sum(fbc); @@ -138,6 +157,13 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount) preempt_enable(); } +/* non-SMP percpu_counter_add_local is the same with percpu_counter_add */ +static inline void +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add(fbc, amount); +} + static inline void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { @@ -193,4 +219,10 @@ static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount) percpu_counter_add(fbc, -amount); } +static inline void +percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add_local(fbc, -amount); +} + #endif /* _LINUX_PERCPU_COUNTER_H */ -- GitLab From 72d1e611082eda18689106a0c192f2827072713c Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Wed, 14 Sep 2022 03:25:38 +0800 Subject: [PATCH 1157/2223] ipc/msg: mitigate the lock contention with percpu counter The msg_bytes and msg_hdrs atomic counters are frequently updated when IPC msg queue is in heavy use, causing heavy cache bounce and overhead. Change them to percpu_counter greatly improve the performance. Since there is one percpu struct per namespace, additional memory cost is minimal. Reading of the count done in msgctl call, which is infrequent. So the need to sum up the counts in each CPU is infrequent. Apply the patch and test the pts/stress-ng-1.4.0 -- system v message passing (160 threads). 
Score gain: 3.99x CPU: ICX 8380 x 2 sockets Core number: 40 x 2 physical cores Benchmark: pts/stress-ng-1.4.0 -- system v message passing (160 threads) [akpm@linux-foundation.org: coding-style cleanups] [jiebin.sun@intel.com: avoid negative value by overflow in msginfo] Link: https://lkml.kernel.org/r/20220920150809.4014944-1-jiebin.sun@intel.com [akpm@linux-foundation.org: fix min() warnings] Link: https://lkml.kernel.org/r/20220913192538.3023708-3-jiebin.sun@intel.com Signed-off-by: Jiebin Sun Reviewed-by: Tim Chen Cc: Alexander Mikhalitsyn Cc: Alexey Gladkov Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Dennis Zhou Cc: "Eric W . Biederman" Cc: Manfred Spraul Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Signed-off-by: Andrew Morton --- include/linux/ipc_namespace.h | 5 ++-- ipc/msg.c | 48 +++++++++++++++++++++++++---------- ipc/namespace.c | 5 +++- ipc/util.h | 4 +-- 4 files changed, 43 insertions(+), 19 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e3e8c8662b490..e8240cf2611ad 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -11,6 +11,7 @@ #include #include #include +#include struct user_namespace; @@ -36,8 +37,8 @@ struct ipc_namespace { unsigned int msg_ctlmax; unsigned int msg_ctlmnb; unsigned int msg_ctlmni; - atomic_t msg_bytes; - atomic_t msg_hdrs; + struct percpu_counter percpu_msg_bytes; + struct percpu_counter percpu_msg_hdrs; size_t shm_ctlmax; size_t shm_ctlall; diff --git a/ipc/msg.c b/ipc/msg.c index a0d05775af2c5..e4e0990e08f75 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -285,10 +286,10 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) rcu_read_unlock(); list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { - atomic_dec(&ns->msg_hdrs); + percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); free_msg(msg); } - atomic_sub(msq->q_cbytes, &ns->msg_bytes); + 
percpu_counter_sub_local(&ns->percpu_msg_bytes, msq->q_cbytes); ipc_update_pid(&msq->q_lspid, NULL); ipc_update_pid(&msq->q_lrpid, NULL); ipc_rcu_putref(&msq->q_perm, msg_rcu_free); @@ -495,17 +496,22 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid, msginfo->msgssz = MSGSSZ; msginfo->msgseg = MSGSEG; down_read(&msg_ids(ns).rwsem); - if (cmd == MSG_INFO) { + if (cmd == MSG_INFO) msginfo->msgpool = msg_ids(ns).in_use; - msginfo->msgmap = atomic_read(&ns->msg_hdrs); - msginfo->msgtql = atomic_read(&ns->msg_bytes); + max_idx = ipc_get_maxidx(&msg_ids(ns)); + up_read(&msg_ids(ns).rwsem); + if (cmd == MSG_INFO) { + msginfo->msgmap = min_t(int, + percpu_counter_sum(&ns->percpu_msg_hdrs), + INT_MAX); + msginfo->msgtql = min_t(int, + percpu_counter_sum(&ns->percpu_msg_bytes), + INT_MAX); } else { msginfo->msgmap = MSGMAP; msginfo->msgpool = MSGPOOL; msginfo->msgtql = MSGTQL; } - max_idx = ipc_get_maxidx(&msg_ids(ns)); - up_read(&msg_ids(ns).rwsem); return (max_idx < 0) ? 0 : max_idx; } @@ -935,8 +941,8 @@ static long do_msgsnd(int msqid, long mtype, void __user *mtext, list_add_tail(&msg->m_list, &msq->q_messages); msq->q_cbytes += msgsz; msq->q_qnum++; - atomic_add(msgsz, &ns->msg_bytes); - atomic_inc(&ns->msg_hdrs); + percpu_counter_add_local(&ns->percpu_msg_bytes, msgsz); + percpu_counter_add_local(&ns->percpu_msg_hdrs, 1); } err = 0; @@ -1159,8 +1165,8 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in msq->q_rtime = ktime_get_real_seconds(); ipc_update_pid(&msq->q_lrpid, task_tgid(current)); msq->q_cbytes -= msg->m_ts; - atomic_sub(msg->m_ts, &ns->msg_bytes); - atomic_dec(&ns->msg_hdrs); + percpu_counter_sub_local(&ns->percpu_msg_bytes, msg->m_ts); + percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); ss_wakeup(msq, &wake_q, false); goto out_unlock0; @@ -1297,20 +1303,34 @@ COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp, } #endif -void msg_init_ns(struct ipc_namespace *ns) +int msg_init_ns(struct 
ipc_namespace *ns) { + int ret; + ns->msg_ctlmax = MSGMAX; ns->msg_ctlmnb = MSGMNB; ns->msg_ctlmni = MSGMNI; - atomic_set(&ns->msg_bytes, 0); - atomic_set(&ns->msg_hdrs, 0); + ret = percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL); + if (ret) + goto fail_msg_bytes; + ret = percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL); + if (ret) + goto fail_msg_hdrs; ipc_init_ids(&ns->ids[IPC_MSG_IDS]); + return 0; + +fail_msg_hdrs: + percpu_counter_destroy(&ns->percpu_msg_bytes); +fail_msg_bytes: + return ret; } #ifdef CONFIG_IPC_NS void msg_exit_ns(struct ipc_namespace *ns) { + percpu_counter_destroy(&ns->percpu_msg_bytes); + percpu_counter_destroy(&ns->percpu_msg_hdrs); free_ipcs(ns, &msg_ids(ns), freeque); idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht); diff --git a/ipc/namespace.c b/ipc/namespace.c index e1fcaedba4fae..8316ea5857333 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -66,8 +66,11 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (!setup_ipc_sysctls(ns)) goto fail_mq; + err = msg_init_ns(ns); + if (err) + goto fail_put; + sem_init_ns(ns); - msg_init_ns(ns); shm_init_ns(ns); return ns; diff --git a/ipc/util.h b/ipc/util.h index 2dd7ce0416d8e..b2906e3665394 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -64,7 +64,7 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) { } #ifdef CONFIG_SYSVIPC void sem_init_ns(struct ipc_namespace *ns); -void msg_init_ns(struct ipc_namespace *ns); +int msg_init_ns(struct ipc_namespace *ns); void shm_init_ns(struct ipc_namespace *ns); void sem_exit_ns(struct ipc_namespace *ns); @@ -72,7 +72,7 @@ void msg_exit_ns(struct ipc_namespace *ns); void shm_exit_ns(struct ipc_namespace *ns); #else static inline void sem_init_ns(struct ipc_namespace *ns) { } -static inline void msg_init_ns(struct ipc_namespace *ns) { } +static inline int msg_init_ns(struct ipc_namespace *ns) { return 0; } static inline void shm_init_ns(struct ipc_namespace 
*ns) { } static inline void sem_exit_ns(struct ipc_namespace *ns) { } -- GitLab From 462cd7724e2341472c9f9670ac88e250788d4c82 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 19 Sep 2022 09:44:06 +0800 Subject: [PATCH 1158/2223] usr/gen_init_cpio.c: remove unnecessary -1 values from int file The file variable is assigned first, so it does not need to be initialized. Link: https://lkml.kernel.org/r/20220919014406.3242-1-zeming@nfschina.com Signed-off-by: Li zeming Cc: Li zeming Cc: Masahiro Yamada Cc: Nicolas Schier Signed-off-by: Andrew Morton --- usr/gen_init_cpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usr/gen_init_cpio.c b/usr/gen_init_cpio.c index dc838e26a5b9a..ee01e40e8bc65 100644 --- a/usr/gen_init_cpio.c +++ b/usr/gen_init_cpio.c @@ -326,7 +326,7 @@ static int cpio_mkfile(const char *name, const char *location, char s[256]; struct stat buf; unsigned long size; - int file = -1; + int file; int retval; int rc = -1; int namesize; -- GitLab From bd17e036b495bebbf07a5fc814c868e30e1dc131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Wed, 14 Sep 2022 12:02:55 +0200 Subject: [PATCH 1159/2223] checkpatch: warn for non-standard fixes tag style MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a warning for fixes tags that do not follow community conventions.
Link: https://lkml.kernel.org/r/20220914100255.1048460-1-niklas.soderlund@corigine.com Signed-off-by: Niklas Söderlund Reviewed-by: Simon Horman Reviewed-by: Louis Peens Reviewed-by: Philippe Schenker Acked-by: Dwaipayan Ray Reviewed-by: Lukas Bulwahn Acked-by: Lukas Bulwahn Acked-by: Joe Perches Signed-off-by: Andrew Morton --- Documentation/dev-tools/checkpatch.rst | 7 ++++ scripts/checkpatch.pl | 44 ++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/Documentation/dev-tools/checkpatch.rst b/Documentation/dev-tools/checkpatch.rst index b52452bc29636..c3389c6f38381 100644 --- a/Documentation/dev-tools/checkpatch.rst +++ b/Documentation/dev-tools/checkpatch.rst @@ -612,6 +612,13 @@ Commit message See: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes + **BAD_FIXES_TAG** + The Fixes: tag is malformed or does not follow the community conventions. + This can occur if the tag have been split into multiple lines (e.g., when + pasted in an email program with word wrapping enabled). + + See: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes + Comparison style ---------------- diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 18effbe1fe908..e8e0542f29f02 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3146,6 +3146,50 @@ sub process { } } +# Check Fixes: styles is correct + if (!$in_header_lines && + $line =~ /^\s*fixes:?\s*(?:commit\s*)?[0-9a-f]{5,}\b/i) { + my $orig_commit = ""; + my $id = "0123456789ab"; + my $title = "commit title"; + my $tag_case = 1; + my $tag_space = 1; + my $id_length = 1; + my $id_case = 1; + my $title_has_quotes = 0; + + if ($line =~ /(\s*fixes:?)\s+([0-9a-f]{5,})\s+($balanced_parens)/i) { + my $tag = $1; + $orig_commit = $2; + $title = $3; + + $tag_case = 0 if $tag eq "Fixes:"; + $tag_space = 0 if ($line =~ /^fixes:? 
[0-9a-f]{5,} ($balanced_parens)/i); + + $id_length = 0 if ($orig_commit =~ /^[0-9a-f]{12}$/i); + $id_case = 0 if ($orig_commit !~ /[A-F]/); + + # Always strip leading/trailing parens then double quotes if existing + $title = substr($title, 1, -1); + if ($title =~ /^".*"$/) { + $title = substr($title, 1, -1); + $title_has_quotes = 1; + } + } + + my ($cid, $ctitle) = git_commit_info($orig_commit, $id, + $title); + + if ($ctitle ne $title || $tag_case || $tag_space || + $id_length || $id_case || !$title_has_quotes) { + if (WARN("BAD_FIXES_TAG", + "Please use correct Fixes: style 'Fixes: <12 chars of sha1> (\"<title line>\")' - ie: 'Fixes: $cid (\"$ctitle\")'\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] = "Fixes: $cid (\"$ctitle\")"; + } + } + } + # Check email subject for common tools that don't need to be mentioned if ($in_header_lines && $line =~ /^Subject:.*\b(?:checkpatch|sparse|smatch)\b[^:]/i) { -- GitLab From 0badb2e46a7699d09ef09a2c7f8f4fe66c15e606 Mon Sep 17 00:00:00 2001 From: Minghao Chi <chi.minghao@zte.com.cn> Date: Wed, 21 Sep 2022 12:48:02 +0900 Subject: [PATCH 1160/2223] nilfs2: delete unnecessary checks before brelse() Patch series "nilfs2 minor amendments". This patch (of 2): The brelse() inline function tests whether its argument is NULL and then returns immediately. Thus remove the tests which are not needed around the shown calls.
Link: https://lkml.kernel.org/r/20220921034803.2476-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20220819081700.96279-1-chi.minghao@zte.com.cn Link: https://lkml.kernel.org/r/20220921034803.2476-2-konishi.ryusuke@gmail.com Reported-by: Zeal Robot <zealci@zte.com.cn> Signed-off-by: Minghao Chi <chi.minghao@zte.com.cn> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: ye xingchen <ye.xingchen@zte.com.cn> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/nilfs2/btree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 9f4d9432d38a1..b9d15c3df3cc1 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -1668,8 +1668,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key) maxkey = nilfs_btree_node_get_key(node, nchildren - 1); nextmaxkey = (nchildren > 1) ? nilfs_btree_node_get_key(node, nchildren - 2) : 0; - if (bh != NULL) - brelse(bh); + brelse(bh); return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW); } @@ -1717,8 +1716,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *btree, ptrs[i] = le64_to_cpu(dptrs[i]); } - if (bh != NULL) - brelse(bh); + brelse(bh); return nitems; } -- GitLab From da6f79164e98de4ab3f2fdeea4875207fe282014 Mon Sep 17 00:00:00 2001 From: ye xingchen <ye.xingchen@zte.com.cn> Date: Wed, 21 Sep 2022 12:48:03 +0900 Subject: [PATCH 1161/2223] nilfs2: remove the unneeded result variable Return the value nilfs_segctor_sync() directly instead of storing it in another redundant variable. 
Link: https://lkml.kernel.org/r/20220831033403.302184-1-ye.xingchen@zte.com.cn Link: https://lkml.kernel.org/r/20220921034803.2476-3-konishi.ryusuke@gmail.com Reported-by: Zeal Robot <zealci@zte.com.cn> Signed-off-by: ye xingchen <ye.xingchen@zte.com.cn> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: Minghao Chi <chi.minghao@zte.com.cn> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/nilfs2/segment.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 0afe0832c7547..9abae2c9120ed 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2235,7 +2235,6 @@ int nilfs_construct_segment(struct super_block *sb) struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; struct nilfs_transaction_info *ti; - int err; if (!sci) return -EROFS; @@ -2243,8 +2242,7 @@ int nilfs_construct_segment(struct super_block *sb) /* A call inside transactions causes a deadlock. */ BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC); - err = nilfs_segctor_sync(sci); - return err; + return nilfs_segctor_sync(sci); } /** -- GitLab From ef1d61781bc6708ccc4a21262cc80a7dad952e04 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan <adobriyan@gmail.com> Date: Tue, 20 Sep 2022 20:35:23 +0300 Subject: [PATCH 1162/2223] proc: mark more files as permanent Mark /proc/devices /proc/kpagecount /proc/kpageflags /proc/kpagecgroup /proc/loadavg /proc/meminfo /proc/softirqs /proc/uptime /proc/version as permanent /proc entries, saving alloc/free and some list/spinlock ops per use. These files are never removed by the kernel so it is OK to mark them. 
Link: https://lkml.kernel.org/r/Yyn527DzDMa+r0Yj@localhost.localdomain Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/proc/devices.c | 6 +++++- fs/proc/internal.h | 5 +++++ fs/proc/loadavg.c | 6 +++++- fs/proc/meminfo.c | 5 ++++- fs/proc/page.c | 3 +++ fs/proc/softirqs.c | 6 +++++- fs/proc/uptime.c | 6 +++++- fs/proc/version.c | 6 +++++- 8 files changed, 37 insertions(+), 6 deletions(-) diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 837971e741097..fe7bfcb7d0494 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -4,6 +4,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/blkdev.h> +#include "internal.h" static int devinfo_show(struct seq_file *f, void *v) { @@ -54,7 +55,10 @@ static const struct seq_operations devinfo_ops = { static int __init proc_devices_init(void) { - proc_create_seq("devices", 0, NULL, &devinfo_ops); + struct proc_dir_entry *pde; + + pde = proc_create_seq("devices", 0, NULL, &devinfo_ops); + pde_make_permanent(pde); return 0; } fs_initcall(proc_devices_init); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 06a80f78433d8..af277184b8073 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -79,6 +79,11 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde) return pde->flags & PROC_ENTRY_PERMANENT; } +static inline void pde_make_permanent(struct proc_dir_entry *pde) +{ + pde->flags |= PROC_ENTRY_PERMANENT; +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index f32878d9a39f3..817981e57223e 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -9,6 +9,7 @@ #include <linux/seq_file.h> #include <linux/seqlock.h> #include <linux/time.h> +#include "internal.h" static int loadavg_proc_show(struct seq_file *m, void *v) { @@ -27,7 +28,10 @@ static int loadavg_proc_show(struct seq_file *m, void *v) static int 
__init proc_loadavg_init(void) { - proc_create_single("loadavg", 0, NULL, loadavg_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("loadavg", 0, NULL, loadavg_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6e89f0e2fd20f..70e5294052d52 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -162,7 +162,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) static int __init proc_meminfo_init(void) { - proc_create_single("meminfo", 0, NULL, meminfo_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("meminfo", 0, NULL, meminfo_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_meminfo_init); diff --git a/fs/proc/page.c b/fs/proc/page.c index a2873a617ae86..f2273b164535b 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -91,6 +91,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, } static const struct proc_ops kpagecount_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecount_read, }; @@ -268,6 +269,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, } static const struct proc_ops kpageflags_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpageflags_read, }; @@ -322,6 +324,7 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf, } static const struct proc_ops kpagecgroup_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecgroup_read, }; diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 12901dcf57e2b..f4616083faef3 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -3,6 +3,7 @@ #include <linux/kernel_stat.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include "internal.h" /* * /proc/softirqs ... 
display the number of softirqs @@ -27,7 +28,10 @@ static int show_softirqs(struct seq_file *p, void *v) static int __init proc_softirqs_init(void) { - proc_create_single("softirqs", 0, NULL, show_softirqs); + struct proc_dir_entry *pde; + + pde = proc_create_single("softirqs", 0, NULL, show_softirqs); + pde_make_permanent(pde); return 0; } fs_initcall(proc_softirqs_init); diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index deb99bc9b7e6b..b5343d209381a 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -7,6 +7,7 @@ #include <linux/time.h> #include <linux/time_namespace.h> #include <linux/kernel_stat.h> +#include "internal.h" static int uptime_proc_show(struct seq_file *m, void *v) { @@ -39,7 +40,10 @@ static int uptime_proc_show(struct seq_file *m, void *v) static int __init proc_uptime_init(void) { - proc_create_single("uptime", 0, NULL, uptime_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("uptime", 0, NULL, uptime_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c index b449f186577f8..02e3c3cd4a9af 100644 --- a/fs/proc/version.c +++ b/fs/proc/version.c @@ -5,6 +5,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/utsname.h> +#include "internal.h" static int version_proc_show(struct seq_file *m, void *v) { @@ -17,7 +18,10 @@ static int version_proc_show(struct seq_file *m, void *v) static int __init proc_version_init(void) { - proc_create_single("version", 0, NULL, version_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("version", 0, NULL, version_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_version_init); -- GitLab From 374d6cda7946431611c41cbb6e75dc4a25727ea8 Mon Sep 17 00:00:00 2001 From: Zhou jie <zhoujie@nfschina.com> Date: Wed, 28 Sep 2022 09:45:39 +0800 Subject: [PATCH 1163/2223] init/main.c: remove unnecessary (void*) conversions The void pointer object can be directly 
assigned to different structure objects, it does not need to be cast. Link: https://lkml.kernel.org/r/20220928014539.11046-1-zhoujie@nfschina.com Signed-off-by: Zhou jie <zhoujie@nfschina.com> Reviewed-by: Andrew Halaney <ahalaney@redhat.com> Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- init/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/init/main.c b/init/main.c index a45f9eca40af0..22255723599b2 100644 --- a/init/main.c +++ b/init/main.c @@ -1237,7 +1237,7 @@ __setup("initcall_blacklist=", initcall_blacklist); static __init_or_module void trace_initcall_start_cb(void *data, initcall_t fn) { - ktime_t *calltime = (ktime_t *)data; + ktime_t *calltime = data; printk(KERN_DEBUG "calling %pS @ %i\n", fn, task_pid_nr(current)); *calltime = ktime_get(); @@ -1246,7 +1246,7 @@ trace_initcall_start_cb(void *data, initcall_t fn) static __init_or_module void trace_initcall_finish_cb(void *data, initcall_t fn, int ret) { - ktime_t rettime, *calltime = (ktime_t *)data; + ktime_t rettime, *calltime = data; rettime = ktime_get(); printk(KERN_DEBUG "initcall %pS returned %d after %lld usecs\n", -- GitLab From 3baca1a4d490484fcd555413f1fec85b2e071912 Mon Sep 17 00:00:00 2001 From: Anup Patel <apatel@ventanamicro.com> Date: Wed, 27 Jul 2022 10:08:29 +0530 Subject: [PATCH 1164/2223] RISC-V: Add mvendorid, marchid, and mimpid to /proc/cpuinfo output Identifying the underlying RISC-V implementation can be important for some of the user space applications. For example, the perf tool uses arch specific CPU implementation id (i.e. CPUID) to select a JSON file describing custom perf events on a CPU. Currently, there is no way to identify RISC-V implementation so we add mvendorid, marchid, and mimpid to /proc/cpuinfo output. 
Signed-off-by: Anup Patel <apatel@ventanamicro.com> Reviewed-by: Heinrich Schuchardt <heinrich.schuchardt@canonical.com> Tested-by: Nikita Shubin <n.shubin@yadro.com> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/20220727043829.151794-1-apatel@ventanamicro.com/ Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- arch/riscv/kernel/cpu.c | 51 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index 0be8a2403212d..250220e2ccebf 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -3,10 +3,13 @@ * Copyright (C) 2012 Regents of the University of California */ +#include <linux/cpu.h> #include <linux/init.h> #include <linux/seq_file.h> #include <linux/of.h> +#include <asm/csr.h> #include <asm/hwcap.h> +#include <asm/sbi.h> #include <asm/smp.h> #include <asm/pgtable.h> @@ -68,6 +71,50 @@ int riscv_of_parent_hartid(struct device_node *node, unsigned long *hartid) } #ifdef CONFIG_PROC_FS + +struct riscv_cpuinfo { + unsigned long mvendorid; + unsigned long marchid; + unsigned long mimpid; +}; +static DEFINE_PER_CPU(struct riscv_cpuinfo, riscv_cpuinfo); + +static int riscv_cpuinfo_starting(unsigned int cpu) +{ + struct riscv_cpuinfo *ci = this_cpu_ptr(&riscv_cpuinfo); + +#if IS_ENABLED(CONFIG_RISCV_SBI) + ci->mvendorid = sbi_spec_is_0_1() ? 0 : sbi_get_mvendorid(); + ci->marchid = sbi_spec_is_0_1() ? 0 : sbi_get_marchid(); + ci->mimpid = sbi_spec_is_0_1() ? 
0 : sbi_get_mimpid(); +#elif IS_ENABLED(CONFIG_RISCV_M_MODE) + ci->mvendorid = csr_read(CSR_MVENDORID); + ci->marchid = csr_read(CSR_MARCHID); + ci->mimpid = csr_read(CSR_MIMPID); +#else + ci->mvendorid = 0; + ci->marchid = 0; + ci->mimpid = 0; +#endif + + return 0; +} + +static int __init riscv_cpuinfo_init(void) +{ + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "riscv/cpuinfo:starting", + riscv_cpuinfo_starting, NULL); + if (ret < 0) { + pr_err("cpuinfo: failed to register hotplug callbacks.\n"); + return ret; + } + + return 0; +} +device_initcall(riscv_cpuinfo_init); + #define __RISCV_ISA_EXT_DATA(UPROP, EXTID) \ { \ .uprop = #UPROP, \ @@ -185,6 +232,7 @@ static int c_show(struct seq_file *m, void *v) { unsigned long cpu_id = (unsigned long)v - 1; struct device_node *node = of_get_cpu_node(cpu_id, NULL); + struct riscv_cpuinfo *ci = per_cpu_ptr(&riscv_cpuinfo, cpu_id); const char *compat, *isa; seq_printf(m, "processor\t: %lu\n", cpu_id); @@ -195,6 +243,9 @@ static int c_show(struct seq_file *m, void *v) if (!of_property_read_string(node, "compatible", &compat) && strcmp(compat, "riscv")) seq_printf(m, "uarch\t\t: %s\n", compat); + seq_printf(m, "mvendorid\t: 0x%lx\n", ci->mvendorid); + seq_printf(m, "marchid\t\t: 0x%lx\n", ci->marchid); + seq_printf(m, "mimpid\t\t: 0x%lx\n", ci->mimpid); seq_puts(m, "\n"); of_node_put(node); -- GitLab From 44159659df8ca381b84261e11058b2176fa03ba0 Mon Sep 17 00:00:00 2001 From: Shida Zhang <zhangshida@kylinos.cn> Date: Tue, 4 Oct 2022 16:39:42 +1100 Subject: [PATCH 1165/2223] xfs: trim the mapp array accordingly in xfs_da_grow_inode_int Take a look at the for-loop in xfs_da_grow_inode_int: ====== for(){ nmap = min(XFS_BMAP_MAX_NMAP, count); ... error = xfs_bmapi_write(...,&mapp[mapi], &nmap);//(..., $1, $2) ... mapi += nmap; } ===== where $1 stands for the start address of the array, while $2 is used to indicate the size of the array. 
The array $1 will advance by $nmap in each iteration after the allocation of extents. But the size $2 still remains unchanged, which is determined by min(XFS_BMAP_MAX_NMAP, count). It seems that it has forgotten to trim the mapp array after each iteration, so change it. Signed-off-by: Shida Zhang <zhangshida@kylinos.cn> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Dave Chinner <david@fromorbit.com> --- fs/xfs/libxfs/xfs_da_btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e7201dc68f430..e576560b46e97 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2192,8 +2192,8 @@ xfs_da_grow_inode_int( */ mapp = kmem_alloc(sizeof(*mapp) * count, 0); for (b = *bno, mapi = 0; b < *bno + count; ) { - nmap = min(XFS_BMAP_MAX_NMAP, count); c = (int)(*bno + count - b); + nmap = min(XFS_BMAP_MAX_NMAP, c); error = xfs_bmapi_write(tp, dp, b, c, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, args->total, &mapp[mapi], &nmap); -- GitLab From c098576f5f63bc0ee2424bba50892514a71d54e8 Mon Sep 17 00:00:00 2001 From: Shida Zhang <zhangshida@kylinos.cn> Date: Tue, 4 Oct 2022 16:39:58 +1100 Subject: [PATCH 1166/2223] xfs: rearrange the logic and remove the broken comment for xfs_dir2_isxx xfs_dir2_isleaf is used to see if the directory is a single-leaf form directory instead, as commented right above the function. Besides getting rid of the broken comment, we rearrange the logic by converting everything over to standard formatting and conventions, at the same time, to make it easier to understand and self documenting. Signed-off-by: Shida Zhang <zhangshida@kylinos.cn> Reviewed-by: Darrick J. 
Wong <djwong@kernel.org> Signed-off-by: Dave Chinner <david@fromorbit.com> --- fs/xfs/libxfs/xfs_dir2.c | 50 +++++++++++++++++++++++---------------- fs/xfs/libxfs/xfs_dir2.h | 4 ++-- fs/xfs/scrub/dir.c | 2 +- fs/xfs/xfs_dir2_readdir.c | 2 +- 4 files changed, 34 insertions(+), 24 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 76eedc2756b31..92bac3373f1f5 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -261,7 +261,7 @@ xfs_dir_createname( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -357,7 +357,7 @@ xfs_dir_lookup( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; int lock_mode; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -435,7 +435,7 @@ xfs_dir_removename( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); @@ -493,7 +493,7 @@ xfs_dir_replace( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -610,19 +610,23 @@ xfs_dir2_grow_inode( int xfs_dir2_isblock( struct xfs_da_args *args, - int *vp) /* out: 1 is block, 0 is not block */ + bool *isblock) { - xfs_fileoff_t last; /* last file offset */ - int rval; + struct xfs_mount *mp = args->dp->i_mount; + xfs_fileoff_t eof; + int error; - if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) - return rval; - rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; - if (XFS_IS_CORRUPT(args->dp->i_mount, - rval != 0 && - args->dp->i_disk_size != args->geo->blksize)) + error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); + if (error) + return error; + + *isblock = false; + if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize) + return 0; + + *isblock = true; + if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) return -EFSCORRUPTED; - *vp = 
rval; return 0; } @@ -632,14 +636,20 @@ xfs_dir2_isblock( int xfs_dir2_isleaf( struct xfs_da_args *args, - int *vp) /* out: 1 is block, 0 is not block */ + bool *isleaf) { - xfs_fileoff_t last; /* last file offset */ - int rval; + xfs_fileoff_t eof; + int error; - if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) - return rval; - *vp = last == args->geo->leafblk + args->geo->fsbcount; + error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); + if (error) + return error; + + *isleaf = false; + if (eof != args->geo->leafblk + args->geo->fsbcount) + return 0; + + *isleaf = true; return 0; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index b6df3c34b26af..dd39f17dd9a9c 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -61,8 +61,8 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); /* * Interface routines used by userspace utilities */ -extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r); -extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); +extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock); +extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 5abb5fdb71d93..b9c5764e74374 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -676,7 +676,7 @@ xchk_directory_blocks( xfs_dablk_t dabno; xfs_dir2_db_t last_data_db = 0; bool found; - int is_block = 0; + bool is_block = false; int error; /* Ignore local format directories. 
*/ diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index e295fc8062d81..9f3ceb4615156 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -512,7 +512,7 @@ xfs_readdir( { struct xfs_da_args args = { NULL }; unsigned int lock_mode; - int isblock; + bool isblock; int error; trace_xfs_readdir(dp); -- GitLab From e033f40be262c4d227f8fbde52856e1d8646872b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Tue, 4 Oct 2022 16:40:01 +1100 Subject: [PATCH 1167/2223] xfs: on memory failure, only shut down fs after scanning all mappings xfs_dax_failure_fn is used to scan the filesystem during a memory failure event to look for memory mappings to revoke. Unfortunately, if it encounters an rmap record for filesystem metadata, it will shut down the filesystem and the scan immediately. This means that we don't complete the mapping revocation scan and instead leave live mappings to failed memory. Fix the function to defer the shutdown until after we've finished culling mappings. While we're at it, add the usual "xfs_" prefix to struct failure_info, and actually initialize mf_flags. Fixes: 6f643c57d57c ("xfs: implement ->notify_failure() for XFS") Signed-off-by: Darrick J. 
Wong <djwong@kernel.org> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com> --- fs/xfs/xfs_notify_failure.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index 69d9c83ea4b21..65d5eb20878e6 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -23,17 +23,18 @@ #include <linux/mm.h> #include <linux/dax.h> -struct failure_info { +struct xfs_failure_info { xfs_agblock_t startblock; xfs_extlen_t blockcount; int mf_flags; + bool want_shutdown; }; static pgoff_t xfs_failure_pgoff( struct xfs_mount *mp, const struct xfs_rmap_irec *rec, - const struct failure_info *notify) + const struct xfs_failure_info *notify) { loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset); @@ -47,7 +48,7 @@ static unsigned long xfs_failure_pgcnt( struct xfs_mount *mp, const struct xfs_rmap_irec *rec, - const struct failure_info *notify) + const struct xfs_failure_info *notify) { xfs_agblock_t end_rec; xfs_agblock_t end_notify; @@ -71,13 +72,13 @@ xfs_dax_failure_fn( { struct xfs_mount *mp = cur->bc_mp; struct xfs_inode *ip; - struct failure_info *notify = data; + struct xfs_failure_info *notify = data; int error = 0; if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); - return -EFSCORRUPTED; + notify->want_shutdown = true; + return 0; } /* Get files that incore, filter out others that are not in use. 
*/ @@ -86,8 +87,10 @@ xfs_dax_failure_fn( /* Continue the rmap query if the inode isn't incore */ if (error == -ENODATA) return 0; - if (error) - return error; + if (error) { + notify->want_shutdown = true; + return 0; + } error = mf_dax_kill_procs(VFS_I(ip)->i_mapping, xfs_failure_pgoff(mp, rec, notify), @@ -104,6 +107,7 @@ xfs_dax_notify_ddev_failure( xfs_daddr_t bblen, int mf_flags) { + struct xfs_failure_info notify = { .mf_flags = mf_flags }; struct xfs_trans *tp = NULL; struct xfs_btree_cur *cur = NULL; struct xfs_buf *agf_bp = NULL; @@ -120,7 +124,6 @@ xfs_dax_notify_ddev_failure( for (; agno <= end_agno; agno++) { struct xfs_rmap_irec ri_low = { }; struct xfs_rmap_irec ri_high; - struct failure_info notify; struct xfs_agf *agf; xfs_agblock_t agend; struct xfs_perag *pag; @@ -161,6 +164,11 @@ xfs_dax_notify_ddev_failure( } xfs_trans_cancel(tp); + if (error || notify.want_shutdown) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); + if (!error) + error = -EFSCORRUPTED; + } return error; } -- GitLab From 4635c0e2a7f7f3568cbfccae70121f9835efa62c Mon Sep 17 00:00:00 2001 From: Quentin Schulz <quentin.schulz@theobroma-systems.com> Date: Fri, 30 Sep 2022 15:20:32 +0200 Subject: [PATCH 1168/2223] pinctrl: rockchip: add pinmux_ops.gpio_set_direction callback Before the split of gpio and pinctrl sections in their own driver, rockchip_set_mux was called in pinmux_ops.gpio_set_direction for configuring a pin in its GPIO function. This is essential for cases where pinctrl is "bypassed" by gpio consumers otherwise the GPIO function is not configured for the pin and it does not work. Such was the case for the sysfs/libgpiod userspace GPIO handling. Let's re-implement the pinmux_ops.gpio_set_direction callback so that the gpio subsystem can request from the pinctrl driver to put the pin in its GPIO function. 
Fixes: 9ce9a02039de ("pinctrl/rockchip: drop the gpio related codes") Cc: stable@vger.kernel.org Reviewed-by: Heiko Stuebner <heiko@sntech.de> Signed-off-by: Quentin Schulz <quentin.schulz@theobroma-systems.com> Link: https://lore.kernel.org/r/20220930132033.4003377-2-foss+kernel@0leil.net Signed-off-by: Linus Walleij <linus.walleij@linaro.org> --- drivers/pinctrl/pinctrl-rockchip.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index a91061f9c2aca..53bdfc40f0558 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -2665,11 +2665,24 @@ static int rockchip_pmx_set(struct pinctrl_dev *pctldev, unsigned selector, return 0; } +static int rockchip_pmx_gpio_set_direction(struct pinctrl_dev *pctldev, + struct pinctrl_gpio_range *range, + unsigned offset, + bool input) +{ + struct rockchip_pinctrl *info = pinctrl_dev_get_drvdata(pctldev); + struct rockchip_pin_bank *bank; + + bank = pin_to_bank(info, offset); + return rockchip_set_mux(bank, offset - bank->pin_base, RK_FUNC_GPIO); +} + static const struct pinmux_ops rockchip_pmx_ops = { .get_functions_count = rockchip_pmx_get_funcs_count, .get_function_name = rockchip_pmx_get_func_name, .get_function_groups = rockchip_pmx_get_groups, .set_mux = rockchip_pmx_set, + .gpio_set_direction = rockchip_pmx_gpio_set_direction, }; /* -- GitLab From 8ea8af6c8469156ac2042d83d73f6b74eb4b4b45 Mon Sep 17 00:00:00 2001 From: Quentin Schulz <quentin.schulz@theobroma-systems.com> Date: Fri, 30 Sep 2022 15:20:33 +0200 Subject: [PATCH 1169/2223] gpio: rockchip: request GPIO mux to pinctrl when setting direction Before the split of gpio and pinctrl sections in their own driver, rockchip_set_mux was called in pinmux_ops.gpio_set_direction for configuring a pin in its GPIO function. 
This is essential for cases where pinctrl is "bypassed" by gpio consumers otherwise the GPIO function is not configured for the pin and it does not work. Such was the case for the sysfs/libgpiod userspace GPIO handling. Let's call pinctrl_gpio_direction_input/output when setting the direction of a GPIO so that the pinctrl core requests from the rockchip pinctrl driver to put the pin in its GPIO function. Fixes: 9ce9a02039de ("pinctrl/rockchip: drop the gpio related codes") Fixes: 936ee2675eee ("gpio/rockchip: add driver for rockchip gpio") Cc: stable@vger.kernel.org Reviewed-by: Heiko Stuebner <heiko@sntech.de> Signed-off-by: Quentin Schulz <quentin.schulz@theobroma-systems.com> Link: https://lore.kernel.org/r/20220930132033.4003377-3-foss+kernel@0leil.net Signed-off-by: Linus Walleij <linus.walleij@linaro.org> --- drivers/gpio/gpio-rockchip.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpio/gpio-rockchip.c b/drivers/gpio/gpio-rockchip.c index f91e876fd9690..2bde8365b1258 100644 --- a/drivers/gpio/gpio-rockchip.c +++ b/drivers/gpio/gpio-rockchip.c @@ -19,6 +19,7 @@ #include <linux/of_address.h> #include <linux/of_device.h> #include <linux/of_irq.h> +#include <linux/pinctrl/consumer.h> #include <linux/pinctrl/pinconf-generic.h> #include <linux/regmap.h> @@ -156,6 +157,12 @@ static int rockchip_gpio_set_direction(struct gpio_chip *chip, unsigned long flags; u32 data = input ? 
0 : 1; + + if (input) + pinctrl_gpio_direction_input(bank->pin_base + offset); + else + pinctrl_gpio_direction_output(bank->pin_base + offset); + raw_spin_lock_irqsave(&bank->slock, flags); rockchip_gpio_writel_bit(bank, offset, data, bank->gpio_regs->port_ddr); raw_spin_unlock_irqrestore(&bank->slock, flags); -- GitLab From 19fdcb1d98a6adcab27db4cc0d111fcba0f7bd8f Mon Sep 17 00:00:00 2001 From: Shang XiaoJing <shangxiaojing@huawei.com> Date: Fri, 23 Sep 2022 18:10:38 +0800 Subject: [PATCH 1170/2223] pinctrl: bcm: ns: Remove redundant dev_err call devm_ioremap_resource() prints error message in itself. Remove the dev_err call to avoid redundant error message. Signed-off-by: Shang XiaoJing <shangxiaojing@huawei.com> Acked-by: Florian Fainelli <f.fainelli@gmail.com> Acked-by: Ray Jui <ray.jui@broadcom.com> Link: https://lore.kernel.org/r/20220923101038.18036-1-shangxiaojing@huawei.com Signed-off-by: Linus Walleij <linus.walleij@linaro.org> --- drivers/pinctrl/bcm/pinctrl-ns.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pinctrl/bcm/pinctrl-ns.c b/drivers/pinctrl/bcm/pinctrl-ns.c index 65a86543c58cc..465cc96814a11 100644 --- a/drivers/pinctrl/bcm/pinctrl-ns.c +++ b/drivers/pinctrl/bcm/pinctrl-ns.c @@ -233,10 +233,8 @@ static int ns_pinctrl_probe(struct platform_device *pdev) res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cru_gpio_control"); ns_pinctrl->base = devm_ioremap_resource(dev, res); - if (IS_ERR(ns_pinctrl->base)) { - dev_err(dev, "Failed to map pinctrl regs\n"); + if (IS_ERR(ns_pinctrl->base)) return PTR_ERR(ns_pinctrl->base); - } memcpy(pctldesc, &ns_pinctrl_desc, sizeof(*pctldesc)); -- GitLab From 203672e1208c2f36ff31a305f6a70d73d9dbce63 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org> Date: Sun, 25 Sep 2022 13:21:03 +0200 Subject: [PATCH 1171/2223] pinctrl: qcom: restrict drivers per ARM/ARM64 There is no point to allow selecting pin-controller drivers for Qualcomm ARMv7 SoCs 
when building ARM64 kernel, and vice versa. This makes kernel configuration more difficult as many do not remember the Qualcomm SoCs. There won't be a single image for ARMv7 and ARMv8/9 SoCs, so no features/options are lost. Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org> Link: https://lore.kernel.org/r/20220925112103.148836-1-krzysztof.kozlowski@linaro.org Signed-off-by: Linus Walleij <linus.walleij@linaro.org> --- drivers/pinctrl/qcom/Kconfig | 39 ++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/pinctrl/qcom/Kconfig b/drivers/pinctrl/qcom/Kconfig index 2961b5eb8e10a..9dc2d803a5867 100644 --- a/drivers/pinctrl/qcom/Kconfig +++ b/drivers/pinctrl/qcom/Kconfig @@ -15,6 +15,7 @@ config PINCTRL_MSM config PINCTRL_APQ8064 tristate "Qualcomm APQ8064 pin controller driver" depends on OF + depends on ARM || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -23,6 +24,7 @@ config PINCTRL_APQ8064 config PINCTRL_APQ8084 tristate "Qualcomm APQ8084 pin controller driver" depends on OF + depends on ARM || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -31,6 +33,7 @@ config PINCTRL_APQ8084 config PINCTRL_IPQ4019 tristate "Qualcomm IPQ4019 pin controller driver" depends on OF + depends on ARM || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -39,6 +42,7 @@ config PINCTRL_IPQ4019 config PINCTRL_IPQ8064 tristate "Qualcomm IPQ8064 pin controller driver" depends on OF + depends on ARM || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -47,6 +51,7 @@ config PINCTRL_IPQ8064 config PINCTRL_IPQ8074 tristate "Qualcomm Technologies, Inc. 
IPQ8074 pin controller driver" depends on OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for @@ -57,6 +62,7 @@ config PINCTRL_IPQ8074 config PINCTRL_IPQ6018 tristate "Qualcomm Technologies, Inc. IPQ6018 pin controller driver" depends on OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for @@ -67,6 +73,7 @@ config PINCTRL_IPQ6018 config PINCTRL_MSM8226 tristate "Qualcomm 8226 pin controller driver" depends on OF + depends on ARM || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -76,6 +83,7 @@ config PINCTRL_MSM8226 config PINCTRL_MSM8660 tristate "Qualcomm 8660 pin controller driver" depends on OF + depends on ARM || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -84,6 +92,7 @@ config PINCTRL_MSM8660 config PINCTRL_MSM8960 tristate "Qualcomm 8960 pin controller driver" depends on OF + depends on ARM || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -100,6 +109,7 @@ config PINCTRL_MDM9607 config PINCTRL_MDM9615 tristate "Qualcomm 9615 pin controller driver" depends on OF + depends on ARM || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -108,6 +118,7 @@ config PINCTRL_MDM9615 config PINCTRL_MSM8X74 tristate "Qualcomm 8x74 pin controller driver" depends on OF + depends on ARM || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -116,6 +127,7 @@ config PINCTRL_MSM8X74 config PINCTRL_MSM8909 tristate "Qualcomm 8909 pin controller driver" depends on OF + depends on ARM || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -132,6 +144,7 @@ config PINCTRL_MSM8916 config 
PINCTRL_MSM8953 tristate "Qualcomm 8953 pin controller driver" depends on OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -142,6 +155,7 @@ config PINCTRL_MSM8953 config PINCTRL_MSM8976 tristate "Qualcomm 8976 pin controller driver" depends on OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -152,6 +166,7 @@ config PINCTRL_MSM8976 config PINCTRL_MSM8994 tristate "Qualcomm 8994 pin controller driver" depends on OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -161,6 +176,7 @@ config PINCTRL_MSM8994 config PINCTRL_MSM8996 tristate "Qualcomm MSM8996 pin controller driver" depends on OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -169,6 +185,7 @@ config PINCTRL_MSM8996 config PINCTRL_MSM8998 tristate "Qualcomm MSM8998 pin controller driver" depends on OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -177,6 +194,7 @@ config PINCTRL_MSM8998 config PINCTRL_QCM2290 tristate "Qualcomm QCM2290 pin controller driver" depends on OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -185,6 +203,7 @@ config PINCTRL_QCM2290 config PINCTRL_QCS404 tristate "Qualcomm QCS404 pin controller driver" depends on OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -232,6 +251,7 @@ config PINCTRL_QCOM_SSBI_PMIC config PINCTRL_SC7180 tristate "Qualcomm Technologies Inc SC7180 pin controller driver" depends on OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and 
gpiolib driver for the @@ -241,6 +261,7 @@ config PINCTRL_SC7180 config PINCTRL_SC7280 tristate "Qualcomm Technologies Inc SC7280 pin controller driver" depends on OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -250,6 +271,7 @@ config PINCTRL_SC7280 config PINCTRL_SC7280_LPASS_LPI tristate "Qualcomm Technologies Inc SC7280 LPASS LPI pin controller driver" depends on GPIOLIB + depends on ARM64 || COMPILE_TEST depends on PINCTRL_LPASS_LPI help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -259,6 +281,7 @@ config PINCTRL_SC7280_LPASS_LPI config PINCTRL_SC8180X tristate "Qualcomm Technologies Inc SC8180x pin controller driver" depends on (OF || ACPI) + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -268,6 +291,7 @@ config PINCTRL_SC8180X config PINCTRL_SC8280XP tristate "Qualcomm Technologies Inc SC8280xp pin controller driver" depends on OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -277,6 +301,7 @@ config PINCTRL_SC8280XP config PINCTRL_SDM660 tristate "Qualcomm Technologies Inc SDM660 pin controller driver" depends on OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -286,6 +311,7 @@ config PINCTRL_SDM660 config PINCTRL_SDM845 tristate "Qualcomm Technologies Inc SDM845 pin controller driver" depends on (OF || ACPI) + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -295,6 +321,7 @@ config PINCTRL_SDM845 config PINCTRL_SDX55 tristate "Qualcomm Technologies Inc SDX55 pin controller driver" depends on OF + depends on ARM || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -304,6 
+331,7 @@ config PINCTRL_SDX55 config PINCTRL_SM6115 tristate "Qualcomm Technologies Inc SM6115,SM4250 pin controller driver" depends on GPIOLIB && OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -313,6 +341,7 @@ config PINCTRL_SM6115 config PINCTRL_SM6125 tristate "Qualcomm Technologies Inc SM6125 pin controller driver" depends on OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -322,6 +351,7 @@ config PINCTRL_SM6125 config PINCTRL_SM6350 tristate "Qualcomm Technologies Inc SM6350 pin controller driver" depends on GPIOLIB && OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -331,6 +361,7 @@ config PINCTRL_SM6350 config PINCTRL_SM6375 tristate "Qualcomm Technologies Inc SM6375 pin controller driver" depends on GPIOLIB && OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -340,6 +371,7 @@ config PINCTRL_SM6375 config PINCTRL_SDX65 tristate "Qualcomm Technologies Inc SDX65 pin controller driver" depends on GPIOLIB && OF + depends on ARM || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -349,6 +381,7 @@ config PINCTRL_SDX65 config PINCTRL_SM8150 tristate "Qualcomm Technologies Inc SM8150 pin controller driver" depends on OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -358,6 +391,7 @@ config PINCTRL_SM8150 config PINCTRL_SM8250 tristate "Qualcomm Technologies Inc SM8250 pin controller driver" depends on OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -367,6 +401,7 @@ config PINCTRL_SM8250 config 
PINCTRL_SM8250_LPASS_LPI tristate "Qualcomm Technologies Inc SM8250 LPASS LPI pin controller driver" depends on GPIOLIB + depends on ARM64 || COMPILE_TEST depends on PINCTRL_LPASS_LPI help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -375,6 +410,7 @@ config PINCTRL_SM8250_LPASS_LPI config PINCTRL_SM8350 tristate "Qualcomm Technologies Inc SM8350 pin controller driver" + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -384,6 +420,7 @@ config PINCTRL_SM8350 config PINCTRL_SM8450 tristate "Qualcomm Technologies Inc SM8450 pin controller driver" depends on GPIOLIB && OF + depends on ARM64 || COMPILE_TEST depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -393,6 +430,7 @@ config PINCTRL_SM8450 config PINCTRL_SM8450_LPASS_LPI tristate "Qualcomm Technologies Inc SM8450 LPASS LPI pin controller driver" depends on GPIOLIB + depends on ARM64 || COMPILE_TEST depends on PINCTRL_LPASS_LPI help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -402,6 +440,7 @@ config PINCTRL_SM8450_LPASS_LPI config PINCTRL_SC8280XP_LPASS_LPI tristate "Qualcomm Technologies Inc SC8280XP LPASS LPI pin controller driver" depends on GPIOLIB + depends on ARM64 || COMPILE_TEST depends on PINCTRL_LPASS_LPI help This is the pinctrl, pinmux, pinconf and gpiolib driver for the -- GitLab From 66db794ad54ce49d4fd564a16f682f257f608655 Mon Sep 17 00:00:00 2001 From: Yuan Can <yuancan@huawei.com> Date: Tue, 27 Sep 2022 13:39:26 +0000 Subject: [PATCH 1172/2223] pinctrl: bcm: Remove unused struct bcm6328_pingroup After commit 0e3db16300fb("pinctrl: bcm: Convert drivers to use struct pingroup and PINCTRL_PINGROUP()"), no one uses struct bcm6328_pingroup, so remove it.
Signed-off-by: Yuan Can <yuancan@huawei.com> Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com> Link: https://lore.kernel.org/r/20220927133926.103943-1-yuancan@huawei.com Signed-off-by: Linus Walleij <linus.walleij@linaro.org> --- drivers/pinctrl/bcm/pinctrl-bcm6328.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/pinctrl/bcm/pinctrl-bcm6328.c b/drivers/pinctrl/bcm/pinctrl-bcm6328.c index 1eef5ab9a5e52..1e8cc2c80c81f 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm6328.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm6328.c @@ -26,12 +26,6 @@ #define BCM6328_MUX_OTHER_REG 0x24 #define BCM6328_MUX_MASK GENMASK(1, 0) -struct bcm6328_pingroup { - const char *name; - const unsigned * const pins; - const unsigned num_pins; -}; - struct bcm6328_function { const char *name; const char * const *groups; -- GitLab From f4a31facfa80df2f440a2fdc2b7f58d6c23925b0 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko <andriy.shevchenko@linux.intel.com> Date: Tue, 27 Sep 2022 20:55:09 +0300 Subject: [PATCH 1173/2223] pinctrl: wpcm450: Correct the fwnode_irq_get() return value check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fwnode_irq_get() can return zero to indicate IRQ mapping errors. Handle this case by skipping the interrupt resource. 
Fixes: a1d1e0e3d80a ("pinctrl: nuvoton: Add driver for WPCM450") Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com> Reviewed-by: Jonathan Neuschäfer <j.neuschaefer@gmx.net> Link: https://lore.kernel.org/r/20220927175509.15695-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij <linus.walleij@linaro.org> --- drivers/pinctrl/nuvoton/pinctrl-wpcm450.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c b/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c index 0dbeb91f0bf27..8193b92da4031 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c +++ b/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c @@ -1081,10 +1081,13 @@ static int wpcm450_gpio_register(struct platform_device *pdev, girq->num_parents = 0; for (i = 0; i < WPCM450_NUM_GPIO_IRQS; i++) { - int irq = fwnode_irq_get(child, i); + int irq; + irq = fwnode_irq_get(child, i); if (irq < 0) break; + if (!irq) + continue; girq->parents[i] = irq; girq->num_parents++; -- GitLab From e75729b2f63fbdbf776930de8b0eee0d43a68be6 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov <dmitry.torokhov@gmail.com> Date: Wed, 28 Sep 2022 13:20:18 -0700 Subject: [PATCH 1174/2223] pinctrl: st: stop abusing of_get_named_gpio() Pin descriptions for this chip only look like standard GPIO device tree descriptions, while in fact they contain additional data (in excess of number of cells specified in description of gpio controllers). They also refer to only pins/gpios belonging to the driver and not to arbitrary gpio in the system. Because we want to stop exporting OF-specific handlers from gpiolib-of, let's parse the pin reference ourselves instead of trying to call of_get_named_gpio().
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> Tested-by: Patrice Chotard <patrice.chotard@foss.st.com> Reviewed-by: Patrice Chotard <patrice.chotard@foss.st.com> Link: https://lore.kernel.org/r/YzSsgoVoJn4+mSpv@google.com Signed-off-by: Linus Walleij <linus.walleij@linaro.org> --- drivers/pinctrl/pinctrl-st.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/drivers/pinctrl/pinctrl-st.c b/drivers/pinctrl/pinctrl-st.c index 0fea71fd9a005..cf7f9cbe60440 100644 --- a/drivers/pinctrl/pinctrl-st.c +++ b/drivers/pinctrl/pinctrl-st.c @@ -12,7 +12,6 @@ #include <linux/io.h> #include <linux/of.h> #include <linux/of_irq.h> -#include <linux/of_gpio.h> /* of_get_named_gpio() */ #include <linux/of_address.h> #include <linux/gpio/driver.h> #include <linux/regmap.h> @@ -1162,6 +1161,31 @@ static void st_parse_syscfgs(struct st_pinctrl *info, int bank, return; } +static int st_pctl_dt_calculate_pin(struct st_pinctrl *info, + phandle bank, unsigned int offset) +{ + struct device_node *np; + struct gpio_chip *chip; + int retval = -EINVAL; + int i; + + np = of_find_node_by_phandle(bank); + if (!np) + return -EINVAL; + + for (i = 0; i < info->nbanks; i++) { + chip = &info->banks[i].gpio_chip; + if (chip->of_node == np) { + if (offset < chip->ngpio) + retval = chip->base + offset; + break; + } + } + + of_node_put(np); + return retval; +} + /* * Each pin is represented in of the below forms. 
* <bank offset mux direction rt_type rt_delay rt_clk> @@ -1175,6 +1199,8 @@ static int st_pctl_dt_parse_groups(struct device_node *np, struct device *dev = info->dev; struct st_pinconf *conf; struct device_node *pins; + phandle bank; + unsigned int offset; int i = 0, npins = 0, nr_props, ret = 0; pins = of_get_child_by_name(np, "st,pins"); @@ -1214,9 +1240,9 @@ static int st_pctl_dt_parse_groups(struct device_node *np, conf = &grp->pin_conf[i]; /* bank & offset */ - be32_to_cpup(list++); - be32_to_cpup(list++); - conf->pin = of_get_named_gpio(pins, pp->name, 0); + bank = be32_to_cpup(list++); + offset = be32_to_cpup(list++); + conf->pin = st_pctl_dt_calculate_pin(info, bank, offset); conf->name = pp->name; grp->pins[i] = conf->pin; /* mux */ -- GitLab From 448921706bdd1758ac63c07185c5a4713278d6f8 Mon Sep 17 00:00:00 2001 From: Marek Vasut <marex@denx.de> Date: Mon, 26 Sep 2022 22:47:24 +0200 Subject: [PATCH 1175/2223] dt-bindings: pinctrl: st,stm32: Document gpio-line-names Document gpio-line-names property as valid property. 
This fixes dtbs_check warnings when building current Linux DTs: " arch/arm/boot/dts/stm32mp153c-dhcom-drc02.dtb: pinctrl@50002000: gpio@50009000: 'gpio-line-names' does not match any of the regexes: 'pinctrl-[0-9]+' " Signed-off-by: Marek Vasut <marex@denx.de> Acked-by: Rob Herring <robh@kernel.org> Link: https://lore.kernel.org/r/20220926204724.381760-1-marex@denx.de Signed-off-by: Linus Walleij <linus.walleij@linaro.org> --- Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml index 53c952d93ea28..06229d93c24cf 100644 --- a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml @@ -71,6 +71,7 @@ patternProperties: maxItems: 1 resets: maxItems: 1 + gpio-line-names: true gpio-ranges: minItems: 1 maxItems: 16 -- GitLab From 140bb02315e78923dc0ecd7d3c7f021c0167a817 Mon Sep 17 00:00:00 2001 From: Marek Vasut <marex@denx.de> Date: Mon, 26 Sep 2022 22:47:35 +0200 Subject: [PATCH 1176/2223] dt-bindings: pinctrl: st,stm32: Document gpio-hog pattern property Document gpio-hog pattern property and its subnodes. 
This fixes dtbs_check warnings when building current Linux DTs: " arch/arm/boot/dts/stm32mp153c-dhcom-drc02.dtb: pinctrl@50002000: gpio@50003000: 'rs485-rx-en-hog' does not match any of the regexes: 'pinctrl-[0-9]+' " Signed-off-by: Marek Vasut <marex@denx.de> Acked-by: Rob Herring <robh@kernel.org> Link: https://lore.kernel.org/r/20220926204735.381779-1-marex@denx.de Signed-off-by: Linus Walleij <linus.walleij@linaro.org> --- .../devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml index 06229d93c24cf..12598e036287b 100644 --- a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml @@ -107,6 +107,12 @@ patternProperties: minimum: 0 maximum: 11 + patternProperties: + "^(.+-hog(-[0-9]+)?)$": + type: object + required: + - gpio-hog + required: - gpio-controller - '#gpio-cells' -- GitLab From 5197b707d68ad75a165db743ac1151ea3407c1eb Mon Sep 17 00:00:00 2001 From: Marek Vasut <marex@denx.de> Date: Mon, 26 Sep 2022 22:47:52 +0200 Subject: [PATCH 1177/2223] dt-bindings: pinctrl: st,stm32: Document interrupt-controller property Document interrupt-controller property and its interrupt-cells. 
This fixes dtbs_check warnings when building current Linux DTs: " arch/arm/boot/dts/stm32mp153c-dhcom-drc02.dtb: pinctrl@50002000: gpio@5000a000: '#interrupt-cells', 'interrupt-controller' do not match any of the regexes: 'pinctrl-[0-9]+' " Signed-off-by: Marek Vasut <marex@denx.de> Acked-by: Rob Herring <robh@kernel.org> Link: https://lore.kernel.org/r/20220926204752.381798-1-marex@denx.de Signed-off-by: Linus Walleij <linus.walleij@linaro.org> --- .../devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml index 12598e036287b..9d59208d83c18 100644 --- a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml @@ -64,6 +64,9 @@ patternProperties: gpio-controller: true '#gpio-cells': const: 2 + interrupt-controller: true + '#interrupt-cells': + const: 2 reg: maxItems: 1 -- GitLab From ba7fdf88e98acadc00c56e1272d022564f7ac721 Mon Sep 17 00:00:00 2001 From: Jianlong Huang <jianlong.huang@starfivetech.com> Date: Fri, 30 Sep 2022 14:08:19 +0800 Subject: [PATCH 1178/2223] pinctrl: Create subdirectory for StarFive drivers Move the StarFive JH7100 pinctrl driver to a new subdirectory in preparation for adding more StarFive pinctrl drivers. No functional change. 
Signed-off-by: Jianlong Huang <jianlong.huang@starfivetech.com> Signed-off-by: Hal Feng <hal.feng@linux.starfivetech.com> Link: https://lore.kernel.org/r/20220930060819.5320-1-hal.feng@linux.starfivetech.com Signed-off-by: Linus Walleij <linus.walleij@linaro.org> --- drivers/pinctrl/Kconfig | 18 +----------------- drivers/pinctrl/Makefile | 2 +- drivers/pinctrl/starfive/Kconfig | 18 ++++++++++++++++++ drivers/pinctrl/starfive/Makefile | 3 +++ .../pinctrl/{ => starfive}/pinctrl-starfive.c | 8 ++++---- 5 files changed, 27 insertions(+), 22 deletions(-) create mode 100644 drivers/pinctrl/starfive/Kconfig create mode 100644 drivers/pinctrl/starfive/Makefile rename drivers/pinctrl/{ => starfive}/pinctrl-starfive.c (99%) diff --git a/drivers/pinctrl/Kconfig b/drivers/pinctrl/Kconfig index da87f2dc358bc..287420cfc850d 100644 --- a/drivers/pinctrl/Kconfig +++ b/drivers/pinctrl/Kconfig @@ -431,23 +431,6 @@ config PINCTRL_ST select PINCONF select GPIOLIB_IRQCHIP -config PINCTRL_STARFIVE - tristate "Pinctrl and GPIO driver for the StarFive JH7100 SoC" - depends on SOC_STARFIVE || COMPILE_TEST - depends on OF - default SOC_STARFIVE - select GENERIC_PINCTRL_GROUPS - select GENERIC_PINMUX_FUNCTIONS - select GENERIC_PINCONF - select GPIOLIB - select GPIOLIB_IRQCHIP - select OF_GPIO - help - Say yes here to support pin control on the StarFive JH7100 SoC. - This also provides an interface to the GPIO pins not used by other - peripherals supporting inputs, outputs, configuring pull-up/pull-down - and interrupts on input changes. 
- config PINCTRL_STMFX tristate "STMicroelectronics STMFX GPIO expander pinctrl driver" depends on I2C @@ -545,6 +528,7 @@ source "drivers/pinctrl/renesas/Kconfig" source "drivers/pinctrl/samsung/Kconfig" source "drivers/pinctrl/spear/Kconfig" source "drivers/pinctrl/sprd/Kconfig" +source "drivers/pinctrl/starfive/Kconfig" source "drivers/pinctrl/stm32/Kconfig" source "drivers/pinctrl/sunplus/Kconfig" source "drivers/pinctrl/sunxi/Kconfig" diff --git a/drivers/pinctrl/Makefile b/drivers/pinctrl/Makefile index 7188dab7eec88..89bfa01b5231a 100644 --- a/drivers/pinctrl/Makefile +++ b/drivers/pinctrl/Makefile @@ -44,7 +44,6 @@ obj-$(CONFIG_PINCTRL_RK805) += pinctrl-rk805.o obj-$(CONFIG_PINCTRL_ROCKCHIP) += pinctrl-rockchip.o obj-$(CONFIG_PINCTRL_SINGLE) += pinctrl-single.o obj-$(CONFIG_PINCTRL_ST) += pinctrl-st.o -obj-$(CONFIG_PINCTRL_STARFIVE) += pinctrl-starfive.o obj-$(CONFIG_PINCTRL_STMFX) += pinctrl-stmfx.o obj-$(CONFIG_PINCTRL_SX150X) += pinctrl-sx150x.o obj-$(CONFIG_PINCTRL_TB10X) += pinctrl-tb10x.o @@ -71,6 +70,7 @@ obj-$(CONFIG_PINCTRL_RENESAS) += renesas/ obj-$(CONFIG_PINCTRL_SAMSUNG) += samsung/ obj-$(CONFIG_PINCTRL_SPEAR) += spear/ obj-y += sprd/ +obj-$(CONFIG_SOC_STARFIVE) += starfive/ obj-$(CONFIG_PINCTRL_STM32) += stm32/ obj-y += sunplus/ obj-$(CONFIG_PINCTRL_SUNXI) += sunxi/ diff --git a/drivers/pinctrl/starfive/Kconfig b/drivers/pinctrl/starfive/Kconfig new file mode 100644 index 0000000000000..13c3275a57247 --- /dev/null +++ b/drivers/pinctrl/starfive/Kconfig @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config PINCTRL_STARFIVE + tristate "Pinctrl and GPIO driver for the StarFive JH7100 SoC" + depends on SOC_STARFIVE || COMPILE_TEST + depends on OF + select GENERIC_PINCTRL_GROUPS + select GENERIC_PINMUX_FUNCTIONS + select GENERIC_PINCONF + select GPIOLIB + select GPIOLIB_IRQCHIP + select OF_GPIO + default SOC_STARFIVE + help + Say yes here to support pin control on the StarFive JH7100 SoC. 
+ This also provides an interface to the GPIO pins not used by other + peripherals supporting inputs, outputs, configuring pull-up/pull-down + and interrupts on input changes. diff --git a/drivers/pinctrl/starfive/Makefile b/drivers/pinctrl/starfive/Makefile new file mode 100644 index 0000000000000..4c96e2f862922 --- /dev/null +++ b/drivers/pinctrl/starfive/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_PINCTRL_STARFIVE) += pinctrl-starfive.o diff --git a/drivers/pinctrl/pinctrl-starfive.c b/drivers/pinctrl/starfive/pinctrl-starfive.c similarity index 99% rename from drivers/pinctrl/pinctrl-starfive.c rename to drivers/pinctrl/starfive/pinctrl-starfive.c index 3eb40e230d981..74a084740e8c0 100644 --- a/drivers/pinctrl/pinctrl-starfive.c +++ b/drivers/pinctrl/starfive/pinctrl-starfive.c @@ -22,10 +22,10 @@ #include <dt-bindings/pinctrl/pinctrl-starfive.h> -#include "core.h" -#include "pinctrl-utils.h" -#include "pinmux.h" -#include "pinconf.h" +#include "../core.h" +#include "../pinctrl-utils.h" +#include "../pinmux.h" +#include "../pinconf.h" #define DRIVER_NAME "pinctrl-starfive" -- GitLab From ba99b756da178aa8c608c4499a91074466050c10 Mon Sep 17 00:00:00 2001 From: Jianlong Huang <jianlong.huang@starfivetech.com> Date: Fri, 30 Sep 2022 14:14:04 +0800 Subject: [PATCH 1179/2223] pinctrl: starfive: Rename "pinctrl-starfive" to "pinctrl-starfive-jh7100" Add the SoC name to make it more clear. Also the next generation StarFive SoCs will use "pinctrl-starfive" as the core of StarFive pinctrl driver. No functional change. 
Signed-off-by: Jianlong Huang <jianlong.huang@starfivetech.com> Signed-off-by: Hal Feng <hal.feng@linux.starfivetech.com> Reviewed-by: Rob Herring <robh@kernel.org> Link: https://lore.kernel.org/r/20220930061404.5418-1-hal.feng@linux.starfivetech.com Signed-off-by: Linus Walleij <linus.walleij@linaro.org> --- .../bindings/pinctrl/starfive,jh7100-pinctrl.yaml | 2 +- arch/riscv/boot/dts/starfive/jh7100-beaglev-starlight.dts | 2 +- drivers/pinctrl/starfive/Kconfig | 2 +- drivers/pinctrl/starfive/Makefile | 2 +- .../{pinctrl-starfive.c => pinctrl-starfive-jh7100.c} | 2 +- .../{pinctrl-starfive.h => pinctrl-starfive-jh7100.h} | 6 +++--- 6 files changed, 8 insertions(+), 8 deletions(-) rename drivers/pinctrl/starfive/{pinctrl-starfive.c => pinctrl-starfive-jh7100.c} (99%) rename include/dt-bindings/pinctrl/{pinctrl-starfive.h => pinctrl-starfive-jh7100.h} (98%) diff --git a/Documentation/devicetree/bindings/pinctrl/starfive,jh7100-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/starfive,jh7100-pinctrl.yaml index 92963604422f4..a6140dddd39ac 100644 --- a/Documentation/devicetree/bindings/pinctrl/starfive,jh7100-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/starfive,jh7100-pinctrl.yaml @@ -165,7 +165,7 @@ examples: - | #include <dt-bindings/clock/starfive-jh7100.h> #include <dt-bindings/reset/starfive-jh7100.h> - #include <dt-bindings/pinctrl/pinctrl-starfive.h> + #include <dt-bindings/pinctrl/pinctrl-starfive-jh7100.h> soc { #address-cells = <2>; diff --git a/arch/riscv/boot/dts/starfive/jh7100-beaglev-starlight.dts b/arch/riscv/boot/dts/starfive/jh7100-beaglev-starlight.dts index c9af67f7a0d20..f7a2301105128 100644 --- a/arch/riscv/boot/dts/starfive/jh7100-beaglev-starlight.dts +++ b/arch/riscv/boot/dts/starfive/jh7100-beaglev-starlight.dts @@ -8,7 +8,7 @@ #include "jh7100.dtsi" #include <dt-bindings/gpio/gpio.h> #include <dt-bindings/leds/common.h> -#include <dt-bindings/pinctrl/pinctrl-starfive.h> +#include 
<dt-bindings/pinctrl/pinctrl-starfive-jh7100.h> / { model = "BeagleV Starlight Beta"; diff --git a/drivers/pinctrl/starfive/Kconfig b/drivers/pinctrl/starfive/Kconfig index 13c3275a57247..55c514e622f91 100644 --- a/drivers/pinctrl/starfive/Kconfig +++ b/drivers/pinctrl/starfive/Kconfig @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -config PINCTRL_STARFIVE +config PINCTRL_STARFIVE_JH7100 tristate "Pinctrl and GPIO driver for the StarFive JH7100 SoC" depends on SOC_STARFIVE || COMPILE_TEST depends on OF diff --git a/drivers/pinctrl/starfive/Makefile b/drivers/pinctrl/starfive/Makefile index 4c96e2f862922..0293f26a0a993 100644 --- a/drivers/pinctrl/starfive/Makefile +++ b/drivers/pinctrl/starfive/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_PINCTRL_STARFIVE) += pinctrl-starfive.o +obj-$(CONFIG_PINCTRL_STARFIVE_JH7100) += pinctrl-starfive-jh7100.o diff --git a/drivers/pinctrl/starfive/pinctrl-starfive.c b/drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c similarity index 99% rename from drivers/pinctrl/starfive/pinctrl-starfive.c rename to drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c index 74a084740e8c0..5b544fb7f3d88 100644 --- a/drivers/pinctrl/starfive/pinctrl-starfive.c +++ b/drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c @@ -20,7 +20,7 @@ #include <linux/pinctrl/pinctrl.h> #include <linux/pinctrl/pinmux.h> -#include <dt-bindings/pinctrl/pinctrl-starfive.h> +#include <dt-bindings/pinctrl/pinctrl-starfive-jh7100.h> #include "../core.h" #include "../pinctrl-utils.h" diff --git a/include/dt-bindings/pinctrl/pinctrl-starfive.h b/include/dt-bindings/pinctrl/pinctrl-starfive-jh7100.h similarity index 98% rename from include/dt-bindings/pinctrl/pinctrl-starfive.h rename to include/dt-bindings/pinctrl/pinctrl-starfive-jh7100.h index de4f75c2c9e85..a200f546d078a 100644 --- a/include/dt-bindings/pinctrl/pinctrl-starfive.h +++ b/include/dt-bindings/pinctrl/pinctrl-starfive-jh7100.h @@ -3,8 +3,8 @@ * Copyright (C) 2021 Emil 
Renner Berthing <kernel@esmil.dk> */ -#ifndef __DT_BINDINGS_PINCTRL_STARFIVE_H__ -#define __DT_BINDINGS_PINCTRL_STARFIVE_H__ +#ifndef __DT_BINDINGS_PINCTRL_STARFIVE_JH7100_H__ +#define __DT_BINDINGS_PINCTRL_STARFIVE_JH7100_H__ #define PAD_GPIO_OFFSET 0 #define PAD_FUNC_SHARE_OFFSET 64 @@ -272,4 +272,4 @@ #define GPI_NONE 0xff -#endif /* __DT_BINDINGS_PINCTRL_STARFIVE_H__ */ +#endif /* __DT_BINDINGS_PINCTRL_STARFIVE_JH7100_H__ */ -- GitLab From 8012243e62b5e13bded3ce8a3b69d28f8ea694fe Mon Sep 17 00:00:00 2001 From: Raul Silvera <rsilvera@google.com> Date: Mon, 15 Aug 2022 22:59:22 +0000 Subject: [PATCH 1180/2223] perf inject: Add a command line option to specify build ids. This commit adds the option --known-build-ids to perf inject. It allows the user to explicitly specify the build id for a given path, instead of retrieving it from the current system. This is useful in cases where a perf.data file is processed on a different system from where it was collected, or if some of the binaries are no longer available. The build ids and paths are specified in pairs in the command line. Using the file:// specifier, build ids can be loaded from a file directly generated by perf buildid-list. This is convenient to copy build ids from one perf.data file to another. ** Example: In this example we use perf record to create two perf.data files, one with build ids and another without, and use perf buildid-list and perf inject to copy the build ids from the first file to the second. 
$ perf record ls /tmp $ perf record --no-buildid -o perf.data.no-buildid ls /tmp $ perf buildid-list > build-ids.txt $ perf inject -b --known-build-ids='file://build-ids.txt' \ -i perf.data.no-buildid -o perf.data.buildid Signed-off-by: Raul Silvera <rsilvera@google.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220815225922.2118745-1-rsilvera@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/Documentation/perf-inject.txt | 7 +- tools/perf/builtin-inject.c | 85 ++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt index ffc293fdf61df..70e2ac3cc91ab 100644 --- a/tools/perf/Documentation/perf-inject.txt +++ b/tools/perf/Documentation/perf-inject.txt @@ -27,9 +27,14 @@ OPTIONS --build-ids:: Inject build-ids into the output stream ---buildid-all: +--buildid-all:: Inject build-ids of all DSOs into the output stream +--known-build-ids=:: + Override build-ids to inject using these comma-separated pairs of + build-id and path. Understands file://filename to read these pairs + from a file, which can be generated with perf buildid-list. + -v:: --verbose:: Be more verbose. 
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 2a0f992ca0be7..8ec9554024883 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -21,6 +21,7 @@ #include "util/data.h" #include "util/auxtrace.h" #include "util/jit.h" +#include "util/string2.h" #include "util/symbol.h" #include "util/synthetic-events.h" #include "util/thread.h" @@ -38,6 +39,7 @@ #include <linux/string.h> #include <linux/zalloc.h> #include <linux/hash.h> +#include <ctype.h> #include <errno.h> #include <signal.h> #include <inttypes.h> @@ -123,6 +125,7 @@ struct perf_inject { char event_copy[PERF_SAMPLE_MAX_SIZE]; struct perf_file_section secs[HEADER_FEAT_BITS]; struct guest_session guest_session; + struct strlist *known_build_ids; }; struct event_entry { @@ -634,9 +637,73 @@ static int dso__read_build_id(struct dso *dso) return dso->has_build_id ? 0 : -1; } +static struct strlist *perf_inject__parse_known_build_ids( + const char *known_build_ids_string) +{ + struct str_node *pos, *tmp; + struct strlist *known_build_ids; + int bid_len; + + known_build_ids = strlist__new(known_build_ids_string, NULL); + if (known_build_ids == NULL) + return NULL; + strlist__for_each_entry_safe(pos, tmp, known_build_ids) { + const char *build_id, *dso_name; + + build_id = skip_spaces(pos->s); + dso_name = strchr(build_id, ' '); + if (dso_name == NULL) { + strlist__remove(known_build_ids, pos); + continue; + } + bid_len = dso_name - pos->s; + dso_name = skip_spaces(dso_name); + if (bid_len % 2 != 0 || bid_len >= SBUILD_ID_SIZE) { + strlist__remove(known_build_ids, pos); + continue; + } + for (int ix = 0; 2 * ix + 1 < bid_len; ++ix) { + if (!isxdigit(build_id[2 * ix]) || + !isxdigit(build_id[2 * ix + 1])) { + strlist__remove(known_build_ids, pos); + break; + } + } + } + return known_build_ids; +} + +static bool perf_inject__lookup_known_build_id(struct perf_inject *inject, + struct dso *dso) +{ + struct str_node *pos; + int bid_len; + + strlist__for_each_entry(pos, 
inject->known_build_ids) { + const char *build_id, *dso_name; + + build_id = skip_spaces(pos->s); + dso_name = strchr(build_id, ' '); + bid_len = dso_name - pos->s; + dso_name = skip_spaces(dso_name); + if (strcmp(dso->long_name, dso_name)) + continue; + for (int ix = 0; 2 * ix + 1 < bid_len; ++ix) { + dso->bid.data[ix] = (hex(build_id[2 * ix]) << 4 | + hex(build_id[2 * ix + 1])); + } + dso->bid.size = bid_len / 2; + dso->has_build_id = 1; + return true; + } + return false; +} + static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool, struct machine *machine, u8 cpumode, u32 flags) { + struct perf_inject *inject = container_of(tool, struct perf_inject, + tool); int err; if (is_anon_memory(dso->long_name) || flags & MAP_HUGETLB) @@ -644,6 +711,10 @@ static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool, if (is_no_dso_memory(dso->long_name)) return 0; + if (inject->known_build_ids != NULL && + perf_inject__lookup_known_build_id(inject, dso)) + return 1; + if (dso__read_build_id(dso) < 0) { pr_debug("no build_id found for %s\n", dso->long_name); return -1; @@ -2112,12 +2183,16 @@ int cmd_inject(int argc, const char **argv) }; int ret; bool repipe = true; + const char *known_build_ids = NULL; struct option options[] = { OPT_BOOLEAN('b', "build-ids", &inject.build_ids, "Inject build-ids into the output stream"), OPT_BOOLEAN(0, "buildid-all", &inject.build_id_all, "Inject build-ids of all DSOs into the output stream"), + OPT_STRING(0, "known-build-ids", &known_build_ids, + "buildid path [,buildid path...]", + "build-ids to use for given paths"), OPT_STRING('i', "input", &inject.input_name, "file", "input file name"), OPT_STRING('o', "output", &inject.output.path, "file", @@ -2257,6 +2332,15 @@ int cmd_inject(int argc, const char **argv) */ inject.tool.ordered_events = true; inject.tool.ordering_requires_timestamps = true; + if (known_build_ids != NULL) { + inject.known_build_ids = + perf_inject__parse_known_build_ids(known_build_ids); + 
+ if (inject.known_build_ids == NULL) { + pr_err("Couldn't parse known build ids.\n"); + goto out_delete; + } + } } if (inject.sched_stat) { @@ -2285,6 +2369,7 @@ int cmd_inject(int argc, const char **argv) guest_session__exit(&inject.guest_session); out_delete: + strlist__delete(inject.known_build_ids); zstd_fini(&(inject.session->zstd_data)); perf_session__delete(inject.session); out_close_output: -- GitLab From 709533e51b166d5a520589a03f0044ed304b33bd Mon Sep 17 00:00:00 2001 From: Roberto Sassu <roberto.sassu@huawei.com> Date: Thu, 18 Aug 2022 14:09:55 +0200 Subject: [PATCH 1181/2223] tools build: Fix feature detection output due to eval expansion As the first eval expansion is used only to generate Makefile statements, messages should not be displayed at this stage, as for example conditional expressions are not evaluated. It can be seen for example in the output of feature detection for bpftool, where the number of detected features does not change, despite turning on the verbose mode (VF = 1) and there are additional features to display. Fix this issue by escaping the $ before $(info) statements, to ensure that messages are printed only when the function containing them is actually executed, and not when it is expanded. In addition, move the $(info) statement out of feature_print_status, due to the fact that it is called both inside and outside an eval context, and place it to the caller so that the $ can be escaped when necessary. For symmetry, move the $(info) statement also out of feature_print_text, and place it to the caller. Force the TMP variable evaluation in verbose mode, to display the features in FEATURE_TESTS that are not in FEATURE_DISPLAY. Reorder perf feature detection messages (first non-verbose, then verbose ones) by moving the call to feature_display_entries earlier, before the VF environment variable check. Also, remove the newline from that function, as perf might display additional messages.
Move the newline to perf Makefile, and display another one if displaying the detection result is not deferred as in the case of bpftool. Committer testing: Collecting the output from: $ make VF=1 -C tools/bpf/bpftool/ |& grep "Auto-detecting system features" -A20 $ diff -u before after --- before 2022-08-18 09:59:55.460529231 -0300 +++ after 2022-08-18 10:01:11.182517282 -0300 @@ -4,3 +4,5 @@ ... libbfd-liberty-z: [ on ] ... libcap: [ on ] ... clang-bpf-co-re: [ on ] +... disassembler-four-args: [ on ] +... disassembler-init-styled: [ OFF ] $ Fixes: 0afc5cad387db560 ("perf build: Separate feature make support into config/Makefile.feature") Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: bpf@vger.kernel.org Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Link: https://lore.kernel.org/r/20220818120957.319995-1-roberto.sassu@huaweicloud.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/build/Makefile.feature | 19 ++++++++----------- tools/perf/Makefile.config | 15 ++++++++------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index fc6ce0b2535ad..9d3afbc37e15d 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -177,7 +177,7 @@ endif # # Print the result of the feature test: # -feature_print_status = $(eval $(feature_print_status_code)) $(info $(MSG)) +feature_print_status = $(eval $(feature_print_status_code)) define feature_print_status_code ifeq ($(feature-$(1)), 1) @@ -187,7 +187,7 @@ define feature_print_status_code endif endef -feature_print_text = $(eval $(feature_print_text_code)) $(info $(MSG)) +feature_print_text = $(eval 
$(feature_print_text_code)) define feature_print_text_code MSG = $(shell printf '...%30s: %s' $(1) $(2)) endef @@ -247,21 +247,18 @@ endif feature_display_entries = $(eval $(feature_display_entries_code)) define feature_display_entries_code ifeq ($(feature_display),1) - $(info ) - $(info Auto-detecting system features:) - $(foreach feat,$(FEATURE_DISPLAY),$(call feature_print_status,$(feat),)) - ifneq ($(feature_verbose),1) - $(info ) - endif + $$(info ) + $$(info Auto-detecting system features:) + $(foreach feat,$(FEATURE_DISPLAY),$(call feature_print_status,$(feat),) $$(info $(MSG))) endif ifeq ($(feature_verbose),1) - TMP := $(filter-out $(FEATURE_DISPLAY),$(FEATURE_TESTS)) - $(foreach feat,$(TMP),$(call feature_print_status,$(feat),)) - $(info ) + $(eval TMP := $(filter-out $(FEATURE_DISPLAY),$(FEATURE_TESTS))) + $(foreach feat,$(TMP),$(call feature_print_status,$(feat),) $$(info $(MSG))) endif endef ifeq ($(FEATURE_DISPLAY_DEFERRED),) $(call feature_display_entries) + $(info ) endif diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 2171f02daf59d..2b4f703a54e0a 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -1304,11 +1304,15 @@ define print_var_code MSG = $(shell printf '...%30s: %s' $(1) $($(1))) endef +ifeq ($(feature_display),1) + $(call feature_display_entries) +endif + ifeq ($(VF),1) # Display EXTRA features which are detected manualy # from here with feature_check call and thus cannot # be partof global state output. 
- $(foreach feat,$(FEATURE_TESTS_EXTRA),$(call feature_print_status,$(feat),)) + $(foreach feat,$(FEATURE_TESTS_EXTRA),$(call feature_print_status,$(feat),) $(info $(MSG))) $(call print_var,prefix) $(call print_var,bindir) $(call print_var,libdir) @@ -1318,11 +1322,12 @@ ifeq ($(VF),1) $(call print_var,JDIR) ifeq ($(dwarf-post-unwind),1) - $(call feature_print_text,"DWARF post unwind library", $(dwarf-post-unwind-text)) + $(call feature_print_text,"DWARF post unwind library", $(dwarf-post-unwind-text)) $(info $(MSG)) endif - $(info ) endif +$(info ) + $(call detected_var,bindir_SQ) $(call detected_var,PYTHON_WORD) ifneq ($(OUTPUT),) @@ -1352,7 +1357,3 @@ endif # tests. $(shell rm -f $(FEATURE_DUMP_FILENAME)) $(foreach feat,$(FEATURE_TESTS),$(shell echo "$(call feature_assign,$(feat))" >> $(FEATURE_DUMP_FILENAME))) - -ifeq ($(feature_display),1) - $(call feature_display_entries) -endif -- GitLab From 74da7697a2ab988e3889ba4db78992a0944ea83d Mon Sep 17 00:00:00 2001 From: Roberto Sassu <roberto.sassu@huawei.com> Date: Thu, 18 Aug 2022 14:09:56 +0200 Subject: [PATCH 1182/2223] tools build: Increment room for feature name in feature detection output Since now there are features with a long name, increase the room for them, so that fields are correctly aligned. 
Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20220818120957.319995-2-roberto.sassu@huaweicloud.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/build/Makefile.feature | 6 +++--- tools/perf/Makefile.config | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 9d3afbc37e15d..6c809941ff016 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -181,15 +181,15 @@ feature_print_status = $(eval $(feature_print_status_code)) define feature_print_status_code ifeq ($(feature-$(1)), 1) - MSG = $(shell printf '...%30s: [ \033[32mon\033[m ]' $(1)) + MSG = $(shell printf '...%40s: [ \033[32mon\033[m ]' $(1)) else - MSG = $(shell printf '...%30s: [ \033[31mOFF\033[m ]' $(1)) + MSG = $(shell printf '...%40s: [ \033[31mOFF\033[m ]' $(1)) endif endef feature_print_text = $(eval $(feature_print_text_code)) define feature_print_text_code - MSG = $(shell printf '...%30s: %s' $(1) $(2)) + MSG = $(shell printf '...%40s: %s' $(1) $(2)) endef # diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 2b4f703a54e0a..b3b733f4366bc 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -1301,7 +1301,7 @@ endif print_var = $(eval $(print_var_code)) $(info $(MSG)) define print_var_code - MSG = $(shell printf '...%30s: %s' $(1) $($(1))) + MSG = $(shell printf '...%40s: %s' $(1) $($(1))) endef ifeq ($(feature_display),1) -- GitLab From 74ef1cc9587870016f2a528c03634607b9d53093 Mon Sep 17 00:00:00 2001 From: Roberto Sassu 
<roberto.sassu@huawei.com> Date: Thu, 18 Aug 2022 14:09:57 +0200 Subject: [PATCH 1183/2223] tools build: Display logical OR of a feature flavors Sometimes, features are simply different flavors of another feature, to properly detect the exact dependencies needed by different Linux distributions. For example, libbfd has three flavors: libbfd if the distro does not require any additional dependency; libbfd-liberty if it requires libiberty; libbfd-liberty-z if it requires libiberty and libz. It might not be clear to the user whether a feature has been successfully detected or not, given that some of its flavors will be set to OFF, others to ON. Instead, display only the feature main flavor if not in verbose mode (VF != 1), and set it to ON if at least one of its flavors has been successfully detected (logical OR), OFF otherwise. Omit the other flavors. Accomplish that by declaring a FEATURE_GROUP_MEMBERS-<feature main flavor> variable, with the list of the other flavors as variable value. For now, do it just for libbfd. In verbose mode, or if no group is defined for a feature, show the feature detection result as before. Committer testing: Collecting the output from: $ make -C tools/bpf/bpftool/ clean $ make -C tools/bpf/bpftool/ |& grep "Auto-detecting system features" -A10 $ diff -u before after --- before 2022-08-18 10:06:40.422086966 -0300 +++ after 2022-08-18 10:07:59.202138282 -0300 @@ -1,6 +1,4 @@ Auto-detecting system features: ... libbfd: [ on ] -... libbfd-liberty: [ on ] -... libbfd-liberty-z: [ on ] ... libcap: [ on ] ...
clang-bpf-co-re: [ on ] $ Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20220818120957.319995-3-roberto.sassu@huaweicloud.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/build/Makefile.feature | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 6c809941ff016..57619f240b560 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -137,6 +137,12 @@ FEATURE_DISPLAY ?= \ libaio \ libzstd +# +# Declare group members of a feature to display the logical OR of the detection +# result instead of each member result. +# +FEATURE_GROUP_MEMBERS-libbfd = libbfd-liberty libbfd-liberty-z + # Set FEATURE_CHECK_(C|LD)FLAGS-all for all FEATURE_TESTS features. # If in the future we need per-feature checks/flags for features not # mentioned in this list we need to refactor this ;-). 
@@ -179,8 +185,17 @@ endif # feature_print_status = $(eval $(feature_print_status_code)) +feature_group = $(eval $(feature_gen_group)) $(GROUP) + +define feature_gen_group + GROUP := $(1) + ifneq ($(feature_verbose),1) + GROUP += $(FEATURE_GROUP_MEMBERS-$(1)) + endif +endef + define feature_print_status_code - ifeq ($(feature-$(1)), 1) + ifneq (,$(filter 1,$(foreach feat,$(call feature_group,$(feat)),$(feature-$(feat))))) MSG = $(shell printf '...%40s: [ \033[32mon\033[m ]' $(1)) else MSG = $(shell printf '...%40s: [ \033[31mOFF\033[m ]' $(1)) @@ -244,12 +259,20 @@ ifeq ($(VF),1) feature_verbose := 1 endif +ifneq ($(feature_verbose),1) + # + # Determine the features to omit from the displayed message, as only the + # logical OR of the detection result will be shown. + # + FEATURE_OMIT := $(foreach feat,$(FEATURE_DISPLAY),$(FEATURE_GROUP_MEMBERS-$(feat))) +endif + feature_display_entries = $(eval $(feature_display_entries_code)) define feature_display_entries_code ifeq ($(feature_display),1) $$(info ) $$(info Auto-detecting system features:) - $(foreach feat,$(FEATURE_DISPLAY),$(call feature_print_status,$(feat),) $$(info $(MSG))) + $(foreach feat,$(filter-out $(FEATURE_OMIT),$(FEATURE_DISPLAY)),$(call feature_print_status,$(feat),) $$(info $(MSG))) endif ifeq ($(feature_verbose),1) -- GitLab From f1417cea017dff2bbf2836bf67abd8e25e624411 Mon Sep 17 00:00:00 2001 From: Xin Gao <gaoxin@cdjrlc.com> Date: Wed, 17 Aug 2022 01:41:09 +0800 Subject: [PATCH 1184/2223] perf parse-events: Use 'unsigned int' instead of plain 'unsigned'. 'unsigned int' should be clearer than 'unsigned'. 
Signed-off-by: Xin Gao <gaoxin@cdjrlc.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220816174109.7718-1-gaoxin@cdjrlc.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 89655d53117ae..74a2cafb4e8de 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1182,7 +1182,7 @@ static char *pmu_formats_string(struct list_head *formats) struct perf_pmu_format *format; char *str = NULL; struct strbuf buf = STRBUF_INIT; - unsigned i = 0; + unsigned int i = 0; if (!formats) return NULL; -- GitLab From 84f879c5331873a7b4036ff2f319d0f2fcf4179b Mon Sep 17 00:00:00 2001 From: Xin Gao <gaoxin@cdjrlc.com> Date: Wed, 17 Aug 2022 01:38:04 +0800 Subject: [PATCH 1185/2223] perf metrics: Use 'unsigned int' instead of just 'unsigned'. 'unsigned int' should be clearer than 'unsigned'. 
Signed-off-by: Xin Gao <gaoxin@cdjrlc.com> Cc: Ian Rogers <irogers@google.com> Cc: John Garry <john.garry@huawei.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lore.kernel.org/lkml/20220816173804.7539-1-gaoxin@cdjrlc.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/metricgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index c93bcaf6d55d0..18aae040d61db 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -1703,7 +1703,7 @@ int metricgroup__copy_metric_events(struct evlist *evlist, struct cgroup *cgrp, struct rblist *new_metric_events, struct rblist *old_metric_events) { - unsigned i; + unsigned int i; for (i = 0; i < rblist__nr_entries(old_metric_events); i++) { struct rb_node *nd; -- GitLab From bdf4572555652074272d7dd1c694674efe60bea6 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Tue, 23 Aug 2022 22:06:04 -0700 Subject: [PATCH 1186/2223] perf hashmap: Tidy hashmap dependency When libbpf is present the build uses definitions in libbpf hashmap.c, however, libbpf's hashmap.h wasn't being used. Switch to using the correct hashmap.h dependent on the define HAVE_LIBBPF_SUPPORT. 
This was the original intent in: https://lore.kernel.org/lkml/20200515221732.44078-8-irogers@google.com/ Signed-off-by: Ian Rogers <irogers@google.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Link: http://lore.kernel.org/lkml/20220824050604.352156-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/bpf-loader.c | 6 +++++- tools/perf/util/evsel.c | 6 +++++- tools/perf/util/expr.h | 11 ++++------- tools/perf/util/stat.c | 6 +++++- 4 files changed, 19 insertions(+), 10 deletions(-) diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index e2052f4fed33b..d657594894cf6 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -27,7 +27,11 @@ #include "util.h" #include "llvm-utils.h" #include "c++/clang-c.h" -#include "hashmap.h" +#ifdef HAVE_LIBBPF_SUPPORT +#include <bpf/hashmap.h> +#else +#include "util/hashmap.h" +#endif #include "asm/bug.h" #include <internal/xyarray.h> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 18c3eb864d558..e1bc76ece1178 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -46,7 +46,11 @@ #include "string2.h" #include "memswap.h" #include "util.h" -#include "hashmap.h" +#ifdef HAVE_LIBBPF_SUPPORT +#include <bpf/hashmap.h> +#else +#include "util/hashmap.h" +#endif #include "pmu-hybrid.h" #include "off_cpu.h" #include "../perf-sys.h" diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h index bd2116983bbb5..0403a92d9dcc3 100644 --- a/tools/perf/util/expr.h +++ b/tools/perf/util/expr.h @@ -2,14 +2,11 @@ #ifndef PARSE_CTX_H #define PARSE_CTX_H 1 -// There are fixes that need to land upstream before we can use libbpf's headers, -// for now use our copy unconditionally, since the data structures at this 
point -// are exactly the same, no problem. -//#ifdef HAVE_LIBBPF_SUPPORT -//#include <bpf/hashmap.h> -//#else +#ifdef HAVE_LIBBPF_SUPPORT +#include <bpf/hashmap.h> +#else #include "util/hashmap.h" -//#endif +#endif struct metric_ref; diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 0882b4754fcf1..ce5e9e372fc4c 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -14,7 +14,11 @@ #include "evlist.h" #include "evsel.h" #include "thread_map.h" -#include "hashmap.h" +#ifdef HAVE_LIBBPF_SUPPORT +#include <bpf/hashmap.h> +#else +#include "util/hashmap.h" +#endif #include <linux/zalloc.h> void update_stats(struct stats *stats, u64 val) -- GitLab From 6562c9acb43ac69ba5a956b0c3911b883d90541f Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Wed, 24 Aug 2022 10:28:10 +0300 Subject: [PATCH 1187/2223] perf record: Fix way of handling non-perf-event pollfds perf record __cmd_record() does not poll evlist pollfds. Instead it polls thread_data[0].pollfd. That happens whether or not threads are being used. perf record duplicates evlist mmap pollfds as needed for separate threads. The non-perf-event represented by evlist->ctl_fd has to be handled separately, which is done explicitly, duplicating it into the thread_data[0] pollfds. That approach neglects any other non-perf-event file descriptors. Currently there is also done_fd which needs the same handling. Add a new generalized approach. Add fdarray_flag__non_perf_event to identify the file descriptors that need the special handling. For those cases, also keep a mapping of the evlist pollfd index and thread pollfd index, so that the evlist revents can be updated. Although this patch adds the new handling, it does not take it into use. There is no functional change, but it is the precursor to a fix, so is marked as a fix.
Fixes: 415ccb58f68a6beb ("perf record: Introduce thread specific data array") Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Ian Rogers <irogers@google.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20220824072814.16422-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/lib/api/fd/array.h | 5 ++- tools/perf/builtin-record.c | 80 +++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 2 deletions(-) diff --git a/tools/lib/api/fd/array.h b/tools/lib/api/fd/array.h index 60ad197c8ee94..5c01f7b05dfb1 100644 --- a/tools/lib/api/fd/array.h +++ b/tools/lib/api/fd/array.h @@ -31,8 +31,9 @@ struct fdarray { }; enum fdarray_flags { - fdarray_flag__default = 0x00000000, - fdarray_flag__nonfilterable = 0x00000001 + fdarray_flag__default = 0x00000000, + fdarray_flag__nonfilterable = 0x00000001, + fdarray_flag__non_perf_event = 0x00000002, }; void fdarray__init(struct fdarray *fda, int nr_autogrow); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 0f711f88894cf..bf6879a6ffa4a 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -143,6 +143,11 @@ static const char *thread_spec_tags[THREAD_SPEC__MAX] = { "undefined", "cpu", "core", "package", "numa", "user" }; +struct pollfd_index_map { + int evlist_pollfd_index; + int thread_pollfd_index; +}; + struct record { struct perf_tool tool; struct record_opts opts; @@ -171,6 +176,9 @@ struct record { int nr_threads; struct thread_mask *thread_masks; struct record_thread *thread_data; + struct pollfd_index_map *index_map; + size_t index_map_sz; + size_t index_map_cnt; }; static volatile int done; @@ -1074,6 +1082,70 @@ static void record__free_thread_data(struct record *rec) zfree(&rec->thread_data); } +static int 
record__map_thread_evlist_pollfd_indexes(struct record *rec, + int evlist_pollfd_index, + int thread_pollfd_index) +{ + size_t x = rec->index_map_cnt; + + if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL)) + return -ENOMEM; + rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index; + rec->index_map[x].thread_pollfd_index = thread_pollfd_index; + rec->index_map_cnt += 1; + return 0; +} + +static int record__update_evlist_pollfd_from_thread(struct record *rec, + struct evlist *evlist, + struct record_thread *thread_data) +{ + struct pollfd *e_entries = evlist->core.pollfd.entries; + struct pollfd *t_entries = thread_data->pollfd.entries; + int err = 0; + size_t i; + + for (i = 0; i < rec->index_map_cnt; i++) { + int e_pos = rec->index_map[i].evlist_pollfd_index; + int t_pos = rec->index_map[i].thread_pollfd_index; + + if (e_entries[e_pos].fd != t_entries[t_pos].fd || + e_entries[e_pos].events != t_entries[t_pos].events) { + pr_err("Thread and evlist pollfd index mismatch\n"); + err = -EINVAL; + continue; + } + e_entries[e_pos].revents = t_entries[t_pos].revents; + } + return err; +} + +static int record__dup_non_perf_events(struct record *rec, + struct evlist *evlist, + struct record_thread *thread_data) +{ + struct fdarray *fda = &evlist->core.pollfd; + int i, ret; + + for (i = 0; i < fda->nr; i++) { + if (!(fda->priv[i].flags & fdarray_flag__non_perf_event)) + continue; + ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda); + if (ret < 0) { + pr_err("Failed to duplicate descriptor in main thread pollfd\n"); + return ret; + } + pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n", + thread_data, ret, fda->entries[i].fd); + ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret); + if (ret < 0) { + pr_err("Failed to map thread and evlist pollfd indexes\n"); + return ret; + } + } + return 0; +} + static int record__alloc_thread_data(struct record *rec, struct evlist *evlist) { int t, ret; @@ -1121,6 +1193,11 @@ static 
int record__alloc_thread_data(struct record *rec, struct evlist *evlist) thread_data[t].pipes.msg[0]); } else { thread_data[t].tid = gettid(); + + ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]); + if (ret < 0) + goto out_free; + if (evlist->ctl_fd.pos == -1) continue; ret = fdarray__dup_entry_from(&thread_data[t].pollfd, evlist->ctl_fd.pos, @@ -2534,6 +2611,9 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) record__thread_munmap_filtered, NULL) == 0) draining = true; + err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread); + if (err) + goto out_child; evlist__ctlfd_update(rec->evlist, &thread->pollfd.entries[thread->ctlfd_pos]); } -- GitLab From a032ad87aa3bcc6f90cb5771c4ed593844eecc1a Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Wed, 24 Aug 2022 10:28:11 +0300 Subject: [PATCH 1188/2223] perf record: Fix done_fd wakeup event evlist__add_wakeup_eventfd() calls perf_evlist__add_pollfd() to add a non-perf-event to the evlist pollfds. Since commit 415ccb58f68a ("perf record: Introduce thread specific data array") that doesn't work because evlist pollfds is not polled and done_fd is not duplicated into thread-data. Patch "perf record: Fix way of handling non-perf-event pollfds" added a new approach that ensures file descriptors like done_fd are handled correctly by flagging them as fdarray_flag__non_perf_event. Fix by flagging done_fd as fdarray_flag__non_perf_event. Example: Before: $ sleep 3 & perf record -vv -p $! ... thread_data[0x55f44bd34140]: pollfd[0] <- event_fd=5 thread_data[0x55f44bd34140]: pollfd[1] <- event_fd=6 thread_data[0x55f44bd34140]: pollfd[2] <- event_fd=7 thread_data[0x55f44bd34140]: pollfd[3] <- event_fd=8 thread_data[0x55f44bd34140]: pollfd[4] <- event_fd=9 thread_data[0x55f44bd34140]: pollfd[5] <- event_fd=10 thread_data[0x55f44bd34140]: pollfd[6] <- event_fd=11 thread_data[0x55f44bd34140]: pollfd[7] <- event_fd=12 ...
After: $ sleep 3 & perf record -vv -p $! ... thread_data[0x55a8ded89140]: pollfd[0] <- event_fd=5 thread_data[0x55a8ded89140]: pollfd[1] <- event_fd=6 thread_data[0x55a8ded89140]: pollfd[2] <- event_fd=7 thread_data[0x55a8ded89140]: pollfd[3] <- event_fd=8 thread_data[0x55a8ded89140]: pollfd[4] <- event_fd=9 thread_data[0x55a8ded89140]: pollfd[5] <- event_fd=10 thread_data[0x55a8ded89140]: pollfd[6] <- event_fd=11 thread_data[0x55a8ded89140]: pollfd[7] <- event_fd=12 thread_data[0x55a8ded89140]: pollfd[8] <- non_perf_event fd=4 ... This patch depends on "perf record: Fix way of handling non-perf-event pollfds". Fixes: 415ccb58f68a6beb ("perf record: Introduce thread specific data array") Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Ian Rogers <irogers@google.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20220824072814.16422-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/evlist.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 48167f3941a65..0b2222d055771 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -608,7 +608,8 @@ int evlist__filter_pollfd(struct evlist *evlist, short revents_and_mask) int evlist__add_wakeup_eventfd(struct evlist *evlist, int fd) { return perf_evlist__add_pollfd(&evlist->core, fd, NULL, POLLIN, - fdarray_flag__nonfilterable); + fdarray_flag__nonfilterable | + fdarray_flag__non_perf_event); } #endif -- GitLab From feff0b61ffd831dbe4a7f28cfc8064b59c9f90c1 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Wed, 24 Aug 2022 10:28:12 +0300 Subject: [PATCH 1189/2223] perf record: Change evlist->ctl_fd to use fdarray_flag__non_perf_event Patch "perf record: Fix way of handling non-perf-event 
pollfds" added a generic way to handle non-perf-event file descriptors like evlist->ctl_fd. Use it instead of handling evlist->ctl_fd separately. Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Ian Rogers <irogers@google.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20220824072814.16422-4-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-record.c | 15 +-------------- tools/perf/util/evlist.c | 19 ++----------------- tools/perf/util/evlist.h | 1 - 3 files changed, 3 insertions(+), 32 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index bf6879a6ffa4a..f6204b8f8a068 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1198,18 +1198,7 @@ static int record__alloc_thread_data(struct record *rec, struct evlist *evlist) if (ret < 0) goto out_free; - if (evlist->ctl_fd.pos == -1) - continue; - ret = fdarray__dup_entry_from(&thread_data[t].pollfd, evlist->ctl_fd.pos, - &evlist->core.pollfd); - if (ret < 0) { - pr_err("Failed to duplicate descriptor in main thread pollfd\n"); - goto out_free; - } - thread_data[t].ctlfd_pos = ret; - pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n", - thread_data, thread_data[t].ctlfd_pos, - evlist->core.pollfd.entries[evlist->ctl_fd.pos].fd); + thread_data[t].ctlfd_pos = -1; /* Not used */ } } @@ -2614,8 +2603,6 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread); if (err) goto out_child; - evlist__ctlfd_update(rec->evlist, - &thread->pollfd.entries[thread->ctlfd_pos]); } if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) { diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 0b2222d055771..4c5e6e9f8d111 100644 --- 
a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1898,7 +1898,8 @@ int evlist__initialize_ctlfd(struct evlist *evlist, int fd, int ack) } evlist->ctl_fd.pos = perf_evlist__add_pollfd(&evlist->core, fd, NULL, POLLIN, - fdarray_flag__nonfilterable); + fdarray_flag__nonfilterable | + fdarray_flag__non_perf_event); if (evlist->ctl_fd.pos < 0) { evlist->ctl_fd.pos = -1; pr_err("Failed to add ctl fd entry: %m\n"); @@ -2148,22 +2149,6 @@ int evlist__ctlfd_process(struct evlist *evlist, enum evlist_ctl_cmd *cmd) return err; } -int evlist__ctlfd_update(struct evlist *evlist, struct pollfd *update) -{ - int ctlfd_pos = evlist->ctl_fd.pos; - struct pollfd *entries = evlist->core.pollfd.entries; - - if (!evlist__ctlfd_initialized(evlist)) - return 0; - - if (entries[ctlfd_pos].fd != update->fd || - entries[ctlfd_pos].events != update->events) - return -1; - - entries[ctlfd_pos].revents = update->revents; - return 0; -} - struct evsel *evlist__find_evsel(struct evlist *evlist, int idx) { struct evsel *evsel; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 351ba2887a796..3a464585d3970 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -418,7 +418,6 @@ void evlist__close_control(int ctl_fd, int ctl_fd_ack, bool *ctl_fd_close); int evlist__initialize_ctlfd(struct evlist *evlist, int ctl_fd, int ctl_fd_ack); int evlist__finalize_ctlfd(struct evlist *evlist); bool evlist__ctlfd_initialized(struct evlist *evlist); -int evlist__ctlfd_update(struct evlist *evlist, struct pollfd *update); int evlist__ctlfd_process(struct evlist *evlist, enum evlist_ctl_cmd *cmd); int evlist__ctlfd_ack(struct evlist *evlist); -- GitLab From 329725d5f6e139fbdb62a9f45d19fd62822ac3fc Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Wed, 24 Aug 2022 10:28:13 +0300 Subject: [PATCH 1190/2223] perf evlist: Add evlist__{en/dis}able_non_dummy() Dummy events are used to provide sideband information like MMAP events that are always 
needed even when main events are disabled. Add functions that take that into account. Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Ian Rogers <irogers@google.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20220824072814.16422-5-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/evlist.c | 30 ++++++++++++++++++++++++------ tools/perf/util/evlist.h | 2 ++ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 4c5e6e9f8d111..3cfe730c12b85 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -480,7 +480,7 @@ static int evlist__is_enabled(struct evlist *evlist) return false; } -static void __evlist__disable(struct evlist *evlist, char *evsel_name) +static void __evlist__disable(struct evlist *evlist, char *evsel_name, bool excl_dummy) { struct evsel *pos; struct evlist_cpu_iterator evlist_cpu_itr; @@ -502,6 +502,8 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name) continue; if (pos->disabled || !evsel__is_group_leader(pos) || !pos->core.fd) continue; + if (excl_dummy && evsel__is_dummy_event(pos)) + continue; if (pos->immediate) has_imm = true; if (pos->immediate != imm) @@ -518,6 +520,8 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name) continue; if (!evsel__is_group_leader(pos) || !pos->core.fd) continue; + if (excl_dummy && evsel__is_dummy_event(pos)) + continue; pos->disabled = true; } @@ -533,15 +537,20 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name) void evlist__disable(struct evlist *evlist) { - __evlist__disable(evlist, NULL); + __evlist__disable(evlist, NULL, false); +} + +void evlist__disable_non_dummy(struct evlist *evlist) +{ + __evlist__disable(evlist, NULL, true); } void 
evlist__disable_evsel(struct evlist *evlist, char *evsel_name) { - __evlist__disable(evlist, evsel_name); + __evlist__disable(evlist, evsel_name, false); } -static void __evlist__enable(struct evlist *evlist, char *evsel_name) +static void __evlist__enable(struct evlist *evlist, char *evsel_name, bool excl_dummy) { struct evsel *pos; struct evlist_cpu_iterator evlist_cpu_itr; @@ -560,6 +569,8 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name) continue; if (!evsel__is_group_leader(pos) || !pos->core.fd) continue; + if (excl_dummy && evsel__is_dummy_event(pos)) + continue; evsel__enable_cpu(pos, evlist_cpu_itr.cpu_map_idx); } affinity__cleanup(affinity); @@ -568,6 +579,8 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name) continue; if (!evsel__is_group_leader(pos) || !pos->core.fd) continue; + if (excl_dummy && evsel__is_dummy_event(pos)) + continue; pos->disabled = false; } @@ -581,12 +594,17 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name) void evlist__enable(struct evlist *evlist) { - __evlist__enable(evlist, NULL); + __evlist__enable(evlist, NULL, false); +} + +void evlist__enable_non_dummy(struct evlist *evlist) +{ + __evlist__enable(evlist, NULL, true); } void evlist__enable_evsel(struct evlist *evlist, char *evsel_name) { - __evlist__enable(evlist, evsel_name); + __evlist__enable(evlist, evsel_name, false); } void evlist__toggle_enable(struct evlist *evlist) diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 3a464585d3970..3a84744067387 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -205,6 +205,8 @@ void evlist__enable(struct evlist *evlist); void evlist__toggle_enable(struct evlist *evlist); void evlist__disable_evsel(struct evlist *evlist, char *evsel_name); void evlist__enable_evsel(struct evlist *evlist, char *evsel_name); +void evlist__disable_non_dummy(struct evlist *evlist); +void evlist__enable_non_dummy(struct evlist *evlist); void 
evlist__set_selected(struct evlist *evlist, struct evsel *evsel); -- GitLab From 6657a099e1858e4a39b501c38c16c6ef77c71a5a Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Wed, 24 Aug 2022 10:28:14 +0300 Subject: [PATCH 1191/2223] perf record: Allow multiple recording time ranges AUX area traces can produce too much data to record successfully or analyze subsequently. Add another means to reduce data collection by allowing multiple recording time ranges. This is useful, for instance, in cases where a workload produces predictably reproducible events in specific time ranges. Today we only have perf record -D <msecs> to start at a specific region, or some complicated approach using snapshot mode and external scripts sending signals or using the fifos. But these approaches are difficult to set up compared with simply having perf do it. Extend the perf record -D/--delay option to allow specifying relative time stamps for start and stop, controlled by perf with the right time offset, for instance: perf record -e intel_pt// -D 10-20,30-40 to record 10ms to 20ms into the trace and 30ms to 40ms.
Example: The example workload is: $ cat repeat-usleep.c int usleep(useconds_t usec); int usage(int ret, const char *msg) { if (msg) fprintf(stderr, "%s\n", msg); fprintf(stderr, "Usage is: repeat-usleep <microseconds>\n"); return ret; } int main(int argc, char *argv[]) { unsigned long usecs; char *end_ptr; if (argc != 2) return usage(1, "Error: Wrong number of arguments!"); errno = 0; usecs = strtoul(argv[1], &end_ptr, 0); if (errno || *end_ptr || usecs > UINT_MAX) return usage(1, "Error: Invalid argument!"); while (1) { int ret = usleep(usecs); if (ret & errno != EINTR) return usage(1, "Error: usleep() failed!"); } return 0; } $ perf record -e intel_pt//u --delay 10-20,40-70,110-160 -- ./repeat-usleep 500 Events disabled Events enabled Events disabled Events enabled Events disabled Events enabled Events disabled [ perf record: Woken up 5 times to write data ] [ perf record: Captured and wrote 0.204 MB perf.data ] Terminated A dlfilter is used to determine continuous data collection (timestamps less than 1ms apart): $ cat dlfilter-show-delays.c static __u64 start_time; static __u64 last_time; int start(void **data, void *ctx) { printf("%-17s\t%-9s\t%-6s\n", " Time", " Duration", " Delay"); return 0; } int filter_event_early(void *data, const struct perf_dlfilter_sample *sample, void *ctx) { __u64 delta; if (!sample->time) return 1; if (!last_time) goto out; delta = sample->time - last_time; if (delta < 1000000) goto out2;; printf("%17.9f\t%9.1f\t%6.1f\n", start_time / 1000000000.0, (last_time - start_time) / 1000000.0, delta / 1000000.0); out: start_time = sample->time; out2: last_time = sample->time; return 1; } int stop(void *data, void *ctx) { printf("%17.9f\t%9.1f\n", start_time / 1000000000.0, (last_time - start_time) / 1000000.0); return 0; } The result shows the times roughly match the --delay option: $ perf script --itrace=qb --dlfilter dlfilter-show-delays.so Time Duration Delay 39215.302317300 9.7 20.5 39215.332480217 30.4 40.9 39215.403837717 49.8 
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20220824072814.16422-6-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/Documentation/perf-record.txt | 6 +- tools/perf/builtin-record.c | 24 ++- tools/perf/util/evlist.c | 234 +++++++++++++++++++++++ tools/perf/util/evlist.h | 9 + 4 files changed, 269 insertions(+), 4 deletions(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 0228efc96686a..b32a9c2726f90 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -433,8 +433,10 @@ if combined with -a or -C options. -D:: --delay=:: After starting the program, wait msecs before measuring (-1: start with events -disabled). This is useful to filter out the startup phase of the program, which -is often very different. +disabled), or enable events only for specified ranges of msecs (e.g. +-D 10-20,30-40 means wait 10 msecs, enable for 10 msecs, wait 10 msecs, enable +for 10 msecs, then stop). Note, delaying enabling of events is useful to filter +out the startup phase of the program, which is often very different. 
-I:: --intr-regs:: diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index f6204b8f8a068..df83dd436bdba 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -2502,6 +2502,10 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) } } + err = event_enable_timer__start(rec->evlist->eet); + if (err) + goto out_child; + trigger_ready(&auxtrace_snapshot_trigger); trigger_ready(&switch_output_trigger); perf_hooks__invoke_record_start(); @@ -2625,6 +2629,14 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) } } + err = event_enable_timer__process(rec->evlist->eet); + if (err < 0) + goto out_child; + if (err) { + err = 0; + done = 1; + } + /* * When perf is starting the traced process, at the end events * die with the process and we wait for that. Thus no need to @@ -2846,6 +2858,12 @@ static int perf_record_config(const char *var, const char *value, void *cb) return 0; } +static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset) +{ + struct record *rec = (struct record *)opt->value; + + return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset); +} static int record__parse_affinity(const struct option *opt, const char *str, int unset) { @@ -3307,8 +3325,10 @@ static struct option __record_options[] = { OPT_CALLBACK('G', "cgroup", &record.evlist, "name", "monitor event in cgroup name only", parse_cgroups), - OPT_INTEGER('D', "delay", &record.opts.initial_delay, - "ms to wait before starting measurement after program start (-1: start with events disabled)"), + OPT_CALLBACK('D', "delay", &record, "ms", + "ms to wait before starting measurement after program start (-1: start with events disabled), " + "or ranges of time to enable events e.g. 
'-D 10-20,30-40'", + record__parse_event_enable_time), OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"), OPT_STRING('u', "uid", &record.opts.target.uid_str, "user", "user to profile"), diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 3cfe730c12b85..fcfe5bcc0bcff 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -15,6 +15,7 @@ #include "target.h" #include "evlist.h" #include "evsel.h" +#include "record.h" #include "debug.h" #include "units.h" #include "bpf_counter.h" @@ -40,12 +41,14 @@ #include <sys/ioctl.h> #include <sys/mman.h> #include <sys/prctl.h> +#include <sys/timerfd.h> #include <linux/bitops.h> #include <linux/hash.h> #include <linux/log2.h> #include <linux/err.h> #include <linux/string.h> +#include <linux/time64.h> #include <linux/zalloc.h> #include <perf/evlist.h> #include <perf/evsel.h> @@ -147,6 +150,7 @@ static void evlist__purge(struct evlist *evlist) void evlist__exit(struct evlist *evlist) { + event_enable_timer__exit(&evlist->eet); zfree(&evlist->mmap); zfree(&evlist->overwrite_mmap); perf_evlist__exit(&evlist->core); @@ -2167,6 +2171,236 @@ int evlist__ctlfd_process(struct evlist *evlist, enum evlist_ctl_cmd *cmd) return err; } +/** + * struct event_enable_time - perf record -D/--delay single time range. + * @start: start of time range to enable events in milliseconds + * @end: end of time range to enable events in milliseconds + * + * N.B. this structure is also accessed as an array of int. + */ +struct event_enable_time { + int start; + int end; +}; + +static int parse_event_enable_time(const char *str, struct event_enable_time *range, bool first) +{ + const char *fmt = first ? 
"%u - %u %n" : " , %u - %u %n"; + int ret, start, end, n; + + ret = sscanf(str, fmt, &start, &end, &n); + if (ret != 2 || end <= start) + return -EINVAL; + if (range) { + range->start = start; + range->end = end; + } + return n; +} + +static ssize_t parse_event_enable_times(const char *str, struct event_enable_time *range) +{ + int incr = !!range; + bool first = true; + ssize_t ret, cnt; + + for (cnt = 0; *str; cnt++) { + ret = parse_event_enable_time(str, range, first); + if (ret < 0) + return ret; + /* Check no overlap */ + if (!first && range && range->start <= range[-1].end) + return -EINVAL; + str += ret; + range += incr; + first = false; + } + return cnt; +} + +/** + * struct event_enable_timer - control structure for perf record -D/--delay. + * @evlist: event list + * @times: time ranges that events are enabled (N.B. this is also accessed as an + * array of int) + * @times_cnt: number of time ranges + * @timerfd: timer file descriptor + * @pollfd_pos: position in @evlist array of file descriptors to poll (fdarray) + * @times_step: current position in (int *)@times)[], + * refer event_enable_timer__process() + * + * Note, this structure is only used when there are time ranges, not when there + * is only an initial delay. 
+ */ +struct event_enable_timer { + struct evlist *evlist; + struct event_enable_time *times; + size_t times_cnt; + int timerfd; + int pollfd_pos; + size_t times_step; +}; + +static int str_to_delay(const char *str) +{ + char *endptr; + long d; + + d = strtol(str, &endptr, 10); + if (*endptr || d > INT_MAX || d < -1) + return 0; + return d; +} + +int evlist__parse_event_enable_time(struct evlist *evlist, struct record_opts *opts, + const char *str, int unset) +{ + enum fdarray_flags flags = fdarray_flag__nonfilterable | fdarray_flag__non_perf_event; + struct event_enable_timer *eet; + ssize_t times_cnt; + ssize_t ret; + int err; + + if (unset) + return 0; + + opts->initial_delay = str_to_delay(str); + if (opts->initial_delay) + return 0; + + ret = parse_event_enable_times(str, NULL); + if (ret < 0) + return ret; + + times_cnt = ret; + if (times_cnt == 0) + return -EINVAL; + + eet = zalloc(sizeof(*eet)); + if (!eet) + return -ENOMEM; + + eet->times = calloc(times_cnt, sizeof(*eet->times)); + if (!eet->times) { + err = -ENOMEM; + goto free_eet; + } + + if (parse_event_enable_times(str, eet->times) != times_cnt) { + err = -EINVAL; + goto free_eet_times; + } + + eet->times_cnt = times_cnt; + + eet->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC); + if (eet->timerfd == -1) { + err = -errno; + pr_err("timerfd_create failed: %s\n", strerror(errno)); + goto free_eet_times; + } + + eet->pollfd_pos = perf_evlist__add_pollfd(&evlist->core, eet->timerfd, NULL, POLLIN, flags); + if (eet->pollfd_pos < 0) { + err = eet->pollfd_pos; + goto close_timerfd; + } + + eet->evlist = evlist; + evlist->eet = eet; + opts->initial_delay = eet->times[0].start; + + return 0; + +close_timerfd: + close(eet->timerfd); +free_eet_times: + free(eet->times); +free_eet: + free(eet); + return err; +} + +static int event_enable_timer__set_timer(struct event_enable_timer *eet, int ms) +{ + struct itimerspec its = { + .it_value.tv_sec = ms / MSEC_PER_SEC, + .it_value.tv_nsec = (ms % MSEC_PER_SEC) * 
NSEC_PER_MSEC, + }; + int err = 0; + + if (timerfd_settime(eet->timerfd, 0, &its, NULL) < 0) { + err = -errno; + pr_err("timerfd_settime failed: %s\n", strerror(errno)); + } + return err; +} + +int event_enable_timer__start(struct event_enable_timer *eet) +{ + int ms; + + if (!eet) + return 0; + + ms = eet->times[0].end - eet->times[0].start; + eet->times_step = 1; + + return event_enable_timer__set_timer(eet, ms); +} + +int event_enable_timer__process(struct event_enable_timer *eet) +{ + struct pollfd *entries; + short revents; + + if (!eet) + return 0; + + entries = eet->evlist->core.pollfd.entries; + revents = entries[eet->pollfd_pos].revents; + entries[eet->pollfd_pos].revents = 0; + + if (revents & POLLIN) { + size_t step = eet->times_step; + size_t pos = step / 2; + + if (step & 1) { + evlist__disable_non_dummy(eet->evlist); + pr_info(EVLIST_DISABLED_MSG); + if (pos >= eet->times_cnt - 1) { + /* Disarm timer */ + event_enable_timer__set_timer(eet, 0); + return 1; /* Stop */ + } + } else { + evlist__enable_non_dummy(eet->evlist); + pr_info(EVLIST_ENABLED_MSG); + } + + step += 1; + pos = step / 2; + + if (pos < eet->times_cnt) { + int *times = (int *)eet->times; /* Accessing 'times' as array of int */ + int ms = times[step] - times[step - 1]; + + eet->times_step = step; + return event_enable_timer__set_timer(eet, ms); + } + } + + return 0; +} + +void event_enable_timer__exit(struct event_enable_timer **ep) +{ + if (!ep || !*ep) + return; + free((*ep)->times); + zfree(ep); +} + struct evsel *evlist__find_evsel(struct evlist *evlist, int idx) { struct evsel *evsel; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 3a84744067387..9d967fe3953a8 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -48,6 +48,8 @@ enum bkw_mmap_state { BKW_MMAP_EMPTY, }; +struct event_enable_timer; + struct evlist { struct perf_evlist core; bool enabled; @@ -79,6 +81,7 @@ struct evlist { int ack; /* ack file descriptor for control commands */ 
int pos; /* index at evlist core object to check signals */ } ctl_fd; + struct event_enable_timer *eet; }; struct evsel_str_handler { @@ -426,6 +429,12 @@ int evlist__ctlfd_ack(struct evlist *evlist); #define EVLIST_ENABLED_MSG "Events enabled\n" #define EVLIST_DISABLED_MSG "Events disabled\n" +int evlist__parse_event_enable_time(struct evlist *evlist, struct record_opts *opts, + const char *str, int unset); +int event_enable_timer__start(struct event_enable_timer *eet); +void event_enable_timer__exit(struct event_enable_timer **ep); +int event_enable_timer__process(struct event_enable_timer *eet); + struct evsel *evlist__find_evsel(struct evlist *evlist, int idx); int evlist__scnprintf_evsels(struct evlist *evlist, size_t size, char *bf); -- GitLab From e57d897703c3bf8b66680c69c0e75fbd9d9617f1 Mon Sep 17 00:00:00 2001 From: Pavithra Gurushankar <gpavithrasha@gmail.com> Date: Fri, 26 Aug 2022 09:42:25 -0700 Subject: [PATCH 1192/2223] perf mutex: Wrapped usage of mutex and cond MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added a new header file mutex.h that wraps the usage of pthread_mutex_t and pthread_cond_t. By abstracting these it is possible to introduce error checking. 
Signed-off-by: Pavithra Gurushankar <gpavithrasha@gmail.com> Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin <yaowenbin1@huawei.com> Link: 
https://lore.kernel.org/r/20220826164242.43412-2-irogers@google.com Signed-off-by: Ian Rogers <irogers@google.com> Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/Build | 1 + tools/perf/util/mutex.c | 117 ++++++++++++++++++++++++++++++++++++++++ tools/perf/util/mutex.h | 48 +++++++++++++++++ 3 files changed, 166 insertions(+) create mode 100644 tools/perf/util/mutex.c create mode 100644 tools/perf/util/mutex.h diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 485e1a3431652..815d235466d01 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -143,6 +143,7 @@ perf-y += branch.o perf-y += mem2node.o perf-y += clockid.o perf-y += list_sort.o +perf-y += mutex.o perf-$(CONFIG_LIBBPF) += bpf-loader.o perf-$(CONFIG_LIBBPF) += bpf_map.o diff --git a/tools/perf/util/mutex.c b/tools/perf/util/mutex.c new file mode 100644 index 0000000000000..5029237164e5e --- /dev/null +++ b/tools/perf/util/mutex.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "mutex.h" + +#include "debug.h" +#include <linux/string.h> +#include <errno.h> + +static void check_err(const char *fn, int err) +{ + char sbuf[STRERR_BUFSIZE]; + + if (err == 0) + return; + + pr_err("%s error: '%s'\n", fn, str_error_r(err, sbuf, sizeof(sbuf))); +} + +#define CHECK_ERR(err) check_err(__func__, err) + +static void __mutex_init(struct mutex *mtx, bool pshared) +{ + pthread_mutexattr_t attr; + + CHECK_ERR(pthread_mutexattr_init(&attr)); + +#ifndef NDEBUG + /* In normal builds enable error checking, such as recursive usage. 
*/ + CHECK_ERR(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK)); +#endif + if (pshared) + CHECK_ERR(pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)); + + CHECK_ERR(pthread_mutex_init(&mtx->lock, &attr)); + CHECK_ERR(pthread_mutexattr_destroy(&attr)); +} + +void mutex_init(struct mutex *mtx) +{ + __mutex_init(mtx, /*pshared=*/false); +} + +void mutex_init_pshared(struct mutex *mtx) +{ + __mutex_init(mtx, /*pshared=*/true); +} + +void mutex_destroy(struct mutex *mtx) +{ + CHECK_ERR(pthread_mutex_destroy(&mtx->lock)); +} + +void mutex_lock(struct mutex *mtx) +{ + CHECK_ERR(pthread_mutex_lock(&mtx->lock)); +} + +void mutex_unlock(struct mutex *mtx) +{ + CHECK_ERR(pthread_mutex_unlock(&mtx->lock)); +} + +bool mutex_trylock(struct mutex *mtx) +{ + int ret = pthread_mutex_trylock(&mtx->lock); + + if (ret == 0) + return true; /* Lock acquired. */ + + if (ret == EBUSY) + return false; /* Lock busy. */ + + /* Print error. */ + CHECK_ERR(ret); + return false; +} + +static void __cond_init(struct cond *cnd, bool pshared) +{ + pthread_condattr_t attr; + + CHECK_ERR(pthread_condattr_init(&attr)); + if (pshared) + CHECK_ERR(pthread_condattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)); + + CHECK_ERR(pthread_cond_init(&cnd->cond, &attr)); + CHECK_ERR(pthread_condattr_destroy(&attr)); +} + +void cond_init(struct cond *cnd) +{ + __cond_init(cnd, /*pshared=*/false); +} + +void cond_init_pshared(struct cond *cnd) +{ + __cond_init(cnd, /*pshared=*/true); +} + +void cond_destroy(struct cond *cnd) +{ + CHECK_ERR(pthread_cond_destroy(&cnd->cond)); +} + +void cond_wait(struct cond *cnd, struct mutex *mtx) +{ + CHECK_ERR(pthread_cond_wait(&cnd->cond, &mtx->lock)); +} + +void cond_signal(struct cond *cnd) +{ + CHECK_ERR(pthread_cond_signal(&cnd->cond)); +} + +void cond_broadcast(struct cond *cnd) +{ + CHECK_ERR(pthread_cond_broadcast(&cnd->cond)); +} diff --git a/tools/perf/util/mutex.h b/tools/perf/util/mutex.h new file mode 100644 index 0000000000000..cfff32a902d91 --- 
/dev/null +++ b/tools/perf/util/mutex.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_MUTEX_H +#define __PERF_MUTEX_H + +#include <pthread.h> +#include <stdbool.h> + +/* + * A wrapper around the mutex implementation that allows perf to error check + * usage, etc. + */ +struct mutex { + pthread_mutex_t lock; +}; + +/* A wrapper around the condition variable implementation. */ +struct cond { + pthread_cond_t cond; +}; + +/* Default initialize the mtx struct. */ +void mutex_init(struct mutex *mtx); +/* + * Initialize the mtx struct and set the process-shared rather than default + * process-private attribute. + */ +void mutex_init_pshared(struct mutex *mtx); +void mutex_destroy(struct mutex *mtx); + +void mutex_lock(struct mutex *mtx); +void mutex_unlock(struct mutex *mtx); +/* Tries to acquire the lock and returns true on success. */ +bool mutex_trylock(struct mutex *mtx); + +/* Default initialize the cond struct. */ +void cond_init(struct cond *cnd); +/* + * Initialize the cond struct and specify the process-shared rather than default + * process-private attribute. + */ +void cond_init_pshared(struct cond *cnd); +void cond_destroy(struct cond *cnd); + +void cond_wait(struct cond *cnd, struct mutex *mtx); +void cond_signal(struct cond *cnd); +void cond_broadcast(struct cond *cnd); + +#endif /* __PERF_MUTEX_H */ -- GitLab From a64d3af5d9eca3058ab6e0d3715ff36e4d6b5983 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:26 -0700 Subject: [PATCH 1193/2223] perf bench: Update use of pthread mutex/cond MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch to the use of mutex wrappers that provide better error checking. 
Signed-off-by: Ian Rogers <irogers@google.com> Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin 
<yaowenbin1@huawei.com> Link: https://lore.kernel.org/r/20220826164242.43412-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/bench/epoll-ctl.c | 33 ++++----- tools/perf/bench/epoll-wait.c | 33 ++++----- tools/perf/bench/futex-hash.c | 33 ++++----- tools/perf/bench/futex-lock-pi.c | 33 ++++----- tools/perf/bench/futex-requeue.c | 33 ++++----- tools/perf/bench/futex-wake-parallel.c | 33 ++++----- tools/perf/bench/futex-wake.c | 33 ++++----- tools/perf/bench/numa.c | 93 ++++++++++---------------- 8 files changed, 153 insertions(+), 171 deletions(-) diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c index 4256dc5d6236d..521d1ff97b069 100644 --- a/tools/perf/bench/epoll-ctl.c +++ b/tools/perf/bench/epoll-ctl.c @@ -23,6 +23,7 @@ #include <sys/eventfd.h> #include <perf/cpumap.h> +#include "../util/mutex.h" #include "../util/stat.h" #include <subcmd/parse-options.h> #include "bench.h" @@ -58,10 +59,10 @@ static unsigned int nested = 0; /* amount of fds to monitor, per thread */ static unsigned int nfds = 64; -static pthread_mutex_t thread_lock; +static struct mutex thread_lock; static unsigned int threads_starting; static struct stats all_stats[EPOLL_NR_OPS]; -static pthread_cond_t thread_parent, thread_worker; +static struct cond thread_parent, thread_worker; struct worker { int tid; @@ -174,12 +175,12 @@ static void *workerfn(void *arg) struct timespec ts = { .tv_sec = 0, .tv_nsec = 250 }; - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); threads_starting--; if (!threads_starting) - pthread_cond_signal(&thread_parent); - pthread_cond_wait(&thread_worker, &thread_lock); - pthread_mutex_unlock(&thread_lock); + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); /* Let 'em loose */ do { @@ -367,9 +368,9 @@ int bench_epoll_ctl(int argc, const char **argv) for (i = 0; i < EPOLL_NR_OPS; i++) init_stats(&all_stats[i]); - 
pthread_mutex_init(&thread_lock, NULL); - pthread_cond_init(&thread_parent, NULL); - pthread_cond_init(&thread_worker, NULL); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); threads_starting = nthreads; @@ -377,11 +378,11 @@ int bench_epoll_ctl(int argc, const char **argv) do_threads(worker, cpu); - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); while (threads_starting) - pthread_cond_wait(&thread_parent, &thread_lock); - pthread_cond_broadcast(&thread_worker); - pthread_mutex_unlock(&thread_lock); + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); sleep(nsecs); toggle_done(0, NULL, NULL); @@ -394,9 +395,9 @@ int bench_epoll_ctl(int argc, const char **argv) } /* cleanup & report results */ - pthread_cond_destroy(&thread_parent); - pthread_cond_destroy(&thread_worker); - pthread_mutex_destroy(&thread_lock); + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); for (i = 0; i < nthreads; i++) { unsigned long t[EPOLL_NR_OPS]; diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c index 2728b0140853f..c1cdf03c075dc 100644 --- a/tools/perf/bench/epoll-wait.c +++ b/tools/perf/bench/epoll-wait.c @@ -79,6 +79,7 @@ #include <perf/cpumap.h> #include "../util/stat.h" +#include "../util/mutex.h" #include <subcmd/parse-options.h> #include "bench.h" @@ -109,10 +110,10 @@ static bool multiq; /* use an epoll instance per thread */ /* amount of fds to monitor, per thread */ static unsigned int nfds = 64; -static pthread_mutex_t thread_lock; +static struct mutex thread_lock; static unsigned int threads_starting; static struct stats throughput_stats; -static pthread_cond_t thread_parent, thread_worker; +static struct cond thread_parent, thread_worker; struct worker { int tid; @@ -189,12 +190,12 @@ static void *workerfn(void *arg) int to = nonblocking? 0 : -1; int efd = multiq ? 
w->epollfd : epollfd; - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); threads_starting--; if (!threads_starting) - pthread_cond_signal(&thread_parent); - pthread_cond_wait(&thread_worker, &thread_lock); - pthread_mutex_unlock(&thread_lock); + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); do { /* @@ -485,9 +486,9 @@ int bench_epoll_wait(int argc, const char **argv) getpid(), nthreads, oneshot ? " (EPOLLONESHOT semantics)": "", nfds, nsecs); init_stats(&throughput_stats); - pthread_mutex_init(&thread_lock, NULL); - pthread_cond_init(&thread_parent, NULL); - pthread_cond_init(&thread_worker, NULL); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); threads_starting = nthreads; @@ -495,11 +496,11 @@ int bench_epoll_wait(int argc, const char **argv) do_threads(worker, cpu); - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); while (threads_starting) - pthread_cond_wait(&thread_parent, &thread_lock); - pthread_cond_broadcast(&thread_worker); - pthread_mutex_unlock(&thread_lock); + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); /* * At this point the workers should be blocked waiting for read events @@ -522,9 +523,9 @@ int bench_epoll_wait(int argc, const char **argv) err(EXIT_FAILURE, "pthread_join"); /* cleanup & report results */ - pthread_cond_destroy(&thread_parent); - pthread_cond_destroy(&thread_worker); - pthread_mutex_destroy(&thread_lock); + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); /* sort the array back before reporting */ if (randomize) diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c index f05db4cf983d6..2005a3fa30267 100644 --- a/tools/perf/bench/futex-hash.c +++ b/tools/perf/bench/futex-hash.c @@ -23,6 +23,7 @@ #include <sys/mman.h> #include <perf/cpumap.h> +#include "../util/mutex.h" #include 
"../util/stat.h" #include <subcmd/parse-options.h> #include "bench.h" @@ -34,10 +35,10 @@ static bool done = false; static int futex_flag = 0; struct timeval bench__start, bench__end, bench__runtime; -static pthread_mutex_t thread_lock; +static struct mutex thread_lock; static unsigned int threads_starting; static struct stats throughput_stats; -static pthread_cond_t thread_parent, thread_worker; +static struct cond thread_parent, thread_worker; struct worker { int tid; @@ -73,12 +74,12 @@ static void *workerfn(void *arg) unsigned int i; unsigned long ops = w->ops; /* avoid cacheline bouncing */ - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); threads_starting--; if (!threads_starting) - pthread_cond_signal(&thread_parent); - pthread_cond_wait(&thread_worker, &thread_lock); - pthread_mutex_unlock(&thread_lock); + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); do { for (i = 0; i < params.nfutexes; i++, ops++) { @@ -165,9 +166,9 @@ int bench_futex_hash(int argc, const char **argv) getpid(), params.nthreads, params.nfutexes, params.fshared ? 
"shared":"private", params.runtime); init_stats(&throughput_stats); - pthread_mutex_init(&thread_lock, NULL); - pthread_cond_init(&thread_parent, NULL); - pthread_cond_init(&thread_worker, NULL); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); threads_starting = params.nthreads; pthread_attr_init(&thread_attr); @@ -203,11 +204,11 @@ int bench_futex_hash(int argc, const char **argv) CPU_FREE(cpuset); pthread_attr_destroy(&thread_attr); - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); while (threads_starting) - pthread_cond_wait(&thread_parent, &thread_lock); - pthread_cond_broadcast(&thread_worker); - pthread_mutex_unlock(&thread_lock); + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); sleep(params.runtime); toggle_done(0, NULL, NULL); @@ -219,9 +220,9 @@ int bench_futex_hash(int argc, const char **argv) } /* cleanup & report results */ - pthread_cond_destroy(&thread_parent); - pthread_cond_destroy(&thread_worker); - pthread_mutex_destroy(&thread_lock); + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); for (i = 0; i < params.nthreads; i++) { unsigned long t = bench__runtime.tv_sec > 0 ? 
diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c index 0abb3f7ee24f7..2d04179497270 100644 --- a/tools/perf/bench/futex-lock-pi.c +++ b/tools/perf/bench/futex-lock-pi.c @@ -8,6 +8,7 @@ #include <pthread.h> #include <signal.h> +#include "../util/mutex.h" #include "../util/stat.h" #include <subcmd/parse-options.h> #include <linux/compiler.h> @@ -34,10 +35,10 @@ static u_int32_t global_futex = 0; static struct worker *worker; static bool done = false; static int futex_flag = 0; -static pthread_mutex_t thread_lock; +static struct mutex thread_lock; static unsigned int threads_starting; static struct stats throughput_stats; -static pthread_cond_t thread_parent, thread_worker; +static struct cond thread_parent, thread_worker; static struct bench_futex_parameters params = { .runtime = 10, @@ -83,12 +84,12 @@ static void *workerfn(void *arg) struct worker *w = (struct worker *) arg; unsigned long ops = w->ops; - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); threads_starting--; if (!threads_starting) - pthread_cond_signal(&thread_parent); - pthread_cond_wait(&thread_worker, &thread_lock); - pthread_mutex_unlock(&thread_lock); + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); do { int ret; @@ -197,9 +198,9 @@ int bench_futex_lock_pi(int argc, const char **argv) getpid(), params.nthreads, params.runtime); init_stats(&throughput_stats); - pthread_mutex_init(&thread_lock, NULL); - pthread_cond_init(&thread_parent, NULL); - pthread_cond_init(&thread_worker, NULL); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); threads_starting = params.nthreads; pthread_attr_init(&thread_attr); @@ -208,11 +209,11 @@ int bench_futex_lock_pi(int argc, const char **argv) create_threads(worker, thread_attr, cpu); pthread_attr_destroy(&thread_attr); - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); while (threads_starting) - 
pthread_cond_wait(&thread_parent, &thread_lock); - pthread_cond_broadcast(&thread_worker); - pthread_mutex_unlock(&thread_lock); + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); sleep(params.runtime); toggle_done(0, NULL, NULL); @@ -224,9 +225,9 @@ int bench_futex_lock_pi(int argc, const char **argv) } /* cleanup & report results */ - pthread_cond_destroy(&thread_parent); - pthread_cond_destroy(&thread_worker); - pthread_mutex_destroy(&thread_lock); + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); for (i = 0; i < params.nthreads; i++) { unsigned long t = bench__runtime.tv_sec > 0 ? diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c index b6faabfafb8ee..69ad896f556c9 100644 --- a/tools/perf/bench/futex-requeue.c +++ b/tools/perf/bench/futex-requeue.c @@ -15,6 +15,7 @@ #include <pthread.h> #include <signal.h> +#include "../util/mutex.h" #include "../util/stat.h" #include <subcmd/parse-options.h> #include <linux/compiler.h> @@ -34,8 +35,8 @@ static u_int32_t futex1 = 0, futex2 = 0; static pthread_t *worker; static bool done = false; -static pthread_mutex_t thread_lock; -static pthread_cond_t thread_parent, thread_worker; +static struct mutex thread_lock; +static struct cond thread_parent, thread_worker; static struct stats requeuetime_stats, requeued_stats; static unsigned int threads_starting; static int futex_flag = 0; @@ -82,12 +83,12 @@ static void *workerfn(void *arg __maybe_unused) { int ret; - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); threads_starting--; if (!threads_starting) - pthread_cond_signal(&thread_parent); - pthread_cond_wait(&thread_worker, &thread_lock); - pthread_mutex_unlock(&thread_lock); + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); while (1) { if (!params.pi) { @@ -209,9 +210,9 @@ int bench_futex_requeue(int argc, const char **argv) 
init_stats(&requeued_stats); init_stats(&requeuetime_stats); pthread_attr_init(&thread_attr); - pthread_mutex_init(&thread_lock, NULL); - pthread_cond_init(&thread_parent, NULL); - pthread_cond_init(&thread_worker, NULL); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); for (j = 0; j < bench_repeat && !done; j++) { unsigned int nrequeued = 0, wakeups = 0; @@ -221,11 +222,11 @@ int bench_futex_requeue(int argc, const char **argv) block_threads(worker, thread_attr, cpu); /* make sure all threads are already blocked */ - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); while (threads_starting) - pthread_cond_wait(&thread_parent, &thread_lock); - pthread_cond_broadcast(&thread_worker); - pthread_mutex_unlock(&thread_lock); + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); usleep(100000); @@ -297,9 +298,9 @@ int bench_futex_requeue(int argc, const char **argv) } /* cleanup & report results */ - pthread_cond_destroy(&thread_parent); - pthread_cond_destroy(&thread_worker); - pthread_mutex_destroy(&thread_lock); + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); pthread_attr_destroy(&thread_attr); print_summary(); diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c index e47f46a3a47e9..6682e49d0ee03 100644 --- a/tools/perf/bench/futex-wake-parallel.c +++ b/tools/perf/bench/futex-wake-parallel.c @@ -10,6 +10,7 @@ #include "bench.h" #include <linux/compiler.h> #include "../util/debug.h" +#include "../util/mutex.h" #ifndef HAVE_PTHREAD_BARRIER int bench_futex_wake_parallel(int argc __maybe_unused, const char **argv __maybe_unused) @@ -49,8 +50,8 @@ static u_int32_t futex = 0; static pthread_t *blocked_worker; static bool done = false; -static pthread_mutex_t thread_lock; -static pthread_cond_t thread_parent, thread_worker; +static struct mutex thread_lock; +static struct cond 
thread_parent, thread_worker; static pthread_barrier_t barrier; static struct stats waketime_stats, wakeup_stats; static unsigned int threads_starting; @@ -125,12 +126,12 @@ static void wakeup_threads(struct thread_data *td, pthread_attr_t thread_attr) static void *blocked_workerfn(void *arg __maybe_unused) { - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); threads_starting--; if (!threads_starting) - pthread_cond_signal(&thread_parent); - pthread_cond_wait(&thread_worker, &thread_lock); - pthread_mutex_unlock(&thread_lock); + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); while (1) { /* handle spurious wakeups */ if (futex_wait(&futex, 0, NULL, futex_flag) != EINTR) @@ -294,9 +295,9 @@ int bench_futex_wake_parallel(int argc, const char **argv) init_stats(&waketime_stats); pthread_attr_init(&thread_attr); - pthread_mutex_init(&thread_lock, NULL); - pthread_cond_init(&thread_parent, NULL); - pthread_cond_init(&thread_worker, NULL); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); for (j = 0; j < bench_repeat && !done; j++) { waking_worker = calloc(params.nwakes, sizeof(*waking_worker)); @@ -307,11 +308,11 @@ int bench_futex_wake_parallel(int argc, const char **argv) block_threads(blocked_worker, thread_attr, cpu); /* make sure all threads are already blocked */ - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); while (threads_starting) - pthread_cond_wait(&thread_parent, &thread_lock); - pthread_cond_broadcast(&thread_worker); - pthread_mutex_unlock(&thread_lock); + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); usleep(100000); @@ -332,9 +333,9 @@ int bench_futex_wake_parallel(int argc, const char **argv) } /* cleanup & report results */ - pthread_cond_destroy(&thread_parent); - pthread_cond_destroy(&thread_worker); - pthread_mutex_destroy(&thread_lock); + cond_destroy(&thread_parent); + 
cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); pthread_attr_destroy(&thread_attr); print_summary(); diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c index 201a3555f09a2..9ecab6620a875 100644 --- a/tools/perf/bench/futex-wake.c +++ b/tools/perf/bench/futex-wake.c @@ -14,6 +14,7 @@ #include <pthread.h> #include <signal.h> +#include "../util/mutex.h" #include "../util/stat.h" #include <subcmd/parse-options.h> #include <linux/compiler.h> @@ -34,8 +35,8 @@ static u_int32_t futex1 = 0; static pthread_t *worker; static bool done = false; -static pthread_mutex_t thread_lock; -static pthread_cond_t thread_parent, thread_worker; +static struct mutex thread_lock; +static struct cond thread_parent, thread_worker; static struct stats waketime_stats, wakeup_stats; static unsigned int threads_starting; static int futex_flag = 0; @@ -65,12 +66,12 @@ static const char * const bench_futex_wake_usage[] = { static void *workerfn(void *arg __maybe_unused) { - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); threads_starting--; if (!threads_starting) - pthread_cond_signal(&thread_parent); - pthread_cond_wait(&thread_worker, &thread_lock); - pthread_mutex_unlock(&thread_lock); + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); while (1) { if (futex_wait(&futex1, 0, NULL, futex_flag) != EINTR) @@ -178,9 +179,9 @@ int bench_futex_wake(int argc, const char **argv) init_stats(&wakeup_stats); init_stats(&waketime_stats); pthread_attr_init(&thread_attr); - pthread_mutex_init(&thread_lock, NULL); - pthread_cond_init(&thread_parent, NULL); - pthread_cond_init(&thread_worker, NULL); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); for (j = 0; j < bench_repeat && !done; j++) { unsigned int nwoken = 0; @@ -190,11 +191,11 @@ int bench_futex_wake(int argc, const char **argv) block_threads(worker, thread_attr, cpu); /* make sure all threads are already blocked 
*/ - pthread_mutex_lock(&thread_lock); + mutex_lock(&thread_lock); while (threads_starting) - pthread_cond_wait(&thread_parent, &thread_lock); - pthread_cond_broadcast(&thread_worker); - pthread_mutex_unlock(&thread_lock); + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); usleep(100000); @@ -224,9 +225,9 @@ int bench_futex_wake(int argc, const char **argv) } /* cleanup & report results */ - pthread_cond_destroy(&thread_parent); - pthread_cond_destroy(&thread_worker); - pthread_mutex_destroy(&thread_lock); + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); pthread_attr_destroy(&thread_attr); print_summary(); diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index 20eed1e53f809..e78dedf9e682c 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c @@ -6,8 +6,6 @@ */ #include <inttypes.h> -/* For the CLR_() macros */ -#include <pthread.h> #include <subcmd/parse-options.h> #include "../util/cloexec.h" @@ -35,6 +33,7 @@ #include <linux/zalloc.h> #include "../util/header.h" +#include "../util/mutex.h" #include <numa.h> #include <numaif.h> @@ -67,7 +66,7 @@ struct thread_data { u64 system_time_ns; u64 user_time_ns; double speed_gbs; - pthread_mutex_t *process_lock; + struct mutex *process_lock; }; /* Parameters set by options: */ @@ -137,16 +136,16 @@ struct params { struct global_info { u8 *data; - pthread_mutex_t startup_mutex; - pthread_cond_t startup_cond; + struct mutex startup_mutex; + struct cond startup_cond; int nr_tasks_started; - pthread_mutex_t start_work_mutex; - pthread_cond_t start_work_cond; + struct mutex start_work_mutex; + struct cond start_work_cond; int nr_tasks_working; bool start_work; - pthread_mutex_t stop_work_mutex; + struct mutex stop_work_mutex; u64 bytes_done; struct thread_data *threads; @@ -524,30 +523,6 @@ static void * setup_private_data(ssize_t bytes) return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0, 
g->p.thp, g->p.init_random); } -/* - * Return a process-shared (global) mutex: - */ -static void init_global_mutex(pthread_mutex_t *mutex) -{ - pthread_mutexattr_t attr; - - pthread_mutexattr_init(&attr); - pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); - pthread_mutex_init(mutex, &attr); -} - -/* - * Return a process-shared (global) condition variable: - */ -static void init_global_cond(pthread_cond_t *cond) -{ - pthread_condattr_t attr; - - pthread_condattr_init(&attr); - pthread_condattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); - pthread_cond_init(cond, &attr); -} - static int parse_cpu_list(const char *arg) { p0.cpu_list_str = strdup(arg); @@ -1220,22 +1195,22 @@ static void *worker_thread(void *__tdata) } if (g->p.serialize_startup) { - pthread_mutex_lock(&g->startup_mutex); + mutex_lock(&g->startup_mutex); g->nr_tasks_started++; /* The last thread wakes the main process. */ if (g->nr_tasks_started == g->p.nr_tasks) - pthread_cond_signal(&g->startup_cond); + cond_signal(&g->startup_cond); - pthread_mutex_unlock(&g->startup_mutex); + mutex_unlock(&g->startup_mutex); /* Here we will wait for the main process to start us all at once: */ - pthread_mutex_lock(&g->start_work_mutex); + mutex_lock(&g->start_work_mutex); g->start_work = false; g->nr_tasks_working++; while (!g->start_work) - pthread_cond_wait(&g->start_work_cond, &g->start_work_mutex); + cond_wait(&g->start_work_cond, &g->start_work_mutex); - pthread_mutex_unlock(&g->start_work_mutex); + mutex_unlock(&g->start_work_mutex); } gettimeofday(&start0, NULL); @@ -1254,17 +1229,17 @@ static void *worker_thread(void *__tdata) val += do_work(thread_data, g->p.bytes_thread, 0, 1, l, val); if (g->p.sleep_usecs) { - pthread_mutex_lock(td->process_lock); + mutex_lock(td->process_lock); usleep(g->p.sleep_usecs); - pthread_mutex_unlock(td->process_lock); + mutex_unlock(td->process_lock); } /* * Amount of work to be done under a process-global lock: */ if (g->p.bytes_process_locked) { - 
pthread_mutex_lock(td->process_lock); + mutex_lock(td->process_lock); val += do_work(process_data, g->p.bytes_process_locked, thread_nr, g->p.nr_threads, l, val); - pthread_mutex_unlock(td->process_lock); + mutex_unlock(td->process_lock); } work_done = g->p.bytes_global + g->p.bytes_process + @@ -1361,9 +1336,9 @@ static void *worker_thread(void *__tdata) free_data(thread_data, g->p.bytes_thread); - pthread_mutex_lock(&g->stop_work_mutex); + mutex_lock(&g->stop_work_mutex); g->bytes_done += bytes_done; - pthread_mutex_unlock(&g->stop_work_mutex); + mutex_unlock(&g->stop_work_mutex); return NULL; } @@ -1373,7 +1348,7 @@ static void *worker_thread(void *__tdata) */ static void worker_process(int process_nr) { - pthread_mutex_t process_lock; + struct mutex process_lock; struct thread_data *td; pthread_t *pthreads; u8 *process_data; @@ -1381,7 +1356,7 @@ static void worker_process(int process_nr) int ret; int t; - pthread_mutex_init(&process_lock, NULL); + mutex_init(&process_lock); set_taskname("process %d", process_nr); /* @@ -1540,11 +1515,11 @@ static int init(void) g->data = setup_shared_data(g->p.bytes_global); /* Startup serialization: */ - init_global_mutex(&g->start_work_mutex); - init_global_cond(&g->start_work_cond); - init_global_mutex(&g->startup_mutex); - init_global_cond(&g->startup_cond); - init_global_mutex(&g->stop_work_mutex); + mutex_init_pshared(&g->start_work_mutex); + cond_init_pshared(&g->start_work_cond); + mutex_init_pshared(&g->startup_mutex); + cond_init_pshared(&g->startup_cond); + mutex_init_pshared(&g->stop_work_mutex); init_thread_data(); @@ -1633,17 +1608,17 @@ static int __bench_numa(const char *name) * Wait for all the threads to start up. The last thread will * signal this process. 
*/ - pthread_mutex_lock(&g->startup_mutex); + mutex_lock(&g->startup_mutex); while (g->nr_tasks_started != g->p.nr_tasks) - pthread_cond_wait(&g->startup_cond, &g->startup_mutex); + cond_wait(&g->startup_cond, &g->startup_mutex); - pthread_mutex_unlock(&g->startup_mutex); + mutex_unlock(&g->startup_mutex); /* Wait for all threads to be at the start_work_cond. */ while (!threads_ready) { - pthread_mutex_lock(&g->start_work_mutex); + mutex_lock(&g->start_work_mutex); threads_ready = (g->nr_tasks_working == g->p.nr_tasks); - pthread_mutex_unlock(&g->start_work_mutex); + mutex_unlock(&g->start_work_mutex); if (!threads_ready) usleep(1); } @@ -1661,10 +1636,10 @@ static int __bench_numa(const char *name) start = stop; /* Start all threads running. */ - pthread_mutex_lock(&g->start_work_mutex); + mutex_lock(&g->start_work_mutex); g->start_work = true; - pthread_mutex_unlock(&g->start_work_mutex); - pthread_cond_broadcast(&g->start_work_cond); + mutex_unlock(&g->start_work_mutex); + cond_broadcast(&g->start_work_cond); } else { gettimeofday(&start, NULL); } -- GitLab From 130f267af6e3e607f9101f2ec1d24d855cd3fb04 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:27 -0700 Subject: [PATCH 1194/2223] perf tests: Avoid pthread.h inclusion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pthread.h is being included for the side-effect of getting sched.h and macros like CPU_CLR. Switch to directly using sched.h, or if that is already present, just remove the pthread.h inclusion entirely. 
Signed-off-by: Ian Rogers <irogers@google.com> Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin 
<yaowenbin1@huawei.com> Link: https://lore.kernel.org/r/20220826164242.43412-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/mmap-basic.c | 2 -- tools/perf/tests/openat-syscall-all-cpus.c | 2 +- tools/perf/tests/perf-record.c | 2 -- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index 9e9a2b67de199..8322fc2295fa0 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -1,8 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <errno.h> #include <inttypes.h> -/* For the CLR_() macros */ -#include <pthread.h> #include <stdlib.h> #include <perf/cpumap.h> diff --git a/tools/perf/tests/openat-syscall-all-cpus.c b/tools/perf/tests/openat-syscall-all-cpus.c index 90828ae03ef51..f3275be83a338 100644 --- a/tools/perf/tests/openat-syscall-all-cpus.c +++ b/tools/perf/tests/openat-syscall-all-cpus.c @@ -2,7 +2,7 @@ #include <errno.h> #include <inttypes.h> /* For the CPU_* macros */ -#include <pthread.h> +#include <sched.h> #include <sys/types.h> #include <sys/stat.h> diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 4952abe716f31..7aa946aa886de 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -2,8 +2,6 @@ #include <errno.h> #include <inttypes.h> #include <linux/string.h> -/* For the CLR_() macros */ -#include <pthread.h> #include <sched.h> #include <perf/mmap.h> -- GitLab From 8e03bb88ab8b60b52baafc4f909bddc1c2323cb5 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:28 -0700 Subject: [PATCH 1195/2223] perf hist: Update use of pthread mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch to the use of mutex wrappers that provide better error checking. 
Signed-off-by: Ian Rogers <irogers@google.com> Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin 
<yaowenbin1@huawei.com> Link: https://lore.kernel.org/r/20220826164242.43412-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-top.c | 8 ++++---- tools/perf/util/hist.c | 6 +++--- tools/perf/util/hist.h | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index fd8fd913c533c..14e60f6f219c0 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -220,7 +220,7 @@ static void perf_top__record_precise_ip(struct perf_top *top, * This function is now called with he->hists->lock held. * Release it before going to sleep. */ - pthread_mutex_unlock(&he->hists->lock); + mutex_unlock(&he->hists->lock); if (err == -ERANGE && !he->ms.map->erange_warned) ui__warn_map_erange(he->ms.map, sym, ip); @@ -230,7 +230,7 @@ static void perf_top__record_precise_ip(struct perf_top *top, sleep(1); } - pthread_mutex_lock(&he->hists->lock); + mutex_lock(&he->hists->lock); } } @@ -836,12 +836,12 @@ static void perf_event__process_sample(struct perf_tool *tool, else iter.ops = &hist_iter_normal; - pthread_mutex_lock(&hists->lock); + mutex_lock(&hists->lock); if (hist_entry_iter__add(&iter, &al, top->max_stack, top) < 0) pr_err("Problem incrementing symbol period, skipping event\n"); - pthread_mutex_unlock(&hists->lock); + mutex_unlock(&hists->lock); } addr_location__put(&al); diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 1c085ab565340..698add038cecd 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1622,13 +1622,13 @@ struct rb_root_cached *hists__get_rotate_entries_in(struct hists *hists) { struct rb_root_cached *root; - pthread_mutex_lock(&hists->lock); + mutex_lock(&hists->lock); root = hists->entries_in; if (++hists->entries_in > &hists->entries_in_array[1]) hists->entries_in = &hists->entries_in_array[0]; - pthread_mutex_unlock(&hists->lock); + mutex_unlock(&hists->lock); return root; } @@ -2805,7 +2805,7 @@ int 
__hists__init(struct hists *hists, struct perf_hpp_list *hpp_list) hists->entries_in = &hists->entries_in_array[0]; hists->entries_collapsed = RB_ROOT_CACHED; hists->entries = RB_ROOT_CACHED; - pthread_mutex_init(&hists->lock, NULL); + mutex_init(&hists->lock); hists->socket_filter = -1; hists->hpp_list = hpp_list; INIT_LIST_HEAD(&hists->hpp_formats); diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 7ed4648d2fc2f..508428b2c1b2a 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -4,10 +4,10 @@ #include <linux/rbtree.h> #include <linux/types.h> -#include <pthread.h> #include "evsel.h" #include "color.h" #include "events_stats.h" +#include "mutex.h" struct hist_entry; struct hist_entry_ops; @@ -98,7 +98,7 @@ struct hists { const struct dso *dso_filter; const char *uid_filter_str; const char *symbol_filter_str; - pthread_mutex_t lock; + struct mutex lock; struct hists_stats stats; u64 event_stream; u16 col_len[HISTC_NR_COLS]; -- GitLab From ed0546b7b8376c2bc137a46babfffd14ab060c10 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:29 -0700 Subject: [PATCH 1196/2223] perf bpf: Remove unused pthread.h include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No pthread usage in bpf-event.h. 
Signed-off-by: Ian Rogers <irogers@google.com> Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin 
<yaowenbin1@huawei.com> Link: https://lore.kernel.org/r/20220826164242.43412-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/bpf-event.h | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/util/bpf-event.h b/tools/perf/util/bpf-event.h index 144a8a24cc695..1bcbd4fb6c669 100644 --- a/tools/perf/util/bpf-event.h +++ b/tools/perf/util/bpf-event.h @@ -4,7 +4,6 @@ #include <linux/compiler.h> #include <linux/rbtree.h> -#include <pthread.h> #include <api/fd/array.h> #include <stdio.h> -- GitLab From 6f37dc6ed0f45c476b88c0b25f829749a812238a Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:30 -0700 Subject: [PATCH 1197/2223] perf lock: Remove unused pthread.h include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No pthread usage in builtin-lock.c. Signed-off-by: Ian Rogers <irogers@google.com> Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan 
<leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin <yaowenbin1@huawei.com> Link: https://lore.kernel.org/r/20220826164242.43412-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-lock.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index ea40ae52cd2c7..e79ef614105c8 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -28,7 +28,6 @@ #include <sys/types.h> #include <sys/prctl.h> #include <semaphore.h> -#include <pthread.h> #include <math.h> #include <limits.h> -- GitLab From 49c670b17e555bb9fab4308bb4dd9eadf29872fb Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:31 -0700 Subject: [PATCH 1198/2223] perf record: Update use of pthread mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch to the use of mutex wrappers that provide better error checking for synth_lock. 
Signed-off-by: Ian Rogers <irogers@google.com> Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin 
<yaowenbin1@huawei.com> Link: https://lore.kernel.org/r/20220826164242.43412-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-record.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index df83dd436bdba..a91ead72fd413 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -21,6 +21,7 @@ #include "util/evsel.h" #include "util/debug.h" #include "util/mmap.h" +#include "util/mutex.h" #include "util/target.h" #include "util/session.h" #include "util/tool.h" @@ -616,17 +617,18 @@ static int process_synthesized_event(struct perf_tool *tool, return record__write(rec, NULL, event, event->header.size); } +static struct mutex synth_lock; + static int process_locked_synthesized_event(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) { - static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER; int ret; - pthread_mutex_lock(&synth_lock); + mutex_lock(&synth_lock); ret = process_synthesized_event(tool, event, sample, machine); - pthread_mutex_unlock(&synth_lock); + mutex_unlock(&synth_lock); return ret; } @@ -1987,6 +1989,7 @@ static int record__synthesize(struct record *rec, bool tail) } if (rec->opts.nr_threads_synthesize > 1) { + mutex_init(&synth_lock); perf_set_multithreaded(); f = process_locked_synthesized_event; } @@ -2000,8 +2003,10 @@ static int record__synthesize(struct record *rec, bool tail) rec->opts.nr_threads_synthesize); } - if (rec->opts.nr_threads_synthesize > 1) + if (rec->opts.nr_threads_synthesize > 1) { perf_set_singlethreaded(); + mutex_destroy(&synth_lock); + } out: return err; -- GitLab From 0bd14ac2d6aab7339c4b410543d978cd254b24f9 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:32 -0700 Subject: [PATCH 1199/2223] perf sched: Update use of pthread mutex 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch to the use of mutex wrappers that provide better error checking. Update cmd_sched so that we always explicitly destroy the mutexes. Signed-off-by: Ian Rogers <irogers@google.com> Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: 
Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin <yaowenbin1@huawei.com> Link: https://lore.kernel.org/r/20220826164242.43412-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-sched.c | 67 ++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index a5cf243c337f1..46e3b96457b84 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -7,6 +7,7 @@ #include "util/evlist.h" #include "util/evsel.h" #include "util/evsel_fprintf.h" +#include "util/mutex.h" #include "util/symbol.h" #include "util/thread.h" #include "util/header.h" @@ -184,8 +185,8 @@ struct perf_sched { struct task_desc **pid_to_task; struct task_desc **tasks; const struct trace_sched_handler *tp_handler; - pthread_mutex_t start_work_mutex; - pthread_mutex_t work_done_wait_mutex; + struct mutex start_work_mutex; + struct mutex work_done_wait_mutex; int profile_cpu; /* * Track the current task - that way we can know whether there's any @@ -635,10 +636,8 @@ static void *thread_func(void *ctx) again: ret = sem_post(&this_task->ready_for_work); BUG_ON(ret); - ret = pthread_mutex_lock(&sched->start_work_mutex); - BUG_ON(ret); - ret = pthread_mutex_unlock(&sched->start_work_mutex); - BUG_ON(ret); + mutex_lock(&sched->start_work_mutex); + mutex_unlock(&sched->start_work_mutex); cpu_usage_0 = get_cpu_usage_nsec_self(fd); @@ -652,10 +651,8 @@ again: ret = sem_post(&this_task->work_done_sem); BUG_ON(ret); - ret = pthread_mutex_lock(&sched->work_done_wait_mutex); - BUG_ON(ret); - ret = pthread_mutex_unlock(&sched->work_done_wait_mutex); - BUG_ON(ret); + mutex_lock(&sched->work_done_wait_mutex); + mutex_unlock(&sched->work_done_wait_mutex); goto again; } @@ -672,10 +669,8 @@ static void create_tasks(struct perf_sched 
*sched) err = pthread_attr_setstacksize(&attr, (size_t) max(16 * 1024, (int)PTHREAD_STACK_MIN)); BUG_ON(err); - err = pthread_mutex_lock(&sched->start_work_mutex); - BUG_ON(err); - err = pthread_mutex_lock(&sched->work_done_wait_mutex); - BUG_ON(err); + mutex_lock(&sched->start_work_mutex); + mutex_lock(&sched->work_done_wait_mutex); for (i = 0; i < sched->nr_tasks; i++) { struct sched_thread_parms *parms = malloc(sizeof(*parms)); BUG_ON(parms == NULL); @@ -699,7 +694,7 @@ static void wait_for_tasks(struct perf_sched *sched) sched->start_time = get_nsecs(); sched->cpu_usage = 0; - pthread_mutex_unlock(&sched->work_done_wait_mutex); + mutex_unlock(&sched->work_done_wait_mutex); for (i = 0; i < sched->nr_tasks; i++) { task = sched->tasks[i]; @@ -707,12 +702,11 @@ static void wait_for_tasks(struct perf_sched *sched) BUG_ON(ret); sem_init(&task->ready_for_work, 0, 0); } - ret = pthread_mutex_lock(&sched->work_done_wait_mutex); - BUG_ON(ret); + mutex_lock(&sched->work_done_wait_mutex); cpu_usage_0 = get_cpu_usage_nsec_parent(); - pthread_mutex_unlock(&sched->start_work_mutex); + mutex_unlock(&sched->start_work_mutex); for (i = 0; i < sched->nr_tasks; i++) { task = sched->tasks[i]; @@ -734,8 +728,7 @@ static void wait_for_tasks(struct perf_sched *sched) sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) + sched->parent_cpu_usage)/sched->replay_repeat; - ret = pthread_mutex_lock(&sched->start_work_mutex); - BUG_ON(ret); + mutex_lock(&sched->start_work_mutex); for (i = 0; i < sched->nr_tasks; i++) { task = sched->tasks[i]; @@ -3444,8 +3437,6 @@ int cmd_sched(int argc, const char **argv) }, .cmp_pid = LIST_HEAD_INIT(sched.cmp_pid), .sort_list = LIST_HEAD_INIT(sched.sort_list), - .start_work_mutex = PTHREAD_MUTEX_INITIALIZER, - .work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER, .sort_order = default_sort_order, .replay_repeat = 10, .profile_cpu = -1, @@ -3559,8 +3550,10 @@ int cmd_sched(int argc, const char **argv) .fork_event = 
replay_fork_event, }; unsigned int i; - int ret; + int ret = 0; + mutex_init(&sched.start_work_mutex); + mutex_init(&sched.work_done_wait_mutex); for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++) sched.curr_pid[i] = -1; @@ -3572,11 +3565,10 @@ int cmd_sched(int argc, const char **argv) /* * Aliased to 'perf script' for now: */ - if (!strcmp(argv[0], "script")) - return cmd_script(argc, argv); - - if (strlen(argv[0]) > 2 && strstarts("record", argv[0])) { - return __cmd_record(argc, argv); + if (!strcmp(argv[0], "script")) { + ret = cmd_script(argc, argv); + } else if (strlen(argv[0]) > 2 && strstarts("record", argv[0])) { + ret = __cmd_record(argc, argv); } else if (strlen(argv[0]) > 2 && strstarts("latency", argv[0])) { sched.tp_handler = &lat_ops; if (argc > 1) { @@ -3585,7 +3577,7 @@ int cmd_sched(int argc, const char **argv) usage_with_options(latency_usage, latency_options); } setup_sorting(&sched, latency_options, latency_usage); - return perf_sched__lat(&sched); + ret = perf_sched__lat(&sched); } else if (!strcmp(argv[0], "map")) { if (argc) { argc = parse_options(argc, argv, map_options, map_usage, 0); @@ -3594,7 +3586,7 @@ int cmd_sched(int argc, const char **argv) } sched.tp_handler = &map_ops; setup_sorting(&sched, latency_options, latency_usage); - return perf_sched__map(&sched); + ret = perf_sched__map(&sched); } else if (strlen(argv[0]) > 2 && strstarts("replay", argv[0])) { sched.tp_handler = &replay_ops; if (argc) { @@ -3602,7 +3594,7 @@ int cmd_sched(int argc, const char **argv) if (argc) usage_with_options(replay_usage, replay_options); } - return perf_sched__replay(&sched); + ret = perf_sched__replay(&sched); } else if (!strcmp(argv[0], "timehist")) { if (argc) { argc = parse_options(argc, argv, timehist_options, @@ -3618,16 +3610,21 @@ int cmd_sched(int argc, const char **argv) parse_options_usage(NULL, timehist_options, "w", true); if (sched.show_next) parse_options_usage(NULL, timehist_options, "n", true); - return -EINVAL; + ret = -EINVAL; + 
goto out; } ret = symbol__validate_sym_arguments(); if (ret) - return ret; + goto out; - return perf_sched__timehist(&sched); + ret = perf_sched__timehist(&sched); } else { usage_with_options(sched_usage, sched_options); } - return 0; +out: + mutex_destroy(&sched.start_work_mutex); + mutex_destroy(&sched.work_done_wait_mutex); + + return ret; } -- GitLab From 82aff6cc070417f26f9b02b26e63c17ff43b4044 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:33 -0700 Subject: [PATCH 1200/2223] perf ui: Update use of pthread mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch to the use of mutex wrappers that provide better error checking. Signed-off-by: Ian Rogers <irogers@google.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: 
Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin <yaowenbin1@huawei.com> Link: https://lore.kernel.org/r/20220826164242.43412-10-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/ui/browser.c | 20 ++++++++++---------- tools/perf/ui/browsers/annotate.c | 2 +- tools/perf/ui/setup.c | 5 +++-- tools/perf/ui/tui/helpline.c | 5 ++--- tools/perf/ui/tui/progress.c | 8 ++++---- tools/perf/ui/tui/setup.c | 8 ++++---- tools/perf/ui/tui/util.c | 18 +++++++++--------- tools/perf/ui/ui.h | 4 ++-- 8 files changed, 35 insertions(+), 35 deletions(-) diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c index fa5bd5c20e96b..78fb01d6ad63f 100644 --- a/tools/perf/ui/browser.c +++ b/tools/perf/ui/browser.c @@ -268,9 +268,9 @@ void __ui_browser__show_title(struct ui_browser *browser, const char *title) void ui_browser__show_title(struct ui_browser *browser, const char *title) { - pthread_mutex_lock(&ui__lock); + mutex_lock(&ui__lock); __ui_browser__show_title(browser, title); - pthread_mutex_unlock(&ui__lock); + mutex_unlock(&ui__lock); } int ui_browser__show(struct ui_browser *browser, const char *title, @@ -284,7 +284,7 @@ int ui_browser__show(struct ui_browser *browser, const char *title, browser->refresh_dimensions(browser); - pthread_mutex_lock(&ui__lock); + mutex_lock(&ui__lock); __ui_browser__show_title(browser, title); 
browser->title = title; @@ -295,16 +295,16 @@ int ui_browser__show(struct ui_browser *browser, const char *title, va_end(ap); if (err > 0) ui_helpline__push(browser->helpline); - pthread_mutex_unlock(&ui__lock); + mutex_unlock(&ui__lock); return err ? 0 : -1; } void ui_browser__hide(struct ui_browser *browser) { - pthread_mutex_lock(&ui__lock); + mutex_lock(&ui__lock); ui_helpline__pop(); zfree(&browser->helpline); - pthread_mutex_unlock(&ui__lock); + mutex_unlock(&ui__lock); } static void ui_browser__scrollbar_set(struct ui_browser *browser) @@ -352,9 +352,9 @@ static int __ui_browser__refresh(struct ui_browser *browser) int ui_browser__refresh(struct ui_browser *browser) { - pthread_mutex_lock(&ui__lock); + mutex_lock(&ui__lock); __ui_browser__refresh(browser); - pthread_mutex_unlock(&ui__lock); + mutex_unlock(&ui__lock); return 0; } @@ -390,10 +390,10 @@ int ui_browser__run(struct ui_browser *browser, int delay_secs) while (1) { off_t offset; - pthread_mutex_lock(&ui__lock); + mutex_lock(&ui__lock); err = __ui_browser__refresh(browser); SLsmg_refresh(); - pthread_mutex_unlock(&ui__lock); + mutex_unlock(&ui__lock); if (err < 0) break; diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 44ba900828f6c..b8747e8dd9ea4 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -8,11 +8,11 @@ #include "../../util/hist.h" #include "../../util/sort.h" #include "../../util/map.h" +#include "../../util/mutex.h" #include "../../util/symbol.h" #include "../../util/evsel.h" #include "../../util/evlist.h" #include <inttypes.h> -#include <pthread.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/zalloc.h> diff --git a/tools/perf/ui/setup.c b/tools/perf/ui/setup.c index 700335cde6180..25ded88801a3d 100644 --- a/tools/perf/ui/setup.c +++ b/tools/perf/ui/setup.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#include <pthread.h> #include <dlfcn.h> #include <unistd.h> @@ -8,7 +7,7 @@ 
#include "../util/hist.h" #include "ui.h" -pthread_mutex_t ui__lock = PTHREAD_MUTEX_INITIALIZER; +struct mutex ui__lock; void *perf_gtk_handle; int use_browser = -1; @@ -76,6 +75,7 @@ int stdio__config_color(const struct option *opt __maybe_unused, void setup_browser(bool fallback_to_pager) { + mutex_init(&ui__lock); if (use_browser < 2 && (!isatty(1) || dump_trace)) use_browser = 0; @@ -118,4 +118,5 @@ void exit_browser(bool wait_for_ok) default: break; } + mutex_destroy(&ui__lock); } diff --git a/tools/perf/ui/tui/helpline.c b/tools/perf/ui/tui/helpline.c index 298d6af82fddd..db4952f5990bd 100644 --- a/tools/perf/ui/tui/helpline.c +++ b/tools/perf/ui/tui/helpline.c @@ -2,7 +2,6 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <pthread.h> #include <linux/kernel.h> #include <linux/string.h> @@ -33,7 +32,7 @@ static int tui_helpline__show(const char *format, va_list ap) int ret; static int backlog; - pthread_mutex_lock(&ui__lock); + mutex_lock(&ui__lock); ret = vscnprintf(ui_helpline__last_msg + backlog, sizeof(ui_helpline__last_msg) - backlog, format, ap); backlog += ret; @@ -45,7 +44,7 @@ static int tui_helpline__show(const char *format, va_list ap) SLsmg_refresh(); backlog = 0; } - pthread_mutex_unlock(&ui__lock); + mutex_unlock(&ui__lock); return ret; } diff --git a/tools/perf/ui/tui/progress.c b/tools/perf/ui/tui/progress.c index 3d74af5a7ece6..71b6c8d9474fb 100644 --- a/tools/perf/ui/tui/progress.c +++ b/tools/perf/ui/tui/progress.c @@ -45,7 +45,7 @@ static void tui_progress__update(struct ui_progress *p) } ui__refresh_dimensions(false); - pthread_mutex_lock(&ui__lock); + mutex_lock(&ui__lock); y = SLtt_Screen_Rows / 2 - 2; SLsmg_set_color(0); SLsmg_draw_box(y, 0, 3, SLtt_Screen_Cols); @@ -56,7 +56,7 @@ static void tui_progress__update(struct ui_progress *p) bar = ((SLtt_Screen_Cols - 2) * p->curr) / p->total; SLsmg_fill_region(y, 1, 1, bar, ' '); SLsmg_refresh(); - pthread_mutex_unlock(&ui__lock); + mutex_unlock(&ui__lock); } static 
void tui_progress__finish(void) @@ -67,12 +67,12 @@ static void tui_progress__finish(void) return; ui__refresh_dimensions(false); - pthread_mutex_lock(&ui__lock); + mutex_lock(&ui__lock); y = SLtt_Screen_Rows / 2 - 2; SLsmg_set_color(0); SLsmg_fill_region(y, 0, 3, SLtt_Screen_Cols, ' '); SLsmg_refresh(); - pthread_mutex_unlock(&ui__lock); + mutex_unlock(&ui__lock); } static struct ui_progress_ops tui_progress__ops = { diff --git a/tools/perf/ui/tui/setup.c b/tools/perf/ui/tui/setup.c index b1be59b4e2a4f..a3b8c397c24d5 100644 --- a/tools/perf/ui/tui/setup.c +++ b/tools/perf/ui/tui/setup.c @@ -29,10 +29,10 @@ void ui__refresh_dimensions(bool force) { if (force || ui__need_resize) { ui__need_resize = 0; - pthread_mutex_lock(&ui__lock); + mutex_lock(&ui__lock); SLtt_get_screen_size(); SLsmg_reinit_smg(); - pthread_mutex_unlock(&ui__lock); + mutex_unlock(&ui__lock); } } @@ -170,10 +170,10 @@ void ui__exit(bool wait_for_ok) "Press any key...", 0); SLtt_set_cursor_visibility(1); - if (!pthread_mutex_trylock(&ui__lock)) { + if (mutex_trylock(&ui__lock)) { SLsmg_refresh(); SLsmg_reset_smg(); - pthread_mutex_unlock(&ui__lock); + mutex_unlock(&ui__lock); } SLang_reset_tty(); perf_error__unregister(&perf_tui_eops); diff --git a/tools/perf/ui/tui/util.c b/tools/perf/ui/tui/util.c index 0f562e2cb1e88..3c5174854ac8b 100644 --- a/tools/perf/ui/tui/util.c +++ b/tools/perf/ui/tui/util.c @@ -95,7 +95,7 @@ int ui_browser__input_window(const char *title, const char *text, char *input, t = sep + 1; } - pthread_mutex_lock(&ui__lock); + mutex_lock(&ui__lock); max_len += 2; nr_lines += 8; @@ -125,17 +125,17 @@ int ui_browser__input_window(const char *title, const char *text, char *input, SLsmg_write_nstring((char *)exit_msg, max_len); SLsmg_refresh(); - pthread_mutex_unlock(&ui__lock); + mutex_unlock(&ui__lock); x += 2; len = 0; key = ui__getch(delay_secs); while (key != K_TIMER && key != K_ENTER && key != K_ESC) { - pthread_mutex_lock(&ui__lock); + mutex_lock(&ui__lock); if (key == 
K_BKSPC) { if (len == 0) { - pthread_mutex_unlock(&ui__lock); + mutex_unlock(&ui__lock); goto next_key; } SLsmg_gotorc(y, x + --len); @@ -147,7 +147,7 @@ int ui_browser__input_window(const char *title, const char *text, char *input, } SLsmg_refresh(); - pthread_mutex_unlock(&ui__lock); + mutex_unlock(&ui__lock); /* XXX more graceful overflow handling needed */ if (len == sizeof(buf) - 1) { @@ -215,19 +215,19 @@ void __ui__info_window(const char *title, const char *text, const char *exit_msg void ui__info_window(const char *title, const char *text) { - pthread_mutex_lock(&ui__lock); + mutex_lock(&ui__lock); __ui__info_window(title, text, NULL); SLsmg_refresh(); - pthread_mutex_unlock(&ui__lock); + mutex_unlock(&ui__lock); } int ui__question_window(const char *title, const char *text, const char *exit_msg, int delay_secs) { - pthread_mutex_lock(&ui__lock); + mutex_lock(&ui__lock); __ui__info_window(title, text, exit_msg); SLsmg_refresh(); - pthread_mutex_unlock(&ui__lock); + mutex_unlock(&ui__lock); return ui__getch(delay_secs); } diff --git a/tools/perf/ui/ui.h b/tools/perf/ui/ui.h index 9b6fdf06e1d2f..99f8d2fe9bc55 100644 --- a/tools/perf/ui/ui.h +++ b/tools/perf/ui/ui.h @@ -2,11 +2,11 @@ #ifndef _PERF_UI_H_ #define _PERF_UI_H_ 1 -#include <pthread.h> +#include "../util/mutex.h" #include <stdbool.h> #include <linux/compiler.h> -extern pthread_mutex_t ui__lock; +extern struct mutex ui__lock; extern void *perf_gtk_handle; extern int use_browser; -- GitLab From 26b3a5fa41a6063c19747e25174fc6f4cd315c34 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:34 -0700 Subject: [PATCH 1201/2223] perf mmap: Remove unnecessary pthread.h include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment says it is for cpu_set_t which isn't used in the header. 
Signed-off-by: Ian Rogers <irogers@google.com> Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin 
<yaowenbin1@huawei.com> Link: https://lore.kernel.org/r/20220826164242.43412-11-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/mmap.h | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index cd8b0777473b3..cd4ccec7f3617 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -9,7 +9,6 @@ #include <linux/bitops.h> #include <perf/cpumap.h> #include <stdbool.h> -#include <pthread.h> // for cpu_set_t #ifdef HAVE_AIO_SUPPORT #include <aio.h> #endif -- GitLab From d9a0d6b83950bde861d2e2715ef476dae67d7873 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:35 -0700 Subject: [PATCH 1202/2223] perf dso: Update use of pthread mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch to the use of mutex wrappers that provide better error checking. Signed-off-by: Ian Rogers <irogers@google.com> Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips 
<kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin <yaowenbin1@huawei.com> Link: https://lore.kernel.org/r/20220826164242.43412-12-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/dso.c | 12 ++++++------ tools/perf/util/dso.h | 4 ++-- tools/perf/util/symbol.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 5ac13958d1bde..a9789a9554035 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -795,7 +795,7 @@ dso_cache__free(struct dso *dso) struct rb_root *root = &dso->data.cache; struct rb_node *next = rb_first(root); - pthread_mutex_lock(&dso->lock); + mutex_lock(&dso->lock); while (next) { struct dso_cache *cache; @@ -804,7 +804,7 @@ dso_cache__free(struct dso *dso) rb_erase(&cache->rb_node, root); free(cache); } - pthread_mutex_unlock(&dso->lock); + mutex_unlock(&dso->lock); } static struct dso_cache *__dso_cache__find(struct dso *dso, u64 offset) @@ -841,7 +841,7 @@ dso_cache__insert(struct dso *dso, struct dso_cache *new) struct dso_cache *cache; u64 offset = 
new->offset; - pthread_mutex_lock(&dso->lock); + mutex_lock(&dso->lock); while (*p != NULL) { u64 end; @@ -862,7 +862,7 @@ dso_cache__insert(struct dso *dso, struct dso_cache *new) cache = NULL; out: - pthread_mutex_unlock(&dso->lock); + mutex_unlock(&dso->lock); return cache; } @@ -1297,7 +1297,7 @@ struct dso *dso__new_id(const char *name, struct dso_id *id) dso->root = NULL; INIT_LIST_HEAD(&dso->node); INIT_LIST_HEAD(&dso->data.open_entry); - pthread_mutex_init(&dso->lock, NULL); + mutex_init(&dso->lock); refcount_set(&dso->refcnt, 1); } @@ -1336,7 +1336,7 @@ void dso__delete(struct dso *dso) dso__free_a2l(dso); zfree(&dso->symsrc_filename); nsinfo__zput(dso->nsinfo); - pthread_mutex_destroy(&dso->lock); + mutex_destroy(&dso->lock); free(dso); } diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index 66981c7a9a18d..58d94175e7148 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -2,7 +2,6 @@ #ifndef __PERF_DSO #define __PERF_DSO -#include <pthread.h> #include <linux/refcount.h> #include <linux/types.h> #include <linux/rbtree.h> @@ -11,6 +10,7 @@ #include <stdio.h> #include <linux/bitops.h> #include "build-id.h" +#include "mutex.h" struct machine; struct map; @@ -145,7 +145,7 @@ struct dso_cache { struct auxtrace_cache; struct dso { - pthread_mutex_t lock; + struct mutex lock; struct list_head node; struct rb_node rb_node; /* rbtree node sorted by long name */ struct rb_root *root; /* root of rbtree that rb_node is in */ diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index a4b22caa7c24c..656d9b4dd4567 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1800,7 +1800,7 @@ int dso__load(struct dso *dso, struct map *map) } nsinfo__mountns_enter(dso->nsinfo, &nsc); - pthread_mutex_lock(&dso->lock); + mutex_lock(&dso->lock); /* check again under the dso->lock */ if (dso__loaded(dso)) { @@ -1964,7 +1964,7 @@ out_free: ret = 0; out: dso__set_loaded(dso); - pthread_mutex_unlock(&dso->lock); + 
mutex_unlock(&dso->lock); nsinfo__mountns_exit(&nsc); return ret; -- GitLab From 9b3726ef836f6059af948c4b83317070da8b95f9 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:36 -0700 Subject: [PATCH 1203/2223] perf annotate: Update use of pthread mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch to the use of mutex wrappers that provide better error checking. Signed-off-by: Ian Rogers <irogers@google.com> Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: 
Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin <yaowenbin1@huawei.com> Link: https://lore.kernel.org/r/20220826164242.43412-13-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-top.c | 14 +++++++------- tools/perf/ui/browsers/annotate.c | 10 +++++----- tools/perf/util/annotate.c | 13 ++++++------- tools/perf/util/annotate.h | 4 ++-- 4 files changed, 20 insertions(+), 21 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 14e60f6f219c0..b96bb9a23ac03 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -136,10 +136,10 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he) } notes = symbol__annotation(sym); - pthread_mutex_lock(&notes->lock); + mutex_lock(&notes->lock); if (!symbol__hists(sym, top->evlist->core.nr_entries)) { - pthread_mutex_unlock(&notes->lock); + mutex_unlock(&notes->lock); pr_err("Not enough memory for annotating '%s' symbol!\n", sym->name); sleep(1); @@ -155,7 +155,7 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he) pr_err("Couldn't annotate %s: %s\n", sym->name, msg); } - pthread_mutex_unlock(&notes->lock); + mutex_unlock(&notes->lock); return err; } @@ -208,12 +208,12 @@ static void perf_top__record_precise_ip(struct perf_top *top, notes = symbol__annotation(sym); - if (pthread_mutex_trylock(&notes->lock)) + if (!mutex_trylock(&notes->lock)) return; err = hist_entry__inc_addr_samples(he, sample, evsel, ip); - pthread_mutex_unlock(&notes->lock); + mutex_unlock(&notes->lock); if (unlikely(err)) { /* @@ -250,7 +250,7 @@ static
void perf_top__show_details(struct perf_top *top) symbol = he->ms.sym; notes = symbol__annotation(symbol); - pthread_mutex_lock(&notes->lock); + mutex_lock(&notes->lock); symbol__calc_percent(symbol, evsel); @@ -271,7 +271,7 @@ static void perf_top__show_details(struct perf_top *top) if (more != 0) printf("%d lines not displayed, maybe increase display entries [e]\n", more); out_unlock: - pthread_mutex_unlock(&notes->lock); + mutex_unlock(&notes->lock); } static void perf_top__resort_hists(struct perf_top *t) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index b8747e8dd9ea4..9bc1076374ffd 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -319,7 +319,7 @@ static void annotate_browser__calc_percent(struct annotate_browser *browser, browser->entries = RB_ROOT; - pthread_mutex_lock(&notes->lock); + mutex_lock(&notes->lock); symbol__calc_percent(sym, evsel); @@ -348,7 +348,7 @@ static void annotate_browser__calc_percent(struct annotate_browser *browser, } disasm_rb_tree__insert(browser, &pos->al); } - pthread_mutex_unlock(&notes->lock); + mutex_unlock(&notes->lock); browser->curr_hot = rb_last(&browser->entries); } @@ -474,10 +474,10 @@ static bool annotate_browser__callq(struct annotate_browser *browser, } notes = symbol__annotation(dl->ops.target.sym); - pthread_mutex_lock(&notes->lock); + mutex_lock(&notes->lock); if (!symbol__hists(dl->ops.target.sym, evsel->evlist->core.nr_entries)) { - pthread_mutex_unlock(&notes->lock); + mutex_unlock(&notes->lock); ui__warning("Not enough memory for annotating '%s' symbol!\n", dl->ops.target.sym->name); return true; @@ -486,7 +486,7 @@ static bool annotate_browser__callq(struct annotate_browser *browser, target_ms.maps = ms->maps; target_ms.map = ms->map; target_ms.sym = dl->ops.target.sym; - pthread_mutex_unlock(&notes->lock); + mutex_unlock(&notes->lock); symbol__tui_annotate(&target_ms, evsel, hbt, browser->opts); sym_title(ms->sym, ms->map, title, sizeof(title),
browser->opts->percent_type); ui_browser__show_title(&browser->b, title); diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 2c6a485c3de5d..9d7dd6489a058 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -35,7 +35,6 @@ #include "arch/common.h" #include "namespaces.h" #include <regex.h> -#include <pthread.h> #include <linux/bitops.h> #include <linux/kernel.h> #include <linux/string.h> @@ -821,7 +820,7 @@ void symbol__annotate_zero_histograms(struct symbol *sym) { struct annotation *notes = symbol__annotation(sym); - pthread_mutex_lock(&notes->lock); + mutex_lock(&notes->lock); if (notes->src != NULL) { memset(notes->src->histograms, 0, notes->src->nr_histograms * notes->src->sizeof_sym_hist); @@ -829,7 +828,7 @@ void symbol__annotate_zero_histograms(struct symbol *sym) memset(notes->src->cycles_hist, 0, symbol__size(sym) * sizeof(struct cyc_hist)); } - pthread_mutex_unlock(&notes->lock); + mutex_unlock(&notes->lock); } static int __symbol__account_cycles(struct cyc_hist *ch, @@ -1086,7 +1085,7 @@ void annotation__compute_ipc(struct annotation *notes, size_t size) notes->hit_insn = 0; notes->cover_insn = 0; - pthread_mutex_lock(&notes->lock); + mutex_lock(&notes->lock); for (offset = size - 1; offset >= 0; --offset) { struct cyc_hist *ch; @@ -1105,7 +1104,7 @@ void annotation__compute_ipc(struct annotation *notes, size_t size) notes->have_cycles = true; } } - pthread_mutex_unlock(&notes->lock); + mutex_unlock(&notes->lock); } int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, struct perf_sample *sample, @@ -1258,13 +1257,13 @@ int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool r void annotation__init(struct annotation *notes) { - pthread_mutex_init(&notes->lock, NULL); + mutex_init(&notes->lock); } void annotation__exit(struct annotation *notes) { annotated_source__delete(notes->src); - pthread_mutex_destroy(&notes->lock); + mutex_destroy(&notes->lock); } static void annotation_line__add(struct annotation_line *al,
struct list_head *head) diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 986f2bbe4870a..3cbd883e4d7ac 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -8,9 +8,9 @@ #include <linux/types.h> #include <linux/list.h> #include <linux/rbtree.h> -#include <pthread.h> #include <asm/bug.h> #include "symbol_conf.h" +#include "mutex.h" #include "spark.h" struct hist_browser_timer; @@ -273,7 +273,7 @@ struct annotated_source { }; struct annotation { - pthread_mutex_t lock; + struct mutex lock; u64 max_coverage; u64 start; u64 hit_cycles; -- GitLab From d8e40b58ad4701198bfe83b860a29153d17dc478 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:37 -0700 Subject: [PATCH 1204/2223] perf top: Update use of pthread mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch to the use of mutex wrappers that provide better error checking. Signed-off-by: Ian Rogers <irogers@google.com> Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain 
<kjain@linux.ibm.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin <yaowenbin1@huawei.com> Link: https://lore.kernel.org/r/20220826164242.43412-14-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-top.c | 18 +++++++++--------- tools/perf/util/top.h | 5 +++-- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index b96bb9a23ac03..5af3347eedc10 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -893,10 +893,10 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx) perf_mmap__consume(&md->core); if (top->qe.rotate) { - pthread_mutex_lock(&top->qe.mutex); + mutex_lock(&top->qe.mutex); top->qe.rotate = false; - pthread_cond_signal(&top->qe.cond); - pthread_mutex_unlock(&top->qe.mutex); + cond_signal(&top->qe.cond); + mutex_unlock(&top->qe.mutex); } } @@ -1100,10 +1100,10 @@ static void *process_thread(void *arg) out = rotate_queues(top); - pthread_mutex_lock(&top->qe.mutex); + mutex_lock(&top->qe.mutex); top->qe.rotate = true; - 
pthread_cond_wait(&top->qe.cond, &top->qe.mutex); - pthread_mutex_unlock(&top->qe.mutex); + cond_wait(&top->qe.cond, &top->qe.mutex); + mutex_unlock(&top->qe.mutex); if (ordered_events__flush(out, OE_FLUSH__TOP)) pr_err("failed to process events\n"); @@ -1217,8 +1217,8 @@ static void init_process_thread(struct perf_top *top) ordered_events__set_copy_on_queue(&top->qe.data[0], true); ordered_events__set_copy_on_queue(&top->qe.data[1], true); top->qe.in = &top->qe.data[0]; - pthread_mutex_init(&top->qe.mutex, NULL); - pthread_cond_init(&top->qe.cond, NULL); + mutex_init(&top->qe.mutex); + cond_init(&top->qe.cond); } static int __cmd_top(struct perf_top *top) @@ -1349,7 +1349,7 @@ static int __cmd_top(struct perf_top *top) out_join: pthread_join(thread, NULL); out_join_thread: - pthread_cond_signal(&top->qe.cond); + cond_signal(&top->qe.cond); pthread_join(thread_process, NULL); return ret; } diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h index 1c2c0a8384307..a8b0d79bd96cf 100644 --- a/tools/perf/util/top.h +++ b/tools/perf/util/top.h @@ -5,6 +5,7 @@ #include "tool.h" #include "evswitch.h" #include "annotate.h" +#include "mutex.h" #include "ordered-events.h" #include "record.h" #include <linux/types.h> @@ -53,8 +54,8 @@ struct perf_top { struct ordered_events *in; struct ordered_events data[2]; bool rotate; - pthread_mutex_t mutex; - pthread_cond_t cond; + struct mutex mutex; + struct cond cond; } qe; }; -- GitLab From e54dea69cdf6ed4bfcb266160d348be83bcbe826 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:38 -0700 Subject: [PATCH 1205/2223] perf dso: Hold lock when accessing nsinfo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There may be threads racing to update dso->nsinfo: https://lore.kernel.org/linux-perf-users/CAP-5=fWZH20L4kv-BwVtGLwR=Em3AOOT+Q4QGivvQuYn5AsPRg@mail.gmail.com/ Holding the dso->lock avoids use-after-free, memory leaks and other such bugs. 
Apply the fix in: https://lore.kernel.org/linux-perf-users/20211118193714.2293728-1-irogers@google.com/ of there being a missing nsinfo__put now that the accesses are data race free. Fixes test "Lookup mmap thread" when compiled with address sanitizer. Signed-off-by: Ian Rogers <irogers@google.com> Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li 
<liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin <yaowenbin1@huawei.com> Link: https://lore.kernel.org/r/20220826164242.43412-15-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-inject.c | 4 ++++ tools/perf/util/annotate.c | 2 ++ tools/perf/util/build-id.c | 12 +++++++++--- tools/perf/util/dso.c | 7 ++++++- tools/perf/util/map.c | 3 +++ tools/perf/util/probe-event.c | 3 +++ tools/perf/util/symbol.c | 2 +- 7 files changed, 28 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 8ec9554024883..e254f18986f7c 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -436,8 +436,10 @@ static struct dso *findnew_dso(int pid, int tid, const char *filename, } if (dso) { + mutex_lock(&dso->lock); nsinfo__put(dso->nsinfo); dso->nsinfo = nsi; + mutex_unlock(&dso->lock); } else nsinfo__put(nsi); @@ -620,6 +622,7 @@ static int dso__read_build_id(struct dso *dso) if (dso->has_build_id) return 0; + mutex_lock(&dso->lock); nsinfo__mountns_enter(dso->nsinfo, &nsc); if (filename__read_build_id(dso->long_name, &dso->bid) > 0) dso->has_build_id = true; @@ -633,6 +636,7 @@ static int dso__read_build_id(struct dso *dso) free(new_name); } nsinfo__mountns_exit(&nsc); + mutex_unlock(&dso->lock); return dso->has_build_id ? 
0 : -1; } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 9d7dd6489a058..5bc63c9e0324d 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1697,6 +1697,7 @@ fallback: */ __symbol__join_symfs(filename, filename_size, dso->long_name); + mutex_lock(&dso->lock); if (access(filename, R_OK) && errno == ENOENT && dso->nsinfo) { char *new_name = filename_with_chroot(dso->nsinfo->pid, filename); @@ -1705,6 +1706,7 @@ fallback: free(new_name); } } + mutex_unlock(&dso->lock); } free(build_id_path); diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index ec18ed5caf3ec..a839b30c981b7 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -898,11 +898,15 @@ static int filename__read_build_id_ns(const char *filename, static bool dso__build_id_mismatch(struct dso *dso, const char *name) { struct build_id bid; + bool ret = false; - if (filename__read_build_id_ns(name, &bid, dso->nsinfo) < 0) - return false; + mutex_lock(&dso->lock); + if (filename__read_build_id_ns(name, &bid, dso->nsinfo) >= 0) + ret = !dso__build_id_equal(dso, &bid); - return !dso__build_id_equal(dso, &bid); + mutex_unlock(&dso->lock); + + return ret; } static int dso__cache_build_id(struct dso *dso, struct machine *machine, @@ -941,8 +945,10 @@ static int dso__cache_build_id(struct dso *dso, struct machine *machine, if (!is_kallsyms && dso__build_id_mismatch(dso, name)) goto out_free; + mutex_lock(&dso->lock); ret = build_id_cache__add_b(&dso->bid, name, dso->nsinfo, is_kallsyms, is_vdso, proper_name, root_dir); + mutex_unlock(&dso->lock); out_free: free(allocated_name); return ret; diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index a9789a9554035..f1a14c0ad26d5 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -501,6 +501,7 @@ static int __open_dso(struct dso *dso, struct machine *machine) if (!name) return -ENOMEM; + mutex_lock(&dso->lock); if (machine) root_dir = machine->root_dir; @@ 
-541,6 +542,7 @@ static int __open_dso(struct dso *dso, struct machine *machine) unlink(name); out: + mutex_unlock(&dso->lock); free(name); return fd; } @@ -559,8 +561,11 @@ static int open_dso(struct dso *dso, struct machine *machine) int fd; struct nscookie nsc; - if (dso->binary_type != DSO_BINARY_TYPE__BUILD_ID_CACHE) + if (dso->binary_type != DSO_BINARY_TYPE__BUILD_ID_CACHE) { + mutex_lock(&dso->lock); nsinfo__mountns_enter(dso->nsinfo, &nsc); + mutex_unlock(&dso->lock); + } fd = __open_dso(dso, machine); if (dso->binary_type != DSO_BINARY_TYPE__BUILD_ID_CACHE) nsinfo__mountns_exit(&nsc); diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index e0aa4a2545838..f3a3d9b3a40da 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -181,7 +181,10 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, if (!(prot & PROT_EXEC)) dso__set_loaded(dso); } + mutex_lock(&dso->lock); + nsinfo__put(dso->nsinfo); dso->nsinfo = nsi; + mutex_unlock(&dso->lock); if (build_id__is_defined(bid)) { dso__set_build_id(dso, bid); diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 785246ff41790..0c24bc7afbca2 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -29,6 +29,7 @@ #include "color.h" #include "map.h" #include "maps.h" +#include "mutex.h" #include "symbol.h" #include <api/fs/fs.h> #include "trace-event.h" /* For __maybe_unused */ @@ -180,8 +181,10 @@ struct map *get_target_map(const char *target, struct nsinfo *nsi, bool user) map = dso__new_map(target); if (map && map->dso) { + mutex_lock(&map->dso->lock); nsinfo__put(map->dso->nsinfo); map->dso->nsinfo = nsinfo__get(nsi); + mutex_unlock(&map->dso->lock); } return map; } else { diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 656d9b4dd4567..a3a165ae933ad 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1791,6 +1791,7 @@ int dso__load(struct dso *dso, struct map *map) char 
newmapname[PATH_MAX]; const char *map_path = dso->long_name; + mutex_lock(&dso->lock); perfmap = strncmp(dso->name, "/tmp/perf-", 10) == 0; if (perfmap) { if (dso->nsinfo && (dso__find_perf_map(newmapname, @@ -1800,7 +1801,6 @@ int dso__load(struct dso *dso, struct map *map) } nsinfo__mountns_enter(dso->nsinfo, &nsc); - mutex_lock(&dso->lock); /* check again under the dso->lock */ if (dso__loaded(dso)) { -- GitLab From bfa339ceda3c9e49ffb58c7de50fd86912ab9e6d Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:39 -0700 Subject: [PATCH 1206/2223] perf mutex: Add thread safety annotations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add thread safety annotations to struct mutex so that, when compiled with clang's -Wthread-safety, warnings are generated for erroneous lock patterns. NO_THREAD_SAFETY_ANALYSIS is needed for mutex_lock/mutex_unlock as the analysis doesn't understand pthread calls. Signed-off-by: Ian Rogers <irogers@google.com> Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips
<kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin <yaowenbin1@huawei.com> Link: https://lore.kernel.org/r/20220826164242.43412-16-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/mutex.c | 2 ++ tools/perf/util/mutex.h | 70 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/mutex.c b/tools/perf/util/mutex.c index 5029237164e5e..bca7f0717f355 100644 --- a/tools/perf/util/mutex.c +++ b/tools/perf/util/mutex.c @@ -50,11 +50,13 @@ void mutex_destroy(struct mutex *mtx) } void mutex_lock(struct mutex *mtx) + NO_THREAD_SAFETY_ANALYSIS { CHECK_ERR(pthread_mutex_lock(&mtx->lock)); } void mutex_unlock(struct mutex *mtx) + NO_THREAD_SAFETY_ANALYSIS { CHECK_ERR(pthread_mutex_unlock(&mtx->lock)); } diff --git a/tools/perf/util/mutex.h b/tools/perf/util/mutex.h index cfff32a902d91..40661120caccb 100644 --- a/tools/perf/util/mutex.h +++ b/tools/perf/util/mutex.h @@ -5,11 +5,71 @@ #include <pthread.h> #include <stdbool.h> +/* + * A function-like feature checking macro that is a wrapper around + * 
`__has_attribute`, which is defined by GCC 5+ and Clang and evaluates to a + * nonzero constant integer if the attribute is supported or 0 if not. + */ +#ifdef __has_attribute +#define HAVE_ATTRIBUTE(x) __has_attribute(x) +#else +#define HAVE_ATTRIBUTE(x) 0 +#endif + +#if HAVE_ATTRIBUTE(guarded_by) && HAVE_ATTRIBUTE(pt_guarded_by) && \ + HAVE_ATTRIBUTE(lockable) && HAVE_ATTRIBUTE(exclusive_lock_function) && \ + HAVE_ATTRIBUTE(exclusive_trylock_function) && HAVE_ATTRIBUTE(exclusive_locks_required) && \ + HAVE_ATTRIBUTE(no_thread_safety_analysis) + +/* Documents if a shared field or global variable needs to be protected by a mutex. */ +#define GUARDED_BY(x) __attribute__((guarded_by(x))) + +/* + * Documents if the memory location pointed to by a pointer should be guarded by + * a mutex when dereferencing the pointer. + */ +#define PT_GUARDED_BY(x) __attribute__((pt_guarded_by(x))) + +/* Documents if a type is a lockable type. */ +#define LOCKABLE __attribute__((lockable)) + +/* Documents functions that acquire a lock in the body of a function, and do not release it. */ +#define EXCLUSIVE_LOCK_FUNCTION(...) __attribute__((exclusive_lock_function(__VA_ARGS__))) + +/* + * Documents functions that expect a lock to be held on entry to the function, + * and release it in the body of the function. + */ +#define UNLOCK_FUNCTION(...) __attribute__((unlock_function(__VA_ARGS__))) + +/* Documents functions that try to acquire a lock, and return success or failure. */ +#define EXCLUSIVE_TRYLOCK_FUNCTION(...) \ + __attribute__((exclusive_trylock_function(__VA_ARGS__))) + +/* Documents a function that expects a mutex to be held prior to entry. */ +#define EXCLUSIVE_LOCKS_REQUIRED(...) __attribute__((exclusive_locks_required(__VA_ARGS__))) + +/* Turns off thread safety checking within the body of a particular function. 
*/ +#define NO_THREAD_SAFETY_ANALYSIS __attribute__((no_thread_safety_analysis)) + +#else + +#define GUARDED_BY(x) +#define PT_GUARDED_BY(x) +#define LOCKABLE +#define EXCLUSIVE_LOCK_FUNCTION(...) +#define UNLOCK_FUNCTION(...) +#define EXCLUSIVE_TRYLOCK_FUNCTION(...) +#define EXCLUSIVE_LOCKS_REQUIRED(...) +#define NO_THREAD_SAFETY_ANALYSIS + +#endif + /* * A wrapper around the mutex implementation that allows perf to error check * usage, etc. */ -struct mutex { +struct LOCKABLE mutex { pthread_mutex_t lock; }; @@ -27,10 +87,10 @@ void mutex_init(struct mutex *mtx); void mutex_init_pshared(struct mutex *mtx); void mutex_destroy(struct mutex *mtx); -void mutex_lock(struct mutex *mtx); -void mutex_unlock(struct mutex *mtx); +void mutex_lock(struct mutex *mtx) EXCLUSIVE_LOCK_FUNCTION(*mtx); +void mutex_unlock(struct mutex *mtx) UNLOCK_FUNCTION(*mtx); /* Tries to acquire the lock and returns true on success. */ -bool mutex_trylock(struct mutex *mtx); +bool mutex_trylock(struct mutex *mtx) EXCLUSIVE_TRYLOCK_FUNCTION(true, *mtx); /* Default initialize the cond struct. */ void cond_init(struct cond *cnd); @@ -41,7 +101,7 @@ void cond_init(struct cond *cnd); void cond_init_pshared(struct cond *cnd); void cond_destroy(struct cond *cnd); -void cond_wait(struct cond *cnd, struct mutex *mtx); +void cond_wait(struct cond *cnd, struct mutex *mtx) EXCLUSIVE_LOCKS_REQUIRED(mtx); void cond_signal(struct cond *cnd); void cond_broadcast(struct cond *cnd); -- GitLab From 59c266604922898ad8aa1ef881a60eb02fcb385f Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:40 -0700 Subject: [PATCH 1207/2223] perf sched: Fixes for thread safety analysis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add annotations to describe lock behavior. Add unlocks so that mutexes aren't conditionally held on exit from perf_sched__replay. 
Add an exit variable so that thread_func can terminate, rather than leaving the threads blocked on mutexes. Signed-off-by: Ian Rogers <irogers@google.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: 
bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin <yaowenbin1@huawei.com> Link: https://lore.kernel.org/r/20220826164242.43412-17-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-sched.c | 46 ++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 46e3b96457b84..a92610eac4bf6 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -246,6 +246,7 @@ struct perf_sched { const char *time_str; struct perf_time_interval ptime; struct perf_time_interval hist_time; + volatile bool thread_funcs_exit; }; /* per thread run time data */ @@ -633,31 +634,34 @@ static void *thread_func(void *ctx) prctl(PR_SET_NAME, comm2); if (fd < 0) return NULL; -again: - ret = sem_post(&this_task->ready_for_work); - BUG_ON(ret); - mutex_lock(&sched->start_work_mutex); - mutex_unlock(&sched->start_work_mutex); - cpu_usage_0 = get_cpu_usage_nsec_self(fd); + while (!sched->thread_funcs_exit) { + ret = sem_post(&this_task->ready_for_work); + BUG_ON(ret); + mutex_lock(&sched->start_work_mutex); + mutex_unlock(&sched->start_work_mutex); - for (i = 0; i < this_task->nr_events; i++) { - this_task->curr_event = i; - perf_sched__process_event(sched, this_task->atoms[i]); - } + cpu_usage_0 = get_cpu_usage_nsec_self(fd); - cpu_usage_1 = get_cpu_usage_nsec_self(fd); - this_task->cpu_usage = cpu_usage_1 - cpu_usage_0; - ret = sem_post(&this_task->work_done_sem); - BUG_ON(ret); + for (i = 0; i < this_task->nr_events; i++) { + this_task->curr_event = i; + perf_sched__process_event(sched, this_task->atoms[i]); + } - mutex_lock(&sched->work_done_wait_mutex); - mutex_unlock(&sched->work_done_wait_mutex); + cpu_usage_1 = get_cpu_usage_nsec_self(fd); + this_task->cpu_usage = cpu_usage_1 - cpu_usage_0; + ret = sem_post(&this_task->work_done_sem); + BUG_ON(ret); - goto again; + mutex_lock(&sched->work_done_wait_mutex); + 
mutex_unlock(&sched->work_done_wait_mutex); + } + return NULL; } static void create_tasks(struct perf_sched *sched) + EXCLUSIVE_LOCK_FUNCTION(sched->start_work_mutex) + EXCLUSIVE_LOCK_FUNCTION(sched->work_done_wait_mutex) { struct task_desc *task; pthread_attr_t attr; @@ -687,6 +691,8 @@ static void create_tasks(struct perf_sched *sched) } static void wait_for_tasks(struct perf_sched *sched) + EXCLUSIVE_LOCKS_REQUIRED(sched->work_done_wait_mutex) + EXCLUSIVE_LOCKS_REQUIRED(sched->start_work_mutex) { u64 cpu_usage_0, cpu_usage_1; struct task_desc *task; @@ -738,6 +744,8 @@ static void wait_for_tasks(struct perf_sched *sched) } static void run_one_test(struct perf_sched *sched) + EXCLUSIVE_LOCKS_REQUIRED(sched->work_done_wait_mutex) + EXCLUSIVE_LOCKS_REQUIRED(sched->start_work_mutex) { u64 T0, T1, delta, avg_delta, fluct; @@ -3309,11 +3317,15 @@ static int perf_sched__replay(struct perf_sched *sched) print_task_traces(sched); add_cross_task_wakeups(sched); + sched->thread_funcs_exit = false; create_tasks(sched); printf("------------------------------------------------------------\n"); for (i = 0; i < sched->replay_repeat; i++) run_one_test(sched); + sched->thread_funcs_exit = true; + mutex_unlock(&sched->start_work_mutex); + mutex_unlock(&sched->work_done_wait_mutex); return 0; } -- GitLab From b40b2122566ea2d948032370000f0b06b8d507fc Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:41 -0700 Subject: [PATCH 1208/2223] perf top: Fixes for thread safety analysis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add annotations to describe lock behavior. 
Signed-off-by: Ian Rogers <irogers@google.com> Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin <yaowenbin1@huawei.com> Link: 
https://lore.kernel.org/r/20220826164242.43412-18-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-top.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 5af3347eedc10..e89208b4ad4bc 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -196,6 +196,7 @@ static void perf_top__record_precise_ip(struct perf_top *top, struct hist_entry *he, struct perf_sample *sample, struct evsel *evsel, u64 ip) + EXCLUSIVE_LOCKS_REQUIRED(he->hists->lock) { struct annotation *notes; struct symbol *sym = he->ms.sym; @@ -724,13 +725,13 @@ repeat: static int hist_iter__top_callback(struct hist_entry_iter *iter, struct addr_location *al, bool single, void *arg) + EXCLUSIVE_LOCKS_REQUIRED(iter->he->hists->lock) { struct perf_top *top = arg; - struct hist_entry *he = iter->he; struct evsel *evsel = iter->evsel; if (perf_hpp_list.sym && single) - perf_top__record_precise_ip(top, he, iter->sample, evsel, al->addr); + perf_top__record_precise_ip(top, iter->he, iter->sample, evsel, al->addr); hist__account_cycles(iter->sample->branch_stack, al, iter->sample, !(top->record_opts.branch_stack & PERF_SAMPLE_BRANCH_ANY), -- GitLab From dca571ed9753b4cd8d19d8b5a896351a78e3c5eb Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Fri, 26 Aug 2022 09:42:42 -0700 Subject: [PATCH 1209/2223] perf build: Enable -Wthread-safety with clang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If building with clang then enable -Wthread-safety warnings. 
Signed-off-by: Ian Rogers <irogers@google.com> Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andres Freund <andres@anarazel.de> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: André Almeida <andrealmeid@igalia.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Cc: Colin Ian King <colin.king@intel.com> Cc: Dario Petrillo <dario.pk1@gmail.com> Cc: Darren Hart <dvhart@infradead.org> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Fangrui Song <maskray@google.com> Cc: Hewenliang <hewenliang4@huawei.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jason Wang <wangborong@cdjrlc.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Martin Liška <mliska@suse.cz> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Pavithra Gurushankar <gpavithrasha@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Monnet <quentin@isovalent.com> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Remi Bernon <rbernon@codeweavers.com> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Tom Rix <trix@redhat.com> Cc: Weiguo Li <liwg06@foxmail.com> Cc: Wenyu Liu <liuwenyu7@huawei.com> Cc: William Cohen <wcohen@redhat.com> Cc: Zechuan Chen <chenzechuan1@huawei.com> Cc: bpf@vger.kernel.org Cc: llvm@lists.linux.dev Cc: yaowenbin <yaowenbin1@huawei.com> Link: 
https://lore.kernel.org/r/20220826164242.43412-19-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/Makefile.config | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index b3b733f4366bc..c7c188ba1a4bc 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -19,6 +19,11 @@ detected_var = $(shell echo "$(1)=$($(1))" >> $(OUTPUT).config-detected) CFLAGS := $(EXTRA_CFLAGS) $(filter-out -Wnested-externs,$(EXTRA_WARNINGS)) HOSTCFLAGS := $(filter-out -Wnested-externs,$(EXTRA_WARNINGS)) +# Enabled Wthread-safety analysis for clang builds. +ifeq ($(CC_NO_CLANG), 0) + CFLAGS += -Wthread-safety +endif + include $(srctree)/tools/scripts/Makefile.arch $(call detected_var,SRCARCH) -- GitLab From 1c96b6e45f140a4a43b1e831907e250e6302067c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual <anshuman.khandual@arm.com> Date: Wed, 24 Aug 2022 10:18:19 +0530 Subject: [PATCH 1210/2223] perf branch: Add system error and not in transaction branch types This updates the perf tool with generic branch type classification with two new branch types i.e system error (PERF_BR_SERROR) and not in transaction (PERF_BR_NO_TX) which got updated earlier in the kernel. This also updates corresponding branch type strings in branch_type_name(). 
Committer notes: At perf tools merge time this is only on PeterZ's tree, at: git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/core So for testing one has to build a kernel with that branch, then test the tooling side from: git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git perf/core Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Robin Murphy <robin.murphy@arm.com> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Will Deacon <will@kernel.org> Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20220824044822.70230-6-anshuman.khandual@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/include/uapi/linux/perf_event.h | 2 ++ tools/perf/util/branch.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 581ed4bdc0621..146c137ff0c13 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -253,6 +253,8 @@ enum { PERF_BR_COND_RET = 10, /* conditional function return */ PERF_BR_ERET = 11, /* exception return */ PERF_BR_IRQ = 12, /* irq */ + PERF_BR_SERROR = 13, /* system error */ + PERF_BR_NO_TX = 14, /* not in transaction */ PERF_BR_MAX, }; diff --git a/tools/perf/util/branch.c b/tools/perf/util/branch.c index a9a909db8cc7f..abc673347beee 100644 --- a/tools/perf/util/branch.c +++ b/tools/perf/util/branch.c @@ -51,7 +51,9 @@ const char *branch_type_name(int type) "COND_CALL", "COND_RET", "ERET", - "IRQ" + "IRQ", + "SERROR", + "NO_TX" }; if (type >= 0 && type < 
PERF_BR_MAX) -- GitLab From 0ddea8e2a0c20ff32a28ef21574f704d8f4699a2 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual <anshuman.khandual@arm.com> Date: Wed, 24 Aug 2022 10:18:20 +0530 Subject: [PATCH 1211/2223] perf branch: Extend branch type classification This updates the perf tool with generic branch type classification with new ABI extender place holder i.e PERF_BR_EXTEND_ABI, the new 4 bit branch type field i.e perf_branch_entry.new_type, new generic page fault related branch types and some arch specific branch types as added earlier in the kernel. Committer note: Add an extra entry to the branch_type_name array to cope with PERF_BR_EXTEND_ABI, to address build warnings on some compiler/systems, like: 75 8.89 ubuntu:20.04-x-powerpc64el : FAIL gcc version 10.3.0 (Ubuntu 10.3.0-1ubuntu1~20.04) inlined from 'branch_type_stat_display' at util/branch.c:152:4: /usr/powerpc64le-linux-gnu/include/bits/stdio2.h:100:10: error: '%8s' directive argument is null [-Werror=format-overflow=] 100 | return __fprintf_chk (__stream, __USE_FORTIFY_LEVEL - 1, __fmt, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 101 | __va_arg_pack ()); | ~~~~~~~~~~~~~~~~~ Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Robin Murphy <robin.murphy@arm.com> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Will Deacon <will@kernel.org> Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20220824044822.70230-7-anshuman.khandual@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/include/uapi/linux/perf_event.h | 16 
+++++++- tools/perf/builtin-script.c | 2 +- tools/perf/util/branch.c | 55 ++++++++++++++++++++++++++- tools/perf/util/branch.h | 6 ++- tools/perf/util/session.c | 2 +- 5 files changed, 75 insertions(+), 6 deletions(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 146c137ff0c13..0f7c7ce29899c 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -255,9 +255,22 @@ enum { PERF_BR_IRQ = 12, /* irq */ PERF_BR_SERROR = 13, /* system error */ PERF_BR_NO_TX = 14, /* not in transaction */ + PERF_BR_EXTEND_ABI = 15, /* extend ABI */ PERF_BR_MAX, }; +enum { + PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ + PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ + PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ + PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ + PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ + PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ + PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ + PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ + PERF_BR_NEW_MAX, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1375,7 +1388,8 @@ struct perf_branch_entry { abort:1, /* transaction abort */ cycles:16, /* cycle count to last branch */ type:4, /* branch type */ - reserved:40; + new_type:4, /* additional branch type */ + reserved:36; }; union perf_sample_weight { diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 029b4330e59b1..886f53cfa2574 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -882,7 +882,7 @@ static int print_bstack_flags(FILE *fp, struct branch_entry *br) br->flags.in_tx ? 'X' : '-', br->flags.abort ? 'A' : '-', br->flags.cycles, - br->flags.type ? 
branch_type_name(br->flags.type) : "-"); + get_branch_type(br)); } static int perf_sample__fprintf_brstack(struct perf_sample *sample, diff --git a/tools/perf/util/branch.c b/tools/perf/util/branch.c index abc673347beee..675cbbe80ce37 100644 --- a/tools/perf/util/branch.c +++ b/tools/perf/util/branch.c @@ -21,7 +21,10 @@ void branch_type_count(struct branch_type_stat *st, struct branch_flags *flags, if (flags->type == PERF_BR_UNKNOWN || from == 0) return; - st->counts[flags->type]++; + if (flags->type == PERF_BR_EXTEND_ABI) + st->new_counts[flags->new_type]++; + else + st->counts[flags->type]++; if (flags->type == PERF_BR_COND) { if (to > from) @@ -36,6 +39,25 @@ void branch_type_count(struct branch_type_stat *st, struct branch_flags *flags, st->cross_4k++; } +const char *branch_new_type_name(int new_type) +{ + const char *branch_new_names[PERF_BR_NEW_MAX] = { + "FAULT_ALGN", + "FAULT_DATA", + "FAULT_INST", + "ARCH_1", + "ARCH_2", + "ARCH_3", + "ARCH_4", + "ARCH_5" + }; + + if (new_type >= 0 && new_type < PERF_BR_NEW_MAX) + return branch_new_names[new_type]; + + return NULL; +} + const char *branch_type_name(int type) { const char *branch_names[PERF_BR_MAX] = { @@ -53,7 +75,8 @@ const char *branch_type_name(int type) "ERET", "IRQ", "SERROR", - "NO_TX" + "NO_TX", + "", // Needed for PERF_BR_EXTEND_ABI that ends up triggering some compiler warnings about NULL deref }; if (type >= 0 && type < PERF_BR_MAX) @@ -62,6 +85,17 @@ const char *branch_type_name(int type) return NULL; } +const char *get_branch_type(struct branch_entry *e) +{ + if (e->flags.type == PERF_BR_UNKNOWN) + return ""; + + if (e->flags.type == PERF_BR_EXTEND_ABI) + return branch_new_type_name(e->flags.new_type); + + return branch_type_name(e->flags.type); +} + void branch_type_stat_display(FILE *fp, struct branch_type_stat *st) { u64 total = 0; @@ -108,6 +142,15 @@ void branch_type_stat_display(FILE *fp, struct branch_type_stat *st) 100.0 * (double)st->counts[i] / (double)total); } + + for (i = 0; i < 
PERF_BR_NEW_MAX; i++) { + if (st->new_counts[i] > 0) + fprintf(fp, "\n%8s: %5.1f%%", + branch_new_type_name(i), + 100.0 * + (double)st->new_counts[i] / (double)total); + } + } static int count_str_scnprintf(int idx, const char *str, char *bf, int size) @@ -123,6 +166,9 @@ int branch_type_str(struct branch_type_stat *st, char *bf, int size) for (i = 0; i < PERF_BR_MAX; i++) total += st->counts[i]; + for (i = 0; i < PERF_BR_NEW_MAX; i++) + total += st->new_counts[i]; + if (total == 0) return 0; @@ -140,6 +186,11 @@ int branch_type_str(struct branch_type_stat *st, char *bf, int size) printed += count_str_scnprintf(j++, branch_type_name(i), bf + printed, size - printed); } + for (i = 0; i < PERF_BR_NEW_MAX; i++) { + if (st->new_counts[i] > 0) + printed += count_str_scnprintf(j++, branch_new_type_name(i), bf + printed, size - printed); + } + if (st->cross_4k > 0) printed += count_str_scnprintf(j++, "CROSS_4K", bf + printed, size - printed); diff --git a/tools/perf/util/branch.h b/tools/perf/util/branch.h index 17b2ccc61094b..8d251b35428a7 100644 --- a/tools/perf/util/branch.h +++ b/tools/perf/util/branch.h @@ -24,7 +24,8 @@ struct branch_flags { u64 abort:1; u64 cycles:16; u64 type:4; - u64 reserved:40; + u64 new_type:4; + u64 reserved:36; }; }; }; @@ -72,6 +73,7 @@ static inline struct branch_entry *perf_sample__branch_entries(struct perf_sampl struct branch_type_stat { bool branch_to; u64 counts[PERF_BR_MAX]; + u64 new_counts[PERF_BR_NEW_MAX]; u64 cond_fwd; u64 cond_bwd; u64 cross_4k; @@ -82,6 +84,8 @@ void branch_type_count(struct branch_type_stat *st, struct branch_flags *flags, u64 from, u64 to); const char *branch_type_name(int type); +const char *branch_new_type_name(int new_type); +const char *get_branch_type(struct branch_entry *e); void branch_type_stat_display(FILE *fp, struct branch_type_stat *st); int branch_type_str(struct branch_type_stat *st, char *bf, int bfsize); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 
192c9274f7ade..47d5a50e616a3 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1180,7 +1180,7 @@ static void branch_stack__printf(struct perf_sample *sample, bool callstack) e->flags.abort ? "A" : " ", e->flags.in_tx ? "T" : " ", (unsigned)e->flags.reserved, - e->flags.type ? branch_type_name(e->flags.type) : ""); + get_branch_type(e)); } else { if (i == 0) { printf("..... %2"PRIu64": %016" PRIx64 "\n" -- GitLab From bcb96ce6d2544ae0738cf54fd0a6d048fad791ec Mon Sep 17 00:00:00 2001 From: Anshuman Khandual <anshuman.khandual@arm.com> Date: Wed, 24 Aug 2022 10:18:21 +0530 Subject: [PATCH 1212/2223] perf branch: Add branch privilege information request flag This updates the perf tools with branch privilege information request flag i.e PERF_SAMPLE_BRANCH_PRIV_SAVE that has been added earlier in the kernel. This also updates 'perf record' documentation, branch_modes[], and generic branch privilege level enumeration as added earlier in the kernel. Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Robin Murphy <robin.murphy@arm.com> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Will Deacon <will@kernel.org> Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20220824044822.70230-8-anshuman.khandual@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/include/uapi/linux/perf_event.h | 14 +++++++++++++- tools/perf/Documentation/perf-record.txt | 1 + tools/perf/util/branch.h | 3 ++- tools/perf/util/parse-branch-options.c | 1 + tools/perf/util/perf_event_attr_fprintf.c | 2 +- 5 files changed, 18 insertions(+), 3 
deletions(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 0f7c7ce29899c..51168e22f4d87 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -204,6 +204,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -233,6 +235,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -271,6 +275,13 @@ enum { PERF_BR_NEW_MAX, }; +enum { + PERF_BR_PRIV_UNKNOWN = 0, + PERF_BR_PRIV_USER = 1, + PERF_BR_PRIV_KERNEL = 2, + PERF_BR_PRIV_HV = 3, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1389,7 +1400,8 @@ struct perf_branch_entry { cycles:16, /* cycle count to last branch */ type:4, /* branch type */ new_type:4, /* additional branch type */ - reserved:36; + priv:3, /* privilege level */ + reserved:33; }; union perf_sample_weight { diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index b32a9c2726f90..378f497f4be32 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -400,6 +400,7 @@ following filters are defined: For the platforms with Intel Arch LBR support (12th-Gen+ client or 4th-Gen Xeon+ server), the save branch type is unconditionally enabled when the taken branch stack sampling is enabled. + - priv: save privilege state during sampling in case binary is not available later + The option requires at least one branch type among any, any_call, any_ret, ind_call, cond. 
diff --git a/tools/perf/util/branch.h b/tools/perf/util/branch.h index 8d251b35428a7..f838b23db1804 100644 --- a/tools/perf/util/branch.h +++ b/tools/perf/util/branch.h @@ -25,7 +25,8 @@ struct branch_flags { u64 cycles:16; u64 type:4; u64 new_type:4; - u64 reserved:36; + u64 priv:3; + u64 reserved:33; }; }; }; diff --git a/tools/perf/util/parse-branch-options.c b/tools/perf/util/parse-branch-options.c index bb4aa88c50a82..00588b9db474e 100644 --- a/tools/perf/util/parse-branch-options.c +++ b/tools/perf/util/parse-branch-options.c @@ -32,6 +32,7 @@ static const struct branch_mode branch_modes[] = { BRANCH_OPT("call", PERF_SAMPLE_BRANCH_CALL), BRANCH_OPT("save_type", PERF_SAMPLE_BRANCH_TYPE_SAVE), BRANCH_OPT("stack", PERF_SAMPLE_BRANCH_CALL_STACK), + BRANCH_OPT("priv", PERF_SAMPLE_BRANCH_PRIV_SAVE), BRANCH_END }; diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c index 98af3fa4ea353..4b0db27b71991 100644 --- a/tools/perf/util/perf_event_attr_fprintf.c +++ b/tools/perf/util/perf_event_attr_fprintf.c @@ -52,7 +52,7 @@ static void __p_branch_sample_type(char *buf, size_t size, u64 value) bit_name(ABORT_TX), bit_name(IN_TX), bit_name(NO_TX), bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP), bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES), - bit_name(TYPE_SAVE), bit_name(HW_INDEX), + bit_name(TYPE_SAVE), bit_name(HW_INDEX), bit_name(PRIV_SAVE), { .name = NULL, } }; #undef bit_name -- GitLab From fb42f8b729f431b53acfaa8bf1b4d43b98e62e14 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual <anshuman.khandual@arm.com> Date: Wed, 24 Aug 2022 10:18:22 +0530 Subject: [PATCH 1213/2223] perf branch: Add PERF_BR_NEW_ARCH_[N] map for BRBE on arm64 platform This updates the perf tool with arch specific branch type classification used for BRBE on arm64 platform as added in the kernel earlier. 
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Robin Murphy <robin.murphy@arm.com> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Will Deacon <will@kernel.org> Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20220824044822.70230-9-anshuman.khandual@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/include/uapi/linux/perf_event.h | 6 ++++++ tools/perf/util/branch.c | 13 +++++++++++++ 2 files changed, 19 insertions(+) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 51168e22f4d87..49cb2355efc0c 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -282,6 +282,12 @@ enum { PERF_BR_PRIV_HV = 3, }; +#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 +#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 +#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 +#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 +#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ diff --git a/tools/perf/util/branch.c b/tools/perf/util/branch.c index 675cbbe80ce37..6d38238481d32 100644 --- a/tools/perf/util/branch.c +++ b/tools/perf/util/branch.c @@ -45,11 +45,24 @@ const char *branch_new_type_name(int new_type) "FAULT_ALGN", "FAULT_DATA", "FAULT_INST", +/* + * TODO: This switch should happen on 'session->header.env.arch' + * instead, because an arm64 platform perf recording could be + * opened for analysis on other platforms as well. 
+ */ +#ifdef __aarch64__ + "ARM64_FIQ", + "ARM64_DEBUG_HALT", + "ARM64_DEBUG_EXIT", + "ARM64_DEBUG_INST", + "ARM64_DEBUG_DATA" +#else "ARCH_1", "ARCH_2", "ARCH_3", "ARCH_4", "ARCH_5" +#endif }; if (new_type >= 0 && new_type < PERF_BR_NEW_MAX) -- GitLab From 9dcc22efff4b699a12661f34231a96506338da2e Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Tue, 30 Aug 2022 09:48:39 -0700 Subject: [PATCH 1214/2223] perf smt: Tidy header guard add SPDX Make the header guard consistent with others. Signed-off-by: Ian Rogers <irogers@google.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Cc: florian fischer <florian.fischer@muhq.space> Link: http://lore.kernel.org/lkml/20220830164846.401143-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/smt.c | 1 + tools/perf/util/smt.h | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/smt.c b/tools/perf/util/smt.c index 2b0a36ebf27a3..8fed03283c85d 100644 --- a/tools/perf/util/smt.c +++ b/tools/perf/util/smt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include <stdio.h> #include <stdlib.h> #include <unistd.h> diff --git a/tools/perf/util/smt.h b/tools/perf/util/smt.h index b8414b7bcbc87..a98d65808f6a8 100644 --- a/tools/perf/util/smt.h +++ b/tools/perf/util/smt.h @@ -1,6 +1,7 @@ -#ifndef SMT_H 
-#define SMT_H 1 +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __SMT_H +#define __SMT_H 1 int smt_on(void); -#endif +#endif /* __SMT_H */ -- GitLab From a8d68cc45799dc7bc8065fd7bb2405335f7d4fa6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Thu, 1 Sep 2022 12:57:35 -0700 Subject: [PATCH 1215/2223] perf tools: Print LOST read format in the verbose mode So that we can see it with: $ perf record -vv pwd ... perf_event_attr: size 128 { sample_period, sample_freq } 4000 sample_type IP|TID|TIME|PERIOD read_format ID|LOST disabled 1 inherit 1 exclude_kernel 1 freq 1 enable_on_exec 1 precise_ip 3 sample_id_all 1 exclude_guest 1 Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220901195739.668604-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/perf_event_attr_fprintf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c index 4b0db27b71991..7e5e7b30510df 100644 --- a/tools/perf/util/perf_event_attr_fprintf.c +++ b/tools/perf/util/perf_event_attr_fprintf.c @@ -64,7 +64,7 @@ static void __p_read_format(char *buf, size_t size, u64 value) #define bit_name(n) { PERF_FORMAT_##n, #n } struct bit_names bits[] = { bit_name(TOTAL_TIME_ENABLED), bit_name(TOTAL_TIME_RUNNING), - bit_name(ID), bit_name(GROUP), + bit_name(ID), bit_name(GROUP), bit_name(LOST), { .name = NULL, } }; #undef bit_name -- GitLab From e17f343c3ba1b317574a4218c631547bb09e72bf Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Thu, 1 Sep 2022 12:57:36 -0700 Subject: [PATCH 1216/2223] perf record: Set PERF_FORMAT_LOST by default As we want to see the number of lost samples in the perf 
report, set the LOST format when it configs evsel. On old kernels, it'd fallback to disable it. Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220901195739.668604-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/evsel.c | 10 +++++++++- tools/perf/util/evsel.h | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index e1bc76ece1178..5776bfa70f11e 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1161,6 +1161,7 @@ void evsel__config(struct evsel *evsel, struct record_opts *opts, attr->sample_id_all = perf_missing_features.sample_id_all ? 0 : 1; attr->inherit = !opts->no_inherit; attr->write_backward = opts->overwrite ? 1 : 0; + attr->read_format = PERF_FORMAT_LOST; evsel__set_sample_bit(evsel, IP); evsel__set_sample_bit(evsel, TID); @@ -1856,6 +1857,8 @@ static int __evsel__prepare_open(struct evsel *evsel, struct perf_cpu_map *cpus, static void evsel__disable_missing_features(struct evsel *evsel) { + if (perf_missing_features.read_lost) + evsel->core.attr.read_format &= ~PERF_FORMAT_LOST; if (perf_missing_features.weight_struct) { evsel__set_sample_bit(evsel, WEIGHT); evsel__reset_sample_bit(evsel, WEIGHT_STRUCT); @@ -1907,7 +1910,12 @@ bool evsel__detect_missing_features(struct evsel *evsel) * Must probe features in the order they were added to the * perf_event_attr interface. 
*/ - if (!perf_missing_features.weight_struct && + if (!perf_missing_features.read_lost && + (evsel->core.attr.read_format & PERF_FORMAT_LOST)) { + perf_missing_features.read_lost = true; + pr_debug2("switching off PERF_FORMAT_LOST support\n"); + return true; + } else if (!perf_missing_features.weight_struct && (evsel->core.attr.sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) { perf_missing_features.weight_struct = true; pr_debug2("switching off weight struct support\n"); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index d927713b513e4..989865e16aadd 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -188,6 +188,7 @@ struct perf_missing_features { bool data_page_size; bool code_page_size; bool weight_struct; + bool read_lost; }; extern struct perf_missing_features perf_missing_features; -- GitLab From e3a23261ad06d5986dce0f17a2cfb4d22d493385 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Thu, 1 Sep 2022 12:57:37 -0700 Subject: [PATCH 1217/2223] perf record: Read and inject LOST_SAMPLES events When there are lost samples, it can read the number of PERF_FORMAT_LOST and convert it to PERF_RECORD_LOST_SAMPLES and write to the data file at the end. 
Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220901195739.668604-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-record.c | 64 +++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index a91ead72fd413..741e763436caf 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -10,6 +10,7 @@ #include "util/build-id.h" #include <subcmd/parse-options.h> +#include <internal/xyarray.h> #include "util/parse-events.h" #include "util/config.h" @@ -1852,6 +1853,68 @@ record__switch_output(struct record *rec, bool at_exit) return fd; } +static void __record__read_lost_samples(struct record *rec, struct evsel *evsel, + struct perf_record_lost_samples *lost, + int cpu_idx, int thread_idx) +{ + struct perf_counts_values count; + struct perf_sample_id *sid; + struct perf_sample sample = {}; + int id_hdr_size; + + if (perf_evsel__read(&evsel->core, cpu_idx, thread_idx, &count) < 0) { + pr_err("read LOST count failed\n"); + return; + } + + if (count.lost == 0) + return; + + lost->lost = count.lost; + if (evsel->core.ids) { + sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx); + sample.id = sid->id; + } + + id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1), + evsel->core.attr.sample_type, &sample); + lost->header.size = sizeof(*lost) + id_hdr_size; + record__write(rec, NULL, lost, lost->header.size); +} + +static void record__read_lost_samples(struct record *rec) +{ + struct perf_session *session = rec->session; + struct perf_record_lost_samples *lost; + struct evsel *evsel; + + lost = zalloc(PERF_SAMPLE_MAX_SIZE); + if (lost == NULL) { + pr_debug("Memory 
allocation failed\n"); + return; + } + + lost->header.type = PERF_RECORD_LOST_SAMPLES; + + evlist__for_each_entry(session->evlist, evsel) { + struct xyarray *xy = evsel->core.sample_id; + + if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) || + xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) { + pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n"); + continue; + } + + for (int x = 0; x < xyarray__max_x(xy); x++) { + for (int y = 0; y < xyarray__max_y(xy); y++) { + __record__read_lost_samples(rec, evsel, lost, x, y); + } + } + } + free(lost); + +} + static volatile int workload_exec_errno; /* @@ -2714,6 +2777,7 @@ out_free_threads: if (rec->off_cpu) rec->bytes_written += off_cpu_write(rec->session); + record__read_lost_samples(rec); record__synthesize(rec, true); /* this will be recalculated during process_buildids() */ rec->samples = 0; -- GitLab From 75b37db096e30b12f1de88052a19b1a3fff62b5e Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Thu, 1 Sep 2022 12:57:38 -0700 Subject: [PATCH 1218/2223] perf hist: Add nr_lost_samples to hist_stats This is a preparation to display accurate lost sample counts for each evsel. 
Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220901195739.668604-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/events_stats.h | 1 + tools/perf/util/hist.c | 5 +++++ tools/perf/util/hist.h | 1 + 3 files changed, 7 insertions(+) diff --git a/tools/perf/util/events_stats.h b/tools/perf/util/events_stats.h index 040ab9d0a8037..8fecc9fbaecc4 100644 --- a/tools/perf/util/events_stats.h +++ b/tools/perf/util/events_stats.h @@ -47,6 +47,7 @@ struct hists_stats { u64 total_non_filtered_period; u32 nr_samples; u32 nr_non_filtered_samples; + u32 nr_lost_samples; }; void events_stats__inc(struct events_stats *stats, u32 type); diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 698add038cecd..8cab049f71191 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2335,6 +2335,11 @@ void hists__inc_nr_samples(struct hists *hists, bool filtered) hists->stats.nr_non_filtered_samples++; } +void hists__inc_nr_lost_samples(struct hists *hists, u32 lost) +{ + hists->stats.nr_lost_samples += lost; +} + static struct hist_entry *hists__add_dummy_entry(struct hists *hists, struct hist_entry *pair) { diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 508428b2c1b2a..c7a7a3fa0b879 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -201,6 +201,7 @@ void hists__reset_stats(struct hists *hists); void hists__inc_stats(struct hists *hists, struct hist_entry *h); void hists__inc_nr_events(struct hists *hists); void hists__inc_nr_samples(struct hists *hists, bool filtered); +void hists__inc_nr_lost_samples(struct hists *hists, u32 lost); size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, int max_cols, float min_pcnt, FILE *fp, 
-- GitLab From d7ba22d4a3fe0fb878d64263253a7d36bd0aac14 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Thu, 1 Sep 2022 12:57:39 -0700 Subject: [PATCH 1219/2223] perf report: Show per-event LOST SAMPLES stat Display lost samples with --stat (if not zero): $ perf report --stat Aggregated stats: TOTAL events: 64 COMM events: 2 ( 3.1%) EXIT events: 1 ( 1.6%) SAMPLE events: 26 (40.6%) MMAP2 events: 4 ( 6.2%) LOST_SAMPLES events: 1 ( 1.6%) ATTR events: 2 ( 3.1%) FINISHED_ROUND events: 1 ( 1.6%) ID_INDEX events: 1 ( 1.6%) THREAD_MAP events: 1 ( 1.6%) CPU_MAP events: 1 ( 1.6%) EVENT_UPDATE events: 2 ( 3.1%) TIME_CONV events: 1 ( 1.6%) FEATURE events: 20 (31.2%) FINISHED_INIT events: 1 ( 1.6%) cycles:uH stats: SAMPLE events: 14 LOST_SAMPLES events: 1 instructions:uH stats: SAMPLE events: 12 Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220901195739.668604-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-report.c | 17 +++++++++++++++++ tools/perf/util/hist.c | 10 +++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 91ed41cc7d884..8361890176c23 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -752,6 +752,22 @@ static int count_sample_event(struct perf_tool *tool __maybe_unused, return 0; } +static int count_lost_samples_event(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine __maybe_unused) +{ + struct report *rep = container_of(tool, struct report, tool); + struct evsel *evsel; + + evsel = evlist__id2evsel(rep->session->evlist, sample->id); + if (evsel) { + 
hists__inc_nr_lost_samples(evsel__hists(evsel), + event->lost_samples.lost); + } + return 0; +} + static int process_attr(struct perf_tool *tool __maybe_unused, union perf_event *event, struct evlist **pevlist); @@ -761,6 +777,7 @@ static void stats_setup(struct report *rep) memset(&rep->tool, 0, sizeof(rep->tool)); rep->tool.attr = process_attr; rep->tool.sample = count_sample_event; + rep->tool.lost_samples = count_lost_samples_event; rep->tool.no_warn = true; } diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 8cab049f71191..06f5dbf213ad1 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2683,12 +2683,16 @@ size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp, evlist__for_each_entry(evlist, pos) { struct hists *hists = evsel__hists(pos); - if (skip_empty && !hists->stats.nr_samples) + if (skip_empty && !hists->stats.nr_samples && !hists->stats.nr_lost_samples) continue; ret += fprintf(fp, "%s stats:\n", evsel__name(pos)); - ret += fprintf(fp, "%16s events: %10d\n", - "SAMPLE", hists->stats.nr_samples); + if (hists->stats.nr_samples) + ret += fprintf(fp, "%16s events: %10d\n", + "SAMPLE", hists->stats.nr_samples); + if (hists->stats.nr_lost_samples) + ret += fprintf(fp, "%16s events: %10d\n", + "LOST_SAMPLES", hists->stats.nr_lost_samples); } return ret; -- GitLab From b304c173e3fffc241bc51650980c8342db396bcb Mon Sep 17 00:00:00 2001 From: Nick Forrington <nick.forrington@arm.com> Date: Mon, 5 Sep 2022 12:40:24 +0100 Subject: [PATCH 1220/2223] perf vendor events: Add missing Neoverse V1 events Based on updated data from: https://github.com/ARM-software/data/blob/master/pmu/neoverse-v1.json which is based on PMU event descriptions from the Arm Neoverse V1 Technical Reference Manual. 
This adds the following missing events: ASE_INST_SPEC SVE_INST_SPEC SVE_PRED_SPEC SVE_PRED_EMPTY_SPEC SVE_PRED_FULL_SPEC SVE_PRED_PARTIAL_SPEC SVE_LDFF_SPEC SVE_LDFF_FAULT_SPEC FP_SCALE_OPS_SPEC FP_FIXED_OPS_SPEC Reviewed-by: John Garry <john.garry@huawei.com> Signed-off-by: Nick Forrington <nick.forrington@arm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mike Leach <mike.leach@linaro.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Will Deacon <will@kernel.org> Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20220905114024.7552-1-nick.forrington@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../arm64/arm/neoverse-v1/instruction.json | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/instruction.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/instruction.json index 25825e14c535b..e29b88fb7f24a 100644 --- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/instruction.json +++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/instruction.json @@ -85,5 +85,35 @@ }, { "ArchStdEvent": "RC_ST_SPEC" + }, + { + "ArchStdEvent": "ASE_INST_SPEC" + }, + { + "ArchStdEvent": "SVE_INST_SPEC" + }, + { + "ArchStdEvent": "SVE_PRED_SPEC" + }, + { + "ArchStdEvent": "SVE_PRED_EMPTY_SPEC" + }, + { + "ArchStdEvent": "SVE_PRED_FULL_SPEC" + }, + { + "ArchStdEvent": "SVE_PRED_PARTIAL_SPEC" + }, + { + "ArchStdEvent": "SVE_LDFF_SPEC" + }, + { + "ArchStdEvent": "SVE_LDFF_FAULT_SPEC" + }, + { + "ArchStdEvent": "FP_SCALE_OPS_SPEC" + }, + { + "ArchStdEvent": "FP_FIXED_OPS_SPEC" } ] -- GitLab From 4fb47c8c20ec851128a36f82295886d325920864 Mon Sep 17 00:00:00 2001 From: Shang XiaoJing <shangxiaojing@huawei.com> Date: Tue, 6 Sep 2022 11:29:04 +0800 Subject: [PATCH 1221/2223] perf tools: Add 
same_cmd_with_prefix() helper Wrap repeated code in helper function same_cmd_with_prefix for clarity. Signed-off-by: Shang XiaoJing <shangxiaojing@huawei.com> Acked-by: Jiri Olsa <jolsa@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220906032906.21395-2-shangxiaojing@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/perf.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/perf/perf.c b/tools/perf/perf.c index c21b3973641a0..7af135dea1cd8 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -99,10 +99,16 @@ struct pager_config { int val; }; +static bool same_cmd_with_prefix(const char *var, struct pager_config *c, + const char *header) +{ + return (strstarts(var, header) && !strcmp(var + strlen(header), c->cmd)); +} + static int pager_command_config(const char *var, const char *value, void *data) { struct pager_config *c = data; - if (strstarts(var, "pager.") && !strcmp(var + 6, c->cmd)) + if (same_cmd_with_prefix(var, c, "pager.")) c->val = perf_config_bool(var, value); return 0; } @@ -121,9 +127,9 @@ static int check_pager_config(const char *cmd) static int browser_command_config(const char *var, const char *value, void *data) { struct pager_config *c = data; - if (strstarts(var, "tui.") && !strcmp(var + 4, c->cmd)) + if (same_cmd_with_prefix(var, c, "tui.")) c->val = perf_config_bool(var, value); - if (strstarts(var, "gtk.") && !strcmp(var + 4, c->cmd)) + if (same_cmd_with_prefix(var, c, "gtk.")) c->val = perf_config_bool(var, value) ?
2 : 0; return 0; } -- GitLab From cf874a0165e4a6ea906db9e735d52ee50fdf760b Mon Sep 17 00:00:00 2001 From: Shang XiaoJing <shangxiaojing@huawei.com> Date: Tue, 6 Sep 2022 11:29:05 +0800 Subject: [PATCH 1222/2223] perf c2c: Add helpers to get counts of loads or stores Wrap repeated code in helper functions get_load_llc_misses, get_load_cache_hits. For consistency, helper function get_stores is wrapped as well. Reviewed-by: Leo Yan <leo.yan@linaro.org> Signed-off-by: Shang XiaoJing <shangxiaojing@huawei.com> Acked-by: Jiri Olsa <jolsa@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220906032906.21395-3-shangxiaojing@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-c2c.c | 65 +++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 438fc222e2138..f35a47b2dbe49 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -679,28 +679,35 @@ STAT_FN(ld_l2hit) STAT_FN(ld_llchit) STAT_FN(rmt_hit) -static uint64_t total_records(struct c2c_stats *stats) +static uint64_t get_load_llc_misses(struct c2c_stats *stats) { - uint64_t lclmiss, ldcnt, total; - - lclmiss = stats->lcl_dram + - stats->rmt_dram + - stats->rmt_hitm + - stats->rmt_hit; + return stats->lcl_dram + + stats->rmt_dram + + stats->rmt_hitm + + stats->rmt_hit; +} - ldcnt = lclmiss + - stats->ld_fbhit + - stats->ld_l1hit + - stats->ld_l2hit + - stats->ld_llchit + - stats->lcl_hitm; +static uint64_t get_load_cache_hits(struct c2c_stats *stats) +{ + return stats->ld_fbhit + + stats->ld_l1hit + + stats->ld_l2hit + + stats->ld_llchit + + stats->lcl_hitm; +} - total = ldcnt + - stats->st_l1hit + - stats->st_l1miss + - stats->st_na; +static uint64_t
get_stores(struct c2c_stats *stats) +{ + return stats->st_l1hit + + stats->st_l1miss + + stats->st_na; +} - return total; +static uint64_t total_records(struct c2c_stats *stats) +{ + return get_load_llc_misses(stats) + + get_load_cache_hits(stats) + + get_stores(stats); } static int @@ -737,21 +744,8 @@ tot_recs_cmp(struct perf_hpp_fmt *fmt __maybe_unused, static uint64_t total_loads(struct c2c_stats *stats) { - uint64_t lclmiss, ldcnt; - - lclmiss = stats->lcl_dram + - stats->rmt_dram + - stats->rmt_hitm + - stats->rmt_hit; - - ldcnt = lclmiss + - stats->ld_fbhit + - stats->ld_l1hit + - stats->ld_l2hit + - stats->ld_llchit + - stats->lcl_hitm; - - return ldcnt; + return get_load_llc_misses(stats) + + get_load_cache_hits(stats); } static int @@ -2376,10 +2370,7 @@ static void print_c2c__display_stats(FILE *out) int llc_misses; struct c2c_stats *stats = &c2c.hists.stats; - llc_misses = stats->lcl_dram + - stats->rmt_dram + - stats->rmt_hit + - stats->rmt_hitm; + llc_misses = get_load_llc_misses(stats); fprintf(out, "=================================================\n"); fprintf(out, " Trace Event Information \n"); -- GitLab From 016f2f9821bd5d056d454aefa603f8b4f7d0e0f0 Mon Sep 17 00:00:00 2001 From: ye xingchen <ye.xingchen@zte.com.cn> Date: Tue, 23 Aug 2022 07:56:05 +0000 Subject: [PATCH 1223/2223] perf callchain: Remove unneeded 'result' variable Return the value of scnprintf() directly instead of storing it in a redundant variable.
Reported-by: Zeal Robot <zealci@zte.com.cn> Signed-off-by: ye xingchen <ye.xingchen@zte.com.cn> Cc: Alexandre Truong <alexandre.truong@arm.com> Cc: Ian Rogers <irogers@google.com> Cc: James Clark <james.clark@arm.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/callchain.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 7e663673f79f9..a093a15f048fa 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -1307,24 +1307,16 @@ int callchain_branch_counts(struct callchain_root *root, static int count_pri64_printf(int idx, const char *str, u64 value, char *bf, int bfsize) { - int printed; - - printed = scnprintf(bf, bfsize, "%s%s:%" PRId64 "", (idx) ? " " : " (", str, value); - - return printed; + return scnprintf(bf, bfsize, "%s%s:%" PRId64 "", (idx) ? " " : " (", str, value); } static int count_float_printf(int idx, const char *str, float value, char *bf, int bfsize, float threshold) { - int printed; - if (threshold != 0.0 && value < threshold) return 0; - printed = scnprintf(bf, bfsize, "%s%s:%.1f%%", (idx) ? " " : " (", str, value); - - return printed; + return scnprintf(bf, bfsize, "%s%s:%.1f%%", (idx) ? " " : " (", str, value); } static int branch_to_str(char *bf, int bfsize, -- GitLab From c3ca8d44185cc2ac5ca75d2d38647979da5b0035 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 5 Sep 2022 10:34:19 +0300 Subject: [PATCH 1224/2223] perf tools: Add perf_config_scan() To simplify getting a single config value, add a function to scan a config variable. 
Reviewed-by: Andi Kleen <ak@linux.intel.com> Reviewed-by: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220905073424.3971-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/config.c | 31 +++++++++++++++++++++++++++++++ tools/perf/util/config.h | 1 + 2 files changed, 32 insertions(+) diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 60ce5908c6640..3f2ae19a1dd40 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -908,3 +908,34 @@ void set_buildid_dir(const char *dir) /* for communicating with external commands */ setenv("PERF_BUILDID_DIR", buildid_dir, 1); } + +struct perf_config_scan_data { + const char *name; + const char *fmt; + va_list args; + int ret; +}; + +static int perf_config_scan_cb(const char *var, const char *value, void *data) +{ + struct perf_config_scan_data *d = data; + + if (!strcmp(var, d->name)) + d->ret = vsscanf(value, d->fmt, d->args); + + return 0; +} + +int perf_config_scan(const char *name, const char *fmt, ...) +{ + struct perf_config_scan_data d = { + .name = name, + .fmt = fmt, + }; + + va_start(d.args, fmt); + perf_config(perf_config_scan_cb, &d); + va_end(d.args); + + return d.ret; +} diff --git a/tools/perf/util/config.h b/tools/perf/util/config.h index 2fd77aaff4d24..2e5e808928a55 100644 --- a/tools/perf/util/config.h +++ b/tools/perf/util/config.h @@ -29,6 +29,7 @@ typedef int (*config_fn_t)(const char *, const char *, void *); int perf_default_config(const char *, const char *, void *); int perf_config(config_fn_t fn, void *); +int perf_config_scan(const char *name, const char *fmt, ...) 
__scanf(2, 3); int perf_config_set(struct perf_config_set *set, config_fn_t fn, void *data); int perf_config_int(int *dest, const char *, const char *); -- GitLab From a7fdd30a22448f17e942436b9db2a94b48218eb6 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 5 Sep 2022 10:34:20 +0300 Subject: [PATCH 1225/2223] perf auxtrace: Add itrace option flag d+e to log on error Add flag +e to the itrace d (decoder debug log) option to get output only on decoding errors. The log can be very big so reducing the output to where there are decoding errors can be useful for analyzing errors. By default, the log size in that case is 16384 bytes, but can be altered by perf config e.g. perf config itrace.debug-log-buffer-size=30000 Reviewed-by: Andi Kleen <ak@linux.intel.com> Reviewed-by: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220905073424.3971-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/Documentation/itrace.txt | 1 + tools/perf/Documentation/perf-config.txt | 7 +++++++ tools/perf/util/auxtrace.c | 13 +++++++++++++ tools/perf/util/auxtrace.h | 3 +++ 4 files changed, 24 insertions(+) diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt index 6b189669c450e..0916bbfe64cb7 100644 --- a/tools/perf/Documentation/itrace.txt +++ b/tools/perf/Documentation/itrace.txt @@ -64,6 +64,7 @@ debug messages will or will not be logged. Each flag must be preceded by either '+' or '-'. The flags are: a all perf events + e output only on errors (size configurable - see linkperf:perf-config[1]) o output to stdout If supported, the 'q' option may be repeated to increase the effect. 
diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 0420e71698ee4..39c890ead2dc0 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -729,6 +729,13 @@ auxtrace.*:: If the directory does not exist or has the wrong file type, the current directory is used. +itrace.*:: + + debug-log-buffer-size:: + Log size in bytes to output when using the option --itrace=d+e + Refer 'itrace' option of linkperf:perf-script[1] or + linkperf:perf-report[1]. The default is 16384. + daemon.*:: daemon.base:: diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 6edab8a16de6a..b59c278fe9ede 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -26,6 +26,7 @@ #include <linux/list.h> #include <linux/zalloc.h> +#include "config.h" #include "evlist.h" #include "dso.h" #include "map.h" @@ -1434,6 +1435,16 @@ static int get_flags(const char **ptr, unsigned int *plus_flags, unsigned int *m } } +#define ITRACE_DFLT_LOG_ON_ERROR_SZ 16384 + +static unsigned int itrace_log_on_error_size(void) +{ + unsigned int sz = 0; + + perf_config_scan("itrace.debug-log-buffer-size", "%u", &sz); + return sz ?: ITRACE_DFLT_LOG_ON_ERROR_SZ; +} + /* * Please check tools/perf/Documentation/perf-script.txt for information * about the options parsed here, which is introduced after this cset, @@ -1532,6 +1543,8 @@ int itrace_do_parse_synth_opts(struct itrace_synth_opts *synth_opts, if (get_flags(&p, &synth_opts->log_plus_flags, &synth_opts->log_minus_flags)) goto out_err; + if (synth_opts->log_plus_flags & AUXTRACE_LOG_FLG_ON_ERROR) + synth_opts->log_on_error_size = itrace_log_on_error_size(); break; case 'c': synth_opts->branches = true; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 6a4fbfd34c6ba..cb8e0a01abb6e 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -60,6 +60,7 @@ enum itrace_period_type { #define 
AUXTRACE_ERR_FLG_DATA_LOST (1 << ('l' - 'a')) #define AUXTRACE_LOG_FLG_ALL_PERF_EVTS (1 << ('a' - 'a')) +#define AUXTRACE_LOG_FLG_ON_ERROR (1 << ('e' - 'a')) #define AUXTRACE_LOG_FLG_USE_STDOUT (1 << ('o' - 'a')) /** @@ -110,6 +111,7 @@ enum itrace_period_type { * @log_plus_flags: flags to affect what is logged * @log_minus_flags: flags to affect what is logged * @quick: quicker (less detailed) decoding + * @log_on_error_size: size of log to keep for outputting log only on errors */ struct itrace_synth_opts { bool set; @@ -155,6 +157,7 @@ struct itrace_synth_opts { unsigned int log_plus_flags; unsigned int log_minus_flags; unsigned int quick; + unsigned int log_on_error_size; }; /** -- GitLab From 52de6aacbe3dc498456a565a85adb2b35f2d05b6 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 5 Sep 2022 10:34:21 +0300 Subject: [PATCH 1226/2223] perf intel-pt: Improve man page layout slightly Improve man page layout slightly by adding blank lines. Reviewed-by: Andi Kleen <ak@linux.intel.com> Reviewed-by: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220905073424.3971-4-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/Documentation/perf-intel-pt.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt index 3dc3f0ccbd513..d5ddb968bcf4b 100644 --- a/tools/perf/Documentation/perf-intel-pt.txt +++ b/tools/perf/Documentation/perf-intel-pt.txt @@ -943,12 +943,15 @@ event packets are recorded only if the "pwr_evt" config term was used. Refer to the config terms section above. The power events record information about C-state changes, whereas CBR is indicative of CPU frequency. 
perf script "event,synth" fields display information like this: + cbr: cbr: 22 freq: 2189 MHz (200%) mwait: hints: 0x60 extensions: 0x1 pwre: hw: 0 cstate: 2 sub-cstate: 0 exstop: ip: 1 pwrx: deepest cstate: 2 last cstate: 2 wake reason: 0x4 + Where: + "cbr" includes the frequency and the percentage of maximum non-turbo "mwait" shows mwait hints and extensions "pwre" shows C-state transitions (to a C-state deeper than C0) and @@ -956,6 +959,7 @@ Where: "exstop" indicates execution stopped and whether the IP was recorded exactly, "pwrx" indicates return to C0 + For more details refer to the Intel 64 and IA-32 Architectures Software Developer Manuals. @@ -969,8 +973,10 @@ are quite important. Users must know if what they are seeing is a complete picture or not. The "e" option may be followed by flags which affect what errors will or will not be reported. Each flag must be preceded by either '+' or '-'. The flags supported by Intel PT are: + -o Suppress overflow errors -l Suppress trace data lost errors + For example, for errors but not overflow or data lost errors: --itrace=e-o-l @@ -980,9 +986,11 @@ decoded packets and instructions. Note that this option slows down the decoder and that the resulting file may be very large. The "d" option may be followed by flags which affect what debug messages will or will not be logged. Each flag must be preceded by either '+' or '-'. The flags support by Intel PT are: + -a Suppress logging of perf events +a Log all perf events +o Output to stdout instead of "intel_pt.log" + By default, logged perf events are filtered by any specified time ranges, but flag +a overrides that. -- GitLab From 50d7620b27d19bfa4cc12764d27c272f2ee3e28a Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 5 Sep 2022 10:34:22 +0300 Subject: [PATCH 1227/2223] perf intel-pt: Improve object code read error message The offset is more readable in hex instead of decimal. 
Reviewed-by: Andi Kleen <ak@linux.intel.com> Reviewed-by: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220905073424.3971-5-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/intel-pt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index d5e9fc8106dd8..c01ff8001501c 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -842,7 +842,8 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, offset, buf, INTEL_PT_INSN_BUF_SZ); if (len <= 0) { - intel_pt_log("ERROR: failed to read at %" PRIu64 " ", offset); + intel_pt_log("ERROR: failed to read at offset %#" PRIx64 " ", + offset); if (intel_pt_enable_logging) dso__fprintf(al.map->dso, intel_pt_log_fp()); return -EINVAL; -- GitLab From 65aee81afe7f6a54e2fb2de59e1d6cd47dcf8eb9 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 5 Sep 2022 10:34:23 +0300 Subject: [PATCH 1228/2223] perf intel-pt: Support itrace option flag d+e to log on error Pass d+e option and log size via intel_pt_log_enable(). Allocate a buffer for log messages and provide intel_pt_log_dump_buf() to dump and reset the buffer upon decoder errors. 
Example: $ sudo perf record -e intel_pt// sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.094 MB perf.data ] $ sudo perf config itrace.debug-log-buffer-size=300 $ sudo perf script --itrace=ed+e+o | head -20 Dumping debug log buffer (first line may be sliced) Other ffffffff96ca22f6: 48 89 e5 Other ffffffff96ca22f9: 65 48 8b 05 ff e0 38 69 Other ffffffff96ca2301: 48 3d c0 a5 c1 98 Other ffffffff96ca2307: 74 08 Jcc +8 ffffffff96ca2311: 5d Other ffffffff96ca2312: c3 Ret ERROR: Bad RET compression (TNT=N) at 0xffffffff96ca2312 End of debug log buffer dump instruction trace error type 1 time 15913.537143482 cpu 5 pid 36292 tid 36292 ip 0xffffffff96ca2312 code 6: Trace doesn't match instruction Dumping debug log buffer (first line may be sliced) Other ffffffff96ce7fe9: f6 47 2e 20 Other ffffffff96ce7fed: 74 11 Jcc +17 ffffffff96ce7fef: 48 8b 87 28 0a 00 00 Other ffffffff96ce7ff6: 5d Other ffffffff96ce7ff7: 48 8b 40 18 Other ffffffff96ce7ffb: c3 Ret ERROR: Bad RET compression (TNT=N) at 0xffffffff96ce7ffb Warning: 8 instruction trace errors Reviewed-by: Andi Kleen <ak@linux.intel.com> Reviewed-by: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220905073424.3971-6-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/Documentation/perf-intel-pt.txt | 5 +- .../perf/util/intel-pt-decoder/intel-pt-log.c | 94 ++++++++++++++++++- .../perf/util/intel-pt-decoder/intel-pt-log.h | 3 +- tools/perf/util/intel-pt.c | 20 +++- 4 files changed, 117 insertions(+), 5 deletions(-) diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt index d5ddb968bcf4b..92464a5d7eafd 100644 --- a/tools/perf/Documentation/perf-intel-pt.txt +++ b/tools/perf/Documentation/perf-intel-pt.txt @@ -989,10 +989,13 @@ must be preceded by 
either '+' or '-'. The flags support by Intel PT are: -a Suppress logging of perf events +a Log all perf events + +e Output only on decoding errors (size configurable) +o Output to stdout instead of "intel_pt.log" By default, logged perf events are filtered by any specified time ranges, but -flag +a overrides that. +flag +a overrides that. The +e flag can be useful for analyzing errors. By +default, the log size in that case is 16384 bytes, but can be altered by +linkperf:perf-config[1] e.g. perf config itrace.debug-log-buffer-size=30000 In addition, the period of the "instructions" event can be specified. e.g. diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-log.c b/tools/perf/util/intel-pt-decoder/intel-pt-log.c index 5f5dfc8753f33..24684edc49f7c 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-log.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-log.c @@ -5,12 +5,16 @@ */ #include <stdio.h> +#include <stdlib.h> #include <stdint.h> #include <inttypes.h> #include <stdarg.h> #include <stdbool.h> #include <string.h> +#include <linux/zalloc.h> +#include <linux/kernel.h> + #include "intel-pt-log.h" #include "intel-pt-insn-decoder.h" @@ -18,18 +22,33 @@ #define MAX_LOG_NAME 256 +#define DFLT_BUF_SZ (16 * 1024) + +struct log_buf { + char *buf; + size_t buf_sz; + size_t head; + bool wrapped; + FILE *backend; +}; + static FILE *f; static char log_name[MAX_LOG_NAME]; bool intel_pt_enable_logging; +static bool intel_pt_dump_log_on_error; +static unsigned int intel_pt_log_on_error_size; +static struct log_buf log_buf; void *intel_pt_log_fp(void) { return f; } -void intel_pt_log_enable(void) +void intel_pt_log_enable(bool dump_log_on_error, unsigned int log_on_error_size) { intel_pt_enable_logging = true; + intel_pt_dump_log_on_error = dump_log_on_error; + intel_pt_log_on_error_size = log_on_error_size; } void intel_pt_log_disable(void) @@ -74,6 +93,77 @@ static void intel_pt_print_no_data(uint64_t pos, int indent) fprintf(f, " "); } +static ssize_t 
log_buf__write(void *cookie, const char *buf, size_t size) +{ + struct log_buf *b = cookie; + size_t sz = size; + + if (!b->buf) + return size; + + while (sz) { + size_t space = b->buf_sz - b->head; + size_t n = min(space, sz); + + memcpy(b->buf + b->head, buf, n); + sz -= n; + buf += n; + b->head += n; + if (sz && b->head >= b->buf_sz) { + b->head = 0; + b->wrapped = true; + } + } + return size; +} + +static int log_buf__close(void *cookie) +{ + struct log_buf *b = cookie; + + zfree(&b->buf); + return 0; +} + +static FILE *log_buf__open(struct log_buf *b, FILE *backend, unsigned int sz) +{ + cookie_io_functions_t fns = { + .write = log_buf__write, + .close = log_buf__close, + }; + FILE *file; + + memset(b, 0, sizeof(*b)); + b->buf_sz = sz; + b->buf = malloc(b->buf_sz); + b->backend = backend; + file = fopencookie(b, "a", fns); + if (!file) + zfree(&b->buf); + return file; +} + +static void log_buf__dump(struct log_buf *b) +{ + if (!b->buf) + return; + + fflush(f); + fprintf(b->backend, "Dumping debug log buffer (first line may be sliced)\n"); + if (b->wrapped) + fwrite(b->buf + b->head, b->buf_sz - b->head, 1, b->backend); + fwrite(b->buf, b->head, 1, b->backend); + fprintf(b->backend, "End of debug log buffer dump\n"); + + b->head = 0; + b->wrapped = false; +} + +void intel_pt_log_dump_buf(void) +{ + log_buf__dump(&log_buf); +} + static int intel_pt_log_open(void) { if (!intel_pt_enable_logging) @@ -86,6 +176,8 @@ static int intel_pt_log_open(void) f = fopen(log_name, "w+"); else f = stdout; + if (f && intel_pt_dump_log_on_error) + f = log_buf__open(&log_buf, f, intel_pt_log_on_error_size); if (!f) { intel_pt_enable_logging = false; return -1; diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-log.h b/tools/perf/util/intel-pt-decoder/intel-pt-log.h index d900aab24b211..354d7d23fc817 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-log.h +++ b/tools/perf/util/intel-pt-decoder/intel-pt-log.h @@ -14,9 +14,10 @@ struct intel_pt_pkt; void 
*intel_pt_log_fp(void); -void intel_pt_log_enable(void); +void intel_pt_log_enable(bool dump_log_on_error, unsigned int log_on_error_size); void intel_pt_log_disable(void); void intel_pt_log_set_name(const char *name); +void intel_pt_log_dump_buf(void); void __intel_pt_log_packet(const struct intel_pt_pkt *packet, int pkt_len, uint64_t pos, const unsigned char *buf); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index c01ff8001501c..b34cb3dec1aac 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -2419,6 +2419,8 @@ static int intel_pt_synth_error(struct intel_pt *pt, int code, int cpu, pid_t pid, pid_t tid, u64 ip, u64 timestamp, pid_t machine_pid, int vcpu) { + bool dump_log_on_error = pt->synth_opts.log_plus_flags & AUXTRACE_LOG_FLG_ON_ERROR; + bool log_on_stdout = pt->synth_opts.log_plus_flags & AUXTRACE_LOG_FLG_USE_STDOUT; union perf_event event; char msg[MAX_AUXTRACE_ERROR_MSG]; int err; @@ -2438,6 +2440,16 @@ static int intel_pt_synth_error(struct intel_pt *pt, int code, int cpu, code, cpu, pid, tid, ip, msg, timestamp, machine_pid, vcpu); + if (intel_pt_enable_logging && !log_on_stdout) { + FILE *fp = intel_pt_log_fp(); + + if (fp) + perf_event__fprintf_auxtrace_error(&event, fp); + } + + if (code != INTEL_PT_ERR_LOST && dump_log_on_error) + intel_pt_log_dump_buf(); + err = perf_session__deliver_synth_event(pt->session, &event, NULL); if (err) pr_err("Intel Processor Trace: failed to deliver error event, error %d\n", @@ -4272,8 +4284,12 @@ int intel_pt_process_auxtrace_info(union perf_event *event, goto err_delete_thread; } - if (pt->synth_opts.log) - intel_pt_log_enable(); + if (pt->synth_opts.log) { + bool log_on_error = pt->synth_opts.log_plus_flags & AUXTRACE_LOG_FLG_ON_ERROR; + unsigned int log_on_error_size = pt->synth_opts.log_on_error_size; + + intel_pt_log_enable(log_on_error, log_on_error_size); + } /* Maximum non-turbo ratio is TSC freq / 100 MHz */ if (pt->tc.time_mult) { -- GitLab From 
3b7ae354c1fcb783848b46e1c1140a66ba742672 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 5 Sep 2022 10:34:24 +0300 Subject: [PATCH 1229/2223] perf intel-pt: Remove first line of log dumped on error Instead of printing "(first line may be sliced)", always remove the first line of the debug log if the buffer has wrapped when dumping on error. Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Reviewed-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <ak@linux.intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220905073424.3971-7-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../perf/util/intel-pt-decoder/intel-pt-log.c | 33 ++++++++++++++++--- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-log.c b/tools/perf/util/intel-pt-decoder/intel-pt-log.c index 24684edc49f7c..ef55d6232cf0c 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-log.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-log.c @@ -143,16 +143,39 @@ static FILE *log_buf__open(struct log_buf *b, FILE *backend, unsigned int sz) return file; } +static bool remove_first_line(const char **p, size_t *n) +{ + for (; *n && **p != '\n'; ++*p, --*n) + ; + if (*n) { + *p += 1; + *n -= 1; + return true; + } + return false; +} + +static void write_lines(const char *p, size_t n, FILE *fp, bool *remove_first) +{ + if (*remove_first) + *remove_first = !remove_first_line(&p, &n); + fwrite(p, n, 1, fp); +} + static void log_buf__dump(struct log_buf *b) { + bool remove_first = false; + if (!b->buf) return; - fflush(f); - fprintf(b->backend, "Dumping debug log buffer (first line may be sliced)\n"); - if (b->wrapped) - fwrite(b->buf + b->head, b->buf_sz - b->head, 1, b->backend); - fwrite(b->buf, b->head, 1, b->backend); + fflush(f); /* Could update b->head and b->wrapped */ + fprintf(b->backend, "Dumping debug log 
buffer\n"); + if (b->wrapped) { + remove_first = true; + write_lines(b->buf + b->head, b->buf_sz - b->head, b->backend, &remove_first); + } + write_lines(b->buf, b->head, b->backend, &remove_first); fprintf(b->backend, "End of debug log buffer dump\n"); b->head = 0; -- GitLab From c581e46ba2988a6198b07bcf264beab1895a28ac Mon Sep 17 00:00:00 2001 From: Nick Forrington <nick.forrington@arm.com> Date: Thu, 8 Sep 2022 12:25:18 +0100 Subject: [PATCH 1230/2223] perf vendor events arm64: Move REMOTE_ACCESS to "memory" category Move REMOTE_ACCESS event from other.json to memory.json for Neoverse CPUs. This is consistent with other Arm (Cortex) CPUs. Reviewed-by: John Garry <john.garry@huawei.com> Signed-off-by: Nick Forrington <nick.forrington@arm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mike Leach <mike.leach@linaro.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Will Deacon <will@kernel.org> Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20220908112519.64614-1-nick.forrington@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json | 3 +++ .../perf/pmu-events/arch/arm64/arm/cortex-a76-n1/other.json | 5 ----- tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/memory.json | 3 +++ tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/other.json | 5 ----- tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/memory.json | 3 +++ tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/other.json | 5 ----- 6 files changed, 9 insertions(+), 15 deletions(-) delete mode 100644 tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/other.json delete mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/other.json delete mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/other.json diff --git 
a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json index 20a929e7728d4..5bed2514b245e 100644 --- a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json +++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json @@ -3,6 +3,9 @@ "PublicDescription": "This event counts memory accesses due to load or store instructions. This event counts the sum of MEM_ACCESS_RD and MEM_ACCESS_WR.", "ArchStdEvent": "MEM_ACCESS" }, + { + "ArchStdEvent": "REMOTE_ACCESS" + }, { "ArchStdEvent": "MEM_ACCESS_RD" }, diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/other.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/other.json deleted file mode 100644 index 20d8365756c5f..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/other.json +++ /dev/null @@ -1,5 +0,0 @@ -[ - { - "ArchStdEvent": "REMOTE_ACCESS" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/memory.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/memory.json index e522113aeb961..7b2b21ac150f5 100644 --- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/memory.json +++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/memory.json @@ -2,6 +2,9 @@ { "ArchStdEvent": "MEM_ACCESS" }, + { + "ArchStdEvent": "REMOTE_ACCESS" + }, { "ArchStdEvent": "MEM_ACCESS_RD" }, diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/other.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/other.json deleted file mode 100644 index 20d8365756c5f..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/other.json +++ /dev/null @@ -1,5 +0,0 @@ -[ - { - "ArchStdEvent": "REMOTE_ACCESS" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/memory.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/memory.json index e3d08f1f7c92c..5aff6e93c1adb 100644 --- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/memory.json +++ 
b/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/memory.json @@ -2,6 +2,9 @@ { "ArchStdEvent": "MEM_ACCESS" }, + { + "ArchStdEvent": "REMOTE_ACCESS" + }, { "ArchStdEvent": "MEM_ACCESS_RD" }, diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/other.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/other.json deleted file mode 100644 index 20d8365756c5f..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/other.json +++ /dev/null @@ -1,5 +0,0 @@ -[ - { - "ArchStdEvent": "REMOTE_ACCESS" - } -] -- GitLab From e3e7572fa8062b72385575bf04170621a4a8c447 Mon Sep 17 00:00:00 2001 From: Shang XiaoJing <shangxiaojing@huawei.com> Date: Thu, 8 Sep 2022 10:11:38 +0800 Subject: [PATCH 1231/2223] perf trace: Use zalloc() to save initialization of syscall_stats As most members of syscall_stats are set to 0 in thread__update_stats, use zalloc() directly. Signed-off-by: Shang XiaoJing <shangxiaojing@huawei.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220908021141.27134-2-shangxiaojing@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-trace.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 0bd9d01c0df9d..3ecc31375f90e 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2173,13 +2173,10 @@ static void thread__update_stats(struct thread *thread, struct thread_trace *ttr stats = inode->priv; if (stats == NULL) { - stats = malloc(sizeof(*stats)); + stats = zalloc(sizeof(*stats)); if (stats == NULL) return; - stats->nr_failures = 0; - stats->max_errno = 0; - stats->errnos = NULL; init_stats(&stats->stats); inode->priv = stats; } -- GitLab From
0f405f878bc15674e38648121e124a93d0cef9c3 Mon Sep 17 00:00:00 2001 From: Shang XiaoJing <shangxiaojing@huawei.com> Date: Thu, 8 Sep 2022 10:11:39 +0800 Subject: [PATCH 1232/2223] perf lock: Add get_key_by_aggr_mode helper Wrap repeated code in helper functions get_key_by_aggr_mode and get_key_by_aggr_mode_simple, which assign the value to key based on aggregation mode. Note that callers which do not support LOCK_AGGR_CALLER should call get_key_by_aggr_mode_simple directly. Signed-off-by: Shang XiaoJing <shangxiaojing@huawei.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220908021141.27134-3-shangxiaojing@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-lock.c | 129 ++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 76 deletions(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index e79ef614105c8..52a6a10a610cb 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -560,29 +560,50 @@ enum acquire_flags { READ_LOCK = 2, }; -static int report_lock_acquire_event(struct evsel *evsel, - struct perf_sample *sample) +static int get_key_by_aggr_mode_simple(u64 *key, u64 addr, u32 tid) { - struct lock_stat *ls; - struct thread_stat *ts; - struct lock_seq_stat *seq; - const char *name = evsel__strval(evsel, sample, "name"); - u64 addr = evsel__intval(evsel, sample, "lockdep_addr"); - int flag = evsel__intval(evsel, sample, "flags"); - u64 key; - switch (aggr_mode) { case LOCK_AGGR_ADDR: - key = addr; + *key = addr; break; case LOCK_AGGR_TASK: - key = sample->tid; + *key = tid; break; case LOCK_AGGR_CALLER: default: pr_err("Invalid aggregation mode: %d\n", aggr_mode); return -EINVAL; } + return 0; +} + +static u64
callchain_id(struct evsel *evsel, struct perf_sample *sample); + +static int get_key_by_aggr_mode(u64 *key, u64 addr, struct evsel *evsel, + struct perf_sample *sample) +{ + if (aggr_mode == LOCK_AGGR_CALLER) { + *key = callchain_id(evsel, sample); + return 0; + } + return get_key_by_aggr_mode_simple(key, addr, sample->tid); +} + +static int report_lock_acquire_event(struct evsel *evsel, + struct perf_sample *sample) +{ + struct lock_stat *ls; + struct thread_stat *ts; + struct lock_seq_stat *seq; + const char *name = evsel__strval(evsel, sample, "name"); + u64 addr = evsel__intval(evsel, sample, "lockdep_addr"); + int flag = evsel__intval(evsel, sample, "flags"); + u64 key; + int ret; + + ret = get_key_by_aggr_mode_simple(&key, addr, sample->tid); + if (ret < 0) + return ret; ls = lock_stat_findnew(key, name, 0); if (!ls) @@ -653,19 +674,11 @@ static int report_lock_acquired_event(struct evsel *evsel, const char *name = evsel__strval(evsel, sample, "name"); u64 addr = evsel__intval(evsel, sample, "lockdep_addr"); u64 key; + int ret; - switch (aggr_mode) { - case LOCK_AGGR_ADDR: - key = addr; - break; - case LOCK_AGGR_TASK: - key = sample->tid; - break; - case LOCK_AGGR_CALLER: - default: - pr_err("Invalid aggregation mode: %d\n", aggr_mode); - return -EINVAL; - } + ret = get_key_by_aggr_mode_simple(&key, addr, sample->tid); + if (ret < 0) + return ret; ls = lock_stat_findnew(key, name, 0); if (!ls) @@ -726,19 +739,11 @@ static int report_lock_contended_event(struct evsel *evsel, const char *name = evsel__strval(evsel, sample, "name"); u64 addr = evsel__intval(evsel, sample, "lockdep_addr"); u64 key; + int ret; - switch (aggr_mode) { - case LOCK_AGGR_ADDR: - key = addr; - break; - case LOCK_AGGR_TASK: - key = sample->tid; - break; - case LOCK_AGGR_CALLER: - default: - pr_err("Invalid aggregation mode: %d\n", aggr_mode); - return -EINVAL; - } + ret = get_key_by_aggr_mode_simple(&key, addr, sample->tid); + if (ret < 0) + return ret; ls = lock_stat_findnew(key, name, 
0); if (!ls) @@ -792,19 +797,11 @@ static int report_lock_release_event(struct evsel *evsel, const char *name = evsel__strval(evsel, sample, "name"); u64 addr = evsel__intval(evsel, sample, "lockdep_addr"); u64 key; + int ret; - switch (aggr_mode) { - case LOCK_AGGR_ADDR: - key = addr; - break; - case LOCK_AGGR_TASK: - key = sample->tid; - break; - case LOCK_AGGR_CALLER: - default: - pr_err("Invalid aggregation mode: %d\n", aggr_mode); - return -EINVAL; - } + ret = get_key_by_aggr_mode_simple(&key, addr, sample->tid); + if (ret < 0) + return ret; ls = lock_stat_findnew(key, name, 0); if (!ls) @@ -1015,21 +1012,11 @@ static int report_lock_contention_begin_event(struct evsel *evsel, struct lock_seq_stat *seq; u64 addr = evsel__intval(evsel, sample, "lock_addr"); u64 key; + int ret; - switch (aggr_mode) { - case LOCK_AGGR_ADDR: - key = addr; - break; - case LOCK_AGGR_TASK: - key = sample->tid; - break; - case LOCK_AGGR_CALLER: - key = callchain_id(evsel, sample); - break; - default: - pr_err("Invalid aggregation mode: %d\n", aggr_mode); - return -EINVAL; - } + ret = get_key_by_aggr_mode(&key, addr, evsel, sample); + if (ret < 0) + return ret; ls = lock_stat_find(key); if (!ls) { @@ -1098,21 +1085,11 @@ static int report_lock_contention_end_event(struct evsel *evsel, u64 contended_term; u64 addr = evsel__intval(evsel, sample, "lock_addr"); u64 key; + int ret; - switch (aggr_mode) { - case LOCK_AGGR_ADDR: - key = addr; - break; - case LOCK_AGGR_TASK: - key = sample->tid; - break; - case LOCK_AGGR_CALLER: - key = callchain_id(evsel, sample); - break; - default: - pr_err("Invalid aggregation mode: %d\n", aggr_mode); - return -EINVAL; - } + ret = get_key_by_aggr_mode(&key, addr, evsel, sample); + if (ret < 0) + return ret; ls = lock_stat_find(key); if (!ls) -- GitLab From 569c746b8a1eab64ebf5b3ebb5d414742c8fc40b Mon Sep 17 00:00:00 2001 From: Shang XiaoJing <shangxiaojing@huawei.com> Date: Thu, 8 Sep 2022 10:11:40 +0800 Subject: [PATCH 1233/2223] perf timechart: Add 
create_pidcomm helper Wrap repeated code combined with alloc of per_pidcomm in helper function create_pidcomm. Signed-off-by: Shang XiaoJing <shangxiaojing@huawei.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220908021141.27134-4-shangxiaojing@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-timechart.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index e2e9ad929bafa..667a94d45493f 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -215,6 +215,19 @@ static struct per_pid *find_create_pid(struct timechart *tchart, int pid) return cursor; } +static struct per_pidcomm *create_pidcomm(struct per_pid *p) +{ + struct per_pidcomm *c; + + c = zalloc(sizeof(*c)); + if (!c) + return NULL; + p->current = c; + c->next = p->all; + p->all = c; + return c; +} + static void pid_set_comm(struct timechart *tchart, int pid, char *comm) { struct per_pid *p; @@ -233,12 +246,9 @@ static void pid_set_comm(struct timechart *tchart, int pid, char *comm) } c = c->next; } - c = zalloc(sizeof(*c)); + c = create_pidcomm(p); assert(c != NULL); c->comm = strdup(comm); - p->current = c; - c->next = p->all; - p->all = c; } static void pid_fork(struct timechart *tchart, int pid, int ppid, u64 timestamp) @@ -277,11 +287,8 @@ static void pid_put_sample(struct timechart *tchart, int pid, int type, p = find_create_pid(tchart, pid); c = p->current; if (!c) { - c = zalloc(sizeof(*c)); + c = create_pidcomm(p); assert(c != NULL); - p->current = c; - c->next = p->all; - p->all = c; } sample = zalloc(sizeof(*sample)); @@ -726,12 +733,9 @@ static int pid_begin_io_sample(struct 
timechart *tchart, int pid, int type, struct io_sample *prev; if (!c) { - c = zalloc(sizeof(*c)); + c = create_pidcomm(p); if (!c) return -ENOMEM; - p->current = c; - c->next = p->all; - p->all = c; } prev = c->io_samples; -- GitLab From 3e8d21b922af782954d083d938c117b488c4578c Mon Sep 17 00:00:00 2001 From: Shang XiaoJing <shangxiaojing@huawei.com> Date: Thu, 8 Sep 2022 10:11:41 +0800 Subject: [PATCH 1234/2223] perf timechart: Add p_state_end helper Wrap repeated code in the helper function p_state_end, which allocates a new power_event recording the last pstate and inserts it at the head of tchart->power_events. Signed-off-by: Shang XiaoJing <shangxiaojing@huawei.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220908021141.27134-5-shangxiaojing@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-timechart.c | 37 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 667a94d45493f..c36296bb7637e 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -376,16 +376,13 @@ static void c_state_end(struct timechart *tchart, int cpu, u64 timestamp) tchart->power_events = pwr; } -static void p_state_change(struct timechart *tchart, int cpu, u64 timestamp, u64 new_freq) +static struct power_event *p_state_end(struct timechart *tchart, int cpu, + u64 timestamp) { - struct power_event *pwr; - - if (new_freq > 8000000) /* detect invalid data */ - return; + struct power_event *pwr = zalloc(sizeof(*pwr)); - pwr = zalloc(sizeof(*pwr)); if (!pwr) - return; + return NULL; pwr->state = cpus_pstate_state[cpu]; pwr->start_time = cpus_pstate_start_times[cpu]; @@ -393,11 +390,23
@@ static void p_state_change(struct timechart *tchart, int cpu, u64 timestamp, u64 pwr->cpu = cpu; pwr->type = PSTATE; pwr->next = tchart->power_events; - if (!pwr->start_time) pwr->start_time = tchart->first_time; tchart->power_events = pwr; + return pwr; +} + +static void p_state_change(struct timechart *tchart, int cpu, u64 timestamp, u64 new_freq) +{ + struct power_event *pwr; + + if (new_freq > 8000000) /* detect invalid data */ + return; + + pwr = p_state_end(tchart, cpu, timestamp); + if (!pwr) + return; cpus_pstate_state[cpu] = new_freq; cpus_pstate_start_times[cpu] = timestamp; @@ -705,22 +714,12 @@ static void end_sample_processing(struct timechart *tchart) #endif /* P state */ - pwr = zalloc(sizeof(*pwr)); + pwr = p_state_end(tchart, cpu, tchart->last_time); if (!pwr) return; - pwr->state = cpus_pstate_state[cpu]; - pwr->start_time = cpus_pstate_start_times[cpu]; - pwr->end_time = tchart->last_time; - pwr->cpu = cpu; - pwr->type = PSTATE; - pwr->next = tchart->power_events; - - if (!pwr->start_time) - pwr->start_time = tchart->first_time; if (!pwr->state) pwr->state = tchart->min_freq; - tchart->power_events = pwr; } } -- GitLab From 3657ad4b0fb6a6c3df12cec92013614212f5f401 Mon Sep 17 00:00:00 2001 From: Nick Forrington <nick.forrington@arm.com> Date: Wed, 7 Sep 2022 16:49:30 +0100 Subject: [PATCH 1235/2223] perf vendor events: Update events for Neoverse E1 These CPUs contain the same PMU events (as per the Arm Technical Reference manuals for Cortex A65 and Neoverse E1) This de-duplicates event data, and avoids issues in previous E1 event data (not present in A65 data) * Missing implementation defined events * Inclusion of events that are not implemented: - L1D_CACHE_ALLOCATE - SAMPLE_POP - SAMPLE_FEED - SAMPLE_FILTRATE - SAMPLE_COLLISION Reviewed-by: John Garry <john.garry@huawei.com> Signed-off-by: Nick Forrington <nick.forrington@arm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa 
<jolsa@kernel.org> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mike Leach <mike.leach@linaro.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Will Deacon <will@kernel.org> Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20220907154932.60808-1-nick.forrington@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../{cortex-a65 => cortex-a65-e1}/branch.json | 0 .../{cortex-a65 => cortex-a65-e1}/bus.json | 0 .../{cortex-a65 => cortex-a65-e1}/cache.json | 0 .../{cortex-a65 => cortex-a65-e1}/dpu.json | 0 .../exception.json | 0 .../{cortex-a65 => cortex-a65-e1}/ifu.json | 0 .../instruction.json | 0 .../{cortex-a65 => cortex-a65-e1}/memory.json | 0 .../pipeline.json | 0 .../arch/arm64/arm/neoverse-e1/branch.json | 17 --- .../arch/arm64/arm/neoverse-e1/bus.json | 17 --- .../arch/arm64/arm/neoverse-e1/cache.json | 107 ------------------ .../arch/arm64/arm/neoverse-e1/exception.json | 14 --- .../arm64/arm/neoverse-e1/instruction.json | 65 ----------- .../arch/arm64/arm/neoverse-e1/memory.json | 23 ---- .../arch/arm64/arm/neoverse-e1/pipeline.json | 8 -- .../arch/arm64/arm/neoverse-e1/spe.json | 14 --- tools/perf/pmu-events/arch/arm64/mapfile.csv | 4 +- 18 files changed, 2 insertions(+), 267 deletions(-) rename tools/perf/pmu-events/arch/arm64/arm/{cortex-a65 => cortex-a65-e1}/branch.json (100%) rename tools/perf/pmu-events/arch/arm64/arm/{cortex-a65 => cortex-a65-e1}/bus.json (100%) rename tools/perf/pmu-events/arch/arm64/arm/{cortex-a65 => cortex-a65-e1}/cache.json (100%) rename tools/perf/pmu-events/arch/arm64/arm/{cortex-a65 => cortex-a65-e1}/dpu.json (100%) rename tools/perf/pmu-events/arch/arm64/arm/{cortex-a65 => cortex-a65-e1}/exception.json (100%) rename tools/perf/pmu-events/arch/arm64/arm/{cortex-a65 => cortex-a65-e1}/ifu.json (100%) rename tools/perf/pmu-events/arch/arm64/arm/{cortex-a65 => cortex-a65-e1}/instruction.json (100%) rename tools/perf/pmu-events/arch/arm64/arm/{cortex-a65 => 
cortex-a65-e1}/memory.json (100%) rename tools/perf/pmu-events/arch/arm64/arm/{cortex-a65 => cortex-a65-e1}/pipeline.json (100%) delete mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/branch.json delete mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/bus.json delete mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/cache.json delete mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/exception.json delete mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/instruction.json delete mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/memory.json delete mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/pipeline.json delete mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/spe.json diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/branch.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/branch.json similarity index 100% rename from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/branch.json rename to tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/branch.json diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/bus.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/bus.json similarity index 100% rename from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/bus.json rename to tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/bus.json diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/cache.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/cache.json similarity index 100% rename from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/cache.json rename to tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/cache.json diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/dpu.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/dpu.json similarity index 100% rename from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/dpu.json rename to tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/dpu.json diff --git 
a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/exception.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/exception.json similarity index 100% rename from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/exception.json rename to tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/exception.json diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/ifu.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/ifu.json similarity index 100% rename from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/ifu.json rename to tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/ifu.json diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/instruction.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/instruction.json similarity index 100% rename from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/instruction.json rename to tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/instruction.json diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/memory.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/memory.json similarity index 100% rename from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/memory.json rename to tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/memory.json diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/pipeline.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/pipeline.json similarity index 100% rename from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/pipeline.json rename to tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/pipeline.json diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/branch.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/branch.json deleted file mode 100644 index 2f2d137f5f55a..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/branch.json +++ /dev/null @@ -1,17 +0,0 @@ -[ - { - "ArchStdEvent": "BR_MIS_PRED" - }, - { - "ArchStdEvent": "BR_PRED" - }, - { - "ArchStdEvent": "BR_IMMED_SPEC" - }, - { - "ArchStdEvent": "BR_RETURN_SPEC" 
- }, - { - "ArchStdEvent": "BR_INDIRECT_SPEC" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/bus.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/bus.json deleted file mode 100644 index 75d850b781acd..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/bus.json +++ /dev/null @@ -1,17 +0,0 @@ -[ - { - "ArchStdEvent": "CPU_CYCLES" - }, - { - "ArchStdEvent": "BUS_ACCESS" - }, - { - "ArchStdEvent": "BUS_CYCLES" - }, - { - "ArchStdEvent": "BUS_ACCESS_RD" - }, - { - "ArchStdEvent": "BUS_ACCESS_WR" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/cache.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/cache.json deleted file mode 100644 index 3ad15e3a93a91..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/cache.json +++ /dev/null @@ -1,107 +0,0 @@ -[ - { - "ArchStdEvent": "L1I_CACHE_REFILL" - }, - { - "ArchStdEvent": "L1I_TLB_REFILL" - }, - { - "ArchStdEvent": "L1D_CACHE_REFILL" - }, - { - "ArchStdEvent": "L1D_CACHE" - }, - { - "ArchStdEvent": "L1D_TLB_REFILL" - }, - { - "ArchStdEvent": "L1I_CACHE" - }, - { - "ArchStdEvent": "L1D_CACHE_WB" - }, - { - "ArchStdEvent": "L2D_CACHE" - }, - { - "ArchStdEvent": "L2D_CACHE_REFILL" - }, - { - "ArchStdEvent": "L2D_CACHE_WB" - }, - { - "ArchStdEvent": "L1D_CACHE_ALLOCATE" - }, - { - "ArchStdEvent": "L2D_CACHE_ALLOCATE" - }, - { - "ArchStdEvent": "L1D_TLB" - }, - { - "ArchStdEvent": "L1I_TLB" - }, - { - "ArchStdEvent": "L3D_CACHE_ALLOCATE" - }, - { - "ArchStdEvent": "L3D_CACHE_REFILL" - }, - { - "ArchStdEvent": "L3D_CACHE" - }, - { - "ArchStdEvent": "L2D_TLB_REFILL" - }, - { - "ArchStdEvent": "L2D_TLB" - }, - { - "ArchStdEvent": "DTLB_WALK" - }, - { - "ArchStdEvent": "ITLB_WALK" - }, - { - "ArchStdEvent": "LL_CACHE_RD" - }, - { - "ArchStdEvent": "LL_CACHE_MISS_RD" - }, - { - "ArchStdEvent": "L1D_CACHE_RD" - }, - { - "ArchStdEvent": "L1D_CACHE_WR" - }, - { - "ArchStdEvent": "L1D_CACHE_REFILL_RD" - }, - { - "ArchStdEvent": "L1D_CACHE_REFILL_WR" 
- }, - { - "ArchStdEvent": "L1D_CACHE_REFILL_INNER" - }, - { - "ArchStdEvent": "L1D_CACHE_REFILL_OUTER" - }, - { - "ArchStdEvent": "L2D_CACHE_RD" - }, - { - "ArchStdEvent": "L2D_CACHE_WR" - }, - { - "ArchStdEvent": "L2D_CACHE_REFILL_RD" - }, - { - "ArchStdEvent": "L2D_CACHE_REFILL_WR" - }, - { - "ArchStdEvent": "L3D_CACHE_RD" - }, - { - "ArchStdEvent": "L3D_CACHE_REFILL_RD" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/exception.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/exception.json deleted file mode 100644 index 27c3fe9c831ae..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/exception.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "ArchStdEvent": "EXC_TAKEN" - }, - { - "ArchStdEvent": "MEMORY_ERROR" - }, - { - "ArchStdEvent": "EXC_IRQ" - }, - { - "ArchStdEvent": "EXC_FIQ" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/instruction.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/instruction.json deleted file mode 100644 index 6c3b8f772e7fb..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/instruction.json +++ /dev/null @@ -1,65 +0,0 @@ -[ - { - "ArchStdEvent": "SW_INCR" - }, - { - "ArchStdEvent": "LD_RETIRED" - }, - { - "ArchStdEvent": "ST_RETIRED" - }, - { - "ArchStdEvent": "INST_RETIRED" - }, - { - "ArchStdEvent": "EXC_RETURN" - }, - { - "ArchStdEvent": "CID_WRITE_RETIRED" - }, - { - "ArchStdEvent": "PC_WRITE_RETIRED" - }, - { - "ArchStdEvent": "BR_IMMED_RETIRED" - }, - { - "ArchStdEvent": "BR_RETURN_RETIRED" - }, - { - "ArchStdEvent": "INST_SPEC" - }, - { - "ArchStdEvent": "TTBR_WRITE_RETIRED" - }, - { - "ArchStdEvent": "BR_RETIRED" - }, - { - "ArchStdEvent": "BR_MIS_PRED_RETIRED" - }, - { - "ArchStdEvent": "LD_SPEC" - }, - { - "ArchStdEvent": "ST_SPEC" - }, - { - "ArchStdEvent": "LDST_SPEC" - }, - { - "ArchStdEvent": "DP_SPEC" - }, - { - "ArchStdEvent": "ASE_SPEC" - }, - { - "ArchStdEvent": "VFP_SPEC" - }, - { - "ArchStdEvent": "CRYPTO_SPEC" - }, - { - 
"ArchStdEvent": "ISB_SPEC" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/memory.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/memory.json deleted file mode 100644 index 78ed6dfcedc1b..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/memory.json +++ /dev/null @@ -1,23 +0,0 @@ -[ - { - "ArchStdEvent": "MEM_ACCESS" - }, - { - "ArchStdEvent": "REMOTE_ACCESS_RD" - }, - { - "ArchStdEvent": "MEM_ACCESS_RD" - }, - { - "ArchStdEvent": "MEM_ACCESS_WR" - }, - { - "ArchStdEvent": "UNALIGNED_LD_SPEC" - }, - { - "ArchStdEvent": "UNALIGNED_ST_SPEC" - }, - { - "ArchStdEvent": "UNALIGNED_LDST_SPEC" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/pipeline.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/pipeline.json deleted file mode 100644 index eeac798d403a0..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/pipeline.json +++ /dev/null @@ -1,8 +0,0 @@ -[ - { - "ArchStdEvent": "STALL_FRONTEND" - }, - { - "ArchStdEvent": "STALL_BACKEND" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/spe.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/spe.json deleted file mode 100644 index 20f2165c85fec..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/spe.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "ArchStdEvent": "SAMPLE_POP" - }, - { - "ArchStdEvent": "SAMPLE_FEED" - }, - { - "ArchStdEvent": "SAMPLE_FILTRATE" - }, - { - "ArchStdEvent": "SAMPLE_COLLISION" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv index 406f6edd4e12c..ad502d00f4607 100644 --- a/tools/perf/pmu-events/arch/arm64/mapfile.csv +++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv @@ -17,7 +17,8 @@ 0x00000000420f1000,v1,arm/cortex-a53,core 0x00000000410fd040,v1,arm/cortex-a35,core 0x00000000410fd050,v1,arm/cortex-a55,core -0x00000000410fd060,v1,arm/cortex-a65,core +0x00000000410fd060,v1,arm/cortex-a65-e1,core 
+0x00000000410fd4a0,v1,arm/cortex-a65-e1,core 0x00000000410fd070,v1,arm/cortex-a57-a72,core 0x00000000410fd080,v1,arm/cortex-a57-a72,core 0x00000000410fd090,v1,arm/cortex-a73,core @@ -34,7 +35,6 @@ 0x00000000410fd470,v1,arm/cortex-a710,core 0x00000000410fd480,v1,arm/cortex-x2,core 0x00000000410fd490,v1,arm/neoverse-n2,core -0x00000000410fd4a0,v1,arm/neoverse-e1,core 0x00000000420f5160,v1,cavium/thunderx2,core 0x00000000430f0af0,v1,cavium/thunderx2,core 0x00000000460f0010,v1,fujitsu/a64fx,core -- GitLab From d773c999b8d22ad3ffd42eca373ebae4cb6512fd Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Tue, 14 Jun 2022 07:33:52 -0700 Subject: [PATCH 1236/2223] perf events: Prefer union over variable length array It is possible for casts to introduce alignment issues, prefer a union for perf_record_event_update. Signed-off-by: Ian Rogers <irogers@google.com> Acked-by: Jiri Olsa <jolsa@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Colin Ian King <colin.king@intel.com> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: German Gomez <german.gomez@arm.com> Cc: Gustavo A. R. 
Silva <gustavoars@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Kees Kook <keescook@chromium.org> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Link: https://lore.kernel.org/r/20220614143353.1559597-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/lib/perf/include/perf/event.h | 11 ++++++++++- tools/perf/tests/event_update.c | 14 ++++---------- tools/perf/util/header.c | 24 ++++++++---------------- tools/perf/util/synthetic-events.c | 12 +++++------- 4 files changed, 27 insertions(+), 34 deletions(-) diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index d8ae4e944467e..e147e61832927 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -233,7 +233,16 @@ struct perf_record_event_update { struct perf_event_header header; __u64 type; __u64 id; - char data[]; + union { + /* Used when type == PERF_EVENT_UPDATE__SCALE. */ + struct perf_record_event_update_scale scale; + /* Used when type == PERF_EVENT_UPDATE__UNIT. */ + char unit[0]; + /* Used when type == PERF_EVENT_UPDATE__NAME. */ + char name[0]; + /* Used when type == PERF_EVENT_UPDATE__CPUS. 
*/ + struct perf_record_event_update_cpus cpus; + }; }; #define MAX_EVENT_NAME 64 diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index 78db4d704e76c..d093a9b878d13 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -21,7 +21,7 @@ static int process_event_unit(struct perf_tool *tool __maybe_unused, TEST_ASSERT_VAL("wrong id", ev->id == 123); TEST_ASSERT_VAL("wrong id", ev->type == PERF_EVENT_UPDATE__UNIT); - TEST_ASSERT_VAL("wrong unit", !strcmp(ev->data, "KRAVA")); + TEST_ASSERT_VAL("wrong unit", !strcmp(ev->unit, "KRAVA")); return 0; } @@ -31,13 +31,10 @@ static int process_event_scale(struct perf_tool *tool __maybe_unused, struct machine *machine __maybe_unused) { struct perf_record_event_update *ev = (struct perf_record_event_update *)event; - struct perf_record_event_update_scale *ev_data; - - ev_data = (struct perf_record_event_update_scale *)ev->data; TEST_ASSERT_VAL("wrong id", ev->id == 123); TEST_ASSERT_VAL("wrong id", ev->type == PERF_EVENT_UPDATE__SCALE); - TEST_ASSERT_VAL("wrong scale", ev_data->scale == 0.123); + TEST_ASSERT_VAL("wrong scale", ev->scale.scale == 0.123); return 0; } @@ -56,7 +53,7 @@ static int process_event_name(struct perf_tool *tool, TEST_ASSERT_VAL("wrong id", ev->id == 123); TEST_ASSERT_VAL("wrong id", ev->type == PERF_EVENT_UPDATE__NAME); - TEST_ASSERT_VAL("wrong name", !strcmp(ev->data, tmp->name)); + TEST_ASSERT_VAL("wrong name", !strcmp(ev->name, tmp->name)); return 0; } @@ -66,12 +63,9 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused, struct machine *machine __maybe_unused) { struct perf_record_event_update *ev = (struct perf_record_event_update *)event; - struct perf_record_event_update_cpus *ev_data; struct perf_cpu_map *map; - ev_data = (struct perf_record_event_update_cpus *) ev->data; - - map = cpu_map__new_data(&ev_data->cpus); + map = cpu_map__new_data(&ev->cpus.cpus); TEST_ASSERT_VAL("wrong id", ev->id == 123); 
TEST_ASSERT_VAL("wrong type", ev->type == PERF_EVENT_UPDATE__CPUS); diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index c30c29c514105..98dfaf84bd137 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -4295,8 +4295,6 @@ out: size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp) { struct perf_record_event_update *ev = &event->event_update; - struct perf_record_event_update_scale *ev_scale; - struct perf_record_event_update_cpus *ev_cpus; struct perf_cpu_map *map; size_t ret; @@ -4304,20 +4302,18 @@ size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp) switch (ev->type) { case PERF_EVENT_UPDATE__SCALE: - ev_scale = (struct perf_record_event_update_scale *)ev->data; - ret += fprintf(fp, "... scale: %f\n", ev_scale->scale); + ret += fprintf(fp, "... scale: %f\n", ev->scale.scale); break; case PERF_EVENT_UPDATE__UNIT: - ret += fprintf(fp, "... unit: %s\n", ev->data); + ret += fprintf(fp, "... unit: %s\n", ev->unit); break; case PERF_EVENT_UPDATE__NAME: - ret += fprintf(fp, "... name: %s\n", ev->data); + ret += fprintf(fp, "... name: %s\n", ev->name); break; case PERF_EVENT_UPDATE__CPUS: - ev_cpus = (struct perf_record_event_update_cpus *)ev->data; ret += fprintf(fp, "... 
"); - map = cpu_map__new_data(&ev_cpus->cpus); + map = cpu_map__new_data(&ev->cpus.cpus); if (map) ret += cpu_map__fprintf(map, fp); else @@ -4374,8 +4370,6 @@ int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, struct evlist **pevlist) { struct perf_record_event_update *ev = &event->event_update; - struct perf_record_event_update_scale *ev_scale; - struct perf_record_event_update_cpus *ev_cpus; struct evlist *evlist; struct evsel *evsel; struct perf_cpu_map *map; @@ -4395,19 +4389,17 @@ int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, switch (ev->type) { case PERF_EVENT_UPDATE__UNIT: free((char *)evsel->unit); - evsel->unit = strdup(ev->data); + evsel->unit = strdup(ev->unit); break; case PERF_EVENT_UPDATE__NAME: free(evsel->name); - evsel->name = strdup(ev->data); + evsel->name = strdup(ev->name); break; case PERF_EVENT_UPDATE__SCALE: - ev_scale = (struct perf_record_event_update_scale *)ev->data; - evsel->scale = ev_scale->scale; + evsel->scale = ev->scale.scale; break; case PERF_EVENT_UPDATE__CPUS: - ev_cpus = (struct perf_record_event_update_cpus *)ev->data; - map = cpu_map__new_data(&ev_cpus->cpus); + map = cpu_map__new_data(&ev->cpus.cpus); if (map) { perf_cpu_map__put(evsel->core.own_cpus); evsel->core.own_cpus = map; diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 538790758e242..851d11a64b0d7 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1955,7 +1955,7 @@ int perf_event__synthesize_event_update_unit(struct perf_tool *tool, struct evse if (ev == NULL) return -ENOMEM; - strlcpy(ev->data, evsel->unit, size + 1); + strlcpy(ev->unit, evsel->unit, size + 1); err = process(tool, (union perf_event *)ev, NULL, NULL); free(ev); return err; @@ -1972,8 +1972,7 @@ int perf_event__synthesize_event_update_scale(struct perf_tool *tool, struct evs if (ev == NULL) return -ENOMEM; - ev_data = (struct perf_record_event_update_scale 
*)ev->data; - ev_data->scale = evsel->scale; + ev->scale.scale = evsel->scale; err = process(tool, (union perf_event *)ev, NULL, NULL); free(ev); return err; @@ -1990,7 +1989,7 @@ int perf_event__synthesize_event_update_name(struct perf_tool *tool, struct evse if (ev == NULL) return -ENOMEM; - strlcpy(ev->data, evsel->name, len + 1); + strlcpy(ev->name, evsel->name, len + 1); err = process(tool, (union perf_event *)ev, NULL, NULL); free(ev); return err; @@ -1999,7 +1998,7 @@ int perf_event__synthesize_event_update_name(struct perf_tool *tool, struct evse int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process) { - size_t size = sizeof(struct perf_record_event_update); + size_t size = sizeof(struct perf_event_header) + sizeof(u64) + sizeof(u64); struct perf_record_event_update *ev; int max, err; u16 type; @@ -2016,8 +2015,7 @@ int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, struct evse ev->type = PERF_EVENT_UPDATE__CPUS; ev->id = evsel->core.id[0]; - cpu_map_data__synthesize((struct perf_record_cpu_map_data *)ev->data, - evsel->core.own_cpus, type, max); + cpu_map_data__synthesize(&ev->cpus.cpus, evsel->core.own_cpus, type, max); err = process(tool, (union perf_event *)ev, NULL, NULL); free(ev); -- GitLab From c7202d20fb4584435ce2af5ef3a7a770f79ab59e Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Tue, 14 Jun 2022 07:33:53 -0700 Subject: [PATCH 1237/2223] perf cpumap: Add range data encoding Often cpumaps encode a range of all CPUs, add a compact encoding that doesn't require a bit mask or list of all CPUs. 
Signed-off-by: Ian Rogers <irogers@google.com> Acked-by: Jiri Olsa <jolsa@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Colin Ian King <colin.king@intel.com> Cc: Dave Marchevsky <davemarchevsky@fb.com> Cc: German Gomez <german.gomez@arm.com> Cc: Gustavo A. R. Silva <gustavoars@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Kees Kook <keescook@chromium.org> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Riccardo Mancini <rickyman7@gmail.com> Cc: Song Liu <songliubraving@fb.com> Cc: Stephane Eranian <eranian@google.com> Link: https://lore.kernel.org/r/20220614143353.1559597-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/lib/perf/include/perf/event.h | 14 +++ tools/perf/tests/cpumap.c | 52 ++++++++-- tools/perf/util/cpumap.c | 31 +++++- tools/perf/util/session.c | 5 + tools/perf/util/synthetic-events.c | 151 ++++++++++++++-------------- 5 files changed, 166 insertions(+), 87 deletions(-) diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index e147e61832927..e282faf8fd75b 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -153,6 +153,7 @@ struct perf_record_header_attr { enum { PERF_CPU_MAP__CPUS = 0, PERF_CPU_MAP__MASK = 1, + PERF_CPU_MAP__RANGE_CPUS = 2, }; /* @@ -195,6 +196,17 @@ struct perf_record_mask_cpu_map64 { #pragma GCC diagnostic ignored "-Wpacked" #pragma GCC diagnostic ignored "-Wattributes" +/* + * An encoding of a CPU map for a range starting at start_cpu through to + * end_cpu. If any_cpu is 1, an any CPU (-1) value (aka dummy value) is present. 
+ */ +struct perf_record_range_cpu_map { + __u8 any_cpu; + __u8 __pad; + __u16 start_cpu; + __u16 end_cpu; +}; + struct __packed perf_record_cpu_map_data { __u16 type; union { @@ -204,6 +216,8 @@ struct __packed perf_record_cpu_map_data { struct perf_record_mask_cpu_map32 mask32_data; /* Used when type == PERF_CPU_MAP__MASK and long_size == 8. */ struct perf_record_mask_cpu_map64 mask64_data; + /* Used when type == PERF_CPU_MAP__RANGE_CPUS. */ + struct perf_record_range_cpu_map range_cpu_data; }; }; diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c index 7ea150cdc137d..7c873c6ae3eb9 100644 --- a/tools/perf/tests/cpumap.c +++ b/tools/perf/tests/cpumap.c @@ -19,7 +19,6 @@ static int process_event_mask(struct perf_tool *tool __maybe_unused, struct perf_record_cpu_map *map_event = &event->cpu_map; struct perf_record_cpu_map_data *data; struct perf_cpu_map *map; - int i; unsigned int long_size; data = &map_event->data; @@ -32,16 +31,17 @@ static int process_event_mask(struct perf_tool *tool __maybe_unused, TEST_ASSERT_VAL("wrong nr", data->mask32_data.nr == 1); - for (i = 0; i < 20; i++) { + TEST_ASSERT_VAL("wrong cpu", perf_record_cpu_map_data__test_bit(0, data)); + TEST_ASSERT_VAL("wrong cpu", !perf_record_cpu_map_data__test_bit(1, data)); + for (int i = 2; i <= 20; i++) TEST_ASSERT_VAL("wrong cpu", perf_record_cpu_map_data__test_bit(i, data)); - } map = cpu_map__new_data(data); TEST_ASSERT_VAL("wrong nr", perf_cpu_map__nr(map) == 20); - for (i = 0; i < 20; i++) { - TEST_ASSERT_VAL("wrong cpu", perf_cpu_map__cpu(map, i).cpu == i); - } + TEST_ASSERT_VAL("wrong cpu", perf_cpu_map__cpu(map, 0).cpu == 0); + for (int i = 2; i <= 20; i++) + TEST_ASSERT_VAL("wrong cpu", perf_cpu_map__cpu(map, i - 1).cpu == i); perf_cpu_map__put(map); return 0; @@ -73,25 +73,59 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused, return 0; } +static int process_event_range_cpus(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct 
perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + struct perf_record_cpu_map *map_event = &event->cpu_map; + struct perf_record_cpu_map_data *data; + struct perf_cpu_map *map; + + data = &map_event->data; + + TEST_ASSERT_VAL("wrong type", data->type == PERF_CPU_MAP__RANGE_CPUS); + + TEST_ASSERT_VAL("wrong any_cpu", data->range_cpu_data.any_cpu == 0); + TEST_ASSERT_VAL("wrong start_cpu", data->range_cpu_data.start_cpu == 1); + TEST_ASSERT_VAL("wrong end_cpu", data->range_cpu_data.end_cpu == 256); + + map = cpu_map__new_data(data); + TEST_ASSERT_VAL("wrong nr", perf_cpu_map__nr(map) == 256); + TEST_ASSERT_VAL("wrong cpu", perf_cpu_map__cpu(map, 0).cpu == 1); + TEST_ASSERT_VAL("wrong cpu", perf_cpu_map__max(map).cpu == 256); + TEST_ASSERT_VAL("wrong refcnt", refcount_read(&map->refcnt) == 1); + perf_cpu_map__put(map); + return 0; +} + static int test__cpu_map_synthesize(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { struct perf_cpu_map *cpus; - /* This one is better stores in mask. */ - cpus = perf_cpu_map__new("0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19"); + /* This one is better stored in a mask. */ + cpus = perf_cpu_map__new("0,2-20"); TEST_ASSERT_VAL("failed to synthesize map", !perf_event__synthesize_cpu_map(NULL, cpus, process_event_mask, NULL)); perf_cpu_map__put(cpus); - /* This one is better stores in cpu values. */ + /* This one is better stored in cpu values. */ cpus = perf_cpu_map__new("1,256"); TEST_ASSERT_VAL("failed to synthesize map", !perf_event__synthesize_cpu_map(NULL, cpus, process_event_cpus, NULL)); + perf_cpu_map__put(cpus); + + /* This one is better stored as a range. 
*/ + cpus = perf_cpu_map__new("1-256"); + + TEST_ASSERT_VAL("failed to synthesize map", + !perf_event__synthesize_cpu_map(NULL, cpus, process_event_range_cpus, NULL)); + perf_cpu_map__put(cpus); return 0; } diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index ae43fb88f444e..2389bd3e19b86 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -112,12 +112,39 @@ static struct perf_cpu_map *cpu_map__from_mask(const struct perf_record_cpu_map_ } +static struct perf_cpu_map *cpu_map__from_range(const struct perf_record_cpu_map_data *data) +{ + struct perf_cpu_map *map; + unsigned int i = 0; + + map = perf_cpu_map__empty_new(data->range_cpu_data.end_cpu - + data->range_cpu_data.start_cpu + 1 + data->range_cpu_data.any_cpu); + if (!map) + return NULL; + + if (data->range_cpu_data.any_cpu) + map->map[i++].cpu = -1; + + for (int cpu = data->range_cpu_data.start_cpu; cpu <= data->range_cpu_data.end_cpu; + i++, cpu++) + map->map[i].cpu = cpu; + + return map; +} + struct perf_cpu_map *cpu_map__new_data(const struct perf_record_cpu_map_data *data) { - if (data->type == PERF_CPU_MAP__CPUS) + switch (data->type) { + case PERF_CPU_MAP__CPUS: return cpu_map__from_entries(data); - else + case PERF_CPU_MAP__MASK: return cpu_map__from_mask(data); + case PERF_CPU_MAP__RANGE_CPUS: + return cpu_map__from_range(data); + default: + pr_err("cpu_map__new_data unknown type %d\n", data->type); + return NULL; + } } size_t cpu_map__fprintf(struct perf_cpu_map *map, FILE *fp) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 47d5a50e616a3..1a4f10de29ffe 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -943,6 +943,11 @@ static void perf_event__cpu_map_swap(union perf_event *event, default: pr_err("cpu_map swap: unsupported long size\n"); } + break; + case PERF_CPU_MAP__RANGE_CPUS: + data->range_cpu_data.start_cpu = bswap_16(data->range_cpu_data.start_cpu); + data->range_cpu_data.end_cpu = 
bswap_16(data->range_cpu_data.end_cpu); + break; default: break; } diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 851d11a64b0d7..289ea17ac5f7f 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1195,93 +1195,97 @@ int perf_event__synthesize_thread_map2(struct perf_tool *tool, return err; } -static void synthesize_cpus(struct perf_record_cpu_map_data *data, - const struct perf_cpu_map *map) -{ - int i, map_nr = perf_cpu_map__nr(map); - - data->cpus_data.nr = map_nr; +struct synthesize_cpu_map_data { + const struct perf_cpu_map *map; + int nr; + int min_cpu; + int max_cpu; + int has_any_cpu; + int type; + size_t size; + struct perf_record_cpu_map_data *data; +}; - for (i = 0; i < map_nr; i++) - data->cpus_data.cpu[i] = perf_cpu_map__cpu(map, i).cpu; +static void synthesize_cpus(struct synthesize_cpu_map_data *data) +{ + data->data->type = PERF_CPU_MAP__CPUS; + data->data->cpus_data.nr = data->nr; + for (int i = 0; i < data->nr; i++) + data->data->cpus_data.cpu[i] = perf_cpu_map__cpu(data->map, i).cpu; } -static void synthesize_mask(struct perf_record_cpu_map_data *data, - const struct perf_cpu_map *map, int max) +static void synthesize_mask(struct synthesize_cpu_map_data *data) { int idx; struct perf_cpu cpu; /* Due to padding, the 4bytes per entry mask variant is always smaller. 
*/ - data->mask32_data.nr = BITS_TO_U32(max); - data->mask32_data.long_size = 4; + data->data->type = PERF_CPU_MAP__MASK; + data->data->mask32_data.nr = BITS_TO_U32(data->max_cpu); + data->data->mask32_data.long_size = 4; - perf_cpu_map__for_each_cpu(cpu, idx, map) { + perf_cpu_map__for_each_cpu(cpu, idx, data->map) { int bit_word = cpu.cpu / 32; - __u32 bit_mask = 1U << (cpu.cpu & 31); + u32 bit_mask = 1U << (cpu.cpu & 31); - data->mask32_data.mask[bit_word] |= bit_mask; + data->data->mask32_data.mask[bit_word] |= bit_mask; } } -static size_t cpus_size(const struct perf_cpu_map *map) +static void synthesize_range_cpus(struct synthesize_cpu_map_data *data) { - return sizeof(struct cpu_map_entries) + perf_cpu_map__nr(map) * sizeof(u16); + data->data->type = PERF_CPU_MAP__RANGE_CPUS; + data->data->range_cpu_data.any_cpu = data->has_any_cpu; + data->data->range_cpu_data.start_cpu = data->min_cpu; + data->data->range_cpu_data.end_cpu = data->max_cpu; } -static size_t mask_size(const struct perf_cpu_map *map, int *max) -{ - *max = perf_cpu_map__max(map).cpu; - return sizeof(struct perf_record_mask_cpu_map32) + BITS_TO_U32(*max) * sizeof(__u32); -} - -static void *cpu_map_data__alloc(const struct perf_cpu_map *map, size_t *size, - u16 *type, int *max) +static void *cpu_map_data__alloc(struct synthesize_cpu_map_data *syn_data, + size_t header_size) { size_t size_cpus, size_mask; - bool is_dummy = perf_cpu_map__empty(map); - /* - * Both array and mask data have variable size based - * on the number of cpus and their actual values. - * The size of the 'struct perf_record_cpu_map_data' is: - * - * array = size of 'struct cpu_map_entries' + - * number of cpus * sizeof(u64) - * - * mask = size of 'struct perf_record_record_cpu_map' + - * maximum cpu bit converted to size of longs - * - * and finally + the size of 'struct perf_record_cpu_map_data'. 
- */ - size_cpus = cpus_size(map); - size_mask = mask_size(map, max); + syn_data->nr = perf_cpu_map__nr(syn_data->map); + syn_data->has_any_cpu = (perf_cpu_map__cpu(syn_data->map, 0).cpu == -1) ? 1 : 0; - if (is_dummy || (size_cpus < size_mask)) { - *size += size_cpus; - *type = PERF_CPU_MAP__CPUS; - } else { - *size += size_mask; - *type = PERF_CPU_MAP__MASK; + syn_data->min_cpu = perf_cpu_map__cpu(syn_data->map, syn_data->has_any_cpu).cpu; + syn_data->max_cpu = perf_cpu_map__max(syn_data->map).cpu; + if (syn_data->max_cpu - syn_data->min_cpu + 1 == syn_data->nr - syn_data->has_any_cpu) { + /* A consecutive range of CPUs can be encoded using a range. */ + assert(sizeof(u16) + sizeof(struct perf_record_range_cpu_map) == sizeof(u64)); + syn_data->type = PERF_CPU_MAP__RANGE_CPUS; + syn_data->size = header_size + sizeof(u64); + return zalloc(syn_data->size); } - *size += sizeof(__u16); /* For perf_record_cpu_map_data.type. */ - *size = PERF_ALIGN(*size, sizeof(u64)); - return zalloc(*size); + size_cpus = sizeof(u16) + sizeof(struct cpu_map_entries) + syn_data->nr * sizeof(u16); + /* Due to padding, the 4bytes per entry mask variant is always smaller. */ + size_mask = sizeof(u16) + sizeof(struct perf_record_mask_cpu_map32) + + BITS_TO_U32(syn_data->max_cpu) * sizeof(__u32); + if (syn_data->has_any_cpu || size_cpus < size_mask) { + /* Follow the CPU map encoding. */ + syn_data->type = PERF_CPU_MAP__CPUS; + syn_data->size = header_size + PERF_ALIGN(size_cpus, sizeof(u64)); + return zalloc(syn_data->size); + } + /* Encode using a bitmask. 
*/ + syn_data->type = PERF_CPU_MAP__MASK; + syn_data->size = header_size + PERF_ALIGN(size_mask, sizeof(u64)); + return zalloc(syn_data->size); } -static void cpu_map_data__synthesize(struct perf_record_cpu_map_data *data, - const struct perf_cpu_map *map, - u16 type, int max) +static void cpu_map_data__synthesize(struct synthesize_cpu_map_data *data) { - data->type = type; - - switch (type) { + switch (data->type) { case PERF_CPU_MAP__CPUS: - synthesize_cpus(data, map); + synthesize_cpus(data); break; case PERF_CPU_MAP__MASK: - synthesize_mask(data, map, max); + synthesize_mask(data); + break; + case PERF_CPU_MAP__RANGE_CPUS: + synthesize_range_cpus(data); + break; default: break; } @@ -1289,23 +1293,22 @@ static void cpu_map_data__synthesize(struct perf_record_cpu_map_data *data, static struct perf_record_cpu_map *cpu_map_event__new(const struct perf_cpu_map *map) { - size_t size = sizeof(struct perf_event_header); + struct synthesize_cpu_map_data syn_data = { .map = map }; struct perf_record_cpu_map *event; - int max; - u16 type; - event = cpu_map_data__alloc(map, &size, &type, &max); + + event = cpu_map_data__alloc(&syn_data, sizeof(struct perf_event_header)); if (!event) return NULL; + syn_data.data = &event->data; event->header.type = PERF_RECORD_CPU_MAP; - event->header.size = size; - event->data.type = type; - - cpu_map_data__synthesize(&event->data, map, type, max); + event->header.size = syn_data.size; + cpu_map_data__synthesize(&syn_data); return event; } + int perf_event__synthesize_cpu_map(struct perf_tool *tool, const struct perf_cpu_map *map, perf_event__handler_t process, @@ -1998,24 +2001,20 @@ int perf_event__synthesize_event_update_name(struct perf_tool *tool, struct evse int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process) { - size_t size = sizeof(struct perf_event_header) + sizeof(u64) + sizeof(u64); + struct synthesize_cpu_map_data syn_data = { .map = evsel->core.own_cpus }; 
struct perf_record_event_update *ev; - int max, err; - u16 type; - - if (!evsel->core.own_cpus) - return 0; + int err; - ev = cpu_map_data__alloc(evsel->core.own_cpus, &size, &type, &max); + ev = cpu_map_data__alloc(&syn_data, sizeof(struct perf_event_header) + 2 * sizeof(u64)); if (!ev) return -ENOMEM; + syn_data.data = &ev->cpus.cpus; ev->header.type = PERF_RECORD_EVENT_UPDATE; - ev->header.size = (u16)size; + ev->header.size = (u16)syn_data.size; ev->type = PERF_EVENT_UPDATE__CPUS; ev->id = evsel->core.id[0]; - - cpu_map_data__synthesize(&ev->cpus.cpus, evsel->core.own_cpus, type, max); + cpu_map_data__synthesize(&syn_data); err = process(tool, (union perf_event *)ev, NULL, NULL); free(ev); -- GitLab From 165da80296ea6bc996eea4551026e39a0109f71e Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Thu, 8 Sep 2022 15:54:48 -0700 Subject: [PATCH 1238/2223] perf sched: Factor out destroy_tasks() Add destroy_tasks() as a counterpart of create_tasks() and put the thread safety notations there. After join, it destroys semaphores too. 
Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220908225448.4105056-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-sched.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index a92610eac4bf6..f93737eef07ba 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -690,6 +690,27 @@ static void create_tasks(struct perf_sched *sched) } } +static void destroy_tasks(struct perf_sched *sched) + UNLOCK_FUNCTION(sched->start_work_mutex) + UNLOCK_FUNCTION(sched->work_done_wait_mutex) +{ + struct task_desc *task; + unsigned long i; + int err; + + mutex_unlock(&sched->start_work_mutex); + mutex_unlock(&sched->work_done_wait_mutex); + /* Get rid of threads so they won't be upset by mutex destrunction */ + for (i = 0; i < sched->nr_tasks; i++) { + task = sched->tasks[i]; + err = pthread_join(task->thread, NULL); + BUG_ON(err); + sem_destroy(&task->sleep_sem); + sem_destroy(&task->ready_for_work); + sem_destroy(&task->work_done_sem); + } +} + static void wait_for_tasks(struct perf_sched *sched) EXCLUSIVE_LOCKS_REQUIRED(sched->work_done_wait_mutex) EXCLUSIVE_LOCKS_REQUIRED(sched->start_work_mutex) @@ -3324,8 +3345,7 @@ static int perf_sched__replay(struct perf_sched *sched) run_one_test(sched); sched->thread_funcs_exit = true; - mutex_unlock(&sched->start_work_mutex); - mutex_unlock(&sched->work_done_wait_mutex); + destroy_tasks(sched); return 0; } -- GitLab From 187c7723e4aae6d5729e27179542956975624ff8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Thu, 8 Sep 2022 16:01:50 -0700 Subject: [PATCH 1239/2223] perf test: Skip sigtrap test on old kernels If it runs 
on an old kernel, perf_event_open would fail because of the new fields sigtrap and sig_data. Just skipping the test could miss an actual bug in the kernel. Let's check BTF (when we have libbpf) if it has the sigtrap field in the perf_event_attr. Otherwise, we can check it with a minimal event config. Signed-off-by: Namhyung Kim <namhyung@kernel.org> Acked-by: Song Liu <song@kernel.org> Suggested-by: Arnaldo Carvalho de Melo <acme@redhat.com> # Using BTF to check for the struct members Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Marco Elver <elver@google.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220908230150.4105955-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/sigtrap.c | 65 +++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/sigtrap.c b/tools/perf/tests/sigtrap.c index e32ece90e164a..1de7478ec1894 100644 --- a/tools/perf/tests/sigtrap.c +++ b/tools/perf/tests/sigtrap.c @@ -54,6 +54,63 @@ static struct perf_event_attr make_event_attr(void) return attr; } +#ifdef HAVE_BPF_SKEL +#include <bpf/btf.h> + +static bool attr_has_sigtrap(void) +{ + bool ret = false; + struct btf *btf; + const struct btf_type *t; + const struct btf_member *m; + const char *name; + int i, id; + + btf = btf__load_vmlinux_btf(); + if (btf == NULL) { + /* should be an old kernel */ + return false; + } + + id = btf__find_by_name_kind(btf, "perf_event_attr", BTF_KIND_STRUCT); + if (id < 0) + goto out; + + t = btf__type_by_id(btf, id); + for (i = 0, m = btf_members(t); i < btf_vlen(t); i++, m++) { + name = btf__name_by_offset(btf, m->name_off); + if (!strcmp(name, "sigtrap")) { + ret = true; + break; + } + } +out: + btf__free(btf); + return ret; +} +#else /* !HAVE_BPF_SKEL */ +static bool attr_has_sigtrap(void) +{ + struct 
perf_event_attr attr = { + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_DUMMY, + .size = sizeof(attr), + .remove_on_exec = 1, /* Required by sigtrap. */ + .sigtrap = 1, /* Request synchronous SIGTRAP on event. */ + }; + int fd; + bool ret = false; + + fd = sys_perf_event_open(&attr, 0, -1, -1, perf_event_open_cloexec_flag()); + if (fd >= 0) { + ret = true; + close(fd); + } + + return ret; +} +#endif /* HAVE_BPF_SKEL */ + static void sigtrap_handler(int signum __maybe_unused, siginfo_t *info, void *ucontext __maybe_unused) { @@ -139,7 +196,13 @@ static int test__sigtrap(struct test_suite *test __maybe_unused, int subtest __m fd = sys_perf_event_open(&attr, 0, -1, -1, perf_event_open_cloexec_flag()); if (fd < 0) { - pr_debug("FAILED sys_perf_event_open(): %s\n", str_error_r(errno, sbuf, sizeof(sbuf))); + if (attr_has_sigtrap()) { + pr_debug("FAILED sys_perf_event_open(): %s\n", + str_error_r(errno, sbuf, sizeof(sbuf))); + } else { + pr_debug("perf_event_attr doesn't have sigtrap\n"); + ret = TEST_SKIP; + } goto out_restore_sigaction; } -- GitLab From 4671855ae7d9f711b8fe2b558a6000b1eb2e4fa3 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui <cuigaosheng1@huawei.com> Date: Fri, 9 Sep 2022 12:45:41 +0800 Subject: [PATCH 1240/2223] perf sort: Remove hist_entry__sort_list() and sort__first_dimension() leftover declarations The hist_entry__sort_list and sort__first_dimension functions have been removed in commit cfaa154b2335d4c8 ("perf tools: Get rid of obsolete hist_entry__sort_list"), remove them. 
Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Link: https://lore.kernel.org/r/20220909044542.1087870-2-cuigaosheng1@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/sort.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 2ddc00d1c4645..af14eb46c2b65 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -34,7 +34,6 @@ extern struct sort_entry sort_dso_to; extern struct sort_entry sort_sym_from; extern struct sort_entry sort_sym_to; extern struct sort_entry sort_srcline; -extern enum sort_type sort__first_dimension; extern const char default_mem_sort_order[]; struct res_sample { @@ -295,7 +294,6 @@ struct block_hist { }; extern struct sort_entry sort_thread; -extern struct list_head hist_entry__sort_list; struct evlist; struct tep_handle; -- GitLab From 76ed5927ca6185f141336061c4865eb20048c288 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui <cuigaosheng1@huawei.com> Date: Fri, 9 Sep 2022 12:45:42 +0800 Subject: [PATCH 1241/2223] perf pmu: Remove perf_pmu_lex() needless declaration It builds without it, perhaps with some older combination of flex/bison we needed this, clean it up a bit removing this. 
Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Link: https://lore.kernel.org/r/20220909044542.1087870-3-cuigaosheng1@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/pmu.y | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/perf/util/pmu.y b/tools/perf/util/pmu.y index bfd7e8509869b..0dab0ec2eff7c 100644 --- a/tools/perf/util/pmu.y +++ b/tools/perf/util/pmu.y @@ -10,8 +10,6 @@ #include <string.h> #include "pmu.h" -extern int perf_pmu_lex (void); - #define ABORT_ON(val) \ do { \ if (val) \ -- GitLab From 1a6abdde13bb6542e72dbe7a2219762795f0161a Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Wed, 31 Aug 2022 10:49:21 -0700 Subject: [PATCH 1242/2223] perf expr: Move the scanner_ctx into the parse_ctx We currently maintain the two independently and copy from one to the other. This is a burden when additional scanner context values are necessary, so combine them. 
Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20220831174926.579643-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/expr.c | 2 +- tools/perf/util/expr.c | 7 ++----- tools/perf/util/expr.h | 10 +++++----- tools/perf/util/metricgroup.c | 4 ++-- tools/perf/util/stat-shadow.c | 2 +- 5 files changed, 11 insertions(+), 14 deletions(-) diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c index 2efe9e3a63b8b..7ca5e37de5606 100644 --- a/tools/perf/tests/expr.c +++ b/tools/perf/tests/expr.c @@ -133,7 +133,7 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u (void **)&val_ptr)); expr__ctx_clear(ctx); - ctx->runtime = 3; + ctx->sctx.runtime = 3; TEST_ASSERT_VAL("find ids", expr__find_ids("EVENT1\\,param\\=?@ + EVENT2\\,param\\=?@", NULL, ctx) == 0); diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c index c15a9852fa419..00bde682e743c 100644 --- a/tools/perf/util/expr.c +++ b/tools/perf/util/expr.c @@ -310,7 +310,7 @@ struct expr_parse_ctx *expr__ctx_new(void) free(ctx); return NULL; } - ctx->runtime = 0; + ctx->sctx.runtime = 0; return ctx; } @@ 
-344,16 +344,13 @@ static int __expr__parse(double *val, struct expr_parse_ctx *ctx, const char *expr, bool compute_ids) { - struct expr_scanner_ctx scanner_ctx = { - .runtime = ctx->runtime, - }; YY_BUFFER_STATE buffer; void *scanner; int ret; pr_debug2("parsing metric: %s\n", expr); - ret = expr_lex_init_extra(&scanner_ctx, &scanner); + ret = expr_lex_init_extra(&ctx->sctx, &scanner); if (ret) return ret; diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h index 0403a92d9dcc3..07af3d438eb25 100644 --- a/tools/perf/util/expr.h +++ b/tools/perf/util/expr.h @@ -10,17 +10,17 @@ struct metric_ref; +struct expr_scanner_ctx { + int runtime; +}; + struct expr_parse_ctx { struct hashmap *ids; - int runtime; + struct expr_scanner_ctx sctx; }; struct expr_id_data; -struct expr_scanner_ctx { - int runtime; -}; - struct hashmap *ids__new(void); void ids__free(struct hashmap *ids); int ids__insert(struct hashmap *ids, const char *id); diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 18aae040d61db..b144c3e352648 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -215,7 +215,7 @@ static struct metric *metric__new(const struct pmu_event *pe, } m->metric_expr = pe->metric_expr; m->metric_unit = pe->unit; - m->pctx->runtime = runtime; + m->pctx->sctx.runtime = runtime; m->has_constraint = metric_no_group || metricgroup__has_constraint(pe); m->metric_refs = NULL; m->evlist = NULL; @@ -1626,7 +1626,7 @@ static int parse_groups(struct evlist *perf_evlist, const char *str, } expr->metric_unit = m->metric_unit; expr->metric_events = metric_events; - expr->runtime = m->pctx->runtime; + expr->runtime = m->pctx->sctx.runtime; list_add(&expr->nd, &me->head); } diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 788ce5e46470a..815af948abb93 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -911,7 +911,7 @@ static void generic_metric(struct perf_stat_config 
*config, if (!pctx) return; - pctx->runtime = runtime; + pctx->sctx.runtime = runtime; i = prepare_metric(metric_events, metric_refs, pctx, cpu_map_idx, st); if (i < 0) { expr__ctx_free(pctx); -- GitLab From 09b73fe9e3debfeed61c1395652aeff59bda6ae4 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Wed, 31 Aug 2022 10:49:22 -0700 Subject: [PATCH 1243/2223] perf smt: Compute SMT from topology The topology records sibling threads. Rather than computing SMT using siblings in sysfs, reuse the values in topology. This only applies when the file smt/active isn't available. Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20220831174926.579643-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/expr.c | 24 ++++++---- tools/perf/util/cputopo.c | 15 +++++++ tools/perf/util/cputopo.h | 2 + tools/perf/util/expr.c | 9 ++-- tools/perf/util/smt.c | 95 ++++----------------------------------- tools/perf/util/smt.h | 5 ++- 6 files changed, 49 insertions(+), 101 deletions(-) diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c index 7ca5e37de5606..db736ed49556f 100644 --- 
a/tools/perf/tests/expr.c +++ b/tools/perf/tests/expr.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include "util/cputopo.h" #include "util/debug.h" #include "util/expr.h" #include "util/header.h" @@ -154,15 +155,20 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u (void **)&val_ptr)); /* Only EVENT1 or EVENT2 need be measured depending on the value of smt_on. */ - expr__ctx_clear(ctx); - TEST_ASSERT_VAL("find ids", - expr__find_ids("EVENT1 if #smt_on else EVENT2", - NULL, ctx) == 0); - TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 1); - TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, - smt_on() ? "EVENT1" : "EVENT2", - (void **)&val_ptr)); - + { + struct cpu_topology *topology = cpu_topology__new(); + bool smton = smt_on(topology); + + cpu_topology__delete(topology); + expr__ctx_clear(ctx); + TEST_ASSERT_VAL("find ids", + expr__find_ids("EVENT1 if #smt_on else EVENT2", + NULL, ctx) == 0); + TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 1); + TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, + smton ? "EVENT1" : "EVENT2", + (void **)&val_ptr)); + } /* The expression is a constant 1.0 without needing to evaluate EVENT1. */ expr__ctx_clear(ctx); TEST_ASSERT_VAL("find ids", diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c index d275d843c1550..511002e527145 100644 --- a/tools/perf/util/cputopo.c +++ b/tools/perf/util/cputopo.c @@ -157,6 +157,21 @@ void cpu_topology__delete(struct cpu_topology *tp) free(tp); } +bool cpu_topology__smt_on(const struct cpu_topology *topology) +{ + for (u32 i = 0; i < topology->core_cpus_lists; i++) { + const char *cpu_list = topology->core_cpus_list[i]; + + /* + * If there is a need to separate siblings in a core then SMT is + * enabled. 
+ */ + if (strchr(cpu_list, ',') || strchr(cpu_list, '-')) + return true; + } + return false; +} + static bool has_die_topology(void) { char filename[MAXPATHLEN]; diff --git a/tools/perf/util/cputopo.h b/tools/perf/util/cputopo.h index 854e18f9041e8..469db775a13ca 100644 --- a/tools/perf/util/cputopo.h +++ b/tools/perf/util/cputopo.h @@ -58,6 +58,8 @@ struct hybrid_topology { struct cpu_topology *cpu_topology__new(void); void cpu_topology__delete(struct cpu_topology *tp); +/* Determine from the core list whether SMT was enabled. */ +bool cpu_topology__smt_on(const struct cpu_topology *topology); struct numa_topology *numa_topology__new(void); void numa_topology__delete(struct numa_topology *tp); diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c index 00bde682e743c..8aa7dafa18b3a 100644 --- a/tools/perf/util/expr.c +++ b/tools/perf/util/expr.c @@ -412,11 +412,6 @@ double expr__get_literal(const char *literal) static struct cpu_topology *topology; double result = NAN; - if (!strcasecmp("#smt_on", literal)) { - result = smt_on() > 0 ? 1.0 : 0.0; - goto out; - } - if (!strcmp("#num_cpus", literal)) { result = cpu__max_present_cpu().cpu; goto out; @@ -440,6 +435,10 @@ double expr__get_literal(const char *literal) goto out; } } + if (!strcasecmp("#smt_on", literal)) { + result = smt_on(topology) ? 1.0 : 0.0; + goto out; + } if (!strcmp("#num_packages", literal)) { result = topology->package_cpus_lists; goto out; diff --git a/tools/perf/util/smt.c b/tools/perf/util/smt.c index 8fed03283c85d..ce90c4ee4138e 100644 --- a/tools/perf/util/smt.c +++ b/tools/perf/util/smt.c @@ -1,100 +1,23 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <linux/bitops.h> +#include <string.h> #include "api/fs/fs.h" +#include "cputopo.h" #include "smt.h" -/** - * hweight_str - Returns the number of bits set in str. Stops at first non-hex - * or ',' character. 
- */ -static int hweight_str(char *str) -{ - int result = 0; - - while (*str) { - switch (*str++) { - case '0': - case ',': - break; - case '1': - case '2': - case '4': - case '8': - result++; - break; - case '3': - case '5': - case '6': - case '9': - case 'a': - case 'A': - case 'c': - case 'C': - result += 2; - break; - case '7': - case 'b': - case 'B': - case 'd': - case 'D': - case 'e': - case 'E': - result += 3; - break; - case 'f': - case 'F': - result += 4; - break; - default: - goto done; - } - } -done: - return result; -} - -int smt_on(void) +bool smt_on(const struct cpu_topology *topology) { static bool cached; - static int cached_result; - int cpu; - int ncpu; + static bool cached_result; + int fs_value; if (cached) return cached_result; - if (sysfs__read_int("devices/system/cpu/smt/active", &cached_result) >= 0) { - cached = true; - return cached_result; - } - - cached_result = 0; - ncpu = sysconf(_SC_NPROCESSORS_CONF); - for (cpu = 0; cpu < ncpu; cpu++) { - unsigned long long siblings; - char *str; - size_t strlen; - char fn[256]; + if (sysfs__read_int("devices/system/cpu/smt/active", &fs_value) >= 0) + cached_result = (fs_value == 1); + else + cached_result = cpu_topology__smt_on(topology); - snprintf(fn, sizeof fn, - "devices/system/cpu/cpu%d/topology/thread_siblings", cpu); - if (sysfs__read_str(fn, &str, &strlen) < 0) { - snprintf(fn, sizeof fn, - "devices/system/cpu/cpu%d/topology/core_cpus", cpu); - if (sysfs__read_str(fn, &str, &strlen) < 0) - continue; - } - /* Entry is hex, but does not have 0x, so need custom parser */ - siblings = hweight_str(str); - free(str); - if (siblings > 1) { - cached_result = 1; - break; - } - } cached = true; return cached_result; } diff --git a/tools/perf/util/smt.h b/tools/perf/util/smt.h index a98d65808f6a8..e26999c6b8d45 100644 --- a/tools/perf/util/smt.h +++ b/tools/perf/util/smt.h @@ -2,6 +2,9 @@ #ifndef __SMT_H #define __SMT_H 1 -int smt_on(void); +struct cpu_topology; + +/* Returns true if SMT (aka 
hyperthreading) is enabled. */ +bool smt_on(const struct cpu_topology *topology); #endif /* __SMT_H */ -- GitLab From cc2c4e26ece19b4118f059f3a526c048793e58af Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Wed, 31 Aug 2022 10:49:23 -0700 Subject: [PATCH 1244/2223] perf topology: Add core_wide It is possible to optimize metrics when all SMT threads (CPUs) on a core are measuring events in system wide mode. For example, TMA metrics define CORE_CLKS for Sandybridge as: if SMT is disabled: CPU_CLK_UNHALTED.THREAD if SMT is enabled and recording on all SMT threads: CPU_CLK_UNHALTED.THREAD_ANY / 2 if SMT is enabled and not recording on all SMT threads: (CPU_CLK_UNHALTED.THREAD/2)* (1+CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE/CPU_CLK_UNHALTED.REF_XCLK ) That is, two more events are necessary when not gathering counts on all SMT threads. To distinguish all SMT threads on a core vs system wide (all CPUs) call the new property core wide. Add a core wide test that determines the property from user requested CPUs, the topology and system wide. System wide is required as other processes running on a SMT thread will change the counts.
Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20220831174926.579643-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/cputopo.c | 46 +++++++++++++++++++++++++++++++++++++++ tools/perf/util/cputopo.h | 3 +++ tools/perf/util/smt.c | 14 ++++++++++++ tools/perf/util/smt.h | 7 ++++++ 4 files changed, 70 insertions(+) diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c index 511002e527145..1a3ff64491581 100644 --- a/tools/perf/util/cputopo.c +++ b/tools/perf/util/cputopo.c @@ -172,6 +172,52 @@ bool cpu_topology__smt_on(const struct cpu_topology *topology) return false; } +bool cpu_topology__core_wide(const struct cpu_topology *topology, + const char *user_requested_cpu_list) +{ + struct perf_cpu_map *user_requested_cpus; + + /* + * If user_requested_cpu_list is empty then all CPUs are recorded and so + * core_wide is true. + */ + if (!user_requested_cpu_list) + return true; + + user_requested_cpus = perf_cpu_map__new(user_requested_cpu_list); + /* Check that every user requested CPU is the complete set of SMT threads on a core. 
*/ + for (u32 i = 0; i < topology->core_cpus_lists; i++) { + const char *core_cpu_list = topology->core_cpus_list[i]; + struct perf_cpu_map *core_cpus = perf_cpu_map__new(core_cpu_list); + struct perf_cpu cpu; + int idx; + bool has_first, first = true; + + perf_cpu_map__for_each_cpu(cpu, idx, core_cpus) { + if (first) { + has_first = perf_cpu_map__has(user_requested_cpus, cpu); + first = false; + } else { + /* + * If the first core CPU is user requested then + * all subsequent CPUs in the core must be user + * requested too. If the first CPU isn't user + * requested then none of the others must be + * too. + */ + if (perf_cpu_map__has(user_requested_cpus, cpu) != has_first) { + perf_cpu_map__put(core_cpus); + perf_cpu_map__put(user_requested_cpus); + return false; + } + } + } + perf_cpu_map__put(core_cpus); + } + perf_cpu_map__put(user_requested_cpus); + return true; +} + static bool has_die_topology(void) { char filename[MAXPATHLEN]; diff --git a/tools/perf/util/cputopo.h b/tools/perf/util/cputopo.h index 469db775a13ca..969e5920a00e4 100644 --- a/tools/perf/util/cputopo.h +++ b/tools/perf/util/cputopo.h @@ -60,6 +60,9 @@ struct cpu_topology *cpu_topology__new(void); void cpu_topology__delete(struct cpu_topology *tp); /* Determine from the core list whether SMT was enabled. */ bool cpu_topology__smt_on(const struct cpu_topology *topology); +/* Are the sets of SMT siblings all enabled or all disabled in user_requested_cpus. 
*/ +bool cpu_topology__core_wide(const struct cpu_topology *topology, + const char *user_requested_cpu_list); struct numa_topology *numa_topology__new(void); void numa_topology__delete(struct numa_topology *tp); diff --git a/tools/perf/util/smt.c b/tools/perf/util/smt.c index ce90c4ee4138e..994e9e4182273 100644 --- a/tools/perf/util/smt.c +++ b/tools/perf/util/smt.c @@ -21,3 +21,17 @@ bool smt_on(const struct cpu_topology *topology) cached = true; return cached_result; } + +bool core_wide(bool system_wide, const char *user_requested_cpu_list, + const struct cpu_topology *topology) +{ + /* If not everything running on a core is being recorded then we can't use core_wide. */ + if (!system_wide) + return false; + + /* Cheap case that SMT is disabled and therefore we're inherently core_wide. */ + if (!smt_on(topology)) + return true; + + return cpu_topology__core_wide(topology, user_requested_cpu_list); +} diff --git a/tools/perf/util/smt.h b/tools/perf/util/smt.h index e26999c6b8d45..ae9095f2c38c6 100644 --- a/tools/perf/util/smt.h +++ b/tools/perf/util/smt.h @@ -7,4 +7,11 @@ struct cpu_topology; /* Returns true if SMT (aka hyperthreading) is enabled. */ bool smt_on(const struct cpu_topology *topology); +/* + * Returns true when system wide and all SMT threads for a core are in the + * user_requested_cpus map. + */ +bool core_wide(bool system_wide, const char *user_requested_cpu_list, + const struct cpu_topology *topology); + #endif /* __SMT_H */ -- GitLab From a4b8cfcabb1d90ec40ca5505f0dee71966d338cf Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Wed, 31 Aug 2022 10:49:24 -0700 Subject: [PATCH 1245/2223] perf stat: Delay metric parsing Having metric parsing as part of argument processing causes issues as flags like metric-no-group may be specified later. It also denies the opportunity to optimize the events on SMT systems where fewer events may be possible if we know the target is system-wide. 
Move metric parsing to after command line option parsing. Because of how stat runs this moves the parsing after record/report which fail to work with metrics currently anyway. Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20220831174926.579643-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-stat.c | 52 +++++++++++++++++++++++++---------- tools/perf/util/metricgroup.c | 3 +- tools/perf/util/metricgroup.h | 2 +- 3 files changed, 39 insertions(+), 18 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 0b4a62e4ff675..9f1074cc03d1a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -191,6 +191,7 @@ static bool append_file; static bool interval_count; static const char *output_name; static int output_fd; +static char *metrics; struct perf_stat { bool record; @@ -1148,14 +1149,23 @@ static int enable_metric_only(const struct option *opt __maybe_unused, return 0; } -static int parse_metric_groups(const struct option *opt, +static int append_metric_groups(const struct option *opt __maybe_unused, const char *str, int unset __maybe_unused) { - 
return metricgroup__parse_groups(opt, str, - stat_config.metric_no_group, - stat_config.metric_no_merge, - &stat_config.metric_events); + if (metrics) { + char *tmp; + + if (asprintf(&tmp, "%s,%s", metrics, str) < 0) + return -ENOMEM; + free(metrics); + metrics = tmp; + } else { + metrics = strdup(str); + if (!metrics) + return -ENOMEM; + } + return 0; } static int parse_control_option(const struct option *opt, @@ -1299,7 +1309,7 @@ static struct option stat_options[] = { "measure SMI cost"), OPT_CALLBACK('M', "metrics", &evsel_list, "metric/metric group list", "monitor specified metrics or metric groups (separated by ,)", - parse_metric_groups), + append_metric_groups), OPT_BOOLEAN_FLAG(0, "all-kernel", &stat_config.all_kernel, "Configure all used events to run in kernel space.", PARSE_OPT_EXCLUSIVE), @@ -1792,11 +1802,9 @@ static int add_default_attributes(void) * on an architecture test for such a metric name. */ if (metricgroup__has_metric("transaction")) { - struct option opt = { .value = &evsel_list }; - - return metricgroup__parse_groups(&opt, "transaction", + return metricgroup__parse_groups(evsel_list, "transaction", stat_config.metric_no_group, - stat_config.metric_no_merge, + stat_config.metric_no_merge, &stat_config.metric_events); } @@ -2183,6 +2191,8 @@ static int __cmd_report(int argc, const char **argv) input_name = "perf.data"; } + perf_stat__init_shadow_stats(); + perf_stat.data.path = input_name; perf_stat.data.mode = PERF_DATA_MODE_READ; @@ -2262,8 +2272,6 @@ int cmd_stat(int argc, const char **argv) argc = parse_options_subcommand(argc, argv, stat_options, stat_subcommands, (const char **) stat_usage, PARSE_OPT_STOP_AT_NON_OPTION); - perf_stat__collect_metric_expr(evsel_list); - perf_stat__init_shadow_stats(); if (stat_config.csv_sep) { stat_config.csv_output = true; @@ -2430,6 +2438,23 @@ int cmd_stat(int argc, const char **argv) target.system_wide = true; } + if ((stat_config.aggr_mode == AGGR_THREAD) && (target.system_wide)) + 
target.per_thread = true; + + /* + * Metric parsing needs to be delayed as metrics may optimize events + * knowing the target is system-wide. + */ + if (metrics) { + metricgroup__parse_groups(evsel_list, metrics, + stat_config.metric_no_group, + stat_config.metric_no_merge, + &stat_config.metric_events); + zfree(&metrics); + } + perf_stat__collect_metric_expr(evsel_list); + perf_stat__init_shadow_stats(); + if (add_default_attributes()) goto out; @@ -2449,9 +2474,6 @@ int cmd_stat(int argc, const char **argv) } } - if ((stat_config.aggr_mode == AGGR_THREAD) && (target.system_wide)) - target.per_thread = true; - if (evlist__fix_hybrid_cpus(evsel_list, target.cpu_list)) { pr_err("failed to use cpu list %s\n", target.cpu_list); goto out; diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index b144c3e352648..9151346a16ab5 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -1646,13 +1646,12 @@ out: return ret; } -int metricgroup__parse_groups(const struct option *opt, +int metricgroup__parse_groups(struct evlist *perf_evlist, const char *str, bool metric_no_group, bool metric_no_merge, struct rblist *metric_events) { - struct evlist *perf_evlist = *(struct evlist **)opt->value; const struct pmu_events_table *table = pmu_events_table__find(); if (!table) diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h index 016b3b1a289a6..af9ceadaec0fd 100644 --- a/tools/perf/util/metricgroup.h +++ b/tools/perf/util/metricgroup.h @@ -64,7 +64,7 @@ struct metric_expr { struct metric_event *metricgroup__lookup(struct rblist *metric_events, struct evsel *evsel, bool create); -int metricgroup__parse_groups(const struct option *opt, +int metricgroup__parse_groups(struct evlist *perf_evlist, const char *str, bool metric_no_group, bool metric_no_merge, -- GitLab From 1725e9cd32a0109b1257777a2a74f632ee45b068 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Wed, 31 Aug 2022 10:49:25 -0700 
Subject: [PATCH 1246/2223] perf metrics: Wire up core_wide Pass state necessary for core_wide into the expression parser. Add system_wide and user_requested_cpu_list to perf_stat_config to make it available at display time. evlist isn't used as the evlist__create_maps, that computes user_requested_cpus, needs the list of events which is generated by the metric. Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20220831174926.579643-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-stat.c | 14 ++++ tools/perf/util/expr.c | 13 +++- tools/perf/util/expr.h | 4 +- tools/perf/util/expr.l | 6 +- tools/perf/util/metricgroup.c | 125 ++++++++++++++++++++++++---------- tools/perf/util/metricgroup.h | 2 + tools/perf/util/stat-shadow.c | 11 ++- tools/perf/util/stat.h | 2 + 8 files changed, 134 insertions(+), 43 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 9f1074cc03d1a..e05fe72c1d870 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1805,6 +1805,8 @@ static int add_default_attributes(void) return metricgroup__parse_groups(evsel_list, 
"transaction", stat_config.metric_no_group, stat_config.metric_no_merge, + stat_config.user_requested_cpu_list, + stat_config.system_wide, &stat_config.metric_events); } @@ -2441,6 +2443,15 @@ int cmd_stat(int argc, const char **argv) if ((stat_config.aggr_mode == AGGR_THREAD) && (target.system_wide)) target.per_thread = true; + stat_config.system_wide = target.system_wide; + if (target.cpu_list) { + stat_config.user_requested_cpu_list = strdup(target.cpu_list); + if (!stat_config.user_requested_cpu_list) { + status = -ENOMEM; + goto out; + } + } + /* * Metric parsing needs to be delayed as metrics may optimize events * knowing the target is system-wide. @@ -2449,6 +2460,8 @@ int cmd_stat(int argc, const char **argv) metricgroup__parse_groups(evsel_list, metrics, stat_config.metric_no_group, stat_config.metric_no_merge, + stat_config.user_requested_cpu_list, + stat_config.system_wide, &stat_config.metric_events); zfree(&metrics); } @@ -2639,6 +2652,7 @@ out: iostat_release(evsel_list); zfree(&stat_config.walltime_run); + zfree(&stat_config.user_requested_cpu_list); if (smi_cost && smi_reset) sysfs__write_int(FREEZE_ON_SMI_PATH, 0); diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c index 8aa7dafa18b3a..c6827900f8d31 100644 --- a/tools/perf/util/expr.c +++ b/tools/perf/util/expr.c @@ -310,7 +310,9 @@ struct expr_parse_ctx *expr__ctx_new(void) free(ctx); return NULL; } + ctx->sctx.user_requested_cpu_list = NULL; ctx->sctx.runtime = 0; + ctx->sctx.system_wide = false; return ctx; } @@ -332,6 +334,10 @@ void expr__ctx_free(struct expr_parse_ctx *ctx) struct hashmap_entry *cur; size_t bkt; + if (!ctx) + return; + + free(ctx->sctx.user_requested_cpu_list); hashmap__for_each_entry(ctx->ids, cur, bkt) { free((char *)cur->key); free(cur->value); @@ -407,7 +413,7 @@ double arch_get_tsc_freq(void) } #endif -double expr__get_literal(const char *literal) +double expr__get_literal(const char *literal, const struct expr_scanner_ctx *ctx) { static struct cpu_topology 
*topology; double result = NAN; @@ -439,6 +445,11 @@ double expr__get_literal(const char *literal) result = smt_on(topology) ? 1.0 : 0.0; goto out; } + if (!strcmp("#core_wide", literal)) { + result = core_wide(ctx->system_wide, ctx->user_requested_cpu_list, topology) + ? 1.0 : 0.0; + goto out; + } if (!strcmp("#num_packages", literal)) { result = topology->package_cpus_lists; goto out; diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h index 07af3d438eb25..d6c1668dc1a08 100644 --- a/tools/perf/util/expr.h +++ b/tools/perf/util/expr.h @@ -11,7 +11,9 @@ struct metric_ref; struct expr_scanner_ctx { + char *user_requested_cpu_list; int runtime; + bool system_wide; }; struct expr_parse_ctx { @@ -55,6 +57,6 @@ int expr__find_ids(const char *expr, const char *one, double expr_id_data__value(const struct expr_id_data *data); double expr_id_data__source_count(const struct expr_id_data *data); -double expr__get_literal(const char *literal); +double expr__get_literal(const char *literal, const struct expr_scanner_ctx *ctx); #endif diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l index 4dc8edbfd9cea..0168a96373309 100644 --- a/tools/perf/util/expr.l +++ b/tools/perf/util/expr.l @@ -79,11 +79,11 @@ static int str(yyscan_t scanner, int token, int runtime) return token; } -static int literal(yyscan_t scanner) +static int literal(yyscan_t scanner, const struct expr_scanner_ctx *sctx) { YYSTYPE *yylval = expr_get_lval(scanner); - yylval->num = expr__get_literal(expr_get_text(scanner)); + yylval->num = expr__get_literal(expr_get_text(scanner), sctx); if (isnan(yylval->num)) return EXPR_ERROR; @@ -108,7 +108,7 @@ min { return MIN; } if { return IF; } else { return ELSE; } source_count { return SOURCE_COUNT; } -{literal} { return literal(yyscanner); } +{literal} { return literal(yyscanner, sctx); } {number} { return value(yyscanner); } {symbol} { return str(yyscanner, ID, sctx->runtime); } "|" { return '|'; } diff --git a/tools/perf/util/metricgroup.c 
b/tools/perf/util/metricgroup.c index 9151346a16ab5..b18da1a62a555 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -22,6 +22,7 @@ #include <linux/list_sort.h> #include <linux/string.h> #include <linux/zalloc.h> +#include <perf/cpumap.h> #include <subcmd/parse-options.h> #include <api/fs/fs.h> #include "util.h" @@ -189,10 +190,24 @@ static bool metricgroup__has_constraint(const struct pmu_event *pe) return false; } +static void metric__free(struct metric *m) +{ + if (!m) + return; + + free(m->metric_refs); + expr__ctx_free(m->pctx); + free((char *)m->modifier); + evlist__delete(m->evlist); + free(m); +} + static struct metric *metric__new(const struct pmu_event *pe, const char *modifier, bool metric_no_group, - int runtime) + int runtime, + const char *user_requested_cpu_list, + bool system_wide) { struct metric *m; @@ -201,35 +216,34 @@ static struct metric *metric__new(const struct pmu_event *pe, return NULL; m->pctx = expr__ctx_new(); - if (!m->pctx) { - free(m); - return NULL; - } + if (!m->pctx) + goto out_err; m->metric_name = pe->metric_name; - m->modifier = modifier ? 
strdup(modifier) : NULL; - if (modifier && !m->modifier) { - expr__ctx_free(m->pctx); - free(m); - return NULL; + m->modifier = NULL; + if (modifier) { + m->modifier = strdup(modifier); + if (!m->modifier) + goto out_err; } m->metric_expr = pe->metric_expr; m->metric_unit = pe->unit; + m->pctx->sctx.user_requested_cpu_list = NULL; + if (user_requested_cpu_list) { + m->pctx->sctx.user_requested_cpu_list = strdup(user_requested_cpu_list); + if (!m->pctx->sctx.user_requested_cpu_list) + goto out_err; + } m->pctx->sctx.runtime = runtime; + m->pctx->sctx.system_wide = system_wide; m->has_constraint = metric_no_group || metricgroup__has_constraint(pe); m->metric_refs = NULL; m->evlist = NULL; return m; -} - -static void metric__free(struct metric *m) -{ - free(m->metric_refs); - expr__ctx_free(m->pctx); - free((char *)m->modifier); - evlist__delete(m->evlist); - free(m); +out_err: + metric__free(m); + return NULL; } static bool contains_metric_id(struct evsel **metric_events, int num_events, @@ -874,6 +888,8 @@ struct metricgroup_add_iter_data { int *ret; bool *has_match; bool metric_no_group; + const char *user_requested_cpu_list; + bool system_wide; struct metric *root_metric; const struct visited_metric *visited; const struct pmu_events_table *table; @@ -887,6 +903,8 @@ static int add_metric(struct list_head *metric_list, const struct pmu_event *pe, const char *modifier, bool metric_no_group, + const char *user_requested_cpu_list, + bool system_wide, struct metric *root_metric, const struct visited_metric *visited, const struct pmu_events_table *table); @@ -899,6 +917,8 @@ static int add_metric(struct list_head *metric_list, * @metric_no_group: Should events written to events be grouped "{}" or * global. Grouping is the default but due to multiplexing the * user may override. + * @user_requested_cpu_list: Command line specified CPUs to record on. + * @system_wide: Are events for all processes recorded. 
* @root_metric: Metrics may reference other metrics to form a tree. In this * case the root_metric holds all the IDs and a list of referenced * metrics. When adding a root this argument is NULL. @@ -910,6 +930,8 @@ static int add_metric(struct list_head *metric_list, static int resolve_metric(struct list_head *metric_list, const char *modifier, bool metric_no_group, + const char *user_requested_cpu_list, + bool system_wide, struct metric *root_metric, const struct visited_metric *visited, const struct pmu_events_table *table) @@ -956,7 +978,8 @@ static int resolve_metric(struct list_head *metric_list, */ for (i = 0; i < pending_cnt; i++) { ret = add_metric(metric_list, &pending[i].pe, modifier, metric_no_group, - root_metric, visited, table); + user_requested_cpu_list, system_wide, root_metric, visited, + table); if (ret) break; } @@ -974,6 +997,8 @@ static int resolve_metric(struct list_head *metric_list, * global. Grouping is the default but due to multiplexing the * user may override. * @runtime: A special argument for the parser only known at runtime. + * @user_requested_cpu_list: Command line specified CPUs to record on. + * @system_wide: Are events for all processes recorded. * @root_metric: Metrics may reference other metrics to form a tree. In this * case the root_metric holds all the IDs and a list of referenced * metrics. When adding a root this argument is NULL. @@ -987,6 +1012,8 @@ static int __add_metric(struct list_head *metric_list, const char *modifier, bool metric_no_group, int runtime, + const char *user_requested_cpu_list, + bool system_wide, struct metric *root_metric, const struct visited_metric *visited, const struct pmu_events_table *table) @@ -1011,7 +1038,8 @@ static int __add_metric(struct list_head *metric_list, * This metric is the root of a tree and may reference other * metrics that are added recursively. 
*/ - root_metric = metric__new(pe, modifier, metric_no_group, runtime); + root_metric = metric__new(pe, modifier, metric_no_group, runtime, + user_requested_cpu_list, system_wide); if (!root_metric) return -ENOMEM; @@ -1060,8 +1088,9 @@ static int __add_metric(struct list_head *metric_list, ret = -EINVAL; } else { /* Resolve referenced metrics. */ - ret = resolve_metric(metric_list, modifier, metric_no_group, root_metric, - &visited_node, table); + ret = resolve_metric(metric_list, modifier, metric_no_group, + user_requested_cpu_list, system_wide, + root_metric, &visited_node, table); } if (ret) { @@ -1109,6 +1138,8 @@ static int add_metric(struct list_head *metric_list, const struct pmu_event *pe, const char *modifier, bool metric_no_group, + const char *user_requested_cpu_list, + bool system_wide, struct metric *root_metric, const struct visited_metric *visited, const struct pmu_events_table *table) @@ -1119,7 +1150,8 @@ static int add_metric(struct list_head *metric_list, if (!strstr(pe->metric_expr, "?")) { ret = __add_metric(metric_list, pe, modifier, metric_no_group, 0, - root_metric, visited, table); + user_requested_cpu_list, system_wide, root_metric, + visited, table); } else { int j, count; @@ -1132,7 +1164,8 @@ static int add_metric(struct list_head *metric_list, for (j = 0; j < count && !ret; j++) ret = __add_metric(metric_list, pe, modifier, metric_no_group, j, - root_metric, visited, table); + user_requested_cpu_list, system_wide, + root_metric, visited, table); } return ret; @@ -1149,6 +1182,7 @@ static int metricgroup__add_metric_sys_event_iter(const struct pmu_event *pe, return 0; ret = add_metric(d->metric_list, pe, d->modifier, d->metric_no_group, + d->user_requested_cpu_list, d->system_wide, d->root_metric, d->visited, d->table); if (ret) goto out; @@ -1191,7 +1225,9 @@ struct metricgroup__add_metric_data { struct list_head *list; const char *metric_name; const char *modifier; + const char *user_requested_cpu_list; bool metric_no_group; + bool 
system_wide; bool has_match; }; @@ -1208,8 +1244,8 @@ static int metricgroup__add_metric_callback(const struct pmu_event *pe, data->has_match = true; ret = add_metric(data->list, pe, data->modifier, data->metric_no_group, - /*root_metric=*/NULL, - /*visited_metrics=*/NULL, table); + data->user_requested_cpu_list, data->system_wide, + /*root_metric=*/NULL, /*visited_metrics=*/NULL, table); } return ret; } @@ -1223,12 +1259,16 @@ static int metricgroup__add_metric_callback(const struct pmu_event *pe, * @metric_no_group: Should events written to events be grouped "{}" or * global. Grouping is the default but due to multiplexing the * user may override. + * @user_requested_cpu_list: Command line specified CPUs to record on. + * @system_wide: Are events for all processes recorded. * @metric_list: The list that the metric or metric group are added to. * @table: The table that is searched for metrics, most commonly the table for the * architecture perf is running upon. */ static int metricgroup__add_metric(const char *metric_name, const char *modifier, bool metric_no_group, + const char *user_requested_cpu_list, + bool system_wide, struct list_head *metric_list, const struct pmu_events_table *table) { @@ -1242,6 +1282,8 @@ static int metricgroup__add_metric(const char *metric_name, const char *modifier .metric_name = metric_name, .modifier = modifier, .metric_no_group = metric_no_group, + .user_requested_cpu_list = user_requested_cpu_list, + .system_wide = system_wide, .has_match = false, }; /* @@ -1263,6 +1305,8 @@ static int metricgroup__add_metric(const char *metric_name, const char *modifier .metric_name = metric_name, .modifier = modifier, .metric_no_group = metric_no_group, + .user_requested_cpu_list = user_requested_cpu_list, + .system_wide = system_wide, .has_match = &has_match, .ret = &ret, .table = table, @@ -1293,12 +1337,15 @@ out: * @metric_no_group: Should events written to events be grouped "{}" or * global. 
Grouping is the default but due to multiplexing the * user may override. + * @user_requested_cpu_list: Command line specified CPUs to record on. + * @system_wide: Are events for all processes recorded. * @metric_list: The list that metrics are added to. * @table: The table that is searched for metrics, most commonly the table for the * architecture perf is running upon. */ static int metricgroup__add_metric_list(const char *list, bool metric_no_group, - struct list_head *metric_list, + const char *user_requested_cpu_list, + bool system_wide, struct list_head *metric_list, const struct pmu_events_table *table) { char *list_itr, *list_copy, *metric_name, *modifier; @@ -1315,8 +1362,8 @@ static int metricgroup__add_metric_list(const char *list, bool metric_no_group, *modifier++ = '\0'; ret = metricgroup__add_metric(metric_name, modifier, - metric_no_group, metric_list, - table); + metric_no_group, user_requested_cpu_list, + system_wide, metric_list, table); if (ret == -EINVAL) pr_err("Cannot find metric or group `%s'\n", metric_name); @@ -1505,6 +1552,8 @@ err_out: static int parse_groups(struct evlist *perf_evlist, const char *str, bool metric_no_group, bool metric_no_merge, + const char *user_requested_cpu_list, + bool system_wide, struct perf_pmu *fake_pmu, struct rblist *metric_events_list, const struct pmu_events_table *table) @@ -1518,7 +1567,8 @@ static int parse_groups(struct evlist *perf_evlist, const char *str, if (metric_events_list->nr_entries == 0) metricgroup__rblist_init(metric_events_list); ret = metricgroup__add_metric_list(str, metric_no_group, - &metric_list, table); + user_requested_cpu_list, + system_wide, &metric_list, table); if (ret) goto out; @@ -1650,6 +1700,8 @@ int metricgroup__parse_groups(struct evlist *perf_evlist, const char *str, bool metric_no_group, bool metric_no_merge, + const char *user_requested_cpu_list, + bool system_wide, struct rblist *metric_events) { const struct pmu_events_table *table = pmu_events_table__find(); @@ 
-1657,8 +1709,9 @@ int metricgroup__parse_groups(struct evlist *perf_evlist, if (!table) return -EINVAL; - return parse_groups(perf_evlist, str, metric_no_group, - metric_no_merge, NULL, metric_events, table); + return parse_groups(perf_evlist, str, metric_no_group, metric_no_merge, + user_requested_cpu_list, system_wide, + /*fake_pmu=*/NULL, metric_events, table); } int metricgroup__parse_groups_test(struct evlist *evlist, @@ -1668,8 +1721,10 @@ int metricgroup__parse_groups_test(struct evlist *evlist, bool metric_no_merge, struct rblist *metric_events) { - return parse_groups(evlist, str, metric_no_group, - metric_no_merge, &perf_pmu__fake, metric_events, table); + return parse_groups(evlist, str, metric_no_group, metric_no_merge, + /*user_requested_cpu_list=*/NULL, + /*system_wide=*/false, + &perf_pmu__fake, metric_events, table); } static int metricgroup__has_metric_callback(const struct pmu_event *pe, diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h index af9ceadaec0fd..732d3a0d33341 100644 --- a/tools/perf/util/metricgroup.h +++ b/tools/perf/util/metricgroup.h @@ -68,6 +68,8 @@ int metricgroup__parse_groups(struct evlist *perf_evlist, const char *str, bool metric_no_group, bool metric_no_merge, + const char *user_requested_cpu_list, + bool system_wide, struct rblist *metric_events); int metricgroup__parse_groups_test(struct evlist *evlist, const struct pmu_events_table *table, diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 815af948abb93..9e1eddeff21bd 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -911,7 +911,10 @@ static void generic_metric(struct perf_stat_config *config, if (!pctx) return; + if (config->user_requested_cpu_list) + pctx->sctx.user_requested_cpu_list = strdup(config->user_requested_cpu_list); pctx->sctx.runtime = runtime; + pctx->sctx.system_wide = config->system_wide; i = prepare_metric(metric_events, metric_refs, pctx, cpu_map_idx, st); if (i < 0) 
{ expr__ctx_free(pctx); @@ -1304,7 +1307,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, core_bound * 100.); } else if (evsel->metric_expr) { generic_metric(config, evsel->metric_expr, evsel->metric_events, NULL, - evsel->name, evsel->metric_name, NULL, 1, cpu_map_idx, out, st); + evsel->name, evsel->metric_name, NULL, 1, + cpu_map_idx, out, st); } else if (runtime_stat_n(st, STAT_NSECS, cpu_map_idx, &rsd) != 0) { char unit = ' '; char unit_buf[10] = "/sec"; @@ -1329,8 +1333,9 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, if (num++ > 0) out->new_line(config, ctxp); generic_metric(config, mexp->metric_expr, mexp->metric_events, - mexp->metric_refs, evsel->name, mexp->metric_name, - mexp->metric_unit, mexp->runtime, cpu_map_idx, out, st); + mexp->metric_refs, evsel->name, mexp->metric_name, + mexp->metric_unit, mexp->runtime, + cpu_map_idx, out, st); } } if (num == 0) diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 668250022f8ca..72713b344b792 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -141,6 +141,8 @@ struct perf_stat_config { bool stop_read_counter; bool quiet; bool iostat_run; + char *user_requested_cpu_list; + bool system_wide; FILE *output; unsigned int interval; unsigned int timeout; -- GitLab From f0c4b97a292741270d764a13df3969f7628382b3 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Wed, 31 Aug 2022 10:49:26 -0700 Subject: [PATCH 1247/2223] perf test: Add basic core_wide expression test Add basic test for coverage, similar to #smt_on. 
Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20220831174926.579643-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/expr.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c index db736ed49556f..8bd7197668140 100644 --- a/tools/perf/tests/expr.c +++ b/tools/perf/tests/expr.c @@ -158,6 +158,9 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u { struct cpu_topology *topology = cpu_topology__new(); bool smton = smt_on(topology); + bool corewide = core_wide(/*system_wide=*/false, + /*user_requested_cpus=*/false, + topology); cpu_topology__delete(topology); expr__ctx_clear(ctx); @@ -168,6 +171,16 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, smton ? 
"EVENT1" : "EVENT2", (void **)&val_ptr)); + + expr__ctx_clear(ctx); + TEST_ASSERT_VAL("find ids", + expr__find_ids("EVENT1 if #core_wide else EVENT2", + NULL, ctx) == 0); + TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 1); + TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, + corewide ? "EVENT1" : "EVENT2", + (void **)&val_ptr)); + } /* The expression is a constant 1.0 without needing to evaluate EVENT1. */ expr__ctx_clear(ctx); -- GitLab From 637522ce97b49550bac5a053175c9c9562e2c6b5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Sun, 11 Sep 2022 22:53:11 -0700 Subject: [PATCH 1248/2223] perf lock contention: Factor out get_symbol_name_offset() It's to convert addr to symbol+offset. Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Song Liu <songliubraving@fb.com> Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20220912055314.744552-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-lock.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 52a6a10a610cb..eaba6018da69a 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -899,6 +899,23 @@ bool is_lock_function(struct machine *machine, u64 addr) return false; } +static int get_symbol_name_offset(struct map *map, struct symbol *sym, u64 ip, + char *buf, int size) +{ + u64 offset; + + if (map == NULL || sym == NULL) { + buf[0] = '\0'; + return 0; + } + + offset = map->map_ip(map, ip) - sym->start; + + if (offset) + return scnprintf(buf, size, "%s+%#lx", sym->name, offset); + else + return strlcpy(buf, sym->name, size); +} static int lock_contention_caller(struct evsel *evsel, struct perf_sample *sample, char 
*buf, int size) { @@ -941,15 +958,8 @@ static int lock_contention_caller(struct evsel *evsel, struct perf_sample *sampl sym = node->ms.sym; if (sym && !is_lock_function(machine, node->ip)) { - struct map *map = node->ms.map; - u64 offset; - - offset = map->map_ip(map, node->ip) - sym->start; - - if (offset) - scnprintf(buf, size, "%s+%#lx", sym->name, offset); - else - strlcpy(buf, sym->name, size); + get_symbol_name_offset(node->ms.map, sym, node->ip, + buf, size); return 0; } -- GitLab From a6eaf966bce9a30ccd0969fed195e051b8904983 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Sun, 11 Sep 2022 22:53:12 -0700 Subject: [PATCH 1249/2223] perf lock contention: Show full callstack with -v option Currently it shows a caller function for each entry, but users need to see the full call stacks sometimes. Use -v/--verbose option to do that. # perf lock con -a -b -v sleep 3 Looking at the vmlinux_path (8 entries long) symsrc__init: cannot get elf header. Using /proc/kcore for kernel data Using /proc/kallsyms for symbols contended total wait max wait avg wait type caller 1 10.74 us 10.74 us 10.74 us spinlock __bpf_trace_contention_begin+0xb 0xffffffffc03b5c47 bpf_prog_bf07ae9e2cbd02c5_contention_begin+0x117 0xffffffffc03b5c47 bpf_prog_bf07ae9e2cbd02c5_contention_begin+0x117 0xffffffffbb8b8e75 bpf_trace_run2+0x35 0xffffffffbb7eab9b __bpf_trace_contention_begin+0xb 0xffffffffbb7ebe75 queued_spin_lock_slowpath+0x1f5 0xffffffffbc1c26ff _raw_spin_lock+0x1f 0xffffffffbb841015 tick_do_update_jiffies64+0x25 0xffffffffbb8409ee tick_irq_enter+0x9e 1 7.70 us 7.70 us 7.70 us spinlock __bpf_trace_contention_begin+0xb 0xffffffffc03b5c47 bpf_prog_bf07ae9e2cbd02c5_contention_begin+0x117 0xffffffffc03b5c47 bpf_prog_bf07ae9e2cbd02c5_contention_begin+0x117 0xffffffffbb8b8e75 bpf_trace_run2+0x35 0xffffffffbb7eab9b __bpf_trace_contention_begin+0xb 0xffffffffbb7ebe75 queued_spin_lock_slowpath+0x1f5 0xffffffffbc1c26ff _raw_spin_lock+0x1f 0xffffffffbb7bc27e 
raw_spin_rq_lock_nested+0xe 0xffffffffbb7cef9c load_balance+0x66c Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Song Liu <songliubraving@fb.com> Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20220912055314.744552-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-lock.c | 51 ++++++++++++++++++++++++--- tools/perf/util/bpf_lock_contention.c | 9 +++++ tools/perf/util/lock-contention.h | 1 + 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index eaba6018da69a..3715390493583 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -1014,6 +1014,27 @@ next: return hash; } +static u64 *get_callstack(struct perf_sample *sample, int max_stack) +{ + u64 *callstack; + u64 i; + int c; + + callstack = calloc(max_stack, sizeof(*callstack)); + if (callstack == NULL) + return NULL; + + for (i = 0, c = 0; i < sample->callchain->nr && c < max_stack; i++) { + u64 ip = sample->callchain->ips[i]; + + if (ip >= PERF_CONTEXT_MAX) + continue; + + callstack[c++] = ip; + } + return callstack; +} + static int report_lock_contention_begin_event(struct evsel *evsel, struct perf_sample *sample) { @@ -1040,6 +1061,12 @@ static int report_lock_contention_begin_event(struct evsel *evsel, ls = lock_stat_findnew(key, caller, flags); if (!ls) return -ENOMEM; + + if (aggr_mode == LOCK_AGGR_CALLER && verbose) { + ls->callstack = get_callstack(sample, CONTENTION_STACK_DEPTH); + if (ls->callstack == NULL) + return -ENOMEM; + } } ts = thread_stat_findnew(sample->tid); @@ -1443,7 +1470,7 @@ static void sort_contention_result(void) sort_result(); } -static void print_contention_result(void) +static void print_contention_result(struct lock_contention *con) { struct 
lock_stat *st; struct lock_key *key; @@ -1482,6 +1509,22 @@ static void print_contention_result(void) } pr_info(" %10s %s\n", get_type_str(st), st->name); + if (verbose) { + struct map *kmap; + struct symbol *sym; + char buf[128]; + u64 ip; + + for (int i = 0; i < CONTENTION_STACK_DEPTH; i++) { + if (!st->callstack || !st->callstack[i]) + break; + + ip = st->callstack[i]; + sym = machine__find_kernel_symbol(con->machine, ip, &kmap); + get_symbol_name_offset(kmap, sym, ip, buf, sizeof(buf)); + pr_info("\t\t\t%#lx %s\n", (unsigned long)ip, buf); + } + } } print_bad_events(bad, total); @@ -1597,6 +1640,8 @@ static int __cmd_contention(int argc, const char **argv) return PTR_ERR(session); } + con.machine = &session->machines.host; + /* for lock function check */ symbol_conf.sort_by_name = true; symbol__init(&session->header.env); @@ -1615,8 +1660,6 @@ static int __cmd_contention(int argc, const char **argv) signal(SIGCHLD, sighandler); signal(SIGTERM, sighandler); - con.machine = &session->machines.host; - con.evlist = evlist__new(); if (con.evlist == NULL) { err = -ENOMEM; @@ -1688,7 +1731,7 @@ static int __cmd_contention(int argc, const char **argv) setup_pager(); sort_contention_result(); - print_contention_result(); + print_contention_result(&con); out_delete: evlist__delete(con.evlist); diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c index c591a66733ef5..6545bee65347a 100644 --- a/tools/perf/util/bpf_lock_contention.c +++ b/tools/perf/util/bpf_lock_contention.c @@ -8,6 +8,7 @@ #include "util/thread_map.h" #include "util/lock-contention.h" #include <linux/zalloc.h> +#include <linux/string.h> #include <bpf/bpf.h> #include "bpf_skel/lock_contention.skel.h" @@ -171,6 +172,14 @@ int lock_contention_read(struct lock_contention *con) return -1; } + if (verbose) { + st->callstack = memdup(stack_trace, sizeof(stack_trace)); + if (st->callstack == NULL) { + free(st); + return -1; + } + } + hlist_add_head(&st->hash_entry, 
con->result); prev_key = key; } diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h index 2146efc33396e..bdb6e2a61e5b9 100644 --- a/tools/perf/util/lock-contention.h +++ b/tools/perf/util/lock-contention.h @@ -11,6 +11,7 @@ struct lock_stat { u64 addr; /* address of lockdep_map, used as ID */ char *name; /* for strcpy(), we cannot use const */ + u64 *callstack; unsigned int nr_acquire; unsigned int nr_acquired; -- GitLab From 96532a83ee8e30035e584b046c859adb001a3b8d Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Sun, 11 Sep 2022 22:53:13 -0700 Subject: [PATCH 1250/2223] perf lock contention: Allow to change stack depth and skip It needs stack traces to find callers of locks. To minimize the performance overhead it only collects up to 8 entries for each stack trace. And it skips first 3 entries as they came from BPF, tracepoint and lock functions which are not interesting to most users. But it turned out that those numbers are different in some configuration. Using fixed number can result in non meaningful caller names. Let's make them adjustable with --max-stack and --stack-skip options.
On my setup, the default output is like below: # /perf lock con -ab -F contended,wait_total sleep 3 contended total wait type caller 28 4.55 ms rwlock:W __bpf_trace_contention_begin+0xb 33 1.67 ms rwlock:W __bpf_trace_contention_begin+0xb 12 580.28 us spinlock __bpf_trace_contention_begin+0xb 60 240.54 us rwsem:R __bpf_trace_contention_begin+0xb 27 64.45 us spinlock __bpf_trace_contention_begin+0xb If I change the stack skip to 5, the result will be like: # perf lock con -ab -F contended,wait_total --stack-skip 5 sleep 3 contended total wait type caller 32 715.45 us spinlock folio_lruvec_lock_irqsave+0x61 26 550.22 us spinlock folio_lruvec_lock_irqsave+0x61 15 486.93 us rwsem:R mmap_read_lock+0x13 12 139.66 us rwsem:W vm_mmap_pgoff+0x93 1 7.04 us spinlock tick_do_update_jiffies64+0x25 Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Song Liu <songliubraving@fb.com> Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20220912055314.744552-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/Documentation/perf-lock.txt | 6 ++++++ tools/perf/builtin-lock.c | 22 ++++++++++++++++------ tools/perf/util/bpf_lock_contention.c | 7 ++++--- tools/perf/util/lock-contention.h | 2 ++ 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt index 193c5d8b8db92..5f2dc634258e9 100644 --- a/tools/perf/Documentation/perf-lock.txt +++ b/tools/perf/Documentation/perf-lock.txt @@ -148,6 +148,12 @@ CONTENTION OPTIONS --map-nr-entries:: Maximum number of BPF map entries (default: 10240). +--max-stack:: + Maximum stack depth when collecting lock contention (default: 8). + +--stack-skip + Number of stack depth to skip when finding a lock caller (default: 3). 
+ SEE ALSO -------- diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 3715390493583..25d75fa09b906 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -56,6 +56,8 @@ static bool combine_locks; static bool show_thread_stats; static bool use_bpf; static unsigned long bpf_map_entries = 10240; +static int max_stack_depth = CONTENTION_STACK_DEPTH; +static int stack_skip = CONTENTION_STACK_SKIP; static enum { LOCK_AGGR_ADDR, @@ -936,7 +938,7 @@ static int lock_contention_caller(struct evsel *evsel, struct perf_sample *sampl /* use caller function name from the callchain */ ret = thread__resolve_callchain(thread, cursor, evsel, sample, - NULL, NULL, CONTENTION_STACK_DEPTH); + NULL, NULL, max_stack_depth); if (ret != 0) { thread__put(thread); return -1; @@ -953,7 +955,7 @@ static int lock_contention_caller(struct evsel *evsel, struct perf_sample *sampl break; /* skip first few entries - for lock functions */ - if (++skip <= CONTENTION_STACK_SKIP) + if (++skip <= stack_skip) goto next; sym = node->ms.sym; @@ -984,7 +986,7 @@ static u64 callchain_id(struct evsel *evsel, struct perf_sample *sample) /* use caller function name from the callchain */ ret = thread__resolve_callchain(thread, cursor, evsel, sample, - NULL, NULL, CONTENTION_STACK_DEPTH); + NULL, NULL, max_stack_depth); thread__put(thread); if (ret != 0) @@ -1000,7 +1002,7 @@ static u64 callchain_id(struct evsel *evsel, struct perf_sample *sample) break; /* skip first few entries - for lock functions */ - if (++skip <= CONTENTION_STACK_SKIP) + if (++skip <= stack_skip) goto next; if (node->ms.sym && is_lock_function(machine, node->ip)) @@ -1063,7 +1065,7 @@ static int report_lock_contention_begin_event(struct evsel *evsel, return -ENOMEM; if (aggr_mode == LOCK_AGGR_CALLER && verbose) { - ls->callstack = get_callstack(sample, CONTENTION_STACK_DEPTH); + ls->callstack = get_callstack(sample, max_stack_depth); if (ls->callstack == NULL) return -ENOMEM; } @@ -1515,7 +1517,7 @@ 
static void print_contention_result(struct lock_contention *con) char buf[128]; u64 ip; - for (int i = 0; i < CONTENTION_STACK_DEPTH; i++) { + for (int i = 0; i < max_stack_depth; i++) { if (!st->callstack || !st->callstack[i]) break; @@ -1632,6 +1634,8 @@ static int __cmd_contention(int argc, const char **argv) .target = &target, .result = &lockhash_table[0], .map_nr_entries = bpf_map_entries, + .max_stack = max_stack_depth, + .stack_skip = stack_skip, }; session = perf_session__new(use_bpf ? NULL : &data, &eops); @@ -1895,6 +1899,12 @@ int cmd_lock(int argc, const char **argv) "Trace on existing thread id (exclusive to --pid)"), OPT_CALLBACK(0, "map-nr-entries", &bpf_map_entries, "num", "Max number of BPF map entries", parse_map_entry), + OPT_INTEGER(0, "max-stack", &max_stack_depth, + "Set the maximum stack depth when collecting lock contention, " + "Default: " __stringify(CONTENTION_STACK_DEPTH)), + OPT_INTEGER(0, "stack-skip", &stack_skip, + "Set the number of stack depth to skip when finding a lock caller, " + "Default: " __stringify(CONTENTION_STACK_SKIP)), OPT_PARENT(lock_options) }; diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c index 6545bee65347a..ef5323c78ffcc 100644 --- a/tools/perf/util/bpf_lock_contention.c +++ b/tools/perf/util/bpf_lock_contention.c @@ -41,6 +41,7 @@ int lock_contention_prepare(struct lock_contention *con) return -1; } + bpf_map__set_value_size(skel->maps.stacks, con->max_stack * sizeof(u64)); bpf_map__set_max_entries(skel->maps.stacks, con->map_nr_entries); bpf_map__set_max_entries(skel->maps.lock_stat, con->map_nr_entries); @@ -115,7 +116,7 @@ int lock_contention_read(struct lock_contention *con) struct lock_contention_data data; struct lock_stat *st; struct machine *machine = con->machine; - u64 stack_trace[CONTENTION_STACK_DEPTH]; + u64 stack_trace[con->max_stack]; fd = bpf_map__fd(skel->maps.lock_stat); stack = bpf_map__fd(skel->maps.stacks); @@ -146,9 +147,9 @@ int 
lock_contention_read(struct lock_contention *con) bpf_map_lookup_elem(stack, &key, stack_trace); /* skip BPF + lock internal functions */ - idx = CONTENTION_STACK_SKIP; + idx = con->stack_skip; while (is_lock_function(machine, stack_trace[idx]) && - idx < CONTENTION_STACK_DEPTH - 1) + idx < con->max_stack - 1) idx++; st->addr = stack_trace[idx]; diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h index bdb6e2a61e5b9..67db311fc9dfc 100644 --- a/tools/perf/util/lock-contention.h +++ b/tools/perf/util/lock-contention.h @@ -115,6 +115,8 @@ struct lock_contention { struct hlist_head *result; unsigned long map_nr_entries; unsigned long lost; + int max_stack; + int stack_skip; }; #ifdef HAVE_BPF_SKEL -- GitLab From c1da8dd5c11dabd50b1578c6b43d73c7bbc28963 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Sun, 11 Sep 2022 22:53:14 -0700 Subject: [PATCH 1251/2223] perf lock contention: Skip stack trace from BPF Currently it collects stack traces to max size then skip entries. Because we don't have control how to skip perf callchains. But BPF can do it with bpf_get_stackid() with a flag. Say we have max-stack=4 and stack-skip=2, we get these stack traces. Before: After: .---> +---+ <--. .---> +---+ <--. 
| | | | | | | | | +---+ usable | +---+ | max | | | max | | | stack +---+ <--' stack +---+ usable | | X | | | | | | +---+ skip | +---+ | | | X | | | | | `---> +---+ `---> +---+ <--' <=== collection | X | +---+ skip | X | +---+ Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Song Liu <songliubraving@fb.com> Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20220912055314.744552-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/bpf_lock_contention.c | 7 ++++--- tools/perf/util/bpf_skel/lock_contention.bpf.c | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c index ef5323c78ffcc..efe5b9968e774 100644 --- a/tools/perf/util/bpf_lock_contention.c +++ b/tools/perf/util/bpf_lock_contention.c @@ -93,6 +93,8 @@ int lock_contention_prepare(struct lock_contention *con) bpf_map_update_elem(fd, &pid, &val, BPF_ANY); } + skel->bss->stack_skip = con->stack_skip; + lock_contention_bpf__attach(skel); return 0; } @@ -127,7 +129,7 @@ int lock_contention_read(struct lock_contention *con) while (!bpf_map_get_next_key(fd, &prev_key, &key)) { struct map *kmap; struct symbol *sym; - int idx; + int idx = 0; bpf_map_lookup_elem(fd, &key, &data); st = zalloc(sizeof(*st)); @@ -146,8 +148,7 @@ int lock_contention_read(struct lock_contention *con) bpf_map_lookup_elem(stack, &key, stack_trace); - /* skip BPF + lock internal functions */ - idx = con->stack_skip; + /* skip lock internal functions */ while (is_lock_function(machine, stack_trace[idx]) && idx < con->max_stack - 1) idx++; diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c index 9e8b94eb63204..e107d71f0f1ac 100644 --- 
a/tools/perf/util/bpf_skel/lock_contention.bpf.c +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -72,6 +72,7 @@ struct { int enabled; int has_cpu; int has_task; +int stack_skip; /* error stat */ unsigned long lost; @@ -117,7 +118,7 @@ int contention_begin(u64 *ctx) pelem->timestamp = bpf_ktime_get_ns(); pelem->lock = (__u64)ctx[0]; pelem->flags = (__u32)ctx[1]; - pelem->stack_id = bpf_get_stackid(ctx, &stacks, BPF_F_FAST_STACK_CMP); + pelem->stack_id = bpf_get_stackid(ctx, &stacks, BPF_F_FAST_STACK_CMP | stack_skip); if (pelem->stack_id < 0) lost++; -- GitLab From e8a6430ff605734ab5a7da42097f6b786a78ba2b Mon Sep 17 00:00:00 2001 From: Shang XiaoJing <shangxiaojing@huawei.com> Date: Thu, 22 Sep 2022 22:14:35 +0800 Subject: [PATCH 1252/2223] perf genelf: Fix error code in jit_write_elf() The error code is set to -1 at the beginning of jit_write_elf(), but it is assigned by jit_add_eh_frame_info() in the middle, hence the following error can only return the error code of jit_add_eh_frame_info(). Reset the error code to the default value after being assigned by jit_add_eh_frame_info(). 
Fixes: 086f9f3d7897d808 ("perf jit: Generate .eh_frame/.eh_frame_hdr in DSO") Signed-off-by: Shang XiaoJing <shangxiaojing@huawei.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stefano Sanfilippo <ssanfilippo@chromium.org> Link: https://lore.kernel.org/r/20220922141438.22487-2-shangxiaojing@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/genelf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/genelf.c b/tools/perf/util/genelf.c index d81b54563e962..fefc72066c4e8 100644 --- a/tools/perf/util/genelf.c +++ b/tools/perf/util/genelf.c @@ -345,6 +345,7 @@ jit_write_elf(int fd, uint64_t load_addr, const char *sym, eh_frame_base_offset); if (retval) goto error; + retval = -1; } /* -- GitLab From cdd3b15d6871e7b164e3dd82514dfcc4daa7559b Mon Sep 17 00:00:00 2001 From: Shang XiaoJing <shangxiaojing@huawei.com> Date: Thu, 22 Sep 2022 22:14:36 +0800 Subject: [PATCH 1253/2223] perf stat: Merge cases in process_evlist As two cases in process_evlist have the same behavior, make the first fall through to the second. Committer notes: Added __fallthrough, the kernel has "fallthrough", we need to make tools/ use it.
Signed-off-by: Shang XiaoJing <shangxiaojing@huawei.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220922141438.22487-3-shangxiaojing@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-stat.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e05fe72c1d870..7b8e901bce101 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -662,9 +662,7 @@ static void process_evlist(struct evlist *evlist, unsigned int interval) if (evlist__ctlfd_process(evlist, &cmd) > 0) { switch (cmd) { case EVLIST_CTL_CMD_ENABLE: - if (interval) - process_interval(); - break; + __fallthrough; case EVLIST_CTL_CMD_DISABLE: if (interval) process_interval(); -- GitLab From dc64641c8f917f20ad5cf678de3b77ebc8fb3a9a Mon Sep 17 00:00:00 2001 From: Shang XiaoJing <shangxiaojing@huawei.com> Date: Thu, 22 Sep 2022 22:14:37 +0800 Subject: [PATCH 1254/2223] perf top: Fix error code in cmd_top() There are three error paths which return success: 1. Propagate the errno from evlist__create_maps() if it failed. 2. Return -EINVAL if top.sb_evlist is NULL. 3. Return -EINVAL if evlist__add_bpf_sb_event() failed. 
Signed-off-by: Shang XiaoJing <shangxiaojing@huawei.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220922141438.22487-4-shangxiaojing@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-top.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index e89208b4ad4bc..4b3ff7687236e 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1707,6 +1707,7 @@ int cmd_top(int argc, const char **argv) if (evlist__create_maps(top.evlist, target) < 0) { ui__error("Couldn't create thread/CPU maps: %s\n", errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf))); + status = -errno; goto out_delete_evlist; } @@ -1759,11 +1760,13 @@ int cmd_top(int argc, const char **argv) if (top.sb_evlist == NULL) { pr_err("Couldn't create side band evlist.\n."); + status = -EINVAL; goto out_delete_evlist; } if (evlist__add_bpf_sb_event(top.sb_evlist, &perf_env)) { pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n."); + status = -EINVAL; goto out_delete_evlist; } } -- GitLab From d031a00a29b2b2a6ad99c41fadb1ea3c0dc5046c Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Fri, 9 Sep 2022 16:50:24 -0700 Subject: [PATCH 1255/2223] perf record: Fix a segfault in record__read_lost_samples() When it fails to open events record__open() returns without setting the session->evlist. Then it gets a segfault in the function trying to read lost sample counts. You can easily reproduce it as a normal user like: $ perf record -p 1 true ... perf: Segmentation fault ... Skip the function if it has no evlist. And add more protection for evsels which are not properly initialized. 
Fixes: a49aa8a54e861af1 ("perf record: Read and inject LOST_SAMPLES events") Signed-off-by: Namhyung Kim <namhyung@kernel.org> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Acked-by: Leo Yan <leo.yan@linaro.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Link: https://lore.kernel.org/r/20220909235024.278281-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-record.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 741e763436caf..f4f1619199e5c 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1888,6 +1888,10 @@ static void record__read_lost_samples(struct record *rec) struct perf_record_lost_samples *lost; struct evsel *evsel; + /* there was an error during record__open */ + if (session->evlist == NULL) + return; + lost = zalloc(PERF_SAMPLE_MAX_SIZE); if (lost == NULL) { pr_debug("Memory allocation failed\n"); @@ -1899,6 +1903,8 @@ static void record__read_lost_samples(struct record *rec) evlist__for_each_entry(session->evlist, evsel) { struct xyarray *xy = evsel->core.sample_id; + if (xy == NULL || evsel->core.fd == NULL) + continue; if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) || xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) { pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n"); -- GitLab From fd941521e81fd24e4ab164f88513612fb5f3af85 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Fri, 23 Sep 2022 10:31:40 -0700 Subject: [PATCH 1256/2223] perf inject: Clarify build-id options a little bit Update the documentation of --build-id and --buildid-all options to clarify the difference between them. 
The former requires full sample processing to find which DSOs are actually used. While the latter simply injects every DSO's build-id from MMAP{,2} records, skipping SAMPLEs. Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Link: https://lore.kernel.org/r/20220923173142.805896-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/Documentation/perf-inject.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt index 70e2ac3cc91ab..c972032f4ca0d 100644 --- a/tools/perf/Documentation/perf-inject.txt +++ b/tools/perf/Documentation/perf-inject.txt @@ -25,10 +25,12 @@ OPTIONS ------- -b:: --build-ids:: - Inject build-ids into the output stream + Inject build-ids of DSOs hit by samples into the output stream. + This means it needs to process all SAMPLE records to find the DSOs. --buildid-all:: - Inject build-ids of all DSOs into the output stream + Inject build-ids of all DSOs into the output stream regardless of hits + and skip SAMPLE processing. --known-build-ids=:: Override build-ids to inject using these comma-separated pairs of -- GitLab From 762461f1a53b268e44fbd941d3734f4553a6e925 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Fri, 23 Sep 2022 10:31:41 -0700 Subject: [PATCH 1257/2223] perf tools: Add 'addr' sort key Sometimes users want to see actual (virtual) address of sampled instructions. Add a new 'addr' sort key to display the raw addresses. $ perf record -o- true | perf report -i- -s addr # To display the perf.data header info, please use --header/--header-only options. 
# [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.000 MB - ] # # Total Lost Samples: 0 # # Samples: 12 of event 'cycles:u' # Event count (approx.): 252512 # # Overhead Address # ........ .................. # 42.96% 0x7f96f08443d7 29.55% 0x7f96f0859b50 14.76% 0x7f96f0852e02 8.30% 0x7f96f0855028 4.43% 0xffffffff8de01087 Note that it just compares and displays the sample ip. Each process can have a different memory layout and the ip will be different even if they run the same binary. So this sort key is mostly meaningful for per-process profile data. Signed-off-by: Namhyung Kim <namhyung@kernel.org> Acked-by: Ian Rogers <irogers@google.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Link: https://lore.kernel.org/r/20220923173142.805896-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/Documentation/perf-report.txt | 3 +- tools/perf/util/hist.c | 1 + tools/perf/util/hist.h | 1 + tools/perf/util/sort.c | 38 ++++++++++++++++++++++++ tools/perf/util/sort.h | 1 + 5 files changed, 43 insertions(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 24efc0583c939..4533db2ee56bb 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -73,7 +73,7 @@ OPTIONS Sort histogram entries by given key(s) - multiple keys can be specified in CSV format. Following sort keys are available: pid, comm, dso, symbol, parent, cpu, socket, srcline, weight, - local_weight, cgroup_id. + local_weight, cgroup_id, addr. Each key has following meaning: @@ -114,6 +114,7 @@ OPTIONS - local_ins_lat: Local instruction latency version - p_stage_cyc: On powerpc, this presents the number of cycles spent in a pipeline stage. And currently supported only on powerpc. 
+ - addr: (Full) virtual address of the sampled instruction By default, comm, dso and symbol keys are used. (i.e. --sort comm,dso,symbol) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 06f5dbf213ad1..17a05e943b44b 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -215,6 +215,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) hists__new_col_len(hists, HISTC_GLOBAL_INS_LAT, 13); hists__new_col_len(hists, HISTC_LOCAL_P_STAGE_CYC, 13); hists__new_col_len(hists, HISTC_GLOBAL_P_STAGE_CYC, 13); + hists__new_col_len(hists, HISTC_ADDR, BITS_PER_LONG / 4 + 2); if (symbol_conf.nanosecs) hists__new_col_len(hists, HISTC_TIME, 16); diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index c7a7a3fa0b879..ebd8a8f783ee6 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -79,6 +79,7 @@ enum hist_column { HISTC_GLOBAL_P_STAGE_CYC, HISTC_ADDR_FROM, HISTC_ADDR_TO, + HISTC_ADDR, HISTC_NR_COLS, /* Last entry */ }; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 6d5588e80935a..2e7330867e2ef 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1948,6 +1948,43 @@ struct sort_entry sort_dso_size = { .se_width_idx = HISTC_DSO_SIZE, }; +/* --sort dso_size */ + +static int64_t +sort__addr_cmp(struct hist_entry *left, struct hist_entry *right) +{ + u64 left_ip = left->ip; + u64 right_ip = right->ip; + struct map *left_map = left->ms.map; + struct map *right_map = right->ms.map; + + if (left_map) + left_ip = left_map->unmap_ip(left_map, left_ip); + if (right_map) + right_ip = right_map->unmap_ip(right_map, right_ip); + + return _sort__addr_cmp(left_ip, right_ip); +} + +static int hist_entry__addr_snprintf(struct hist_entry *he, char *bf, + size_t size, unsigned int width) +{ + u64 ip = he->ip; + struct map *map = he->ms.map; + + if (map) + ip = map->unmap_ip(map, ip); + + return repsep_snprintf(bf, size, "%-#*llx", width, ip); +} + +struct sort_entry sort_addr = { + .se_header 
= "Address", + .se_cmp = sort__addr_cmp, + .se_snprintf = hist_entry__addr_snprintf, + .se_width_idx = HISTC_ADDR, +}; + struct sort_dimension { const char *name; @@ -1997,6 +2034,7 @@ static struct sort_dimension common_sort_dimensions[] = { DIM(SORT_GLOBAL_INS_LAT, "ins_lat", sort_global_ins_lat), DIM(SORT_LOCAL_PIPELINE_STAGE_CYC, "local_p_stage_cyc", sort_local_p_stage_cyc), DIM(SORT_GLOBAL_PIPELINE_STAGE_CYC, "p_stage_cyc", sort_global_p_stage_cyc), + DIM(SORT_ADDR, "addr", sort_addr), }; #undef DIM diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index af14eb46c2b65..04ff8b61a2a7c 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -236,6 +236,7 @@ enum sort_type { SORT_GLOBAL_INS_LAT, SORT_LOCAL_PIPELINE_STAGE_CYC, SORT_GLOBAL_PIPELINE_STAGE_CYC, + SORT_ADDR, /* branch stack specific sort keys */ __SORT_BRANCH_STACK, -- GitLab From 7d18a824b5e57ddd1261e0116c9d7d81183eca85 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Fri, 23 Sep 2022 10:31:42 -0700 Subject: [PATCH 1258/2223] perf annotate: Toggle full address <-> offset display Handle 'f' key to toggle the display offset and full address. Obviously it only works when users set to see disassembler output ('o' key). It'd be useful when users want to see the full virtual address in the TUI annotate browser. 
Signed-off-by: Namhyung Kim <namhyung@kernel.org> Acked-by: Ian Rogers <irogers@google.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Link: https://lore.kernel.org/r/20220923173142.805896-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/ui/browsers/annotate.c | 6 +++++- tools/perf/util/annotate.c | 19 ++++++++++++++++++- tools/perf/util/annotate.h | 4 +++- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 9bc1076374ffd..725662e21b23e 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -805,7 +805,8 @@ static int annotate_browser__run(struct annotate_browser *browser, "r Run available scripts\n" "p Toggle percent type [local/global]\n" "b Toggle percent base [period/hits]\n" - "? Search string backwards\n"); + "? 
Search string backwards\n" + "f Toggle showing offsets to full address\n"); continue; case 'r': script_browse(NULL, NULL); @@ -912,6 +913,9 @@ show_sup_ins: hists__scnprintf_title(hists, title, sizeof(title)); annotate_browser__show(&browser->b, title, help); continue; + case 'f': + annotation__toggle_full_addr(notes, ms); + continue; case K_LEFT: case K_ESC: case 'q': diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 5bc63c9e0324d..db475e44f42fa 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2239,7 +2239,10 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, } args.ms = *ms; - notes->start = map__rip_2objdump(ms->map, sym->start); + if (notes->options && notes->options->full_addr) + notes->start = map__objdump_2mem(ms->map, ms->sym->start); + else + notes->start = map__rip_2objdump(ms->map, ms->sym->start); return symbol__disassemble(sym, &args); } @@ -2762,6 +2765,8 @@ void annotation__update_column_widths(struct annotation *notes) { if (notes->options->use_offset) notes->widths.target = notes->widths.min_addr; + else if (notes->options->full_addr) + notes->widths.target = BITS_PER_LONG / 4; else notes->widths.target = notes->widths.max_addr; @@ -2771,6 +2776,18 @@ void annotation__update_column_widths(struct annotation *notes) notes->widths.addr += notes->widths.jumps + 1; } +void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *ms) +{ + notes->options->full_addr = !notes->options->full_addr; + + if (notes->options->full_addr) + notes->start = map__objdump_2mem(ms->map, ms->sym->start); + else + notes->start = map__rip_2objdump(ms->map, ms->sym->start); + + annotation__update_column_widths(notes); +} + static void annotation__calc_lines(struct annotation *notes, struct map *map, struct rb_root *root, struct annotation_options *opts) diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 3cbd883e4d7ac..8934072c39e6b 100644 --- 
a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -88,7 +88,8 @@ struct annotation_options { show_nr_jumps, show_minmax_cycle, show_asm_raw, - annotate_src; + annotate_src, + full_addr; u8 offset_level; int min_pcnt; int max_lines; @@ -325,6 +326,7 @@ void annotation__compute_ipc(struct annotation *notes, size_t size); void annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym); void annotation__update_column_widths(struct annotation *notes); void annotation__init_column_widths(struct annotation *notes, struct symbol *sym); +void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *ms); static inline struct sym_hist *annotated_source__histogram(struct annotated_source *src, int idx) { -- GitLab From 4627a000dced43ae9e81a9c174e75773794ce905 Mon Sep 17 00:00:00 2001 From: Athira Rajeev <atrajeev@linux.vnet.ibm.com> Date: Fri, 16 Sep 2022 16:19:04 +0530 Subject: [PATCH 1259/2223] perf tests: Fix 'perf probe' error log check in skip_if_no_debuginfo The perf probe related tests like probe_vfs_getname.sh which is in "tools/perf/tests/shell" directory have dependency on debuginfo information in the kernel. Currently debuginfo check is handled by skip_if_no_debuginfo function in the file "lib/probe_vfs_getname.sh". skip_if_no_debuginfo function looks for this specific error log from perf probe to skip the testcase: <<>> Failed to find the path for the kernel|Debuginfo-analysis is not supported <>> But in some cases, like this one on powerpc, the error log observed while running this test is: <<>> The /lib/modules/<version>/build/vmlinux file has no debug information. Rebuild with CONFIG_DEBUG_INFO=y, or install an appropriate debuginfo package. Error: Failed to add events. <<>> Update the skip_if_no_debuginfo function to include the above error, to skip the test in these scenarios too.
Reported-by: Disha Goel <disgoel@linux.vnet.ibm.com> Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com> Tested-by: Disha Goel <disgoel@linux.vnet.ibm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Madhavan Srinivasan <maddy@linux.vnet.ibm.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Nageswara R Sastry <rnsastry@linux.ibm.com> Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20220916104904.99798-1-atrajeev@linux.vnet.ibm.com Reviewed-By: Kajol Jain <kjain@linux.ibm.com> Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/lib/probe_vfs_getname.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/lib/probe_vfs_getname.sh b/tools/perf/tests/shell/lib/probe_vfs_getname.sh index 5b17d916c5558..b616d42bd19d4 100644 --- a/tools/perf/tests/shell/lib/probe_vfs_getname.sh +++ b/tools/perf/tests/shell/lib/probe_vfs_getname.sh @@ -19,6 +19,6 @@ add_probe_vfs_getname() { } skip_if_no_debuginfo() { - add_probe_vfs_getname -v 2>&1 | egrep -q "^(Failed to find the path for the kernel|Debuginfo-analysis is not supported)" && return 2 + add_probe_vfs_getname -v 2>&1 | egrep -q "^(Failed to find the path for the kernel|Debuginfo-analysis is not supported)|(file has no debug information)" && return 2 return 1 } -- GitLab From 19af23df66b412106ce90f2e2258fefe6a256acd Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 12 Sep 2022 11:34:02 +0300 Subject: [PATCH 1260/2223] perf test: test_intel_pt.sh: Add cleanup function Add a cleanup function that will still clean up if the script is terminated prematurely.
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220912083412.7058-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_intel_pt.sh | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index a3298643884d9..17338e6a6f990 100755 --- a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -14,6 +14,21 @@ err_cnt=0 tmpfile=`mktemp` perfdatafile=`mktemp` +cleanup() +{ + trap - EXIT TERM INT + rm -f ${tmpfile} + rm -f ${perfdatafile} +} + +trap_cleanup() +{ + cleanup + exit 1 +} + +trap trap_cleanup EXIT TERM INT + can_cpu_wide() { perf record -o ${tmpfile} -B -N --no-bpf-event -e dummy:u -C $1 true 2>&1 >/dev/null || return 2 @@ -57,8 +72,7 @@ test_system_wide_side_band count_result $? -rm -f ${tmpfile} -rm -f ${perfdatafile} +cleanup if [ ${err_cnt} -gt 0 ] ; then exit 1 -- GitLab From 170ac70f16e7993449ae20a5c5f23d965e3e171d Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 12 Sep 2022 11:34:03 +0300 Subject: [PATCH 1261/2223] perf test: test_intel_pt.sh: Use a temp directory Create a directory for temporary files so that mktemp needs to be used only once. It also enables more temp files to be added without having to add them also to the cleanup. 
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220912083412.7058-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_intel_pt.sh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index 17338e6a6f990..872ee0d89d38d 100755 --- a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -11,14 +11,20 @@ skip_cnt=0 ok_cnt=0 err_cnt=0 -tmpfile=`mktemp` -perfdatafile=`mktemp` +temp_dir=$(mktemp -d /tmp/perf-test-intel-pt-sh.XXXXXXXXXX) + +tmpfile="${temp_dir}/tmp-perf.data" +perfdatafile="${temp_dir}/test-perf.data" cleanup() { trap - EXIT TERM INT - rm -f ${tmpfile} - rm -f ${perfdatafile} + sane=$(echo "${temp_dir}" | cut -b 1-26) + if [ "${sane}" = "/tmp/perf-test-intel-pt-sh" ] ; then + echo "--- Cleaning up ---" + rm -f "${temp_dir}/"* + rmdir "${temp_dir}" + fi } trap_cleanup() -- GitLab From 3f79fff8bd561f22678e7008e0910ffdbc9891ea Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 12 Sep 2022 11:34:04 +0300 Subject: [PATCH 1262/2223] perf test: test_intel_pt.sh: Fix redirection As reported by shellcheck, 2>&1 must come after >/dev/null Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220912083412.7058-4-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_intel_pt.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index 872ee0d89d38d..6e40ee7261da4 100755 --- 
a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -37,7 +37,7 @@ trap trap_cleanup EXIT TERM INT can_cpu_wide() { - perf record -o ${tmpfile} -B -N --no-bpf-event -e dummy:u -C $1 true 2>&1 >/dev/null || return 2 + perf record -o ${tmpfile} -B -N --no-bpf-event -e dummy:u -C $1 true >/dev/null 2>&1 || return 2 return 0 } -- GitLab From 202d039413818b0cf421d98b6a6068fdd2ec8d08 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 12 Sep 2022 11:34:05 +0300 Subject: [PATCH 1263/2223] perf test: test_intel_pt.sh: Stop using expr As suggested by shellcheck, stop using expr. Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220912083412.7058-5-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_intel_pt.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index 6e40ee7261da4..2be8cb03a620e 100755 --- a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -64,14 +64,14 @@ test_system_wide_side_band() count_result() { if [ $1 -eq 2 ] ; then - skip_cnt=`expr ${skip_cnt} \+ 1` + skip_cnt=$((skip_cnt + 1)) return fi if [ $1 -eq 0 ] ; then - ok_cnt=`expr ${ok_cnt} \+ 1` + ok_cnt=$((ok_cnt + 1)) return fi - err_cnt=`expr ${err_cnt} \+ 1` + err_cnt=$((err_cnt + 1)) } test_system_wide_side_band -- GitLab From 1aaff2bac6cdb372ca83f3da6e1f4af6c04eefcd Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 12 Sep 2022 11:34:06 +0300 Subject: [PATCH 1264/2223] perf test: test_intel_pt.sh: Stop using backticks As suggested by shellcheck, stop using backticks. 
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220912083412.7058-6-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_intel_pt.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index 2be8cb03a620e..0273332b99e9d 100755 --- a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -51,7 +51,7 @@ test_system_wide_side_band() perf record -B -N --no-bpf-event -o ${perfdatafile} -e intel_pt//u -C 0 -- taskset --cpu-list 1 uname # Should get MMAP events from CPU 1 because they can be needed to decode - mmap_cnt=`perf script -i ${perfdatafile} --no-itrace --show-mmap-events -C 1 2>/dev/null | grep MMAP | wc -l` + mmap_cnt=$(perf script -i ${perfdatafile} --no-itrace --show-mmap-events -C 1 2>/dev/null | grep MMAP | wc -l) if [ ${mmap_cnt} -gt 0 ] ; then return 0 -- GitLab From 711949e2f0bac0c8894cf84360354344be55c057 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 12 Sep 2022 11:34:07 +0300 Subject: [PATCH 1265/2223] perf test: test_intel_pt.sh: Use grep -c instead of grep plus wc -l As suggested by shellcheck, use grep -c instead of grep plus wc -l Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220912083412.7058-7-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_intel_pt.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index 0273332b99e9d..3dfdef4fa6f4e 100755 --- 
a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -51,7 +51,7 @@ test_system_wide_side_band() perf record -B -N --no-bpf-event -o ${perfdatafile} -e intel_pt//u -C 0 -- taskset --cpu-list 1 uname # Should get MMAP events from CPU 1 because they can be needed to decode - mmap_cnt=$(perf script -i ${perfdatafile} --no-itrace --show-mmap-events -C 1 2>/dev/null | grep MMAP | wc -l) + mmap_cnt=$(perf script -i ${perfdatafile} --no-itrace --show-mmap-events -C 1 2>/dev/null | grep -c MMAP) if [ ${mmap_cnt} -gt 0 ] ; then return 0 -- GitLab From 5d7aac2bf87ab6b9f759c107b44bf8a0326c4c19 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 12 Sep 2022 11:34:08 +0300 Subject: [PATCH 1266/2223] perf test: test_intel_pt.sh: Use quotes around variable expansion As suggested by shellcheck, use quotes around variable expansion. Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220912083412.7058-8-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_intel_pt.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index 3dfdef4fa6f4e..075b780fe9ed7 100755 --- a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -37,7 +37,7 @@ trap trap_cleanup EXIT TERM INT can_cpu_wide() { - perf record -o ${tmpfile} -B -N --no-bpf-event -e dummy:u -C $1 true >/dev/null 2>&1 || return 2 + perf record -o "${tmpfile}" -B -N --no-bpf-event -e dummy:u -C "$1" true >/dev/null 2>&1 || return 2 return 0 } @@ -48,12 +48,12 @@ test_system_wide_side_band() can_cpu_wide 1 || return $? 
# Record on CPU 0 a task running on CPU 1 - perf record -B -N --no-bpf-event -o ${perfdatafile} -e intel_pt//u -C 0 -- taskset --cpu-list 1 uname + perf record -B -N --no-bpf-event -o "${perfdatafile}" -e intel_pt//u -C 0 -- taskset --cpu-list 1 uname # Should get MMAP events from CPU 1 because they can be needed to decode - mmap_cnt=$(perf script -i ${perfdatafile} --no-itrace --show-mmap-events -C 1 2>/dev/null | grep -c MMAP) + mmap_cnt=$(perf script -i "${perfdatafile}" --no-itrace --show-mmap-events -C 1 2>/dev/null | grep -c MMAP) - if [ ${mmap_cnt} -gt 0 ] ; then + if [ "${mmap_cnt}" -gt 0 ] ; then return 0 fi @@ -63,11 +63,11 @@ test_system_wide_side_band() count_result() { - if [ $1 -eq 2 ] ; then + if [ "$1" -eq 2 ] ; then skip_cnt=$((skip_cnt + 1)) return fi - if [ $1 -eq 0 ] ; then + if [ "$1" -eq 0 ] ; then ok_cnt=$((ok_cnt + 1)) return fi -- GitLab From fd9b45e39cfaf885a8767bcb7631868155a2f4d6 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 12 Sep 2022 11:34:09 +0300 Subject: [PATCH 1267/2223] perf test: test_intel_pt.sh: Fix return checking The use of set -e will cause a function that returns non-zero to terminate the script unless the result is consumed by || for example. That is OK if there is only 1 test function, but not if there are more. Prepare for more by using ||. 
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220912083412.7058-9-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_intel_pt.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index 075b780fe9ed7..7d2f3136ce19a 100755 --- a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -72,11 +72,11 @@ count_result() return fi err_cnt=$((err_cnt + 1)) + ret=0 } -test_system_wide_side_band - -count_result $? +ret=0 +test_system_wide_side_band || ret=$? ; count_result $ret cleanup -- GitLab From 2c1c9e351a43878043684be92615d7002c8ea0c6 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 12 Sep 2022 11:34:10 +0300 Subject: [PATCH 1268/2223] perf test: test_intel_pt.sh: Add more output in preparation for more tests When there are more tests it won't be obvious which test failed. Add more output so that it is. 
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220912083412.7058-10-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_intel_pt.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index 7d2f3136ce19a..2d489de9097b7 100755 --- a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -37,12 +37,19 @@ trap trap_cleanup EXIT TERM INT can_cpu_wide() { - perf record -o "${tmpfile}" -B -N --no-bpf-event -e dummy:u -C "$1" true >/dev/null 2>&1 || return 2 + echo "Checking for CPU-wide recording on CPU $1" + if ! perf record -o "${tmpfile}" -B -N --no-bpf-event -e dummy:u -C "$1" true >/dev/null 2>&1 ; then + echo "No so skipping" + return 2 + fi + echo OK return 0 } test_system_wide_side_band() { + echo "--- Test system-wide sideband ---" + # Need CPU 0 and CPU 1 can_cpu_wide 0 || return $? can_cpu_wide 1 || return $? @@ -54,6 +61,7 @@ test_system_wide_side_band() mmap_cnt=$(perf script -i "${perfdatafile}" --no-itrace --show-mmap-events -C 1 2>/dev/null | grep -c MMAP) if [ "${mmap_cnt}" -gt 0 ] ; then + echo OK return 0 fi @@ -80,6 +88,8 @@ test_system_wide_side_band || ret=$? ; count_result $ret cleanup +echo "--- Done ---" + if [ ${err_cnt} -gt 0 ] ; then exit 1 fi -- GitLab From da4062021e0e6da52d4919b6d77dbd77fa847f97 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 12 Sep 2022 11:34:11 +0300 Subject: [PATCH 1269/2223] perf tools: Add debug messages and comments for testing Add debug messages to enable scripts to track aspects of 'perf record' behaviour. 
The messages will be consumed after 'perf record' has run, with the exception of "perf record has started" which is consequently flushed. Put comments so developers know which messages are also being used by test scripts. Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220912083412.7058-11-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/lib/perf/evlist.c | 2 ++ tools/perf/builtin-record.c | 8 ++++++++ tools/perf/util/evsel.c | 2 ++ 3 files changed, 12 insertions(+) diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c index 8ec5b9f344e02..0e7347d1583dc 100644 --- a/tools/lib/perf/evlist.c +++ b/tools/lib/perf/evlist.c @@ -487,6 +487,7 @@ mmap_per_evsel(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops, if (ops->idx) ops->idx(evlist, evsel, mp, idx); + /* Debug message used by test scripts */ pr_debug("idx %d: mmapping fd %d\n", idx, *output); if (ops->mmap(map, mp, *output, evlist_cpu) < 0) return -1; @@ -496,6 +497,7 @@ mmap_per_evsel(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops, if (!idx) perf_evlist__set_mmap_first(evlist, map, overwrite); } else { + /* Debug message used by test scripts */ pr_debug("idx %d: set output fd %d -> %d\n", idx, fd, *output); if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, *output) != 0) return -1; diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index f4f1619199e5c..52d254b1530c9 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -2434,10 +2434,14 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) record__uniquify_name(rec); + /* Debug message used by test scripts */ + pr_debug3("perf record opening and mmapping events\n"); if (record__open(rec) != 0) { err = -1; goto out_free_threads; } + /* Debug message used by test scripts */ + 
pr_debug3("perf record done opening and mmapping events\n"); session->header.env.comp_mmap_len = session->evlist->core.mmap_len; if (rec->opts.kcore) { @@ -2580,6 +2584,10 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) if (err) goto out_child; + /* Debug message used by test scripts */ + pr_debug3("perf record has started\n"); + fflush(stderr); + trigger_ready(&auxtrace_snapshot_trigger); trigger_ready(&switch_output_trigger); perf_hooks__invoke_record_start(); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 5776bfa70f11e..a27092339b81a 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2089,6 +2089,7 @@ retry_open: test_attr__ready(); + /* Debug message used by test scripts */ pr_debug2_peo("sys_perf_event_open: pid %d cpu %d group_fd %d flags %#lx", pid, perf_cpu_map__cpu(cpus, idx).cpu, group_fd, evsel->open_flags); @@ -2114,6 +2115,7 @@ retry_open: fd, group_fd, evsel->open_flags); } + /* Debug message used by test scripts */ pr_debug2_peo(" = %d\n", fd); if (evsel->bpf_fd >= 0) { -- GitLab From fea753f8e3c88c056806792c4d9de719939e0ef0 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Mon, 12 Sep 2022 11:34:12 +0300 Subject: [PATCH 1270/2223] perf test: test_intel_pt.sh: Add per-thread test When tracing the kernel with Intel PT, text_poke events are recorded per-cpu. In per-thread mode that results in a mixture of per-thread and per-cpu events and mmaps. Check that happens correctly. The debug output from perf record -vvv is recorded and then awk used to process the debug messages that indicate what file descriptors were opened and whether they were mmapped or set-output. 
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: http://lore.kernel.org/lkml/20220912083412.7058-12-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_intel_pt.sh | 247 ++++++++++++++++++++++++ 1 file changed, 247 insertions(+) diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index 2d489de9097b7..051d088c1b74f 100755 --- a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -4,6 +4,8 @@ set -e +tenths=date\ +%s%1N + # Skip if no Intel PT perf list | grep -q 'intel_pt//' || exit 2 @@ -15,6 +17,10 @@ temp_dir=$(mktemp -d /tmp/perf-test-intel-pt-sh.XXXXXXXXXX) tmpfile="${temp_dir}/tmp-perf.data" perfdatafile="${temp_dir}/test-perf.data" +outfile="${temp_dir}/test-out.txt" +errfile="${temp_dir}/test-err.txt" +workload="${temp_dir}/workload" +awkscript="${temp_dir}/awkscript" cleanup() { @@ -35,6 +41,37 @@ trap_cleanup() trap trap_cleanup EXIT TERM INT +have_workload=false +cat << _end_of_file_ | /usr/bin/cc -o "${workload}" -xc - -pthread && have_workload=true +#include <time.h> +#include <pthread.h> + +void work(void) { + struct timespec tm = { + .tv_nsec = 1000000, + }; + int i; + + /* Run for about 30 seconds */ + for (i = 0; i < 30000; i++) + nanosleep(&tm, NULL); +} + +void *threadfunc(void *arg) { + work(); + return NULL; +} + +int main(void) { + pthread_t th; + + pthread_create(&th, NULL, threadfunc, NULL); + work(); + pthread_join(th, NULL); + return 0; +} +_end_of_file_ + can_cpu_wide() { echo "Checking for CPU-wide recording on CPU $1" @@ -69,6 +106,214 @@ test_system_wide_side_band() return 1 } +can_kernel() +{ + perf record -o "${tmpfile}" -B -N --no-bpf-event -e dummy:k true >/dev/null 2>&1 || return 2 + return 0 +} + +wait_for_threads() +{ + start_time=$($tenths) + while [ -e "/proc/$1/task" ] 
; do + th_cnt=$(find "/proc/$1/task" -mindepth 1 -maxdepth 1 -printf x | wc -c) + if [ "${th_cnt}" -ge "$2" ] ; then + return 0 + fi + # Wait at most 5 seconds + if [ $(($($tenths) - start_time)) -ge 50 ] ; then + echo "PID $1 does not have $2 threads" + return 1 + fi + done + return 1 +} + +wait_for_perf_to_start() +{ + echo "Waiting for \"perf record has started\" message" + start_time=$($tenths) + while [ -e "/proc/$1" ] ; do + if grep -q "perf record has started" "${errfile}" ; then + echo OK + break + fi + # Wait at most 5 seconds + if [ $(($($tenths) - start_time)) -ge 50 ] ; then + echo "perf recording did not start" + return 1 + fi + done + return 0 +} + +wait_for_process_to_exit() +{ + start_time=$($tenths) + while [ -e "/proc/$1" ] ; do + # Wait at most 5 seconds + if [ $(($($tenths) - start_time)) -ge 50 ] ; then + echo "PID $1 did not exit as expected" + return 1 + fi + done + return 0 +} + +is_running() +{ + start_time=$($tenths) + while [ -e "/proc/$1" ] ; do + # Check for at least 0.3s + if [ $(($($tenths) - start_time)) -gt 3 ] ; then + return 0 + fi + done + echo "PID $1 exited prematurely" + return 1 +} + +test_per_thread() +{ + k="$1" + desc="$2" + + echo "--- Test per-thread ${desc}recording ---" + + if ! 
$have_workload ; then + echo "No workload, so skipping" + return 2 + fi + + if [ "${k}" = "k" ] ; then + can_kernel || return 2 + fi + + cat <<- "_end_of_file_" > "${awkscript}" + BEGIN { + s = "[ ]*" + u = s"[0-9]+"s + d = s"[0-9-]+"s + x = s"[0-9a-fA-FxX]+"s + mmapping = "idx"u": mmapping fd"u + set_output = "idx"u": set output fd"u"->"u + perf_event_open = "sys_perf_event_open: pid"d"cpu"d"group_fd"d"flags"x"="u + } + + /perf record opening and mmapping events/ { + if (!done) + active = 1 + } + + /perf record done opening and mmapping events/ { + active = 0 + done = 1 + } + + $0 ~ perf_event_open && active { + match($0, perf_event_open) + $0 = substr($0, RSTART, RLENGTH) + pid = $3 + cpu = $5 + fd = $11 + print "pid " pid " cpu " cpu " fd " fd " : " $0 + fd_array[fd] = fd + pid_array[fd] = pid + cpu_array[fd] = cpu + } + + $0 ~ mmapping && active { + match($0, mmapping) + $0 = substr($0, RSTART, RLENGTH) + fd = $5 + print "fd " fd " : " $0 + if (fd in fd_array) { + mmap_array[fd] = 1 + } else { + print "Unknown fd " fd + exit 1 + } + } + + $0 ~ set_output && active { + match($0, set_output) + $0 = substr($0, RSTART, RLENGTH) + fd = $6 + fd_to = $8 + print "fd " fd " fd_to " fd_to " : " $0 + if (fd in fd_array) { + if (fd_to in fd_array) { + set_output_array[fd] = fd_to + } else { + print "Unknown fd " fd_to + exit 1 + } + } else { + print "Unknown fd " fd + exit 1 + } + } + + END { + print "Checking " length(fd_array) " fds" + for (fd in fd_array) { + if (fd in mmap_array) { + pid = pid_array[fd] + if (pid != -1) { + if (pid in pids) { + print "More than 1 mmap for PID " pid + exit 1 + } + pids[pid] = 1 + } + cpu = cpu_array[fd] + if (cpu != -1) { + if (cpu in cpus) { + print "More than 1 mmap for CPU " cpu + exit 1 + } + cpus[cpu] = 1 + } + } else if (!(fd in set_output_array)) { + print "No mmap for fd " fd + exit 1 + } + } + n = length(pids) + if (n != thread_cnt) { + print "Expected " thread_cnt " per-thread mmaps - found " n + exit 1 + } + } + _end_of_file_ 
+ + $workload & + w1=$! + $workload & + w2=$! + echo "Workload PIDs are $w1 and $w2" + wait_for_threads ${w1} 2 + wait_for_threads ${w2} 2 + + perf record -B -N --no-bpf-event -o "${perfdatafile}" -e intel_pt//u"${k}" -vvv --per-thread -p "${w1},${w2}" 2>"${errfile}" >"${outfile}" & + ppid=$! + echo "perf PID is $ppid" + wait_for_perf_to_start ${ppid} || return 1 + + kill ${w1} + wait_for_process_to_exit ${w1} || return 1 + is_running ${ppid} || return 1 + + kill ${w2} + wait_for_process_to_exit ${w2} || return 1 + wait_for_process_to_exit ${ppid} || return 1 + + awk -v thread_cnt=4 -f "${awkscript}" "${errfile}" || return 1 + + echo OK + return 0 +} + count_result() { if [ "$1" -eq 2 ] ; then @@ -85,6 +330,8 @@ count_result() ret=0 test_system_wide_side_band || ret=$? ; count_result $ret +test_per_thread "" "" || ret=$? ; count_result $ret +test_per_thread "k" "(incl. kernel) " || ret=$? ; count_result $ret cleanup -- GitLab From 5ebcdf07f7e4cdfdcfb3589f6bd3f81c3c061164 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Wed, 14 Sep 2022 11:01:49 +0300 Subject: [PATCH 1271/2223] perf test: test_intel_pt.sh: Move helper functions for waiting Move helper functions for waiting to a separate file so they can be shared. 
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220914080150.5888-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/lib/waiting.sh | 69 +++++++++++++++++++++++++ tools/perf/tests/shell/test_intel_pt.sh | 68 ++---------------------- 2 files changed, 73 insertions(+), 64 deletions(-) create mode 100644 tools/perf/tests/shell/lib/waiting.sh diff --git a/tools/perf/tests/shell/lib/waiting.sh b/tools/perf/tests/shell/lib/waiting.sh new file mode 100644 index 0000000000000..dbd5bd90105e5 --- /dev/null +++ b/tools/perf/tests/shell/lib/waiting.sh @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: GPL-2.0 + +tenths=date\ +%s%1N + +# Wait for PID $1 to have $2 number of threads started +wait_for_threads() +{ + start_time=$($tenths) + while [ -e "/proc/$1/task" ] ; do + th_cnt=$(find "/proc/$1/task" -mindepth 1 -maxdepth 1 -printf x | wc -c) + if [ "${th_cnt}" -ge "$2" ] ; then + return 0 + fi + # Wait at most 5 seconds + if [ $(($($tenths) - start_time)) -ge 50 ] ; then + echo "PID $1 does not have $2 threads" + return 1 + fi + done + return 1 +} + +# Wait for perf record -vvv 2>$2 with PID $1 to start by looking at file $2 +# It depends on capturing perf record debug message "perf record has started" +wait_for_perf_to_start() +{ + echo "Waiting for \"perf record has started\" message" + start_time=$($tenths) + while [ -e "/proc/$1" ] ; do + if grep -q "perf record has started" "$2" ; then + echo OK + break + fi + # Wait at most 5 seconds + if [ $(($($tenths) - start_time)) -ge 50 ] ; then + echo "perf recording did not start" + return 1 + fi + done + return 0 +} + +# Wait for process PID %1 to exit +wait_for_process_to_exit() +{ + start_time=$($tenths) + while [ -e "/proc/$1" ] ; do + # Wait at most 5 seconds + if [ $(($($tenths) - start_time)) -ge 50 ] ; then + echo 
"PID $1 did not exit as expected" + return 1 + fi + done + return 0 +} + +# Check if PID $1 is still running after 0.3 seconds +is_running() +{ + start_time=$($tenths) + while [ -e "/proc/$1" ] ; do + # Check for at least 0.3s + if [ $(($($tenths) - start_time)) -gt 3 ] ; then + return 0 + fi + done + echo "PID $1 exited prematurely" + return 1 +} diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index 051d088c1b74f..efaad9566c347 100755 --- a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -4,11 +4,12 @@ set -e -tenths=date\ +%s%1N - # Skip if no Intel PT perf list | grep -q 'intel_pt//' || exit 2 +shelldir=$(dirname "$0") +. "${shelldir}"/lib/waiting.sh + skip_cnt=0 ok_cnt=0 err_cnt=0 @@ -112,67 +113,6 @@ can_kernel() return 0 } -wait_for_threads() -{ - start_time=$($tenths) - while [ -e "/proc/$1/task" ] ; do - th_cnt=$(find "/proc/$1/task" -mindepth 1 -maxdepth 1 -printf x | wc -c) - if [ "${th_cnt}" -ge "$2" ] ; then - return 0 - fi - # Wait at most 5 seconds - if [ $(($($tenths) - start_time)) -ge 50 ] ; then - echo "PID $1 does not have $2 threads" - return 1 - fi - done - return 1 -} - -wait_for_perf_to_start() -{ - echo "Waiting for \"perf record has started\" message" - start_time=$($tenths) - while [ -e "/proc/$1" ] ; do - if grep -q "perf record has started" "${errfile}" ; then - echo OK - break - fi - # Wait at most 5 seconds - if [ $(($($tenths) - start_time)) -ge 50 ] ; then - echo "perf recording did not start" - return 1 - fi - done - return 0 -} - -wait_for_process_to_exit() -{ - start_time=$($tenths) - while [ -e "/proc/$1" ] ; do - # Wait at most 5 seconds - if [ $(($($tenths) - start_time)) -ge 50 ] ; then - echo "PID $1 did not exit as expected" - return 1 - fi - done - return 0 -} - -is_running() -{ - start_time=$($tenths) - while [ -e "/proc/$1" ] ; do - # Check for at least 0.3s - if [ $(($($tenths) - start_time)) -gt 3 ] ; then - return 0 - fi - done - 
echo "PID $1 exited prematurely" - return 1 -} - test_per_thread() { k="$1" @@ -298,7 +238,7 @@ test_per_thread() perf record -B -N --no-bpf-event -o "${perfdatafile}" -e intel_pt//u"${k}" -vvv --per-thread -p "${w1},${w2}" 2>"${errfile}" >"${outfile}" & ppid=$! echo "perf PID is $ppid" - wait_for_perf_to_start ${ppid} || return 1 + wait_for_perf_to_start ${ppid} "${errfile}" || return 1 kill ${w1} wait_for_process_to_exit ${w1} || return 1 -- GitLab From 84838712e92eab3dfa37b97100f32490507373b2 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Wed, 14 Sep 2022 11:01:50 +0300 Subject: [PATCH 1272/2223] perf test: waiting.sh: Parameterize timeouts Let helper functions accept a parameter to specify time out values in tenths of a second. Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220914080150.5888-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/lib/waiting.sh | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/tools/perf/tests/shell/lib/waiting.sh b/tools/perf/tests/shell/lib/waiting.sh index dbd5bd90105e5..e7a39134a68e8 100644 --- a/tools/perf/tests/shell/lib/waiting.sh +++ b/tools/perf/tests/shell/lib/waiting.sh @@ -3,16 +3,18 @@ tenths=date\ +%s%1N # Wait for PID $1 to have $2 number of threads started +# Time out after $3 tenths of a second or 5 seconds if $3 is "" wait_for_threads() { + tm_out=$3 ; [ -n "${tm_out}" ] || tm_out=50 start_time=$($tenths) while [ -e "/proc/$1/task" ] ; do th_cnt=$(find "/proc/$1/task" -mindepth 1 -maxdepth 1 -printf x | wc -c) if [ "${th_cnt}" -ge "$2" ] ; then return 0 fi - # Wait at most 5 seconds - if [ $(($($tenths) - start_time)) -ge 50 ] ; then + # Wait at most tm_out tenths of a second + if [ $(($($tenths) - start_time)) -ge $tm_out ] ; 
then echo "PID $1 does not have $2 threads" return 1 fi @@ -22,8 +24,10 @@ wait_for_threads() # Wait for perf record -vvv 2>$2 with PID $1 to start by looking at file $2 # It depends on capturing perf record debug message "perf record has started" +# Time out after $3 tenths of a second or 5 seconds if $3 is "" wait_for_perf_to_start() { + tm_out=$3 ; [ -n "${tm_out}" ] || tm_out=50 echo "Waiting for \"perf record has started\" message" start_time=$($tenths) while [ -e "/proc/$1" ] ; do @@ -31,8 +35,8 @@ wait_for_perf_to_start() echo OK break fi - # Wait at most 5 seconds - if [ $(($($tenths) - start_time)) -ge 50 ] ; then + # Wait at most tm_out tenths of a second + if [ $(($($tenths) - start_time)) -ge $tm_out ] ; then echo "perf recording did not start" return 1 fi @@ -41,12 +45,14 @@ wait_for_perf_to_start() } # Wait for process PID %1 to exit +# Time out after $2 tenths of a second or 5 seconds if $2 is "" wait_for_process_to_exit() { + tm_out=$2 ; [ -n "${tm_out}" ] || tm_out=50 start_time=$($tenths) while [ -e "/proc/$1" ] ; do - # Wait at most 5 seconds - if [ $(($($tenths) - start_time)) -ge 50 ] ; then + # Wait at most tm_out tenths of a second + if [ $(($($tenths) - start_time)) -ge $tm_out ] ; then echo "PID $1 did not exit as expected" return 1 fi @@ -54,13 +60,15 @@ wait_for_process_to_exit() return 0 } -# Check if PID $1 is still running after 0.3 seconds +# Check if PID $1 is still running after $2 tenths of a second +# or 0.3 seconds if $2 is "" is_running() { + tm_out=$2 ; [ -n "${tm_out}" ] || tm_out=3 start_time=$($tenths) while [ -e "/proc/$1" ] ; do - # Check for at least 0.3s - if [ $(($($tenths) - start_time)) -gt 3 ] ; then + # Check for at least tm_out tenths of a second + if [ $(($($tenths) - start_time)) -gt $tm_out ] ; then return 0 fi done -- GitLab From 6282a1f4f846fda21b16065a2ef094c7b71b2771 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Fri, 23 Sep 2022 17:42:19 -0700 Subject: [PATCH 1273/2223] perf lock: 
Add -E/--entries option Like in 'perf top', the -E option can limit number of entries to print. It can be useful when users want to see top N contended locks only. Signed-off-by: Namhyung Kim <namhyung@kernel.org> Acked-by: Ian Rogers <irogers@google.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220924004221.841024-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/Documentation/perf-lock.txt | 10 ++++++++++ tools/perf/builtin-lock.c | 20 +++++++++++++++----- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt index 5f2dc634258e9..b23e76200ac2c 100644 --- a/tools/perf/Documentation/perf-lock.txt +++ b/tools/perf/Documentation/perf-lock.txt @@ -94,6 +94,11 @@ REPORT OPTIONS EventManager_De 1845 1 636 futex-default-S 1609 0 0 +-E:: +--entries=<value>:: + Display this many entries. + + INFO OPTIONS ------------ @@ -105,6 +110,7 @@ INFO OPTIONS --map:: dump map of lock instances (address:name table) + CONTENTION OPTIONS -------------- @@ -154,6 +160,10 @@ CONTENTION OPTIONS --stack-skip Number of stack depth to skip when finding a lock caller (default: 3). +-E:: +--entries=<value>:: + Display this many entries. 
+ SEE ALSO -------- diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 25d75fa09b906..1c0d52384d9e9 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -58,6 +58,7 @@ static bool use_bpf; static unsigned long bpf_map_entries = 10240; static int max_stack_depth = CONTENTION_STACK_DEPTH; static int stack_skip = CONTENTION_STACK_SKIP; +static int print_nr_entries = INT_MAX / 2; static enum { LOCK_AGGR_ADDR, @@ -1266,14 +1267,14 @@ static void print_result(void) struct lock_stat *st; struct lock_key *key; char cut_name[20]; - int bad, total; + int bad, total, printed; pr_info("%20s ", "Name"); list_for_each_entry(key, &lock_keys, list) pr_info("%*s ", key->len, key->header); pr_info("\n\n"); - bad = total = 0; + bad = total = printed = 0; while ((st = pop_from_result())) { total++; if (st->broken) @@ -1311,6 +1312,9 @@ static void print_result(void) pr_info(" "); } pr_info("\n"); + + if (++printed >= print_nr_entries) + break; } print_bad_events(bad, total); @@ -1476,7 +1480,7 @@ static void print_contention_result(struct lock_contention *con) { struct lock_stat *st; struct lock_key *key; - int bad, total; + int bad, total, printed; list_for_each_entry(key, &lock_keys, list) pr_info("%*s ", key->len, key->header); @@ -1486,7 +1490,7 @@ static void print_contention_result(struct lock_contention *con) else pr_info(" %10s %s\n\n", "type", "caller"); - bad = total = 0; + bad = total = printed = 0; if (use_bpf) bad = bad_hist[BROKEN_CONTENDED]; @@ -1507,7 +1511,7 @@ static void print_contention_result(struct lock_contention *con) /* st->addr contains tid of thread */ t = perf_session__findnew(session, pid); pr_info(" %10d %s\n", pid, thread__comm_str(t)); - continue; + goto next; } pr_info(" %10s %s\n", get_type_str(st), st->name); @@ -1527,6 +1531,10 @@ static void print_contention_result(struct lock_contention *con) pr_info("\t\t\t%#lx %s\n", (unsigned long)ip, buf); } } + +next: + if (++printed >= print_nr_entries) + break; } 
print_bad_events(bad, total); @@ -1878,6 +1886,7 @@ int cmd_lock(int argc, const char **argv) "combine locks in the same class"), OPT_BOOLEAN('t', "threads", &show_thread_stats, "show per-thread lock stats"), + OPT_INTEGER('E', "entries", &print_nr_entries, "display this many functions"), OPT_PARENT(lock_options) }; @@ -1905,6 +1914,7 @@ int cmd_lock(int argc, const char **argv) OPT_INTEGER(0, "stack-skip", &stack_skip, "Set the number of stack depth to skip when finding a lock caller, " "Default: " __stringify(CONTENTION_STACK_SKIP)), + OPT_INTEGER('E', "entries", &print_nr_entries, "display this many functions"), OPT_PARENT(lock_options) }; -- GitLab From 6bbc482017deeacf5c9953bafdeb90517e22dc90 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Fri, 23 Sep 2022 17:42:20 -0700 Subject: [PATCH 1274/2223] perf lock: Add -q/--quiet option to suppress header and debug messages Like in 'perf report', this option is to suppress header and debug messages. Signed-off-by: Namhyung Kim <namhyung@kernel.org> Acked-by: Ian Rogers <irogers@google.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220924004221.841024-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/Documentation/perf-lock.txt | 4 ++++ tools/perf/builtin-lock.c | 27 +++++++++++++++----------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt index b23e76200ac2c..3b1e16563b795 100644 --- a/tools/perf/Documentation/perf-lock.txt +++ b/tools/perf/Documentation/perf-lock.txt @@ -40,6 +40,10 @@ COMMON OPTIONS --verbose:: Be more verbose (show symbol address, etc). +-q:: +--quiet:: + Do not show any message. (Suppress -v) + -D:: --dump-raw-trace:: Dump raw trace in ASCII. 
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 1c0d52384d9e9..9722d4ab2e557 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -1250,7 +1250,7 @@ static void print_bad_events(int bad, int total) for (i = 0; i < BROKEN_MAX; i++) broken += bad_hist[i]; - if (broken == 0 && !verbose) + if (quiet || (broken == 0 && !verbose)) return; pr_info("\n=== output for debug===\n\n"); @@ -1269,10 +1269,12 @@ static void print_result(void) char cut_name[20]; int bad, total, printed; - pr_info("%20s ", "Name"); - list_for_each_entry(key, &lock_keys, list) - pr_info("%*s ", key->len, key->header); - pr_info("\n\n"); + if (!quiet) { + pr_info("%20s ", "Name"); + list_for_each_entry(key, &lock_keys, list) + pr_info("%*s ", key->len, key->header); + pr_info("\n\n"); + } bad = total = printed = 0; while ((st = pop_from_result())) { @@ -1482,13 +1484,15 @@ static void print_contention_result(struct lock_contention *con) struct lock_key *key; int bad, total, printed; - list_for_each_entry(key, &lock_keys, list) - pr_info("%*s ", key->len, key->header); + if (!quiet) { + list_for_each_entry(key, &lock_keys, list) + pr_info("%*s ", key->len, key->header); - if (show_thread_stats) - pr_info(" %10s %s\n\n", "pid", "comm"); - else - pr_info(" %10s %s\n\n", "type", "caller"); + if (show_thread_stats) + pr_info(" %10s %s\n\n", "pid", "comm"); + else + pr_info(" %10s %s\n\n", "type", "caller"); + } bad = total = printed = 0; if (use_bpf) @@ -1865,6 +1869,7 @@ int cmd_lock(int argc, const char **argv) "file", "vmlinux pathname"), OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, "file", "kallsyms pathname"), + OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any message"), OPT_END() }; -- GitLab From ec685de25b6718f85380bb4bbaacf23748708ad0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Fri, 23 Sep 2022 17:42:21 -0700 Subject: [PATCH 1275/2223] perf test: Add kernel lock contention test Add a new shell test to check if 
both normal 'perf lock record' + contention and BPF (with -b) option are working. Use 'perf bench sched messaging' as a workload since it creates some contention for sending and receiving messages. Signed-off-by: Namhyung Kim <namhyung@kernel.org> Acked-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220924004221.841024-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/lock_contention.sh | 73 +++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100755 tools/perf/tests/shell/lock_contention.sh diff --git a/tools/perf/tests/shell/lock_contention.sh b/tools/perf/tests/shell/lock_contention.sh new file mode 100755 index 0000000000000..04bf604e3c6f8 --- /dev/null +++ b/tools/perf/tests/shell/lock_contention.sh @@ -0,0 +1,73 @@ +#!/bin/sh +# kernel lock contention analysis test +# SPDX-License-Identifier: GPL-2.0 + +set -e + +err=0 +perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX) +result=$(mktemp /tmp/__perf_test.result.XXXXX) + +cleanup() { + rm -f ${perfdata} + rm -f ${result} + trap - exit term int +} + +trap_cleanup() { + cleanup + exit ${err} +} +trap trap_cleanup exit term int + +check() { + if [ `id -u` != 0 ]; then + echo "[Skip] No root permission" + err=2 + exit + fi + + if ! 
perf list | grep -q lock:contention_begin; then + echo "[Skip] No lock contention tracepoints" + err=2 + exit + fi +} + +test_record() +{ + echo "Testing perf lock record and perf lock contention" + perf lock record -o ${perfdata} -- perf bench sched messaging > /dev/null 2>&1 + # the output goes to the stderr and we expect only 1 output (-E 1) + perf lock contention -i ${perfdata} -E 1 -q 2> ${result} + if [ $(cat "${result}" | wc -l) != "1" ]; then + echo "[Fail] Recorded result count is not 1:" $(cat "${result}" | wc -l) + err=1 + exit + fi +} + +test_bpf() +{ + echo "Testing perf lock contention --use-bpf" + + if ! perf lock con -b true > /dev/null 2>&1 ; then + echo "[Skip] No BPF support" + exit + fi + + # the perf lock contention output goes to the stderr + perf lock con -a -b -E 1 -q -- perf bench sched messaging > /dev/null 2> ${result} + if [ $(cat "${result}" | wc -l) != "1" ]; then + echo "[Fail] BPF result count is not 1:" $(cat "${result}" | wc -l) + err=1 + exit + fi +} + +check + +test_record +test_bpf + +exit ${err} -- GitLab From b71536a4925e630466d2817e65a42f57f0f5b33e Mon Sep 17 00:00:00 2001 From: Chen Zhongjin <chenzhongjin@huawei.com> Date: Mon, 26 Sep 2022 11:14:40 +0800 Subject: [PATCH 1276/2223] perf string: Remove unused macro K() Unused macro reported by [-Wunused-macros]. This macro is introduced to calculate the 'unit' size, in: d2fb8b4151a92223 ("perf tools: Add new perf_atoll() function to parse string representing size in bytes") 8ba7f6c2faada3ad ("saner perf_atoll()") This commit has simplified the perf_atoll() function and remove the 'unit' variable. This macro is not deleted, but nowhere else is using it. A single letter macro is confusing and easy to be misused. So remove it for code cleaning. 
Signed-off-by: Chen Zhongjin <chenzhongjin@huawei.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lore.kernel.org/lkml/20220926031440.28275-6-chenzhongjin@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/string.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c index f6d90cdd92253..4f12a96f33cc4 100644 --- a/tools/perf/util/string.c +++ b/tools/perf/util/string.c @@ -15,7 +15,6 @@ const char *dots = "....................................................................." "....................................................................."; -#define K 1024LL /* * perf_atoll() * Parse (\d+)(b|B|kb|KB|mb|MB|gb|GB|tb|TB) (e.g. "256MB") -- GitLab From 888964a05d13f014d21deeb7414904c82afcd82b Mon Sep 17 00:00:00 2001 From: Chen Zhongjin <chenzhongjin@huawei.com> Date: Mon, 26 Sep 2022 11:14:36 +0800 Subject: [PATCH 1277/2223] perf trace: Fix show_arg_names not working for tp arg names trace__fprintf_tp_fields() will always print arg names because when implemented it is forced to print arg_names with: (1 || trace->show_arg_names) So the printing looks like: > cat ~/.perfconfig [trace] show_arg_names = no > perf trace -e syscalls:*mmap sleep 1 0.000 sleep/1119 syscalls:sys_enter_mmap(NULL, 8192, READ|WRITE, PRIVATE|ANONYMOUS) 0.179 sleep/1119 syscalls:sys_exit_mmap(__syscall_nr: 9, ret: 140535426170880) ... Although the comment said that perhaps we need a show_tp_arg_names. 
I don't think it's necessary to control them separately because it's not so clean that part of the log shows arg names but other not. Also when we are tracing functions it's rare to especially distinguish syscalls and tp trace. Only use one option to control arg names printing is more resonable and simple. So remove the force condition and commit. After fix: > perf trace -e syscalls:*mmap sleep 1 0.000 sleep/1121 syscalls:sys_enter_mmap(NULL, 8192, READ|WRITE, PRIVATE|ANONYMOUS) 0.163 sleep/1121 syscalls:sys_exit_mmap(9, 140454467661824) ... Fixes: f11b2803bb88655d ("perf trace: Allow choosing how to augment the tracepoint arguments") Signed-off-by: Chen Zhongjin <chenzhongjin@huawei.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lore.kernel.org/lkml/20220926031440.28275-2-chenzhongjin@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-trace.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 3ecc31375f90e..99e23e6e6a67a 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2759,11 +2759,7 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : ""); - /* - * XXX Perhaps we should have a show_tp_arg_names, - * leaving show_arg_names just for syscalls? 
- */ - if (1 || trace->show_arg_names) + if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); printed += syscall_arg_fmt__scnprintf_val(arg, bf + printed, size - printed, &syscall_arg, val); -- GitLab From 96b731412d51c6d19c5269f8e6bf2b6621d3b994 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin <chenzhongjin@huawei.com> Date: Mon, 26 Sep 2022 11:14:37 +0800 Subject: [PATCH 1278/2223] perf trace: Fix incorrectly parsed hexadecimal value for flags in filter When parsing flags in filter, the strtoul function uses wrong parsing condition (tok[1] = 'x'), which can make the flags be corrupted and treat all numbers start with 0 as hex. In fact strtoul() will auto test hex format when base == 0 (See _parse_integer_fixup_radix). So there is no need to test this again. Remove the unnessesary is_hexa test. Fixes: 154c978d484c6104 ("libbeauty: Introduce strarray__strtoul_flags()") Signed-off-by: Chen Zhongjin <chenzhongjin@huawei.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lore.kernel.org/lkml/20220926031440.28275-3-chenzhongjin@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-trace.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 99e23e6e6a67a..d3c757769b965 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -615,11 +615,8 @@ bool strarray__strtoul_flags(struct strarray *sa, char *bf, size_t size, u64 *re if (isalpha(*tok) || *tok == '_') { if (!strarray__strtoul(sa, tok, toklen, &val)) return 
false; - } else { - bool is_hexa = tok[0] == 0 && (tok[1] = 'x' || tok[1] == 'X'); - - val = strtoul(tok, NULL, is_hexa ? 16 : 0); - } + } else + val = strtoul(tok, NULL, 0); *ret |= (1 << (val - 1)); -- GitLab From 058443934524590d5537a80f490267cc95a61c05 Mon Sep 17 00:00:00 2001 From: Leo Yan <leo.yan@linaro.org> Date: Sun, 25 Sep 2022 10:58:34 +0800 Subject: [PATCH 1279/2223] perf subcmd: Set environment variable "PREFIX" Set environment variable "PREFIX", it will be used by invoked shell script, e.g. the shell script uses it to find lib paths. Signed-off-by: Leo Yan <leo.yan@linaro.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220925025835.70364-2-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/lib/subcmd/exec-cmd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/lib/subcmd/exec-cmd.c b/tools/lib/subcmd/exec-cmd.c index 33e94fb839867..5dbea456973e1 100644 --- a/tools/lib/subcmd/exec-cmd.c +++ b/tools/lib/subcmd/exec-cmd.c @@ -24,6 +24,9 @@ void exec_cmd_init(const char *exec_name, const char *prefix, subcmd_config.prefix = prefix; subcmd_config.exec_path = exec_path; subcmd_config.exec_path_env = exec_path_env; + + /* Setup environment variable for invoked shell script. */ + setenv("PREFIX", prefix, 1); } #define is_dir_sep(c) ((c) == '/') -- GitLab From 1dc86fc731addf783d076cb6182ebc84e2624cc0 Mon Sep 17 00:00:00 2001 From: Leo Yan <leo.yan@linaro.org> Date: Sun, 25 Sep 2022 10:58:35 +0800 Subject: [PATCH 1280/2223] perf test: Introduce script for java symbol testing This commit introduces a script for testing java symbols. 
The test records java program, inject samples with JIT samples, check specific JIT symbols in the report, the test will pass only when these two symbols are detected. Suggested-by: Ian Rogers <irogers@google.com> Signed-off-by: Leo Yan <leo.yan@linaro.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220925025835.70364-3-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_java_symbol.sh | 75 ++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100755 tools/perf/tests/shell/test_java_symbol.sh diff --git a/tools/perf/tests/shell/test_java_symbol.sh b/tools/perf/tests/shell/test_java_symbol.sh new file mode 100755 index 0000000000000..f221225808a38 --- /dev/null +++ b/tools/perf/tests/shell/test_java_symbol.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# Test java symbol + +# SPDX-License-Identifier: GPL-2.0 +# Leo Yan <leo.yan@linaro.org>, 2022 + +# skip if there's no jshell +if ! [ -x "$(command -v jshell)" ]; then + echo "skip: no jshell, install JDK" + exit 2 +fi + +PERF_DATA=$(mktemp /tmp/__perf_test.perf.data.XXXXX) +PERF_INJ_DATA=$(mktemp /tmp/__perf_test.perf.data.inj.XXXXX) + +cleanup_files() +{ + echo "Cleaning up files..." 
+ rm -f ${PERF_DATA} + rm -f ${PERF_INJ_DATA} +} + +trap cleanup_files exit term int + +if [ -e "$PWD/tools/perf/libperf-jvmti.so" ]; then + LIBJVMTI=$PWD/tools/perf/libperf-jvmti.so +elif [ -e "$PWD/libperf-jvmti.so" ]; then + LIBJVMTI=$PWD/libperf-jvmti.so +elif [ -e "$PREFIX/lib64/libperf-jvmti.so" ]; then + LIBJVMTI=$PREFIX/lib64/libperf-jvmti.so +elif [ -e "$PREFIX/lib/libperf-jvmti.so" ]; then + LIBJVMTI=$PREFIX/lib/libperf-jvmti.so +elif [ -e "/usr/lib/linux-tools-$(uname -a | awk '{ print $3 }' | sed -r 's/-generic//')/libperf-jvmti.so" ]; then + LIBJVMTI=/usr/lib/linux-tools-$(uname -a | awk '{ print $3 }' | sed -r 's/-generic//')/libperf-jvmti.so +else + echo "Fail to find libperf-jvmti.so" + # JVMTI is a build option, skip the test if fail to find lib + exit 2 +fi + +cat <<EOF | perf record -k 1 -o $PERF_DATA jshell -s -J-agentpath:$LIBJVMTI +int fib(int x) { + return x > 1 ? fib(x - 2) + fib(x - 1) : 1; +} + +int q = 0; + +for (int i = 0; i < 10; i++) + q += fib(i); + +System.out.println(q); +EOF + +if [ $? -ne 0 ]; then + echo "Fail to record for java program" + exit 1 +fi + +if ! perf inject -i $PERF_DATA -o $PERF_INJ_DATA -j; then + echo "Fail to inject samples" + exit 1 +fi + +# Below is an example of the instruction samples reporting: +# 8.18% jshell jitted-50116-29.so [.] Interpreter +# 0.75% Thread-1 jitted-83602-1670.so [.] jdk.internal.jimage.BasicImageReader.getString(int) +perf report --stdio -i ${PERF_INJ_DATA} 2>&1 | \ + egrep " +[0-9]+\.[0-9]+% .* (Interpreter|jdk\.internal).*" > /dev/null 2>&1 + +if [ $? -ne 0 ]; then + echo "Fail to find java symbols" + exit 1 +fi + +exit 0 -- GitLab From 728c2edfcf14b3b61bd0ff82894f03455ca0e7d7 Mon Sep 17 00:00:00 2001 From: Jason Andryuk <jandryuk@gmail.com> Date: Mon, 29 Aug 2022 11:15:36 -0400 Subject: [PATCH 1281/2223] xen-pcifront: Handle missed Connected state An HVM guest with linux stubdomain and 2 PCI devices failed to start as libxl timed out waiting for the PCI devices to be added. 
It happens intermittently but with some regularity. libxl wrote the two xenstore entries for the devices, but then timed out waiting for backend state 4 (Connected) - the state stayed at 7 (Reconfiguring). (PCI passthrough to an HVM with stubdomain is PV passthrough to the stubdomain and then HVM passthrough with the QEMU inside the stubdomain.) The stubdomain kernel never printed "pcifront pci-0: Installing PCI frontend", so it seems to have missed state 4 which would have called pcifront_try_connect() -> pcifront_connect_and_init_dma() Have pcifront_detach_devices() special-case state Initialised and call pcifront_connect_and_init_dma(). Don't use pcifront_try_connect() because that sets the xenbus state which may throw off the backend. After connecting, skip the remainder of detach_devices since none have been initialized yet. When the backend switches to Reconfigured, pcifront_attach_devices() will pick them up again. Signed-off-by: Jason Andryuk <jandryuk@gmail.com> Reviewed-by: Juergen Gross <jgross@suse.com> Link: https://lore.kernel.org/r/20220829151536.8578-1-jandryuk@gmail.com Signed-off-by: Juergen Gross <jgross@suse.com> --- drivers/pci/xen-pcifront.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index 689271c4245c2..77e61b4701218 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -981,13 +981,26 @@ static int pcifront_detach_devices(struct pcifront_device *pdev) { int err = 0; int i, num_devs; + enum xenbus_state state; unsigned int domain, bus, slot, func; struct pci_dev *pci_dev; char str[64]; - if (xenbus_read_driver_state(pdev->xdev->nodename) != - XenbusStateConnected) + state = xenbus_read_driver_state(pdev->xdev->nodename); + if (state == XenbusStateInitialised) { + dev_dbg(&pdev->xdev->dev, "Handle skipped connect.\n"); + /* We missed Connected and need to initialize. 
*/ + err = pcifront_connect_and_init_dma(pdev); + if (err && err != -EEXIST) { + xenbus_dev_fatal(pdev->xdev, err, + "Error setting up PCI Frontend"); + goto out; + } + + goto out_switch_state; + } else if (state != XenbusStateConnected) { goto out; + } err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "num_devs", "%d", &num_devs); @@ -1048,6 +1061,7 @@ static int pcifront_detach_devices(struct pcifront_device *pdev) domain, bus, slot, func); } + out_switch_state: err = xenbus_switch_state(pdev->xdev, XenbusStateReconfiguring); out: -- GitLab From 1d800f32b2574c1d055984ad17223198caddbb54 Mon Sep 17 00:00:00 2001 From: Jonathan Derrick <jonathan.derrick@linux.dev> Date: Mon, 3 Oct 2022 14:25:11 -0600 Subject: [PATCH 1282/2223] MAINTAINERS: Update SED-Opal Maintainers Add my new email address and remove Revanth Signed-off-by: Jonathan Derrick <jonathan.derrick@linux.dev> Link: https://lore.kernel.org/r/20221003202511.5124-1-jonathan.derrick@linux.dev Signed-off-by: Jens Axboe <axboe@kernel.dk> --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 47f27eea29ba5..f9f5184e7e74e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18298,8 +18298,7 @@ S: Maintained F: drivers/mmc/host/sdhci-esdhc-imx.c SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER -M: Jonathan Derrick <jonathan.derrick@intel.com> -M: Revanth Rajashekar <revanth.rajashekar@intel.com> +M: Jonathan Derrick <jonathan.derrick@linux.dev> L: linux-block@vger.kernel.org S: Supported F: block/opal_proto.h -- GitLab From da4ab869e37cf81f93333ba74b16e0ea6d322e15 Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@kernel.org> Date: Wed, 25 May 2022 06:11:00 -0400 Subject: [PATCH 1283/2223] libceph: drop last_piece flag from ceph_msg_data_cursor ceph_msg_data_next is always passed a NULL pointer for this field. Some of the "next" operations look at it in order to determine the length, but we can just take the min of the data on the page or cursor->resid. 
Signed-off-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: Xiubo Li <xiubli@redhat.com> Reviewed-by: Ilya Dryomov <idryomov@gmail.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com> --- include/linux/ceph/messenger.h | 4 +--- net/ceph/messenger.c | 40 +++++----------------------------- net/ceph/messenger_v1.c | 6 ++--- net/ceph/messenger_v2.c | 2 +- 4 files changed, 10 insertions(+), 42 deletions(-) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index e7f2fb2fc2079..99c1726be6ee7 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -207,7 +207,6 @@ struct ceph_msg_data_cursor { struct ceph_msg_data *data; /* current data item */ size_t resid; /* bytes not yet consumed */ - bool last_piece; /* current is last piece */ bool need_crc; /* crc update needed */ union { #ifdef CONFIG_BLOCK @@ -498,8 +497,7 @@ void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq); void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, struct ceph_msg *msg, size_t length); struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, - size_t *page_offset, size_t *length, - bool *last_piece); + size_t *page_offset, size_t *length); void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes); u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d3bb656308b43..dfa237fbd5a32 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -728,7 +728,6 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, it->iter.bi_size = cursor->resid; BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); - cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); } static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, @@ -754,10 +753,8 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, cursor->resid -= 
bytes; bio_advance_iter(it->bio, &it->iter, bytes); - if (!cursor->resid) { - BUG_ON(!cursor->last_piece); + if (!cursor->resid) return false; /* no more data */ - } if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done && page == bio_iter_page(it->bio, it->iter))) @@ -770,9 +767,7 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, it->iter.bi_size = cursor->resid; } - BUG_ON(cursor->last_piece); BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); - cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); return true; } #endif /* CONFIG_BLOCK */ @@ -788,8 +783,6 @@ static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor, cursor->bvec_iter.bi_size = cursor->resid; BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); - cursor->last_piece = - cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter); } static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor, @@ -815,19 +808,14 @@ static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor, cursor->resid -= bytes; bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes); - if (!cursor->resid) { - BUG_ON(!cursor->last_piece); + if (!cursor->resid) return false; /* no more data */ - } if (!bytes || (cursor->bvec_iter.bi_bvec_done && page == bvec_iter_page(bvecs, cursor->bvec_iter))) return false; /* more bytes to process in this segment */ - BUG_ON(cursor->last_piece); BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); - cursor->last_piece = - cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter); return true; } @@ -853,7 +841,6 @@ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, BUG_ON(page_count > (int)USHRT_MAX); cursor->page_count = (unsigned short)page_count; BUG_ON(length > SIZE_MAX - cursor->page_offset); - cursor->last_piece = cursor->page_offset + cursor->resid <= PAGE_SIZE; } static struct page * @@ -868,11 +855,7 @@ 
ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor, BUG_ON(cursor->page_offset >= PAGE_SIZE); *page_offset = cursor->page_offset; - if (cursor->last_piece) - *length = cursor->resid; - else - *length = PAGE_SIZE - *page_offset; - + *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset); return data->pages[cursor->page_index]; } @@ -897,8 +880,6 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, BUG_ON(cursor->page_index >= cursor->page_count); cursor->page_index++; - cursor->last_piece = cursor->resid <= PAGE_SIZE; - return true; } @@ -928,7 +909,6 @@ ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, cursor->resid = min(length, pagelist->length); cursor->page = page; cursor->offset = 0; - cursor->last_piece = cursor->resid <= PAGE_SIZE; } static struct page * @@ -948,11 +928,7 @@ ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor, /* offset of first page in pagelist is always 0 */ *page_offset = cursor->offset & ~PAGE_MASK; - if (cursor->last_piece) - *length = cursor->resid; - else - *length = PAGE_SIZE - *page_offset; - + *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset); return cursor->page; } @@ -985,8 +961,6 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); cursor->page = list_next_entry(cursor->page, lru); - cursor->last_piece = cursor->resid <= PAGE_SIZE; - return true; } @@ -1044,8 +1018,7 @@ void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, * Indicate whether this is the last piece in this data item. 
*/ struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, - size_t *page_offset, size_t *length, - bool *last_piece) + size_t *page_offset, size_t *length) { struct page *page; @@ -1074,8 +1047,6 @@ struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, BUG_ON(*page_offset + *length > PAGE_SIZE); BUG_ON(!*length); BUG_ON(*length > cursor->resid); - if (last_piece) - *last_piece = cursor->last_piece; return page; } @@ -1112,7 +1083,6 @@ void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) cursor->total_resid -= bytes; if (!cursor->resid && cursor->total_resid) { - WARN_ON(!cursor->last_piece); cursor->data++; __ceph_msg_data_cursor_init(cursor); new_piece = true; diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 6b014eca3a130..3ddbde87e4d6e 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -495,7 +495,7 @@ static int write_partial_message_data(struct ceph_connection *con) continue; } - page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + page = ceph_msg_data_next(cursor, &page_offset, &length); if (length == cursor->total_resid) more = MSG_MORE; ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, @@ -1008,7 +1008,7 @@ static int read_partial_msg_data(struct ceph_connection *con) continue; } - page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + page = ceph_msg_data_next(cursor, &page_offset, &length); ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); if (ret <= 0) { if (do_datacrc) @@ -1050,7 +1050,7 @@ static int read_partial_msg_data_bounce(struct ceph_connection *con) continue; } - page = ceph_msg_data_next(cursor, &off, &len, NULL); + page = ceph_msg_data_next(cursor, &off, &len); ret = ceph_tcp_recvpage(con->sock, con->bounce_page, 0, len); if (ret <= 0) { con->in_data_crc = crc; diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index c6e5bfc717d54..cc8ff81a50b7f 100644 --- a/net/ceph/messenger_v2.c +++ 
b/net/ceph/messenger_v2.c @@ -862,7 +862,7 @@ static void get_bvec_at(struct ceph_msg_data_cursor *cursor, ceph_msg_data_advance(cursor, 0); /* get a piece of data, cursor isn't advanced */ - page = ceph_msg_data_next(cursor, &off, &len, NULL); + page = ceph_msg_data_next(cursor, &off, &len); bv->bv_page = page; bv->bv_offset = off; -- GitLab From f791357330b0043ec953ce122ab7519af4b9d24a Mon Sep 17 00:00:00 2001 From: Xiubo Li <xiubli@redhat.com> Date: Fri, 5 Aug 2022 12:33:03 +0800 Subject: [PATCH 1284/2223] ceph: wake up the waiters if any new caps comes When new caps comes we need to wake up the waiters and also when revoking the caps, there also could be new caps comes. Link: https://tracker.ceph.com/issues/54044 Signed-off-by: Xiubo Li <xiubli@redhat.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com> --- fs/ceph/caps.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 53cfe026b3ea5..0ddd91eadbce2 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -754,6 +754,7 @@ void ceph_add_cap(struct inode *inode, cap->issue_seq = seq; cap->mseq = mseq; cap->cap_gen = gen; + wake_up_all(&ci->i_cap_wq); } /* @@ -3550,6 +3551,9 @@ static void handle_cap_grant(struct inode *inode, check_caps = 1; /* check auth cap only */ else check_caps = 2; /* check all caps */ + /* If there is new caps, try to wake up the waiters */ + if (~cap->issued & newcaps) + wake = true; cap->issued = newcaps; cap->implemented |= newcaps; } else if (cap->issued == newcaps) { -- GitLab From 6eb06c46214d33c71ae86d60b3fc9cb17c20beca Mon Sep 17 00:00:00 2001 From: Xiubo Li <xiubli@redhat.com> Date: Wed, 27 Jul 2022 12:29:10 +0800 Subject: [PATCH 1285/2223] ceph: fail the request if the peer MDS doesn't support getvxattr op Just fail the request instead sending the request out, or the peer MDS will crash. 
Link: https://tracker.ceph.com/issues/56529 Signed-off-by: Xiubo Li <xiubli@redhat.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com> --- fs/ceph/inode.c | 1 + fs/ceph/mds_client.c | 11 +++++++++++ fs/ceph/mds_client.h | 6 +++++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 42351d7a0dd6b..b4a3cb07d5b00 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2356,6 +2356,7 @@ int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, goto out; } + req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR; req->r_path2 = kstrdup(name, GFP_NOFS); if (!req->r_path2) { err = -ENOMEM; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 80f8b9ec1a312..26a0a8b9975ef 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2318,6 +2318,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) INIT_LIST_HEAD(&req->r_unsafe_dir_item); INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; + req->r_feature_needed = -1; kref_init(&req->r_kref); RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_wait); @@ -2916,6 +2917,16 @@ static void __do_request(struct ceph_mds_client *mdsc, dout("do_request mds%d session %p state %s\n", mds, session, ceph_session_state_name(session->s_state)); + + /* + * The old ceph will crash the MDSs when see unknown OPs + */ + if (req->r_feature_needed > 0 && + !test_bit(req->r_feature_needed, &session->s_features)) { + err = -EOPNOTSUPP; + goto out_session; + } + if (session->s_state != CEPH_MDS_SESSION_OPEN && session->s_state != CEPH_MDS_SESSION_HUNG) { /* diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 256e3eada6c12..0598faa50e2e0 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -31,8 +31,9 @@ enum ceph_feature_type { CEPHFS_FEATURE_METRIC_COLLECT, CEPHFS_FEATURE_ALTERNATE_NAME, CEPHFS_FEATURE_NOTIFY_SESSION_STATE, + CEPHFS_FEATURE_OP_GETVXATTR, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_NOTIFY_SESSION_STATE, + 
CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR, }; #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ @@ -44,6 +45,7 @@ enum ceph_feature_type { CEPHFS_FEATURE_DELEG_INO, \ CEPHFS_FEATURE_METRIC_COLLECT, \ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ + CEPHFS_FEATURE_OP_GETVXATTR, \ } /* @@ -336,6 +338,8 @@ struct ceph_mds_request { long long r_dir_ordered_cnt; int r_readdir_cache_idx; + int r_feature_needed; + struct ceph_cap_reservation r_caps_reservation; }; -- GitLab From 7c3ea9870e09e193981695dd67c37a1a2b6d600b Mon Sep 17 00:00:00 2001 From: Xiubo Li <xiubli@redhat.com> Date: Thu, 11 Aug 2022 13:00:53 +0800 Subject: [PATCH 1286/2223] ceph: no need to wait for transition RDCACHE|RD -> RD For write when trying to get the Fwb caps we need to keep waiting on transition from WRBUFFER|WR -> WR to avoid a new WR sync write from going before a prior buffered writeback happens. While for read there is no need to wait on transition from RDCACHE|RD -> RD, and we can just exclude the revoking caps and force to sync read. Signed-off-by: Xiubo Li <xiubli@redhat.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com> --- fs/ceph/caps.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0ddd91eadbce2..0dc1251c3c6db 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2760,13 +2760,17 @@ again: * on transition from wanted -> needed caps. This is needed * for WRBUFFER|WR -> WR to avoid a new WR sync write from * going before a prior buffered writeback happens. + * + * For RDCACHE|RD -> RD, there is no need to wait and we can + * just exclude the revoking caps and force to sync read.
*/ int not = want & ~(have & need); int revoking = implemented & ~have; + int exclude = revoking & not; dout("get_cap_refs %p have %s but not %s (revoking %s)\n", inode, ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); - if ((revoking & not) == 0) { + if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) { if (!snap_rwsem_locked && !ci->i_head_snapc && (need & CEPH_CAP_FILE_WR)) { @@ -2788,7 +2792,7 @@ again: snap_rwsem_locked = true; } if ((have & want) == want) - *got = need | want; + *got = need | (want & ~exclude); else *got = need; ceph_take_cap_refs(ci, *got, true); -- GitLab From aa1d627207cace003163dee24d1c06fa4e910c6b Mon Sep 17 00:00:00 2001 From: Kenneth Lee <klee33@uw.edu> Date: Thu, 18 Aug 2022 22:42:55 -0700 Subject: [PATCH 1287/2223] ceph: Use kcalloc for allocating multiple elements Prefer using kcalloc(a, b) over kzalloc(a * b) as this improves semantics since kcalloc is intended for allocating an array of memory. Signed-off-by: Kenneth Lee <klee33@uw.edu> Reviewed-by: Xiubo Li <xiubli@redhat.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com> --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0dc1251c3c6db..fb023f9fafcbe 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2286,7 +2286,7 @@ retry: struct ceph_mds_request *req; int i; - sessions = kzalloc(max_sessions * sizeof(s), GFP_KERNEL); + sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL); if (!sessions) { err = -ENOMEM; goto out; -- GitLab From b4b924c7a16e857b0715603456045251a49f2ea6 Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@kernel.org> Date: Wed, 24 Aug 2022 09:24:42 -0400 Subject: [PATCH 1288/2223] ceph: increment i_version when doing a setattr with caps When the client has enough caps to satisfy a setattr locally without having to talk to the server, we currently do the setattr without incrementing the change attribute. 
Ensure that if the ctime changes locally, then the change attribute does too. Signed-off-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: Xiubo Li <xiubli@redhat.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com> --- fs/ceph/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index b4a3cb07d5b00..a5e2eb5704c9f 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2192,6 +2192,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, &prealloc_cf); inode->i_ctime = attr->ia_ctime; + inode_inc_iversion_raw(inode); } release &= issued; -- GitLab From bd04b9192e1ff6859d6b3906e91cfd5c9b0ad55b Mon Sep 17 00:00:00 2001 From: Xiubo Li <xiubli@redhat.com> Date: Tue, 30 Aug 2022 22:49:36 +0800 Subject: [PATCH 1289/2223] ceph: fail the open_by_handle_at() if the dentry is being unlinked MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When unlinking a file the kclient will send a unlink request to MDS by holding the dentry reference, and then the MDS will return 2 replies, which are unsafe reply and a deferred safe reply. After the unsafe reply received the kernel will return and succeed the unlink request to user space apps. Only when the safe reply received the dentry's reference will be released. Or the dentry will only be unhashed from dcache. But when the open_by_handle_at() begins to open the unlinked files it will succeed. The inode->i_count couldn't be used to check whether the inode is opened or not. 
Link: https://tracker.ceph.com/issues/56524 Signed-off-by: Xiubo Li <xiubli@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: Luís Henriques <lhenriques@suse.de> Tested-by: Luís Henriques <lhenriques@suse.de> Signed-off-by: Ilya Dryomov <idryomov@gmail.com> --- fs/ceph/export.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e0fa66ac8b9fa..f780e4e0d0629 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -181,6 +181,7 @@ struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino) static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) { struct inode *inode = __lookup_inode(sb, ino); + struct ceph_inode_info *ci = ceph_inode(inode); int err; if (IS_ERR(inode)) @@ -192,7 +193,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) return ERR_PTR(err); } /* -ESTALE if inode as been unlinked and no file is open */ - if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) { + if ((inode->i_nlink == 0) && !__ceph_is_file_opened(ci)) { iput(inode); return ERR_PTR(-ESTALE); } -- GitLab From aa87052dd965a6094355fcc13d5abc3f5bebfbe4 Mon Sep 17 00:00:00 2001 From: Xiubo Li <xiubli@redhat.com> Date: Wed, 31 Aug 2022 12:13:28 +0800 Subject: [PATCH 1290/2223] ceph: fix incorrectly showing the .snap size for stat We should set the 'stat->size' to the real number of snapshots for snapdirs. 
Link: https://tracker.ceph.com/issues/57342 Signed-off-by: Xiubo Li <xiubli@redhat.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com> --- fs/ceph/inode.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index a5e2eb5704c9f..9ebb7cee79789 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2449,6 +2449,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); + struct super_block *sb = inode->i_sb; struct ceph_inode_info *ci = ceph_inode(inode); u32 valid_mask = STATX_BASIC_STATS; int err = 0; @@ -2478,16 +2479,34 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, } if (ceph_snap(inode) == CEPH_NOSNAP) - stat->dev = inode->i_sb->s_dev; + stat->dev = sb->s_dev; else stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0; if (S_ISDIR(inode->i_mode)) { - if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), - RBYTES)) + if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) { stat->size = ci->i_rbytes; - else + } else if (ceph_snap(inode) == CEPH_SNAPDIR) { + struct ceph_inode_info *pci; + struct ceph_snap_realm *realm; + struct inode *parent; + + parent = ceph_lookup_inode(sb, ceph_ino(inode)); + if (!parent) + return PTR_ERR(parent); + + pci = ceph_inode(parent); + spin_lock(&pci->i_ceph_lock); + realm = pci->i_snap_realm; + if (realm) + stat->size = realm->num_snaps; + else + stat->size = 0; + spin_unlock(&pci->i_ceph_lock); + iput(parent); + } else { stat->size = ci->i_files + ci->i_subdirs; + } stat->blocks = 0; stat->blksize = 65536; /* -- GitLab From 71cf0c1c4f9f8e42c84ca53a5ca7091e4eea7f6a Mon Sep 17 00:00:00 2001 From: Tiezhu Yang <yangtiezhu@loongson.cn> Date: Mon, 5 Sep 2022 14:35:35 +0800 Subject: [PATCH 1291/2223] ceph: remove Sage's git tree from documentation Sage's git tree has not been pushed to in years, 
and it was removed in commit 3a5ccecd9af7 ("MAINTAINERS: remove myself as ceph co-maintainer"), so it is better to remove it in the documentation too. Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn> Signed-off-by: Ilya Dryomov <idryomov@gmail.com> --- Documentation/filesystems/ceph.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/filesystems/ceph.rst b/Documentation/filesystems/ceph.rst index 4942e018db855..76ce938e70244 100644 --- a/Documentation/filesystems/ceph.rst +++ b/Documentation/filesystems/ceph.rst @@ -203,7 +203,6 @@ For more information on Ceph, see the home page at The Linux kernel client source tree is available at - https://github.com/ceph/ceph-client.git - - git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git and the source for the full system is at https://github.com/ceph/ceph.git -- GitLab From 98828955971363e838149105c268b1fad905f15b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 23 Sep 2022 11:26:39 +0200 Subject: [PATCH 1292/2223] drm/i915/gvt: fix a memory leak in intel_gvt_init_vgpu_types gvt->types needs to be freed on error. 
Fixes: bc90d097ae14 ("drm/i915/gvt: define weight according to vGPU type") Reported-by: Kevin Tian <kevin.tian@intel.com> Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Zhenyu Wang <zhenyuw@linux.intel.com> Link: https://lore.kernel.org/r/20220923092652.100656-2-hch@lst.de [aw: Correct fixes commit ID as reported by Stephen Rothwell] Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- drivers/gpu/drm/i915/gvt/vgpu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index 5c533fbc2c8da..dbb2a971ba5d8 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -142,7 +142,7 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) if (vgpu_types[i].weight < 1 || vgpu_types[i].weight > VGPU_MAX_WEIGHT) - return -EINVAL; + goto out_free_types; gvt->types[i].weight = vgpu_types[i].weight; gvt->types[i].resolution = vgpu_types[i].edid; @@ -167,6 +167,10 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) gvt->num_types = i; return 0; + +out_free_types: + kfree(gvt->types); + return -EINVAL; } void intel_gvt_clean_vgpu_types(struct intel_gvt *gvt) -- GitLab From 1aa3834f510c9d9206ce4d40aff4903b0c016761 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 23 Sep 2022 11:26:40 +0200 Subject: [PATCH 1293/2223] drm/i915/gvt: simplify vgpu configuration management Instead of copying the information from the vgpu_types arrays into each intel_vgpu_type structure, just reference this constant information with a pointer to the already existing data structure, and pass it into the low-level VGPU creation helpers instead of copying the data into yet another params data structure.
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Zhenyu Wang <zhenyuw@linux.intel.com> Link: https://lore.kernel.org/r/20220923092652.100656-3-hch@lst.de [aw: Fold fix from 20220928121110.GA30738@lst.de] Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- drivers/gpu/drm/i915/gvt/aperture_gm.c | 20 ++-- drivers/gpu/drm/i915/gvt/gvt.h | 37 +++--- drivers/gpu/drm/i915/gvt/kvmgt.c | 10 +- drivers/gpu/drm/i915/gvt/vgpu.c | 159 ++++++++----------------- 4 files changed, 81 insertions(+), 145 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/aperture_gm.c b/drivers/gpu/drm/i915/gvt/aperture_gm.c index 3b81a6d35a7b2..076c779f776a6 100644 --- a/drivers/gpu/drm/i915/gvt/aperture_gm.c +++ b/drivers/gpu/drm/i915/gvt/aperture_gm.c @@ -240,13 +240,13 @@ static void free_resource(struct intel_vgpu *vgpu) } static int alloc_resource(struct intel_vgpu *vgpu, - struct intel_vgpu_creation_params *param) + const struct intel_vgpu_config *conf) { struct intel_gvt *gvt = vgpu->gvt; unsigned long request, avail, max, taken; const char *item; - if (!param->low_gm_sz || !param->high_gm_sz || !param->fence_sz) { + if (!conf->low_mm || !conf->high_mm || !conf->fence) { gvt_vgpu_err("Invalid vGPU creation params\n"); return -EINVAL; } @@ -255,7 +255,7 @@ static int alloc_resource(struct intel_vgpu *vgpu, max = gvt_aperture_sz(gvt) - HOST_LOW_GM_SIZE; taken = gvt->gm.vgpu_allocated_low_gm_size; avail = max - taken; - request = MB_TO_BYTES(param->low_gm_sz); + request = conf->low_mm; if (request > avail) goto no_enough_resource; @@ -266,7 +266,7 @@ static int alloc_resource(struct intel_vgpu *vgpu, max = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE; taken = gvt->gm.vgpu_allocated_high_gm_size; avail = max - taken; - request = MB_TO_BYTES(param->high_gm_sz); + request = conf->high_mm; if (request > avail) goto no_enough_resource; @@ -277,16 +277,16 @@ static int alloc_resource(struct intel_vgpu *vgpu, max = gvt_fence_sz(gvt) - 
HOST_FENCE; taken = gvt->fence.vgpu_allocated_fence_num; avail = max - taken; - request = param->fence_sz; + request = conf->fence; if (request > avail) goto no_enough_resource; vgpu_fence_sz(vgpu) = request; - gvt->gm.vgpu_allocated_low_gm_size += MB_TO_BYTES(param->low_gm_sz); - gvt->gm.vgpu_allocated_high_gm_size += MB_TO_BYTES(param->high_gm_sz); - gvt->fence.vgpu_allocated_fence_num += param->fence_sz; + gvt->gm.vgpu_allocated_low_gm_size += conf->low_mm; + gvt->gm.vgpu_allocated_high_gm_size += conf->high_mm; + gvt->fence.vgpu_allocated_fence_num += conf->fence; return 0; no_enough_resource: @@ -340,11 +340,11 @@ void intel_vgpu_reset_resource(struct intel_vgpu *vgpu) * */ int intel_vgpu_alloc_resource(struct intel_vgpu *vgpu, - struct intel_vgpu_creation_params *param) + const struct intel_vgpu_config *conf) { int ret; - ret = alloc_resource(vgpu, param); + ret = alloc_resource(vgpu, conf); if (ret) return ret; diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 89fab7896fc6a..563ffc2fdfb7a 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -294,15 +294,26 @@ struct intel_gvt_firmware { bool firmware_loaded; }; +struct intel_vgpu_config { + unsigned int low_mm; + unsigned int high_mm; + unsigned int fence; + + /* + * A vGPU with a weight of 8 will get twice as much GPU as a vGPU with + * a weight of 4 on a contended host, different vGPU type has different + * weight set. Legal weights range from 1 to 16. + */ + unsigned int weight; + enum intel_vgpu_edid edid; + const char *name; +}; + #define NR_MAX_INTEL_VGPU_TYPES 20 struct intel_vgpu_type { char name[16]; + const struct intel_vgpu_config *conf; unsigned int avail_instance; - unsigned int low_gm_size; - unsigned int high_gm_size; - unsigned int fence; - unsigned int weight; - enum intel_vgpu_edid resolution; }; struct intel_gvt { @@ -436,19 +447,8 @@ int intel_gvt_load_firmware(struct intel_gvt *gvt); /* ring context size i.e. 
the first 0x50 dwords*/ #define RING_CTX_SIZE 320 -struct intel_vgpu_creation_params { - __u64 low_gm_sz; /* in MB */ - __u64 high_gm_sz; /* in MB */ - __u64 fence_sz; - __u64 resolution; - __s32 primary; - __u64 vgpu_id; - - __u32 weight; -}; - int intel_vgpu_alloc_resource(struct intel_vgpu *vgpu, - struct intel_vgpu_creation_params *param); + const struct intel_vgpu_config *conf); void intel_vgpu_reset_resource(struct intel_vgpu *vgpu); void intel_vgpu_free_resource(struct intel_vgpu *vgpu); void intel_vgpu_write_fence(struct intel_vgpu *vgpu, @@ -494,7 +494,8 @@ void intel_gvt_clean_vgpu_types(struct intel_gvt *gvt); struct intel_vgpu *intel_gvt_create_idle_vgpu(struct intel_gvt *gvt); void intel_gvt_destroy_idle_vgpu(struct intel_vgpu *vgpu); -int intel_gvt_create_vgpu(struct intel_vgpu *vgpu, struct intel_vgpu_type *type); +int intel_gvt_create_vgpu(struct intel_vgpu *vgpu, + const struct intel_vgpu_config *conf); void intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu); void intel_gvt_release_vgpu(struct intel_vgpu *vgpu); void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, bool dmlr, diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 9003145adb5a9..7f3596394645e 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -151,10 +151,10 @@ static ssize_t description_show(struct mdev_type *mtype, return sprintf(buf, "low_gm_size: %dMB\nhigh_gm_size: %dMB\n" "fence: %d\nresolution: %s\n" "weight: %d\n", - BYTES_TO_MB(type->low_gm_size), - BYTES_TO_MB(type->high_gm_size), - type->fence, vgpu_edid_str(type->resolution), - type->weight); + BYTES_TO_MB(type->conf->low_mm), + BYTES_TO_MB(type->conf->high_mm), + type->conf->fence, vgpu_edid_str(type->conf->edid), + type->conf->weight); } static ssize_t name_show(struct mdev_type *mtype, @@ -1559,7 +1559,7 @@ static int intel_vgpu_init_dev(struct vfio_device *vfio_dev) return -EINVAL; vgpu->gvt = gvt; - return intel_gvt_create_vgpu(vgpu, type); + 
return intel_gvt_create_vgpu(vgpu, type->conf); } static void intel_vgpu_release_dev(struct vfio_device *vfio_dev) diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index dbb2a971ba5d8..b0d5dafd013f4 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -73,24 +73,21 @@ void populate_pvinfo_page(struct intel_vgpu *vgpu) drm_WARN_ON(&i915->drm, sizeof(struct vgt_if) != VGT_PVINFO_SIZE); } +/* + * vGPU type name is defined as GVTg_Vx_y which contains the physical GPU + * generation type (e.g V4 as BDW server, V5 as SKL server). + * + * Depending on the physical SKU resource, we might see vGPU types like + * GVTg_V4_8, GVTg_V4_4, GVTg_V4_2, etc. We can create different types of + * vGPU on same physical GPU depending on available resource. Each vGPU + * type will have a different number of avail_instance to indicate how + * many vGPU instance can be created for this type. + */ #define VGPU_MAX_WEIGHT 16 #define VGPU_WEIGHT(vgpu_num) \ (VGPU_MAX_WEIGHT / (vgpu_num)) -static const struct { - unsigned int low_mm; - unsigned int high_mm; - unsigned int fence; - - /* A vGPU with a weight of 8 will get twice as much GPU as a vGPU - * with a weight of 4 on a contended host, different vGPU type has - * different weight set. Legal weights range from 1 to 16.
- */ - unsigned int weight; - enum intel_vgpu_edid edid; - const char *name; -} vgpu_types[] = { -/* Fixed vGPU type table */ +static const struct intel_vgpu_config intel_vgpu_configs[] = { { MB_TO_BYTES(64), MB_TO_BYTES(384), 4, VGPU_WEIGHT(8), GVT_EDID_1024_768, "8" }, { MB_TO_BYTES(128), MB_TO_BYTES(512), 4, VGPU_WEIGHT(4), GVT_EDID_1920_1200, "4" }, { MB_TO_BYTES(256), MB_TO_BYTES(1024), 4, VGPU_WEIGHT(2), GVT_EDID_1920_1200, "2" }, @@ -106,63 +103,34 @@ static const struct { */ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) { - unsigned int num_types; - unsigned int i, low_avail, high_avail; - unsigned int min_low; - - /* vGPU type name is defined as GVTg_Vx_y which contains - * physical GPU generation type (e.g V4 as BDW server, V5 as - * SKL server). - * - * Depend on physical SKU resource, might see vGPU types like - * GVTg_V4_8, GVTg_V4_4, GVTg_V4_2, etc. We can create - * different types of vGPU on same physical GPU depending on - * available resource. Each vGPU type will have "avail_instance" - * to indicate how many vGPU instance can be created for this - * type. 
- * - */ - low_avail = gvt_aperture_sz(gvt) - HOST_LOW_GM_SIZE; - high_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE; - num_types = ARRAY_SIZE(vgpu_types); + unsigned int low_avail = gvt_aperture_sz(gvt) - HOST_LOW_GM_SIZE; + unsigned int high_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE; + unsigned int num_types = ARRAY_SIZE(intel_vgpu_configs); + unsigned int i; gvt->types = kcalloc(num_types, sizeof(struct intel_vgpu_type), GFP_KERNEL); if (!gvt->types) return -ENOMEM; - min_low = MB_TO_BYTES(32); for (i = 0; i < num_types; ++i) { - if (low_avail / vgpu_types[i].low_mm == 0) - break; - - gvt->types[i].low_gm_size = vgpu_types[i].low_mm; - gvt->types[i].high_gm_size = vgpu_types[i].high_mm; - gvt->types[i].fence = vgpu_types[i].fence; + const struct intel_vgpu_config *conf = &intel_vgpu_configs[i]; - if (vgpu_types[i].weight < 1 || - vgpu_types[i].weight > VGPU_MAX_WEIGHT) + if (low_avail / conf->low_mm == 0) + break; + if (conf->weight < 1 || conf->weight > VGPU_MAX_WEIGHT) goto out_free_types; - gvt->types[i].weight = vgpu_types[i].weight; - gvt->types[i].resolution = vgpu_types[i].edid; - gvt->types[i].avail_instance = min(low_avail / vgpu_types[i].low_mm, - high_avail / vgpu_types[i].high_mm); - - if (GRAPHICS_VER(gvt->gt->i915) == 8) - sprintf(gvt->types[i].name, "GVTg_V4_%s", - vgpu_types[i].name); - else if (GRAPHICS_VER(gvt->gt->i915) == 9) - sprintf(gvt->types[i].name, "GVTg_V5_%s", - vgpu_types[i].name); + sprintf(gvt->types[i].name, "GVTg_V%u_%s", + GRAPHICS_VER(gvt->gt->i915) == 8 ? 
4 : 5, conf->name); + gvt->types[i].conf = conf; + gvt->types[i].avail_instance = min(low_avail / conf->low_mm, + high_avail / conf->high_mm); gvt_dbg_core("type[%d]: %s avail %u low %u high %u fence %u weight %u res %s\n", - i, gvt->types[i].name, - gvt->types[i].avail_instance, - gvt->types[i].low_gm_size, - gvt->types[i].high_gm_size, gvt->types[i].fence, - gvt->types[i].weight, - vgpu_edid_str(gvt->types[i].resolution)); + i, gvt->types[i].name, gvt->types[i].avail_instance, + conf->low_mm, conf->high_mm, conf->fence, + conf->weight, vgpu_edid_str(conf->edid)); } gvt->num_types = i; @@ -195,16 +163,16 @@ static void intel_gvt_update_vgpu_types(struct intel_gvt *gvt) gvt->fence.vgpu_allocated_fence_num; for (i = 0; i < gvt->num_types; i++) { - low_gm_min = low_gm_avail / gvt->types[i].low_gm_size; - high_gm_min = high_gm_avail / gvt->types[i].high_gm_size; - fence_min = fence_avail / gvt->types[i].fence; + low_gm_min = low_gm_avail / gvt->types[i].conf->low_mm; + high_gm_min = high_gm_avail / gvt->types[i].conf->high_mm; + fence_min = fence_avail / gvt->types[i].conf->fence; gvt->types[i].avail_instance = min(min(low_gm_min, high_gm_min), fence_min); gvt_dbg_core("update type[%d]: %s avail %u low %u high %u fence %u\n", i, gvt->types[i].name, - gvt->types[i].avail_instance, gvt->types[i].low_gm_size, - gvt->types[i].high_gm_size, gvt->types[i].fence); + gvt->types[i].avail_instance, gvt->types[i].conf->low_mm, + gvt->types[i].conf->high_mm, gvt->types[i].conf->fence); } } @@ -365,37 +333,38 @@ void intel_gvt_destroy_idle_vgpu(struct intel_vgpu *vgpu) vfree(vgpu); } -static int __intel_gvt_create_vgpu(struct intel_vgpu *vgpu, - struct intel_vgpu_creation_params *param) +int intel_gvt_create_vgpu(struct intel_vgpu *vgpu, + const struct intel_vgpu_config *conf) { struct intel_gvt *gvt = vgpu->gvt; struct drm_i915_private *dev_priv = gvt->gt->i915; int ret; - gvt_dbg_core("low %llu MB high %llu MB fence %llu\n", - param->low_gm_sz, param->high_gm_sz, - 
param->fence_sz); + gvt_dbg_core("low %u MB high %u MB fence %u\n", + BYTES_TO_MB(conf->low_mm), BYTES_TO_MB(conf->high_mm), + conf->fence); + mutex_lock(&gvt->lock); ret = idr_alloc(&gvt->vgpu_idr, vgpu, IDLE_VGPU_IDR + 1, GVT_MAX_VGPU, GFP_KERNEL); if (ret < 0) - return ret; + goto out_unlock;; vgpu->id = ret; - vgpu->sched_ctl.weight = param->weight; + vgpu->sched_ctl.weight = conf->weight; mutex_init(&vgpu->vgpu_lock); mutex_init(&vgpu->dmabuf_lock); INIT_LIST_HEAD(&vgpu->dmabuf_obj_list_head); INIT_RADIX_TREE(&vgpu->page_track_tree, GFP_KERNEL); idr_init_base(&vgpu->object_idr, 1); - intel_vgpu_init_cfg_space(vgpu, param->primary); + intel_vgpu_init_cfg_space(vgpu, 1); vgpu->d3_entered = false; ret = intel_vgpu_init_mmio(vgpu); if (ret) goto out_clean_idr; - ret = intel_vgpu_alloc_resource(vgpu, param); + ret = intel_vgpu_alloc_resource(vgpu, conf); if (ret) goto out_clean_vgpu_mmio; @@ -409,7 +378,7 @@ static int __intel_gvt_create_vgpu(struct intel_vgpu *vgpu, if (ret) goto out_clean_gtt; - ret = intel_vgpu_init_display(vgpu, param->resolution); + ret = intel_vgpu_init_display(vgpu, conf->edid); if (ret) goto out_clean_opregion; @@ -434,6 +403,9 @@ static int __intel_gvt_create_vgpu(struct intel_vgpu *vgpu, if (ret) goto out_clean_sched_policy; + intel_gvt_update_vgpu_types(gvt); + intel_gvt_update_reg_whitelist(vgpu); + mutex_unlock(&gvt->lock); return 0; out_clean_sched_policy: @@ -452,45 +424,8 @@ out_clean_vgpu_mmio: intel_vgpu_clean_mmio(vgpu); out_clean_idr: idr_remove(&gvt->vgpu_idr, vgpu->id); - return ret; -} - -/** - * intel_gvt_create_vgpu - create a virtual GPU - * @gvt: GVT device - * @type: type of the vGPU to create - * - * This function is called when user wants to create a virtual GPU. - * - * Returns: - * pointer to intel_vgpu, error pointer if failed. 
- */ -int intel_gvt_create_vgpu(struct intel_vgpu *vgpu, struct intel_vgpu_type *type) -{ - struct intel_gvt *gvt = vgpu->gvt; - struct intel_vgpu_creation_params param; - int ret; - - param.primary = 1; - param.low_gm_sz = type->low_gm_size; - param.high_gm_sz = type->high_gm_size; - param.fence_sz = type->fence; - param.weight = type->weight; - param.resolution = type->resolution; - - /* XXX current param based on MB */ - param.low_gm_sz = BYTES_TO_MB(param.low_gm_sz); - param.high_gm_sz = BYTES_TO_MB(param.high_gm_sz); - - mutex_lock(&gvt->lock); - ret = __intel_gvt_create_vgpu(vgpu, &param); - if (!ret) { - /* calculate left instance change for types */ - intel_gvt_update_vgpu_types(gvt); - intel_gvt_update_reg_whitelist(vgpu); - } +out_unlock: mutex_unlock(&gvt->lock); - return ret; } -- GitLab From bdef2b7896df293736330eb6eb0f43947049b828 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 23 Sep 2022 11:26:41 +0200 Subject: [PATCH 1294/2223] vfio/mdev: make mdev.h standalone includable Include <linux/device.h> and <linux/uuid.h> so that users of this headers don't need to do that and remove those includes that aren't needed any more.
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Eric Farman <farman@linux.ibm.com> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Tony Krowiak <akrowiak@linux.ibm.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com> Link: https://lore.kernel.org/r/20220923092652.100656-4-hch@lst.de Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- drivers/gpu/drm/i915/gvt/kvmgt.c | 2 -- drivers/s390/cio/vfio_ccw_drv.c | 1 - drivers/s390/crypto/vfio_ap_private.h | 1 - drivers/vfio/mdev/mdev_core.c | 2 -- drivers/vfio/mdev/mdev_driver.c | 1 - drivers/vfio/mdev/mdev_sysfs.c | 2 -- include/linux/mdev.h | 3 +++ samples/vfio-mdev/mbochs.c | 1 - samples/vfio-mdev/mdpy.c | 1 - samples/vfio-mdev/mtty.c | 2 -- 10 files changed, 3 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 7f3596394645e..ee314402fb611 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -34,7 +34,6 @@ */ #include <linux/init.h> -#include <linux/device.h> #include <linux/mm.h> #include <linux/kthread.h> #include <linux/sched/mm.h> @@ -43,7 +42,6 @@ #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/eventfd.h> -#include <linux/uuid.h> #include <linux/mdev.h> #include <linux/debugfs.h> diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 86d9e428357b0..e9985c63dc6bf 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -12,7 +12,6 @@ #include <linux/module.h> #include <linux/init.h> -#include <linux/device.h> #include <linux/slab.h> #include <linux/mdev.h> diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index d782cf463eaba..163eeaaf24cee 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -13,7 +13,6 @@ #define _VFIO_AP_PRIVATE_H_ #include <linux/types.h> -#include 
<linux/device.h> #include <linux/mdev.h> #include <linux/delay.h> #include <linux/mutex.h> diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index b8b9e7911e559..2c32923fbad27 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -8,9 +8,7 @@ */ #include <linux/module.h> -#include <linux/device.h> #include <linux/slab.h> -#include <linux/uuid.h> #include <linux/sysfs.h> #include <linux/mdev.h> diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c index 9c2af59809e2e..7bd4bb9850e81 100644 --- a/drivers/vfio/mdev/mdev_driver.c +++ b/drivers/vfio/mdev/mdev_driver.c @@ -7,7 +7,6 @@ * Kirti Wankhede <kwankhede@nvidia.com> */ -#include <linux/device.h> #include <linux/iommu.h> #include <linux/mdev.h> diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 0ccfeb3dda245..4bfbf49aaa66a 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -9,9 +9,7 @@ #include <linux/sysfs.h> #include <linux/ctype.h> -#include <linux/device.h> #include <linux/slab.h> -#include <linux/uuid.h> #include <linux/mdev.h> #include "mdev_private.h" diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 47ad3b104d9e7..a5d8ae6132a20 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -10,6 +10,9 @@ #ifndef MDEV_H #define MDEV_H +#include <linux/device.h> +#include <linux/uuid.h> + struct mdev_type; struct mdev_device { diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 6901947e27d2d..985b6e7136219 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -21,7 +21,6 @@ */ #include <linux/init.h> #include <linux/module.h> -#include <linux/device.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/vmalloc.h> diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index bb2af1ec0f7c6..1daab012b5d89 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -17,7 +17,6 @@ 
*/ #include <linux/init.h> #include <linux/module.h> -#include <linux/device.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/vmalloc.h> diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index d151928e4f21d..86843ce3d9a27 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -12,7 +12,6 @@ #include <linux/init.h> #include <linux/module.h> -#include <linux/device.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/poll.h> @@ -20,7 +19,6 @@ #include <linux/cdev.h> #include <linux/sched.h> #include <linux/wait.h> -#include <linux/uuid.h> #include <linux/vfio.h> #include <linux/iommu.h> #include <linux/sysfs.h> -- GitLab From 89345d5177aa0f6d678251e1e0870b0eeb1ab510 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 23 Sep 2022 11:26:42 +0200 Subject: [PATCH 1295/2223] vfio/mdev: embedd struct mdev_parent in the parent data structure Simplify mdev_{un}register_device by requiring the caller to pass in a structure allocated as part of the parent device structure. This removes the need for a list of parents and the separate mdev_parent refcount as we can simply rely on the reference to the parent device.
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Tony Krowiak <akrowiak@linux.ibm.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com> Reviewed-by: Eric Farman <farman@linux.ibm.com> Link: https://lore.kernel.org/r/20220923092652.100656-5-hch@lst.de Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- .../driver-api/vfio-mediated-device.rst | 12 +- Documentation/s390/vfio-ap.rst | 2 +- Documentation/s390/vfio-ccw.rst | 2 +- drivers/gpu/drm/i915/gvt/gvt.h | 2 + drivers/gpu/drm/i915/gvt/kvmgt.c | 5 +- drivers/s390/cio/vfio_ccw_drv.c | 5 +- drivers/s390/cio/vfio_ccw_ops.c | 1 - drivers/s390/cio/vfio_ccw_private.h | 4 + drivers/s390/crypto/vfio_ap_ops.c | 5 +- drivers/s390/crypto/vfio_ap_private.h | 1 + drivers/vfio/mdev/mdev_core.c | 120 ++++-------------- drivers/vfio/mdev/mdev_private.h | 23 ---- drivers/vfio/mdev/mdev_sysfs.c | 4 +- include/linux/mdev.h | 15 ++- samples/vfio-mdev/mbochs.c | 5 +- samples/vfio-mdev/mdpy.c | 5 +- samples/vfio-mdev/mtty.c | 6 +- 17 files changed, 71 insertions(+), 146 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index f47dca6645aae..cd1667608ab5d 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -58,19 +58,19 @@ devices as examples, as these devices are the first devices to use this module:: | MDEV CORE | | MODULE | | mdev.ko | - | +-----------+ | mdev_register_device() +--------------+ + | +-----------+ | mdev_register_parent() +--------------+ | | | +<------------------------+ | | | | | | nvidia.ko |<-> physical | | | +------------------------>+ | device | | | | callbacks +--------------+ | | Physical | | - | | device | | mdev_register_device() +--------------+ + | | device | | mdev_register_parent() +--------------+ | | interface | |<------------------------+ | | | 
| | | i915.ko |<-> physical | | | +------------------------>+ | device | | | | callbacks +--------------+ | | | | - | | | | mdev_register_device() +--------------+ + | | | | mdev_register_parent() +--------------+ | | | +<------------------------+ | | | | | | ccw_device.ko|<-> physical | | | +------------------------>+ | device @@ -125,8 +125,8 @@ vfio_device_ops. When a driver wants to add the GUID creation sysfs to an existing device it has probe'd to then it should call:: - int mdev_register_device(struct device *dev, - struct mdev_driver *mdev_driver); + int mdev_register_parent(struct mdev_parent *parent, struct device *dev, + struct mdev_driver *mdev_driver); This will provide the 'mdev_supported_types/XX/create' files which can then be used to trigger the creation of a mdev_device. The created mdev_device will be @@ -134,7 +134,7 @@ attached to the specified driver. When the driver needs to remove itself it calls:: - void mdev_unregister_device(struct device *dev); + void mdev_unregister_parent(struct mdev_parent *parent); Which will unbind and destroy all the created mdevs and remove the sysfs files. 
diff --git a/Documentation/s390/vfio-ap.rst b/Documentation/s390/vfio-ap.rst index 61a0a3c6c7b4b..00f4a04f6d4c6 100644 --- a/Documentation/s390/vfio-ap.rst +++ b/Documentation/s390/vfio-ap.rst @@ -297,7 +297,7 @@ of the VFIO AP mediated device driver:: | MDEV CORE | | MODULE | | mdev.ko | - | +---------+ | mdev_register_device() +--------------+ + | +---------+ | mdev_register_parent() +--------------+ | |Physical | +<-----------------------+ | | | device | | | vfio_ap.ko |<-> matrix | |interface| +----------------------->+ | device diff --git a/Documentation/s390/vfio-ccw.rst b/Documentation/s390/vfio-ccw.rst index 8aad08a8b8a50..ea928a3806f43 100644 --- a/Documentation/s390/vfio-ccw.rst +++ b/Documentation/s390/vfio-ccw.rst @@ -156,7 +156,7 @@ Below is a high Level block diagram:: | MDEV CORE | | MODULE | | mdev.ko | - | +---------+ | mdev_register_device() +--------------+ + | +---------+ | mdev_register_parent() +--------------+ | |Physical | +<-----------------------+ | | | device | | | vfio_ccw.ko |<-> subchannel | |interface| +----------------------->+ | device diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 563ffc2fdfb7a..fa4a56b50c828 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -36,6 +36,7 @@ #include <uapi/linux/pci_regs.h> #include <linux/kvm_host.h> #include <linux/vfio.h> +#include <linux/mdev.h> #include "i915_drv.h" #include "intel_gvt.h" @@ -337,6 +338,7 @@ struct intel_gvt { struct intel_gvt_workload_scheduler scheduler; struct notifier_block shadow_ctx_notifier_block[I915_NUM_ENGINES]; DECLARE_HASHTABLE(cmd_table, GVT_CMD_HASH_BITS); + struct mdev_parent parent; struct intel_vgpu_type *types; unsigned int num_types; struct intel_vgpu *idle_vgpu; diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index ee314402fb611..d7afe3f5f75b3 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1923,7 +1923,7 @@ static 
void intel_gvt_clean_device(struct drm_i915_private *i915) if (drm_WARN_ON(&i915->drm, !gvt)) return; - mdev_unregister_device(i915->drm.dev); + mdev_unregister_parent(&gvt->parent); intel_gvt_cleanup_vgpu_type_groups(gvt); intel_gvt_destroy_idle_vgpu(gvt->idle_vgpu); intel_gvt_clean_vgpu_types(gvt); @@ -2028,7 +2028,8 @@ static int intel_gvt_init_device(struct drm_i915_private *i915) if (ret) goto out_destroy_idle_vgpu; - ret = mdev_register_device(i915->drm.dev, &intel_vgpu_mdev_driver); + ret = mdev_register_parent(&gvt->parent, i915->drm.dev, + &intel_vgpu_mdev_driver); if (ret) goto out_cleanup_vgpu_type_groups; diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index e9985c63dc6bf..7d105915bd149 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -221,7 +221,8 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) dev_set_drvdata(&sch->dev, private); - ret = mdev_register_device(&sch->dev, &vfio_ccw_mdev_driver); + ret = mdev_register_parent(&private->parent, &sch->dev, + &vfio_ccw_mdev_driver); if (ret) goto out_free; @@ -240,7 +241,7 @@ static void vfio_ccw_sch_remove(struct subchannel *sch) { struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); - mdev_unregister_device(&sch->dev); + mdev_unregister_parent(&private->parent); dev_set_drvdata(&sch->dev, NULL); diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 9f8486c0d3d37..9a0e0c5ffb1a5 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -11,7 +11,6 @@ */ #include <linux/vfio.h> -#include <linux/mdev.h> #include <linux/nospec.h> #include <linux/slab.h> diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 63d9202b29c7f..1a4bfb1b5a808 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -18,6 +18,7 @@ #include <linux/workqueue.h> #include <linux/vfio_ccw.h> #include <linux/vfio.h> +#include 
<linux/mdev.h> #include <asm/crw.h> #include <asm/debug.h> @@ -89,6 +90,7 @@ struct vfio_ccw_crw { * @io_work: work for deferral process of I/O handling * @crw_work: work for deferral process of CRW handling * @release_comp: synchronization helper for vfio device release + * @parent: parent data structures for mdevs created */ struct vfio_ccw_private { struct vfio_device vdev; @@ -116,6 +118,8 @@ struct vfio_ccw_private { struct work_struct crw_work; struct completion release_comp; + + struct mdev_parent parent; } __aligned(8); int vfio_ccw_sch_quiesce(struct subchannel *sch); diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 161597357a642..724d09a74a8f1 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1830,7 +1830,8 @@ int vfio_ap_mdev_register(void) if (ret) return ret; - ret = mdev_register_device(&matrix_dev->device, &vfio_ap_matrix_driver); + ret = mdev_register_parent(&matrix_dev->parent, &matrix_dev->device, + &vfio_ap_matrix_driver); if (ret) goto err_driver; return 0; @@ -1842,7 +1843,7 @@ err_driver: void vfio_ap_mdev_unregister(void) { - mdev_unregister_device(&matrix_dev->device); + mdev_unregister_parent(&matrix_dev->parent); mdev_unregister_driver(&vfio_ap_matrix_driver); } diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index 163eeaaf24cee..35165730f5174 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -52,6 +52,7 @@ struct ap_matrix_dev { struct mutex mdevs_lock; /* serializes access to each ap_matrix_mdev */ struct ap_driver *vfio_ap_drv; struct mutex guests_lock; /* serializes access to each KVM guest */ + struct mdev_parent parent; }; extern struct ap_matrix_dev *matrix_dev; diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 2c32923fbad27..fa05ac3396950 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -18,8 +18,6 
@@ #define DRIVER_AUTHOR "NVIDIA Corporation" #define DRIVER_DESC "Mediated device Core Driver" -static LIST_HEAD(parent_list); -static DEFINE_MUTEX(parent_list_lock); static struct class_compat *mdev_bus_compat_class; static LIST_HEAD(mdev_list); @@ -61,28 +59,6 @@ struct device *mtype_get_parent_dev(struct mdev_type *mtype) } EXPORT_SYMBOL(mtype_get_parent_dev); -/* Should be called holding parent_list_lock */ -static struct mdev_parent *__find_parent_device(struct device *dev) -{ - struct mdev_parent *parent; - - list_for_each_entry(parent, &parent_list, next) { - if (parent->dev == dev) - return parent; - } - return NULL; -} - -void mdev_release_parent(struct kref *kref) -{ - struct mdev_parent *parent = container_of(kref, struct mdev_parent, - ref); - struct device *dev = parent->dev; - - kfree(parent); - put_device(dev); -} - /* Caller must hold parent unreg_sem read or write lock */ static void mdev_device_remove_common(struct mdev_device *mdev) { @@ -105,125 +81,73 @@ static int mdev_device_remove_cb(struct device *dev, void *data) } /* - * mdev_register_device : Register a device + * mdev_register_parent: Register a device as parent for mdevs + * @parent: parent structure registered * @dev: device structure representing parent device. * @mdev_driver: Device driver to bind to the newly created mdev * - * Add device to list of registered parent devices. + * Registers the @parent stucture as a parent for mdev types and thus mdev + * devices. The caller needs to hold a reference on @dev that must not be + * released until after the call to mdev_unregister_parent(). + * * Returns a negative value on error, otherwise 0. 
*/ -int mdev_register_device(struct device *dev, struct mdev_driver *mdev_driver) +int mdev_register_parent(struct mdev_parent *parent, struct device *dev, + struct mdev_driver *mdev_driver) { - int ret; - struct mdev_parent *parent; char *env_string = "MDEV_STATE=registered"; char *envp[] = { env_string, NULL }; + int ret; /* check for mandatory ops */ if (!mdev_driver->supported_type_groups) return -EINVAL; - dev = get_device(dev); - if (!dev) - return -EINVAL; - - mutex_lock(&parent_list_lock); - - /* Check for duplicate */ - parent = __find_parent_device(dev); - if (parent) { - parent = NULL; - ret = -EEXIST; - goto add_dev_err; - } - - parent = kzalloc(sizeof(*parent), GFP_KERNEL); - if (!parent) { - ret = -ENOMEM; - goto add_dev_err; - } - - kref_init(&parent->ref); + memset(parent, 0, sizeof(*parent)); init_rwsem(&parent->unreg_sem); - parent->dev = dev; parent->mdev_driver = mdev_driver; if (!mdev_bus_compat_class) { mdev_bus_compat_class = class_compat_register("mdev_bus"); - if (!mdev_bus_compat_class) { - ret = -ENOMEM; - goto add_dev_err; - } + if (!mdev_bus_compat_class) + return -ENOMEM; } ret = parent_create_sysfs_files(parent); if (ret) - goto add_dev_err; + return ret; ret = class_compat_create_link(mdev_bus_compat_class, dev, NULL); if (ret) dev_warn(dev, "Failed to create compatibility class link\n"); - list_add(&parent->next, &parent_list); - mutex_unlock(&parent_list_lock); - dev_info(dev, "MDEV: Registered\n"); kobject_uevent_env(&dev->kobj, KOBJ_CHANGE, envp); - return 0; - -add_dev_err: - mutex_unlock(&parent_list_lock); - if (parent) - mdev_put_parent(parent); - else - put_device(dev); - return ret; } -EXPORT_SYMBOL(mdev_register_device); +EXPORT_SYMBOL(mdev_register_parent); /* - * mdev_unregister_device : Unregister a parent device - * @dev: device structure representing parent device. - * - * Remove device from list of registered parent devices. Give a chance to free - * existing mediated devices for given device. 
+ * mdev_unregister_parent : Unregister a parent device + * @parent: parent structure to unregister */ - -void mdev_unregister_device(struct device *dev) +void mdev_unregister_parent(struct mdev_parent *parent) { - struct mdev_parent *parent; char *env_string = "MDEV_STATE=unregistered"; char *envp[] = { env_string, NULL }; - mutex_lock(&parent_list_lock); - parent = __find_parent_device(dev); - - if (!parent) { - mutex_unlock(&parent_list_lock); - return; - } - dev_info(dev, "MDEV: Unregistering\n"); - - list_del(&parent->next); - mutex_unlock(&parent_list_lock); + dev_info(parent->dev, "MDEV: Unregistering\n"); down_write(&parent->unreg_sem); - - class_compat_remove_link(mdev_bus_compat_class, dev, NULL); - - device_for_each_child(dev, NULL, mdev_device_remove_cb); - + class_compat_remove_link(mdev_bus_compat_class, parent->dev, NULL); + device_for_each_child(parent->dev, NULL, mdev_device_remove_cb); parent_remove_sysfs_files(parent); up_write(&parent->unreg_sem); - mdev_put_parent(parent); - - /* We still have the caller's reference to use for the uevent */ - kobject_uevent_env(&dev->kobj, KOBJ_CHANGE, envp); + kobject_uevent_env(&parent->dev->kobj, KOBJ_CHANGE, envp); } -EXPORT_SYMBOL(mdev_unregister_device); +EXPORT_SYMBOL(mdev_unregister_parent); static void mdev_device_release(struct device *dev) { diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index 7c9fc79f3d838..297f911fdc890 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -13,17 +13,6 @@ int mdev_bus_register(void); void mdev_bus_unregister(void); -struct mdev_parent { - struct device *dev; - struct mdev_driver *mdev_driver; - struct kref ref; - struct list_head next; - struct kset *mdev_types_kset; - struct list_head type_list; - /* Synchronize device creation/removal with parent unregistration */ - struct rw_semaphore unreg_sem; -}; - struct mdev_type { struct kobject kobj; struct kobject *devices_kobj; @@ -48,16 +37,4 @@ void 
mdev_remove_sysfs_files(struct mdev_device *mdev); int mdev_device_create(struct mdev_type *kobj, const guid_t *uuid); int mdev_device_remove(struct mdev_device *dev); -void mdev_release_parent(struct kref *kref); - -static inline void mdev_get_parent(struct mdev_parent *parent) -{ - kref_get(&parent->ref); -} - -static inline void mdev_put_parent(struct mdev_parent *parent) -{ - kref_put(&parent->ref, mdev_release_parent); -} - #endif /* MDEV_PRIVATE_H */ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 4bfbf49aaa66a..b71ffc5594870 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -81,7 +81,7 @@ static void mdev_type_release(struct kobject *kobj) pr_debug("Releasing group %s\n", kobj->name); /* Pairs with the get in add_mdev_supported_type() */ - mdev_put_parent(type->parent); + put_device(type->parent->dev); kfree(type); } @@ -110,7 +110,7 @@ static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, type->kobj.kset = parent->mdev_types_kset; type->parent = parent; /* Pairs with the put in mdev_type_release() */ - mdev_get_parent(parent); + get_device(parent->dev); type->type_group_id = type_group_id; ret = kobject_init_and_add(&type->kobj, &mdev_type_ktype, NULL, diff --git a/include/linux/mdev.h b/include/linux/mdev.h index a5d8ae6132a20..262512c2a8ffc 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -23,6 +23,16 @@ struct mdev_device { bool active; }; +/* embedded into the struct device that the mdev devices hang off */ +struct mdev_parent { + struct device *dev; + struct mdev_driver *mdev_driver; + struct kset *mdev_types_kset; + struct list_head type_list; + /* Synchronize device creation/removal with parent unregistration */ + struct rw_semaphore unreg_sem; +}; + static inline struct mdev_device *to_mdev_device(struct device *dev) { return container_of(dev, struct mdev_device, dev); @@ -70,8 +80,9 @@ struct mdev_driver { extern struct bus_type mdev_bus_type; 
-int mdev_register_device(struct device *dev, struct mdev_driver *mdev_driver); -void mdev_unregister_device(struct device *dev); +int mdev_register_parent(struct mdev_parent *parent, struct device *dev, + struct mdev_driver *mdev_driver); +void mdev_unregister_parent(struct mdev_parent *parent); int mdev_register_driver(struct mdev_driver *drv); void mdev_unregister_driver(struct mdev_driver *drv); diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 985b6e7136219..2c4791abbc3d3 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -128,6 +128,7 @@ static dev_t mbochs_devt; static struct class *mbochs_class; static struct cdev mbochs_cdev; static struct device mbochs_dev; +static struct mdev_parent mbochs_parent; static atomic_t mbochs_avail_mbytes; static const struct vfio_device_ops mbochs_dev_ops; @@ -1475,7 +1476,7 @@ static int __init mbochs_dev_init(void) if (ret) goto err_class; - ret = mdev_register_device(&mbochs_dev, &mbochs_driver); + ret = mdev_register_parent(&mbochs_parent, &mbochs_dev, &mbochs_driver); if (ret) goto err_device; @@ -1496,7 +1497,7 @@ err_cdev: static void __exit mbochs_dev_exit(void) { mbochs_dev.bus = NULL; - mdev_unregister_device(&mbochs_dev); + mdev_unregister_parent(&mbochs_parent); device_unregister(&mbochs_dev); mdev_unregister_driver(&mbochs_driver); diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 1daab012b5d89..01f345430b975 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -83,6 +83,7 @@ static dev_t mdpy_devt; static struct class *mdpy_class; static struct cdev mdpy_cdev; static struct device mdpy_dev; +static struct mdev_parent mdpy_parent; static u32 mdpy_count; static const struct vfio_device_ops mdpy_dev_ops; @@ -778,7 +779,7 @@ static int __init mdpy_dev_init(void) if (ret) goto err_class; - ret = mdev_register_device(&mdpy_dev, &mdpy_driver); + ret = mdev_register_parent(&mdpy_parent, &mdpy_dev, &mdpy_driver); if (ret) goto 
err_device; @@ -799,7 +800,7 @@ err_cdev: static void __exit mdpy_dev_exit(void) { mdpy_dev.bus = NULL; - mdev_unregister_device(&mdpy_dev); + mdev_unregister_parent(&mdpy_parent); device_unregister(&mdpy_dev); mdev_unregister_driver(&mdpy_driver); diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 86843ce3d9a27..e80baac513811 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -72,6 +72,7 @@ static struct mtty_dev { struct cdev vd_cdev; struct idr vd_idr; struct device dev; + struct mdev_parent parent; } mtty_dev; struct mdev_region_info { @@ -1361,7 +1362,8 @@ static int __init mtty_dev_init(void) if (ret) goto err_class; - ret = mdev_register_device(&mtty_dev.dev, &mtty_driver); + ret = mdev_register_parent(&mtty_dev.parent, &mtty_dev.dev, + &mtty_driver); if (ret) goto err_device; return 0; @@ -1381,7 +1383,7 @@ err_cdev: static void __exit mtty_dev_exit(void) { mtty_dev.dev.bus = NULL; - mdev_unregister_device(&mtty_dev.dev); + mdev_unregister_parent(&mtty_dev.parent); device_unregister(&mtty_dev.dev); idr_destroy(&mtty_dev.vd_idr); -- GitLab From da44c340c4fe9d9653ae84fa6a60f406bafcffce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 23 Sep 2022 11:26:43 +0200 Subject: [PATCH 1296/2223] vfio/mdev: simplify mdev_type handling Instead of abusing struct attribute_group to control initialization of struct mdev_type, just define the actual attributes in the mdev_driver, allocate the mdev_type structures in the caller and pass them to mdev_register_parent. This allows the caller to use container_of to get at the containing structure and thus significantly simplify the code. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Tony Krowiak <akrowiak@linux.ibm.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com> Reviewed-by: Eric Farman <farman@linux.ibm.com> Link: https://lore.kernel.org/r/20220923092652.100656-6-hch@lst.de Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- .../driver-api/vfio-mediated-device.rst | 2 +- drivers/gpu/drm/i915/gvt/gvt.h | 3 +- drivers/gpu/drm/i915/gvt/kvmgt.c | 102 +++--------------- drivers/gpu/drm/i915/gvt/vgpu.c | 13 ++- drivers/s390/cio/vfio_ccw_drv.c | 6 +- drivers/s390/cio/vfio_ccw_ops.c | 14 +-- drivers/s390/cio/vfio_ccw_private.h | 2 + drivers/s390/crypto/vfio_ap_ops.c | 19 ++-- drivers/s390/crypto/vfio_ap_private.h | 2 + drivers/vfio/mdev/mdev_core.c | 31 ++---- drivers/vfio/mdev/mdev_driver.c | 5 +- drivers/vfio/mdev/mdev_private.h | 8 -- drivers/vfio/mdev/mdev_sysfs.c | 91 ++++------------ include/linux/mdev.h | 26 +++-- samples/vfio-mdev/mbochs.c | 57 ++++------ samples/vfio-mdev/mdpy.c | 50 ++++----- samples/vfio-mdev/mtty.c | 60 +++++------ 17 files changed, 165 insertions(+), 326 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index cd1667608ab5d..ff7342d2e332d 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -103,7 +103,7 @@ structure to represent a mediated device's driver:: struct mdev_driver { int (*probe) (struct mdev_device *dev); void (*remove) (struct mdev_device *dev); - struct attribute_group **supported_type_groups; + const struct attribute * const *types_attrs; struct device_driver driver; }; diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index fa4a56b50c828..db182066d56c9 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -310,8 +310,8 @@ struct 
intel_vgpu_config { const char *name; }; -#define NR_MAX_INTEL_VGPU_TYPES 20 struct intel_vgpu_type { + struct mdev_type type; char name[16]; const struct intel_vgpu_config *conf; unsigned int avail_instance; @@ -339,6 +339,7 @@ struct intel_gvt { struct notifier_block shadow_ctx_notifier_block[I915_NUM_ENGINES]; DECLARE_HASHTABLE(cmd_table, GVT_CMD_HASH_BITS); struct mdev_parent parent; + struct mdev_type **mdev_types; struct intel_vgpu_type *types; unsigned int num_types; struct intel_vgpu *idle_vgpu; diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index d7afe3f5f75b3..12b0b33061685 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -117,17 +117,10 @@ static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - struct intel_vgpu_type *type; - unsigned int num = 0; - struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt; + struct intel_vgpu_type *type = + container_of(mtype, struct intel_vgpu_type, type); - type = &gvt->types[mtype_get_type_group_id(mtype)]; - if (!type) - num = 0; - else - num = type->avail_instance; - - return sprintf(buf, "%u\n", num); + return sprintf(buf, "%u\n", type->avail_instance); } static ssize_t device_api_show(struct mdev_type *mtype, @@ -139,12 +132,8 @@ static ssize_t device_api_show(struct mdev_type *mtype, static ssize_t description_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - struct intel_vgpu_type *type; - struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt; - - type = &gvt->types[mtype_get_type_group_id(mtype)]; - if (!type) - return 0; + struct intel_vgpu_type *type = + container_of(mtype, struct intel_vgpu_type, type); return sprintf(buf, "low_gm_size: %dMB\nhigh_gm_size: %dMB\n" "fence: %d\nresolution: %s\n" @@ -158,14 +147,7 @@ static ssize_t description_show(struct mdev_type *mtype, static ssize_t name_show(struct mdev_type *mtype, 
struct mdev_type_attribute *attr, char *buf) { - struct intel_vgpu_type *type; - struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt; - - type = &gvt->types[mtype_get_type_group_id(mtype)]; - if (!type) - return 0; - - return sprintf(buf, "%s\n", type->name); + return sprintf(buf, "%s\n", mtype->sysfs_name); } static MDEV_TYPE_ATTR_RO(available_instances); @@ -173,7 +155,7 @@ static MDEV_TYPE_ATTR_RO(device_api); static MDEV_TYPE_ATTR_RO(description); static MDEV_TYPE_ATTR_RO(name); -static struct attribute *gvt_type_attrs[] = { +static const struct attribute *gvt_type_attrs[] = { &mdev_type_attr_available_instances.attr, &mdev_type_attr_device_api.attr, &mdev_type_attr_description.attr, @@ -181,51 +163,6 @@ static struct attribute *gvt_type_attrs[] = { NULL, }; -static struct attribute_group *gvt_vgpu_type_groups[] = { - [0 ... NR_MAX_INTEL_VGPU_TYPES - 1] = NULL, -}; - -static int intel_gvt_init_vgpu_type_groups(struct intel_gvt *gvt) -{ - int i, j; - struct intel_vgpu_type *type; - struct attribute_group *group; - - for (i = 0; i < gvt->num_types; i++) { - type = &gvt->types[i]; - - group = kzalloc(sizeof(struct attribute_group), GFP_KERNEL); - if (!group) - goto unwind; - - group->name = type->name; - group->attrs = gvt_type_attrs; - gvt_vgpu_type_groups[i] = group; - } - - return 0; - -unwind: - for (j = 0; j < i; j++) { - group = gvt_vgpu_type_groups[j]; - kfree(group); - } - - return -ENOMEM; -} - -static void intel_gvt_cleanup_vgpu_type_groups(struct intel_gvt *gvt) -{ - int i; - struct attribute_group *group; - - for (i = 0; i < gvt->num_types; i++) { - group = gvt_vgpu_type_groups[i]; - gvt_vgpu_type_groups[i] = NULL; - kfree(group); - } -} - static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn, unsigned long size) { @@ -1547,16 +1484,11 @@ static const struct attribute_group *intel_vgpu_groups[] = { static int intel_vgpu_init_dev(struct vfio_device *vfio_dev) { struct mdev_device *mdev = 
to_mdev_device(vfio_dev->dev); - struct device *pdev = mdev_parent_dev(mdev); - struct intel_gvt *gvt = kdev_to_i915(pdev)->gvt; - struct intel_vgpu_type *type; struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); + struct intel_vgpu_type *type = + container_of(mdev->type, struct intel_vgpu_type, type); - type = &gvt->types[mdev_get_type_group_id(mdev)]; - if (!type) - return -EINVAL; - - vgpu->gvt = gvt; + vgpu->gvt = kdev_to_i915(mdev_parent_dev(mdev))->gvt; return intel_gvt_create_vgpu(vgpu, type->conf); } @@ -1625,7 +1557,7 @@ static struct mdev_driver intel_vgpu_mdev_driver = { }, .probe = intel_vgpu_probe, .remove = intel_vgpu_remove, - .supported_type_groups = gvt_vgpu_type_groups, + .types_attrs = gvt_type_attrs, }; int intel_gvt_page_track_add(struct intel_vgpu *info, u64 gfn) @@ -1924,7 +1856,6 @@ static void intel_gvt_clean_device(struct drm_i915_private *i915) return; mdev_unregister_parent(&gvt->parent); - intel_gvt_cleanup_vgpu_type_groups(gvt); intel_gvt_destroy_idle_vgpu(gvt->idle_vgpu); intel_gvt_clean_vgpu_types(gvt); @@ -2024,20 +1955,15 @@ static int intel_gvt_init_device(struct drm_i915_private *i915) intel_gvt_debugfs_init(gvt); - ret = intel_gvt_init_vgpu_type_groups(gvt); - if (ret) - goto out_destroy_idle_vgpu; - ret = mdev_register_parent(&gvt->parent, i915->drm.dev, - &intel_vgpu_mdev_driver); + &intel_vgpu_mdev_driver, + gvt->mdev_types, gvt->num_types); if (ret) - goto out_cleanup_vgpu_type_groups; + goto out_destroy_idle_vgpu; gvt_dbg_core("gvt device initialization is done\n"); return 0; -out_cleanup_vgpu_type_groups: - intel_gvt_cleanup_vgpu_type_groups(gvt); out_destroy_idle_vgpu: intel_gvt_destroy_idle_vgpu(gvt->idle_vgpu); intel_gvt_debugfs_clean(gvt); diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index b0d5dafd013f4..92aaa77feceee 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -113,13 +113,18 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) if 
(!gvt->types) return -ENOMEM; + gvt->mdev_types = kcalloc(num_types, sizeof(*gvt->mdev_types), + GFP_KERNEL); + if (!gvt->mdev_types) + goto out_free_types; + for (i = 0; i < num_types; ++i) { const struct intel_vgpu_config *conf = &intel_vgpu_configs[i]; if (low_avail / conf->low_mm == 0) break; if (conf->weight < 1 || conf->weight > VGPU_MAX_WEIGHT) - goto out_free_types; + goto out_free_mdev_types; sprintf(gvt->types[i].name, "GVTg_V%u_%s", GRAPHICS_VER(gvt->gt->i915) == 8 ? 4 : 5, conf->name); @@ -131,11 +136,16 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) i, gvt->types[i].name, gvt->types[i].avail_instance, conf->low_mm, conf->high_mm, conf->fence, conf->weight, vgpu_edid_str(conf->edid)); + + gvt->mdev_types[i] = &gvt->types[i].type; + gvt->mdev_types[i]->sysfs_name = gvt->types[i].name; } gvt->num_types = i; return 0; +out_free_mdev_types: + kfree(gvt->mdev_types); out_free_types: kfree(gvt->types); return -EINVAL; @@ -143,6 +153,7 @@ out_free_types: void intel_gvt_clean_vgpu_types(struct intel_gvt *gvt) { + kfree(gvt->mdev_types); kfree(gvt->types); } diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 7d105915bd149..25a5de08b3902 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -202,7 +202,6 @@ static void vfio_ccw_free_private(struct vfio_ccw_private *private) mutex_destroy(&private->io_mutex); kfree(private); } - static int vfio_ccw_sch_probe(struct subchannel *sch) { struct pmcw *pmcw = &sch->schib.pmcw; @@ -221,8 +220,11 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) dev_set_drvdata(&sch->dev, private); + private->mdev_type.sysfs_name = "io"; + private->mdev_types[0] = &private->mdev_type; ret = mdev_register_parent(&private->parent, &sch->dev, - &vfio_ccw_mdev_driver); + &vfio_ccw_mdev_driver, + private->mdev_types, 1); if (ret) goto out_free; diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 9a0e0c5ffb1a5..c37e712a4b069 100644 
--- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -69,23 +69,13 @@ static ssize_t available_instances_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(available_instances); -static struct attribute *mdev_types_attrs[] = { +static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, &mdev_type_attr_device_api.attr, &mdev_type_attr_available_instances.attr, NULL, }; -static struct attribute_group mdev_type_group = { - .name = "io", - .attrs = mdev_types_attrs, -}; - -static struct attribute_group *mdev_type_groups[] = { - &mdev_type_group, - NULL, -}; - static int vfio_ccw_mdev_init_dev(struct vfio_device *vdev) { struct vfio_ccw_private *private = @@ -646,5 +636,5 @@ struct mdev_driver vfio_ccw_mdev_driver = { }, .probe = vfio_ccw_mdev_probe, .remove = vfio_ccw_mdev_remove, - .supported_type_groups = mdev_type_groups, + .types_attrs = mdev_types_attrs, }; diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 1a4bfb1b5a808..52caa721ec06c 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -120,6 +120,8 @@ struct vfio_ccw_private { struct completion release_comp; struct mdev_parent parent; + struct mdev_type mdev_type; + struct mdev_type *mdev_types[1]; } __aligned(8); int vfio_ccw_sch_quiesce(struct subchannel *sch); diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 724d09a74a8f1..24d131c502ca3 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -816,23 +816,13 @@ static ssize_t device_api_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(device_api); -static struct attribute *vfio_ap_mdev_type_attrs[] = { +static const struct attribute *vfio_ap_mdev_type_attrs[] = { &mdev_type_attr_name.attr, &mdev_type_attr_device_api.attr, &mdev_type_attr_available_instances.attr, NULL, }; -static struct attribute_group vfio_ap_mdev_hwvirt_type_group = { - .name = 
VFIO_AP_MDEV_TYPE_HWVIRT, - .attrs = vfio_ap_mdev_type_attrs, -}; - -static struct attribute_group *vfio_ap_mdev_type_groups[] = { - &vfio_ap_mdev_hwvirt_type_group, - NULL, -}; - #define MDEV_SHARING_ERR "Userspace may not re-assign queue %02lx.%04lx " \ "already assigned to %s" @@ -1817,7 +1807,7 @@ static struct mdev_driver vfio_ap_matrix_driver = { }, .probe = vfio_ap_mdev_probe, .remove = vfio_ap_mdev_remove, - .supported_type_groups = vfio_ap_mdev_type_groups, + .types_attrs = vfio_ap_mdev_type_attrs, }; int vfio_ap_mdev_register(void) @@ -1830,8 +1820,11 @@ int vfio_ap_mdev_register(void) if (ret) return ret; + matrix_dev->mdev_type.sysfs_name = VFIO_AP_MDEV_TYPE_HWVIRT; + matrix_dev->mdev_types[0] = &matrix_dev->mdev_type; ret = mdev_register_parent(&matrix_dev->parent, &matrix_dev->device, - &vfio_ap_matrix_driver); + &vfio_ap_matrix_driver, + matrix_dev->mdev_types, 1); if (ret) goto err_driver; return 0; diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index 35165730f5174..441dc8dda380b 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -53,6 +53,8 @@ struct ap_matrix_dev { struct ap_driver *vfio_ap_drv; struct mutex guests_lock; /* serializes access to each KVM guest */ struct mdev_parent parent; + struct mdev_type mdev_type; + struct mdev_type *mdev_types[]; }; extern struct ap_matrix_dev *matrix_dev; diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index fa05ac3396950..2d95a497fd3b2 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -29,26 +29,6 @@ struct device *mdev_parent_dev(struct mdev_device *mdev) } EXPORT_SYMBOL(mdev_parent_dev); -/* - * Return the index in supported_type_groups that this mdev_device was created - * from. 
- */ -unsigned int mdev_get_type_group_id(struct mdev_device *mdev) -{ - return mdev->type->type_group_id; -} -EXPORT_SYMBOL(mdev_get_type_group_id); - -/* - * Used in mdev_type_attribute sysfs functions to return the index in the - * supported_type_groups that the sysfs is called from. - */ -unsigned int mtype_get_type_group_id(struct mdev_type *mtype) -{ - return mtype->type_group_id; -} -EXPORT_SYMBOL(mtype_get_type_group_id); - /* * Used in mdev_type_attribute sysfs functions to return the parent struct * device @@ -85,6 +65,8 @@ static int mdev_device_remove_cb(struct device *dev, void *data) * @parent: parent structure registered * @dev: device structure representing parent device. * @mdev_driver: Device driver to bind to the newly created mdev + * @types: Array of supported mdev types + * @nr_types: Number of entries in @types * * Registers the @parent stucture as a parent for mdev types and thus mdev * devices. The caller needs to hold a reference on @dev that must not be @@ -93,20 +75,19 @@ static int mdev_device_remove_cb(struct device *dev, void *data) * Returns a negative value on error, otherwise 0. 
*/ int mdev_register_parent(struct mdev_parent *parent, struct device *dev, - struct mdev_driver *mdev_driver) + struct mdev_driver *mdev_driver, struct mdev_type **types, + unsigned int nr_types) { char *env_string = "MDEV_STATE=registered"; char *envp[] = { env_string, NULL }; int ret; - /* check for mandatory ops */ - if (!mdev_driver->supported_type_groups) - return -EINVAL; - memset(parent, 0, sizeof(*parent)); init_rwsem(&parent->unreg_sem); parent->dev = dev; parent->mdev_driver = mdev_driver; + parent->types = types; + parent->nr_types = nr_types; if (!mdev_bus_compat_class) { mdev_bus_compat_class = class_compat_register("mdev_bus"); diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c index 7bd4bb9850e81..1da1ecf76a0d5 100644 --- a/drivers/vfio/mdev/mdev_driver.c +++ b/drivers/vfio/mdev/mdev_driver.c @@ -56,10 +56,9 @@ EXPORT_SYMBOL_GPL(mdev_bus_type); **/ int mdev_register_driver(struct mdev_driver *drv) { - /* initialize common driver fields */ + if (!drv->types_attrs) + return -EINVAL; drv->driver.bus = &mdev_bus_type; - - /* register with core */ return driver_register(&drv->driver); } EXPORT_SYMBOL(mdev_register_driver); diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index 297f911fdc890..ba1b2dbddc0bc 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -13,14 +13,6 @@ int mdev_bus_register(void); void mdev_bus_unregister(void); -struct mdev_type { - struct kobject kobj; - struct kobject *devices_kobj; - struct mdev_parent *parent; - struct list_head next; - unsigned int type_group_id; -}; - extern const struct attribute_group *mdev_device_groups[]; #define to_mdev_type_attr(_attr) \ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index b71ffc5594870..38b4c2466ec43 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -82,7 +82,6 @@ static void mdev_type_release(struct kobject *kobj) 
pr_debug("Releasing group %s\n", kobj->name); /* Pairs with the get in add_mdev_supported_type() */ put_device(type->parent->dev); - kfree(type); } static struct kobj_type mdev_type_ktype = { @@ -90,35 +89,21 @@ static struct kobj_type mdev_type_ktype = { .release = mdev_type_release, }; -static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, - unsigned int type_group_id) +static int mdev_type_add(struct mdev_parent *parent, struct mdev_type *type) { - struct mdev_type *type; - struct attribute_group *group = - parent->mdev_driver->supported_type_groups[type_group_id]; int ret; - if (!group->name) { - pr_err("%s: Type name empty!\n", __func__); - return ERR_PTR(-EINVAL); - } - - type = kzalloc(sizeof(*type), GFP_KERNEL); - if (!type) - return ERR_PTR(-ENOMEM); - type->kobj.kset = parent->mdev_types_kset; type->parent = parent; /* Pairs with the put in mdev_type_release() */ get_device(parent->dev); - type->type_group_id = type_group_id; ret = kobject_init_and_add(&type->kobj, &mdev_type_ktype, NULL, "%s-%s", dev_driver_string(parent->dev), - group->name); + type->sysfs_name); if (ret) { kobject_put(&type->kobj); - return ERR_PTR(ret); + return ret; } ret = sysfs_create_file(&type->kobj, &mdev_type_attr_create.attr); @@ -131,13 +116,10 @@ static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, goto attr_devices_failed; } - ret = sysfs_create_files(&type->kobj, - (const struct attribute **)group->attrs); - if (ret) { - ret = -ENOMEM; + ret = sysfs_create_files(&type->kobj, parent->mdev_driver->types_attrs); + if (ret) goto attrs_failed; - } - return type; + return 0; attrs_failed: kobject_put(type->devices_kobj); @@ -146,78 +128,49 @@ attr_devices_failed: attr_create_failed: kobject_del(&type->kobj); kobject_put(&type->kobj); - return ERR_PTR(ret); + return ret; } -static void remove_mdev_supported_type(struct mdev_type *type) +static void mdev_type_remove(struct mdev_type *type) { - struct attribute_group *group = - 
type->parent->mdev_driver->supported_type_groups[type->type_group_id]; + sysfs_remove_files(&type->kobj, type->parent->mdev_driver->types_attrs); - sysfs_remove_files(&type->kobj, - (const struct attribute **)group->attrs); kobject_put(type->devices_kobj); sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr); kobject_del(&type->kobj); kobject_put(&type->kobj); } -static int add_mdev_supported_type_groups(struct mdev_parent *parent) -{ - int i; - - for (i = 0; parent->mdev_driver->supported_type_groups[i]; i++) { - struct mdev_type *type; - - type = add_mdev_supported_type(parent, i); - if (IS_ERR(type)) { - struct mdev_type *ltype, *tmp; - - list_for_each_entry_safe(ltype, tmp, &parent->type_list, - next) { - list_del(&ltype->next); - remove_mdev_supported_type(ltype); - } - return PTR_ERR(type); - } - list_add(&type->next, &parent->type_list); - } - return 0; -} - /* mdev sysfs functions */ void parent_remove_sysfs_files(struct mdev_parent *parent) { - struct mdev_type *type, *tmp; - - list_for_each_entry_safe(type, tmp, &parent->type_list, next) { - list_del(&type->next); - remove_mdev_supported_type(type); - } + int i; + for (i = 0; i < parent->nr_types; i++) + mdev_type_remove(parent->types[i]); kset_unregister(parent->mdev_types_kset); } int parent_create_sysfs_files(struct mdev_parent *parent) { - int ret; + int ret, i; parent->mdev_types_kset = kset_create_and_add("mdev_supported_types", NULL, &parent->dev->kobj); - if (!parent->mdev_types_kset) return -ENOMEM; - INIT_LIST_HEAD(&parent->type_list); - - ret = add_mdev_supported_type_groups(parent); - if (ret) - goto create_err; + for (i = 0; i < parent->nr_types; i++) { + ret = mdev_type_add(parent, parent->types[i]); + if (ret) + goto out_err; + } return 0; -create_err: - kset_unregister(parent->mdev_types_kset); - return ret; +out_err: + while (--i >= 0) + mdev_type_remove(parent->types[i]); + return 0; } static ssize_t remove_store(struct device *dev, struct device_attribute *attr, diff --git
a/include/linux/mdev.h b/include/linux/mdev.h index 262512c2a8ffc..19bc93c10e8c7 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -23,14 +23,27 @@ struct mdev_device { bool active; }; +struct mdev_type { + /* set by the driver before calling mdev_register parent: */ + const char *sysfs_name; + + /* set by the core, can be used drivers */ + struct mdev_parent *parent; + + /* internal only */ + struct kobject kobj; + struct kobject *devices_kobj; +}; + /* embedded into the struct device that the mdev devices hang off */ struct mdev_parent { struct device *dev; struct mdev_driver *mdev_driver; struct kset *mdev_types_kset; - struct list_head type_list; /* Synchronize device creation/removal with parent unregistration */ struct rw_semaphore unreg_sem; + struct mdev_type **types; + unsigned int nr_types; }; static inline struct mdev_device *to_mdev_device(struct device *dev) @@ -38,8 +51,6 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) return container_of(dev, struct mdev_device, dev); } -unsigned int mdev_get_type_group_id(struct mdev_device *mdev); -unsigned int mtype_get_type_group_id(struct mdev_type *mtype); struct device *mtype_get_parent_dev(struct mdev_type *mtype); /* interface for exporting mdev supported type attributes */ @@ -66,22 +77,21 @@ struct mdev_type_attribute mdev_type_attr_##_name = \ * struct mdev_driver - Mediated device driver * @probe: called when new device created * @remove: called when device removed - * @supported_type_groups: Attributes to define supported types. It is mandatory - * to provide supported types. + * @types_attrs: attributes to the type kobjects. 
* @driver: device driver structure - * **/ struct mdev_driver { int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); - struct attribute_group **supported_type_groups; + const struct attribute * const *types_attrs; struct device_driver driver; }; extern struct bus_type mdev_bus_type; int mdev_register_parent(struct mdev_parent *parent, struct device *dev, - struct mdev_driver *mdev_driver); + struct mdev_driver *mdev_driver, struct mdev_type **types, + unsigned int nr_types); void mdev_unregister_parent(struct mdev_parent *parent); int mdev_register_driver(struct mdev_driver *drv); diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 2c4791abbc3d3..4d0839cb51943 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -99,23 +99,27 @@ MODULE_PARM_DESC(mem, "megabytes available to " MBOCHS_NAME " devices"); #define MBOCHS_TYPE_2 "medium" #define MBOCHS_TYPE_3 "large" -static const struct mbochs_type { +static struct mbochs_type { + struct mdev_type type; const char *name; u32 mbytes; u32 max_x; u32 max_y; } mbochs_types[] = { { + .type.sysfs_name = MBOCHS_TYPE_1, .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1, .mbytes = 4, .max_x = 800, .max_y = 600, }, { + .type.sysfs_name = MBOCHS_TYPE_2, .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2, .mbytes = 16, .max_x = 1920, .max_y = 1440, }, { + .type.sysfs_name = MBOCHS_TYPE_3, .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3, .mbytes = 64, .max_x = 0, @@ -123,6 +127,11 @@ static const struct mbochs_type { }, }; +static struct mdev_type *mbochs_mdev_types[] = { + &mbochs_types[0].type, + &mbochs_types[1].type, + &mbochs_types[2].type, +}; static dev_t mbochs_devt; static struct class *mbochs_class; @@ -510,8 +519,8 @@ static int mbochs_init_dev(struct vfio_device *vdev) struct mdev_state *mdev_state = container_of(vdev, struct mdev_state, vdev); struct mdev_device *mdev = to_mdev_device(vdev->dev); - const struct mbochs_type *type = - 
&mbochs_types[mdev_get_type_group_id(mdev)]; + struct mbochs_type *type = + container_of(mdev->type, struct mbochs_type, type); int avail_mbytes = atomic_read(&mbochs_avail_mbytes); int ret = -ENOMEM; @@ -1345,8 +1354,8 @@ static const struct attribute_group *mdev_dev_groups[] = { static ssize_t name_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - const struct mbochs_type *type = - &mbochs_types[mtype_get_type_group_id(mtype)]; + struct mbochs_type *type = + container_of(mtype, struct mbochs_type, type); return sprintf(buf, "%s\n", type->name); } @@ -1355,8 +1364,8 @@ static MDEV_TYPE_ATTR_RO(name); static ssize_t description_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - const struct mbochs_type *type = - &mbochs_types[mtype_get_type_group_id(mtype)]; + struct mbochs_type *type = + container_of(mtype, struct mbochs_type, type); return sprintf(buf, "virtual display, %d MB video memory\n", type ? type->mbytes : 0); @@ -1367,8 +1376,8 @@ static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - const struct mbochs_type *type = - &mbochs_types[mtype_get_type_group_id(mtype)]; + struct mbochs_type *type = + container_of(mtype, struct mbochs_type, type); int count = atomic_read(&mbochs_avail_mbytes) / type->mbytes; return sprintf(buf, "%d\n", count); @@ -1382,7 +1391,7 @@ static ssize_t device_api_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(device_api); -static struct attribute *mdev_types_attrs[] = { +static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, &mdev_type_attr_description.attr, &mdev_type_attr_device_api.attr, @@ -1390,28 +1399,6 @@ static struct attribute *mdev_types_attrs[] = { NULL, }; -static struct attribute_group mdev_type_group1 = { - .name = MBOCHS_TYPE_1, - .attrs = mdev_types_attrs, -}; - -static struct attribute_group mdev_type_group2 = { - .name = MBOCHS_TYPE_2, - .attrs = mdev_types_attrs, 
-}; - -static struct attribute_group mdev_type_group3 = { - .name = MBOCHS_TYPE_3, - .attrs = mdev_types_attrs, -}; - -static struct attribute_group *mdev_type_groups[] = { - &mdev_type_group1, - &mdev_type_group2, - &mdev_type_group3, - NULL, -}; - static const struct vfio_device_ops mbochs_dev_ops = { .close_device = mbochs_close_device, .init = mbochs_init_dev, @@ -1431,7 +1418,7 @@ static struct mdev_driver mbochs_driver = { }, .probe = mbochs_probe, .remove = mbochs_remove, - .supported_type_groups = mdev_type_groups, + .types_attrs = mdev_types_attrs, }; static const struct file_operations vd_fops = { @@ -1476,7 +1463,9 @@ static int __init mbochs_dev_init(void) if (ret) goto err_class; - ret = mdev_register_parent(&mbochs_parent, &mbochs_dev, &mbochs_driver); + ret = mdev_register_parent(&mbochs_parent, &mbochs_dev, &mbochs_driver, + mbochs_mdev_types, + ARRAY_SIZE(mbochs_mdev_types)); if (ret) goto err_device; diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 01f345430b975..4a341f4849e73 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -51,7 +51,8 @@ MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices"); #define MDPY_TYPE_2 "xga" #define MDPY_TYPE_3 "hd" -static const struct mdpy_type { +static struct mdpy_type { + struct mdev_type type; const char *name; u32 format; u32 bytepp; @@ -59,18 +60,21 @@ static const struct mdpy_type { u32 height; } mdpy_types[] = { { + .type.sysfs_name = MDPY_TYPE_1, .name = MDPY_CLASS_NAME "-" MDPY_TYPE_1, .format = DRM_FORMAT_XRGB8888, .bytepp = 4, .width = 640, .height = 480, }, { + .type.sysfs_name = MDPY_TYPE_2, .name = MDPY_CLASS_NAME "-" MDPY_TYPE_2, .format = DRM_FORMAT_XRGB8888, .bytepp = 4, .width = 1024, .height = 768, }, { + .type.sysfs_name = MDPY_TYPE_3, .name = MDPY_CLASS_NAME "-" MDPY_TYPE_3, .format = DRM_FORMAT_XRGB8888, .bytepp = 4, @@ -79,6 +83,12 @@ static const struct mdpy_type { }, }; +static struct mdev_type *mdpy_mdev_types[] = { + &mdpy_types[0].type, + 
&mdpy_types[1].type, + &mdpy_types[2].type, +}; + static dev_t mdpy_devt; static struct class *mdpy_class; static struct cdev mdpy_cdev; @@ -222,7 +232,7 @@ static int mdpy_init_dev(struct vfio_device *vdev) container_of(vdev, struct mdev_state, vdev); struct mdev_device *mdev = to_mdev_device(vdev->dev); const struct mdpy_type *type = - &mdpy_types[mdev_get_type_group_id(mdev)]; + container_of(mdev->type, struct mdpy_type, type); u32 fbsize; int ret = -ENOMEM; @@ -655,8 +665,7 @@ static const struct attribute_group *mdev_dev_groups[] = { static ssize_t name_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - const struct mdpy_type *type = - &mdpy_types[mtype_get_type_group_id(mtype)]; + struct mdpy_type *type = container_of(mtype, struct mdpy_type, type); return sprintf(buf, "%s\n", type->name); } @@ -665,8 +674,7 @@ static MDEV_TYPE_ATTR_RO(name); static ssize_t description_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - const struct mdpy_type *type = - &mdpy_types[mtype_get_type_group_id(mtype)]; + struct mdpy_type *type = container_of(mtype, struct mdpy_type, type); return sprintf(buf, "virtual display, %dx%d framebuffer\n", type->width, type->height); @@ -688,7 +696,7 @@ static ssize_t device_api_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(device_api); -static struct attribute *mdev_types_attrs[] = { +static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, &mdev_type_attr_description.attr, &mdev_type_attr_device_api.attr, @@ -696,28 +704,6 @@ static struct attribute *mdev_types_attrs[] = { NULL, }; -static struct attribute_group mdev_type_group1 = { - .name = MDPY_TYPE_1, - .attrs = mdev_types_attrs, -}; - -static struct attribute_group mdev_type_group2 = { - .name = MDPY_TYPE_2, - .attrs = mdev_types_attrs, -}; - -static struct attribute_group mdev_type_group3 = { - .name = MDPY_TYPE_3, - .attrs = mdev_types_attrs, -}; - -static struct attribute_group 
*mdev_type_groups[] = { - &mdev_type_group1, - &mdev_type_group2, - &mdev_type_group3, - NULL, -}; - static const struct vfio_device_ops mdpy_dev_ops = { .init = mdpy_init_dev, .release = mdpy_release_dev, @@ -736,7 +722,7 @@ static struct mdev_driver mdpy_driver = { }, .probe = mdpy_probe, .remove = mdpy_remove, - .supported_type_groups = mdev_type_groups, + .types_attrs = mdev_types_attrs, }; static const struct file_operations vd_fops = { @@ -779,7 +765,9 @@ static int __init mdpy_dev_init(void) if (ret) goto err_class; - ret = mdev_register_parent(&mdpy_parent, &mdpy_dev, &mdpy_driver); + ret = mdev_register_parent(&mdpy_parent, &mdpy_dev, &mdpy_driver, + mdpy_mdev_types, + ARRAY_SIZE(mdpy_mdev_types)); if (ret) goto err_device; diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index e80baac513811..814a7f98738a2 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -143,6 +143,20 @@ struct mdev_state { int nr_ports; }; +static struct mtty_type { + struct mdev_type type; + int nr_ports; + const char *name; +} mtty_types[2] = { + { .nr_ports = 1, .type.sysfs_name = "1", .name = "Single port serial" }, + { .nr_ports = 2, .type.sysfs_name = "2", .name = "Dual port serial" }, +}; + +static struct mdev_type *mtty_mdev_types[] = { + &mtty_types[0].type, + &mtty_types[1].type, +}; + static atomic_t mdev_avail_ports = ATOMIC_INIT(MAX_MTTYS); static const struct file_operations vd_fops = { @@ -707,17 +721,19 @@ static int mtty_init_dev(struct vfio_device *vdev) struct mdev_state *mdev_state = container_of(vdev, struct mdev_state, vdev); struct mdev_device *mdev = to_mdev_device(vdev->dev); - int nr_ports = mdev_get_type_group_id(mdev) + 1; + struct mtty_type *type = + container_of(mdev->type, struct mtty_type, type); int avail_ports = atomic_read(&mdev_avail_ports); int ret; do { - if (avail_ports < nr_ports) + if (avail_ports < type->nr_ports) return -ENOSPC; } while (!atomic_try_cmpxchg(&mdev_avail_ports, - &avail_ports, avail_ports - 
nr_ports)); + &avail_ports, + avail_ports - type->nr_ports)); - mdev_state->nr_ports = nr_ports; + mdev_state->nr_ports = type->nr_ports; mdev_state->irq_index = -1; mdev_state->s[0].max_fifo_size = MAX_FIFO_SIZE; mdev_state->s[1].max_fifo_size = MAX_FIFO_SIZE; @@ -735,7 +751,7 @@ static int mtty_init_dev(struct vfio_device *vdev) return 0; err_nr_ports: - atomic_add(nr_ports, &mdev_avail_ports); + atomic_add(type->nr_ports, &mdev_avail_ports); return ret; } @@ -1242,11 +1258,9 @@ static const struct attribute_group *mdev_dev_groups[] = { static ssize_t name_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - static const char *name_str[2] = { "Single port serial", - "Dual port serial" }; + struct mtty_type *type = container_of(mtype, struct mtty_type, type); - return sysfs_emit(buf, "%s\n", - name_str[mtype_get_type_group_id(mtype)]); + return sysfs_emit(buf, "%s\n", type->name); } static MDEV_TYPE_ATTR_RO(name); @@ -1255,9 +1269,10 @@ static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - unsigned int ports = mtype_get_type_group_id(mtype) + 1; + struct mtty_type *type = container_of(mtype, struct mtty_type, type); - return sprintf(buf, "%d\n", atomic_read(&mdev_avail_ports) / ports); + return sprintf(buf, "%d\n", atomic_read(&mdev_avail_ports) / + type->nr_ports); } static MDEV_TYPE_ATTR_RO(available_instances); @@ -1270,29 +1285,13 @@ static ssize_t device_api_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(device_api); -static struct attribute *mdev_types_attrs[] = { +static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, &mdev_type_attr_device_api.attr, &mdev_type_attr_available_instances.attr, NULL, }; -static struct attribute_group mdev_type_group1 = { - .name = "1", - .attrs = mdev_types_attrs, -}; - -static struct attribute_group mdev_type_group2 = { - .name = "2", - .attrs = mdev_types_attrs, -}; - -static struct attribute_group 
*mdev_type_groups[] = { - &mdev_type_group1, - &mdev_type_group2, - NULL, -}; - static const struct vfio_device_ops mtty_dev_ops = { .name = "vfio-mtty", .init = mtty_init_dev, @@ -1311,7 +1310,7 @@ static struct mdev_driver mtty_driver = { }, .probe = mtty_probe, .remove = mtty_remove, - .supported_type_groups = mdev_type_groups, + .types_attrs = mdev_types_attrs, }; static void mtty_device_release(struct device *dev) @@ -1363,7 +1362,8 @@ static int __init mtty_dev_init(void) goto err_class; ret = mdev_register_parent(&mtty_dev.parent, &mtty_dev.dev, - &mtty_driver); + &mtty_driver, mtty_mdev_types, + ARRAY_SIZE(mtty_mdev_types)); if (ret) goto err_device; return 0; -- GitLab From cbf3bb28aaeaee425ca7b9c537a3efff1f8c98ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 23 Sep 2022 11:26:44 +0200 Subject: [PATCH 1297/2223] vfio/mdev: remove mdev_from_dev Just open code it in the only caller. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com> Link: https://lore.kernel.org/r/20220923092652.100656-7-hch@lst.de Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- drivers/vfio/mdev/mdev_core.c | 6 ++---- include/linux/mdev.h | 4 ---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 2d95a497fd3b2..bde7ce620dae0 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -53,10 +53,8 @@ static void mdev_device_remove_common(struct mdev_device *mdev) static int mdev_device_remove_cb(struct device *dev, void *data) { - struct mdev_device *mdev = mdev_from_dev(dev); - - if (mdev) - mdev_device_remove_common(mdev); + if (dev->bus == &mdev_bus_type) + mdev_device_remove_common(to_mdev_device(dev)); return 0; } diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 19bc93c10e8c7..4f558de52fd94 
100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -102,9 +102,5 @@ static inline struct device *mdev_dev(struct mdev_device *mdev) { return &mdev->dev; } -static inline struct mdev_device *mdev_from_dev(struct device *dev) -{ - return dev->bus == &mdev_bus_type ? to_mdev_device(dev) : NULL; -} #endif /* MDEV_H */ -- GitLab From 2815fe149ffa8e1a022b2830ab62999135c00a4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 23 Sep 2022 11:26:45 +0200 Subject: [PATCH 1298/2223] vfio/mdev: unexport mdev_bus_type mdev_bus_type is only used in mdev.ko now, so unexport it. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com> Link: https://lore.kernel.org/r/20220923092652.100656-8-hch@lst.de Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- drivers/vfio/mdev/mdev_driver.c | 1 - drivers/vfio/mdev/mdev_private.h | 1 + include/linux/mdev.h | 2 -- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c index 1da1ecf76a0d5..5b3c94f4fb13d 100644 --- a/drivers/vfio/mdev/mdev_driver.c +++ b/drivers/vfio/mdev/mdev_driver.c @@ -46,7 +46,6 @@ struct bus_type mdev_bus_type = { .remove = mdev_remove, .match = mdev_match, }; -EXPORT_SYMBOL_GPL(mdev_bus_type); /** * mdev_register_driver - register a new MDEV driver diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index ba1b2dbddc0bc..af457b27f6074 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -13,6 +13,7 @@ int mdev_bus_register(void); void mdev_bus_unregister(void); +extern struct bus_type mdev_bus_type; extern const struct attribute_group *mdev_device_groups[]; #define to_mdev_type_attr(_attr) \ diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 4f558de52fd94..6c179d2b89274 100644 --- 
a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -87,8 +87,6 @@ struct mdev_driver { struct device_driver driver; }; -extern struct bus_type mdev_bus_type; - int mdev_register_parent(struct mdev_parent *parent, struct device *dev, struct mdev_driver *mdev_driver, struct mdev_type **types, unsigned int nr_types); -- GitLab From 062e720cd209d8091c4f3d118d93973f02209aca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 23 Sep 2022 11:26:46 +0200 Subject: [PATCH 1299/2223] vfio/mdev: remove mdev_parent_dev Just open code the dereferences in the only user. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com> Link: https://lore.kernel.org/r/20220923092652.100656-9-hch@lst.de Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- Documentation/driver-api/vfio-mediated-device.rst | 3 --- drivers/gpu/drm/i915/gvt/kvmgt.c | 2 +- drivers/vfio/mdev/mdev_core.c | 6 ------ include/linux/mdev.h | 1 - 4 files changed, 1 insertion(+), 11 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index ff7342d2e332d..7b660f3fa2c92 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -200,9 +200,6 @@ Directories and files under the sysfs for Each Physical Device sprintf(buf, "%s-%s", dev_driver_string(parent->dev), group->name); - (or using mdev_parent_dev(mdev) to arrive at the parent device outside - of the core mdev code) - * device_api This attribute should show which device API is being created, for example, diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 12b0b33061685..2265dd867956f 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1488,7 +1488,7 @@ static int intel_vgpu_init_dev(struct 
vfio_device *vfio_dev) struct intel_vgpu_type *type = container_of(mdev->type, struct intel_vgpu_type, type); - vgpu->gvt = kdev_to_i915(mdev_parent_dev(mdev))->gvt; + vgpu->gvt = kdev_to_i915(mdev->type->parent->dev)->gvt; return intel_gvt_create_vgpu(vgpu, type->conf); } diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index bde7ce620dae0..75628759a3bf0 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -23,12 +23,6 @@ static struct class_compat *mdev_bus_compat_class; static LIST_HEAD(mdev_list); static DEFINE_MUTEX(mdev_list_lock); -struct device *mdev_parent_dev(struct mdev_device *mdev) -{ - return mdev->type->parent->dev; -} -EXPORT_SYMBOL(mdev_parent_dev); - /* * Used in mdev_type_attribute sysfs functions to return the parent struct * device diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 6c179d2b89274..bbedffcb38d48 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -95,7 +95,6 @@ void mdev_unregister_parent(struct mdev_parent *parent); int mdev_register_driver(struct mdev_driver *drv); void mdev_unregister_driver(struct mdev_driver *drv); -struct device *mdev_parent_dev(struct mdev_device *mdev); static inline struct device *mdev_dev(struct mdev_device *mdev) { return &mdev->dev; -- GitLab From c7c1f38f6cba7e3249866c06639ea62755f0a24e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 23 Sep 2022 11:26:47 +0200 Subject: [PATCH 1300/2223] vfio/mdev: remove mtype_get_parent_dev Just open code the dereferences in the only user. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Jason J. 
Herne <jjherne@linux.ibm.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com> Reviewed-by: Eric Farman <farman@linux.ibm.com> Link: https://lore.kernel.org/r/20220923092652.100656-10-hch@lst.de Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- drivers/s390/cio/vfio_ccw_ops.c | 3 +-- drivers/vfio/mdev/mdev_core.c | 10 ---------- include/linux/mdev.h | 2 -- 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index c37e712a4b069..3db6251b31143 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -62,8 +62,7 @@ static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - struct vfio_ccw_private *private = - dev_get_drvdata(mtype_get_parent_dev(mtype)); + struct vfio_ccw_private *private = dev_get_drvdata(mtype->parent->dev); return sprintf(buf, "%d\n", atomic_read(&private->avail)); } diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 75628759a3bf0..93f8caf2e5f77 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -23,16 +23,6 @@ static struct class_compat *mdev_bus_compat_class; static LIST_HEAD(mdev_list); static DEFINE_MUTEX(mdev_list_lock); -/* - * Used in mdev_type_attribute sysfs functions to return the parent struct - * device - */ -struct device *mtype_get_parent_dev(struct mdev_type *mtype) -{ - return mtype->parent->dev; -} -EXPORT_SYMBOL(mtype_get_parent_dev); - /* Caller must hold parent unreg_sem read or write lock */ static void mdev_device_remove_common(struct mdev_device *mdev) { diff --git a/include/linux/mdev.h b/include/linux/mdev.h index bbedffcb38d48..e445f809ceca3 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -51,8 +51,6 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) return container_of(dev, struct mdev_device, dev); } 
-struct device *mtype_get_parent_dev(struct mdev_type *mtype); - /* interface for exporting mdev supported type attributes */ struct mdev_type_attribute { struct attribute attr; -- GitLab From 290aac5df88a83e264b3a73ec146e5e5b3c45793 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe <jgg@nvidia.com> Date: Fri, 23 Sep 2022 11:26:48 +0200 Subject: [PATCH 1301/2223] vfio/mdev: consolidate all the device_api sysfs into the core code Every driver just emits a static string, simply feed it through the ops and provide a standard sysfs show function. Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Tony Krowiak <akrowiak@linux.ibm.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com> Reviewed-by: Eric Farman <farman@linux.ibm.com> Link: https://lore.kernel.org/r/20220923092652.100656-11-hch@lst.de Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- .../driver-api/vfio-mediated-device.rst | 2 +- drivers/gpu/drm/i915/gvt/kvmgt.c | 9 +---- drivers/s390/cio/vfio_ccw_ops.c | 9 +---- drivers/s390/crypto/vfio_ap_ops.c | 10 +----- drivers/vfio/mdev/mdev_driver.c | 4 ++- drivers/vfio/mdev/mdev_sysfs.c | 35 +++++++++++++------ include/linux/mdev.h | 7 ++-- samples/vfio-mdev/mbochs.c | 9 +---- samples/vfio-mdev/mdpy.c | 9 +---- samples/vfio-mdev/mtty.c | 10 +----- 10 files changed, 37 insertions(+), 67 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index 7b660f3fa2c92..b0c29e37f61b4 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -202,7 +202,7 @@ Directories and files under the sysfs for Each Physical Device * device_api - This attribute should show which device API is being created, for example, + This attribute shows which device API is being created, for example, "vfio-pci" for a PCI device. 
* available_instances diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 2265dd867956f..0f70886a63e92 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -123,12 +123,6 @@ static ssize_t available_instances_show(struct mdev_type *mtype, return sprintf(buf, "%u\n", type->avail_instance); } -static ssize_t device_api_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); -} - static ssize_t description_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { @@ -151,13 +145,11 @@ static ssize_t name_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(available_instances); -static MDEV_TYPE_ATTR_RO(device_api); static MDEV_TYPE_ATTR_RO(description); static MDEV_TYPE_ATTR_RO(name); static const struct attribute *gvt_type_attrs[] = { &mdev_type_attr_available_instances.attr, - &mdev_type_attr_device_api.attr, &mdev_type_attr_description.attr, &mdev_type_attr_name.attr, NULL, @@ -1550,6 +1542,7 @@ static void intel_vgpu_remove(struct mdev_device *mdev) } static struct mdev_driver intel_vgpu_mdev_driver = { + .device_api = VFIO_DEVICE_API_PCI_STRING, .driver = { .name = "intel_vgpu_mdev", .owner = THIS_MODULE, diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 3db6251b31143..4c7b181519228 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -51,13 +51,6 @@ static ssize_t name_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(name); -static ssize_t device_api_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", VFIO_DEVICE_API_CCW_STRING); -} -static MDEV_TYPE_ATTR_RO(device_api); - static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) @@ -70,7 +63,6 @@ static MDEV_TYPE_ATTR_RO(available_instances); static const 
struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, - &mdev_type_attr_device_api.attr, &mdev_type_attr_available_instances.attr, NULL, }; @@ -628,6 +620,7 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = { }; struct mdev_driver vfio_ccw_mdev_driver = { + .device_api = VFIO_DEVICE_API_CCW_STRING, .driver = { .name = "vfio_ccw_mdev", .owner = THIS_MODULE, diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 24d131c502ca3..d440acfbb261e 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -808,17 +808,8 @@ static ssize_t available_instances_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(available_instances); -static ssize_t device_api_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", VFIO_DEVICE_API_AP_STRING); -} - -static MDEV_TYPE_ATTR_RO(device_api); - static const struct attribute *vfio_ap_mdev_type_attrs[] = { &mdev_type_attr_name.attr, - &mdev_type_attr_device_api.attr, &mdev_type_attr_available_instances.attr, NULL, }; @@ -1799,6 +1790,7 @@ static const struct vfio_device_ops vfio_ap_matrix_dev_ops = { }; static struct mdev_driver vfio_ap_matrix_driver = { + .device_api = VFIO_DEVICE_API_AP_STRING, .driver = { .name = "vfio_ap_mdev", .owner = THIS_MODULE, diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c index 5b3c94f4fb13d..60e8b9f6474e8 100644 --- a/drivers/vfio/mdev/mdev_driver.c +++ b/drivers/vfio/mdev/mdev_driver.c @@ -55,8 +55,10 @@ struct bus_type mdev_bus_type = { **/ int mdev_register_driver(struct mdev_driver *drv) { - if (!drv->types_attrs) + if (!drv->types_attrs || !drv->device_api) return -EINVAL; + + /* initialize common driver fields */ drv->driver.bus = &mdev_bus_type; return driver_register(&drv->driver); } diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 38b4c2466ec43..60fc52ff92448 100644 --- 
a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -72,9 +72,30 @@ static ssize_t create_store(struct mdev_type *mtype, return count; } - static MDEV_TYPE_ATTR_WO(create); +static ssize_t device_api_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", mtype->parent->mdev_driver->device_api); +} +static MDEV_TYPE_ATTR_RO(device_api); + +static struct attribute *mdev_types_core_attrs[] = { + &mdev_type_attr_create.attr, + &mdev_type_attr_device_api.attr, + NULL, +}; + +static struct attribute_group mdev_type_core_group = { + .attrs = mdev_types_core_attrs, +}; + +static const struct attribute_group *mdev_type_groups[] = { + &mdev_type_core_group, + NULL, +}; + static void mdev_type_release(struct kobject *kobj) { struct mdev_type *type = to_mdev_type(kobj); @@ -85,8 +106,9 @@ static void mdev_type_release(struct kobject *kobj) } static struct kobj_type mdev_type_ktype = { - .sysfs_ops = &mdev_type_sysfs_ops, - .release = mdev_type_release, + .sysfs_ops = &mdev_type_sysfs_ops, + .release = mdev_type_release, + .default_groups = mdev_type_groups, }; static int mdev_type_add(struct mdev_parent *parent, struct mdev_type *type) @@ -106,10 +128,6 @@ static int mdev_type_add(struct mdev_parent *parent, struct mdev_type *type) return ret; } - ret = sysfs_create_file(&type->kobj, &mdev_type_attr_create.attr); - if (ret) - goto attr_create_failed; - type->devices_kobj = kobject_create_and_add("devices", &type->kobj); if (!type->devices_kobj) { ret = -ENOMEM; @@ -124,8 +142,6 @@ static int mdev_type_add(struct mdev_parent *parent, struct mdev_type *type) attrs_failed: kobject_put(type->devices_kobj); attr_devices_failed: - sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr); -attr_create_failed: kobject_del(&type->kobj); kobject_put(&type->kobj); return ret; @@ -136,7 +152,6 @@ static void mdev_type_remove(struct mdev_type *type) sysfs_remove_files(&type->kobj, 
type->parent->mdev_driver->types_attrs); kobject_put(type->devices_kobj); - sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr); kobject_del(&type->kobj); kobject_put(&type->kobj); } diff --git a/include/linux/mdev.h b/include/linux/mdev.h index e445f809ceca3..af1ff0165b8d3 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -61,11 +61,6 @@ struct mdev_type_attribute { size_t count); }; -#define MDEV_TYPE_ATTR(_name, _mode, _show, _store) \ -struct mdev_type_attribute mdev_type_attr_##_name = \ - __ATTR(_name, _mode, _show, _store) -#define MDEV_TYPE_ATTR_RW(_name) \ - struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RW(_name) #define MDEV_TYPE_ATTR_RO(_name) \ struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RO(_name) #define MDEV_TYPE_ATTR_WO(_name) \ @@ -73,12 +68,14 @@ struct mdev_type_attribute mdev_type_attr_##_name = \ /** * struct mdev_driver - Mediated device driver + * @device_api: string to return for the device_api sysfs * @probe: called when new device created * @remove: called when device removed * @types_attrs: attributes to the type kobjects. 
* @driver: device driver structure **/ struct mdev_driver { + const char *device_api; int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); const struct attribute * const *types_attrs; diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 4d0839cb51943..a2fc13fade757 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1384,17 +1384,9 @@ static ssize_t available_instances_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(available_instances); -static ssize_t device_api_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); -} -static MDEV_TYPE_ATTR_RO(device_api); - static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, &mdev_type_attr_description.attr, - &mdev_type_attr_device_api.attr, &mdev_type_attr_available_instances.attr, NULL, }; @@ -1410,6 +1402,7 @@ static const struct vfio_device_ops mbochs_dev_ops = { }; static struct mdev_driver mbochs_driver = { + .device_api = VFIO_DEVICE_API_PCI_STRING, .driver = { .name = "mbochs", .owner = THIS_MODULE, diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 4a341f4849e73..f9069ed2750fa 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -689,17 +689,9 @@ static ssize_t available_instances_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(available_instances); -static ssize_t device_api_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); -} -static MDEV_TYPE_ATTR_RO(device_api); - static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, &mdev_type_attr_description.attr, - &mdev_type_attr_device_api.attr, &mdev_type_attr_available_instances.attr, NULL, }; @@ -714,6 +706,7 @@ static const struct vfio_device_ops mdpy_dev_ops = { }; static struct mdev_driver mdpy_driver = { + 
.device_api = VFIO_DEVICE_API_PCI_STRING, .driver = { .name = "mdpy", .owner = THIS_MODULE, diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 814a7f98738a2..064e71b28dd19 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1277,17 +1277,8 @@ static ssize_t available_instances_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(available_instances); -static ssize_t device_api_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); -} - -static MDEV_TYPE_ATTR_RO(device_api); - static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, - &mdev_type_attr_device_api.attr, &mdev_type_attr_available_instances.attr, NULL, }; @@ -1302,6 +1293,7 @@ static const struct vfio_device_ops mtty_dev_ops = { }; static struct mdev_driver mtty_driver = { + .device_api = VFIO_DEVICE_API_PCI_STRING, .driver = { .name = "mtty", .owner = THIS_MODULE, -- GitLab From 0bc79069ccbdbe26492493dd0c4e38b7cadf8ad5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 23 Sep 2022 11:26:49 +0200 Subject: [PATCH 1302/2223] vfio/mdev: consolidate all the name sysfs into the core code Every driver just emits a static string, simply add a field to the mdev_type for the driver to fill out or fall back to the sysfs name and provide a standard sysfs show function. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com> Reviewed-by: Eric Farman <farman@linux.ibm.com> Link: https://lore.kernel.org/r/20220923092652.100656-12-hch@lst.de Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- .../driver-api/vfio-mediated-device.rst | 2 +- drivers/gpu/drm/i915/gvt/kvmgt.c | 8 ------- drivers/s390/cio/vfio_ccw_drv.c | 1 + drivers/s390/cio/vfio_ccw_ops.c | 8 ------- drivers/s390/crypto/vfio_ap_ops.c | 10 +-------- drivers/vfio/mdev/mdev_sysfs.c | 10 +++++++++ include/linux/mdev.h | 1 + samples/vfio-mdev/mbochs.c | 20 ++++-------------- samples/vfio-mdev/mdpy.c | 21 +++++-------------- samples/vfio-mdev/mtty.c | 18 ++++------------ 10 files changed, 27 insertions(+), 72 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index b0c29e37f61b4..dcd1231a6fa84 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -217,7 +217,7 @@ Directories and files under the sysfs for Each Physical Device * name - This attribute should show human readable name. This is optional attribute. + This attribute shows a human readable name. 
* description diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 0f70886a63e92..93a52ae26f684 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -138,20 +138,12 @@ static ssize_t description_show(struct mdev_type *mtype, type->conf->weight); } -static ssize_t name_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", mtype->sysfs_name); -} - static MDEV_TYPE_ATTR_RO(available_instances); static MDEV_TYPE_ATTR_RO(description); -static MDEV_TYPE_ATTR_RO(name); static const struct attribute *gvt_type_attrs[] = { &mdev_type_attr_available_instances.attr, &mdev_type_attr_description.attr, - &mdev_type_attr_name.attr, NULL, }; diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 25a5de08b3902..e5f21c725326b 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -221,6 +221,7 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) dev_set_drvdata(&sch->dev, private); private->mdev_type.sysfs_name = "io"; + private->mdev_type.pretty_name = "I/O subchannel (Non-QDIO)"; private->mdev_types[0] = &private->mdev_type; ret = mdev_register_parent(&private->parent, &sch->dev, &vfio_ccw_mdev_driver, diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 4c7b181519228..394aab60dbd0a 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -44,13 +44,6 @@ static void vfio_ccw_dma_unmap(struct vfio_device *vdev, u64 iova, u64 length) vfio_ccw_mdev_reset(private); } -static ssize_t name_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "I/O subchannel (Non-QDIO)\n"); -} -static MDEV_TYPE_ATTR_RO(name); - static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) @@ -62,7 +55,6 @@ static ssize_t available_instances_show(struct mdev_type 
*mtype, static MDEV_TYPE_ATTR_RO(available_instances); static const struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_name.attr, &mdev_type_attr_available_instances.attr, NULL, }; diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index d440acfbb261e..5d8dd7e837f3d 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -790,14 +790,6 @@ static void vfio_ap_mdev_remove(struct mdev_device *mdev) vfio_put_device(&matrix_mdev->vdev); } -static ssize_t name_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", VFIO_AP_MDEV_NAME_HWVIRT); -} - -static MDEV_TYPE_ATTR_RO(name); - static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) @@ -809,7 +801,6 @@ static ssize_t available_instances_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(available_instances); static const struct attribute *vfio_ap_mdev_type_attrs[] = { - &mdev_type_attr_name.attr, &mdev_type_attr_available_instances.attr, NULL, }; @@ -1813,6 +1804,7 @@ int vfio_ap_mdev_register(void) return ret; matrix_dev->mdev_type.sysfs_name = VFIO_AP_MDEV_TYPE_HWVIRT; + matrix_dev->mdev_type.pretty_name = VFIO_AP_MDEV_NAME_HWVIRT; matrix_dev->mdev_types[0] = &matrix_dev->mdev_type; ret = mdev_register_parent(&matrix_dev->parent, &matrix_dev->device, &vfio_ap_matrix_driver, diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 60fc52ff92448..34583e6a97f27 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -81,9 +81,19 @@ static ssize_t device_api_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(device_api); +static ssize_t name_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", + mtype->pretty_name ? 
mtype->pretty_name : mtype->sysfs_name); +} + +static MDEV_TYPE_ATTR_RO(name); + static struct attribute *mdev_types_core_attrs[] = { &mdev_type_attr_create.attr, &mdev_type_attr_device_api.attr, + &mdev_type_attr_name.attr, NULL, }; diff --git a/include/linux/mdev.h b/include/linux/mdev.h index af1ff0165b8d3..4bb8a58b577b3 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -26,6 +26,7 @@ struct mdev_device { struct mdev_type { /* set by the driver before calling mdev_register parent: */ const char *sysfs_name; + const char *pretty_name; /* set by the core, can be used drivers */ struct mdev_parent *parent; diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index a2fc13fade757..0b7585f16d8ab 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -101,26 +101,25 @@ MODULE_PARM_DESC(mem, "megabytes available to " MBOCHS_NAME " devices"); static struct mbochs_type { struct mdev_type type; - const char *name; u32 mbytes; u32 max_x; u32 max_y; } mbochs_types[] = { { .type.sysfs_name = MBOCHS_TYPE_1, - .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1, + .type.pretty_name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1, .mbytes = 4, .max_x = 800, .max_y = 600, }, { .type.sysfs_name = MBOCHS_TYPE_2, - .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2, + .type.pretty_name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2, .mbytes = 16, .max_x = 1920, .max_y = 1440, }, { .type.sysfs_name = MBOCHS_TYPE_3, - .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3, + .type.pretty_name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3, .mbytes = 64, .max_x = 0, .max_y = 0, @@ -556,7 +555,7 @@ static int mbochs_init_dev(struct vfio_device *vdev) mbochs_reset(mdev_state); dev_info(vdev->dev, "%s: %s, %d MB, %ld pages\n", __func__, - type->name, type->mbytes, mdev_state->pagecount); + type->type.pretty_name, type->mbytes, mdev_state->pagecount); return 0; err_vconfig: @@ -1351,16 +1350,6 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t 
name_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - struct mbochs_type *type = - container_of(mtype, struct mbochs_type, type); - - return sprintf(buf, "%s\n", type->name); -} -static MDEV_TYPE_ATTR_RO(name); - static ssize_t description_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { @@ -1385,7 +1374,6 @@ static ssize_t available_instances_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(available_instances); static const struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_name.attr, &mdev_type_attr_description.attr, &mdev_type_attr_available_instances.attr, NULL, diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index f9069ed2750fa..90c6fed200b19 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -53,7 +53,6 @@ MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices"); static struct mdpy_type { struct mdev_type type; - const char *name; u32 format; u32 bytepp; u32 width; @@ -61,21 +60,21 @@ static struct mdpy_type { } mdpy_types[] = { { .type.sysfs_name = MDPY_TYPE_1, - .name = MDPY_CLASS_NAME "-" MDPY_TYPE_1, + .type.pretty_name = MDPY_CLASS_NAME "-" MDPY_TYPE_1, .format = DRM_FORMAT_XRGB8888, .bytepp = 4, .width = 640, .height = 480, }, { .type.sysfs_name = MDPY_TYPE_2, - .name = MDPY_CLASS_NAME "-" MDPY_TYPE_2, + .type.pretty_name = MDPY_CLASS_NAME "-" MDPY_TYPE_2, .format = DRM_FORMAT_XRGB8888, .bytepp = 4, .width = 1024, .height = 768, }, { .type.sysfs_name = MDPY_TYPE_3, - .name = MDPY_CLASS_NAME "-" MDPY_TYPE_3, + .type.pretty_name = MDPY_CLASS_NAME "-" MDPY_TYPE_3, .format = DRM_FORMAT_XRGB8888, .bytepp = 4, .width = 1920, @@ -256,8 +255,8 @@ static int mdpy_init_dev(struct vfio_device *vdev) mdpy_create_config_space(mdev_state); mdpy_reset(mdev_state); - dev_info(vdev->dev, "%s: %s (%dx%d)\n", __func__, type->name, type->width, - type->height); + dev_info(vdev->dev, "%s: %s (%dx%d)\n", __func__, type->type.pretty_name, + type->width, 
type->height); mdpy_count++; return 0; @@ -662,15 +661,6 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t name_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - struct mdpy_type *type = container_of(mtype, struct mdpy_type, type); - - return sprintf(buf, "%s\n", type->name); -} -static MDEV_TYPE_ATTR_RO(name); - static ssize_t description_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { @@ -690,7 +680,6 @@ static ssize_t available_instances_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(available_instances); static const struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_name.attr, &mdev_type_attr_description.attr, &mdev_type_attr_available_instances.attr, NULL, diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 064e71b28dd19..eab1b4442a96e 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -146,10 +146,11 @@ struct mdev_state { static struct mtty_type { struct mdev_type type; int nr_ports; - const char *name; } mtty_types[2] = { - { .nr_ports = 1, .type.sysfs_name = "1", .name = "Single port serial" }, - { .nr_ports = 2, .type.sysfs_name = "2", .name = "Dual port serial" }, + { .nr_ports = 1, .type.sysfs_name = "1", + .type.pretty_name = "Single port serial" }, + { .nr_ports = 2, .type.sysfs_name = "2", + .type.pretty_name = "Dual port serial" }, }; static struct mdev_type *mtty_mdev_types[] = { @@ -1255,16 +1256,6 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t name_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - struct mtty_type *type = container_of(mtype, struct mtty_type, type); - - return sysfs_emit(buf, "%s\n", type->name); -} - -static MDEV_TYPE_ATTR_RO(name); - static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) @@ -1278,7 +1269,6 @@ static ssize_t 
available_instances_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(available_instances); static const struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_name.attr, &mdev_type_attr_available_instances.attr, NULL, }; -- GitLab From f2fbc72e6da4f8e01fe5fe3d6871a791e76271c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 23 Sep 2022 11:26:50 +0200 Subject: [PATCH 1303/2223] vfio/mdev: consolidate all the available_instance sysfs into the core code Every driver just print a number, simply add a method to the mdev_driver to return it and provide a standard sysfs show function. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com> Reviewed-by: Eric Farman <farman@linux.ibm.com> Link: https://lore.kernel.org/r/20220923092652.100656-13-hch@lst.de Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- .../driver-api/vfio-mediated-device.rst | 3 +- drivers/gpu/drm/i915/gvt/gvt.h | 1 - drivers/gpu/drm/i915/gvt/kvmgt.c | 34 +++++++++------ drivers/gpu/drm/i915/gvt/vgpu.c | 41 ++----------------- drivers/s390/cio/vfio_ccw_ops.c | 14 ++----- drivers/s390/crypto/vfio_ap_ops.c | 16 ++------ drivers/vfio/mdev/mdev_sysfs.c | 11 +++++ include/linux/mdev.h | 2 + samples/vfio-mdev/mbochs.c | 10 ++--- samples/vfio-mdev/mdpy.c | 9 ++-- samples/vfio-mdev/mtty.c | 16 ++------ 11 files changed, 55 insertions(+), 102 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index dcd1231a6fa84..558bd7ebced86 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -103,6 +103,7 @@ structure to represent a mediated device's driver:: struct mdev_driver { int (*probe) (struct mdev_device *dev); void (*remove) (struct mdev_device *dev); + unsigned int (*get_available)(struct 
mdev_type *mtype); const struct attribute * const *types_attrs; struct device_driver driver; }; @@ -207,7 +208,7 @@ Directories and files under the sysfs for Each Physical Device * available_instances - This attribute should show the number of devices of type <type-id> that can be + This attribute shows the number of devices of type <type-id> that can be created. * [device] diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index db182066d56c9..dbf8d7470b2c1 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -314,7 +314,6 @@ struct intel_vgpu_type { struct mdev_type type; char name[16]; const struct intel_vgpu_config *conf; - unsigned int avail_instance; }; struct intel_gvt { diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 93a52ae26f684..45051aedb3191 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -113,16 +113,6 @@ static void kvmgt_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_page_track_notifier_node *node); -static ssize_t available_instances_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, - char *buf) -{ - struct intel_vgpu_type *type = - container_of(mtype, struct intel_vgpu_type, type); - - return sprintf(buf, "%u\n", type->avail_instance); -} - static ssize_t description_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { @@ -138,11 +128,9 @@ static ssize_t description_show(struct mdev_type *mtype, type->conf->weight); } -static MDEV_TYPE_ATTR_RO(available_instances); static MDEV_TYPE_ATTR_RO(description); static const struct attribute *gvt_type_attrs[] = { - &mdev_type_attr_available_instances.attr, &mdev_type_attr_description.attr, NULL, }; @@ -1533,6 +1521,27 @@ static void intel_vgpu_remove(struct mdev_device *mdev) vfio_put_device(&vgpu->vfio_device); } +static unsigned int intel_vgpu_get_available(struct mdev_type *mtype) +{ + struct 
intel_vgpu_type *type = + container_of(mtype, struct intel_vgpu_type, type); + struct intel_gvt *gvt = kdev_to_i915(mtype->parent->dev)->gvt; + unsigned int low_gm_avail, high_gm_avail, fence_avail; + + mutex_lock(&gvt->lock); + low_gm_avail = gvt_aperture_sz(gvt) - HOST_LOW_GM_SIZE - + gvt->gm.vgpu_allocated_low_gm_size; + high_gm_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE - + gvt->gm.vgpu_allocated_high_gm_size; + fence_avail = gvt_fence_sz(gvt) - HOST_FENCE - + gvt->fence.vgpu_allocated_fence_num; + mutex_unlock(&gvt->lock); + + return min3(low_gm_avail / type->conf->low_mm, + high_gm_avail / type->conf->high_mm, + fence_avail / type->conf->fence); +} + static struct mdev_driver intel_vgpu_mdev_driver = { .device_api = VFIO_DEVICE_API_PCI_STRING, .driver = { @@ -1542,6 +1551,7 @@ static struct mdev_driver intel_vgpu_mdev_driver = { }, .probe = intel_vgpu_probe, .remove = intel_vgpu_remove, + .get_available = intel_vgpu_get_available, .types_attrs = gvt_type_attrs, }; diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index 92aaa77feceee..56c71474008a3 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -129,11 +129,11 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) sprintf(gvt->types[i].name, "GVTg_V%u_%s", GRAPHICS_VER(gvt->gt->i915) == 8 ? 
4 : 5, conf->name); gvt->types[i].conf = conf; - gvt->types[i].avail_instance = min(low_avail / conf->low_mm, - high_avail / conf->high_mm); gvt_dbg_core("type[%d]: %s avail %u low %u high %u fence %u weight %u res %s\n", - i, gvt->types[i].name, gvt->types[i].avail_instance, + i, gvt->types[i].name, + min(low_avail / conf->low_mm, + high_avail / conf->high_mm), conf->low_mm, conf->high_mm, conf->fence, conf->weight, vgpu_edid_str(conf->edid)); @@ -157,36 +157,6 @@ void intel_gvt_clean_vgpu_types(struct intel_gvt *gvt) kfree(gvt->types); } -static void intel_gvt_update_vgpu_types(struct intel_gvt *gvt) -{ - int i; - unsigned int low_gm_avail, high_gm_avail, fence_avail; - unsigned int low_gm_min, high_gm_min, fence_min; - - /* Need to depend on maxium hw resource size but keep on - * static config for now. - */ - low_gm_avail = gvt_aperture_sz(gvt) - HOST_LOW_GM_SIZE - - gvt->gm.vgpu_allocated_low_gm_size; - high_gm_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE - - gvt->gm.vgpu_allocated_high_gm_size; - fence_avail = gvt_fence_sz(gvt) - HOST_FENCE - - gvt->fence.vgpu_allocated_fence_num; - - for (i = 0; i < gvt->num_types; i++) { - low_gm_min = low_gm_avail / gvt->types[i].conf->low_mm; - high_gm_min = high_gm_avail / gvt->types[i].conf->high_mm; - fence_min = fence_avail / gvt->types[i].conf->fence; - gvt->types[i].avail_instance = min(min(low_gm_min, high_gm_min), - fence_min); - - gvt_dbg_core("update type[%d]: %s avail %u low %u high %u fence %u\n", - i, gvt->types[i].name, - gvt->types[i].avail_instance, gvt->types[i].conf->low_mm, - gvt->types[i].conf->high_mm, gvt->types[i].conf->fence); - } -} - /** * intel_gvt_active_vgpu - activate a virtual GPU * @vgpu: virtual GPU @@ -281,10 +251,6 @@ void intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu) intel_vgpu_clean_mmio(vgpu); intel_vgpu_dmabuf_cleanup(vgpu); mutex_unlock(&vgpu->vgpu_lock); - - mutex_lock(&gvt->lock); - intel_gvt_update_vgpu_types(gvt); - mutex_unlock(&gvt->lock); } #define IDLE_VGPU_IDR 0 @@ 
-414,7 +380,6 @@ int intel_gvt_create_vgpu(struct intel_vgpu *vgpu, if (ret) goto out_clean_sched_policy; - intel_gvt_update_vgpu_types(gvt); intel_gvt_update_reg_whitelist(vgpu); mutex_unlock(&gvt->lock); return 0; diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 394aab60dbd0a..559ca18055928 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -44,20 +44,12 @@ static void vfio_ccw_dma_unmap(struct vfio_device *vdev, u64 iova, u64 length) vfio_ccw_mdev_reset(private); } -static ssize_t available_instances_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, - char *buf) +static unsigned int vfio_ccw_get_available(struct mdev_type *mtype) { struct vfio_ccw_private *private = dev_get_drvdata(mtype->parent->dev); - return sprintf(buf, "%d\n", atomic_read(&private->avail)); + return atomic_read(&private->avail); } -static MDEV_TYPE_ATTR_RO(available_instances); - -static const struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_available_instances.attr, - NULL, -}; static int vfio_ccw_mdev_init_dev(struct vfio_device *vdev) { @@ -620,5 +612,5 @@ struct mdev_driver vfio_ccw_mdev_driver = { }, .probe = vfio_ccw_mdev_probe, .remove = vfio_ccw_mdev_remove, - .types_attrs = mdev_types_attrs, + .get_available = vfio_ccw_get_available, }; diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 5d8dd7e837f3d..8606f5d75188c 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -790,21 +790,11 @@ static void vfio_ap_mdev_remove(struct mdev_device *mdev) vfio_put_device(&matrix_mdev->vdev); } -static ssize_t available_instances_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, - char *buf) +static unsigned int vfio_ap_mdev_get_available(struct mdev_type *mtype) { - return sprintf(buf, "%d\n", - atomic_read(&matrix_dev->available_instances)); + return atomic_read(&matrix_dev->available_instances); } -static 
MDEV_TYPE_ATTR_RO(available_instances); - -static const struct attribute *vfio_ap_mdev_type_attrs[] = { - &mdev_type_attr_available_instances.attr, - NULL, -}; - #define MDEV_SHARING_ERR "Userspace may not re-assign queue %02lx.%04lx " \ "already assigned to %s" @@ -1790,7 +1780,7 @@ static struct mdev_driver vfio_ap_matrix_driver = { }, .probe = vfio_ap_mdev_probe, .remove = vfio_ap_mdev_remove, - .types_attrs = vfio_ap_mdev_type_attrs, + .get_available = vfio_ap_mdev_get_available, }; int vfio_ap_mdev_register(void) diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 34583e6a97f27..b7f87c3eda5ea 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -90,10 +90,21 @@ static ssize_t name_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(name); +static ssize_t available_instances_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, + char *buf) +{ + struct mdev_driver *drv = mtype->parent->mdev_driver; + + return sysfs_emit(buf, "%u\n", drv->get_available(mtype)); +} +static MDEV_TYPE_ATTR_RO(available_instances); + static struct attribute *mdev_types_core_attrs[] = { &mdev_type_attr_create.attr, &mdev_type_attr_device_api.attr, &mdev_type_attr_name.attr, + &mdev_type_attr_available_instances.attr, NULL, }; diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 4bb8a58b577b3..d39e08a1824c6 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -72,6 +72,7 @@ struct mdev_type_attribute { * @device_api: string to return for the device_api sysfs * @probe: called when new device created * @remove: called when device removed + * @get_available: Return the max number of instances that can be created * @types_attrs: attributes to the type kobjects. 
* @driver: device driver structure **/ @@ -79,6 +80,7 @@ struct mdev_driver { const char *device_api; int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); + unsigned int (*get_available)(struct mdev_type *mtype); const struct attribute * const *types_attrs; struct device_driver driver; }; diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 0b7585f16d8ab..6c2cbc4e25ca9 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1361,21 +1361,16 @@ static ssize_t description_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(description); -static ssize_t available_instances_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, - char *buf) +static unsigned int mbochs_get_available(struct mdev_type *mtype) { struct mbochs_type *type = container_of(mtype, struct mbochs_type, type); - int count = atomic_read(&mbochs_avail_mbytes) / type->mbytes; - return sprintf(buf, "%d\n", count); + return atomic_read(&mbochs_avail_mbytes) / type->mbytes; } -static MDEV_TYPE_ATTR_RO(available_instances); static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_description.attr, - &mdev_type_attr_available_instances.attr, NULL, }; @@ -1399,6 +1394,7 @@ static struct mdev_driver mbochs_driver = { }, .probe = mbochs_probe, .remove = mbochs_remove, + .get_available = mbochs_get_available, .types_attrs = mdev_types_attrs, }; diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 90c6fed200b19..d1c835c9cabf2 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -671,17 +671,13 @@ static ssize_t description_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(description); -static ssize_t available_instances_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, - char *buf) +static unsigned int mdpy_get_available(struct mdev_type *mtype) { - return sprintf(buf, "%d\n", max_devices - mdpy_count); + return max_devices - mdpy_count; } -static 
MDEV_TYPE_ATTR_RO(available_instances); static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_description.attr, - &mdev_type_attr_available_instances.attr, NULL, }; @@ -704,6 +700,7 @@ static struct mdev_driver mdpy_driver = { }, .probe = mdpy_probe, .remove = mdpy_remove, + .get_available = mdpy_get_available, .types_attrs = mdev_types_attrs, }; diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index eab1b4442a96e..e72085fc13763 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1256,23 +1256,13 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t available_instances_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, - char *buf) +static unsigned int mtty_get_available(struct mdev_type *mtype) { struct mtty_type *type = container_of(mtype, struct mtty_type, type); - return sprintf(buf, "%d\n", atomic_read(&mdev_avail_ports) / - type->nr_ports); + return atomic_read(&mdev_avail_ports) / type->nr_ports; } -static MDEV_TYPE_ATTR_RO(available_instances); - -static const struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_available_instances.attr, - NULL, -}; - static const struct vfio_device_ops mtty_dev_ops = { .name = "vfio-mtty", .init = mtty_init_dev, @@ -1292,7 +1282,7 @@ static struct mdev_driver mtty_driver = { }, .probe = mtty_probe, .remove = mtty_remove, - .types_attrs = mdev_types_attrs, + .get_available = mtty_get_available, }; static void mtty_device_release(struct device *dev) -- GitLab From 685a1537f4c603cfcaf4b9be56ff6a571f7ddd08 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 23 Sep 2022 11:26:51 +0200 Subject: [PATCH 1304/2223] vfio/mdev: consolidate all the description sysfs into the core code Every driver just emits a string, simply add a method to the mdev_driver to return it and provide a standard sysfs show function. Remove the now unused types_attrs field in struct mdev_driver and the support code for it. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com> Link: https://lore.kernel.org/r/20220923092652.100656-14-hch@lst.de Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- .../driver-api/vfio-mediated-device.rst | 4 +- drivers/gpu/drm/i915/gvt/kvmgt.c | 18 +++------ drivers/vfio/mdev/mdev_driver.c | 2 +- drivers/vfio/mdev/mdev_sysfs.c | 40 +++++++++++++++---- include/linux/mdev.h | 19 +-------- samples/vfio-mdev/mbochs.c | 11 +---- samples/vfio-mdev/mdpy.c | 11 +---- 7 files changed, 46 insertions(+), 59 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index 558bd7ebced86..fdf7d69378ec4 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -104,7 +104,7 @@ structure to represent a mediated device's driver:: int (*probe) (struct mdev_device *dev); void (*remove) (struct mdev_device *dev); unsigned int (*get_available)(struct mdev_type *mtype); - const struct attribute * const *types_attrs; + ssize_t (*show_description)(struct mdev_type *mtype, char *buf); struct device_driver driver; }; @@ -222,7 +222,7 @@ Directories and files under the sysfs for Each Physical Device * description - This attribute should show brief features/description of the type. This is + This attribute can show brief features/description of the type. This is an optional attribute. 
Directories and Files Under the sysfs for Each mdev Device diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 45051aedb3191..7a45e5360caf2 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -113,8 +113,7 @@ static void kvmgt_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_page_track_notifier_node *node); -static ssize_t description_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) +static ssize_t intel_vgpu_show_description(struct mdev_type *mtype, char *buf) { struct intel_vgpu_type *type = container_of(mtype, struct intel_vgpu_type, type); @@ -128,13 +127,6 @@ static ssize_t description_show(struct mdev_type *mtype, type->conf->weight); } -static MDEV_TYPE_ATTR_RO(description); - -static const struct attribute *gvt_type_attrs[] = { - &mdev_type_attr_description.attr, - NULL, -}; - static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn, unsigned long size) { @@ -1549,10 +1541,10 @@ static struct mdev_driver intel_vgpu_mdev_driver = { .owner = THIS_MODULE, .dev_groups = intel_vgpu_groups, }, - .probe = intel_vgpu_probe, - .remove = intel_vgpu_remove, - .get_available = intel_vgpu_get_available, - .types_attrs = gvt_type_attrs, + .probe = intel_vgpu_probe, + .remove = intel_vgpu_remove, + .get_available = intel_vgpu_get_available, + .show_description = intel_vgpu_show_description, }; int intel_gvt_page_track_add(struct intel_vgpu *info, u64 gfn) diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c index 60e8b9f6474e8..7825d83a55f8c 100644 --- a/drivers/vfio/mdev/mdev_driver.c +++ b/drivers/vfio/mdev/mdev_driver.c @@ -55,7 +55,7 @@ struct bus_type mdev_bus_type = { **/ int mdev_register_driver(struct mdev_driver *drv) { - if (!drv->types_attrs || !drv->device_api) + if (!drv->device_api) return -EINVAL; /* initialize common driver fields */ diff --git a/drivers/vfio/mdev/mdev_sysfs.c 
b/drivers/vfio/mdev/mdev_sysfs.c index b7f87c3eda5ea..658b3bf5ed0bf 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -14,7 +14,19 @@ #include "mdev_private.h" -/* Static functions */ +struct mdev_type_attribute { + struct attribute attr; + ssize_t (*show)(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf); + ssize_t (*store)(struct mdev_type *mtype, + struct mdev_type_attribute *attr, const char *buf, + size_t count); +}; + +#define MDEV_TYPE_ATTR_RO(_name) \ + struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RO(_name) +#define MDEV_TYPE_ATTR_WO(_name) \ + struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_WO(_name) static ssize_t mdev_type_attr_show(struct kobject *kobj, struct attribute *__attr, char *buf) @@ -100,16 +112,35 @@ static ssize_t available_instances_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(available_instances); +static ssize_t description_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, + char *buf) +{ + return mtype->parent->mdev_driver->show_description(mtype, buf); +} +static MDEV_TYPE_ATTR_RO(description); + static struct attribute *mdev_types_core_attrs[] = { &mdev_type_attr_create.attr, &mdev_type_attr_device_api.attr, &mdev_type_attr_name.attr, &mdev_type_attr_available_instances.attr, + &mdev_type_attr_description.attr, NULL, }; +static umode_t mdev_types_core_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + if (attr == &mdev_type_attr_description.attr && + !to_mdev_type(kobj)->parent->mdev_driver->show_description) + return 0; + return attr->mode; +} + static struct attribute_group mdev_type_core_group = { .attrs = mdev_types_core_attrs, + .is_visible = mdev_types_core_is_visible, }; static const struct attribute_group *mdev_type_groups[] = { @@ -155,13 +186,8 @@ static int mdev_type_add(struct mdev_parent *parent, struct mdev_type *type) goto attr_devices_failed; } - ret = sysfs_create_files(&type->kobj, 
parent->mdev_driver->types_attrs); - if (ret) - goto attrs_failed; return 0; -attrs_failed: - kobject_put(type->devices_kobj); attr_devices_failed: kobject_del(&type->kobj); kobject_put(&type->kobj); @@ -170,8 +196,6 @@ attr_devices_failed: static void mdev_type_remove(struct mdev_type *type) { - sysfs_remove_files(&type->kobj, type->parent->mdev_driver->types_attrs); - kobject_put(type->devices_kobj); kobject_del(&type->kobj); kobject_put(&type->kobj); diff --git a/include/linux/mdev.h b/include/linux/mdev.h index d39e08a1824c6..33674cb5ed5d4 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -52,28 +52,13 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) return container_of(dev, struct mdev_device, dev); } -/* interface for exporting mdev supported type attributes */ -struct mdev_type_attribute { - struct attribute attr; - ssize_t (*show)(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf); - ssize_t (*store)(struct mdev_type *mtype, - struct mdev_type_attribute *attr, const char *buf, - size_t count); -}; - -#define MDEV_TYPE_ATTR_RO(_name) \ - struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RO(_name) -#define MDEV_TYPE_ATTR_WO(_name) \ - struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_WO(_name) - /** * struct mdev_driver - Mediated device driver * @device_api: string to return for the device_api sysfs * @probe: called when new device created * @remove: called when device removed * @get_available: Return the max number of instances that can be created - * @types_attrs: attributes to the type kobjects. 
+ * @show_description: Print a description of the mtype * @driver: device driver structure **/ struct mdev_driver { @@ -81,7 +66,7 @@ struct mdev_driver { int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); unsigned int (*get_available)(struct mdev_type *mtype); - const struct attribute * const *types_attrs; + ssize_t (*show_description)(struct mdev_type *mtype, char *buf); struct device_driver driver; }; diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 6c2cbc4e25ca9..117a8d799f711 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1350,8 +1350,7 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t description_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) +static ssize_t mbochs_show_description(struct mdev_type *mtype, char *buf) { struct mbochs_type *type = container_of(mtype, struct mbochs_type, type); @@ -1359,7 +1358,6 @@ static ssize_t description_show(struct mdev_type *mtype, return sprintf(buf, "virtual display, %d MB video memory\n", type ? 
type->mbytes : 0); } -static MDEV_TYPE_ATTR_RO(description); static unsigned int mbochs_get_available(struct mdev_type *mtype) { @@ -1369,11 +1367,6 @@ static unsigned int mbochs_get_available(struct mdev_type *mtype) return atomic_read(&mbochs_avail_mbytes) / type->mbytes; } -static const struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_description.attr, - NULL, -}; - static const struct vfio_device_ops mbochs_dev_ops = { .close_device = mbochs_close_device, .init = mbochs_init_dev, @@ -1395,7 +1388,7 @@ static struct mdev_driver mbochs_driver = { .probe = mbochs_probe, .remove = mbochs_remove, .get_available = mbochs_get_available, - .types_attrs = mdev_types_attrs, + .show_description = mbochs_show_description, }; static const struct file_operations vd_fops = { diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index d1c835c9cabf2..a7cf59246ddd0 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -661,26 +661,19 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t description_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) +static ssize_t mdpy_show_description(struct mdev_type *mtype, char *buf) { struct mdpy_type *type = container_of(mtype, struct mdpy_type, type); return sprintf(buf, "virtual display, %dx%d framebuffer\n", type->width, type->height); } -static MDEV_TYPE_ATTR_RO(description); static unsigned int mdpy_get_available(struct mdev_type *mtype) { return max_devices - mdpy_count; } -static const struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_description.attr, - NULL, -}; - static const struct vfio_device_ops mdpy_dev_ops = { .init = mdpy_init_dev, .release = mdpy_release_dev, @@ -701,7 +694,7 @@ static struct mdev_driver mdpy_driver = { .probe = mdpy_probe, .remove = mdpy_remove, .get_available = mdpy_get_available, - .types_attrs = mdev_types_attrs, + .show_description = mdpy_show_description, }; static const struct file_operations 
vd_fops = { -- GitLab From 9c799c224d6ebc5be51065bd3217a2d7eea23b8f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe <jgg@nvidia.com> Date: Fri, 23 Sep 2022 11:26:52 +0200 Subject: [PATCH 1305/2223] vfio/mdev: add mdev available instance checking to the core Many of the mdev drivers use a simple counter for keeping track of the available instances. Move this code to the core code and store the counter in the mdev_parent. Implement it using correct locking, fixing mdpy. Drivers just provide the value in the mdev_driver at registration time and the core code takes care of maintaining it and exposing the value in sysfs. [hch: count instances per-parent instead of per-type, use an atomic_t to avoid taking mdev_list_lock in the show method] Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com> Reviewed-by: Eric Farman <farman@linux.ibm.com> Link: https://lore.kernel.org/r/20220923092652.100656-15-hch@lst.de Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- drivers/s390/cio/vfio_ccw_drv.c | 1 - drivers/s390/cio/vfio_ccw_ops.c | 15 +-------------- drivers/s390/cio/vfio_ccw_private.h | 2 -- drivers/s390/crypto/vfio_ap_ops.c | 13 +------------ drivers/s390/crypto/vfio_ap_private.h | 2 -- drivers/vfio/mdev/mdev_core.c | 22 +++++++++++++++++++--- drivers/vfio/mdev/mdev_sysfs.c | 5 ++++- include/linux/mdev.h | 3 +++ samples/vfio-mdev/mdpy.c | 22 ++++------------------ 9 files changed, 32 insertions(+), 53 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index e5f21c725326b..7f5402fe857a2 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -141,7 +141,6 @@ static struct vfio_ccw_private *vfio_ccw_alloc_private(struct subchannel *sch) INIT_LIST_HEAD(&private->crw); INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo); INIT_WORK(&private->crw_work, 
vfio_ccw_crw_todo); - atomic_set(&private->avail, 1); private->cp.guest_cp = kcalloc(CCWCHAIN_LEN_MAX, sizeof(struct ccw1), GFP_KERNEL); diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 559ca18055928..6ae4d012d8008 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -44,13 +44,6 @@ static void vfio_ccw_dma_unmap(struct vfio_device *vdev, u64 iova, u64 length) vfio_ccw_mdev_reset(private); } -static unsigned int vfio_ccw_get_available(struct mdev_type *mtype) -{ - struct vfio_ccw_private *private = dev_get_drvdata(mtype->parent->dev); - - return atomic_read(&private->avail); -} - static int vfio_ccw_mdev_init_dev(struct vfio_device *vdev) { struct vfio_ccw_private *private = @@ -68,9 +61,6 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) if (private->state == VFIO_CCW_STATE_NOT_OPER) return -ENODEV; - if (atomic_dec_if_positive(&private->avail) < 0) - return -EPERM; - ret = vfio_init_device(&private->vdev, &mdev->dev, &vfio_ccw_dev_ops); if (ret) return ret; @@ -88,7 +78,6 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) err_put_vdev: vfio_put_device(&private->vdev); - atomic_inc(&private->avail); return ret; } @@ -130,8 +119,6 @@ static void vfio_ccw_mdev_remove(struct mdev_device *mdev) * cycle. 
*/ wait_for_completion(&private->release_comp); - - atomic_inc(&private->avail); } static int vfio_ccw_mdev_open_device(struct vfio_device *vdev) @@ -605,6 +592,7 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = { struct mdev_driver vfio_ccw_mdev_driver = { .device_api = VFIO_DEVICE_API_CCW_STRING, + .max_instances = 1, .driver = { .name = "vfio_ccw_mdev", .owner = THIS_MODULE, @@ -612,5 +600,4 @@ struct mdev_driver vfio_ccw_mdev_driver = { }, .probe = vfio_ccw_mdev_probe, .remove = vfio_ccw_mdev_remove, - .get_available = vfio_ccw_get_available, }; diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 52caa721ec06c..bd5fb81456af8 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -73,7 +73,6 @@ struct vfio_ccw_crw { * @sch: pointer to the subchannel * @state: internal state of the device * @completion: synchronization helper of the I/O completion - * @avail: available for creating a mediated device * @io_region: MMIO region to input/output I/O arguments/results * @io_mutex: protect against concurrent update of I/O regions * @region: additional regions for other subchannel operations @@ -97,7 +96,6 @@ struct vfio_ccw_private { struct subchannel *sch; int state; struct completion *completion; - atomic_t avail; struct ccw_io_region *io_region; struct mutex io_mutex; struct vfio_ccw_region *region; diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 8606f5d75188c..2884189f38771 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -689,9 +689,6 @@ static int vfio_ap_mdev_init_dev(struct vfio_device *vdev) struct ap_matrix_mdev *matrix_mdev = container_of(vdev, struct ap_matrix_mdev, vdev); - if ((atomic_dec_if_positive(&matrix_dev->available_instances) < 0)) - return -EPERM; - matrix_mdev->mdev = to_mdev_device(vdev->dev); vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->matrix); matrix_mdev->pqap_hook 
= handle_pqap; @@ -770,7 +767,6 @@ static void vfio_ap_mdev_unlink_fr_queues(struct ap_matrix_mdev *matrix_mdev) static void vfio_ap_mdev_release_dev(struct vfio_device *vdev) { - atomic_inc(&matrix_dev->available_instances); vfio_free_device(vdev); } @@ -790,11 +786,6 @@ static void vfio_ap_mdev_remove(struct mdev_device *mdev) vfio_put_device(&matrix_mdev->vdev); } -static unsigned int vfio_ap_mdev_get_available(struct mdev_type *mtype) -{ - return atomic_read(&matrix_dev->available_instances); -} - #define MDEV_SHARING_ERR "Userspace may not re-assign queue %02lx.%04lx " \ "already assigned to %s" @@ -1772,6 +1763,7 @@ static const struct vfio_device_ops vfio_ap_matrix_dev_ops = { static struct mdev_driver vfio_ap_matrix_driver = { .device_api = VFIO_DEVICE_API_AP_STRING, + .max_instances = MAX_ZDEV_ENTRIES_EXT, .driver = { .name = "vfio_ap_mdev", .owner = THIS_MODULE, @@ -1780,15 +1772,12 @@ static struct mdev_driver vfio_ap_matrix_driver = { }, .probe = vfio_ap_mdev_probe, .remove = vfio_ap_mdev_remove, - .get_available = vfio_ap_mdev_get_available, }; int vfio_ap_mdev_register(void) { int ret; - atomic_set(&matrix_dev->available_instances, MAX_ZDEV_ENTRIES_EXT); - ret = mdev_register_driver(&vfio_ap_matrix_driver); if (ret) return ret; diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index 441dc8dda380b..2eddd5f34ed34 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -29,7 +29,6 @@ * struct ap_matrix_dev - Contains the data for the matrix device. * * @device: generic device structure associated with the AP matrix device - * @available_instances: number of mediated matrix devices that can be created * @info: the struct containing the output from the PQAP(QCI) instruction * @mdev_list: the list of mediated matrix devices created * @mdevs_lock: mutex for locking the AP matrix device. 
This lock will be @@ -46,7 +45,6 @@ */ struct ap_matrix_dev { struct device device; - atomic_t available_instances; struct ap_config_info info; struct list_head mdev_list; struct mutex mdevs_lock; /* serializes access to each ap_matrix_mdev */ diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 93f8caf2e5f77..58f91b3bd670c 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -70,6 +70,7 @@ int mdev_register_parent(struct mdev_parent *parent, struct device *dev, parent->mdev_driver = mdev_driver; parent->types = types; parent->nr_types = nr_types; + atomic_set(&parent->available_instances, mdev_driver->max_instances); if (!mdev_bus_compat_class) { mdev_bus_compat_class = class_compat_register("mdev_bus"); @@ -115,14 +116,17 @@ EXPORT_SYMBOL(mdev_unregister_parent); static void mdev_device_release(struct device *dev) { struct mdev_device *mdev = to_mdev_device(dev); - - /* Pairs with the get in mdev_device_create() */ - kobject_put(&mdev->type->kobj); + struct mdev_parent *parent = mdev->type->parent; mutex_lock(&mdev_list_lock); list_del(&mdev->next); + if (!parent->mdev_driver->get_available) + atomic_inc(&parent->available_instances); mutex_unlock(&mdev_list_lock); + /* Pairs with the get in mdev_device_create() */ + kobject_put(&mdev->type->kobj); + dev_dbg(&mdev->dev, "MDEV: destroying\n"); kfree(mdev); } @@ -144,6 +148,18 @@ int mdev_device_create(struct mdev_type *type, const guid_t *uuid) } } + if (!drv->get_available) { + /* + * Note: that non-atomic read and dec is fine here because + * all modifications are under mdev_list_lock. 
+ */ + if (!atomic_read(&parent->available_instances)) { + mutex_unlock(&mdev_list_lock); + return -EUSERS; + } + atomic_dec(&parent->available_instances); + } + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) { mutex_unlock(&mdev_list_lock); diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 658b3bf5ed0bf..abe3359dd477f 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -108,7 +108,10 @@ static ssize_t available_instances_show(struct mdev_type *mtype, { struct mdev_driver *drv = mtype->parent->mdev_driver; - return sysfs_emit(buf, "%u\n", drv->get_available(mtype)); + if (drv->get_available) + return sysfs_emit(buf, "%u\n", drv->get_available(mtype)); + return sysfs_emit(buf, "%u\n", + atomic_read(&mtype->parent->available_instances)); } static MDEV_TYPE_ATTR_RO(available_instances); diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 33674cb5ed5d4..139d05b26f820 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -45,6 +45,7 @@ struct mdev_parent { struct rw_semaphore unreg_sem; struct mdev_type **types; unsigned int nr_types; + atomic_t available_instances; }; static inline struct mdev_device *to_mdev_device(struct device *dev) @@ -55,6 +56,7 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) /** * struct mdev_driver - Mediated device driver * @device_api: string to return for the device_api sysfs + * @max_instances: maximum number of instances supported (optional) * @probe: called when new device created * @remove: called when device removed * @get_available: Return the max number of instances that can be created @@ -63,6 +65,7 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) **/ struct mdev_driver { const char *device_api; + unsigned int max_instances; int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); unsigned int (*get_available)(struct mdev_type *mtype); diff --git a/samples/vfio-mdev/mdpy.c 
b/samples/vfio-mdev/mdpy.c index a7cf59246ddd0..946e8cfde6fdd 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -42,11 +42,6 @@ MODULE_LICENSE("GPL v2"); -static int max_devices = 4; -module_param_named(count, max_devices, int, 0444); -MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices"); - - #define MDPY_TYPE_1 "vga" #define MDPY_TYPE_2 "xga" #define MDPY_TYPE_3 "hd" @@ -93,7 +88,6 @@ static struct class *mdpy_class; static struct cdev mdpy_cdev; static struct device mdpy_dev; static struct mdev_parent mdpy_parent; -static u32 mdpy_count; static const struct vfio_device_ops mdpy_dev_ops; /* State of each mdev device */ @@ -235,9 +229,6 @@ static int mdpy_init_dev(struct vfio_device *vdev) u32 fbsize; int ret = -ENOMEM; - if (mdpy_count >= max_devices) - return ret; - mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL); if (!mdev_state->vconfig) return ret; @@ -257,8 +248,6 @@ static int mdpy_init_dev(struct vfio_device *vdev) dev_info(vdev->dev, "%s: %s (%dx%d)\n", __func__, type->type.pretty_name, type->width, type->height); - - mdpy_count++; return 0; out_vconfig: @@ -292,7 +281,6 @@ static void mdpy_release_dev(struct vfio_device *vdev) struct mdev_state *mdev_state = container_of(vdev, struct mdev_state, vdev); - mdpy_count--; vfree(mdev_state->memblk); kfree(mdev_state->vconfig); vfio_free_device(vdev); @@ -669,11 +657,6 @@ static ssize_t mdpy_show_description(struct mdev_type *mtype, char *buf) type->width, type->height); } -static unsigned int mdpy_get_available(struct mdev_type *mtype) -{ - return max_devices - mdpy_count; -} - static const struct vfio_device_ops mdpy_dev_ops = { .init = mdpy_init_dev, .release = mdpy_release_dev, @@ -685,6 +668,7 @@ static const struct vfio_device_ops mdpy_dev_ops = { static struct mdev_driver mdpy_driver = { .device_api = VFIO_DEVICE_API_PCI_STRING, + .max_instances = 4, .driver = { .name = "mdpy", .owner = THIS_MODULE, @@ -693,7 +677,6 @@ static struct mdev_driver mdpy_driver = 
{ }, .probe = mdpy_probe, .remove = mdpy_remove, - .get_available = mdpy_get_available, .show_description = mdpy_show_description, }; @@ -770,5 +753,8 @@ static void __exit mdpy_dev_exit(void) mdpy_class = NULL; } +module_param_named(count, mdpy_driver.max_instances, int, 0444); +MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices"); + module_init(mdpy_dev_init) module_exit(mdpy_dev_exit) -- GitLab From 912b74d26c7df2da1e261f3dac8942c8cbb76a49 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe <jgg@nvidia.com> Date: Thu, 29 Sep 2022 11:59:24 -0300 Subject: [PATCH 1306/2223] vfio: Remove the vfio_group->users and users_comp Kevin points out that the users is really just tracking if group->opened_file is set, so we can simplify this code to a wait_queue that looks for !opened_file under the group_rwsem. Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Link: https://lore.kernel.org/r/1-v1-917e3647f123+b1a-vfio_group_users_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- drivers/vfio/vfio.h | 3 +-- drivers/vfio/vfio_main.c | 45 +++++++++++++++++----------------------- 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 039e3208d286f..78b362a925011 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -48,8 +48,6 @@ struct vfio_group { * reaches 0 then the iommu_group is invalid. 
*/ refcount_t drivers; - refcount_t users; - struct completion users_comp; unsigned int container_users; struct iommu_group *iommu_group; struct vfio_container *container; @@ -61,6 +59,7 @@ struct vfio_group { struct rw_semaphore group_rwsem; struct kvm *kvm; struct file *opened_file; + struct swait_queue_head opened_file_wait; struct blocking_notifier_head notifier; }; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index f19171cad9a25..57a7576a96a61 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -186,10 +186,9 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, cdev_init(&group->cdev, &vfio_group_fops); group->cdev.owner = THIS_MODULE; - refcount_set(&group->users, 1); refcount_set(&group->drivers, 1); - init_completion(&group->users_comp); init_rwsem(&group->group_rwsem); + init_swait_queue_head(&group->opened_file_wait); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); group->iommu_group = iommu_group; @@ -245,12 +244,6 @@ err_put: return ret; } -static void vfio_group_put(struct vfio_group *group) -{ - if (refcount_dec_and_test(&group->users)) - complete(&group->users_comp); -} - static void vfio_device_remove_group(struct vfio_device *device) { struct vfio_group *group = device->group; @@ -270,10 +263,6 @@ static void vfio_device_remove_group(struct vfio_device *device) * cdev_device_add() will fail due to the name aready existing. */ cdev_device_del(&group->cdev, &group->dev); - mutex_unlock(&vfio.group_lock); - - /* Matches the get from vfio_group_alloc() */ - vfio_group_put(group); /* * Before we allow the last driver in the group to be unplugged the @@ -281,7 +270,13 @@ static void vfio_device_remove_group(struct vfio_device *device) * is because the group->iommu_group pointer should only be used so long * as a device driver is attached to a device in the group. 
*/ - wait_for_completion(&group->users_comp); + while (group->opened_file) { + mutex_unlock(&vfio.group_lock); + swait_event_idle_exclusive(group->opened_file_wait, + !group->opened_file); + mutex_lock(&vfio.group_lock); + } + mutex_unlock(&vfio.group_lock); /* * These data structures all have paired operations that can only be @@ -906,15 +901,18 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) down_write(&group->group_rwsem); - /* users can be zero if this races with vfio_device_remove_group() */ - if (!refcount_inc_not_zero(&group->users)) { + /* + * drivers can be zero if this races with vfio_device_remove_group(), it + * will be stable at 0 under the group rwsem + */ + if (refcount_read(&group->drivers) == 0) { ret = -ENODEV; - goto err_unlock; + goto out_unlock; } if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) { ret = -EPERM; - goto err_put; + goto out_unlock; } /* @@ -922,16 +920,12 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) */ if (group->opened_file) { ret = -EBUSY; - goto err_put; + goto out_unlock; } group->opened_file = filep; filep->private_data = group; - - up_write(&group->group_rwsem); - return 0; -err_put: - vfio_group_put(group); -err_unlock: + ret = 0; +out_unlock: up_write(&group->group_rwsem); return ret; } @@ -952,8 +946,7 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) vfio_group_detach_container(group); group->opened_file = NULL; up_write(&group->group_rwsem); - - vfio_group_put(group); + swake_up_one(&group->opened_file_wait); return 0; } -- GitLab From c82e81ab2569559ad873b3061217c2f37560682b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe <jgg@nvidia.com> Date: Thu, 29 Sep 2022 11:59:25 -0300 Subject: [PATCH 1307/2223] vfio: Change vfio_group->group_rwsem to a mutex These days not much is using the read side: - device first open - ioctl_get_status - device FD release - check enforced_coherent None of this is performance, so just make it 
into a normal mutex. Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Link: https://lore.kernel.org/r/2-v1-917e3647f123+b1a-vfio_group_users_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- drivers/vfio/container.c | 10 ++++----- drivers/vfio/vfio.h | 2 +- drivers/vfio/vfio_main.c | 47 ++++++++++++++++++++-------------------- 3 files changed, 30 insertions(+), 29 deletions(-) diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c index db7c071ee3de1..d74164abbf401 100644 --- a/drivers/vfio/container.c +++ b/drivers/vfio/container.c @@ -430,7 +430,7 @@ int vfio_container_attach_group(struct vfio_container *container, struct vfio_iommu_driver *driver; int ret = 0; - lockdep_assert_held_write(&group->group_rwsem); + lockdep_assert_held(&group->group_lock); if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) return -EPERM; @@ -481,7 +481,7 @@ void vfio_group_detach_container(struct vfio_group *group) struct vfio_container *container = group->container; struct vfio_iommu_driver *driver; - lockdep_assert_held_write(&group->group_rwsem); + lockdep_assert_held(&group->group_lock); WARN_ON(group->container_users != 1); down_write(&container->group_lock); @@ -515,7 +515,7 @@ int vfio_device_assign_container(struct vfio_device *device) { struct vfio_group *group = device->group; - lockdep_assert_held_write(&group->group_rwsem); + lockdep_assert_held(&group->group_lock); if (!group->container || !group->container->iommu_driver || WARN_ON(!group->container_users)) @@ -531,11 +531,11 @@ int vfio_device_assign_container(struct vfio_device *device) void vfio_device_unassign_container(struct vfio_device *device) { - down_write(&device->group->group_rwsem); + mutex_lock(&device->group->group_lock); WARN_ON(device->group->container_users <= 1); device->group->container_users--; fput(device->group->opened_file); - up_write(&device->group->group_rwsem); + 
mutex_unlock(&device->group->group_lock); } /* diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 78b362a925011..4a1bac1359a95 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -56,7 +56,7 @@ struct vfio_group { struct list_head vfio_next; struct list_head container_next; enum vfio_group_type type; - struct rw_semaphore group_rwsem; + struct mutex group_lock; struct kvm *kvm; struct file *opened_file; struct swait_queue_head opened_file_wait; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 57a7576a96a61..9207e6c0e3cb2 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -158,6 +158,7 @@ static void vfio_group_release(struct device *dev) struct vfio_group *group = container_of(dev, struct vfio_group, dev); mutex_destroy(&group->device_lock); + mutex_destroy(&group->group_lock); iommu_group_put(group->iommu_group); ida_free(&vfio.group_ida, MINOR(group->dev.devt)); kfree(group); @@ -187,7 +188,7 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, group->cdev.owner = THIS_MODULE; refcount_set(&group->drivers, 1); - init_rwsem(&group->group_rwsem); + mutex_init(&group->group_lock); init_swait_queue_head(&group->opened_file_wait); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); @@ -665,7 +666,7 @@ static int vfio_group_ioctl_unset_container(struct vfio_group *group) { int ret = 0; - down_write(&group->group_rwsem); + mutex_lock(&group->group_lock); if (!group->container) { ret = -EINVAL; goto out_unlock; @@ -677,7 +678,7 @@ static int vfio_group_ioctl_unset_container(struct vfio_group *group) vfio_group_detach_container(group); out_unlock: - up_write(&group->group_rwsem); + mutex_unlock(&group->group_lock); return ret; } @@ -696,7 +697,7 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, if (!f.file) return -EBADF; - down_write(&group->group_rwsem); + mutex_lock(&group->group_lock); if (group->container || WARN_ON(group->container_users)) 
{ ret = -EINVAL; goto out_unlock; @@ -709,7 +710,7 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, } out_unlock: - up_write(&group->group_rwsem); + mutex_unlock(&group->group_lock); fdput(f); return ret; } @@ -727,9 +728,9 @@ static struct file *vfio_device_open(struct vfio_device *device) struct file *filep; int ret; - down_write(&device->group->group_rwsem); + mutex_lock(&device->group->group_lock); ret = vfio_device_assign_container(device); - up_write(&device->group->group_rwsem); + mutex_unlock(&device->group->group_lock); if (ret) return ERR_PTR(ret); @@ -746,7 +747,7 @@ static struct file *vfio_device_open(struct vfio_device *device) * lock. If the device driver will use it, it must obtain a * reference and release it during close_device. */ - down_read(&device->group->group_rwsem); + mutex_lock(&device->group->group_lock); device->kvm = device->group->kvm; if (device->ops->open_device) { @@ -755,7 +756,7 @@ static struct file *vfio_device_open(struct vfio_device *device) goto err_undo_count; } vfio_device_container_register(device); - up_read(&device->group->group_rwsem); + mutex_unlock(&device->group->group_lock); } mutex_unlock(&device->dev_set->lock); @@ -788,14 +789,14 @@ static struct file *vfio_device_open(struct vfio_device *device) err_close_device: mutex_lock(&device->dev_set->lock); - down_read(&device->group->group_rwsem); + mutex_lock(&device->group->group_lock); if (device->open_count == 1 && device->ops->close_device) { device->ops->close_device(device); vfio_device_container_unregister(device); } err_undo_count: - up_read(&device->group->group_rwsem); + mutex_unlock(&device->group->group_lock); device->open_count--; if (device->open_count == 0 && device->kvm) device->kvm = NULL; @@ -860,13 +861,13 @@ static int vfio_group_ioctl_get_status(struct vfio_group *group, status.flags = 0; - down_read(&group->group_rwsem); + mutex_lock(&group->group_lock); if (group->container) status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | 
VFIO_GROUP_FLAGS_VIABLE; else if (!iommu_group_dma_owner_claimed(group->iommu_group)) status.flags |= VFIO_GROUP_FLAGS_VIABLE; - up_read(&group->group_rwsem); + mutex_unlock(&group->group_lock); if (copy_to_user(arg, &status, minsz)) return -EFAULT; @@ -899,7 +900,7 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) container_of(inode->i_cdev, struct vfio_group, cdev); int ret; - down_write(&group->group_rwsem); + mutex_lock(&group->group_lock); /* * drivers can be zero if this races with vfio_device_remove_group(), it @@ -926,7 +927,7 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) filep->private_data = group; ret = 0; out_unlock: - up_write(&group->group_rwsem); + mutex_unlock(&group->group_lock); return ret; } @@ -936,7 +937,7 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) filep->private_data = NULL; - down_write(&group->group_rwsem); + mutex_lock(&group->group_lock); /* * Device FDs hold a group file reference, therefore the group release * is only called when there are no open devices. 
@@ -945,7 +946,7 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) if (group->container) vfio_group_detach_container(group); group->opened_file = NULL; - up_write(&group->group_rwsem); + mutex_unlock(&group->group_lock); swake_up_one(&group->opened_file_wait); return 0; @@ -1001,12 +1002,12 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) mutex_lock(&device->dev_set->lock); vfio_assert_device_open(device); - down_read(&device->group->group_rwsem); + mutex_lock(&device->group->group_lock); if (device->open_count == 1 && device->ops->close_device) device->ops->close_device(device); vfio_device_container_unregister(device); - up_read(&device->group->group_rwsem); + mutex_unlock(&device->group->group_lock); device->open_count--; if (device->open_count == 0) device->kvm = NULL; @@ -1580,7 +1581,7 @@ bool vfio_file_enforced_coherent(struct file *file) if (file->f_op != &vfio_group_fops) return true; - down_read(&group->group_rwsem); + mutex_lock(&group->group_lock); if (group->container) { ret = vfio_container_ioctl_check_extension(group->container, VFIO_DMA_CC_IOMMU); @@ -1592,7 +1593,7 @@ bool vfio_file_enforced_coherent(struct file *file) */ ret = true; } - up_read(&group->group_rwsem); + mutex_unlock(&group->group_lock); return ret; } EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); @@ -1612,9 +1613,9 @@ void vfio_file_set_kvm(struct file *file, struct kvm *kvm) if (file->f_op != &vfio_group_fops) return; - down_write(&group->group_rwsem); + mutex_lock(&group->group_lock); group->kvm = kvm; - up_write(&group->group_rwsem); + mutex_unlock(&group->group_lock); } EXPORT_SYMBOL_GPL(vfio_file_set_kvm); -- GitLab From 280dfeae56e6fbfff21cfece356379e318ae10fe Mon Sep 17 00:00:00 2001 From: Zhang Qilong <zhangqilong3@huawei.com> Date: Tue, 30 Aug 2022 20:13:23 +0800 Subject: [PATCH 1308/2223] f2fs: return the tmp_ptr directly in __bitmap_ptr Just return tmp_ptr here, it's no need to dereference checkpoint pointer 
again. Signed-off-by: Zhang Qilong <zhangqilong3@huawei.com> Reviewed-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 088c3d1574b8c..0cc2f7aa45dbe 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2529,7 +2529,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) if (__cp_payload(sbi) > 0) { if (flag == NAT_BITMAP) - return &ckpt->sit_nat_version_bitmap; + return tmp_ptr; else return (unsigned char *)ckpt + F2FS_BLKSIZE; } else { -- GitLab From 173cdf2c32b4b02474006d87648383244c0a6db9 Mon Sep 17 00:00:00 2001 From: Zhang Qilong <zhangqilong3@huawei.com> Date: Tue, 30 Aug 2022 14:55:15 +0800 Subject: [PATCH 1309/2223] f2fs: use COMPRESS_MAPPING to get compress cache mapping Just use the defined COMPRESS_MAPPING to get compress cache mapping instead of accessing the mapping directly. Signed-off-by: Zhang Qilong <zhangqilong3@huawei.com> Reviewed-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 730256732a9e9..6baaff4c52baf 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1901,7 +1901,7 @@ bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { - struct address_space *mapping = sbi->compress_inode->i_mapping; + struct address_space *mapping = COMPRESS_MAPPING(sbi); struct folio_batch fbatch; pgoff_t index = 0; pgoff_t end = MAX_BLKADDR(sbi); -- GitLab From 9b7eadd9bd3a0cc24533a23d83c46430a0ea60ff Mon Sep 17 00:00:00 2001 From: Shuqi Zhang <zhangshuqi3@huawei.com> Date: Wed, 31 Aug 2022 10:24:40 +0800 Subject: [PATCH 1310/2223] f2fs: fix wrong dirty page count when race between mmap and fallocate.
This is a BUG_ON issue as follows when running xfstest-generic-503: WARNING: CPU: 21 PID: 1385 at fs/f2fs/inode.c:762 f2fs_evict_inode+0x847/0xaa0 Modules linked in: CPU: 21 PID: 1385 Comm: umount Not tainted 5.19.0-rc5+ #73 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-4.fc34 04/01/2014 Call Trace: evict+0x129/0x2d0 dispose_list+0x4f/0xb0 evict_inodes+0x204/0x230 generic_shutdown_super+0x5b/0x1e0 kill_block_super+0x29/0x80 kill_f2fs_super+0xe6/0x140 deactivate_locked_super+0x44/0xc0 deactivate_super+0x79/0x90 cleanup_mnt+0x114/0x1a0 __cleanup_mnt+0x16/0x20 task_work_run+0x98/0x100 exit_to_user_mode_prepare+0x3d0/0x3e0 syscall_exit_to_user_mode+0x12/0x30 do_syscall_64+0x42/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Function flow analysis when BUG occurs: f2fs_fallocate mmap do_page_fault pte_spinlock // ---lock_pte do_wp_page wp_page_shared pte_unmap_unlock // unlock_pte do_page_mkwrite f2fs_vm_page_mkwrite down_read(invalidate_lock) lock_page if (PageMappedToDisk(page)) goto out; // set_page_dirty --NOT RUN out: up_read(invalidate_lock); finish_mkwrite_fault // unlock_pte f2fs_collapse_range down_write(i_mmap_sem) truncate_pagecache unmap_mapping_pages i_mmap_lock_write // down_write(i_mmap_rwsem) ...... zap_pte_range pte_offset_map_lock // ---lock_pte set_page_dirty f2fs_dirty_data_folio if (!folio_test_dirty(folio)) { fault_dirty_shared_page set_page_dirty f2fs_dirty_data_folio if (!folio_test_dirty(folio)) { filemap_dirty_folio f2fs_update_dirty_folio // ++ } unlock_page filemap_dirty_folio f2fs_update_dirty_folio // page count++ } pte_unmap_unlock // --unlock_pte i_mmap_unlock_write // up_write(i_mmap_rwsem) truncate_inode_pages up_write(i_mmap_sem) When race happens between mmap-do_page_fault-wp_page_shared and fallocate-truncate_pagecache-zap_pte_range, the zap_pte_range calls function set_page_dirty without page lock. Besides, though truncate_pagecache has immap and pte lock, wp_page_shared calls fault_dirty_shared_page without any. 
In this case, two threads race in f2fs_dirty_data_folio function. Page is set to dirty only ONCE, but the count is added TWICE by calling filemap_dirty_folio. Thus the count of dirty page cannot accord with the real dirty pages. Following is the solution to in case of race happens without any lock. Since folio_test_set_dirty in filemap_dirty_folio is atomic, judge return value will not be at risk of race. Signed-off-by: Shuqi Zhang <zhangshuqi3@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/checkpoint.c | 3 +-- fs/f2fs/data.c | 3 +-- fs/f2fs/node.c | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7bf1feb5ac783..cf315e3d244ca 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -449,8 +449,7 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping, if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); - if (!folio_test_dirty(folio)) { - filemap_dirty_folio(mapping, folio); + if (filemap_dirty_folio(mapping, folio)) { inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META); set_page_private_reference(&folio->page); return true; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0869fbbb5516f..87524d3dce223 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3697,8 +3697,7 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping, folio_mark_uptodate(folio); BUG_ON(folio_test_swapcache(folio)); - if (!folio_test_dirty(folio)) { - filemap_dirty_folio(mapping, folio); + if (filemap_dirty_folio(mapping, folio)) { f2fs_update_dirty_folio(inode, folio); return true; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 2484285be3ad9..23291f1575d35 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2147,8 +2147,7 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping, if (IS_INODE(&folio->page)) f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page); #endif - if (!folio_test_dirty(folio)) { - filemap_dirty_folio(mapping, folio); + 
if (filemap_dirty_folio(mapping, folio)) { inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); set_page_private_reference(&folio->page); return true; -- GitLab From d382e36970ecf8242921400db2afde15fb6ed49e Mon Sep 17 00:00:00 2001 From: Yonggil Song <yonggil.song@samsung.com> Date: Fri, 2 Sep 2022 11:07:49 +0900 Subject: [PATCH 1311/2223] f2fs: fix typo Fix typo in f2fs.h Detected by Jaeyoon Choi Signed-off-by: Yonggil Song <yonggil.song@samsung.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0cc2f7aa45dbe..d7bb7d4f9434c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -274,7 +274,7 @@ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino list */ UPDATE_INO, /* for update ino list */ - TRANS_DIR_INO, /* for trasactions dir ino list */ + TRANS_DIR_INO, /* for transactions dir ino list */ FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. list */ }; -- GitLab From 049ea86cb5c7212a6e7e617a67fe686f9b0b0669 Mon Sep 17 00:00:00 2001 From: Zhang Qilong <zhangqilong3@huawei.com> Date: Wed, 31 Aug 2022 17:48:15 +0800 Subject: [PATCH 1312/2223] f2fs: add static init_idisk_time function to reduce the code We can use a inner function to init the disk time of f2fs_inode_info for cleaning redundant code. 
Signed-off-by: Zhang Qilong <zhangqilong3@huawei.com> Reviewed-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/inode.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 6d11c365d7b4e..cde0a3dc80c3e 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -333,6 +333,16 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return true; } +static void init_idisk_time(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + fi->i_disk_time[0] = inode->i_atime; + fi->i_disk_time[1] = inode->i_ctime; + fi->i_disk_time[2] = inode->i_mtime; + fi->i_disk_time[3] = fi->i_crtime; +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -465,10 +475,7 @@ static int do_read_inode(struct inode *inode) } } - fi->i_disk_time[0] = inode->i_atime; - fi->i_disk_time[1] = inode->i_ctime; - fi->i_disk_time[2] = inode->i_mtime; - fi->i_disk_time[3] = fi->i_crtime; + init_idisk_time(inode); f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -676,11 +683,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (inode->i_nlink == 0) clear_page_private_inline(node_page); - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; - + init_idisk_time(inode); #ifdef CONFIG_F2FS_CHECK_FS f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page); #endif -- GitLab From 9df6d6f9be4754da96d3c91ec518ed974e6b81e7 Mon Sep 17 00:00:00 2001 From: Zhang Qilong <zhangqilong3@huawei.com> Date: Thu, 1 Sep 2022 15:19:37 +0800 Subject: [PATCH 1313/2223] f2fs: remove redundant check in f2fs_sanity_check_cluster It have checked "compressed" at the entry of f2fs_sanity_check_cluster, just remove the redundant check for better performance 
here. Signed-off-by: Zhang Qilong <zhangqilong3@huawei.com> Reviewed-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/compress.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 6baaff4c52baf..c16bab5bd6000 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -912,17 +912,15 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) reason = "[C|*|C|*]"; goto out; } - if (compressed) { - if (!__is_valid_data_blkaddr(blkaddr)) { - if (!cluster_end) - cluster_end = i; - continue; - } - /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ - if (cluster_end) { - reason = "[C|N|N|V]"; - goto out; - } + if (!__is_valid_data_blkaddr(blkaddr)) { + if (!cluster_end) + cluster_end = i; + continue; + } + /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ + if (cluster_end) { + reason = "[C|N|N|V]"; + goto out; } } return false; -- GitLab From 07725adc55c0a414c10acb5c8c86cea34b95ddef Mon Sep 17 00:00:00 2001 From: Zhang Qilong <zhangqilong3@huawei.com> Date: Mon, 5 Sep 2022 12:59:17 +0800 Subject: [PATCH 1314/2223] f2fs: fix race condition on setting FI_NO_EXTENT flag The following scenarios exist. process A: process B: ->f2fs_drop_extent_tree ->f2fs_update_extent_cache_range ->f2fs_update_extent_tree_range ->write_lock ->set_inode_flag ->is_inode_flag_set ->__free_extent_tree // Shouldn't // have been // cleaned up // here ->write_lock In this case, the "FI_NO_EXTENT" flag is set between f2fs_update_extent_tree_range and is_inode_flag_set by another process. It leads to clearing the whole extent tree, which should not have happened. We fix it by moving the setting of the flag into the range protected by write_lock.
Fixes:5f281fab9b9a3 ("f2fs: disable extent_cache for fcollapse/finsert inodes") Signed-off-by: Zhang Qilong <zhangqilong3@huawei.com> Reviewed-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/extent_cache.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 866e72b29bd5a..761fd42c93f23 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -804,9 +804,8 @@ void f2fs_drop_extent_tree(struct inode *inode) if (!f2fs_may_extent_tree(inode)) return; - set_inode_flag(inode, FI_NO_EXTENT); - write_lock(&et->lock); + set_inode_flag(inode, FI_NO_EXTENT); __free_extent_tree(sbi, et); if (et->largest.len) { et->largest.len = 0; -- GitLab From f3b23c785aa5d1920f479533f1d7361c2feceea5 Mon Sep 17 00:00:00 2001 From: Weichao Guo <guoweichao@oppo.com> Date: Wed, 7 Sep 2022 10:38:48 +0800 Subject: [PATCH 1315/2223] f2fs: let FI_OPU_WRITE override FADVISE_COLD_BIT Cold files may be fragmented due to SSR, defragment is needed as sequential reads are dominant scenarios of these files. FI_OPU_WRITE should override FADVISE_COLD_BIT to avoid defragment fails. 
Signed-off-by: Weichao Guo <guoweichao@oppo.com> Signed-off-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 87524d3dce223..a737eedef779f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2543,7 +2543,7 @@ bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) return true; /* if this is cold file, we should overwrite to avoid fragmentation */ - if (file_is_cold(inode)) + if (file_is_cold(inode) && !is_inode_flag_set(inode, FI_OPU_WRITE)) return true; return check_inplace_update_policy(inode, fio); -- GitLab From 0ef4ca04a3f9223ff8bc440041c524b2123e09a3 Mon Sep 17 00:00:00 2001 From: Chao Yu <chao@kernel.org> Date: Tue, 13 Sep 2022 10:08:41 +0800 Subject: [PATCH 1316/2223] f2fs: fix to do sanity check on destination blkaddr during recovery As Wenqing Liu reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216456 loop5: detected capacity change from 0 to 131072 F2FS-fs (loop5): recover_inode: ino = 6, name = hln, inline = 1 F2FS-fs (loop5): recover_data: ino = 6 (i_size: recover) err = 0 F2FS-fs (loop5): recover_inode: ino = 6, name = hln, inline = 1 F2FS-fs (loop5): recover_data: ino = 6 (i_size: recover) err = 0 F2FS-fs (loop5): recover_inode: ino = 6, name = hln, inline = 1 F2FS-fs (loop5): recover_data: ino = 6 (i_size: recover) err = 0 F2FS-fs (loop5): Bitmap was wrongly set, blk:5634 ------------[ cut here ]------------ WARNING: CPU: 3 PID: 1013 at fs/f2fs/segment.c:2198 RIP: 0010:update_sit_entry+0xa55/0x10b0 [f2fs] Call Trace: <TASK> f2fs_do_replace_block+0xa98/0x1890 [f2fs] f2fs_replace_block+0xeb/0x180 [f2fs] recover_data+0x1a69/0x6ae0 [f2fs] f2fs_recover_fsync_data+0x120d/0x1fc0 [f2fs] f2fs_fill_super+0x4665/0x61e0 [f2fs] mount_bdev+0x2cf/0x3b0 legacy_get_tree+0xed/0x1d0 vfs_get_tree+0x81/0x2b0 path_mount+0x47e/0x19d0 do_mount+0xce/0xf0 __x64_sys_mount+0x12c/0x1a0 
do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd If we enable CONFIG_F2FS_CHECK_FS config, it will trigger a kernel panic instead of warning. The root cause is: in fuzzed image, SIT table is inconsistent with inode mapping table, result in triggering such warning during SIT table update. This patch introduces a new flag DATA_GENERIC_ENHANCE_UPDATE, w/ this flag, data block recovery flow can check destination blkaddr's validation in SIT table, and skip f2fs_replace_block() to avoid inconsistent status. Cc: stable@vger.kernel.org Reported-by: Wenqing Liu <wenqingliu0120@gmail.com> Signed-off-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/checkpoint.c | 10 +++++++++- fs/f2fs/f2fs.h | 4 ++++ fs/f2fs/recovery.c | 8 ++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index cf315e3d244ca..c3119e4c890c0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -140,7 +140,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int segno, offset; bool exist; - if (type != DATA_GENERIC_ENHANCE && type != DATA_GENERIC_ENHANCE_READ) + if (type == DATA_GENERIC) return true; segno = GET_SEGNO(sbi, blkaddr); @@ -148,6 +148,13 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, se = get_seg_entry(sbi, segno); exist = f2fs_test_bit(offset, se->cur_valid_map); + if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) { + f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", + blkaddr, exist); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return exist; + } + if (!exist && type == DATA_GENERIC_ENHANCE) { f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", blkaddr, exist); @@ -185,6 +192,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, case DATA_GENERIC: case DATA_GENERIC_ENHANCE: case DATA_GENERIC_ENHANCE_READ: + case DATA_GENERIC_ENHANCE_UPDATE: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || 
blkaddr < MAIN_BLKADDR(sbi))) { f2fs_warn(sbi, "access invalid blkaddr:%u", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d7bb7d4f9434c..4636e14bcbf39 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -266,6 +266,10 @@ enum { * condition of read on truncated area * by extent_cache */ + DATA_GENERIC_ENHANCE_UPDATE, /* + * strong check on range and segment + * bitmap for update case + */ META_GENERIC, }; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index dcd0a1e350951..8326003e6918a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -698,6 +698,14 @@ retry_prev: goto err; } + if (f2fs_is_valid_blkaddr(sbi, dest, + DATA_GENERIC_ENHANCE_UPDATE)) { + f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u", + dest, inode->i_ino, dn.ofs_in_node); + err = -EFSCORRUPTED; + goto err; + } + /* write dummy data page */ f2fs_replace_block(sbi, &dn, src, dest, ni.version, false, false); -- GitLab From 1e8a9191ccc286bbbfc1f9dccd31ac3bc9ec8a3f Mon Sep 17 00:00:00 2001 From: Christian Brauner <brauner@kernel.org> Date: Fri, 9 Sep 2022 11:17:44 +0200 Subject: [PATCH 1317/2223] f2fs: port to vfs{g,u}id_t and associated helpers A while ago we introduced a dedicated vfs{g,u}id_t type in commit 1e5267cd0895 ("mnt_idmapping: add vfs{g,u}id_t"). We already switched over a good part of the VFS. Ultimately we will remove all legacy idmapped mount helpers that operate only on k{g,u}id_t in favor of the new type safe helpers that operate on vfs{g,u}id_t. 
Cc: Seth Forshee (Digital Ocean) <sforshee@kernel.org> Cc: Christoph Hellwig <hch@lst.de> Cc: Jaegeuk Kim <jaegeuk@kernel.org> Cc: Chao Yu <chao@kernel.org> Cc: linux-f2fs-devel@lists.sourceforge.net Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org> Reviewed-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/acl.c | 2 +- fs/f2fs/file.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index eaa240b21f071..5bbc44a5216e6 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -219,7 +219,7 @@ static int f2fs_acl_update_mode(struct user_namespace *mnt_userns, return error; if (error == 0) *acl = NULL; - if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 771f1f7f3690c..5efe0e4a725ae 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -871,9 +871,10 @@ static void __setattr_copy(struct user_namespace *mnt_userns, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); - if (!in_group_p(kgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!vfsgid_in_group_p(vfsgid) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; set_acl_inode(inode, mode); } -- GitLab From c6ad7fd16657ebd34a87a97d9588195aae87597d Mon Sep 17 00:00:00 2001 From: Chao Yu <chao@kernel.org> Date: Wed, 14 Sep 2022 19:51:51 +0800 Subject: [PATCH 1318/2223] f2fs: fix to do sanity check on summary info As Wenqing Liu reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216456 BUG: KASAN: use-after-free in recover_data+0x63ae/0x6ae0 [f2fs] Read of size 4 at addr ffff8881464dcd80 by task mount/1013 
CPU: 3 PID: 1013 Comm: mount Tainted: G W 6.0.0-rc4 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 Call Trace: dump_stack_lvl+0x45/0x5e print_report.cold+0xf3/0x68d kasan_report+0xa8/0x130 recover_data+0x63ae/0x6ae0 [f2fs] f2fs_recover_fsync_data+0x120d/0x1fc0 [f2fs] f2fs_fill_super+0x4665/0x61e0 [f2fs] mount_bdev+0x2cf/0x3b0 legacy_get_tree+0xed/0x1d0 vfs_get_tree+0x81/0x2b0 path_mount+0x47e/0x19d0 do_mount+0xce/0xf0 __x64_sys_mount+0x12c/0x1a0 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd The root cause is: in fuzzed image, SSA table is corrupted: ofs_in_node is larger than ADDRS_PER_PAGE(), result in out-of-range access on 4k-size page. - recover_data - do_recover_data - check_index_in_prev_nodes - f2fs_data_blkaddr This patch adds sanity check on summary info in recovery and GC flow in where the flows rely on them. After patch: [ 29.310883] F2FS-fs (loop0): Inconsistent ofs_in_node:65286 in summary, ino:0, nid:6, max:1018 Cc: stable@vger.kernel.org Reported-by: Wenqing Liu <wenqingliu0120@gmail.com> Signed-off-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/gc.c | 10 +++++++++- fs/f2fs/recovery.c | 15 ++++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index fd400d148afb2..3a820e5cdaee3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1078,7 +1078,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, { struct page *node_page; nid_t nid; - unsigned int ofs_in_node; + unsigned int ofs_in_node, max_addrs; block_t source_blkaddr; nid = le32_to_cpu(sum->nid); @@ -1104,6 +1104,14 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } + max_addrs = IS_INODE(node_page) ? 
DEF_ADDRS_PER_INODE : + DEF_ADDRS_PER_BLOCK; + if (ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", + ofs_in_node, dni->ino, dni->nid, max_addrs); + return false; + } + *nofs = ofs_of_node(node_page); source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 8326003e6918a..5c9facec98f69 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -474,7 +474,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, struct dnode_of_data tdn = *dn; nid_t ino, nid; struct inode *inode; - unsigned int offset; + unsigned int offset, ofs_in_node, max_addrs; block_t bidx; int i; @@ -501,15 +501,24 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, got_it: /* Use the locked dnode page and inode */ nid = le32_to_cpu(sum.nid); + ofs_in_node = le16_to_cpu(sum.ofs_in_node); + + max_addrs = ADDRS_PER_PAGE(dn->node_page, dn->inode); + if (ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", + ofs_in_node, dn->inode->i_ino, nid, max_addrs); + return -EFSCORRUPTED; + } + if (dn->inode->i_ino == nid) { tdn.nid = nid; if (!dn->inode_page_locked) lock_page(dn->inode_page); tdn.node_page = dn->inode_page; - tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + tdn.ofs_in_node = ofs_in_node; goto truncate_out; } else if (dn->nid == nid) { - tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + tdn.ofs_in_node = ofs_in_node; goto truncate_out; } -- GitLab From a834aa3ec95b0d1a465854b27016eec1af2f0e1f Mon Sep 17 00:00:00 2001 From: Zhang Qilong <zhangqilong3@huawei.com> Date: Mon, 19 Sep 2022 19:57:09 +0800 Subject: [PATCH 1319/2223] f2fs: add "c_len" into trace_f2fs_update_extent_tree_range for compressed file The trace_f2fs_update_extent_tree_range could not record compressed block length in the cluster of compress file and we just add it. 
Signed-off-by: Zhang Qilong <zhangqilong3@huawei.com> Reviewed-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/extent_cache.c | 4 ++-- include/trace/events/f2fs.h | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 761fd42c93f23..746abfda3b664 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -544,7 +544,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, if (!et) return; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len, 0); write_lock(&et->lock); @@ -675,7 +675,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, struct rb_node **insert_p = NULL, *insert_parent = NULL; bool leftmost = false; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen); + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen, c_len); /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ if (is_inode_flag_set(inode, FI_NO_EXTENT)) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index b262985f0c3a2..c6b372401c278 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1578,9 +1578,10 @@ TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, TRACE_EVENT(f2fs_update_extent_tree_range, TP_PROTO(struct inode *inode, unsigned int pgofs, block_t blkaddr, - unsigned int len), + unsigned int len, + unsigned int c_len), - TP_ARGS(inode, pgofs, blkaddr, len), + TP_ARGS(inode, pgofs, blkaddr, len, c_len), TP_STRUCT__entry( __field(dev_t, dev) @@ -1588,6 +1589,7 @@ TRACE_EVENT(f2fs_update_extent_tree_range, __field(unsigned int, pgofs) __field(u32, blk) __field(unsigned int, len) + __field(unsigned int, c_len) ), TP_fast_assign( @@ -1596,14 +1598,17 @@ TRACE_EVENT(f2fs_update_extent_tree_range, __entry->pgofs = pgofs; __entry->blk = blkaddr; __entry->len = len; + 
__entry->c_len = c_len; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " - "blkaddr = %u, len = %u", + "blkaddr = %u, len = %u, " + "c_len = %u", show_dev_ino(__entry), __entry->pgofs, __entry->blk, - __entry->len) + __entry->len, + __entry->c_len) ); TRACE_EVENT(f2fs_shrink_extent_tree, -- GitLab From 544b53dadc208278fd0796f2c22ea24a3fe16564 Mon Sep 17 00:00:00 2001 From: Zhang Qilong <zhangqilong3@huawei.com> Date: Wed, 14 Sep 2022 09:33:22 +0800 Subject: [PATCH 1320/2223] f2fs: code clean and fix a type error ERROR: code indent should use tabs where possible ERROR: spaces required around that ':' ERROR: incorrect tab Found several code style errors while reviewing the code; fix them. There is no functional change. Signed-off-by: Zhang Qilong <zhangqilong3@huawei.com> Reviewed-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/data.c | 2 +- fs/f2fs/debug.c | 2 +- fs/f2fs/extent_cache.c | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/node.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a737eedef779f..b90f5f39da78b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -723,7 +723,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? - __read_io_type(page): WB_DATA_TYPE(fio->page)); + __read_io_type(page) : WB_DATA_TYPE(fio->page)); __submit_bio(fio->sbi, bio, fio->type); return 0; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index c01471573977a..29cf5b6b23414 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -347,7 +347,7 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", si->sbi->sb->s_bdev, i++, - f2fs_readonly(si->sbi->sb) ? "RO": "RW", + f2fs_readonly(si->sbi->sb) ? "RO" : "RW", is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? "Disabled" : (f2fs_cp_error(si->sbi) ?
"Error" : "Good")); if (si->sbi->s_flag) { diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 746abfda3b664..932c070173b97 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -583,7 +583,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, org_end = dei.fofs + dei.len; f2fs_bug_on(sbi, pos >= org_end); - if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { en->ei.len = pos - en->ei.fofs; prev_en = en; parts = 1; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5efe0e4a725ae..4020f5e72a2c4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4622,7 +4622,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) skip_write_trace: /* Do the actual write. */ ret = dio ? - f2fs_dio_write_iter(iocb, from, &may_need_sync): + f2fs_dio_write_iter(iocb, from, &may_need_sync) : f2fs_buffered_write_iter(iocb, from); if (trace_f2fs_datawrite_end_enabled()) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 23291f1575d35..9263bf5f10d37 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -585,7 +585,7 @@ retry: ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); } - up_read(&curseg->journal_rwsem); + up_read(&curseg->journal_rwsem); if (i >= 0) { f2fs_up_read(&nm_i->nat_tree_lock); goto cache; -- GitLab From d80afefb17e01aa0c46a8eebc01882e0ebd8b0f6 Mon Sep 17 00:00:00 2001 From: Chao Yu <chao@kernel.org> Date: Wed, 14 Sep 2022 21:28:46 +0800 Subject: [PATCH 1321/2223] f2fs: fix to account FS_CP_DATA_IO correctly f2fs_inode_info.cp_task was introduced for FS_CP_DATA_IO accounting since commit b0af6d491a6b ("f2fs: add app/fs io stat"). 
However, cp_task usage coverage has been increased due to below commits: commit 040d2bb318d1 ("f2fs: fix to avoid deadloop if data_flush is on") commit 186857c5a14a ("f2fs: fix potential recursive call when enabling data_flush") So that, if data_flush mountoption is on, when data flush was triggered from background, the IO from data flush will be accounted as checkpoint IO type incorrectly. In order to fix this issue, this patch splits cp_task into two: a) cp_task: used for IO accounting b) wb_task: used to avoid deadlock Fixes: 040d2bb318d1 ("f2fs: fix to avoid deadloop if data_flush is on") Fixes: 186857c5a14a ("f2fs: fix potential recursive call when enabling data_flush") Signed-off-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/checkpoint.c | 13 +++++++++---- fs/f2fs/data.c | 4 ++-- fs/f2fs/f2fs.h | 4 +++- fs/f2fs/segment.c | 2 +- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c3119e4c890c0..308b70812cbd8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1061,7 +1061,8 @@ void f2fs_remove_dirty_inode(struct inode *inode) spin_unlock(&sbi->inode_lock[type]); } -int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, + bool from_cp) { struct list_head *head; struct inode *inode; @@ -1096,11 +1097,15 @@ retry: if (inode) { unsigned long cur_ino = inode->i_ino; - F2FS_I(inode)->cp_task = current; + if (from_cp) + F2FS_I(inode)->cp_task = current; + F2FS_I(inode)->wb_task = current; filemap_fdatawrite(inode->i_mapping); - F2FS_I(inode)->cp_task = NULL; + F2FS_I(inode)->wb_task = NULL; + if (from_cp) + F2FS_I(inode)->cp_task = NULL; iput(inode); /* We need to give cpu to another writers. 
*/ @@ -1229,7 +1234,7 @@ retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - err = f2fs_sync_dirty_inodes(sbi, DIR_INODE); + err = f2fs_sync_dirty_inodes(sbi, DIR_INODE, true); if (err) return err; cond_resched(); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b90f5f39da78b..a45b6ab2e2a57 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2856,7 +2856,7 @@ out: } unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && - !F2FS_I(inode)->cp_task && allow_balance) + !F2FS_I(inode)->wb_task && allow_balance) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { @@ -3156,7 +3156,7 @@ static inline bool __should_serialize_io(struct inode *inode, struct writeback_control *wbc) { /* to avoid deadlock in path of data flush */ - if (F2FS_I(inode)->cp_task) + if (F2FS_I(inode)->wb_task) return false; if (!S_ISREG(inode->i_mode)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4636e14bcbf39..c494c40b644b0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -786,6 +786,7 @@ struct f2fs_inode_info { unsigned int clevel; /* maximum level of given file name */ struct task_struct *task; /* lookup and create consistency */ struct task_struct *cp_task; /* separate cp/wb IO stats*/ + struct task_struct *wb_task; /* indicate inode is in context of writeback */ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ spinlock_t i_size_lock; /* protect last_disk_size */ @@ -3745,7 +3746,8 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi); int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio); void f2fs_remove_dirty_inode(struct inode *inode); -int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, + bool from_cp); void 
f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type); u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi); int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 460048f3c850d..3f14c0a4fb89e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -476,7 +476,7 @@ do_sync: mutex_lock(&sbi->flush_lock); blk_start_plug(&plug); - f2fs_sync_dirty_inodes(sbi, FILE_INODE); + f2fs_sync_dirty_inodes(sbi, FILE_INODE, false); blk_finish_plug(&plug); mutex_unlock(&sbi->flush_lock); -- GitLab From fcc2d8cc96b2f6141bbbe5b1e8953db990794b44 Mon Sep 17 00:00:00 2001 From: Chao Yu <chao@kernel.org> Date: Tue, 13 Sep 2022 15:48:12 +0800 Subject: [PATCH 1322/2223] f2fs: fix to detect corrupted meta ino It is possible that ino of dirent or orphan inode is corrupted in a fuzzed image, occasionally, if corrupted ino is equal to meta ino: meta_ino, node_ino or compress_ino, caller of f2fs_iget() from below call paths will get meta inode directly, it's not allowed, let's add sanity check to detect such cases. 
case #1 - recover_dentry - __f2fs_find_entry - f2fs_iget_retry case #2 - recover_orphan_inode - f2fs_iget_retry Signed-off-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/inode.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index cde0a3dc80c3e..93ec216da3e1b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -487,6 +487,12 @@ static int do_read_inode(struct inode *inode) return 0; } +static bool is_meta_ino(struct f2fs_sb_info *sbi, unsigned int ino) +{ + return ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi) || + ino == F2FS_COMPRESS_INO(sbi); +} + struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -498,16 +504,21 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { + if (is_meta_ino(sbi, ino)) { + f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + ret = -EFSCORRUPTED; + trace_f2fs_iget_exit(inode, ret); + iput(inode); + return ERR_PTR(ret); + } + trace_f2fs_iget(inode); return inode; } - if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) - goto make_now; -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (ino == F2FS_COMPRESS_INO(sbi)) + if (is_meta_ino(sbi, ino)) goto make_now; -#endif ret = do_read_inode(inode); if (ret) -- GitLab From 718693c84d8f4b235d030c377258f12f38a71c67 Mon Sep 17 00:00:00 2001 From: Chao Yu <chao@kernel.org> Date: Tue, 27 Sep 2022 10:44:47 +0800 Subject: [PATCH 1323/2223] f2fs: introduce cp_status sysfs entry This patch adds a new sysfs entry named cp_status, it can output checkpoint flags in real time. 
Signed-off-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- Documentation/ABI/testing/sysfs-fs-f2fs | 24 ++++++++++++++++++++++++ fs/f2fs/sysfs.c | 8 ++++++++ 2 files changed, 32 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 083ac2d63eefd..483639fb727b2 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -466,6 +466,30 @@ Description: Show status of f2fs superblock in real time. 0x4000 SBI_IS_FREEZING freefs is in process ====== ===================== ================================= +What: /sys/fs/f2fs/<disk>/stat/cp_status +Date: September 2022 +Contact: "Chao Yu" <chao.yu@oppo.com> +Description: Show status of f2fs checkpoint in real time. + + =============================== ============================== + cp flag value + CP_UMOUNT_FLAG 0x00000001 + CP_ORPHAN_PRESENT_FLAG 0x00000002 + CP_COMPACT_SUM_FLAG 0x00000004 + CP_ERROR_FLAG 0x00000008 + CP_FSCK_FLAG 0x00000010 + CP_FASTBOOT_FLAG 0x00000020 + CP_CRC_RECOVERY_FLAG 0x00000040 + CP_NAT_BITS_FLAG 0x00000080 + CP_TRIMMED_FLAG 0x00000100 + CP_NOCRC_RECOVERY_FLAG 0x00000200 + CP_LARGE_NAT_BITMAP_FLAG 0x00000400 + CP_QUOTA_NEED_FSCK_FLAG 0x00000800 + CP_DISABLED_FLAG 0x00001000 + CP_DISABLED_QUICK_FLAG 0x00002000 + CP_RESIZEFS_FLAG 0x00004000 + =============================== ============================== + What: /sys/fs/f2fs/<disk>/ckpt_thread_ioprio Date: January 2021 Contact: "Daeho Jeong" <daehojeong@google.com> diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 39ebf0ad133a9..df27afd71ef48 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -128,6 +128,12 @@ static ssize_t sb_status_show(struct f2fs_attr *a, return sprintf(buf, "%lx\n", sbi->s_flag); } +static ssize_t cp_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); +} + static ssize_t 
pending_discard_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -1029,8 +1035,10 @@ static struct attribute *f2fs_feat_attrs[] = { ATTRIBUTE_GROUPS(f2fs_feat); F2FS_GENERAL_RO_ATTR(sb_status); +F2FS_GENERAL_RO_ATTR(cp_status); static struct attribute *f2fs_stat_attrs[] = { ATTR_LIST(sb_status), + ATTR_LIST(cp_status), NULL, }; ATTRIBUTE_GROUPS(f2fs_stat); -- GitLab From ca7efd71c3dffd5442b448dd553a903425222597 Mon Sep 17 00:00:00 2001 From: Zhang Qilong <zhangqilong3@huawei.com> Date: Fri, 23 Sep 2022 15:17:55 +0800 Subject: [PATCH 1324/2223] f2fs: remove the unnecessary check in f2fs_xattr_fiemap Whether or not an error occurs, the "err == 1" check in f2fs_xattr_fiemap() is redundant, since testing "err" alone already covers that case; remove it. Signed-off-by: Zhang Qilong <zhangqilong3@huawei.com> Reviewed-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a45b6ab2e2a57..a921cd40db785 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1816,7 +1816,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); trace_f2fs_fiemap(inode, 0, phys, len, flags, err); - if (err || err == 1) + if (err) return err; } -- GitLab From a9cfee0ef98e99c8b1951dfd1d57a88580354d0d Mon Sep 17 00:00:00 2001 From: Chao Yu <chao@kernel.org> Date: Wed, 28 Sep 2022 23:38:53 +0800 Subject: [PATCH 1325/2223] f2fs: support recording stop_checkpoint reason into super_block This patch adds support for recording the stop_checkpoint reason in f2fs_super_block.s_stop_reason[].
Signed-off-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/checkpoint.c | 10 +++++++--- fs/f2fs/data.c | 6 ++++-- fs/f2fs/f2fs.h | 4 +++- fs/f2fs/file.c | 11 ++++++----- fs/f2fs/gc.c | 6 ++++-- fs/f2fs/inode.c | 3 ++- fs/f2fs/segment.c | 5 +++-- fs/f2fs/super.c | 20 ++++++++++++++++++++ include/linux/f2fs_fs.h | 17 ++++++++++++++++- 9 files changed, 65 insertions(+), 17 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 308b70812cbd8..0c82dae082aa9 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -26,12 +26,16 @@ static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; -void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason) { f2fs_build_fault_attr(sbi, 0, 0); set_ckpt_flags(sbi, CP_ERROR_FLAG); - if (!end_io) + if (!end_io) { f2fs_flush_merged_writes(sbi); + + f2fs_handle_stop(sbi, reason); + } } /* @@ -122,7 +126,7 @@ retry: if (PTR_ERR(page) == -EIO && ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE); } return page; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a921cd40db785..3f2210e54577f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -333,7 +333,8 @@ static void f2fs_write_end_io(struct bio *bio) mempool_free(page, sbi->write_io_dummy); if (unlikely(bio->bi_status)) - f2fs_stop_checkpoint(sbi, true); + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); continue; } @@ -349,7 +350,8 @@ static void f2fs_write_end_io(struct bio *bio) if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); if (type == F2FS_WB_CP_DATA) - f2fs_stop_checkpoint(sbi, true); + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); } f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 
c494c40b644b0..7d56948273bef 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3556,6 +3556,7 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); +void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); @@ -3715,7 +3716,8 @@ static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) /* * checkpoint.c */ -void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason); void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4020f5e72a2c4..c86e5e1601c98 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2145,7 +2145,8 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (ret) { if (ret == -EROFS) { ret = 0; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); trace_f2fs_shutdown(sbi, in, ret); } @@ -2158,7 +2159,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) ret = freeze_bdev(sb->s_bdev); if (ret) goto out; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); thaw_bdev(sb->s_bdev); break; @@ -2167,16 +2168,16 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) ret = f2fs_sync_fs(sb, 1); if (ret) goto out; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); 
set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NOSYNC: - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_METAFLUSH: f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NEED_FSCK: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3a820e5cdaee3..6e42dad0ac2d0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -74,7 +74,8 @@ static int gc_thread_func(void *data) if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FAULT_INJECT); } if (!sb_start_write_trylock(sbi->sb)) { @@ -1712,7 +1713,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT", segno, type, GET_SUM_TYPE((&sum->footer))); set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_SUMMARY); goto skip; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 93ec216da3e1b..c972276027b49 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -713,7 +713,8 @@ retry: cond_resched(); goto retry; } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_UPDATE_INODE); } return; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3f14c0a4fb89e..54c86a5518597 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -376,7 +376,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); } /* 
balance_fs_bg is able to be pending */ @@ -694,7 +694,8 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) } while (ret && --count); if (ret) { - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FLUSH_FAIL); break; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b8e5fe2445968..2533d309a9240 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3846,6 +3846,26 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int err; + + f2fs_bug_on(sbi, reason >= MAX_STOP_REASON); + + f2fs_down_write(&sbi->sb_lock); + + if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1)) + raw_super->s_stop_reason[reason]++; + + err = f2fs_commit_super(sbi, false); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record reason:%u err:%d", + reason, err); + + f2fs_up_write(&sbi->sb_lock); +} + static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index d445150c5350f..5dd1e52b8997f 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -73,6 +73,20 @@ struct f2fs_device { __le32 total_segments; } __packed; +/* reason of stop_checkpoint */ +enum stop_cp_reason { + STOP_CP_REASON_SHUTDOWN, + STOP_CP_REASON_FAULT_INJECT, + STOP_CP_REASON_META_PAGE, + STOP_CP_REASON_WRITE_FAIL, + STOP_CP_REASON_CORRUPTED_SUMMARY, + STOP_CP_REASON_UPDATE_INODE, + STOP_CP_REASON_FLUSH_FAIL, + STOP_CP_REASON_MAX, +}; + +#define MAX_STOP_REASON 32 + struct f2fs_super_block { __le32 magic; /* Magic Number */ __le16 major_ver; /* Major Version */ @@ -116,7 +130,8 @@ struct f2fs_super_block { __u8 hot_ext_count; /* # of hot file extension */ __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ - 
__u8 reserved[306]; /* valid reserved region */ + __u8 s_stop_reason[MAX_STOP_REASON]; /* stop checkpoint reason */ + __u8 reserved[274]; /* valid reserved region */ __le32 crc; /* checksum of superblock */ } __packed; -- GitLab From 95fa90c9e5a7f14c2497d5b032544478c9377c3a Mon Sep 17 00:00:00 2001 From: Chao Yu <chao@kernel.org> Date: Wed, 28 Sep 2022 23:38:54 +0800 Subject: [PATCH 1326/2223] f2fs: support recording errors into superblock This patch supports to record detail reason of FSCORRUPTED error into f2fs_super_block.s_errors[]. Signed-off-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/compress.c | 2 ++ fs/f2fs/data.c | 24 +++++++++++++++++--- fs/f2fs/dir.c | 1 + fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/file.c | 12 ++++++++-- fs/f2fs/gc.c | 2 ++ fs/f2fs/inline.c | 2 ++ fs/f2fs/inode.c | 6 ++++- fs/f2fs/node.c | 2 ++ fs/f2fs/recovery.c | 6 +++++ fs/f2fs/segment.c | 11 +++++++++ fs/f2fs/segment.h | 2 ++ fs/f2fs/super.c | 49 +++++++++++++++++++++++++++++++++++++++-- fs/f2fs/verity.c | 2 ++ fs/f2fs/xattr.c | 8 +++++++ include/linux/f2fs_fs.h | 25 ++++++++++++++++++++- 16 files changed, 150 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index c16bab5bd6000..d315c2de136f2 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -762,6 +762,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); goto out_release; } @@ -950,6 +951,7 @@ static int __f2fs_cluster_blocks(struct inode *inode, if (f2fs_sanity_check_cluster(&dn)) { ret = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_CLUSTER); goto fail; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3f2210e54577f..1c82a4a4e8616 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -705,8 +705,10 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if 
(!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, fio->is_por ? META_POR : (__is_meta_io(fio) ? - META_GENERIC : DATA_GENERIC_ENHANCE))) + META_GENERIC : DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } trace_f2fs_submit_page_bio(page, fio); @@ -906,8 +908,10 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) fio->encrypted_page : fio->page; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, - __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) + __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) { + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } trace_f2fs_submit_page_bio(page, fio); @@ -1217,6 +1221,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto put_err; } goto got_it; @@ -1237,6 +1243,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, dn.data_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto put_err; } got_it: @@ -1550,6 +1558,7 @@ next_block: if (__is_valid_data_blkaddr(blkaddr) && !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto sync_out; } @@ -1595,6 +1604,8 @@ next_block: (flag != F2FS_GET_BLOCK_FIEMAP || IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_CORRUPTED_CLUSTER); goto sync_out; } if (flag == F2FS_GET_BLOCK_BMAP) { @@ -2076,6 +2087,8 @@ got_it: if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, DATA_GENERIC_ENHANCE_READ)) { ret = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto out; } } else { @@ -2619,8 +2632,11 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) 
fio->old_blkaddr = ei.blk + page->index - ei.fofs; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, - DATA_GENERIC_ENHANCE)) + DATA_GENERIC_ENHANCE)) { + f2fs_handle_error(fio->sbi, + ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } ipu_force = true; fio->need_lock = LOCK_DONE; @@ -2648,6 +2664,7 @@ got_it: !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); goto out_writepage; } @@ -3561,6 +3578,7 @@ repeat: if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; } err = f2fs_submit_page_read(inode, page, blkaddr, 0, true); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index d5bd7932fb642..21960a899b6ad 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1041,6 +1041,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, __func__, le16_to_cpu(de->name_len)); set_sbi_flag(sbi, SBI_NEED_FSCK); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_DIRENT); goto out; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7d56948273bef..b63b482c35a85 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1815,6 +1815,10 @@ struct f2fs_sb_info { struct workqueue_struct *post_read_wq; /* post read workqueue */ + unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */ + spinlock_t error_lock; /* protect errors array */ + bool error_dirty; /* errors of sb is dirty */ + struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ @@ -3557,6 +3561,7 @@ int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason); +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct 
f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c86e5e1601c98..7b3ed4a9bb46e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1156,6 +1156,7 @@ next_dnode: !f2fs_is_valid_blkaddr(sbi, *blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } @@ -1440,6 +1441,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr, DATA_GENERIC_ENHANCE)) { ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); break; } @@ -3323,8 +3325,10 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (!__is_valid_data_blkaddr(blkaddr)) continue; if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, - DATA_GENERIC_ENHANCE))) + DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } } while (count) { @@ -3485,8 +3489,10 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (!__is_valid_data_blkaddr(blkaddr)) continue; if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, - DATA_GENERIC_ENHANCE))) + DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } } while (count) { @@ -3758,6 +3764,8 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) DATA_GENERIC_ENHANCE)) { ret = -EFSCORRUPTED; f2fs_put_dnode(&dn); + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); goto out; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6e42dad0ac2d0..d36bcb23ccfec 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1164,6 +1164,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto put_page; } 
goto got_it; @@ -1182,6 +1183,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto put_page; } got_it: diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 73da933180369..21a495234ffd7 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -160,6 +160,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) set_sbi_flag(fio.sbi, SBI_NEED_FSCK); f2fs_warn(fio.sbi, "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", __func__, dn->inode->i_ino, dn->data_blkaddr); + f2fs_handle_error(fio.sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } @@ -412,6 +413,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); f2fs_warn(F2FS_P_SB(page), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", __func__, dir->i_ino, dn.data_blkaddr); + f2fs_handle_error(F2FS_P_SB(page), ERROR_INVALID_BLKADDR); err = -EFSCORRUPTED; goto out; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index c972276027b49..9f0d3864d9f13 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -81,8 +81,10 @@ static int __written_first_block(struct f2fs_sb_info *sbi, if (!__is_valid_data_blkaddr(addr)) return 1; - if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) + if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } return 0; } @@ -415,6 +417,7 @@ static int do_read_inode(struct inode *inode) if (!sanity_check_inode(inode, node_page)) { f2fs_put_page(node_page, 1); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; } @@ -510,6 +513,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) ret = -EFSCORRUPTED; trace_f2fs_iget_exit(inode, ret); iput(inode); + f2fs_handle_error(sbi, 
ERROR_CORRUPTED_INODE); return ERR_PTR(ret); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9263bf5f10d37..983572f238969 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -36,6 +36,7 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.", __func__, nid); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; } return 0; @@ -1295,6 +1296,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) if (unlikely(new_ni.blk_addr != NULL_ADDR)) { err = -EFSCORRUPTED; set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; } #endif diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 5c9facec98f69..dea95b48b647d 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -507,6 +507,7 @@ got_it: if (ofs_in_node >= max_addrs) { f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", ofs_in_node, dn->inode->i_ino, nid, max_addrs); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUMMARY); return -EFSCORRUPTED; } @@ -637,6 +638,7 @@ retry_dn: inode->i_ino, ofs_of_node(dn.node_page), ofs_of_node(page)); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); goto err; } @@ -649,12 +651,14 @@ retry_dn: if (__is_valid_data_blkaddr(src) && !f2fs_is_valid_blkaddr(sbi, src, META_POR)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto err; } if (__is_valid_data_blkaddr(dest) && !f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto err; } @@ -712,6 +716,8 @@ retry_prev: f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u", dest, inode->i_ino, dn.ofs_in_node); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); goto err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 54c86a5518597..d7b13127b0b8a 100644 --- 
a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -312,6 +312,8 @@ static int __f2fs_commit_atomic_write(struct inode *inode) DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); goto out; } @@ -3433,6 +3435,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", __func__, segno); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); goto drop_bio; } @@ -4381,6 +4384,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (se->type >= NR_PERSISTENT_LOG) { f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); + f2fs_handle_error(sbi, + ERROR_INCONSISTENT_SUM_TYPE); return -EFSCORRUPTED; } @@ -4417,6 +4422,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Wrong journal entry on segno %u", start); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_JOURNAL); break; } @@ -4436,6 +4442,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); break; } @@ -4467,6 +4474,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", sit_valid_blocks[NODE], valid_node_count(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NODE_COUNT); return -EFSCORRUPTED; } @@ -4475,6 +4483,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", sit_valid_blocks[DATA], sit_valid_blocks[NODE], valid_user_blocks(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_BLOCK_COUNT); return -EFSCORRUPTED; } @@ -4625,6 +4634,7 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Current segment has invalid alloc_type:%d", curseg->alloc_type); + f2fs_handle_error(sbi, 
ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } @@ -4642,6 +4652,7 @@ out: "Current segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u", i, curseg->segno, curseg->alloc_type, curseg->next_blkoff, blkofs); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index d1d63766f2c7e..be8f2d7d007b9 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -753,6 +753,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Mismatch valid blocks %d vs. %d", GET_SIT_VBLOCKS(raw_sit), valid_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } @@ -767,6 +768,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Wrong valid blocks %d or segno %u", GET_SIT_VBLOCKS(raw_sit), segno); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } return 0; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2533d309a9240..6cf72fbf20541 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3851,8 +3851,6 @@ void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); int err; - f2fs_bug_on(sbi, reason >= MAX_STOP_REASON); - f2fs_down_write(&sbi->sb_lock); if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1)) @@ -3862,7 +3860,51 @@ void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) if (err) f2fs_err(sbi, "f2fs_commit_super fails to record reason:%u err:%d", reason, err); + f2fs_up_write(&sbi->sb_lock); +} + +static void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) +{ + spin_lock(&sbi->error_lock); + if (!test_bit(flag, (unsigned long *)sbi->errors)) { + set_bit(flag, (unsigned long *)sbi->errors); + sbi->error_dirty = true; + } + spin_unlock(&sbi->error_lock); +} + 
+static bool f2fs_update_errors(struct f2fs_sb_info *sbi) +{ + bool need_update = false; + + spin_lock(&sbi->error_lock); + if (sbi->error_dirty) { + memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors, + MAX_F2FS_ERRORS); + sbi->error_dirty = false; + need_update = true; + } + spin_unlock(&sbi->error_lock); + + return need_update; +} +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error) +{ + int err; + + f2fs_save_errors(sbi, error); + + f2fs_down_write(&sbi->sb_lock); + + if (!f2fs_update_errors(sbi)) + goto out_unlock; + + err = f2fs_commit_super(sbi, false); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d", + error, err); +out_unlock: f2fs_up_write(&sbi->sb_lock); } @@ -4213,6 +4255,9 @@ try_onemore: goto free_devices; } + spin_lock_init(&sbi->error_lock); + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 97ec60f39d696..f0805e51b3fed 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -240,6 +240,8 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes || pos < f2fs_verity_metadata_pos(inode) || size > INT_MAX) { f2fs_warn(F2FS_I_SB(inode), "invalid verity xattr"); + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_VERITY_XATTR); return -EFSCORRUPTED; } if (buf_size) { diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index c76c15086e5f5..dc2e8637189e2 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -367,6 +367,8 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto out; } check: @@ -583,6 +585,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t 
buffer_size) inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto cleanup; } @@ -658,6 +662,8 @@ static int __f2fs_setxattr(struct inode *inode, int index, inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto exit; } @@ -684,6 +690,8 @@ static int __f2fs_setxattr(struct inode *inode, int index, inode->i_ino, ENTRY_SIZE(last)); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto exit; } last = XATTR_NEXT_ENTRY(last); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 5dd1e52b8997f..ee0d75d9a302d 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -87,6 +87,28 @@ enum stop_cp_reason { #define MAX_STOP_REASON 32 +/* detail reason for EFSCORRUPTED */ +enum f2fs_error { + ERROR_CORRUPTED_CLUSTER, + ERROR_FAIL_DECOMPRESSION, + ERROR_INVALID_BLKADDR, + ERROR_CORRUPTED_DIRENT, + ERROR_CORRUPTED_INODE, + ERROR_INCONSISTENT_SUMMARY, + ERROR_INCONSISTENT_FOOTER, + ERROR_INCONSISTENT_SUM_TYPE, + ERROR_CORRUPTED_JOURNAL, + ERROR_INCONSISTENT_NODE_COUNT, + ERROR_INCONSISTENT_BLOCK_COUNT, + ERROR_INVALID_CURSEG, + ERROR_INCONSISTENT_SIT, + ERROR_CORRUPTED_VERITY_XATTR, + ERROR_CORRUPTED_XATTR, + ERROR_MAX, +}; + +#define MAX_F2FS_ERRORS 16 + struct f2fs_super_block { __le32 magic; /* Magic Number */ __le16 major_ver; /* Major Version */ @@ -131,7 +153,8 @@ struct f2fs_super_block { __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ __u8 s_stop_reason[MAX_STOP_REASON]; /* stop checkpoint reason */ - __u8 reserved[274]; /* valid reserved region */ + __u8 s_errors[MAX_F2FS_ERRORS]; /* reason of image corrupts */ + __u8 reserved[258]; /* valid reserved region */ __le32 crc; /* checksum of superblock */ 
} __packed; -- GitLab From 52f1c45dde9136f964d63a77d19826c8a74e2c7f Mon Sep 17 00:00:00 2001 From: Dominique Martinet <asmadeus@codewreck.org> Date: Wed, 17 Aug 2022 14:58:44 +0900 Subject: [PATCH 1327/2223] 9p: trans_fd/p9_conn_cancel: drop client lock earlier syzbot reported a double-lock here and we no longer need this lock after requests have been moved off to local list: just drop the lock earlier. Link: https://lkml.kernel.org/r/20220904064028.1305220-1-asmadeus@codewreck.org Reported-by: syzbot+50f7e8d06c3768dd97f3@syzkaller.appspotmail.com Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> Tested-by: Schspa Shi <schspa@gmail.com> --- net/9p/trans_fd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index e758978b44bee..60fcc6b30b468 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -205,6 +205,8 @@ static void p9_conn_cancel(struct p9_conn *m, int err) list_move(&req->req_list, &cancel_list); } + spin_unlock(&m->client->lock); + list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req); list_del(&req->req_list); @@ -212,7 +214,6 @@ static void p9_conn_cancel(struct p9_conn *m, int err) req->t_err = err; p9_client_cb(m->client, req, REQ_STATUS_ERROR); } - spin_unlock(&m->client->lock); } static __poll_t -- GitLab From e7c6219778e46143ee9e68a25febac10a66383ae Mon Sep 17 00:00:00 2001 From: Christian Schoenebeck <linux_oss@crudebyte.com> Date: Fri, 15 Jul 2022 23:32:28 +0200 Subject: [PATCH 1328/2223] net/9p: split message size argument into 't_size' and 'r_size' pair Refactor 'max_size' argument of p9_tag_alloc() and 'req_size' argument of p9_client_prepare_req() both into a pair of arguments 't_size' and 'r_size' respectively to allow handling the buffer size for request and reply separately from each other. 
Link: https://lkml.kernel.org/r/9431a25fe4b37fd12cecbd715c13af71f701f220.1657920926.git.linux_oss@crudebyte.com Signed-off-by: Christian Schoenebeck <linux_oss@crudebyte.com> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> --- net/9p/client.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/net/9p/client.c b/net/9p/client.c index 0a6110e15d0f8..0bd7e43e5c4fc 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -255,24 +255,26 @@ static struct kmem_cache *p9_req_cache; * p9_tag_alloc - Allocate a new request. * @c: Client session. * @type: Transaction type. - * @max_size: Maximum packet size for this request. + * @t_size: Buffer size for holding this request. + * @r_size: Buffer size for holding server's reply on this request. * * Context: Process context. * Return: Pointer to new request. */ static struct p9_req_t * -p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size) +p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size) { struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS); - int alloc_msize = min(c->msize, max_size); + int alloc_tsize = min(c->msize, t_size); + int alloc_rsize = min(c->msize, r_size); int tag; if (!req) return ERR_PTR(-ENOMEM); - if (p9_fcall_init(c, &req->tc, alloc_msize)) + if (p9_fcall_init(c, &req->tc, alloc_tsize)) goto free_req; - if (p9_fcall_init(c, &req->rc, alloc_msize)) + if (p9_fcall_init(c, &req->rc, alloc_rsize)) goto free; p9pdu_reset(&req->tc); @@ -592,7 +594,7 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) } static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, - int8_t type, int req_size, + int8_t type, uint t_size, uint r_size, const char *fmt, va_list ap) { int err; @@ -608,7 +610,7 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, if (c->status == BeginDisconnect && type != P9_TCLUNK) return ERR_PTR(-EIO); - req = p9_tag_alloc(c, type, req_size); + req = 
p9_tag_alloc(c, type, t_size, r_size); if (IS_ERR(req)) return req; @@ -645,7 +647,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) struct p9_req_t *req; va_start(ap, fmt); - req = p9_client_prepare_req(c, type, c->msize, fmt, ap); + req = p9_client_prepare_req(c, type, c->msize, c->msize, fmt, ap); va_end(ap); if (IS_ERR(req)) return req; @@ -743,7 +745,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, /* We allocate a inline protocol data of only 4k bytes. * The actual content is passed in zero-copy fashion. */ - req = p9_client_prepare_req(c, type, P9_ZC_HDR_SZ, fmt, ap); + req = p9_client_prepare_req(c, type, P9_ZC_HDR_SZ, P9_ZC_HDR_SZ, fmt, ap); va_end(ap); if (IS_ERR(req)) return req; -- GitLab From 58d331312bf78a10740fc3c6c370c98e8c53fa6b Mon Sep 17 00:00:00 2001 From: Christian Schoenebeck <linux_oss@crudebyte.com> Date: Fri, 15 Jul 2022 23:32:30 +0200 Subject: [PATCH 1329/2223] 9p: add P9_ERRMAX for 9p2000 and 9p2000.u Add P9_ERRMAX macro to 9P protocol header which reflects the maximum error string length of Rerror replies for 9p2000 and 9p2000.u protocol versions. Unfortunately a maximum error string length is not defined by the 9p2000 spec, picking 128 as value for now, as this seems to be a common max. size for POSIX error strings in practice. 9p2000.L protocol version uses Rlerror replies instead which does not contain an error string. 
Link: https://lkml.kernel.org/r/3f23191d21032e7c14852b1e1a4ae26417a36739.1657920926.git.linux_oss@crudebyte.com Signed-off-by: Christian Schoenebeck <linux_oss@crudebyte.com> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> --- include/net/9p/9p.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h index 24a509f559ee2..13abe013af21c 100644 --- a/include/net/9p/9p.h +++ b/include/net/9p/9p.h @@ -331,6 +331,9 @@ enum p9_qid_t { /* size of header for zero copy read/write */ #define P9_ZC_HDR_SZ 4096 +/* maximum length of an error string */ +#define P9_ERRMAX 128 + /** * struct p9_qid - file system entity information * @type: 8-bit type &p9_qid_t -- GitLab From 1effdbf94a728b74b23a24ce7b6f1d1d9a2480a4 Mon Sep 17 00:00:00 2001 From: Christian Schoenebeck <linux_oss@crudebyte.com> Date: Fri, 15 Jul 2022 23:32:34 +0200 Subject: [PATCH 1330/2223] net/9p: add p9_msg_buf_size() This new function calculates a buffer size suitable for holding the intended 9p request or response. For rather small message types (which applies to almost all 9p message types actually) simply use hard coded values. For some variable-length and potentially large message types calculate a more precise value according to what data is actually transmitted to avoid unnecessarily huge buffers. So p9_msg_buf_size() divides the individual 9p message types into 3 message size categories: - dynamically calculated message size (i.e. potentially large) - 8k hard coded message size - 4k hard coded message size As for the latter two hard coded message types: for most 9p message types it is pretty obvious whether they would always fit into 4k or 8k. But for some of them it depends on the maximum directory entry name length allowed by OS and filesystem for determining into which of the two size categories they would fit into. 
Currently Linux supports directory entry names up to NAME_MAX (255), however when comparing the limitation of individual filesystems, ReiserFS theoretically supports up to slightly below 4k long names. So in order to make this code more future proof, and as revisiting it later on is a bit tedious and has the potential to miss out details, the decision [1] was made to take 4k as basis as for max. name length. Link: https://lkml.kernel.org/r/bd6be891cf67e867688e8c8796d06408bfafa0d9.1657920926.git.linux_oss@crudebyte.com Link: https://lore.kernel.org/all/5564296.oo812IJUPE@silver/ [1] Signed-off-by: Christian Schoenebeck <linux_oss@crudebyte.com> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> --- net/9p/protocol.c | 167 ++++++++++++++++++++++++++++++++++++++++++++++ net/9p/protocol.h | 2 + 2 files changed, 169 insertions(+) diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 83694c6319890..4e3a2a1ffcb3f 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -23,6 +23,173 @@ #include <trace/events/9p.h> +/* len[2] text[len] */ +#define P9_STRLEN(s) \ + (2 + min_t(size_t, s ? strlen(s) : 0, USHRT_MAX)) + +/** + * p9_msg_buf_size - Returns a buffer size sufficiently large to hold the + * intended 9p message. + * @c: client + * @type: message type + * @fmt: format template for assembling request message + * (see p9pdu_vwritef) + * @ap: variable arguments to be fed to passed format template + * (see p9pdu_vwritef) + * + * Note: Even for response types (P9_R*) the format template and variable + * arguments must always be for the originating request type (P9_T*). + */ +size_t p9_msg_buf_size(struct p9_client *c, enum p9_msg_t type, + const char *fmt, va_list ap) +{ + /* size[4] type[1] tag[2] */ + const int hdr = 4 + 1 + 2; + /* ename[s] errno[4] */ + const int rerror_size = hdr + P9_ERRMAX + 4; + /* ecode[4] */ + const int rlerror_size = hdr + 4; + const int err_size = + c->proto_version == p9_proto_2000L ? 
rlerror_size : rerror_size; + + static_assert(NAME_MAX <= 4*1024, "p9_msg_buf_size() currently assumes " + "a max. allowed directory entry name length of 4k"); + + switch (type) { + + /* message types not used at all */ + case P9_TERROR: + case P9_TLERROR: + case P9_TAUTH: + case P9_RAUTH: + BUG(); + + /* variable length & potentially large message types */ + case P9_TATTACH: + BUG_ON(strcmp("ddss?u", fmt)); + va_arg(ap, int32_t); + va_arg(ap, int32_t); + { + const char *uname = va_arg(ap, const char *); + const char *aname = va_arg(ap, const char *); + /* fid[4] afid[4] uname[s] aname[s] n_uname[4] */ + return hdr + 4 + 4 + P9_STRLEN(uname) + P9_STRLEN(aname) + 4; + } + case P9_TWALK: + BUG_ON(strcmp("ddT", fmt)); + va_arg(ap, int32_t); + va_arg(ap, int32_t); + { + uint i, nwname = va_arg(ap, int); + size_t wname_all; + const char **wnames = va_arg(ap, const char **); + for (i = 0, wname_all = 0; i < nwname; ++i) { + wname_all += P9_STRLEN(wnames[i]); + } + /* fid[4] newfid[4] nwname[2] nwname*(wname[s]) */ + return hdr + 4 + 4 + 2 + wname_all; + } + case P9_RWALK: + BUG_ON(strcmp("ddT", fmt)); + va_arg(ap, int32_t); + va_arg(ap, int32_t); + { + uint nwname = va_arg(ap, int); + /* nwqid[2] nwqid*(wqid[13]) */ + return max_t(size_t, hdr + 2 + nwname * 13, err_size); + } + case P9_TCREATE: + BUG_ON(strcmp("dsdb?s", fmt)); + va_arg(ap, int32_t); + { + const char *name = va_arg(ap, const char *); + if (c->proto_version == p9_proto_legacy) { + /* fid[4] name[s] perm[4] mode[1] */ + return hdr + 4 + P9_STRLEN(name) + 4 + 1; + } else { + va_arg(ap, int32_t); + va_arg(ap, int); + { + const char *ext = va_arg(ap, const char *); + /* fid[4] name[s] perm[4] mode[1] extension[s] */ + return hdr + 4 + P9_STRLEN(name) + 4 + 1 + P9_STRLEN(ext); + } + } + } + case P9_TLCREATE: + BUG_ON(strcmp("dsddg", fmt)); + va_arg(ap, int32_t); + { + const char *name = va_arg(ap, const char *); + /* fid[4] name[s] flags[4] mode[4] gid[4] */ + return hdr + 4 + P9_STRLEN(name) + 4 + 4 + 4; + } + 
case P9_RREAD: + case P9_RREADDIR: + BUG_ON(strcmp("dqd", fmt)); + va_arg(ap, int32_t); + va_arg(ap, int64_t); + { + const int32_t count = va_arg(ap, int32_t); + /* count[4] data[count] */ + return max_t(size_t, hdr + 4 + count, err_size); + } + case P9_TWRITE: + BUG_ON(strcmp("dqV", fmt)); + va_arg(ap, int32_t); + va_arg(ap, int64_t); + { + const int32_t count = va_arg(ap, int32_t); + /* fid[4] offset[8] count[4] data[count] */ + return hdr + 4 + 8 + 4 + count; + } + case P9_TRENAMEAT: + BUG_ON(strcmp("dsds", fmt)); + va_arg(ap, int32_t); + { + const char *oldname, *newname; + oldname = va_arg(ap, const char *); + va_arg(ap, int32_t); + newname = va_arg(ap, const char *); + /* olddirfid[4] oldname[s] newdirfid[4] newname[s] */ + return hdr + 4 + P9_STRLEN(oldname) + 4 + P9_STRLEN(newname); + } + case P9_TSYMLINK: + BUG_ON(strcmp("dssg", fmt)); + va_arg(ap, int32_t); + { + const char *name = va_arg(ap, const char *); + const char *symtgt = va_arg(ap, const char *); + /* fid[4] name[s] symtgt[s] gid[4] */ + return hdr + 4 + P9_STRLEN(name) + P9_STRLEN(symtgt) + 4; + } + + case P9_RERROR: + return rerror_size; + case P9_RLERROR: + return rlerror_size; + + /* small message types */ + case P9_TWSTAT: + case P9_RSTAT: + case P9_RREADLINK: + case P9_TXATTRWALK: + case P9_TXATTRCREATE: + case P9_TLINK: + case P9_TMKDIR: + case P9_TMKNOD: + case P9_TRENAME: + case P9_TUNLINKAT: + case P9_TLOCK: + return 8 * 1024; + + /* tiny message types */ + default: + return 4 * 1024; + + } +} + static int p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); diff --git a/net/9p/protocol.h b/net/9p/protocol.h index 6d719c30331ac..ad2283d1f96be 100644 --- a/net/9p/protocol.h +++ b/net/9p/protocol.h @@ -8,6 +8,8 @@ * Copyright (C) 2008 by IBM, Corp. 
*/ +size_t p9_msg_buf_size(struct p9_client *c, enum p9_msg_t type, + const char *fmt, va_list ap); int p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, va_list ap); int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); -- GitLab From 01d205d936ae18532e14814808592b926aacc6d5 Mon Sep 17 00:00:00 2001 From: Christian Schoenebeck <linux_oss@crudebyte.com> Date: Fri, 15 Jul 2022 23:33:09 +0200 Subject: [PATCH 1331/2223] net/9p: add 'pooled_rbuffers' flag to struct p9_trans_module This is a preparatory change for the subsequent patch: the RDMA transport pulls the buffers for its 9p response messages from a shared pool. [1] So this case has to be considered when choosing an appropriate response message size in the subsequent patch. Link: https://lore.kernel.org/all/Ys3jjg52EIyITPua@codewreck.org/ [1] Link: https://lkml.kernel.org/r/79d24310226bc4eb037892b5c097ec4ad4819a03.1657920926.git.linux_oss@crudebyte.com Signed-off-by: Christian Schoenebeck <linux_oss@crudebyte.com> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> --- include/net/9p/transport.h | 5 +++++ net/9p/trans_fd.c | 1 + net/9p/trans_rdma.c | 1 + net/9p/trans_virtio.c | 1 + net/9p/trans_xen.c | 1 + 5 files changed, 9 insertions(+) diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h index ff842f9630718..766ec07c95999 100644 --- a/include/net/9p/transport.h +++ b/include/net/9p/transport.h @@ -19,6 +19,10 @@ * @list: used to maintain a list of currently available transports * @name: the human-readable name of the transport * @maxsize: transport provided maximum packet size + * @pooled_rbuffers: currently only set for RDMA transport which pulls the + * response buffers from a shared pool, and accordingly + * we're less flexible when choosing the response message + * size in this case * @def: set if this transport should be considered the default * @create: member function to create a new connection on this transport * @close: member 
function to discard a connection on this transport @@ -38,6 +42,7 @@ struct p9_trans_module { struct list_head list; char *name; /* name of transport */ int maxsize; /* max message size of transport */ + bool pooled_rbuffers; int def; /* this transport should be default */ struct module *owner; int (*create)(struct p9_client *client, diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 60fcc6b30b468..25d422c473e8a 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -1083,6 +1083,7 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args) static struct p9_trans_module p9_tcp_trans = { .name = "tcp", .maxsize = MAX_SOCK_BUF, + .pooled_rbuffers = false, .def = 0, .create = p9_fd_create_tcp, .close = p9_fd_close, diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index d817d3745238b..6ff706760676e 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -739,6 +739,7 @@ error: static struct p9_trans_module p9_rdma_trans = { .name = "rdma", .maxsize = P9_RDMA_MAXSIZE, + .pooled_rbuffers = true, .def = 0, .owner = THIS_MODULE, .create = rdma_create_trans, diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index b84d35cf68994..e757f06013043 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -802,6 +802,7 @@ static struct p9_trans_module p9_virtio_trans = { * page in zero copy. 
*/ .maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3), + .pooled_rbuffers = false, .def = 1, .owner = THIS_MODULE, }; diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 227f89cc7237c..41c57d40efb69 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -246,6 +246,7 @@ static irqreturn_t xen_9pfs_front_event_handler(int irq, void *r) static struct p9_trans_module p9_xen_trans = { .name = "xen", .maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT - 2), + .pooled_rbuffers = false, .def = 1, .create = p9_xen_create, .close = p9_xen_close, -- GitLab From 60ece0833b6c2bc1465eb2803fec20b670e2ee93 Mon Sep 17 00:00:00 2001 From: Christian Schoenebeck <linux_oss@crudebyte.com> Date: Fri, 15 Jul 2022 23:33:56 +0200 Subject: [PATCH 1332/2223] net/9p: allocate appropriate reduced message buffers So far 'msize' was simply used for all 9p message types, which is far too much and slowed down performance tremendously with large values for user configurable 'msize' option. Let's stop this waste by using the new p9_msg_buf_size() function for allocating more appropriate, smaller buffers according to what is actually sent over the wire. Only exception: RDMA transport is currently excluded from this message size optimization - for its response buffers that is - as RDMA transport would not cope with it, due to its response buffers being pulled from a shared pool. 
[1] Link: https://lore.kernel.org/all/Ys3jjg52EIyITPua@codewreck.org/ [1] Link: https://lkml.kernel.org/r/3f51590535dc96ed0a165b8218c57639cfa5c36c.1657920926.git.linux_oss@crudebyte.com Signed-off-by: Christian Schoenebeck <linux_oss@crudebyte.com> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> --- net/9p/client.c | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/net/9p/client.c b/net/9p/client.c index 0bd7e43e5c4fc..aaa37b07e30a5 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -255,19 +255,35 @@ static struct kmem_cache *p9_req_cache; * p9_tag_alloc - Allocate a new request. * @c: Client session. * @type: Transaction type. - * @t_size: Buffer size for holding this request. - * @r_size: Buffer size for holding server's reply on this request. + * @t_size: Buffer size for holding this request + * (automatic calculation by format template if 0). + * @r_size: Buffer size for holding server's reply on this request + * (automatic calculation by format template if 0). + * @fmt: Format template for assembling 9p request message + * (see p9pdu_vwritef). + * @ap: Variable arguments to be fed to passed format template + * (see p9pdu_vwritef). * * Context: Process context. * Return: Pointer to new request. 
*/ static struct p9_req_t * -p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size) +p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size, + const char *fmt, va_list ap) { struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS); - int alloc_tsize = min(c->msize, t_size); - int alloc_rsize = min(c->msize, r_size); + int alloc_tsize; + int alloc_rsize; int tag; + va_list apc; + + va_copy(apc, ap); + alloc_tsize = min_t(size_t, c->msize, + t_size ?: p9_msg_buf_size(c, type, fmt, apc)); + va_end(apc); + + alloc_rsize = min_t(size_t, c->msize, + r_size ?: p9_msg_buf_size(c, type + 1, fmt, ap)); if (!req) return ERR_PTR(-ENOMEM); @@ -599,6 +615,7 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, { int err; struct p9_req_t *req; + va_list apc; p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type); @@ -610,7 +627,9 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, if (c->status == BeginDisconnect && type != P9_TCLUNK) return ERR_PTR(-EIO); - req = p9_tag_alloc(c, type, t_size, r_size); + va_copy(apc, ap); + req = p9_tag_alloc(c, type, t_size, r_size, fmt, apc); + va_end(apc); if (IS_ERR(req)) return req; @@ -645,9 +664,18 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) int sigpending, err; unsigned long flags; struct p9_req_t *req; + /* Passing zero for tsize/rsize to p9_client_prepare_req() tells it to + * auto determine an appropriate (small) request/response size + * according to actual message data being sent. Currently RDMA + * transport is excluded from this response message size optimization, + * as it would not cope with it, due to its pooled response buffers + * (using an optimized request size for RDMA as well though). + */ + const uint tsize = 0; + const uint rsize = c->trans_mod->pooled_rbuffers ? 
c->msize : 0; va_start(ap, fmt); - req = p9_client_prepare_req(c, type, c->msize, c->msize, fmt, ap); + req = p9_client_prepare_req(c, type, tsize, rsize, fmt, ap); va_end(ap); if (IS_ERR(req)) return req; -- GitLab From e4a7e67a08ac409f1485c82a2190636d5c81b932 Mon Sep 17 00:00:00 2001 From: Frank Li <Frank.Li@nxp.com> Date: Tue, 4 Oct 2022 15:24:14 -0500 Subject: [PATCH 1333/2223] irqchip/imx-mu-msi: Fix wrong register offset for 8ulp Offset 0x124 should be for IMX_MU_TSR, not IMX_MU_GSR. Fixes: 70afdab904d2 ("irqchip: Add IMX MU MSI controller driver") Reported-by: Colin King <colin.i.king@gmail.com> Signed-off-by: Frank Li <Frank.Li@nxp.com> [maz: updated commit message, tags] Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20221004202414.216577-1-Frank.Li@nxp.com --- drivers/irqchip/irq-imx-mu-msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-imx-mu-msi.c b/drivers/irqchip/irq-imx-mu-msi.c index b62139dc36e82..229039eda1b1f 100644 --- a/drivers/irqchip/irq-imx-mu-msi.c +++ b/drivers/irqchip/irq-imx-mu-msi.c @@ -292,7 +292,7 @@ static const struct imx_mu_dcfg imx_mu_cfg_imx8ulp = { .xSR = { [IMX_MU_SR] = 0xC, [IMX_MU_GSR] = 0x118, - [IMX_MU_GSR] = 0x124, + [IMX_MU_TSR] = 0x124, [IMX_MU_RSR] = 0x12C, }, .xCR = { -- GitLab From 5e85eba6f50dc288c22083a7e213152bcc4b8208 Mon Sep 17 00:00:00 2001 From: Vidya Sagar <vidyas@nvidia.com> Date: Tue, 13 Sep 2022 18:48:21 +0530 Subject: [PATCH 1334/2223] PCI/ASPM: Refactor L1 PM Substates Control Register programming Refactor the code to extract the common code to program Control Registers 1 and 2 of the L1 PM Substates capability to a new function aspm_program_l1ss() and call it for both parent and child devices. 
[bhelgaas: squash in update to preserve fields we're not updating from https://lore.kernel.org/r/36fa13c5-e0f8-022f-77f7-7908e4df98b8@nvidia.com] Link: https://lore.kernel.org/r/20220913131822.16557-2-vidyas@nvidia.com Signed-off-by: Vidya Sagar <vidyas@nvidia.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> --- drivers/pci/pcie/aspm.c | 74 ++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index a8aec190986c0..b4bdadc4ac356 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -455,6 +455,31 @@ static void pci_clear_and_set_dword(struct pci_dev *pdev, int pos, pci_write_config_dword(pdev, pos, val); } +static void aspm_program_l1ss(struct pci_dev *dev, u32 ctl1, u32 ctl2) +{ + u16 l1ss = dev->l1ss; + u32 l1_2_enable; + + /* + * Per PCIe r6.0, sec 5.5.4, T_POWER_ON in PCI_L1SS_CTL2 must be + * programmed prior to setting the L1.2 enable bits in PCI_L1SS_CTL1. + */ + pci_write_config_dword(dev, l1ss + PCI_L1SS_CTL2, ctl2); + + /* + * In addition, Common_Mode_Restore_Time and LTR_L1.2_THRESHOLD in + * PCI_L1SS_CTL1 must be programmed *before* setting the L1.2 + * enable bits, even though they're all in PCI_L1SS_CTL1. 
+ */ + l1_2_enable = ctl1 & PCI_L1SS_CTL1_L1_2_MASK; + ctl1 &= ~PCI_L1SS_CTL1_L1_2_MASK; + + pci_write_config_dword(dev, l1ss + PCI_L1SS_CTL1, ctl1); + if (l1_2_enable) + pci_write_config_dword(dev, l1ss + PCI_L1SS_CTL1, + ctl1 | l1_2_enable); +} + /* Calculate L1.2 PM substate timing parameters */ static void aspm_calc_l1ss_info(struct pcie_link_state *link, u32 parent_l1ss_cap, u32 child_l1ss_cap) @@ -464,7 +489,6 @@ static void aspm_calc_l1ss_info(struct pcie_link_state *link, u32 t_common_mode, t_power_on, l1_2_threshold, scale, value; u32 ctl1 = 0, ctl2 = 0; u32 pctl1, pctl2, cctl1, cctl2; - u32 pl1_2_enables, cl1_2_enables; if (!(link->aspm_support & ASPM_STATE_L1_2_MASK)) return; @@ -513,39 +537,21 @@ static void aspm_calc_l1ss_info(struct pcie_link_state *link, ctl2 == pctl2 && ctl2 == cctl2) return; - /* Disable L1.2 while updating. See PCIe r5.0, sec 5.5.4, 7.8.3.3 */ - pl1_2_enables = pctl1 & PCI_L1SS_CTL1_L1_2_MASK; - cl1_2_enables = cctl1 & PCI_L1SS_CTL1_L1_2_MASK; - - if (pl1_2_enables || cl1_2_enables) { - pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_L1_2_MASK, 0); - pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_L1_2_MASK, 0); - } - - /* Program T_POWER_ON times in both ports */ - pci_write_config_dword(parent, parent->l1ss + PCI_L1SS_CTL2, ctl2); - pci_write_config_dword(child, child->l1ss + PCI_L1SS_CTL2, ctl2); - - /* Program Common_Mode_Restore_Time in upstream device */ - pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_CM_RESTORE_TIME, ctl1); - - /* Program LTR_L1.2_THRESHOLD time in both ports */ - pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_LTR_L12_TH_VALUE | - PCI_L1SS_CTL1_LTR_L12_TH_SCALE, ctl1); - pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_LTR_L12_TH_VALUE | - PCI_L1SS_CTL1_LTR_L12_TH_SCALE, ctl1); - - if (pl1_2_enables || cl1_2_enables) { - pci_clear_and_set_dword(parent, 
parent->l1ss + PCI_L1SS_CTL1, 0, - pl1_2_enables); - pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, 0, - cl1_2_enables); - } + pctl1 &= ~(PCI_L1SS_CTL1_CM_RESTORE_TIME | + PCI_L1SS_CTL1_LTR_L12_TH_VALUE | + PCI_L1SS_CTL1_LTR_L12_TH_SCALE); + pctl1 |= (ctl1 & (PCI_L1SS_CTL1_CM_RESTORE_TIME | + PCI_L1SS_CTL1_LTR_L12_TH_VALUE | + PCI_L1SS_CTL1_LTR_L12_TH_SCALE)); + aspm_program_l1ss(parent, pctl1, ctl2); + + cctl1 &= ~(PCI_L1SS_CTL1_CM_RESTORE_TIME | + PCI_L1SS_CTL1_LTR_L12_TH_VALUE | + PCI_L1SS_CTL1_LTR_L12_TH_SCALE); + cctl1 |= (ctl1 & (PCI_L1SS_CTL1_CM_RESTORE_TIME | + PCI_L1SS_CTL1_LTR_L12_TH_VALUE | + PCI_L1SS_CTL1_LTR_L12_TH_SCALE)); + aspm_program_l1ss(child, cctl1, ctl2); } static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) -- GitLab From 4ff116d0d5fd8a025604b0802d93a2d5f4e465d1 Mon Sep 17 00:00:00 2001 From: Vidya Sagar <vidyas@nvidia.com> Date: Tue, 13 Sep 2022 18:48:22 +0530 Subject: [PATCH 1335/2223] PCI/ASPM: Save L1 PM Substates Capability for suspend/resume Previously the L1 PM Substates Control Registers (CTL1 and CTL2) weren't saved and restored during suspend/resume leading to the L1 PM Substates configuration being lost post-resume. Save the L1 PM Substates Control Registers so that the configuration is retained post-resume. 
[bhelgaas: drop pci_is_pcie() testing; we can rely on pci_configure_ltr() having already done that] Link: https://lore.kernel.org/r/20220913131822.16557-3-vidyas@nvidia.com Signed-off-by: Vidya Sagar <vidyas@nvidia.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> --- drivers/pci/pci.c | 7 +++++++ drivers/pci/pci.h | 4 ++++ drivers/pci/pcie/aspm.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 95bc329e74c0e..68a49fbaabdec 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1663,6 +1663,7 @@ int pci_save_state(struct pci_dev *dev) return i; pci_save_ltr_state(dev); + pci_save_aspm_l1ss_state(dev); pci_save_dpc_state(dev); pci_save_aer_state(dev); pci_save_ptm_state(dev); @@ -1769,6 +1770,7 @@ void pci_restore_state(struct pci_dev *dev) * LTR itself (in the PCIe capability). */ pci_restore_ltr_state(dev); + pci_restore_aspm_l1ss_state(dev); pci_restore_pcie_state(dev); pci_restore_pasid_state(dev); @@ -3485,6 +3487,11 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev) if (error) pci_err(dev, "unable to allocate suspend buffer for LTR\n"); + error = pci_add_ext_cap_save_buffer(dev, PCI_EXT_CAP_ID_L1SS, + 2 * sizeof(u32)); + if (error) + pci_err(dev, "unable to allocate suspend buffer for ASPM-L1SS\n"); + pci_allocate_vc_save_buffers(dev); } diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 785f31086313a..365a844ec4304 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -561,10 +561,14 @@ bool pcie_wait_for_link(struct pci_dev *pdev, bool active); void pcie_aspm_init_link_state(struct pci_dev *pdev); void pcie_aspm_exit_link_state(struct pci_dev *pdev); void pcie_aspm_powersave_config_link(struct pci_dev *pdev); +void pci_save_aspm_l1ss_state(struct pci_dev *dev); +void pci_restore_aspm_l1ss_state(struct pci_dev *dev); #else static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_exit_link_state(struct 
pci_dev *pdev) { } static inline void pcie_aspm_powersave_config_link(struct pci_dev *pdev) { } +static inline void pci_save_aspm_l1ss_state(struct pci_dev *dev) { } +static inline void pci_restore_aspm_l1ss_state(struct pci_dev *dev) { } #endif #ifdef CONFIG_PCIE_ECRC diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index b4bdadc4ac356..016d222b07c74 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -732,6 +732,43 @@ static void pcie_config_aspm_l1ss(struct pcie_link_state *link, u32 state) PCI_L1SS_CTL1_L1SS_MASK, val); } +void pci_save_aspm_l1ss_state(struct pci_dev *dev) +{ + struct pci_cap_saved_state *save_state; + u16 l1ss = dev->l1ss; + u32 *cap; + + if (!l1ss) + return; + + save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_L1SS); + if (!save_state) + return; + + cap = (u32 *)&save_state->cap.data[0]; + pci_read_config_dword(dev, l1ss + PCI_L1SS_CTL2, cap++); + pci_read_config_dword(dev, l1ss + PCI_L1SS_CTL1, cap++); +} + +void pci_restore_aspm_l1ss_state(struct pci_dev *dev) +{ + struct pci_cap_saved_state *save_state; + u32 *cap, ctl1, ctl2; + u16 l1ss = dev->l1ss; + + if (!l1ss) + return; + + save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_L1SS); + if (!save_state) + return; + + cap = (u32 *)&save_state->cap.data[0]; + ctl2 = *cap++; + ctl1 = *cap; + aspm_program_l1ss(dev, ctl1, ctl2); +} + static void pcie_config_aspm_dev(struct pci_dev *pdev, u32 val) { pcie_capability_clear_and_set_word(pdev, PCI_EXP_LNKCTL, -- GitLab From e98ecc6e94f4e6d21c06660b0f336df02836694f Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu <zhangxiaoxu5@huawei.com> Date: Mon, 26 Sep 2022 11:36:29 +0800 Subject: [PATCH 1336/2223] cifs: Fix the error length of VALIDATE_NEGOTIATE_INFO message Commit d5c7076b772a ("smb3: add smb3.1.1 to default dialect list") extend the dialects from 3 to 4, but forget to decrease the extended length when specific the dialect, then the message length is larger than expected. 
This maybe leak some info through network because not initialize the message body. After apply this patch, the VALIDATE_NEGOTIATE_INFO message length is reduced from 28 bytes to 26 bytes. Fixes: d5c7076b772a ("smb3: add smb3.1.1 to default dialect list") Signed-off-by: Zhang Xiaoxu <zhangxiaoxu5@huawei.com> Cc: <stable@vger.kernel.org> Acked-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Reviewed-by: Tom Talpey <tom@talpey.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/smb2pdu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 6352ab32c7e7a..223056097b54f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1169,9 +1169,9 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) pneg_inbuf->Dialects[0] = cpu_to_le16(server->vals->protocol_id); pneg_inbuf->DialectCount = cpu_to_le16(1); - /* structure is big enough for 3 dialects, sending only 1 */ + /* structure is big enough for 4 dialects, sending only 1 */ inbuflen = sizeof(*pneg_inbuf) - - sizeof(pneg_inbuf->Dialects[0]) * 2; + sizeof(pneg_inbuf->Dialects[0]) * 3; } rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, -- GitLab From d2e81f92e5b76c4c260141928700442876fa4bb3 Mon Sep 17 00:00:00 2001 From: Tom Talpey <tom@talpey.com> Date: Fri, 23 Sep 2022 21:53:55 +0000 Subject: [PATCH 1337/2223] Decrease the number of SMB3 smbdirect client SGEs The client-side SMBDirect layer requires no more than 6 send SGEs and 1 receive SGE. The previous default of 8 send and 8 receive causes smbdirect to fail on the SoftiWARP (siw) provider, and possibly others. Additionally, large numbers of SGEs reduces performance significantly on adapter implementations. Also correct the frmr page count comment (not an SGE count). 
Acked-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Tom Talpey <tom@talpey.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/smbdirect.c | 26 ++++++++++++-------------- fs/cifs/smbdirect.h | 14 +++++++++----- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 5fbbec22bcc8b..f81229721b765 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -99,7 +99,7 @@ int smbd_keep_alive_interval = 120; * User configurable initial values for RDMA transport * The actual values used may be lower and are limited to hardware capabilities */ -/* Default maximum number of SGEs in a RDMA write/read */ +/* Default maximum number of pages in a single RDMA write/read */ int smbd_max_frmr_depth = 2048; /* If payload is less than this byte, use RDMA send/recv not read/write */ @@ -1017,9 +1017,9 @@ static int smbd_post_send_data( { int i; u32 data_length = 0; - struct scatterlist sgl[SMBDIRECT_MAX_SGE]; + struct scatterlist sgl[SMBDIRECT_MAX_SEND_SGE - 1]; - if (n_vec > SMBDIRECT_MAX_SGE) { + if (n_vec > SMBDIRECT_MAX_SEND_SGE - 1) { cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec); return -EINVAL; } @@ -1562,17 +1562,15 @@ static struct smbd_connection *_smbd_get_connection( info->max_receive_size = smbd_max_receive_size; info->keep_alive_interval = smbd_keep_alive_interval; - if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SGE) { + if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SEND_SGE || + info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_RECV_SGE) { log_rdma_event(ERR, - "warning: device max_send_sge = %d too small\n", - info->id->device->attrs.max_send_sge); - log_rdma_event(ERR, "Queue Pair creation may fail\n"); - } - if (info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_SGE) { - log_rdma_event(ERR, - "warning: device max_recv_sge = %d too small\n", + "device %.*s max_send_sge/max_recv_sge = %d/%d too small\n", + IB_DEVICE_NAME_MAX, + 
info->id->device->name, + info->id->device->attrs.max_send_sge, info->id->device->attrs.max_recv_sge); - log_rdma_event(ERR, "Queue Pair creation may fail\n"); + goto config_failed; } info->send_cq = NULL; @@ -1598,8 +1596,8 @@ static struct smbd_connection *_smbd_get_connection( qp_attr.qp_context = info; qp_attr.cap.max_send_wr = info->send_credit_target; qp_attr.cap.max_recv_wr = info->receive_credit_max; - qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SGE; - qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_SGE; + qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SEND_SGE; + qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_RECV_SGE; qp_attr.cap.max_inline_data = 0; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h index a87fca82a7963..207ef979cd51c 100644 --- a/fs/cifs/smbdirect.h +++ b/fs/cifs/smbdirect.h @@ -91,7 +91,7 @@ struct smbd_connection { /* Memory registrations */ /* Maximum number of RDMA read/write outstanding on this connection */ int responder_resources; - /* Maximum number of SGEs in a RDMA write/read */ + /* Maximum number of pages in a single RDMA write/read on this connection */ int max_frmr_depth; /* * If payload is less than or equal to the threshold, @@ -225,21 +225,25 @@ struct smbd_buffer_descriptor_v1 { __le32 length; } __packed; -/* Default maximum number of SGEs in a RDMA send/recv */ -#define SMBDIRECT_MAX_SGE 16 +/* Maximum number of SGEs used by smbdirect.c in any send work request */ +#define SMBDIRECT_MAX_SEND_SGE 6 + /* The context for a SMBD request */ struct smbd_request { struct smbd_connection *info; struct ib_cqe cqe; - /* the SGE entries for this packet */ - struct ib_sge sge[SMBDIRECT_MAX_SGE]; + /* the SGE entries for this work request */ + struct ib_sge sge[SMBDIRECT_MAX_SEND_SGE]; int num_sge; /* SMBD packet header follows this structure */ u8 packet[]; }; +/* Maximum number of SGEs used by smbdirect.c in any receive work request */ +#define SMBDIRECT_MAX_RECV_SGE 1 + /* The 
context for a SMBD response */ struct smbd_response { struct smbd_connection *info; -- GitLab From 3c62df55f3306238f36dc19cbe40b5e3d288d116 Mon Sep 17 00:00:00 2001 From: Tom Talpey <tom@talpey.com> Date: Fri, 23 Sep 2022 21:53:57 +0000 Subject: [PATCH 1338/2223] Reduce client smbdirect max receive segment size Reduce client smbdirect max segment receive size to 1364 to match protocol norms. Larger buffers are unnecessary and add significant memory overhead. Acked-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Tom Talpey <tom@talpey.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/smbdirect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index f81229721b765..4908ca54610c9 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -90,7 +90,7 @@ int smbd_max_send_size = 1364; int smbd_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ -int smbd_max_receive_size = 8192; +int smbd_max_receive_size = 1364; /* The timeout to initiate send of a keepalive message on idle */ int smbd_keep_alive_interval = 120; -- GitLab From adeb964d3791e1eea8c4c3ab13549ccc7e411e07 Mon Sep 17 00:00:00 2001 From: Tom Talpey <tom@talpey.com> Date: Fri, 23 Sep 2022 21:53:59 +0000 Subject: [PATCH 1339/2223] Handle variable number of SGEs in client smbdirect send. If/when an outgoing request contains more scatter/gather segments than can be mapped in a single RDMA send work request, use smbdirect fragments to send it in multiple packets. 
Acked-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Tom Talpey <tom@talpey.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/smbdirect.c | 185 ++++++++++++++++++-------------------------- 1 file changed, 77 insertions(+), 108 deletions(-) diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 4908ca54610c9..6ac424d26fe6d 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -1984,10 +1984,11 @@ int smbd_send(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst_array) { struct smbd_connection *info = server->smbd_conn; - struct kvec vec; + struct kvec vecs[SMBDIRECT_MAX_SEND_SGE - 1]; int nvecs; int size; unsigned int buflen, remaining_data_length; + unsigned int offset, remaining_vec_data_length; int start, i, j; int max_iov_size = info->max_send_size - sizeof(struct smbd_data_transfer); @@ -1996,10 +1997,8 @@ int smbd_send(struct TCP_Server_Info *server, struct smb_rqst *rqst; int rqst_idx; - if (info->transport_status != SMBD_CONNECTED) { - rc = -EAGAIN; - goto done; - } + if (info->transport_status != SMBD_CONNECTED) + return -EAGAIN; /* * Add in the page array if there is one. 
The caller needs to set @@ -2010,125 +2009,95 @@ int smbd_send(struct TCP_Server_Info *server, for (i = 0; i < num_rqst; i++) remaining_data_length += smb_rqst_len(server, &rqst_array[i]); - if (remaining_data_length > info->max_fragmented_send_size) { + if (unlikely(remaining_data_length > info->max_fragmented_send_size)) { + /* assertion: payload never exceeds negotiated maximum */ log_write(ERR, "payload size %d > max size %d\n", remaining_data_length, info->max_fragmented_send_size); - rc = -EINVAL; - goto done; + return -EINVAL; } log_write(INFO, "num_rqst=%d total length=%u\n", num_rqst, remaining_data_length); rqst_idx = 0; -next_rqst: - rqst = &rqst_array[rqst_idx]; - iov = rqst->rq_iov; - - cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n", - rqst_idx, smb_rqst_len(server, rqst)); - for (i = 0; i < rqst->rq_nvec; i++) - dump_smb(iov[i].iov_base, iov[i].iov_len); - - - log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n", - rqst_idx, rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz, - rqst->rq_tailsz, smb_rqst_len(server, rqst)); - - start = i = 0; - buflen = 0; - while (true) { - buflen += iov[i].iov_len; - if (buflen > max_iov_size) { - if (i > start) { - remaining_data_length -= - (buflen-iov[i].iov_len); - log_write(INFO, "sending iov[] from start=%d i=%d nvecs=%d remaining_data_length=%d\n", - start, i, i - start, - remaining_data_length); - rc = smbd_post_send_data( - info, &iov[start], i-start, - remaining_data_length); - if (rc) - goto done; - } else { - /* iov[start] is too big, break it */ - nvecs = (buflen+max_iov_size-1)/max_iov_size; - log_write(INFO, "iov[%d] iov_base=%p buflen=%d break to %d vectors\n", - start, iov[start].iov_base, - buflen, nvecs); - for (j = 0; j < nvecs; j++) { - vec.iov_base = - (char *)iov[start].iov_base + - j*max_iov_size; - vec.iov_len = max_iov_size; - if (j == nvecs-1) - vec.iov_len = - buflen - - max_iov_size*(nvecs-1); - remaining_data_length -= vec.iov_len; - 
log_write(INFO, - "sending vec j=%d iov_base=%p iov_len=%zu remaining_data_length=%d\n", - j, vec.iov_base, vec.iov_len, - remaining_data_length); - rc = smbd_post_send_data( - info, &vec, 1, - remaining_data_length); - if (rc) - goto done; + do { + rqst = &rqst_array[rqst_idx]; + iov = rqst->rq_iov; + + cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n", + rqst_idx, smb_rqst_len(server, rqst)); + remaining_vec_data_length = 0; + for (i = 0; i < rqst->rq_nvec; i++) { + remaining_vec_data_length += iov[i].iov_len; + dump_smb(iov[i].iov_base, iov[i].iov_len); + } + + log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n", + rqst_idx, rqst->rq_nvec, + rqst->rq_npages, rqst->rq_pagesz, + rqst->rq_tailsz, smb_rqst_len(server, rqst)); + + start = 0; + offset = 0; + do { + buflen = 0; + i = start; + j = 0; + while (i < rqst->rq_nvec && + j < SMBDIRECT_MAX_SEND_SGE - 1 && + buflen < max_iov_size) { + + vecs[j].iov_base = iov[i].iov_base + offset; + if (buflen + iov[i].iov_len > max_iov_size) { + vecs[j].iov_len = + max_iov_size - iov[i].iov_len; + buflen = max_iov_size; + offset = vecs[j].iov_len; + } else { + vecs[j].iov_len = + iov[i].iov_len - offset; + buflen += vecs[j].iov_len; + offset = 0; + ++i; } - i++; - if (i == rqst->rq_nvec) - break; + ++j; } + + remaining_vec_data_length -= buflen; + remaining_data_length -= buflen; + log_write(INFO, "sending %s iov[%d] from start=%d nvecs=%d remaining_data_length=%d\n", + remaining_vec_data_length > 0 ? 
+ "partial" : "complete", + rqst->rq_nvec, start, j, + remaining_data_length); + start = i; - buflen = 0; - } else { - i++; - if (i == rqst->rq_nvec) { - /* send out all remaining vecs */ - remaining_data_length -= buflen; - log_write(INFO, "sending iov[] from start=%d i=%d nvecs=%d remaining_data_length=%d\n", - start, i, i - start, + rc = smbd_post_send_data(info, vecs, j, remaining_data_length); + if (rc) + goto done; + } while (remaining_vec_data_length > 0); + + /* now sending pages if there are any */ + for (i = 0; i < rqst->rq_npages; i++) { + rqst_page_get_length(rqst, i, &buflen, &offset); + nvecs = (buflen + max_iov_size - 1) / max_iov_size; + log_write(INFO, "sending pages buflen=%d nvecs=%d\n", + buflen, nvecs); + for (j = 0; j < nvecs; j++) { + size = min_t(unsigned int, max_iov_size, remaining_data_length); + remaining_data_length -= size; + log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n", + i, j * max_iov_size + offset, size, remaining_data_length); - rc = smbd_post_send_data(info, &iov[start], - i-start, remaining_data_length); + rc = smbd_post_send_page( + info, rqst->rq_pages[i], + j*max_iov_size + offset, + size, remaining_data_length); if (rc) goto done; - break; } } - log_write(INFO, "looping i=%d buflen=%d\n", i, buflen); - } - - /* now sending pages if there are any */ - for (i = 0; i < rqst->rq_npages; i++) { - unsigned int offset; - - rqst_page_get_length(rqst, i, &buflen, &offset); - nvecs = (buflen + max_iov_size - 1) / max_iov_size; - log_write(INFO, "sending pages buflen=%d nvecs=%d\n", - buflen, nvecs); - for (j = 0; j < nvecs; j++) { - size = max_iov_size; - if (j == nvecs-1) - size = buflen - j*max_iov_size; - remaining_data_length -= size; - log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n", - i, j * max_iov_size + offset, size, - remaining_data_length); - rc = smbd_post_send_page( - info, rqst->rq_pages[i], - j*max_iov_size + offset, - size, remaining_data_length); - 
if (rc) - goto done; - } - } - - rqst_idx++; - if (rqst_idx < num_rqst) - goto next_rqst; + } while (++rqst_idx < num_rqst); done: /* -- GitLab From 0350d7a39c7f8175fca001b6d6a39481da5ef22c Mon Sep 17 00:00:00 2001 From: Tom Talpey <tom@talpey.com> Date: Fri, 23 Sep 2022 21:54:00 +0000 Subject: [PATCH 1340/2223] Fix formatting of client smbdirect RDMA logging Make the debug logging more consistent in formatting of addresses, lengths, and bitfields. Acked-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Tom Talpey <tom@talpey.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/smbdirect.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 6ac424d26fe6d..90789aaa6567e 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -270,7 +270,7 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) struct smbd_request *request = container_of(wc->wr_cqe, struct smbd_request, cqe); - log_rdma_send(INFO, "smbd_request %p completed wc->status=%d\n", + log_rdma_send(INFO, "smbd_request 0x%p completed wc->status=%d\n", request, wc->status); if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) { @@ -448,7 +448,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) struct smbd_connection *info = response->info; int data_length = 0; - log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%x\n", + log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n", response, response->type, wc->status, wc->opcode, wc->byte_len, wc->pkey_index); @@ -723,7 +723,7 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info) send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; - log_rdma_send(INFO, "sge addr=%llx length=%x lkey=%x\n", + log_rdma_send(INFO, "sge addr=0x%llx length=%u lkey=0x%x\n", request->sge[0].addr, request->sge[0].length, request->sge[0].lkey); @@ 
-792,7 +792,7 @@ static int smbd_post_send(struct smbd_connection *info, for (i = 0; i < request->num_sge; i++) { log_rdma_send(INFO, - "rdma_request sge[%d] addr=%llu length=%u\n", + "rdma_request sge[%d] addr=0x%llx length=%u\n", i, request->sge[i].addr, request->sge[i].length); ib_dma_sync_single_for_device( info->id->device, @@ -1079,7 +1079,7 @@ static int smbd_negotiate(struct smbd_connection *info) response->type = SMBD_NEGOTIATE_RESP; rc = smbd_post_recv(info, response); - log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=%llx iov.length=%x iov.lkey=%x\n", + log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n", rc, response->sge.addr, response->sge.length, response->sge.lkey); if (rc) @@ -1539,7 +1539,7 @@ static struct smbd_connection *_smbd_get_connection( if (smbd_send_credit_target > info->id->device->attrs.max_cqe || smbd_send_credit_target > info->id->device->attrs.max_qp_wr) { - log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", + log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", smbd_send_credit_target, info->id->device->attrs.max_cqe, info->id->device->attrs.max_qp_wr); @@ -1548,7 +1548,7 @@ static struct smbd_connection *_smbd_get_connection( if (smbd_receive_credit_max > info->id->device->attrs.max_cqe || smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) { - log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", + log_rdma_event(ERR, "consider lowering receive_credit_max = %d. 
Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", smbd_receive_credit_max, info->id->device->attrs.max_cqe, info->id->device->attrs.max_qp_wr); -- GitLab From 68e14569d7e5a1798fcbfd945022a4de86f944a0 Mon Sep 17 00:00:00 2001 From: Steve French <stfrench@microsoft.com> Date: Wed, 21 Sep 2022 14:05:53 -0500 Subject: [PATCH 1341/2223] smb3: add dynamic trace points for tree disconnect Needed this for debugging a failing xfstest. Also change camel case for "treeName" to "tree_name" in tcon struct. Example trace output (from "trace-cmd record -e smb3_tdis*"): umount-9718 [006] ..... 5909.780244: smb3_tdis_enter: xid=206 sid=0xcf38894e tid=0x3d0b8cf8 path=\\localhost\test umount-9718 [007] ..... 5909.780878: smb3_tdis_done: xid=206 sid=0xcf38894e tid=0x3d0b8cf8 Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cached_dir.c | 2 +- fs/cifs/cifs_debug.c | 4 ++-- fs/cifs/cifs_debug.h | 6 +++--- fs/cifs/cifs_swn.c | 12 ++++++------ fs/cifs/cifsglob.h | 2 +- fs/cifs/connect.c | 13 +++++++------ fs/cifs/dfs_cache.c | 2 +- fs/cifs/dir.c | 8 ++++---- fs/cifs/fscache.c | 2 +- fs/cifs/inode.c | 2 +- fs/cifs/misc.c | 4 ++-- fs/cifs/smb2inode.c | 2 +- fs/cifs/smb2ops.c | 6 +++--- fs/cifs/smb2pdu.c | 16 ++++++++++------ fs/cifs/trace.h | 3 +++ 15 files changed, 46 insertions(+), 38 deletions(-) diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c index b401339f6e738..ca8d7cf2a1473 100644 --- a/fs/cifs/cached_dir.c +++ b/fs/cifs/cached_dir.c @@ -160,7 +160,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, if (rc == -EREMCHG) { tcon->need_reconnect = true; pr_warn_once("server share %s deleted\n", - tcon->treeName); + tcon->tree_name); } goto oshr_exit; } diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index c05477e28cffa..90850da390aeb 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -87,7 +87,7 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon 
*tcon) { __u32 dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); - seq_printf(m, "%s Mounts: %d ", tcon->treeName, tcon->tc_count); + seq_printf(m, "%s Mounts: %d ", tcon->tree_name, tcon->tc_count); if (tcon->nativeFileSystem) seq_printf(m, "Type: %s ", tcon->nativeFileSystem); seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x\n\tPathComponentMax: %d Status: %d", @@ -601,7 +601,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { i++; - seq_printf(m, "\n%d) %s", i, tcon->treeName); + seq_printf(m, "\n%d) %s", i, tcon->tree_name); if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); seq_printf(m, "\nSMBs: %d", diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index ee4ea2b60c0fb..d44808263cfba 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -108,8 +108,8 @@ do { \ #define cifs_tcon_dbg_func(ratefunc, type, fmt, ...) \ do { \ const char *tn = ""; \ - if (tcon && tcon->treeName) \ - tn = tcon->treeName; \ + if (tcon && tcon->tree_name) \ + tn = tcon->tree_name; \ if ((type) & FYI && cifsFYI & CIFS_INFO) { \ pr_debug_ ## ratefunc("%s: %s " fmt, \ __FILE__, tn, ##__VA_ARGS__); \ @@ -150,7 +150,7 @@ do { \ #define cifs_tcon_dbg(type, fmt, ...) \ do { \ if (0) \ - pr_debug("%s " fmt, tcon->treeName, ##__VA_ARGS__); \ + pr_debug("%s " fmt, tcon->tree_name, ##__VA_ARGS__); \ } while (0) #define cifs_info(fmt, ...) 
\ diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index 1e4c7cc5287f0..7233c6a7e6d70 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -256,23 +256,23 @@ static struct cifs_swn_reg *cifs_find_swn_reg(struct cifs_tcon *tcon) const char *share_name; const char *net_name; - net_name = extract_hostname(tcon->treeName); + net_name = extract_hostname(tcon->tree_name); if (IS_ERR(net_name)) { int ret; ret = PTR_ERR(net_name); cifs_dbg(VFS, "%s: failed to extract host name from target '%s': %d\n", - __func__, tcon->treeName, ret); + __func__, tcon->tree_name, ret); return ERR_PTR(-EINVAL); } - share_name = extract_sharename(tcon->treeName); + share_name = extract_sharename(tcon->tree_name); if (IS_ERR(share_name)) { int ret; ret = PTR_ERR(share_name); cifs_dbg(VFS, "%s: failed to extract share name from target '%s': %d\n", - __func__, tcon->treeName, ret); + __func__, tcon->tree_name, ret); kfree(net_name); return ERR_PTR(-EINVAL); } @@ -335,14 +335,14 @@ static struct cifs_swn_reg *cifs_get_swn_reg(struct cifs_tcon *tcon) goto fail; } - reg->net_name = extract_hostname(tcon->treeName); + reg->net_name = extract_hostname(tcon->tree_name); if (IS_ERR(reg->net_name)) { ret = PTR_ERR(reg->net_name); cifs_dbg(VFS, "%s: failed to extract host name from target: %d\n", __func__, ret); goto fail_idr; } - reg->share_name = extract_sharename(tcon->treeName); + reg->share_name = extract_sharename(tcon->tree_name); if (IS_ERR(reg->share_name)) { ret = PTR_ERR(reg->share_name); cifs_dbg(VFS, "%s: failed to extract share name from target: %d\n", __func__, ret); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ae7f571a7dba2..ad606f648bdc8 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1149,7 +1149,7 @@ struct cifs_tcon { struct list_head openFileList; spinlock_t open_file_lock; /* protects list above */ struct cifs_ses *ses; /* pointer to session associated with */ - char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ + char 
tree_name[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ char *nativeFileSystem; char *password; /* for share-level security */ __u32 tid; /* The 4 byte tree id */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7ae6f2c08153e..ad81d7d43eafb 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1940,7 +1940,8 @@ void cifs_put_smb_ses(struct cifs_ses *ses) spin_unlock(&ses->ses_lock); cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); - cifs_dbg(FYI, "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->treeName : "NONE"); + cifs_dbg(FYI, + "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->tree_name : "NONE"); spin_lock(&cifs_tcp_ses_lock); if (--ses->ses_count > 0) { @@ -2293,7 +2294,7 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) { if (tcon->status == TID_EXITING) return 0; - if (strncmp(tcon->treeName, ctx->UNC, MAX_TREE_SIZE)) + if (strncmp(tcon->tree_name, ctx->UNC, MAX_TREE_SIZE)) return 0; if (tcon->seal != ctx->seal) return 0; @@ -3989,7 +3990,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, } bcc_ptr += length + 1; bytes_left -= (length + 1); - strscpy(tcon->treeName, tree, sizeof(tcon->treeName)); + strscpy(tcon->tree_name, tree, sizeof(tcon->tree_name)); /* mostly informational -- no need to fail on error here */ kfree(tcon->nativeFileSystem); @@ -4197,7 +4198,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) ctx->local_nls = cifs_sb->local_nls; ctx->linux_uid = fsuid; ctx->cred_uid = fsuid; - ctx->UNC = master_tcon->treeName; + ctx->UNC = master_tcon->tree_name; ctx->retry = master_tcon->retry; ctx->nocase = master_tcon->nocase; ctx->nohandlecache = master_tcon->nohandlecache; @@ -4663,7 +4664,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* If it is not dfs or there was no cached dfs referral, then reconnect to same share */ if (!server->current_fullpath || 
dfs_cache_noreq_find(server->current_fullpath + 1, &ref, &tl)) { - rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, cifs_sb->local_nls); + rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, cifs_sb->local_nls); goto out; } @@ -4707,7 +4708,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru tcon->status = TID_IN_TCON; spin_unlock(&tcon->tc_lock); - rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc); + rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, nlsc); if (rc) { spin_lock(&tcon->tc_lock); if (tcon->status == TID_IN_TCON) diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index a9b6c3eba6de5..e70915ad75410 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -98,7 +98,7 @@ static struct cifs_ses *find_ipc_from_server_path(struct cifs_ses **ses, const c get_ipc_unc(path, unc, sizeof(unc)); for (; *ses; ses++) { - if (!strcasecmp(unc, (*ses)->tcon_ipc->treeName)) + if (!strcasecmp(unc, (*ses)->tcon_ipc->tree_name)) return *ses; } return ERR_PTR(-ENOENT); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 08f7392716e2f..f58869306309f 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -50,7 +50,7 @@ cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_s } if (add_treename) - dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); + dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1); else dfsplen = 0; @@ -59,7 +59,7 @@ cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_s return full_path; if (dfsplen) - memcpy(full_path, tcon->treeName, dfsplen); + memcpy(full_path, tcon->tree_name, dfsplen); full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb); memcpy(full_path + dfsplen + 1, ctx->prepath, pplen); convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); @@ -93,7 +93,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, return ERR_PTR(-ENOMEM); if (prefix) - dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE 
+ 1); + dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1); else dfsplen = 0; @@ -123,7 +123,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, } if (dfsplen) { s -= dfsplen; - memcpy(s, tcon->treeName, dfsplen); + memcpy(s, tcon->tree_name, dfsplen); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { int i; for (i = 0; i < dfsplen; i++) { diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 23ef56f55ce50..a1751b9563184 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -45,7 +45,7 @@ int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) memset(&key, 0, sizeof(key)); - sharename = extract_sharename(tcon->treeName); + sharename = extract_sharename(tcon->tree_name); if (IS_ERR(sharename)) { cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__); return -EINVAL; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index bac08c20f559b..3784d3a880536 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -913,7 +913,7 @@ cifs_set_fattr_ino(int xid, } else { /* make an ino by hashing the UNC */ fattr->cf_flags |= CIFS_FATTR_FAKE_ROOT_INO; - fattr->cf_uniqueid = simple_hashstr(tcon->treeName); + fattr->cf_uniqueid = simple_hashstr(tcon->tree_name); } } } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index c6679398fff9f..f42812e4c2cda 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -525,7 +525,7 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; cifs_sb->mnt_cifs_serverino_autodisabled = true; cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s\n", - tcon ? tcon->treeName : "new server"); + tcon ? tcon->tree_name : "new server"); cifs_dbg(VFS, "The server doesn't seem to support them properly or the files might be on different servers (DFS)\n"); cifs_dbg(VFS, "Hardlinks will not be recognized on this mount. 
Consider mounting with the \"noserverino\" option to silence this message.\n"); @@ -1328,7 +1328,7 @@ int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid, char *treename, *dfspath, sep; int treenamelen, linkpathlen, rc; - treename = tcon->treeName; + treename = tcon->tree_name; /* MS-DFSC: All paths in REQ_GET_DFS_REFERRAL and RESP_GET_DFS_REFERRAL * messages MUST be encoded with exactly one leading backslash, not two * leading backslashes. diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index b83f59051b26f..bb3e3d5a0cdac 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -379,7 +379,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_open_free(&rqst[0]); if (rc == -EREMCHG) { - pr_warn_once("server share %s deleted\n", tcon->treeName); + pr_warn_once("server share %s deleted\n", tcon->tree_name); tcon->need_reconnect = true; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 421be43af4253..f590a9cb6a1a2 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1327,7 +1327,7 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, CIFSMaxBufSize, (char **)&res_key, &ret_data_len); if (rc == -EOPNOTSUPP) { - pr_warn_once("Server share %s does not support copy range\n", tcon->treeName); + pr_warn_once("Server share %s does not support copy range\n", tcon->tree_name); goto req_res_key_exit; } else if (rc) { cifs_tcon_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc); @@ -2289,7 +2289,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) spin_unlock(&tcon->tc_lock); spin_unlock(&cifs_tcp_ses_lock); pr_warn_once("Server share %s deleted.\n", - tcon->treeName); + tcon->tree_name); return; } } @@ -2498,7 +2498,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, if (rc == -EREMCHG) { tcon->need_reconnect = true; pr_warn_once("server share %s deleted\n", - tcon->treeName); + tcon->tree_name); } goto qic_exit; } diff --git a/fs/cifs/smb2pdu.c 
b/fs/cifs/smb2pdu.c index 223056097b54f..90ccac18f9f3f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1930,7 +1930,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */ tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId); - strscpy(tcon->treeName, tree, sizeof(tcon->treeName)); + strscpy(tcon->tree_name, tree, sizeof(tcon->tree_name)); if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0)) @@ -1973,6 +1973,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if (!ses || !(ses->server)) return -EIO; + trace_smb3_tdis_enter(xid, tcon->tid, ses->Suid, tcon->tree_name); spin_lock(&ses->chan_lock); if ((tcon->need_reconnect) || (CIFS_ALL_CHANS_NEED_RECONNECT(tcon->ses))) { @@ -2004,8 +2005,11 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) rc = cifs_send_recv(xid, ses, ses->server, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); - if (rc) + if (rc) { cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE); + trace_smb3_tdis_err(xid, tcon->tid, ses->Suid, rc); + } + trace_smb3_tdis_done(xid, tcon->tid, ses->Suid); return rc; } @@ -2674,7 +2678,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; rc = alloc_path_with_tree_prefix(&copy_path, &copy_size, &name_len, - tcon->treeName, utf16_path); + tcon->tree_name, utf16_path); if (rc) goto err_free_req; @@ -2816,7 +2820,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; rc = alloc_path_with_tree_prefix(&copy_path, &copy_size, &name_len, - tcon->treeName, path); + tcon->tree_name, path); if (rc) return rc; req->NameLength = cpu_to_le16(name_len * 2); @@ -3011,7 +3015,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, 
oparms->create_options, oparms->desired_access, rc); if (rc == -EREMCHG) { pr_warn_once("server share %s deleted\n", - tcon->treeName); + tcon->tree_name); tcon->need_reconnect = true; } goto creat_exit; @@ -4429,7 +4433,7 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->bytes, wdata->result); if (wdata->result == -ENOSPC) pr_warn_once("Out of space writing to %s\n", - tcon->treeName); + tcon->tree_name); } else trace_smb3_write_done(0 /* no xid */, wdata->cfile->fid.persistent_fid, diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index 6b88dc2e364f5..110070ba8b04e 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -372,6 +372,7 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter); DECLARE_EVENT_CLASS(smb3_inf_compound_done_class, @@ -409,6 +410,7 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done); DECLARE_EVENT_CLASS(smb3_inf_compound_err_class, @@ -451,6 +453,7 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err); /* * For logging SMB3 Status code and Command for responses which return errors -- GitLab From aea6794e664a07324288f3d3484b950922baeebd Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg <lsahlber@redhat.com> Date: Wed, 31 Aug 2022 12:49:42 +1000 Subject: [PATCH 1342/2223] cifs: Make tcon contain a wrapper structure cached_fids instead of cached_fid This wrapper structure will later be expanded to contain a list of fids that are 
cached and not just the root fid. Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cached_dir.c | 50 ++++++++++++++++++++++++-------------------- fs/cifs/cached_dir.h | 8 +++++-- fs/cifs/cifsglob.h | 2 +- fs/cifs/misc.c | 6 +++--- 4 files changed, 37 insertions(+), 29 deletions(-) diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c index ca8d7cf2a1473..88d117ddb6308 100644 --- a/fs/cifs/cached_dir.c +++ b/fs/cifs/cached_dir.c @@ -52,7 +52,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, dentry = cifs_sb->root; - cfid = tcon->cfid; + cfid = &tcon->cfids->cfid; mutex_lock(&cfid->fid_mutex); if (cfid->is_valid) { cifs_dbg(FYI, "found a cached root file handle\n"); @@ -226,7 +226,7 @@ int open_cached_dir_by_dentry(struct cifs_tcon *tcon, { struct cached_fid *cfid; - cfid = tcon->cfid; + cfid = &tcon->cfids->cfid; mutex_lock(&cfid->fid_mutex); if (cfid->dentry == dentry) { @@ -320,7 +320,7 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) tcon = tlink_tcon(tlink); if (IS_ERR(tcon)) continue; - cfid = tcon->cfid; + cfid = &tcon->cfids->cfid; mutex_lock(&cfid->fid_mutex); if (cfid->dentry) { dput(cfid->dentry); @@ -336,12 +336,14 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) */ void invalidate_all_cached_dirs(struct cifs_tcon *tcon) { - mutex_lock(&tcon->cfid->fid_mutex); - tcon->cfid->is_valid = false; + struct cached_fid *cfid = &tcon->cfids->cfid; + + mutex_lock(&cfid->fid_mutex); + cfid->is_valid = false; /* cached handle is not valid, so SMB2_CLOSE won't be sent below */ - close_cached_dir_lease_locked(tcon->cfid); - memset(&tcon->cfid->fid, 0, sizeof(struct cifs_fid)); - mutex_unlock(&tcon->cfid->fid_mutex); + close_cached_dir_lease_locked(cfid); + memset(&cfid->fid, 0, sizeof(struct cifs_fid)); + mutex_unlock(&cfid->fid_mutex); } static void @@ -355,34 +357,36 @@ smb2_cached_lease_break(struct work_struct *work) int cached_dir_lease_break(struct 
cifs_tcon *tcon, __u8 lease_key[16]) { - if (tcon->cfid->is_valid && + struct cached_fid *cfid = &tcon->cfids->cfid; + + if (cfid->is_valid && !memcmp(lease_key, - tcon->cfid->fid.lease_key, + cfid->fid.lease_key, SMB2_LEASE_KEY_SIZE)) { - tcon->cfid->time = 0; - INIT_WORK(&tcon->cfid->lease_break, + cfid->time = 0; + INIT_WORK(&cfid->lease_break, smb2_cached_lease_break); queue_work(cifsiod_wq, - &tcon->cfid->lease_break); + &cfid->lease_break); return true; } return false; } -struct cached_fid *init_cached_dir(void) +struct cached_fids *init_cached_dirs(void) { - struct cached_fid *cfid; + struct cached_fids *cfids; - cfid = kzalloc(sizeof(*cfid), GFP_KERNEL); - if (!cfid) + cfids = kzalloc(sizeof(*cfids), GFP_KERNEL); + if (!cfids) return NULL; - INIT_LIST_HEAD(&cfid->dirents.entries); - mutex_init(&cfid->dirents.de_mutex); - mutex_init(&cfid->fid_mutex); - return cfid; + INIT_LIST_HEAD(&cfids->cfid.dirents.entries); + mutex_init(&cfids->cfid.dirents.de_mutex); + mutex_init(&cfids->cfid.fid_mutex); + return cfids; } -void free_cached_dir(struct cifs_tcon *tcon) +void free_cached_dirs(struct cached_fids *cfids) { - kfree(tcon->cfid); + kfree(cfids); } diff --git a/fs/cifs/cached_dir.h b/fs/cifs/cached_dir.h index bd262dc8b179a..e430e11022968 100644 --- a/fs/cifs/cached_dir.h +++ b/fs/cifs/cached_dir.h @@ -45,8 +45,12 @@ struct cached_fid { struct cached_dirents dirents; }; -extern struct cached_fid *init_cached_dir(void); -extern void free_cached_dir(struct cifs_tcon *tcon); +struct cached_fids { + struct cached_fid cfid; +}; + +extern struct cached_fids *init_cached_dirs(void); +extern void free_cached_dirs(struct cached_fids *cfids); extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, const char *path, struct cifs_sb_info *cifs_sb, diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ad606f648bdc8..338bc11f682ee 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1228,7 +1228,7 @@ struct cifs_tcon { struct fscache_volume 
*fscache; /* cookie for share */ #endif struct list_head pending_opens; /* list of incomplete opens */ - struct cached_fid *cfid; /* Cached root fid */ + struct cached_fids *cfids; /* BB add field for back pointer to sb struct(s)? */ #ifdef CONFIG_CIFS_DFS_UPCALL struct list_head ulist; /* cache update list */ diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index f42812e4c2cda..20a112c96bae5 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -117,8 +117,8 @@ tconInfoAlloc(void) ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL); if (!ret_buf) return NULL; - ret_buf->cfid = init_cached_dir(); - if (!ret_buf->cfid) { + ret_buf->cfids = init_cached_dirs(); + if (!ret_buf->cfids) { kfree(ret_buf); return NULL; } @@ -144,7 +144,7 @@ tconInfoFree(struct cifs_tcon *tcon) cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n"); return; } - free_cached_dir(tcon); + free_cached_dirs(tcon->cfids); atomic_dec(&tconInfoAllocCount); kfree(tcon->nativeFileSystem); kfree_sensitive(tcon->password); -- GitLab From 47fc2491e108f253cf963c50acc59a74d34c7f2b Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg <lsahlber@redhat.com> Date: Wed, 31 Aug 2022 12:49:43 +1000 Subject: [PATCH 1343/2223] cifs: improve handlecaching Only track the dentry for the root handle Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cached_dir.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c index 88d117ddb6308..211f630cd8760 100644 --- a/fs/cifs/cached_dir.c +++ b/fs/cifs/cached_dir.c @@ -47,11 +47,11 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, if (cifs_sb->root == NULL) return -ENOENT; - if (strlen(path)) + if (!path[0]) + dentry = cifs_sb->root; + else return -ENOENT; - dentry = cifs_sb->root; - cfid = &tcon->cfids->cfid; mutex_lock(&cfid->fid_mutex); if (cfid->is_valid) { @@ -177,7 +177,8 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, 
cfid->tcon = tcon; cfid->is_valid = true; cfid->dentry = dentry; - dget(dentry); + if (dentry) + dget(dentry); kref_init(&cfid->refcount); /* BB TBD check to see if oplock level check can be removed below */ -- GitLab From 30f8f37147bc9af794b89e37d42fc858f201e5b0 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg <lsahlber@redhat.com> Date: Wed, 31 Aug 2022 12:49:44 +1000 Subject: [PATCH 1344/2223] cifs: store a pointer to a fid in the cfid structure instead of the struct also create a constructor that takes a path name and stores it in the fid. Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cached_dir.c | 63 ++++++++++++++++++++++++++++++++++++++------ fs/cifs/cached_dir.h | 4 ++- 2 files changed, 58 insertions(+), 9 deletions(-) diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c index 211f630cd8760..b705dac383f9f 100644 --- a/fs/cifs/cached_dir.c +++ b/fs/cifs/cached_dir.c @@ -11,6 +11,8 @@ #include "smb2proto.h" #include "cached_dir.h" +struct cached_fid *init_cached_dir(const char *path); + /* * Open the and cache a directory handle. * If error then *cfid is not initialized. 
@@ -52,7 +54,14 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, else return -ENOENT; - cfid = &tcon->cfids->cfid; + cfid = tcon->cfids->cfid; + if (cfid == NULL) { + cfid = init_cached_dir(path); + tcon->cfids->cfid = cfid; + } + if (cfid == NULL) + return -ENOMEM; + mutex_lock(&cfid->fid_mutex); if (cfid->is_valid) { cifs_dbg(FYI, "found a cached root file handle\n"); @@ -227,7 +236,9 @@ int open_cached_dir_by_dentry(struct cifs_tcon *tcon, { struct cached_fid *cfid; - cfid = &tcon->cfids->cfid; + cfid = tcon->cfids->cfid; + if (cfid == NULL) + return -ENOENT; mutex_lock(&cfid->fid_mutex); if (cfid->dentry == dentry) { @@ -321,7 +332,9 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) tcon = tlink_tcon(tlink); if (IS_ERR(tcon)) continue; - cfid = &tcon->cfids->cfid; + cfid = tcon->cfids->cfid; + if (cfid == NULL) + continue; mutex_lock(&cfid->fid_mutex); if (cfid->dentry) { dput(cfid->dentry); @@ -337,7 +350,10 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) */ void invalidate_all_cached_dirs(struct cifs_tcon *tcon) { - struct cached_fid *cfid = &tcon->cfids->cfid; + struct cached_fid *cfid = tcon->cfids->cfid; + + if (cfid == NULL) + return; mutex_lock(&cfid->fid_mutex); cfid->is_valid = false; @@ -358,7 +374,10 @@ smb2_cached_lease_break(struct work_struct *work) int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]) { - struct cached_fid *cfid = &tcon->cfids->cfid; + struct cached_fid *cfid = tcon->cfids->cfid; + + if (cfid == NULL) + return false; if (cfid->is_valid && !memcmp(lease_key, @@ -374,6 +393,32 @@ int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]) return false; } +struct cached_fid *init_cached_dir(const char *path) +{ + struct cached_fid *cfid; + + cfid = kzalloc(sizeof(*cfid), GFP_KERNEL); + if (!cfid) + return NULL; + cfid->path = kstrdup(path, GFP_KERNEL); + if (!cfid->path) { + kfree(cfid); + return NULL; + } + + INIT_LIST_HEAD(&cfid->dirents.entries); + 
mutex_init(&cfid->dirents.de_mutex); + mutex_init(&cfid->fid_mutex); + return cfid; +} + +void free_cached_dir(struct cached_fid *cfid) +{ + kfree(cfid->path); + cfid->path = NULL; + kfree(cfid); +} + struct cached_fids *init_cached_dirs(void) { struct cached_fids *cfids; @@ -381,13 +426,15 @@ struct cached_fids *init_cached_dirs(void) cfids = kzalloc(sizeof(*cfids), GFP_KERNEL); if (!cfids) return NULL; - INIT_LIST_HEAD(&cfids->cfid.dirents.entries); - mutex_init(&cfids->cfid.dirents.de_mutex); - mutex_init(&cfids->cfid.fid_mutex); + mutex_init(&cfids->cfid_list_mutex); return cfids; } void free_cached_dirs(struct cached_fids *cfids) { + if (cfids->cfid) { + free_cached_dir(cfids->cfid); + cfids->cfid = NULL; + } kfree(cfids); } diff --git a/fs/cifs/cached_dir.h b/fs/cifs/cached_dir.h index e430e11022968..bdf6c3866653b 100644 --- a/fs/cifs/cached_dir.h +++ b/fs/cifs/cached_dir.h @@ -31,6 +31,7 @@ struct cached_dirents { }; struct cached_fid { + const char *path; bool is_valid:1; /* Do we have a useable root fid */ bool file_all_info_is_valid:1; bool has_lease:1; @@ -46,7 +47,8 @@ struct cached_fid { }; struct cached_fids { - struct cached_fid cfid; + struct mutex cfid_list_mutex; + struct cached_fid *cfid; }; extern struct cached_fids *init_cached_dirs(void); -- GitLab From 3afdfb0dd4baed45b7010e672e44c21fa790bace Mon Sep 17 00:00:00 2001 From: Steve French <stfrench@microsoft.com> Date: Sat, 1 Oct 2022 22:52:20 -0500 Subject: [PATCH 1345/2223] smb3: define missing create contexts Update the list of create contexts to include the three more recent ones and the one used for mounts to Macs. 
Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/smbfs_common/smb2pdu.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index 2cab413fffeea..7d605db3bb3b9 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -1101,7 +1101,11 @@ struct smb2_change_notify_rsp { #define SMB2_CREATE_REQUEST_LEASE "RqLs" #define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" #define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" -#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C" +#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C" +#define SMB2_CREATE_APP_INSTANCE_ID "\x45\xBC\xA6\x6A\xEF\xA7\xF7\x4A\x90\x08\xFA\x46\x2E\x14\x4D\x74" +#define SMB2_CREATE_APP_INSTANCE_VERSION "\xB9\x82\xD0\xB7\x3B\x56\x07\x4F\xA0\x7B\x52\x4A\x81\x16\xA0\x10" +#define SVHDX_OPEN_DEVICE_CONTEXT "\x9C\xCB\xCF\x9E\x04\xC1\xE6\x43\x98\x0E\x15\x8D\xA1\xF6\xEC\x83" +#define SMB2_CREATE_TAG_AAPL "AAPL" /* Flag (SMB3 open response) values */ #define SMB2_CREATE_FLAG_REPARSEPOINT 0x01 -- GitLab From 2eb2756f6c9e9621e022d78321ce40a62c4520b5 Mon Sep 17 00:00:00 2001 From: Alexander Aring <aahringo@redhat.com> Date: Tue, 4 Oct 2022 21:47:49 -0400 Subject: [PATCH 1346/2223] Revert "net/ieee802154: reject zero-sized raw_sendmsg()" This reverts commit 3a4d061c699bd3eedc80dc97a4b2a2e1af83c6f5. There is a v2 which does return zero if zero length is given. 
Signed-off-by: Alexander Aring <aahringo@redhat.com> Link: https://lore.kernel.org/r/20221005014750.3685555-1-aahringo@redhat.com Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org> --- net/ieee802154/socket.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index cbd0e2ac4ffe9..7889e1ef7fad6 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -251,9 +251,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) return -EOPNOTSUPP; } - if (!size) - return -EINVAL; - lock_sock(sk); if (!sk->sk_bound_dev_if) dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154); -- GitLab From b12e924a2f5b960373459c8f8a514f887adf5cac Mon Sep 17 00:00:00 2001 From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Date: Tue, 4 Oct 2022 21:47:50 -0400 Subject: [PATCH 1347/2223] net/ieee802154: don't warn zero-sized raw_sendmsg() syzbot is hitting skb_assert_len() warning at __dev_queue_xmit() [1], for PF_IEEE802154 socket's zero-sized raw_sendmsg() request is hitting __dev_queue_xmit() with skb->len == 0. Since PF_IEEE802154 socket's zero-sized raw_sendmsg() request was able to return 0, don't call __dev_queue_xmit() if packet length is 0. ---------- #include <sys/socket.h> #include <netinet/in.h> int main(int argc, char *argv[]) { struct sockaddr_in addr = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_LOOPBACK) }; struct iovec iov = { }; struct msghdr hdr = { .msg_name = &addr, .msg_namelen = sizeof(addr), .msg_iov = &iov, .msg_iovlen = 1 }; sendmsg(socket(PF_IEEE802154, SOCK_RAW, 0), &hdr, 0); return 0; } ---------- Note that this might be a sign that commit fd1894224407c484 ("bpf: Don't redirect packets with invalid pkt_len") should be reverted, for skb->len == 0 was acceptable for at least PF_IEEE802154 socket. 
Link: https://syzkaller.appspot.com/bug?extid=5ea725c25d06fb9114c4 [1] Reported-by: syzbot <syzbot+5ea725c25d06fb9114c4@syzkaller.appspotmail.com> Fixes: fd1894224407c484 ("bpf: Don't redirect packets with invalid pkt_len") Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Signed-off-by: Alexander Aring <aahringo@redhat.com> Link: https://lore.kernel.org/r/20221005014750.3685555-2-aahringo@redhat.com Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org> --- net/ieee802154/socket.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 7889e1ef7fad6..6e55fae4c6860 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -272,6 +272,10 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) err = -EMSGSIZE; goto out_dev; } + if (!size) { + err = 0; + goto out_dev; + } hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; -- GitLab From 417b9c51f59734d852e47252476fadc293ad994a Mon Sep 17 00:00:00 2001 From: Callum Osmotherly <callum.osmotherly@gmail.com> Date: Wed, 5 Oct 2022 17:44:16 +1030 Subject: [PATCH 1348/2223] ALSA: hda/realtek: remove ALC289_FIXUP_DUAL_SPK for Dell 5530 After some feedback from users with Dell Precision 5530 machines, this patch reverts the previous change to add ALC289_FIXUP_DUAL_SPK. While it improved the speaker output quality, it caused the headphone jack to have an audible "pop" sound when power saving was toggled. 
Fixes: 1885ff13d4c4 ("ALSA: hda/realtek: Enable 4-speaker output Dell Precision 5530 laptop") Signed-off-by: Callum Osmotherly <callum.osmotherly@gmail.com> Cc: <stable@vger.kernel.org> Link: https://lore.kernel.org/r/Yz0uyN1zwZhnyRD6@piranha Signed-off-by: Takashi Iwai <tiwai@suse.de> --- sound/pci/hda/patch_realtek.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index bce82b834cec7..d89f95ae0efc7 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9198,7 +9198,6 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0871, "Dell Precision 3630", ALC255_FIXUP_DELL_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0872, "Dell Precision 3630", ALC255_FIXUP_DELL_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0873, "Dell Precision 3930", ALC255_FIXUP_DUMMY_LINEOUT_VERB), - SND_PCI_QUIRK(0x1028, 0x087d, "Dell Precision 5530", ALC289_FIXUP_DUAL_SPK), SND_PCI_QUIRK(0x1028, 0x08ad, "Dell WYSE AIO", ALC225_FIXUP_DELL_WYSE_AIO_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x08ae, "Dell WYSE NB", ALC225_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x0935, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB), -- GitLab From 19619b43f0319c7a0564f6ff35aca5f62e7cb118 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Date: Wed, 14 Sep 2022 13:23:42 +0530 Subject: [PATCH 1349/2223] PCI: qcom-ep: Disable IRQs during driver remove Disable the Global and PERST IRQs during driver remove to avoid getting spurious IRQs after resource deallocation. 
Link: https://lore.kernel.org/r/20220914075350.7992-5-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org> --- drivers/pci/controller/dwc/pcie-qcom-ep.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c index 51afd9c547f5a..d7a8dd0533b0b 100644 --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -581,13 +581,13 @@ static irqreturn_t qcom_pcie_ep_perst_irq_thread(int irq, void *data) static int qcom_pcie_ep_enable_irq_resources(struct platform_device *pdev, struct qcom_pcie_ep *pcie_ep) { - int irq, ret; + int ret; - irq = platform_get_irq_byname(pdev, "global"); - if (irq < 0) - return irq; + pcie_ep->global_irq = platform_get_irq_byname(pdev, "global"); + if (pcie_ep->global_irq < 0) + return pcie_ep->global_irq; - ret = devm_request_threaded_irq(&pdev->dev, irq, NULL, + ret = devm_request_threaded_irq(&pdev->dev, pcie_ep->global_irq, NULL, qcom_pcie_ep_global_irq_thread, IRQF_ONESHOT, "global_irq", pcie_ep); @@ -604,7 +604,7 @@ static int qcom_pcie_ep_enable_irq_resources(struct platform_device *pdev, "perst_irq", pcie_ep); if (ret) { dev_err(&pdev->dev, "Failed to request PERST IRQ\n"); - disable_irq(irq); + disable_irq(pcie_ep->global_irq); return ret; } @@ -702,6 +702,9 @@ static int qcom_pcie_ep_remove(struct platform_device *pdev) { struct qcom_pcie_ep *pcie_ep = platform_get_drvdata(pdev); + disable_irq(pcie_ep->global_irq); + disable_irq(pcie_ep->perst_irq); + if (pcie_ep->link_status == QCOM_PCIE_EP_LINK_DISABLED) return 0; -- GitLab From 6dbba2b53c3bcbbee849d2fa8cf6acc973ab2e81 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Date: Wed, 14 Sep 2022 13:23:43 +0530 Subject: [PATCH 1350/2223] PCI: qcom-ep: Expose link transition counts via debugfs 
Qualcomm PCIe controllers have debug registers in the MMIO region that count PCIe link transitions. Expose them over debugfs to userspace to help debug the low power issues. Link: https://lore.kernel.org/r/20220914075350.7992-6-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org> --- drivers/pci/controller/dwc/pcie-qcom-ep.c | 60 +++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c index d7a8dd0533b0b..d4f2437ba735b 100644 --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -10,6 +10,7 @@ */ #include <linux/clk.h> +#include <linux/debugfs.h> #include <linux/delay.h> #include <linux/gpio/consumer.h> #include <linux/mfd/syscon.h> @@ -45,6 +46,11 @@ #define PARF_ATU_BASE_ADDR 0x634 #define PARF_ATU_BASE_ADDR_HI 0x638 #define PARF_SRIS_MODE 0x644 +#define PARF_DEBUG_CNT_PM_LINKST_IN_L2 0xc04 +#define PARF_DEBUG_CNT_PM_LINKST_IN_L1 0xc0c +#define PARF_DEBUG_CNT_PM_LINKST_IN_L0S 0xc10 +#define PARF_DEBUG_CNT_AUX_CLK_IN_L1SUB_L1 0xc84 +#define PARF_DEBUG_CNT_AUX_CLK_IN_L1SUB_L2 0xc88 #define PARF_DEVICE_TYPE 0x1000 #define PARF_BDF_TO_SID_CFG 0x2c00 @@ -135,12 +141,14 @@ enum qcom_pcie_ep_link_status { * @pci: Designware PCIe controller struct * @parf: Qualcomm PCIe specific PARF register base * @elbi: Designware PCIe specific ELBI register base + * @mmio: MMIO register base * @perst_map: PERST regmap * @mmio_res: MMIO region resource * @core_reset: PCIe Endpoint core reset * @reset: PERST# GPIO * @wake: WAKE# GPIO * @phy: PHY controller block + * @debugfs: PCIe Endpoint Debugfs directory * @clks: PCIe clocks * @num_clks: PCIe clocks count * @perst_en: Flag for PERST enable @@ -154,6 +162,7 @@ struct qcom_pcie_ep { void __iomem *parf; void __iomem *elbi; + void __iomem *mmio; struct regmap *perst_map; struct resource 
*mmio_res; @@ -161,6 +170,7 @@ struct qcom_pcie_ep { struct gpio_desc *reset; struct gpio_desc *wake; struct phy *phy; + struct dentry *debugfs; struct clk_bulk_data *clks; int num_clks; @@ -446,6 +456,9 @@ static int qcom_pcie_ep_get_io_resources(struct platform_device *pdev, pcie_ep->mmio_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "mmio"); + pcie_ep->mmio = devm_pci_remap_cfg_resource(dev, pcie_ep->mmio_res); + if (IS_ERR(pcie_ep->mmio)) + return PTR_ERR(pcie_ep->mmio); syscon = of_parse_phandle(dev->of_node, "qcom,perst-regs", 0); if (!syscon) { @@ -627,6 +640,37 @@ static int qcom_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no, } } +static int qcom_pcie_ep_link_transition_count(struct seq_file *s, void *data) +{ + struct qcom_pcie_ep *pcie_ep = (struct qcom_pcie_ep *) + dev_get_drvdata(s->private); + + seq_printf(s, "L0s transition count: %u\n", + readl_relaxed(pcie_ep->mmio + PARF_DEBUG_CNT_PM_LINKST_IN_L0S)); + + seq_printf(s, "L1 transition count: %u\n", + readl_relaxed(pcie_ep->mmio + PARF_DEBUG_CNT_PM_LINKST_IN_L1)); + + seq_printf(s, "L1.1 transition count: %u\n", + readl_relaxed(pcie_ep->mmio + PARF_DEBUG_CNT_AUX_CLK_IN_L1SUB_L1)); + + seq_printf(s, "L1.2 transition count: %u\n", + readl_relaxed(pcie_ep->mmio + PARF_DEBUG_CNT_AUX_CLK_IN_L1SUB_L2)); + + seq_printf(s, "L2 transition count: %u\n", + readl_relaxed(pcie_ep->mmio + PARF_DEBUG_CNT_PM_LINKST_IN_L2)); + + return 0; +} + +static void qcom_pcie_ep_init_debugfs(struct qcom_pcie_ep *pcie_ep) +{ + struct dw_pcie *pci = &pcie_ep->pci; + + debugfs_create_devm_seqfile(pci->dev, "link_transition_count", pcie_ep->debugfs, + qcom_pcie_ep_link_transition_count); +} + static const struct pci_epc_features qcom_pcie_epc_features = { .linkup_notifier = true, .core_init_notifier = true, @@ -659,6 +703,7 @@ static int qcom_pcie_ep_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct qcom_pcie_ep *pcie_ep; + char *name; int ret; pcie_ep = devm_kzalloc(dev, 
sizeof(*pcie_ep), GFP_KERNEL); @@ -690,8 +735,21 @@ static int qcom_pcie_ep_probe(struct platform_device *pdev) if (ret) goto err_disable_resources; + name = devm_kasprintf(dev, GFP_KERNEL, "%pOFP", dev->of_node); + if (!name) { + ret = -ENOMEM; + goto err_disable_irqs; + } + + pcie_ep->debugfs = debugfs_create_dir(name, NULL); + qcom_pcie_ep_init_debugfs(pcie_ep); + return 0; +err_disable_irqs: + disable_irq(pcie_ep->global_irq); + disable_irq(pcie_ep->perst_irq); + err_disable_resources: qcom_pcie_disable_resources(pcie_ep); @@ -705,6 +763,8 @@ static int qcom_pcie_ep_remove(struct platform_device *pdev) disable_irq(pcie_ep->global_irq); disable_irq(pcie_ep->perst_irq); + debugfs_remove_recursive(pcie_ep->debugfs); + if (pcie_ep->link_status == QCOM_PCIE_EP_LINK_DISABLED) return 0; -- GitLab From c457ac029e443faa5886f59f849e94701375b80f Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Date: Wed, 14 Sep 2022 13:23:44 +0530 Subject: [PATCH 1351/2223] PCI: qcom-ep: Gate Master AXI clock to MHI bus during L1SS During L1SS, gate the Master clock supplied to the MHI bus to save power. 
Link: https://lore.kernel.org/r/20220914075350.7992-7-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org> --- drivers/pci/controller/dwc/pcie-qcom-ep.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c index d4f2437ba735b..5502e627e4828 100644 --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -27,6 +27,7 @@ #define PARF_SYS_CTRL 0x00 #define PARF_DB_CTRL 0x10 #define PARF_PM_CTRL 0x20 +#define PARF_MHI_CLOCK_RESET_CTRL 0x174 #define PARF_MHI_BASE_ADDR_LOWER 0x178 #define PARF_MHI_BASE_ADDR_UPPER 0x17c #define PARF_DEBUG_INT_EN 0x190 @@ -89,6 +90,9 @@ #define PARF_PM_CTRL_READY_ENTR_L23 BIT(2) #define PARF_PM_CTRL_REQ_NOT_ENTR_L1 BIT(5) +/* PARF_MHI_CLOCK_RESET_CTRL fields */ +#define PARF_MSTR_AXI_CLK_EN BIT(1) + /* PARF_AXI_MSTR_RD_HALT_NO_WRITES register fields */ #define PARF_AXI_MSTR_RD_HALT_NO_WRITE_EN BIT(0) @@ -394,6 +398,11 @@ static int qcom_pcie_perst_deassert(struct dw_pcie *pci) pcie_ep->parf + PARF_MHI_BASE_ADDR_LOWER); writel_relaxed(0, pcie_ep->parf + PARF_MHI_BASE_ADDR_UPPER); + /* Gate Master AXI clock to MHI bus during L1SS */ + val = readl_relaxed(pcie_ep->parf + PARF_MHI_CLOCK_RESET_CTRL); + val &= ~PARF_MSTR_AXI_CLK_EN; + val = readl_relaxed(pcie_ep->parf + PARF_MHI_CLOCK_RESET_CTRL); + dw_pcie_ep_init_notify(&pcie_ep->pci.ep); /* Enable LTSSM */ -- GitLab From 8d0d254b15cc5b7d46d85fb7ab8ecede9575e672 Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@kernel.org> Date: Fri, 30 Sep 2022 16:56:02 -0400 Subject: [PATCH 1352/2223] nfsd: fix nfsd_file_unhash_and_dispose nfsd_file_unhash_and_dispose() is called for two reasons: We're either shutting down and purging the filecache, or we've gotten a notification about a file delete, so we want to go ahead and unhash it so that it'll get cleaned up 
when we close. We're either walking the hashtable or doing a lookup in it and we don't take a reference in either case. What we want to do in both cases is to try and unhash the object and put it on the dispose list if that was successful. If it's no longer hashed, then we don't want to touch it, with the assumption being that something else is already cleaning up the sentinel reference. Instead of trying to selectively decrement the refcount in this function, just unhash it, and if that was successful, move it to the dispose list. Then, the disposal routine will just clean that up as usual. Also, just make this a void function, drop the WARN_ON_ONCE, and the comments about deadlocking since the nature of the purported deadlock is no longer clear. Signed-off-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: Chuck Lever <chuck.lever@oracle.com> --- fs/nfsd/filecache.c | 36 +++++++----------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index d5c57360b4182..640a3c52c0565 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -405,22 +405,15 @@ nfsd_file_unhash(struct nfsd_file *nf) return false; } -/* - * Return true if the file was unhashed. - */ -static bool +static void nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose) { trace_nfsd_file_unhash_and_dispose(nf); - if (!nfsd_file_unhash(nf)) - return false; - /* keep final reference for nfsd_file_lru_dispose */ - if (refcount_dec_not_one(&nf->nf_ref)) - return true; - - nfsd_file_lru_remove(nf); - list_add(&nf->nf_lru, dispose); - return true; + if (nfsd_file_unhash(nf)) { + /* caller must call nfsd_file_dispose_list() later */ + nfsd_file_lru_remove(nf); + list_add(&nf->nf_lru, dispose); + } } static void @@ -562,8 +555,6 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose) * @lock: LRU list lock (unused) * @arg: dispose list * - * Note this can deadlock with nfsd_file_cache_purge. 
- * * Return values: * %LRU_REMOVED: @item was removed from the LRU * %LRU_ROTATE: @item is to be moved to the LRU tail @@ -748,8 +739,6 @@ nfsd_file_close_inode(struct inode *inode) * * Walk the LRU list and close any entries that have not been used since * the last scan. - * - * Note this can deadlock with nfsd_file_cache_purge. */ static void nfsd_file_delayed_close(struct work_struct *work) @@ -891,16 +880,12 @@ out_err: goto out; } -/* - * Note this can deadlock with nfsd_file_lru_cb. - */ static void __nfsd_file_cache_purge(struct net *net) { struct rhashtable_iter iter; struct nfsd_file *nf; LIST_HEAD(dispose); - bool del; rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter); do { @@ -910,14 +895,7 @@ __nfsd_file_cache_purge(struct net *net) while (!IS_ERR_OR_NULL(nf)) { if (net && nf->nf_net != net) continue; - del = nfsd_file_unhash_and_dispose(nf, &dispose); - - /* - * Deadlock detected! Something marked this entry as - * unhased, but hasn't removed it from the hash list. - */ - WARN_ON_ONCE(!del); - + nfsd_file_unhash_and_dispose(nf, &dispose); nf = rhashtable_walk_next(&iter); } -- GitLab From 243a5263014a30436c93ed3f1f864c1da845455e Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@kernel.org> Date: Tue, 4 Oct 2022 15:41:10 -0400 Subject: [PATCH 1353/2223] nfsd: rework hashtable handling in nfsd_do_file_acquire nfsd_file is RCU-freed, so we need to hold the rcu_read_lock long enough to get a reference after finding it in the hash. Take the rcu_read_lock() and call rhashtable_lookup directly. Switch to using rhashtable_lookup_insert_key as well, and use the usual retry mechanism if we hit an -EEXIST. Rename the "retry" bool to open_retry, and eliminate the insert_err goto target.
Signed-off-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: Chuck Lever <chuck.lever@oracle.com> --- fs/nfsd/filecache.c | 52 +++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 640a3c52c0565..29a62db155fba 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1042,9 +1042,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, .need = may_flags & NFSD_FILE_MAY_MASK, .net = SVC_NET(rqstp), }; - struct nfsd_file *nf, *new; - bool retry = true; + bool open_retry = true; + struct nfsd_file *nf; __be32 status; + int ret; status = fh_verify(rqstp, fhp, S_IFREG, may_flags|NFSD_MAY_OWNER_OVERRIDE); @@ -1054,35 +1055,33 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, key.cred = get_current_cred(); retry: - /* Avoid allocation if the item is already in cache */ - nf = rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key, - nfsd_file_rhash_params); + rcu_read_lock(); + nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, + nfsd_file_rhash_params); if (nf) nf = nfsd_file_get(nf); + rcu_read_unlock(); if (nf) goto wait_for_construction; - new = nfsd_file_alloc(&key, may_flags); - if (!new) { + nf = nfsd_file_alloc(&key, may_flags); + if (!nf) { status = nfserr_jukebox; goto out_status; } - nf = rhashtable_lookup_get_insert_key(&nfsd_file_rhash_tbl, - &key, &new->nf_rhash, - nfsd_file_rhash_params); - if (!nf) { - nf = new; - goto open_file; - } - if (IS_ERR(nf)) - goto insert_err; - nf = nfsd_file_get(nf); - if (nf == NULL) { - nf = new; + ret = rhashtable_lookup_insert_key(&nfsd_file_rhash_tbl, + &key, &nf->nf_rhash, + nfsd_file_rhash_params); + if (likely(ret == 0)) goto open_file; - } - nfsd_file_slab_free(&new->nf_rcu); + + nfsd_file_slab_free(&nf->nf_rcu); + if (ret == -EEXIST) + goto retry; + trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, ret); + status = nfserr_jukebox; + goto out_status; wait_for_construction: 
wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE); @@ -1090,11 +1089,11 @@ wait_for_construction: /* Did construction of this file fail? */ if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { trace_nfsd_file_cons_err(rqstp, key.inode, may_flags, nf); - if (!retry) { + if (!open_retry) { status = nfserr_jukebox; goto out; } - retry = false; + open_retry = false; nfsd_file_put_noref(nf); goto retry; } @@ -1142,13 +1141,6 @@ open_file: smp_mb__after_atomic(); wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING); goto out; - -insert_err: - nfsd_file_slab_free(&new->nf_rcu); - trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, PTR_ERR(nf)); - nf = NULL; - status = nfserr_jukebox; - goto out_status; } /** -- GitLab From 9e2a03173d1b4544c1113059e61e3caa7ce5e3a4 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas <bhelgaas@google.com> Date: Tue, 4 Oct 2022 21:58:07 -0500 Subject: [PATCH 1354/2223] PCI/ASPM: Factor out L1 PM Substates configuration Move L1 PM Substates configuration from pcie_aspm_cap_init() to a new aspm_l1ss_init() function. No functional change intended. 
Link: https://lore.kernel.org/r/20221005025809.2247547-2-helgaas@kernel.org Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com> --- drivers/pci/pcie/aspm.c | 103 +++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 48 deletions(-) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 016d222b07c74..4535228e4a64f 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -554,13 +554,65 @@ static void aspm_calc_l1ss_info(struct pcie_link_state *link, aspm_program_l1ss(child, cctl1, ctl2); } +static void aspm_l1ss_init(struct pcie_link_state *link) +{ + struct pci_dev *child = link->downstream, *parent = link->pdev; + u32 parent_l1ss_cap, child_l1ss_cap; + u32 parent_l1ss_ctl1 = 0, child_l1ss_ctl1 = 0; + + /* Setup L1 substate */ + pci_read_config_dword(parent, parent->l1ss + PCI_L1SS_CAP, + &parent_l1ss_cap); + pci_read_config_dword(child, child->l1ss + PCI_L1SS_CAP, + &child_l1ss_cap); + + if (!(parent_l1ss_cap & PCI_L1SS_CAP_L1_PM_SS)) + parent_l1ss_cap = 0; + if (!(child_l1ss_cap & PCI_L1SS_CAP_L1_PM_SS)) + child_l1ss_cap = 0; + + /* + * If we don't have LTR for the entire path from the Root Complex + * to this device, we can't use ASPM L1.2 because it relies on the + * LTR_L1.2_THRESHOLD. See PCIe r4.0, secs 5.5.4, 6.18. 
+ */ + if (!child->ltr_path) + child_l1ss_cap &= ~PCI_L1SS_CAP_ASPM_L1_2; + + if (parent_l1ss_cap & child_l1ss_cap & PCI_L1SS_CAP_ASPM_L1_1) + link->aspm_support |= ASPM_STATE_L1_1; + if (parent_l1ss_cap & child_l1ss_cap & PCI_L1SS_CAP_ASPM_L1_2) + link->aspm_support |= ASPM_STATE_L1_2; + if (parent_l1ss_cap & child_l1ss_cap & PCI_L1SS_CAP_PCIPM_L1_1) + link->aspm_support |= ASPM_STATE_L1_1_PCIPM; + if (parent_l1ss_cap & child_l1ss_cap & PCI_L1SS_CAP_PCIPM_L1_2) + link->aspm_support |= ASPM_STATE_L1_2_PCIPM; + + if (parent_l1ss_cap) + pci_read_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + &parent_l1ss_ctl1); + if (child_l1ss_cap) + pci_read_config_dword(child, child->l1ss + PCI_L1SS_CTL1, + &child_l1ss_ctl1); + + if (parent_l1ss_ctl1 & child_l1ss_ctl1 & PCI_L1SS_CTL1_ASPM_L1_1) + link->aspm_enabled |= ASPM_STATE_L1_1; + if (parent_l1ss_ctl1 & child_l1ss_ctl1 & PCI_L1SS_CTL1_ASPM_L1_2) + link->aspm_enabled |= ASPM_STATE_L1_2; + if (parent_l1ss_ctl1 & child_l1ss_ctl1 & PCI_L1SS_CTL1_PCIPM_L1_1) + link->aspm_enabled |= ASPM_STATE_L1_1_PCIPM; + if (parent_l1ss_ctl1 & child_l1ss_ctl1 & PCI_L1SS_CTL1_PCIPM_L1_2) + link->aspm_enabled |= ASPM_STATE_L1_2_PCIPM; + + if (link->aspm_support & ASPM_STATE_L1SS) + aspm_calc_l1ss_info(link, parent_l1ss_cap, child_l1ss_cap); +} + static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) { struct pci_dev *child = link->downstream, *parent = link->pdev; u32 parent_lnkcap, child_lnkcap; u16 parent_lnkctl, child_lnkctl; - u32 parent_l1ss_cap, child_l1ss_cap; - u32 parent_l1ss_ctl1 = 0, child_l1ss_ctl1 = 0; struct pci_bus *linkbus = parent->subordinate; if (blacklist) { @@ -615,52 +667,7 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) if (parent_lnkctl & child_lnkctl & PCI_EXP_LNKCTL_ASPM_L1) link->aspm_enabled |= ASPM_STATE_L1; - /* Setup L1 substate */ - pci_read_config_dword(parent, parent->l1ss + PCI_L1SS_CAP, - &parent_l1ss_cap); - pci_read_config_dword(child, child->l1ss + 
PCI_L1SS_CAP, - &child_l1ss_cap); - - if (!(parent_l1ss_cap & PCI_L1SS_CAP_L1_PM_SS)) - parent_l1ss_cap = 0; - if (!(child_l1ss_cap & PCI_L1SS_CAP_L1_PM_SS)) - child_l1ss_cap = 0; - - /* - * If we don't have LTR for the entire path from the Root Complex - * to this device, we can't use ASPM L1.2 because it relies on the - * LTR_L1.2_THRESHOLD. See PCIe r4.0, secs 5.5.4, 6.18. - */ - if (!child->ltr_path) - child_l1ss_cap &= ~PCI_L1SS_CAP_ASPM_L1_2; - - if (parent_l1ss_cap & child_l1ss_cap & PCI_L1SS_CAP_ASPM_L1_1) - link->aspm_support |= ASPM_STATE_L1_1; - if (parent_l1ss_cap & child_l1ss_cap & PCI_L1SS_CAP_ASPM_L1_2) - link->aspm_support |= ASPM_STATE_L1_2; - if (parent_l1ss_cap & child_l1ss_cap & PCI_L1SS_CAP_PCIPM_L1_1) - link->aspm_support |= ASPM_STATE_L1_1_PCIPM; - if (parent_l1ss_cap & child_l1ss_cap & PCI_L1SS_CAP_PCIPM_L1_2) - link->aspm_support |= ASPM_STATE_L1_2_PCIPM; - - if (parent_l1ss_cap) - pci_read_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, - &parent_l1ss_ctl1); - if (child_l1ss_cap) - pci_read_config_dword(child, child->l1ss + PCI_L1SS_CTL1, - &child_l1ss_ctl1); - - if (parent_l1ss_ctl1 & child_l1ss_ctl1 & PCI_L1SS_CTL1_ASPM_L1_1) - link->aspm_enabled |= ASPM_STATE_L1_1; - if (parent_l1ss_ctl1 & child_l1ss_ctl1 & PCI_L1SS_CTL1_ASPM_L1_2) - link->aspm_enabled |= ASPM_STATE_L1_2; - if (parent_l1ss_ctl1 & child_l1ss_ctl1 & PCI_L1SS_CTL1_PCIPM_L1_1) - link->aspm_enabled |= ASPM_STATE_L1_1_PCIPM; - if (parent_l1ss_ctl1 & child_l1ss_ctl1 & PCI_L1SS_CTL1_PCIPM_L1_2) - link->aspm_enabled |= ASPM_STATE_L1_2_PCIPM; - - if (link->aspm_support & ASPM_STATE_L1SS) - aspm_calc_l1ss_info(link, parent_l1ss_cap, child_l1ss_cap); + aspm_l1ss_init(link); /* Save default state */ link->aspm_default = link->aspm_enabled; -- GitLab From cfc0028627cadfa271fab0290f18731193d63d87 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas <bhelgaas@google.com> Date: Tue, 4 Oct 2022 21:58:08 -0500 Subject: [PATCH 1355/2223] PCI/ASPM: Ignore L1 PM Substates if device lacks 
capability 187f91db8237 ("PCI/ASPM: Remove struct aspm_register_info.l1ss_cap") inadvertently removed a check for existence of the L1 PM Substates (L1SS) Capability before reading it. If there is no L1SS Capability, this means we mistakenly read PCI_COMMAND and PCI_STATUS (config address 0x04) and interpret that as the PCI_L1SS_CAP register, so we may incorrectly configure L1SS. Make sure the L1SS Capability exists before trying to read it. Fixes: 187f91db8237 ("PCI/ASPM: Remove struct aspm_register_info.l1ss_cap") Link: https://lore.kernel.org/r/20221005025809.2247547-3-helgaas@kernel.org Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com> --- drivers/pci/pcie/aspm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 4535228e4a64f..f12d117f44e05 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -560,6 +560,9 @@ static void aspm_l1ss_init(struct pcie_link_state *link) u32 parent_l1ss_cap, child_l1ss_cap; u32 parent_l1ss_ctl1 = 0, child_l1ss_ctl1 = 0; + if (!parent->l1ss || !child->l1ss) + return; + /* Setup L1 substate */ pci_read_config_dword(parent, parent->l1ss + PCI_L1SS_CAP, &parent_l1ss_cap); -- GitLab From 7afeb84d14eaaebb71f5c558ed57ca858e4304e7 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas <bhelgaas@google.com> Date: Tue, 4 Oct 2022 21:58:09 -0500 Subject: [PATCH 1356/2223] PCI/ASPM: Correct LTR_L1.2_THRESHOLD computation 80d7d7a904fa ("PCI/ASPM: Calculate LTR_L1.2_THRESHOLD from device characteristics") replaced a fixed value (163840ns) with one computed from T_POWER_OFF, Common_Mode_Restore_Time, etc., but it encoded the LTR_L1.2_THRESHOLD value incorrectly. 
This is especially a problem for small thresholds, e.g., 63ns fell into the "threshold_ns < 1024" case and was encoded as 32ns: LTR_L1.2_THRESHOLD_Scale = 1 (multiplier is 32ns) LTR_L1.2_THRESHOLD_Value = 63 >> 5 = 1 LTR_L1.2_THRESHOLD = multiplier * value = 32ns * 1 = 32ns Correct the algorithm to encode all times of 1023ns (0x3ff) or smaller exactly and larger times conservatively (the encoded threshold is never smaller than was requested). This reduces the chance of entering L1.2 when the device can't tolerate the exit latency. Fixes: 80d7d7a904fa ("PCI/ASPM: Calculate LTR_L1.2_THRESHOLD from device characteristics") Link: https://lore.kernel.org/r/20221005025809.2247547-4-helgaas@kernel.org Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com> --- drivers/pci/pcie/aspm.c | 49 +++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index f12d117f44e05..53a1fa306e1ee 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -8,6 +8,7 @@ */ #include <linux/kernel.h> +#include <linux/math.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/pci.h> @@ -350,29 +351,43 @@ static u32 calc_l1ss_pwron(struct pci_dev *pdev, u32 scale, u32 val) return 0; } +/* + * Encode an LTR_L1.2_THRESHOLD value for the L1 PM Substates Control 1 + * register. Ports enter L1.2 when the most recent LTR value is greater + * than or equal to LTR_L1.2_THRESHOLD, so we round up to make sure we + * don't enter L1.2 too aggressively. + * + * See PCIe r6.0, sec 5.5.1, 6.18, 7.8.3.3. 
+ */ static void encode_l12_threshold(u32 threshold_us, u32 *scale, u32 *value) { - u32 threshold_ns = threshold_us * 1000; + u64 threshold_ns = (u64) threshold_us * 1000; - /* See PCIe r3.1, sec 7.33.3 and sec 6.18 */ - if (threshold_ns < 32) { - *scale = 0; + /* + * LTR_L1.2_THRESHOLD_Value ("value") is a 10-bit field with max + * value of 0x3ff. + */ + if (threshold_ns <= 0x3ff * 1) { + *scale = 0; /* Value times 1ns */ *value = threshold_ns; - } else if (threshold_ns < 1024) { - *scale = 1; - *value = threshold_ns >> 5; - } else if (threshold_ns < 32768) { - *scale = 2; - *value = threshold_ns >> 10; - } else if (threshold_ns < 1048576) { - *scale = 3; - *value = threshold_ns >> 15; - } else if (threshold_ns < 33554432) { - *scale = 4; - *value = threshold_ns >> 20; + } else if (threshold_ns <= 0x3ff * 32) { + *scale = 1; /* Value times 32ns */ + *value = roundup(threshold_ns, 32) / 32; + } else if (threshold_ns <= 0x3ff * 1024) { + *scale = 2; /* Value times 1024ns */ + *value = roundup(threshold_ns, 1024) / 1024; + } else if (threshold_ns <= 0x3ff * 32768) { + *scale = 3; /* Value times 32768ns */ + *value = roundup(threshold_ns, 32768) / 32768; + } else if (threshold_ns <= 0x3ff * 1048576) { + *scale = 4; /* Value times 1048576ns */ + *value = roundup(threshold_ns, 1048576) / 1048576; + } else if (threshold_ns <= 0x3ff * (u64) 33554432) { + *scale = 5; /* Value times 33554432ns */ + *value = roundup(threshold_ns, 33554432) / 33554432; } else { *scale = 5; - *value = threshold_ns >> 25; + *value = 0x3ff; /* Max representable value */ } } -- GitLab From 91fa127794ac1c48069479b9d45eb4c7378c0e30 Mon Sep 17 00:00:00 2001 From: Alex Williamson <alex.williamson@redhat.com> Date: Fri, 16 Sep 2022 14:44:48 -0600 Subject: [PATCH 1357/2223] PCI: Expose PCIe Resizable BAR support via sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a simple sysfs interface to Resizable BAR support, largely for the purposes of assigning 
such devices to a VM through VFIO. Resizable BARs present a difficult feature to expose to a VM through emulation, as resizing a BAR is done on the host. It can fail, and often does, but we have no means via emulation of a PCIe REBAR capability to handle the error cases. A vfio-pci specific ioctl interface is also cumbersome as there are often multiple devices within the same bridge aperture and handling them is a challenge. In the interface proposed here, expanding a BAR potentially requires such devices to be soft-removed during the resize operation and rescanned after, in order for all the necessary resources to be released. A pci-sysfs interface is also more universal than a vfio specific interface. Please see the ABI documentation update for usage. Link: https://lore.kernel.org/r/166336088796.3597940.14973499936692558556.stgit@omen Signed-off-by: Alex Williamson <alex.williamson@redhat.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Reviewed-by: Christian König <christian.koenig@amd.com> Cc: Krzysztof Wilczyński <kw@linux.com> --- Documentation/ABI/testing/sysfs-bus-pci | 33 ++++++++ drivers/pci/pci-sysfs.c | 108 ++++++++++++++++++++++++ 2 files changed, 141 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index 6fc2c2efe8ab2..840727fc75dcf 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -457,3 +457,36 @@ Description: The file is writable if the PF is bound to a driver that implements ->sriov_set_msix_vec_count(). + +What: /sys/bus/pci/devices/.../resourceN_resize +Date: September 2022 +Contact: Alex Williamson <alex.williamson@redhat.com> +Description: + These files provide an interface to PCIe Resizable BAR support. + A file is created for each BAR resource (N) supported by the + PCIe Resizable BAR extended capability of the device. 
Reading + each file exposes the bitmap of available resource sizes: + + # cat resource1_resize + 00000000000001c0 + + The bitmap represents supported resource sizes for the BAR, + where bit0 = 1MB, bit1 = 2MB, bit2 = 4MB, etc. In the above + example the device supports 64MB, 128MB, and 256MB BAR sizes. + + When writing the file, the user provides the bit position of + the desired resource size, for example: + + # echo 7 > resource1_resize + + This indicates to set the size value corresponding to bit 7, + 128MB. The resulting size is 2 ^ (bit# + 20). This definition + matches the PCIe specification of this capability. + + In order to make use of resource resizing, all PCI drivers must + be unbound from the device and peer devices under the same + parent bridge may need to be soft removed. In the case of + VGA devices, writing a resize value will remove low level + console drivers from the device. Raw users of pci-sysfs + resourceN attributes must be terminated prior to resizing. + Success of the resizing operation is not guaranteed. 
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index fc804e08e3cb5..0a2eeb82cebde 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -28,6 +28,7 @@ #include <linux/pm_runtime.h> #include <linux/msi.h> #include <linux/of.h> +#include <linux/aperture.h> #include "pci.h" static int sysfs_initialized; /* = 0 */ @@ -1373,6 +1374,112 @@ static const struct attribute_group pci_dev_reset_attr_group = { .is_visible = pci_dev_reset_attr_is_visible, }; +#define pci_dev_resource_resize_attr(n) \ +static ssize_t resource##n##_resize_show(struct device *dev, \ + struct device_attribute *attr, \ + char * buf) \ +{ \ + struct pci_dev *pdev = to_pci_dev(dev); \ + ssize_t ret; \ + \ + pci_config_pm_runtime_get(pdev); \ + \ + ret = sysfs_emit(buf, "%016llx\n", \ + (u64)pci_rebar_get_possible_sizes(pdev, n)); \ + \ + pci_config_pm_runtime_put(pdev); \ + \ + return ret; \ +} \ + \ +static ssize_t resource##n##_resize_store(struct device *dev, \ + struct device_attribute *attr,\ + const char *buf, size_t count)\ +{ \ + struct pci_dev *pdev = to_pci_dev(dev); \ + unsigned long size, flags; \ + int ret, i; \ + u16 cmd; \ + \ + if (kstrtoul(buf, 0, &size) < 0) \ + return -EINVAL; \ + \ + device_lock(dev); \ + if (dev->driver) { \ + ret = -EBUSY; \ + goto unlock; \ + } \ + \ + pci_config_pm_runtime_get(pdev); \ + \ + if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) { \ + ret = aperture_remove_conflicting_pci_devices(pdev, \ + "resourceN_resize"); \ + if (ret) \ + goto pm_put; \ + } \ + \ + pci_read_config_word(pdev, PCI_COMMAND, &cmd); \ + pci_write_config_word(pdev, PCI_COMMAND, \ + cmd & ~PCI_COMMAND_MEMORY); \ + \ + flags = pci_resource_flags(pdev, n); \ + \ + pci_remove_resource_files(pdev); \ + \ + for (i = 0; i < PCI_STD_NUM_BARS; i++) { \ + if (pci_resource_len(pdev, i) && \ + pci_resource_flags(pdev, i) == flags) \ + pci_release_resource(pdev, i); \ + } \ + \ + ret = pci_resize_resource(pdev, n, size); \ + \ + 
pci_assign_unassigned_bus_resources(pdev->bus); \ + \ + if (pci_create_resource_files(pdev)) \ + pci_warn(pdev, "Failed to recreate resource files after BAR resizing\n");\ + \ + pci_write_config_word(pdev, PCI_COMMAND, cmd); \ +pm_put: \ + pci_config_pm_runtime_put(pdev); \ +unlock: \ + device_unlock(dev); \ + \ + return ret ? ret : count; \ +} \ +static DEVICE_ATTR_RW(resource##n##_resize) + +pci_dev_resource_resize_attr(0); +pci_dev_resource_resize_attr(1); +pci_dev_resource_resize_attr(2); +pci_dev_resource_resize_attr(3); +pci_dev_resource_resize_attr(4); +pci_dev_resource_resize_attr(5); + +static struct attribute *resource_resize_attrs[] = { + &dev_attr_resource0_resize.attr, + &dev_attr_resource1_resize.attr, + &dev_attr_resource2_resize.attr, + &dev_attr_resource3_resize.attr, + &dev_attr_resource4_resize.attr, + &dev_attr_resource5_resize.attr, + NULL, +}; + +static umode_t resource_resize_is_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); + + return pci_rebar_get_current_size(pdev, n) < 0 ? 0 : a->mode; +} + +static const struct attribute_group pci_dev_resource_resize_group = { + .attrs = resource_resize_attrs, + .is_visible = resource_resize_is_visible, +}; + int __must_check pci_create_sysfs_dev_files(struct pci_dev *pdev) { if (!sysfs_initialized) @@ -1494,6 +1601,7 @@ const struct attribute_group *pci_dev_groups[] = { #ifdef CONFIG_ACPI &pci_dev_acpi_attr_group, #endif + &pci_dev_resource_resize_group, NULL, }; -- GitLab From 4a74e79b543c115bf2b5b7a4b29db139da20b90d Mon Sep 17 00:00:00 2001 From: Colin Ian King <colin.i.king@gmail.com> Date: Tue, 4 Oct 2022 20:27:15 +0100 Subject: [PATCH 1358/2223] i2c: microchip: pci1xxxx: Fix comparison of -EPERM against an unsigned variable The comparison of variable ret with -EPERM is always false because ret is an u8 type. Fix this by making ret an int. 
Cleans up clang warning: drivers/i2c/busses/i2c-mchp-pci1xxxx.c:714:10: warning: result of comparison of constant -1 with expression of type 'u8' (aka 'unsigned char') is always false [-Wtautological-constant-out-of-range-compare] Fixes: 361693697249 ("i2c: microchip: pci1xxxx: Add driver for I2C host controller in multifunction endpoint of pci1xxxx switch") Signed-off-by: Colin Ian King <colin.i.king@gmail.com> Signed-off-by: Wolfram Sang <wsa@kernel.org> --- drivers/i2c/busses/i2c-mchp-pci1xxxx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-mchp-pci1xxxx.c b/drivers/i2c/busses/i2c-mchp-pci1xxxx.c index f5342201eb6b6..09af759211478 100644 --- a/drivers/i2c/busses/i2c-mchp-pci1xxxx.c +++ b/drivers/i2c/busses/i2c-mchp-pci1xxxx.c @@ -708,7 +708,7 @@ static void pci1xxxx_i2c_init(struct pci1xxxx_i2c *i2c) void __iomem *p2 = i2c->i2c_base + SMBUS_STATUS_REG_OFF; void __iomem *p1 = i2c->i2c_base + SMB_GPR_REG; u8 regval; - u8 ret; + int ret; ret = set_sys_lock(i2c); if (ret == -EPERM) { -- GitLab From 8673b6d97a314c2e73352f4a34c1aa9b2730d7c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matti=20Lehtim=C3=A4ki?= <matti.lehtimaki@gmail.com> Date: Sun, 2 Oct 2022 15:28:54 +0300 Subject: [PATCH 1359/2223] dt-bindings: i2c: qcom,i2c-cci: Document MSM8226 compatible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MSM8226's Camera Control Interface has one master and 3 clocks. 
Signed-off-by: Matti Lehtimäki <matti.lehtimaki@gmail.com> Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org> Signed-off-by: Wolfram Sang <wsa@kernel.org> --- .../devicetree/bindings/i2c/qcom,i2c-cci.yaml | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml b/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml index e51a85848d6e5..c0f9537a4bb11 100644 --- a/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml +++ b/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml @@ -13,6 +13,7 @@ maintainers: properties: compatible: enum: + - qcom,msm8226-cci - qcom,msm8916-cci - qcom,msm8974-cci - qcom,msm8996-cci @@ -27,11 +28,11 @@ properties: const: 0 clocks: - minItems: 4 + minItems: 3 maxItems: 6 clock-names: - minItems: 4 + minItems: 3 maxItems: 6 interrupts: @@ -78,11 +79,28 @@ allOf: compatible: contains: enum: + - qcom,msm8226-cci - qcom,msm8916-cci then: properties: i2c-bus@1: false + - if: + properties: + compatible: + contains: + enum: + - qcom,msm8226-cci + then: + properties: + clocks: + maxItems: 3 + clock-names: + items: + - const: camss_top_ahb + - const: cci_ahb + - const: cci + - if: properties: compatible: -- GitLab From 9ad16f9639646762455bf3ed1e6dfcc6ccc2c099 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matti=20Lehtim=C3=A4ki?= <matti.lehtimaki@gmail.com> Date: Sun, 2 Oct 2022 15:28:55 +0300 Subject: [PATCH 1360/2223] dt-bindings: i2c: qcom,i2c-cci: Document clocks for MSM8974 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Uses same clocks as MSM8226. 
Signed-off-by: Matti Lehtimäki <matti.lehtimaki@gmail.com> Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org> Signed-off-by: Wolfram Sang <wsa@kernel.org> --- Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml b/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml index c0f9537a4bb11..cf9f8fda595fc 100644 --- a/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml +++ b/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml @@ -91,6 +91,7 @@ allOf: contains: enum: - qcom,msm8226-cci + - qcom,msm8974-cci then: properties: clocks: -- GitLab From d046bd1372a5c5448c7c7ba3383a4316fdb32a60 Mon Sep 17 00:00:00 2001 From: Rayyan Ansari <rayyan@ansari.sh> Date: Sun, 2 Oct 2022 15:28:56 +0300 Subject: [PATCH 1361/2223] i2c: qcom-cci: Add MSM8226 compatible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a compatible for MSM8226's Camera Control Interface, which is similar to the one used on MSM8916. 
Signed-off-by: Rayyan Ansari <rayyan@ansari.sh> Signed-off-by: Matti Lehtimäki <matti.lehtimaki@gmail.com> Reviewed-by: Loic Poulain <loic.poulain@linaro.org> Signed-off-by: Wolfram Sang <wsa@kernel.org> --- drivers/i2c/busses/i2c-qcom-cci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-qcom-cci.c b/drivers/i2c/busses/i2c-qcom-cci.c index ea48e6a9cfca7..87739fb4388ba 100644 --- a/drivers/i2c/busses/i2c-qcom-cci.c +++ b/drivers/i2c/busses/i2c-qcom-cci.c @@ -807,6 +807,7 @@ static const struct cci_data cci_v2_data = { }; static const struct of_device_id cci_dt_match[] = { + { .compatible = "qcom,msm8226-cci", .data = &cci_v1_data}, { .compatible = "qcom,msm8916-cci", .data = &cci_v1_data}, { .compatible = "qcom,msm8974-cci", .data = &cci_v1_5_data}, { .compatible = "qcom,msm8996-cci", .data = &cci_v2_data}, -- GitLab From 301c8f5c32c8fb79c67539bc23972dc3ef48024c Mon Sep 17 00:00:00 2001 From: Jarkko Nikula <jarkko.nikula@linux.intel.com> Date: Tue, 27 Sep 2022 16:56:44 +0300 Subject: [PATCH 1362/2223] i2c: designware: Fix handling of real but unexpected device interrupts Commit c7b79a752871 ("mfd: intel-lpss: Add Intel Alder Lake PCH-S PCI IDs") caused a regression on certain Gigabyte motherboards for Intel Alder Lake-S where system crashes to NULL pointer dereference in i2c_dw_xfer_msg() when system resumes from S3 sleep state ("deep"). I was able to debug the issue on Gigabyte Z690 AORUS ELITE and made following notes: - Issue happens when resuming from S3 but not when resuming from "s2idle" - PCI device 00:15.0 == i2c_designware.0 is already in D0 state when system enters into pci_pm_resume_noirq() while all other i2c_designware PCI devices are in D3. Devices were runtime suspended and in D3 prior entering into suspend - Interrupt comes after pci_pm_resume_noirq() when device interrupts are re-enabled - According to register dump the interrupt really comes from the i2c_designware.0. 
Controller is enabled, I2C target address register points to the one detectable I2C device address 0x60 and the DW_IC_RAW_INTR_STAT register START_DET, STOP_DET, ACTIVITY and TX_EMPTY bits are set indicating a completed I2C transaction. My guess is that the firmware uses this controller to communicate with an on-board I2C device during resume but does not disable the controller before giving control to an operating system. I was told the UEFI update fixes this but nevertheless it revealed the driver is not ready to handle TX_EMPTY (or RX_FULL) interrupt when device is supposed to be idle and state variables are not set (especially the dev->msgs pointer which may point to NULL or stale old data). Introduce a new software status flag STATUS_ACTIVE indicating when the controller is active in driver point of view. Now treat all interrupts that occur when it is not set as unexpected and mask all interrupts from the controller. Fixes: c7b79a752871 ("mfd: intel-lpss: Add Intel Alder Lake PCH-S PCI IDs") Reported-by: Samuel Clark <slc2015@gmail.com> Link: https://bugzilla.kernel.org/show_bug.cgi?id=215907 Cc: stable@vger.kernel.org # v5.12+ Signed-off-by: Jarkko Nikula <jarkko.nikula@linux.intel.com> Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com> Signed-off-by: Wolfram Sang <wsa@kernel.org> --- drivers/i2c/busses/i2c-designware-core.h | 7 +++++-- drivers/i2c/busses/i2c-designware-master.c | 13 +++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h index 70b80e7109905..4d3a3b464ecd8 100644 --- a/drivers/i2c/busses/i2c-designware-core.h +++ b/drivers/i2c/busses/i2c-designware-core.h @@ -126,8 +126,9 @@ * status codes */ #define STATUS_IDLE 0x0 -#define STATUS_WRITE_IN_PROGRESS 0x1 -#define STATUS_READ_IN_PROGRESS 0x2 +#define STATUS_ACTIVE 0x1 +#define STATUS_WRITE_IN_PROGRESS 0x2 +#define STATUS_READ_IN_PROGRESS 0x4 /* * operation modes @@ -334,12 +335,14 
@@ void i2c_dw_disable_int(struct dw_i2c_dev *dev); static inline void __i2c_dw_enable(struct dw_i2c_dev *dev) { + dev->status |= STATUS_ACTIVE; regmap_write(dev->map, DW_IC_ENABLE, 1); } static inline void __i2c_dw_disable_nowait(struct dw_i2c_dev *dev) { regmap_write(dev->map, DW_IC_ENABLE, 0); + dev->status &= ~STATUS_ACTIVE; } void __i2c_dw_disable(struct dw_i2c_dev *dev); diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index 44a94b225ed82..dc3c5a15a95b9 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -716,6 +716,19 @@ static int i2c_dw_irq_handler_master(struct dw_i2c_dev *dev) u32 stat; stat = i2c_dw_read_clear_intrbits(dev); + + if (!(dev->status & STATUS_ACTIVE)) { + /* + * Unexpected interrupt in driver point of view. State + * variables are either unset or stale so acknowledge and + * disable interrupts for suppressing further interrupts if + * interrupt really came from this HW (E.g. firmware has left + * the HW active). + */ + regmap_write(dev->map, DW_IC_INTR_MASK, 0); + return 0; + } + if (stat & DW_IC_INTR_TX_ABRT) { dev->cmd_err |= DW_IC_ERR_TX_ABRT; dev->status = STATUS_IDLE; -- GitLab From fd66bd74afe880de4f008f96a795fedee887ff44 Mon Sep 17 00:00:00 2001 From: Quan Nguyen <quan@os.amperecomputing.com> Date: Tue, 4 Oct 2022 16:31:06 +0700 Subject: [PATCH 1363/2223] i2c: aspeed: Assert NAK when slave is busy On I2C_SLAVE_WRITE_REQUESTED event, Slave already ACK'ed on the address phase. But as the backend driver is busy and unable to process any request from Master, issue RxCmdLast for Slave to auto send NACK on next incoming byte. 
Signed-off-by: Quan Nguyen <quan@os.amperecomputing.com> Signed-off-by: Wolfram Sang <wsa@kernel.org> --- drivers/i2c/busses/i2c-aspeed.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-aspeed.c b/drivers/i2c/busses/i2c-aspeed.c index 185dedfebbac9..c64c381b69b7f 100644 --- a/drivers/i2c/busses/i2c-aspeed.c +++ b/drivers/i2c/busses/i2c-aspeed.c @@ -244,6 +244,7 @@ static u32 aspeed_i2c_slave_irq(struct aspeed_i2c_bus *bus, u32 irq_status) u32 command, irq_handled = 0; struct i2c_client *slave = bus->slave; u8 value; + int ret; if (!slave) return 0; @@ -311,7 +312,13 @@ static u32 aspeed_i2c_slave_irq(struct aspeed_i2c_bus *bus, u32 irq_status) break; case ASPEED_I2C_SLAVE_WRITE_REQUESTED: bus->slave_state = ASPEED_I2C_SLAVE_WRITE_RECEIVED; - i2c_slave_event(slave, I2C_SLAVE_WRITE_REQUESTED, &value); + ret = i2c_slave_event(slave, I2C_SLAVE_WRITE_REQUESTED, &value); + /* + * Slave ACK's on this address phase already but as the backend driver + * returns an errno, the bus driver should nack the next incoming byte. + */ + if (ret < 0) + writel(ASPEED_I2CD_M_S_RX_CMD_LAST, bus->base + ASPEED_I2C_CMD_REG); break; case ASPEED_I2C_SLAVE_WRITE_RECEIVED: i2c_slave_event(slave, I2C_SLAVE_WRITE_RECEIVED, &value); -- GitLab From 74fd2ca0f6af9cc332957b2e6ef70772e421403a Mon Sep 17 00:00:00 2001 From: Jiangshan Yi <yijiangshan@kylinos.cn> Date: Tue, 6 Sep 2022 10:41:19 +0800 Subject: [PATCH 1364/2223] fs/nfs/pnfs_nfs.c: fix spelling typo and syntax error in comment Fix spelling typo and syntax error in comment. 
Suggested-by: Randy Dunlap <rdunlap@infradead.org> Reported-by: k2ci <kernel-bot@kylinos.cn> Signed-off-by: Jiangshan Yi <yijiangshan@kylinos.cn> Reviewed-by: Randy Dunlap <rdunlap@infradead.org> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- fs/nfs/pnfs_nfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 657c242a18ff1..987c88ddeaf06 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -374,12 +374,12 @@ pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets, return NULL; } -/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head reqest +/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head request * for @page * @cinfo - commit info for current inode * @page - page to search for matching head request * - * Returns a the head request if one is found, otherwise returns NULL. + * Return: the head request if one is found, otherwise %NULL. */ struct nfs_page * pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) -- GitLab From 8aa7cf85248f7b1fb49a1117c60a160b5b22b337 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui <cuigaosheng1@huawei.com> Date: Fri, 9 Sep 2022 14:46:40 +0800 Subject: [PATCH 1365/2223] NFSv4: remove nfs4_renewd_prepare_shutdown() declaration nfs4_renewd_prepare_shutdown() has been removed since commit 3050141bae57 ("NFSv4: Kill nfs4_renewd_prepare_shutdown()"), so remove it. 
Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- fs/nfs/nfs4_fs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 79df6e83881b2..400a71e75238b 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -459,7 +459,6 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *); /* nfs4renewd.c */ extern void nfs4_schedule_state_renewal(struct nfs_client *); -extern void nfs4_renewd_prepare_shutdown(struct nfs_server *); extern void nfs4_kill_renewd(struct nfs_client *); extern void nfs4_renew_state(struct work_struct *); extern void nfs4_set_lease_period(struct nfs_client *clp, unsigned long lease); -- GitLab From a035618caf8718a1d4e840ec39dfc5fce0dcdee1 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui <cuigaosheng1@huawei.com> Date: Fri, 9 Sep 2022 14:24:11 +0800 Subject: [PATCH 1366/2223] nfs: remove nfs_wait_atomic_killable() and nfs_write_prepare() declaration nfs_write_prepare() has been removed since commit a4cdda59111f ("NFS: Create a common pgio_rpc_prepare function"), so remove it. nfs_wait_atomic_killable() has been removed since commit 723c921e7dfc ("sched/wait, fs/nfs: Convert wait_on_atomic_t() usage to the new wait_var_event() API"), so remove it. 
Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- fs/nfs/internal.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 898dd95bc7a7c..d914d609b85b2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -435,7 +435,6 @@ extern void nfs_zap_acl_cache(struct inode *inode); extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags); extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); -extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode); /* super.c */ extern const struct super_operations nfs_sops; @@ -503,7 +502,6 @@ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_commit_free(struct nfs_commit_data *p); -extern void nfs_write_prepare(struct rpc_task *task, void *calldata); extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); extern int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, -- GitLab From 963694615d9d5a860e6571da18e50f14ab3583b3 Mon Sep 17 00:00:00 2001 From: Anna Schumaker <Anna.Schumaker@Netapp.com> Date: Wed, 21 Sep 2022 13:21:52 -0400 Subject: [PATCH 1367/2223] NFSv4.2: Add special handling for LISTXATTR receiving NFS4ERR_NOXATTR We can translate this into an empty response list instead of passing an error up to userspace. 
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- fs/nfs/nfs42xdr.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index b56f05113d367..fe1aeb0f048f2 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -569,6 +569,14 @@ static int decode_listxattrs(struct xdr_stream *xdr, */ if (status == -ETOOSMALL) status = -ERANGE; + /* + * Special case: for LISTXATTRS, NFS4ERR_NOXATTR + * should be translated to success with zero-length reply. + */ + if (status == -ENODATA) { + res->eof = true; + status = 0; + } goto out; } -- GitLab From 3a100e4d8a2f7660d220c000364fe57679da9c92 Mon Sep 17 00:00:00 2001 From: Anna Schumaker <Anna.Schumaker@Netapp.com> Date: Wed, 21 Sep 2022 16:29:57 -0400 Subject: [PATCH 1368/2223] NFSv4.2: Move TRACE_DEFINE_ENUM(NFS4_CONTENT_*) under CONFIG_NFS_V4_2 NFS4_CONTENT_DATA and NFS4_CONTENT_HOLE both only exist under NFS v4.2. Move their corresponding TRACE_DEFINE_ENUM calls under this Kconfig option. Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- fs/nfs/nfs4trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 6ee6ad3674a29..37c4c105ed29f 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -2097,6 +2097,7 @@ TRACE_EVENT(ff_layout_commit_error, ) ); +#ifdef CONFIG_NFS_V4_2 TRACE_DEFINE_ENUM(NFS4_CONTENT_DATA); TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE); @@ -2105,7 +2106,6 @@ TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE); { NFS4_CONTENT_DATA, "DATA" }, \ { NFS4_CONTENT_HOLE, "HOLE" }) -#ifdef CONFIG_NFS_V4_2 TRACE_EVENT(nfs4_llseek, TP_PROTO( const struct inode *inode, -- GitLab From 27ffed1040f7703e368f37f5f97fef87a79527dd Mon Sep 17 00:00:00 2001 From: Anna Schumaker <Anna.Schumaker@Netapp.com> Date: Thu, 22 Sep 2022 15:18:50 -0400 Subject: [PATCH 1369/2223] NFSv4.2: Add tracepoints for getxattr, setxattr, and removexattr These functions take similar arguments, and can share a tracepoint class for common 
formatting. Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- fs/nfs/nfs42proc.c | 3 +++ fs/nfs/nfs4trace.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6dab9e4083729..c4791ca00df1b 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1175,6 +1175,7 @@ static int _nfs42_proc_removexattr(struct inode *inode, const char *name) ret = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); + trace_nfs4_removexattr(inode, name, ret); if (!ret) nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0); @@ -1214,6 +1215,7 @@ static int _nfs42_proc_setxattr(struct inode *inode, const char *name, ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + trace_nfs4_setxattr(inode, name, ret); for (; np > 0; np--) put_page(pages[np - 1]); @@ -1246,6 +1248,7 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0); + trace_nfs4_getxattr(inode, name, ret); if (ret < 0) return ret; diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 37c4c105ed29f..650c9353826f3 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -2496,6 +2496,52 @@ TRACE_EVENT(nfs4_offload_cancel, __entry->stateid_seq, __entry->stateid_hash ) ); + +DECLARE_EVENT_CLASS(nfs4_xattr_event, + TP_PROTO( + const struct inode *inode, + const char *name, + int error + ), + + TP_ARGS(inode, name, error), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __string(name, name) + ), + + TP_fast_assign( + __entry->error = error < 0 ? 
-error : 0; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __assign_str(name, name); + ), + + TP_printk( + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "name=%s", + -__entry->error, show_nfs4_status(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __get_str(name) + ) +); +#define DEFINE_NFS4_XATTR_EVENT(name) \ + DEFINE_EVENT(nfs4_xattr_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + const char *name, \ + int error \ + ), \ + TP_ARGS(inode, name, error)) +DEFINE_NFS4_XATTR_EVENT(nfs4_getxattr); +DEFINE_NFS4_XATTR_EVENT(nfs4_setxattr); +DEFINE_NFS4_XATTR_EVENT(nfs4_removexattr); #endif /* CONFIG_NFS_V4_2 */ #endif /* CONFIG_NFS_V4_1 */ -- GitLab From a0b685e7bd7c5d232a64b0707d2f83ae3e1840dc Mon Sep 17 00:00:00 2001 From: Anna Schumaker <Anna.Schumaker@Netapp.com> Date: Mon, 3 Oct 2022 13:03:52 -0400 Subject: [PATCH 1370/2223] NFSv4.2: Add a tracepoint for listxattr This can be defined as simply an NFS4_INODE_EVENT() since we don't have the name of a specific xattr to list. This roughly matches readdir, which also uses an NFS4_INODE_EVENT() tracepoint. 
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- fs/nfs/nfs42proc.c | 1 + fs/nfs/nfs4trace.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index c4791ca00df1b..ced9170701b64 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1320,6 +1320,7 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0); + trace_nfs4_listxattr(inode, ret); if (ret >= 0) { ret = res.copied; diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 650c9353826f3..2cff5901c6894 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -2542,6 +2542,8 @@ DECLARE_EVENT_CLASS(nfs4_xattr_event, DEFINE_NFS4_XATTR_EVENT(nfs4_getxattr); DEFINE_NFS4_XATTR_EVENT(nfs4_setxattr); DEFINE_NFS4_XATTR_EVENT(nfs4_removexattr); + +DEFINE_NFS4_INODE_EVENT(nfs4_listxattr); #endif /* CONFIG_NFS_V4_2 */ #endif /* CONFIG_NFS_V4_1 */ -- GitLab From 6b1eb3b22272713b5153deba812b6e3943ddd683 Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Sun, 18 Sep 2022 13:28:16 -0400 Subject: [PATCH 1371/2223] SUNRPC: Replace the use of the xprtiod WQ in rpcrdma While setting up a new lab, I accidentally misconfigured the Ethernet port for a system that tried an NFS mount using RoCE. This made the NFS server unreachable. 
The following WARNING popped on the NFS client while waiting for the mount attempt to time out: kernel: workqueue: WQ_MEM_RECLAIM xprtiod:xprt_rdma_connect_worker [rpcrdma] is flushing !WQ_MEM_RECLAI> kernel: WARNING: CPU: 0 PID: 100 at kernel/workqueue.c:2628 check_flush_dependency+0xbf/0xca kernel: Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver nfs 8021q garp stp mrp llc rfkill rpcrdma> kernel: CPU: 0 PID: 100 Comm: kworker/u8:8 Not tainted 6.0.0-rc1-00002-g6229f8c054e5 #13 kernel: Hardware name: Supermicro X10SRA-F/X10SRA-F, BIOS 2.0b 06/12/2017 kernel: Workqueue: xprtiod xprt_rdma_connect_worker [rpcrdma] kernel: RIP: 0010:check_flush_dependency+0xbf/0xca kernel: Code: 75 2a 48 8b 55 18 48 8d 8b b0 00 00 00 4d 89 e0 48 81 c6 b0 00 00 00 48 c7 c7 65 33 2e be> kernel: RSP: 0018:ffffb562806cfcf8 EFLAGS: 00010092 kernel: RAX: 0000000000000082 RBX: ffff97894f8c3c00 RCX: 0000000000000027 kernel: RDX: 0000000000000002 RSI: ffffffffbe3447d1 RDI: 00000000ffffffff kernel: RBP: ffff978941315840 R08: 0000000000000000 R09: 0000000000000000 kernel: R10: 00000000000008b0 R11: 0000000000000001 R12: ffffffffc0ce3731 kernel: R13: ffff978950c00500 R14: ffff97894341f0c0 R15: ffff978951112eb0 kernel: FS: 0000000000000000(0000) GS:ffff97987fc00000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 00007f807535eae8 CR3: 000000010b8e4002 CR4: 00000000003706f0 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 kernel: DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 kernel: Call Trace: kernel: <TASK> kernel: __flush_work.isra.0+0xaf/0x188 kernel: ? _raw_spin_lock_irqsave+0x2c/0x37 kernel: ? lock_timer_base+0x38/0x5f kernel: __cancel_work_timer+0xea/0x13d kernel: ? preempt_latency_start+0x2b/0x46 kernel: rdma_addr_cancel+0x70/0x81 [ib_core] kernel: _destroy_id+0x1a/0x246 [rdma_cm] kernel: rpcrdma_xprt_connect+0x115/0x5ae [rpcrdma] kernel: ? _raw_spin_unlock+0x14/0x29 kernel: ? 
raw_spin_rq_unlock_irq+0x5/0x10 kernel: ? finish_task_switch.isra.0+0x171/0x249 kernel: xprt_rdma_connect_worker+0x3b/0xc7 [rpcrdma] kernel: process_one_work+0x1d8/0x2d4 kernel: worker_thread+0x18b/0x24f kernel: ? rescuer_thread+0x280/0x280 kernel: kthread+0xf4/0xfc kernel: ? kthread_complete_and_exit+0x1b/0x1b kernel: ret_from_fork+0x22/0x30 kernel: </TASK> SUNRPC's xprtiod workqueue is WQ_MEM_RECLAIM, so any workqueue that one of its work items tries to cancel has to be WQ_MEM_RECLAIM to prevent a priority inversion. The internal workqueues in the RDMA/core are currently non-MEM_RECLAIM. Jason Gunthorpe says this about the current state of RDMA/core: > If you attempt to do a reconnection/etc from within a RECLAIM > context it will deadlock on one of the many allocations that are > made to support opening the connection. > > The general idea of reclaim is that the entire task context > working under the reclaim is marked with an override of the gfp > flags to make all allocations under that call chain reclaim safe. > > But rdmacm does allocations outside this, eg in the WQs processing > the CM packets. So this doesn't work and we will deadlock. > > Fixing it is a big deal and needs more than poking WQ_MEM_RECLAIM > here and there. So we will change the ULP in this case to avoid the use of WQ_MEM_RECLAIM where possible. Deadlocks that were possible before are not fixed, but at least we no longer have a false sense of confidence that the stack won't allocate memory during memory reclaim. 
Suggested-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- net/sunrpc/xprtrdma/transport.c | 3 +-- net/sunrpc/xprtrdma/verbs.c | 10 +++------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index bcb37b51adf65..10bb2b929c6d7 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -494,8 +494,7 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO); } trace_xprtrdma_op_connect(r_xprt, delay); - queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker, - delay); + queue_delayed_work(system_long_wq, &r_xprt->rx_connect_worker, delay); } /** diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 2fbe9aaeec349..049c854b7b37d 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -791,13 +791,9 @@ void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt) /* If there is no underlying connection, it's no use * to wake the refresh worker. */ - if (ep->re_connect_status == 1) { - /* The work is scheduled on a WQ_MEM_RECLAIM - * workqueue in order to prevent MR allocation - * from recursing into NFS during direct reclaim. - */ - queue_work(xprtiod_workqueue, &buf->rb_refresh_worker); - } + if (ep->re_connect_status != 1) + return; + queue_work(system_highpri_wq, &buf->rb_refresh_worker); } /** -- GitLab From 5014831264b05be11090668ae2211e64a1765f7e Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Fri, 23 Sep 2022 09:06:05 -0400 Subject: [PATCH 1372/2223] svcrdma: Clean up RPCRDMA_DEF_GFP xprt_rdma_bc_allocate() is now the only user of RPCRDMA_DEF_GFP. Replace that macro with the raw flags. 
Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 4 ++-- net/sunrpc/xprtrdma/xprt_rdma.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 85c8cdda98b18..aa2227a7e5521 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -119,12 +119,12 @@ xprt_rdma_bc_allocate(struct rpc_task *task) return -EINVAL; } - page = alloc_page(RPCRDMA_DEF_GFP); + page = alloc_page(GFP_NOIO | __GFP_NOWARN); if (!page) return -ENOMEM; rqst->rq_buffer = page_address(page); - rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, RPCRDMA_DEF_GFP); + rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, GFP_NOIO | __GFP_NOWARN); if (!rqst->rq_rbuffer) { put_page(page); return -ENOMEM; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index c79f92eeda762..84b685c45555c 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -149,8 +149,6 @@ static inline void *rdmab_data(const struct rpcrdma_regbuf *rb) return rb->rg_data; } -#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) - /* To ensure a transport can always make forward progress, * the number of RDMA segments allowed in header chunk lists * is capped at 16. This prevents less-capable devices from -- GitLab From 3b50cc1c7f2170f2eb0fec040b6c3a8574026fce Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Fri, 23 Sep 2022 09:06:11 -0400 Subject: [PATCH 1373/2223] xprtrdma: Clean up synopsis of rpcrdma_req_create() Commit 1769e6a816df ("xprtrdma: Clean up rpcrdma_create_req()") added rpcrdma_req_create() with a GFP flags argument in case a caller might want to avoid waiting for memory. There has never been a caller that does not pass GFP_KERNEL as the third argument. That argument can therefore be eliminated. 
Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- net/sunrpc/xprtrdma/backchannel.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 16 ++++++++-------- net/sunrpc/xprtrdma/xprt_rdma.h | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index faba7136dd9a3..e4d84a13c566e 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -189,7 +189,7 @@ create_req: return NULL; size = min_t(size_t, r_xprt->rx_ep->re_inline_recv, PAGE_SIZE); - req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL); + req = rpcrdma_req_create(r_xprt, size); if (!req) return NULL; if (rpcrdma_req_setup(r_xprt, req)) { diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 049c854b7b37d..89f5444f4d413 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -800,25 +800,25 @@ void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt) * rpcrdma_req_create - Allocate an rpcrdma_req object * @r_xprt: controlling r_xprt * @size: initial size, in bytes, of send and receive buffers - * @flags: GFP flags passed to memory allocators * * Returns an allocated and fully initialized rpcrdma_req or NULL. 
*/ -struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, - gfp_t flags) +struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, + size_t size) { struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; struct rpcrdma_req *req; - req = kzalloc(sizeof(*req), flags); + req = kzalloc(sizeof(*req), GFP_KERNEL); if (req == NULL) goto out1; - req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE, flags); + req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE, + GFP_KERNEL); if (!req->rl_sendbuf) goto out2; - req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE, flags); + req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE, GFP_KERNEL); if (!req->rl_recvbuf) goto out3; @@ -1060,8 +1060,8 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) for (i = 0; i < r_xprt->rx_xprt.max_reqs; i++) { struct rpcrdma_req *req; - req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2, - GFP_KERNEL); + req = rpcrdma_req_create(r_xprt, + RPCRDMA_V1_DEF_INLINE_SIZE * 2); if (!req) goto out; list_add(&req->rl_list, &buf->rb_send_bufs); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 84b685c45555c..227dce50cc4bb 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -465,8 +465,8 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp); /* * Buffer calls - xprtrdma/verbs.c */ -struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, - gfp_t flags); +struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, + size_t size); int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void rpcrdma_req_destroy(struct rpcrdma_req *req); int rpcrdma_buffer_create(struct rpcrdma_xprt *); -- GitLab From 7ac1879875fffa8f7acfe8b8d6932a039f2b736d Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Fri, 23 Sep 2022 09:06:18 -0400 Subject: [PATCH 1374/2223] xprtrdma: Clean up synopsis of 
rpcrdma_regbuf_alloc() Currently all rpcrdma_regbuf_alloc() call sites pass the same value as their third argument. That argument can therefore be eliminated. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- net/sunrpc/xprtrdma/verbs.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 89f5444f4d413..8fb10fc72f695 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -76,8 +76,7 @@ static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); static void rpcrdma_ep_get(struct rpcrdma_ep *ep); static int rpcrdma_ep_put(struct rpcrdma_ep *ep); static struct rpcrdma_regbuf * -rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, - gfp_t flags); +rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction); static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb); static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); @@ -813,12 +812,11 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, if (req == NULL) goto out1; - req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE, - GFP_KERNEL); + req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE); if (!req->rl_sendbuf) goto out2; - req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE, GFP_KERNEL); + req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE); if (!req->rl_recvbuf) goto out3; @@ -854,7 +852,7 @@ int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) r_xprt->rx_ep->re_max_rdma_segs * rpcrdma_readchunk_maxsz; maxhdrsize *= sizeof(__be32); rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize), - DMA_TO_DEVICE, GFP_KERNEL); + DMA_TO_DEVICE); if (!rb) goto out; @@ -930,7 +928,7 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, goto out; rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv, - 
DMA_FROM_DEVICE, GFP_KERNEL); + DMA_FROM_DEVICE); if (!rep->rr_rdmabuf) goto out_free; @@ -1231,15 +1229,14 @@ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) * or Replies they may be registered externally via frwr_map. */ static struct rpcrdma_regbuf * -rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, - gfp_t flags) +rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction) { struct rpcrdma_regbuf *rb; - rb = kmalloc(sizeof(*rb), flags); + rb = kmalloc(sizeof(*rb), GFP_KERNEL); if (!rb) return NULL; - rb->rg_data = kmalloc(size, flags); + rb->rg_data = kmalloc(size, GFP_KERNEL); if (!rb->rg_data) { kfree(rb); return NULL; -- GitLab From 2d77058cce9fbff3d69cc05d4eb695f4ff421c03 Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Fri, 23 Sep 2022 09:06:24 -0400 Subject: [PATCH 1375/2223] xprtrdma: MR-related memory allocation should be allowed to fail xprtrdma always drives a retry of MR allocation if it should fail. It should be safe to not use GFP_KERNEL for this purpose rather than sleeping in the memory allocator. In theory, if these weaker allocations are attempted first, memory exhaustion is likely to cause xprtrdma to fail fast and not then invoke the RDMA core APIs, which still might use GFP_KERNEL. Also note that rpc_task_gfp_mask() always sets __GFP_NORETRY and __GFP_NOWARN when an RPC-related allocation is being done in a worker thread. MR allocation is already always done in worker threads. 
Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- net/sunrpc/xprtrdma/frwr_ops.c | 17 +++++++---------- net/sunrpc/xprtrdma/verbs.c | 5 ++++- net/sunrpc/xprtrdma/xprt_rdma.h | 6 ++++++ 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index de0bdb6b729f8..ce55361a822fe 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -126,14 +126,15 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) struct ib_mr *frmr; int rc; + sg = kcalloc_node(depth, sizeof(*sg), XPRTRDMA_GFP_FLAGS, + ibdev_to_node(ep->re_id->device)); + if (!sg) + return -ENOMEM; + frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth); if (IS_ERR(frmr)) goto out_mr_err; - sg = kmalloc_array(depth, sizeof(*sg), GFP_KERNEL); - if (!sg) - goto out_list_err; - mr->mr_xprt = r_xprt; mr->mr_ibmr = frmr; mr->mr_device = NULL; @@ -146,13 +147,9 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) return 0; out_mr_err: - rc = PTR_ERR(frmr); + kfree(sg); trace_xprtrdma_frwr_alloc(mr, rc); - return rc; - -out_list_err: - ib_dereg_mr(frmr); - return -ENOMEM; + return PTR_ERR(frmr); } /** diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 8fb10fc72f695..4a7b87e9e47ca 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -739,13 +739,16 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct ib_device *device = ep->re_id->device; unsigned int count; + /* Try to allocate enough to perform one full-sized I/O */ for (count = 0; count < ep->re_max_rdma_segs; count++) { struct rpcrdma_mr *mr; int rc; - mr = kzalloc(sizeof(*mr), GFP_KERNEL); + mr = kzalloc_node(sizeof(*mr), XPRTRDMA_GFP_FLAGS, + ibdev_to_node(device)); if (!mr) break; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h 
b/net/sunrpc/xprtrdma/xprt_rdma.h index 227dce50cc4bb..5e5ff6784ef5f 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -149,6 +149,12 @@ static inline void *rdmab_data(const struct rpcrdma_regbuf *rb) return rb->rg_data; } +/* Do not use emergency memory reserves, and fail quickly if memory + * cannot be allocated easily. These flags may be used wherever there + * is robust logic to handle a failure to allocate. + */ +#define XPRTRDMA_GFP_FLAGS (__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) + /* To ensure a transport can always make forward progress, * the number of RDMA segments allowed in header chunk lists * is capped at 16. This prevents less-capable devices from -- GitLab From 9c8f332fbf995dc1d4d30a973d7ad6e1adb56437 Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Fri, 23 Sep 2022 09:06:30 -0400 Subject: [PATCH 1376/2223] xprtrdma: Memory allocation should be allowed to fail during connect An attempt to establish a connection can always fail and then be retried. GFP_KERNEL allocation is not necessary here. Like MR allocation, establishing a connection is always done in a worker thread. The new GFP flags align with the flags that would be returned by rpc_task_gfp_mask() in this case. 
Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- net/sunrpc/xprtrdma/verbs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 4a7b87e9e47ca..7ca58cb65e27e 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -372,7 +372,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_ep *ep; int rc; - ep = kzalloc(sizeof(*ep), GFP_KERNEL); + ep = kzalloc(sizeof(*ep), XPRTRDMA_GFP_FLAGS); if (!ep) return -ENOTCONN; ep->re_xprt = &r_xprt->rx_xprt; @@ -605,7 +605,7 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep) struct rpcrdma_sendctx *sc; sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge), - GFP_KERNEL); + XPRTRDMA_GFP_FLAGS); if (!sc) return NULL; @@ -628,7 +628,7 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) * Sends are posted. */ i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS; - buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL); + buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), XPRTRDMA_GFP_FLAGS); if (!buf->rb_sc_ctxs) return -ENOMEM; -- GitLab From f20f18c95630e9b53f5081fd5df3bb705c450bbe Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Fri, 23 Sep 2022 09:06:37 -0400 Subject: [PATCH 1377/2223] xprtrdma: Prevent memory allocations from driving a reclaim Many memory allocations that xprtrdma does can fail safely. Let's use this fact to avoid some potential deadlocks: Replace GFP_KERNEL with GFP flags that do not try hard to acquire memory. 
Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- net/sunrpc/xprtrdma/verbs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 7ca58cb65e27e..44b87e4274b42 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -811,7 +811,7 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; struct rpcrdma_req *req; - req = kzalloc(sizeof(*req), GFP_KERNEL); + req = kzalloc(sizeof(*req), XPRTRDMA_GFP_FLAGS); if (req == NULL) goto out1; @@ -926,7 +926,7 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_rep *rep; - rep = kzalloc(sizeof(*rep), GFP_KERNEL); + rep = kzalloc(sizeof(*rep), XPRTRDMA_GFP_FLAGS); if (rep == NULL) goto out; @@ -1236,10 +1236,10 @@ rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction) { struct rpcrdma_regbuf *rb; - rb = kmalloc(sizeof(*rb), GFP_KERNEL); + rb = kmalloc(sizeof(*rb), XPRTRDMA_GFP_FLAGS); if (!rb) return NULL; - rb->rg_data = kmalloc(size, GFP_KERNEL); + rb->rg_data = kmalloc(size, XPRTRDMA_GFP_FLAGS); if (!rb->rg_data) { kfree(rb); return NULL; -- GitLab From e4266f23ecdf0d3d1f1d9e8fff730e1f962b0687 Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Wed, 28 Sep 2022 09:00:48 -0400 Subject: [PATCH 1378/2223] xprtrdma: Fix uninitialized variable net/sunrpc/xprtrdma/frwr_ops.c:151:32: warning: variable 'rc' is uninitialized when used here [-Wuninitialized] trace_xprtrdma_frwr_alloc(mr, rc); ^~ net/sunrpc/xprtrdma/frwr_ops.c:127:8: note: initialize the variable 'rc' to silence this warning int rc; ^ = 0 1 warning generated. The tracepoint is intended to record the error returned from ib_alloc_mr(). In the current code there is no other purpose for @rc, so simply replace it. 
Reported-by: kernel test robot <lkp@intel.com> Fixes: d8cf39a280c3b0 ('xprtrdma: MR-related memory allocation should be allowed to fail') Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- net/sunrpc/xprtrdma/frwr_ops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index ce55361a822fe..ffbf99894970e 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -124,7 +124,6 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) unsigned int depth = ep->re_max_fr_depth; struct scatterlist *sg; struct ib_mr *frmr; - int rc; sg = kcalloc_node(depth, sizeof(*sg), XPRTRDMA_GFP_FLAGS, ibdev_to_node(ep->re_id->device)); @@ -148,7 +147,7 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) out_mr_err: kfree(sg); - trace_xprtrdma_frwr_alloc(mr, rc); + trace_xprtrdma_frwr_alloc(mr, PTR_ERR(frmr)); return PTR_ERR(frmr); } -- GitLab From 689fe57e7ecefd2eeba76c32aa569bb3e1e790d9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim <jaegeuk@kernel.org> Date: Fri, 30 Sep 2022 15:48:24 -0700 Subject: [PATCH 1379/2223] f2fs: allow direct read for zoned device This reverts dbf8e63f48af ("f2fs: remove device type check for direct IO") and applies the first version below instead, since the reverted commit contributed to out-of-order DIO writes. For zoned devices, f2fs forbids direct IO and forces buffered IO to serialize write IOs. However, the constraint does not apply to read IOs.
Cc: stable@vger.kernel.org Fixes: dbf8e63f48af ("f2fs: remove device type check for direct IO") Signed-off-by: Eunhee Rho <eunhee83.rho@samsung.com> Reviewed-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/f2fs.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b63b482c35a85..1ebc08be958eb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4526,7 +4526,12 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, /* disallow direct IO if any of devices has unaligned blksize */ if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) return true; - + /* + * for blkzoned device, fallback direct IO to buffered IO, so + * all IOs can be serialized by log-structured write. + */ + if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE)) + return true; if (f2fs_lfs_mode(sbi) && (rw == WRITE)) { if (block_unaligned_IO(inode, iocb, iter)) return true; -- GitLab From 0391632948d9c1394601ae56d0cb25a1630874ed Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Date: Wed, 14 Sep 2022 13:23:45 +0530 Subject: [PATCH 1380/2223] PCI: qcom-ep: Disable Master AXI Clock when there is no PCIe traffic The Master AXI clock can be disabled when it is not used i.e., when there is no traffic on the PCIe bus. This helps to save power during idle state. 
[bhelgaas: tidy and wrap comment] Link: https://lore.kernel.org/r/20220914075350.7992-8-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> --- drivers/pci/controller/dwc/pcie-qcom-ep.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c index 5502e627e4828..c2585cdaa5011 100644 --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -105,6 +105,7 @@ /* PARF_SYS_CTRL register fields */ #define PARF_SYS_CTRL_AUX_PWR_DET BIT(4) #define PARF_SYS_CTRL_CORE_CLK_CGC_DIS BIT(6) +#define PARF_SYS_CTRL_MSTR_ACLK_CGC_DIS BIT(10) #define PARF_SYS_CTRL_SLV_DBI_WAKE_DISABLE BIT(11) /* PARF_DB_CTRL register fields */ @@ -341,8 +342,14 @@ static int qcom_pcie_perst_deassert(struct dw_pcie *pci) val &= ~PARF_Q2A_FLUSH_EN; writel_relaxed(val, pcie_ep->parf + PARF_Q2A_FLUSH); - /* Disable DBI Wakeup, core clock CGC and enable AUX power */ + /* + * Disable Master AXI clock during idle. Do not allow DBI access + * to take the core out of L1. Disable core clock gating that + * gates PIPE clock from propagating to core clock. Report to the + * host that Vaux is present. + */ val = readl_relaxed(pcie_ep->parf + PARF_SYS_CTRL); + val &= ~PARF_SYS_CTRL_MSTR_ACLK_CGC_DIS; val |= PARF_SYS_CTRL_SLV_DBI_WAKE_DISABLE | PARF_SYS_CTRL_CORE_CLK_CGC_DIS | PARF_SYS_CTRL_AUX_PWR_DET; -- GitLab From 299915d6bee257880139528cd3d293707717eca5 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Date: Wed, 14 Sep 2022 13:23:46 +0530 Subject: [PATCH 1381/2223] dt-bindings: PCI: qcom-ep: Make PERST separation optional PERST separation is an optional debug feature used to collect the crash dump from the PCIe endpoint devices by the PCIe host when the endpoint crashes. 
This feature keeps the PCIe link up by separating the PCIe IP block from the SoC reset logic. Remove the corresponding property "qcom,perst-regs" from the required properties list. Link: https://lore.kernel.org/r/20220914075350.7992-9-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org> --- Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml index 3d23599e5e915..b728ede3f09fd 100644 --- a/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml +++ b/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml @@ -105,7 +105,6 @@ required: - reg-names - clocks - clock-names - - qcom,perst-regs - interrupts - interrupt-names - reset-gpios -- GitLab From aa4b1753625ce97a703e71928f67bac07d9d2b55 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Date: Wed, 14 Sep 2022 13:23:47 +0530 Subject: [PATCH 1382/2223] PCI: qcom-ep: Make PERST separation optional PERST separation is an optional debug feature used to collect the crash dump from the PCIe endpoint devices by the PCIe host when the endpoint crashes. This feature keeps the PCIe link up by separating the PCIe IP block from the SoC reset logic. Make the property optional in the driver. 
Link: https://lore.kernel.org/r/20220914075350.7992-10-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> --- drivers/pci/controller/dwc/pcie-qcom-ep.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c index c2585cdaa5011..b11d26e50aa2c 100644 --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -220,8 +220,10 @@ static int qcom_pcie_ep_core_reset(struct qcom_pcie_ep *pcie_ep) */ static void qcom_pcie_ep_configure_tcsr(struct qcom_pcie_ep *pcie_ep) { - regmap_write(pcie_ep->perst_map, pcie_ep->perst_en, 0); - regmap_write(pcie_ep->perst_map, pcie_ep->perst_sep_en, 0); + if (pcie_ep->perst_map) { + regmap_write(pcie_ep->perst_map, pcie_ep->perst_en, 0); + regmap_write(pcie_ep->perst_map, pcie_ep->perst_sep_en, 0); + } } static int qcom_pcie_dw_link_up(struct dw_pcie *pci) @@ -478,8 +480,8 @@ static int qcom_pcie_ep_get_io_resources(struct platform_device *pdev, syscon = of_parse_phandle(dev->of_node, "qcom,perst-regs", 0); if (!syscon) { - dev_err(dev, "Failed to parse qcom,perst-regs\n"); - return -EINVAL; + dev_dbg(dev, "PERST separation not available\n"); + return 0; } pcie_ep->perst_map = syscon_node_to_regmap(syscon); -- GitLab From 8dffa879ac79ffb6421dd924e74e6d07b0996207 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Date: Wed, 14 Sep 2022 13:23:48 +0530 Subject: [PATCH 1383/2223] dt-bindings: PCI: qcom-ep: Define clocks per platform In preparation for adding the bindings for future SoCs, define the clocks per platform. 
Link: https://lore.kernel.org/r/20220914075350.7992-11-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org> --- .../devicetree/bindings/pci/qcom,pcie-ep.yaml | 50 ++++++++++++------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml index b728ede3f09fd..bb8e982e69be5 100644 --- a/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml +++ b/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml @@ -9,9 +9,6 @@ title: Qualcomm PCIe Endpoint Controller binding maintainers: - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> -allOf: - - $ref: "pci-ep.yaml#" - properties: compatible: const: qcom,sdx55-pcie-ep @@ -35,24 +32,10 @@ properties: - const: mmio clocks: - items: - - description: PCIe Auxiliary clock - - description: PCIe CFG AHB clock - - description: PCIe Master AXI clock - - description: PCIe Slave AXI clock - - description: PCIe Slave Q2A AXI clock - - description: PCIe Sleep clock - - description: PCIe Reference clock + maxItems: 7 clock-names: - items: - - const: aux - - const: cfg - - const: bus_master - - const: bus_slave - - const: slave_q2a - - const: sleep - - const: ref + maxItems: 7 qcom,perst-regs: description: Reference to a syscon representing TCSR followed by the two @@ -112,6 +95,35 @@ required: - reset-names - power-domains +allOf: + - $ref: pci-ep.yaml# + - if: + properties: + compatible: + contains: + enum: + - qcom,sdx55-pcie-ep + then: + properties: + clocks: + items: + - description: PCIe Auxiliary clock + - description: PCIe CFG AHB clock + - description: PCIe Master AXI clock + - description: PCIe Slave AXI clock + - description: PCIe Slave Q2A AXI clock + - description: PCIe Sleep 
clock + - description: PCIe Reference clock + clock-names: + items: + - const: aux + - const: cfg + - const: bus_master + - const: bus_slave + - const: slave_q2a + - const: sleep + - const: ref + unevaluatedProperties: false examples: -- GitLab From 63e445b746aa466525a483b81581e4798eb2f321 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Date: Wed, 14 Sep 2022 13:23:49 +0530 Subject: [PATCH 1384/2223] dt-bindings: PCI: qcom-ep: Add support for SM8450 SoC Add devicetree bindings support for SM8450 SoC. Only the clocks are different on this platform, rest is same as SDX55. Link: https://lore.kernel.org/r/20220914075350.7992-12-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Reviewed-by: Rob Herring <robh@kernel.org> --- .../devicetree/bindings/pci/qcom,pcie-ep.yaml | 39 +++++++++++++++++-- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml index bb8e982e69be5..977c976ea7994 100644 --- a/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml +++ b/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml @@ -11,7 +11,9 @@ maintainers: properties: compatible: - const: qcom,sdx55-pcie-ep + enum: + - qcom,sdx55-pcie-ep + - qcom,sm8450-pcie-ep reg: items: @@ -32,10 +34,12 @@ properties: - const: mmio clocks: - maxItems: 7 + minItems: 7 + maxItems: 8 clock-names: - maxItems: 7 + minItems: 7 + maxItems: 8 qcom,perst-regs: description: Reference to a syscon representing TCSR followed by the two @@ -124,6 +128,35 @@ allOf: - const: sleep - const: ref + - if: + properties: + compatible: + contains: + enum: + - qcom,sm8450-pcie-ep + then: + properties: + clocks: + items: + - description: PCIe Auxiliary clock + - description: PCIe CFG AHB clock + - 
description: PCIe Master AXI clock + - description: PCIe Slave AXI clock + - description: PCIe Slave Q2A AXI clock + - description: PCIe Reference clock + - description: PCIe DDRSS SF TBU clock + - description: PCIe AGGRE NOC AXI clock + clock-names: + items: + - const: aux + - const: cfg + - const: bus_master + - const: bus_slave + - const: slave_q2a + - const: ref + - const: ddrss_sf_tbu + - const: aggre_noc_axi + unevaluatedProperties: false examples: -- GitLab From 867ec26c16064b271b1d5fd292a1610ed3a754ec Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Date: Wed, 14 Sep 2022 13:23:50 +0530 Subject: [PATCH 1385/2223] PCI: qcom-ep: Add support for SM8450 SoC Add support for SM8450 SoC to the Qualcomm PCIe Endpoint Controller driver. The driver uses the same config as the existing SDX55 chipset, so additional settings are not required. Link: https://lore.kernel.org/r/20220914075350.7992-13-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> --- drivers/pci/controller/dwc/pcie-qcom-ep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c index b11d26e50aa2c..464e5ca638be8 100644 --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -793,6 +793,7 @@ static int qcom_pcie_ep_remove(struct platform_device *pdev) static const struct of_device_id qcom_pcie_ep_match[] = { { .compatible = "qcom,sdx55-pcie-ep", }, + { .compatible = "qcom,sm8450-pcie-ep", }, { } }; MODULE_DEVICE_TABLE(of, qcom_pcie_ep_match); -- GitLab From 94f0b955e4ed610e4ee93ee72b88c4415bed685d Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Fri, 29 Apr 2022 16:07:40 +0800 Subject: [PATCH 1386/2223] PCI: qcom-ep: Check platform_get_resource_byname() 
return value If platform_get_resource_byname() fails, 'mmio_res' will be set to NULL pointer, which causes a NULL pointer dereference when it is used in qcom_pcie_perst_deassert(). Check the return value to prevent it. Link: https://lore.kernel.org/r/20220429080740.1294797-1-yangyingliang@huawei.com Fixes: f55fee56a631 ("PCI: qcom-ep: Add Qualcomm PCIe Endpoint controller driver") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Reviewed-by: Andrew Halaney <ahalaney@redhat.com> --- drivers/pci/controller/dwc/pcie-qcom-ep.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c index 464e5ca638be8..6d0d1b759ca24 100644 --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -474,6 +474,11 @@ static int qcom_pcie_ep_get_io_resources(struct platform_device *pdev, pcie_ep->mmio_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "mmio"); + if (!pcie_ep->mmio_res) { + dev_err(dev, "Failed to get mmio resource\n"); + return -EINVAL; + } + pcie_ep->mmio = devm_pci_remap_cfg_resource(dev, pcie_ep->mmio_res); if (IS_ERR(pcie_ep->mmio)) return PTR_ERR(pcie_ep->mmio); -- GitLab From 4659f01e3cd94f64d9bd06764ace2ef8fe1b6227 Mon Sep 17 00:00:00 2001 From: Steve French <stfrench@microsoft.com> Date: Sat, 1 Oct 2022 11:44:08 -0500 Subject: [PATCH 1387/2223] smb3: do not log confusing message when server returns no network interfaces Some servers can return an empty network interface list, so unless multichannel is requested there is no need to log an error for this; when multichannel is requested on mount but the server returns no interfaces, log something less confusing.
For this case change parse_server_interfaces: malformed interface info to empty network interface list returned by server localhost Also do not relog this error every ten minutes (only log on mount, once) Cc: <stable@vger.kernel.org> Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cifsproto.h | 2 +- fs/cifs/connect.c | 2 +- fs/cifs/smb2ops.c | 23 ++++++++++++++++++----- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 3bc94bcc7177e..71386978858eb 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -639,7 +639,7 @@ cifs_chan_is_iface_active(struct cifs_ses *ses, int cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server); int -SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon); +SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_mount); void extract_unc_hostname(const char *unc, const char **h, size_t *len); int copy_path_name(char *dst, const char *src); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ad81d7d43eafb..93e59b3b36c73 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -155,7 +155,7 @@ static void smb2_query_server_interfaces(struct work_struct *work) /* * query server network interfaces, in case they change */ - rc = SMB3_request_interfaces(0, tcon); + rc = SMB3_request_interfaces(0, tcon, false); if (rc) { cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", __func__, rc); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f590a9cb6a1a2..10f9ef68e510c 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -512,8 +512,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) static int parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, - size_t buf_len, - struct cifs_ses *ses) + size_t buf_len, struct cifs_ses *ses, bool in_mount) { struct network_interface_info_ioctl_rsp *p; 
struct sockaddr_in *addr4; @@ -543,6 +542,20 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, } spin_unlock(&ses->iface_lock); + /* + * Samba server e.g. can return an empty interface list in some cases, + * which would only be a problem if we were requesting multichannel + */ + if (bytes_left == 0) { + /* avoid spamming logs every 10 minutes, so log only in mount */ + if ((ses->chan_max > 1) && in_mount) + cifs_dbg(VFS, + "empty network interface list returned by server %s\n", + ses->server->hostname); + rc = -EINVAL; + goto out; + } + while (bytes_left >= sizeof(*p)) { memset(&tmp_iface, 0, sizeof(tmp_iface)); tmp_iface.speed = le64_to_cpu(p->LinkSpeed); @@ -673,7 +686,7 @@ out: } int -SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) +SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_mount) { int rc; unsigned int ret_data_len = 0; @@ -693,7 +706,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) goto out; } - rc = parse_server_interfaces(out_buf, ret_data_len, ses); + rc = parse_server_interfaces(out_buf, ret_data_len, ses, in_mount); if (rc) goto out; @@ -729,7 +742,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return; - SMB3_request_interfaces(xid, tcon); + SMB3_request_interfaces(xid, tcon, true /* called during mount */); SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, FS_ATTRIBUTE_INFORMATION); -- GitLab From 943deb6066538aeb5417eae5fdc222defdcb9949 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" <gustavoars@kernel.org> Date: Tue, 4 Oct 2022 20:51:39 -0500 Subject: [PATCH 1388/2223] cifs: Replace a couple of one-element arrays with flexible-array members One-element arrays are deprecated, and we are replacing them with flexible array members instead. So, replace one-element arrays with flexible-array member in structs negotiate_req and extended_response, and refactor the rest of the code, accordingly. 
Also, make use of the DECLARE_FLEX_ARRAY() helper to declare flexible array member EncryptionKey in union u. This new helper allows for flexible-array members in unions. Change pointer notation to proper array notation in a call to memcpy() where flexible-array member DialectsArray is being used as destination argument. Important to mention is that doing a build before/after this patch results in no binary output differences. This helps with the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy() and help us make progress towards globally enabling -fstrict-flex-arrays=3 [1]. Link: https://github.com/KSPP/linux/issues/79 Link: https://github.com/KSPP/linux/issues/229 Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101836 [1] Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cifspdu.h | 7 ++++--- fs/cifs/cifssmb.c | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index aeba371c4c707..d1abaeea974a9 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -483,7 +483,7 @@ put_bcc(__u16 count, struct smb_hdr *hdr) typedef struct negotiate_req { struct smb_hdr hdr; /* wct = 0 */ __le16 ByteCount; - unsigned char DialectsArray[1]; + unsigned char DialectsArray[]; } __attribute__((packed)) NEGOTIATE_REQ; #define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */ @@ -508,13 +508,14 @@ typedef struct negotiate_rsp { __u8 EncryptionKeyLength; __u16 ByteCount; union { - unsigned char EncryptionKey[1]; /* cap extended security off */ + /* cap extended security off */ + DECLARE_FLEX_ARRAY(unsigned char, EncryptionKey); /* followed by Domain name - if extended security is off */ /* followed by 16 bytes of server GUID */ /* then security blob if cap_extended_security negotiated */ struct { unsigned char 
GUID[SMB1_CLIENT_GUID_SIZE]; - unsigned char SecurityBlob[1]; + unsigned char SecurityBlob[]; } __attribute__((packed)) extended_response; } __attribute__((packed)) u; } __attribute__((packed)) NEGOTIATE_RSP; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 7aa91e2720274..7a808e41b1b89 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -465,7 +465,7 @@ CIFSSMBNegotiate(const unsigned int xid, for (i = 0; i < CIFS_NUM_PROT; i++) { size_t len = strlen(protocols[i].name) + 1; - memcpy(pSMB->DialectsArray+count, protocols[i].name, len); + memcpy(&pSMB->DialectsArray[count], protocols[i].name, len); count += len; } inc_rfc1001_len(pSMB, count); -- GitLab From f5823f5ee36040c2a8b8b36afe0783fe0bd7ad14 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum <usama.anjum@collabora.com> Date: Tue, 4 Oct 2022 11:23:32 +0500 Subject: [PATCH 1389/2223] cifs: remove initialization value Don't initialize the rc as its value is being overwritten before its use. Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 90ccac18f9f3f..40fce33763072 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -873,7 +873,7 @@ SMB2_negotiate(const unsigned int xid, struct smb2_negotiate_rsp *rsp; struct kvec iov[1]; struct kvec rsp_iov; - int rc = 0; + int rc; int resp_buftype; int blob_offset, blob_length; char *security_blob; -- GitLab From af7b29b1deaac6da3bb7637f0e263dfab7bfc7a3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean <vladimir.oltean@nxp.com> Date: Wed, 5 Oct 2022 01:01:00 +0300 Subject: [PATCH 1390/2223] Revert "net/sched: taprio: make qdisc_leaf() see the per-netdev-queue pfifo child qdiscs" taprio_attach() has this logic at the end, which should have been removed with the blamed patch (which is now being reverted): /* access to 
the child qdiscs is not needed in offload mode */ if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { kfree(q->qdiscs); q->qdiscs = NULL; } because otherwise, we make use of q->qdiscs[] even after this array was deallocated, namely in taprio_leaf(). Therefore, whenever one would try to attach a valid child qdisc to a fully offloaded taprio root, one would immediately dereference a NULL pointer. $ tc qdisc replace dev eno0 handle 8001: parent root taprio \ num_tc 8 \ map 0 1 2 3 4 5 6 7 \ queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \ max-sdu 0 0 0 0 0 200 0 0 \ base-time 200 \ sched-entry S 80 20000 \ sched-entry S a0 20000 \ sched-entry S 5f 60000 \ flags 2 $ max_frame_size=1500 $ data_rate_kbps=20000 $ port_transmit_rate_kbps=1000000 $ idleslope=$data_rate_kbps $ sendslope=$(($idleslope - $port_transmit_rate_kbps)) $ locredit=$(($max_frame_size * $sendslope / $port_transmit_rate_kbps)) $ hicredit=$(($max_frame_size * $idleslope / $port_transmit_rate_kbps)) $ tc qdisc replace dev eno0 parent 8001:7 cbs \ idleslope $idleslope \ sendslope $sendslope \ hicredit $hicredit \ locredit $locredit \ offload 0 Unable to handle kernel NULL pointer dereference at virtual address 0000000000000030 pc : taprio_leaf+0x28/0x40 lr : qdisc_leaf+0x3c/0x60 Call trace: taprio_leaf+0x28/0x40 tc_modify_qdisc+0xf0/0x72c rtnetlink_rcv_msg+0x12c/0x390 netlink_rcv_skb+0x5c/0x130 rtnetlink_rcv+0x1c/0x2c The solution is not as obvious as the problem. The code which deallocates q->qdiscs[] is in fact copied and pasted from mqprio, which also deallocates the array in mqprio_attach() and never uses it afterwards. 
Therefore, the identical cleanup logic of priv->qdiscs[] that mqprio_destroy() has is deceptive because it will never take place at qdisc_destroy() time, but just at raw ops->destroy() time (otherwise said, priv->qdiscs[] do not last for the entire lifetime of the mqprio root), but rather, this is just the twisted way in which the Qdisc API understands error path cleanup should be done (Qdisc_ops :: destroy() is called even when Qdisc_ops :: init() never succeeded). Side note, in fact this is also what the comment in mqprio_init() says: /* pre-allocate qdisc, attachment can't fail */ Or reworded, mqprio's priv->qdiscs[] scheme is only meant to serve as data passing between Qdisc_ops :: init() and Qdisc_ops :: attach(). [ this comment was also copied and pasted into the initial taprio commit, even though taprio_attach() came way later ] The problem is that taprio also makes extensive use of the q->qdiscs[] array in the software fast path (taprio_enqueue() and taprio_dequeue()), but it does not keep a reference of its own on q->qdiscs[i] (you'd think that since it creates these Qdiscs, it holds the reference, but nope, this is not completely true). To understand the difference between taprio_destroy() and mqprio_destroy() one must look before commit 13511704f8d7 ("net: taprio offload: enforce qdisc to netdev queue mapping"), because that just muddied the waters. In the "original" taprio design, taprio always attached itself (the root Qdisc) to all netdev TX queues, so that dev_qdisc_enqueue() would go through taprio_enqueue(). 
It also called qdisc_refcount_inc() on itself for as many times as there were netdev TX queues, in order to counter-balance what tc_get_qdisc() does when destroying a Qdisc (simplified for brevity below): if (n->nlmsg_type == RTM_DELQDISC) err = qdisc_graft(dev, parent=NULL, new=NULL, q, extack); qdisc_graft(where "new" is NULL so this deletes the Qdisc): for (i = 0; i < num_q; i++) { struct netdev_queue *dev_queue; dev_queue = netdev_get_tx_queue(dev, i); old = dev_graft_qdisc(dev_queue, new); if (new && i > 0) qdisc_refcount_inc(new); qdisc_put(old); ~~~~~~~~~~~~~~ this decrements taprio's refcount once for each TX queue } notify_and_destroy(net, skb, n, classid, rtnl_dereference(dev->qdisc), new); ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ and this finally decrements it to zero, making qdisc_put() call qdisc_destroy() The q->qdiscs[] created using qdisc_create_dflt() (or their replacements, if taprio_graft() was ever to get called) were then privately freed by taprio_destroy(). This is still what is happening after commit 13511704f8d7 ("net: taprio offload: enforce qdisc to netdev queue mapping"), but only for software mode. In full offload mode, the per-txq "qdisc_put(old)" calls from qdisc_graft() now deallocate the child Qdiscs rather than decrement taprio's refcount. So when notify_and_destroy(taprio) finally calls taprio_destroy(), the difference is that the child Qdiscs were already deallocated. And this is exactly why the taprio_attach() comment "access to the child qdiscs is not needed in offload mode" is deceptive too. Not only the q->qdiscs[] array is not needed, but it is also necessary to get rid of it as soon as possible, because otherwise, we will also call qdisc_put() on the child Qdiscs in qdisc_destroy() -> taprio_destroy(), and this will cause a nasty use-after-free/refcount-saturate/whatever. 
In short, the problem is that since the blamed commit, taprio_leaf() needs q->qdiscs[] to not be freed by taprio_attach(), while qdisc_destroy() -> taprio_destroy() does need q->qdiscs[] to be freed by taprio_attach() for full offload. Fixing one problem triggers the other. All of this can be solved by making taprio keep its q->qdiscs[i] with a refcount elevated at 2 (in offloaded mode where they are attached to the netdev TX queues), both in taprio_attach() and in taprio_graft(). The generic qdisc_graft() would just decrement the child qdiscs' refcounts to 1, and taprio_destroy() would give them the final coup de grace. However the rabbit hole of changes is getting quite deep, and the complexity increases. The blamed commit was supposed to be a bug fix in the first place, and the bug it addressed is not so significant so as to justify further rework in stable trees. So I'd rather just revert it. I don't know enough about multi-queue Qdisc design to make a proper judgement right now regarding what is/isn't idiomatic use of Qdisc concepts in taprio. I will try to study the problem more and come with a different solution in net-next. 
Fixes: 1461d212ab27 ("net/sched: taprio: make qdisc_leaf() see the per-netdev-queue pfifo child qdiscs") Reported-by: Muhammad Husaini Zulkifli <muhammad.husaini.zulkifli@intel.com> Reported-by: Vinicius Costa Gomes <vinicius.gomes@intel.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Vinicius Costa Gomes <vinicius.gomes@intel.com> Link: https://lore.kernel.org/r/20221004220100.1650558-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/sched/sch_taprio.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 435d866fcfa02..570389f6cdd7d 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -2043,14 +2043,12 @@ start_error: static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) { - struct taprio_sched *q = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); - unsigned int ntx = cl - 1; + struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); - if (ntx >= dev->num_tx_queues) + if (!dev_queue) return NULL; - return q->qdiscs[ntx]; + return dev_queue->qdisc_sleeping; } static unsigned long taprio_find(struct Qdisc *sch, u32 classid) -- GitLab From 304ee24bdb43d095621669e926feab728454bc63 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven <geert+renesas@glider.be> Date: Tue, 4 Oct 2022 18:23:53 +0200 Subject: [PATCH 1391/2223] net: pse-pd: PSE_REGULATOR should depend on REGULATOR The Regulator based PSE controller driver relies on regulator support to be enabled. If regulator support is disabled, it will still compile fine, but won't operate correctly. Hence add a dependency on REGULATOR, to prevent asking the user about this driver when configuring a kernel without regulator support. 
Fixes: 66741b4e94ca7bb1 ("net: pse-pd: add regulator based PSE driver") Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be> Reviewed-by: Oleksij Rempel <o.rempel@pengutronix.de> Link: https://lore.kernel.org/r/709caac8873ff2a8b72b92091429be7c1a939959.1664900558.git.geert+renesas@glider.be Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/pse-pd/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/pse-pd/Kconfig b/drivers/net/pse-pd/Kconfig index 73d163704068a..687dec49c1e13 100644 --- a/drivers/net/pse-pd/Kconfig +++ b/drivers/net/pse-pd/Kconfig @@ -14,6 +14,7 @@ if PSE_CONTROLLER config PSE_REGULATOR tristate "Regulator based PSE controller" + depends on REGULATOR || COMPILE_TEST help This module provides support for simple regulator based Ethernet Power Sourcing Equipment without automatic classification support. For -- GitLab From 229a0027591c970e89992313d87330a3cfe6d028 Mon Sep 17 00:00:00 2001 From: Casper Andersson <casper.casan@gmail.com> Date: Tue, 4 Oct 2022 09:32:42 +0200 Subject: [PATCH 1392/2223] docs: networking: phy: add missing space Missing space between "pins'" and "strength" Signed-off-by: Casper Andersson <casper.casan@gmail.com> Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com> Link: https://lore.kernel.org/r/20221004073242.304425-1-casper.casan@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- Documentation/networking/phy.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index 06f4fcdb58b66..d11329a08984e 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -120,7 +120,7 @@ required delays, as defined per the RGMII standard, several options may be available: * Some SoCs may offer a pin pad/mux/controller capable of configuring a given - set of pins'strength, delays, and voltage; and it may be a suitable + set of pins' strength, delays, and voltage; and it may be a suitable 
option to insert the expected 2ns RGMII delay. * Modifying the PCB design to include a fixed delay (e.g: using a specifically -- GitLab From f93719351b0e3684675b3824708a735c0e57005e Mon Sep 17 00:00:00 2001 From: Alexandru Tachici <alexandru.tachici@analog.com> Date: Mon, 3 Oct 2022 14:16:36 +0300 Subject: [PATCH 1393/2223] net: ethernet: adi: adin1110: Add check in netdev_event Check whether this driver actually is the intended recipient of upper change event. Fixes: bc93e19d088b ("net: ethernet: adi: Add ADIN1110 support") Signed-off-by: Alexandru Tachici <alexandru.tachici@analog.com> Link: https://lore.kernel.org/r/20221003111636.54973-1-alexandru.tachici@analog.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/ethernet/adi/adin1110.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/adi/adin1110.c b/drivers/net/ethernet/adi/adin1110.c index aaee7c4248e6e..1744d623999d0 100644 --- a/drivers/net/ethernet/adi/adin1110.c +++ b/drivers/net/ethernet/adi/adin1110.c @@ -1169,6 +1169,11 @@ static int adin1110_port_bridge_leave(struct adin1110_port_priv *port_priv, return ret; } +static bool adin1110_port_dev_check(const struct net_device *dev) +{ + return dev->netdev_ops == &adin1110_netdev_ops; +} + static int adin1110_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { @@ -1177,6 +1182,9 @@ static int adin1110_netdevice_event(struct notifier_block *unused, struct netdev_notifier_changeupper_info *info = ptr; int ret = 0; + if (!adin1110_port_dev_check(dev)) + return NOTIFY_DONE; + switch (event) { case NETDEV_CHANGEUPPER: if (netif_is_bridge_master(info->upper_dev)) { @@ -1202,11 +1210,6 @@ static void adin1110_disconnect_phy(void *data) phy_disconnect(data); } -static bool adin1110_port_dev_check(const struct net_device *dev) -{ - return dev->netdev_ops == &adin1110_netdev_ops; -} - static int adin1110_port_set_forwarding_state(struct adin1110_port_priv *port_priv) { 
struct adin1110_priv *priv = port_priv->priv; -- GitLab From 6b430f72b2bc14fd0ac922dda92eaa51c82e15a4 Mon Sep 17 00:00:00 2001 From: Felix Fietkau <nbd@nbd.name> Date: Tue, 27 Sep 2022 11:38:23 +0200 Subject: [PATCH 1394/2223] wifi: mt76: fix rate reporting / throughput regression on mt7915 and newer mt7915 and newer need to report the rate_info that's stored in wcid->rate, since they don't fill info->status.rates. Cc: Jonas Jelonek <jelonek.jonas@gmail.com> Reported-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com> Link: https://lore.kernel.org/all/CABXGCsP0znm9pS-MiKtyxTXR7XiyFVqen0qzNpicGHDZKCzbwg@mail.gmail.com/ Fixes: 44fa75f207d8 ("mac80211: extend current rate control tx status API") Signed-off-by: Felix Fietkau <nbd@nbd.name> Tested-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com> Signed-off-by: Kalle Valo <kvalo@kernel.org> Link: https://lore.kernel.org/r/20220927093823.6007-1-nbd@nbd.name --- drivers/net/wireless/mediatek/mt76/tx.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/tx.c b/drivers/net/wireless/mediatek/mt76/tx.c index e67cc7909bce3..6c054850363f6 100644 --- a/drivers/net/wireless/mediatek/mt76/tx.c +++ b/drivers/net/wireless/mediatek/mt76/tx.c @@ -60,14 +60,20 @@ mt76_tx_status_unlock(struct mt76_dev *dev, struct sk_buff_head *list) .skb = skb, .info = IEEE80211_SKB_CB(skb), }; + struct ieee80211_rate_status rs = {}; struct mt76_tx_cb *cb = mt76_tx_skb_cb(skb); struct mt76_wcid *wcid; wcid = rcu_dereference(dev->wcid[cb->wcid]); if (wcid) { status.sta = wcid_to_sta(wcid); - status.rates = NULL; - status.n_rates = 0; + if (status.sta && (wcid->rate.flags || wcid->rate.legacy)) { + rs.rate_idx = wcid->rate; + status.rates = &rs; + status.n_rates = 1; + } else { + status.n_rates = 0; + } } hw = mt76_tx_status_get_hw(dev, skb); -- GitLab From 06c62f8cbb1f660a4147b0d8cbe65cf2cfc1aa5a Mon Sep 17 00:00:00 2001 From: Colin Ian King <colin.i.king@gmail.com> Date: Tue, 4 Oct 2022 
17:06:39 +0100 Subject: [PATCH 1395/2223] xen/xenbus: Fix spelling mistake "hardward" -> "hardware" There is a spelling mistake in the module description. Fix it. Signed-off-by: Colin Ian King <colin.i.king@gmail.com> Reviewed-by: Juergen Gross <jgross@suse.com> Link: https://lore.kernel.org/r/20221004160639.154421-1-colin.i.king@gmail.com Signed-off-by: Juergen Gross <jgross@suse.com> --- drivers/xen/xen-pciback/xenbus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index bde63ef677b8f..d171091eec123 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -31,7 +31,7 @@ MODULE_PARM_DESC(passthrough, " frontend (for example, a device at 06:01.b will still appear at\n"\ " 06:01.b to the frontend). This is similar to how Xen 2.0.x\n"\ " exposed PCI devices to its driver domains. This may be required\n"\ - " for drivers which depend on finding their hardward in certain\n"\ + " for drivers which depend on finding their hardware in certain\n"\ " bus/slot locations."); static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev) -- GitLab From e433715b116553892ecad8796018ae4b64304252 Mon Sep 17 00:00:00 2001 From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Date: Wed, 5 Oct 2022 20:48:22 +0300 Subject: [PATCH 1396/2223] xen/virtio: Fix n_pages calculation in xen_grant_dma_map(unmap)_page() Take page offset into account when calculating the number of pages to be granted.
Fixes: d6aca3504c7d ("xen/grant-dma-ops: Add option to restrict memory access under Xen") Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> Reviewed-by: Juergen Gross <jgross@suse.com> Link: https://lore.kernel.org/r/20221005174823.1800761-2-olekstysh@gmail.com Signed-off-by: Juergen Gross <jgross@suse.com> --- drivers/xen/grant-dma-ops.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c index 8973fc1e9cccd..1998d0e8ce82a 100644 --- a/drivers/xen/grant-dma-ops.c +++ b/drivers/xen/grant-dma-ops.c @@ -153,7 +153,7 @@ static dma_addr_t xen_grant_dma_map_page(struct device *dev, struct page *page, unsigned long attrs) { struct xen_grant_dma_data *data; - unsigned int i, n_pages = PFN_UP(size); + unsigned int i, n_pages = PFN_UP(offset + size); grant_ref_t grant; dma_addr_t dma_handle; @@ -185,7 +185,8 @@ static void xen_grant_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, unsigned long attrs) { struct xen_grant_dma_data *data; - unsigned int i, n_pages = PFN_UP(size); + unsigned long offset = dma_handle & (PAGE_SIZE - 1); + unsigned int i, n_pages = PFN_UP(offset + size); grant_ref_t grant; if (WARN_ON(dir == DMA_NONE)) -- GitLab From 77be00f194b6e1647cddb644b7023b352c2c6ee8 Mon Sep 17 00:00:00 2001 From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Date: Wed, 5 Oct 2022 20:48:23 +0300 Subject: [PATCH 1397/2223] xen/virtio: Fix potential deadlock when accessing xen_grant_dma_devices As find_xen_grant_dma_data() is called from both interrupt and process contexts, the access to xen_grant_dma_devices XArray must be protected by xa_lock_irqsave to avoid deadlock scenario. As XArray API doesn't provide xa_store_irqsave helper, call lockless __xa_store directly and guard it externally. Also move the storage of the XArray's entry to a separate helper. 
Fixes: d6aca3504c7d ("xen/grant-dma-ops: Add option to restrict memory access under Xen") Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> Reviewed-by: Juergen Gross <jgross@suse.com> Link: https://lore.kernel.org/r/20221005174823.1800761-3-olekstysh@gmail.com Signed-off-by: Juergen Gross <jgross@suse.com> --- drivers/xen/grant-dma-ops.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c index 1998d0e8ce82a..c66f56d24013b 100644 --- a/drivers/xen/grant-dma-ops.c +++ b/drivers/xen/grant-dma-ops.c @@ -25,7 +25,7 @@ struct xen_grant_dma_data { bool broken; }; -static DEFINE_XARRAY(xen_grant_dma_devices); +static DEFINE_XARRAY_FLAGS(xen_grant_dma_devices, XA_FLAGS_LOCK_IRQ); #define XEN_GRANT_DMA_ADDR_OFF (1ULL << 63) @@ -42,14 +42,29 @@ static inline grant_ref_t dma_to_grant(dma_addr_t dma) static struct xen_grant_dma_data *find_xen_grant_dma_data(struct device *dev) { struct xen_grant_dma_data *data; + unsigned long flags; - xa_lock(&xen_grant_dma_devices); + xa_lock_irqsave(&xen_grant_dma_devices, flags); data = xa_load(&xen_grant_dma_devices, (unsigned long)dev); - xa_unlock(&xen_grant_dma_devices); + xa_unlock_irqrestore(&xen_grant_dma_devices, flags); return data; } +static int store_xen_grant_dma_data(struct device *dev, + struct xen_grant_dma_data *data) +{ + unsigned long flags; + int ret; + + xa_lock_irqsave(&xen_grant_dma_devices, flags); + ret = xa_err(__xa_store(&xen_grant_dma_devices, (unsigned long)dev, data, + GFP_ATOMIC)); + xa_unlock_irqrestore(&xen_grant_dma_devices, flags); + + return ret; +} + /* * DMA ops for Xen frontends (e.g. virtio). 
* @@ -338,8 +353,7 @@ void xen_grant_setup_dma_ops(struct device *dev) */ data->backend_domid = iommu_spec.args[0]; - if (xa_err(xa_store(&xen_grant_dma_devices, (unsigned long)dev, data, - GFP_KERNEL))) { + if (store_xen_grant_dma_data(dev, data)) { dev_err(dev, "Cannot store Xen grant DMA data\n"); goto err; } -- GitLab From 0991028cd49567d7016d1b224fe0117c35059f86 Mon Sep 17 00:00:00 2001 From: "M. Vefa Bicakci" <m.v.b@runbox.com> Date: Sun, 2 Oct 2022 18:20:05 -0400 Subject: [PATCH 1398/2223] xen/gntdev: Prevent leaking grants Prior to this commit, if a grant mapping operation failed partially, some of the entries in the map_ops array would be invalid, whereas all of the entries in the kmap_ops array would be valid. This in turn would cause the following logic in gntdev_map_grant_pages to become invalid: for (i = 0; i < map->count; i++) { if (map->map_ops[i].status == GNTST_okay) { map->unmap_ops[i].handle = map->map_ops[i].handle; if (!use_ptemod) alloced++; } if (use_ptemod) { if (map->kmap_ops[i].status == GNTST_okay) { if (map->map_ops[i].status == GNTST_okay) alloced++; map->kunmap_ops[i].handle = map->kmap_ops[i].handle; } } } ... atomic_add(alloced, &map->live_grants); Assume that use_ptemod is true (i.e., the domain mapping the granted pages is a paravirtualized domain). In the code excerpt above, note that the "alloced" variable is only incremented when both kmap_ops[i].status and map_ops[i].status are set to GNTST_okay (i.e., both mapping operations are successful). However, as also noted above, there are cases where a grant mapping operation fails partially, breaking the assumption of the code excerpt above. The aforementioned causes map->live_grants to be incorrectly set. In some cases, all of the map_ops mappings fail, but all of the kmap_ops mappings succeed, meaning that live_grants may remain zero. 
This in turn makes it impossible to unmap the successfully grant-mapped pages pointed to by kmap_ops, because unmap_grant_pages has the following snippet of code at its beginning: if (atomic_read(&map->live_grants) == 0) return; /* Nothing to do */ In other cases where only some of the map_ops mappings fail but all kmap_ops mappings succeed, live_grants is made positive, but when the user requests unmapping the grant-mapped pages, __unmap_grant_pages_done will then make map->live_grants negative, because the latter function does not check if all of the pages that were requested to be unmapped were actually unmapped, and the same function unconditionally subtracts "data->count" (i.e., a value that can be greater than map->live_grants) from map->live_grants. The side effects of a negative live_grants value have not been studied. The net effect of all of this is that grant references are leaked in one of the above conditions. In Qubes OS v4.1 (which uses Xen's grant mechanism extensively for X11 GUI isolation), this issue manifests itself with warning messages like the following to be printed out by the Linux kernel in the VM that had granted pages (that contain X11 GUI window data) to dom0: "g.e. 0x1234 still pending", especially after the user rapidly resizes GUI VM windows (causing some grant-mapping operations to partially or completely fail, due to the fact that the VM unshares some of the pages as part of the window resizing, making the pages impossible to grant-map from dom0). The fix for this issue involves counting all successful map_ops and kmap_ops mappings separately, and then adding the sum to live_grants. During unmapping, only the number of successfully unmapped grants is subtracted from live_grants. The code is also modified to check for negative live_grants values after the subtraction and warn the user. 
Link: https://github.com/QubesOS/qubes-issues/issues/7631 Fixes: dbe97cff7dd9 ("xen/gntdev: Avoid blocking in unmap_grant_pages()") Cc: stable@vger.kernel.org Signed-off-by: M. Vefa Bicakci <m.v.b@runbox.com> Acked-by: Demi Marie Obenour <demi@invisiblethingslab.com> Reviewed-by: Juergen Gross <jgross@suse.com> Link: https://lore.kernel.org/r/20221002222006.2077-2-m.v.b@runbox.com Signed-off-by: Juergen Gross <jgross@suse.com> --- drivers/xen/gntdev.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 84b143eef395b..eb0586b9767d1 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -367,8 +367,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map) for (i = 0; i < map->count; i++) { if (map->map_ops[i].status == GNTST_okay) { map->unmap_ops[i].handle = map->map_ops[i].handle; - if (!use_ptemod) - alloced++; + alloced++; } else if (!err) err = -EINVAL; @@ -377,8 +376,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map) if (use_ptemod) { if (map->kmap_ops[i].status == GNTST_okay) { - if (map->map_ops[i].status == GNTST_okay) - alloced++; + alloced++; map->kunmap_ops[i].handle = map->kmap_ops[i].handle; } else if (!err) err = -EINVAL; @@ -394,8 +392,14 @@ static void __unmap_grant_pages_done(int result, unsigned int i; struct gntdev_grant_map *map = data->data; unsigned int offset = data->unmap_ops - map->unmap_ops; + int successful_unmaps = 0; + int live_grants; for (i = 0; i < data->count; i++) { + if (map->unmap_ops[offset + i].status == GNTST_okay && + map->unmap_ops[offset + i].handle != INVALID_GRANT_HANDLE) + successful_unmaps++; + WARN_ON(map->unmap_ops[offset + i].status != GNTST_okay && map->unmap_ops[offset + i].handle != INVALID_GRANT_HANDLE); pr_debug("unmap handle=%d st=%d\n", @@ -403,6 +407,10 @@ static void __unmap_grant_pages_done(int result, map->unmap_ops[offset+i].status); map->unmap_ops[offset+i].handle = INVALID_GRANT_HANDLE; if 
(use_ptemod) { + if (map->kunmap_ops[offset + i].status == GNTST_okay && + map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE) + successful_unmaps++; + WARN_ON(map->kunmap_ops[offset + i].status != GNTST_okay && map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE); pr_debug("kunmap handle=%u st=%d\n", @@ -411,11 +419,15 @@ static void __unmap_grant_pages_done(int result, map->kunmap_ops[offset+i].handle = INVALID_GRANT_HANDLE; } } + /* * Decrease the live-grant counter. This must happen after the loop to * prevent premature reuse of the grants by gnttab_mmap(). */ - atomic_sub(data->count, &map->live_grants); + live_grants = atomic_sub_return(successful_unmaps, &map->live_grants); + if (WARN_ON(live_grants < 0)) + pr_err("%s: live_grants became negative (%d) after unmapping %d pages!\n", + __func__, live_grants, successful_unmaps); /* Release reference taken by __unmap_grant_pages */ gntdev_put_map(NULL, map); -- GitLab From 5c13a4a0291b30191eff9ead8d010e1ca43a4d0c Mon Sep 17 00:00:00 2001 From: "M. Vefa Bicakci" <m.v.b@runbox.com> Date: Sun, 2 Oct 2022 18:20:06 -0400 Subject: [PATCH 1399/2223] xen/gntdev: Accommodate VMA splitting Prior to this commit, the gntdev driver code did not handle the following scenario correctly with paravirtualized (PV) Xen domains: * User process sets up a gntdev mapping composed of two grant mappings (i.e., two pages shared by another Xen domain). * User process munmap()s one of the pages. * User process munmap()s the remaining page. * User process exits. In the scenario above, the user process would cause the kernel to log the following messages in dmesg for the first munmap(), and the second munmap() call would result in similar log messages: BUG: Bad page map in process doublemap.test pte:... pmd:... page:0000000057c97bff refcount:1 mapcount:-1 \ mapping:0000000000000000 index:0x0 pfn:... ... page dumped because: bad pte ... file:gntdev fault:0x0 mmap:gntdev_mmap [xen_gntdev] readpage:0x0 ... 
Call Trace: <TASK> dump_stack_lvl+0x46/0x5e print_bad_pte.cold+0x66/0xb6 unmap_page_range+0x7e5/0xdc0 unmap_vmas+0x78/0xf0 unmap_region+0xa8/0x110 __do_munmap+0x1ea/0x4e0 __vm_munmap+0x75/0x120 __x64_sys_munmap+0x28/0x40 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x61/0xcb ... For each munmap() call, the Xen hypervisor (if built with CONFIG_DEBUG) would print out the following and trigger a general protection fault in the affected Xen PV domain: (XEN) d0v... Attempt to implicitly unmap d0's grant PTE ... (XEN) d0v... Attempt to implicitly unmap d0's grant PTE ... As of this writing, gntdev_grant_map structure's vma field (referred to as map->vma below) is mainly used for checking the start and end addresses of mappings. However, with split VMAs, these may change, and there could be more than one VMA associated with a gntdev mapping. Hence, remove the use of map->vma and rely on map->pages_vm_start for the original start address and on (map->count << PAGE_SHIFT) for the original mapping size. Let the invalidate() and find_special_page() hooks use these. Also, given that there can be multiple VMAs associated with a gntdev mapping, move the "mmu_interval_notifier_remove(&map->notifier)" call to the end of gntdev_put_map, so that the MMU notifier is only removed after the closing of the last remaining VMA. Finally, use an atomic to prevent inadvertent gntdev mapping re-use, instead of using the map->live_grants atomic counter and/or the map->vma pointer (the latter of which is now removed). This prevents the userspace from mmap()'ing (with MAP_FIXED) a gntdev mapping over the same address range as a previously set up gntdev mapping. 
This scenario can be summarized with the following call-trace, which was valid prior to this commit: mmap gntdev_mmap mmap (repeat mmap with MAP_FIXED over the same address range) gntdev_invalidate unmap_grant_pages (sets 'being_removed' entries to true) gnttab_unmap_refs_async unmap_single_vma gntdev_mmap (maps the shared pages again) munmap gntdev_invalidate unmap_grant_pages (no-op because 'being_removed' entries are true) unmap_single_vma (For PV domains, Xen reports that a granted page is being unmapped and triggers a general protection fault in the affected domain, if Xen was built with CONFIG_DEBUG) The fix for this last scenario could be worth its own commit, but we opted for a single commit, because removing the gntdev_grant_map structure's vma field requires guarding the entry to gntdev_mmap(), and the live_grants atomic counter is not sufficient on its own to prevent the mmap() over a pre-existing mapping. Link: https://github.com/QubesOS/qubes-issues/issues/7631 Fixes: ab31523c2fca ("xen/gntdev: allow usermode to map granted pages") Cc: stable@vger.kernel.org Signed-off-by: M. 
Vefa Bicakci <m.v.b@runbox.com> Reviewed-by: Juergen Gross <jgross@suse.com> Link: https://lore.kernel.org/r/20221002222006.2077-3-m.v.b@runbox.com Signed-off-by: Juergen Gross <jgross@suse.com> --- drivers/xen/gntdev-common.h | 3 +- drivers/xen/gntdev.c | 58 ++++++++++++++++--------------------- 2 files changed, 27 insertions(+), 34 deletions(-) diff --git a/drivers/xen/gntdev-common.h b/drivers/xen/gntdev-common.h index 40ef379c28ab0..9c286b2a19001 100644 --- a/drivers/xen/gntdev-common.h +++ b/drivers/xen/gntdev-common.h @@ -44,9 +44,10 @@ struct gntdev_unmap_notify { }; struct gntdev_grant_map { + atomic_t in_use; struct mmu_interval_notifier notifier; + bool notifier_init; struct list_head next; - struct vm_area_struct *vma; int index; int count; int flags; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index eb0586b9767d1..4d9a3050de6a3 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -286,6 +286,9 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map) */ } + if (use_ptemod && map->notifier_init) + mmu_interval_notifier_remove(&map->notifier); + if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) { notify_remote_via_evtchn(map->notify.event); evtchn_put(map->notify.event); @@ -298,7 +301,7 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map) static int find_grant_ptes(pte_t *pte, unsigned long addr, void *data) { struct gntdev_grant_map *map = data; - unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT; + unsigned int pgnr = (addr - map->pages_vm_start) >> PAGE_SHIFT; int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte | (1 << _GNTMAP_guest_avail0); u64 pte_maddr; @@ -508,11 +511,7 @@ static void gntdev_vma_close(struct vm_area_struct *vma) struct gntdev_priv *priv = file->private_data; pr_debug("gntdev_vma_close %p\n", vma); - if (use_ptemod) { - WARN_ON(map->vma != vma); - mmu_interval_notifier_remove(&map->notifier); - map->vma = NULL; - } + 
vma->vm_private_data = NULL; gntdev_put_map(priv, map); } @@ -540,29 +539,30 @@ static bool gntdev_invalidate(struct mmu_interval_notifier *mn, struct gntdev_grant_map *map = container_of(mn, struct gntdev_grant_map, notifier); unsigned long mstart, mend; + unsigned long map_start, map_end; if (!mmu_notifier_range_blockable(range)) return false; + map_start = map->pages_vm_start; + map_end = map->pages_vm_start + (map->count << PAGE_SHIFT); + /* * If the VMA is split or otherwise changed the notifier is not * updated, but we don't want to process VA's outside the modified * VMA. FIXME: It would be much more understandable to just prevent * modifying the VMA in the first place. */ - if (map->vma->vm_start >= range->end || - map->vma->vm_end <= range->start) + if (map_start >= range->end || map_end <= range->start) return true; - mstart = max(range->start, map->vma->vm_start); - mend = min(range->end, map->vma->vm_end); + mstart = max(range->start, map_start); + mend = min(range->end, map_end); pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", - map->index, map->count, - map->vma->vm_start, map->vma->vm_end, - range->start, range->end, mstart, mend); - unmap_grant_pages(map, - (mstart - map->vma->vm_start) >> PAGE_SHIFT, - (mend - mstart) >> PAGE_SHIFT); + map->index, map->count, map_start, map_end, + range->start, range->end, mstart, mend); + unmap_grant_pages(map, (mstart - map_start) >> PAGE_SHIFT, + (mend - mstart) >> PAGE_SHIFT); return true; } @@ -1042,18 +1042,15 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) return -EINVAL; pr_debug("map %d+%d at %lx (pgoff %lx)\n", - index, count, vma->vm_start, vma->vm_pgoff); + index, count, vma->vm_start, vma->vm_pgoff); mutex_lock(&priv->lock); map = gntdev_find_map_index(priv, index, count); if (!map) goto unlock_out; - if (use_ptemod && map->vma) - goto unlock_out; - if (atomic_read(&map->live_grants)) { - err = -EAGAIN; + if (!atomic_add_unless(&map->in_use, 1, 1)) goto 
unlock_out; - } + refcount_inc(&map->users); vma->vm_ops = &gntdev_vmops; @@ -1074,15 +1071,16 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) map->flags |= GNTMAP_readonly; } + map->pages_vm_start = vma->vm_start; + if (use_ptemod) { - map->vma = vma; err = mmu_interval_notifier_insert_locked( &map->notifier, vma->vm_mm, vma->vm_start, vma->vm_end - vma->vm_start, &gntdev_mmu_ops); - if (err) { - map->vma = NULL; + if (err) goto out_unlock_put; - } + + map->notifier_init = true; } mutex_unlock(&priv->lock); @@ -1099,7 +1097,6 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) */ mmu_interval_read_begin(&map->notifier); - map->pages_vm_start = vma->vm_start; err = apply_to_page_range(vma->vm_mm, vma->vm_start, vma->vm_end - vma->vm_start, find_grant_ptes, map); @@ -1128,13 +1125,8 @@ unlock_out: out_unlock_put: mutex_unlock(&priv->lock); out_put_map: - if (use_ptemod) { + if (use_ptemod) unmap_grant_pages(map, 0, map->count); - if (map->vma) { - mmu_interval_notifier_remove(&map->notifier); - map->vma = NULL; - } - } gntdev_put_map(priv, map); return err; } -- GitLab From 9d157c89c5569f0ef560b7a5b2d7bf59ae98499c Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn <lukas.bulwahn@gmail.com> Date: Thu, 6 Oct 2022 10:01:54 +0200 Subject: [PATCH 1400/2223] MAINTAINERS: adjust STARFIVE JH7100 PINCTRL DRIVER after file movement Commit ba7fdf88e98a ("pinctrl: Create subdirectory for StarFive drivers") moves pinctrl-starfive.c into its own subdirectory starfive; further, commit ba99b756da17 ("pinctrl: starfive: Rename "pinctrl-starfive" to "pinctrl-starfive-jh7100"") adds the suffix jh7100 to the driver and dt-bindings header file name. These commits however do not adjust the entry in MAINTAINERS. Hence, ./scripts/get_maintainer.pl --self-test=patterns complains about a broken reference. Adjust the entries for STARFIVE JH7100 PINCTRL DRIVER after file movement. 
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com> Reviewed-by: Emil Renner Berthing <kernel@esmil.dk> Link: https://lore.kernel.org/r/20221006080154.5396-1-lukas.bulwahn@gmail.com Signed-off-by: Linus Walleij <linus.walleij@linaro.org> --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 131299c18f029..0a5f3d67e3761 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19386,8 +19386,8 @@ M: Emil Renner Berthing <kernel@esmil.dk> L: linux-gpio@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/pinctrl/starfive,jh7100-pinctrl.yaml -F: drivers/pinctrl/pinctrl-starfive.c -F: include/dt-bindings/pinctrl/pinctrl-starfive.h +F: drivers/pinctrl/starfive/ +F: include/dt-bindings/pinctrl/pinctrl-starfive-jh7100.h STARFIVE JH7100 RESET CONTROLLER DRIVER M: Emil Renner Berthing <kernel@esmil.dk> -- GitLab From c7c43e38b236eb80ae6ee60d3dffd8f894cd751c Mon Sep 17 00:00:00 2001 From: Shang XiaoJing <shangxiaojing@huawei.com> Date: Thu, 22 Sep 2022 22:14:38 +0800 Subject: [PATCH 1401/2223] perf stat: Clean redundant if in process_evlist Since the first if statement is covered by the following one, clean up the first if statement.
Signed-off-by: Shang XiaoJing <shangxiaojing@huawei.com> Acked-by: Ian Rogers <irogers@google.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220922141438.22487-5-shangxiaojing@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-stat.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 7b8e901bce101..1677546b2ea2d 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -900,8 +900,6 @@ try_again: evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) { counter = evlist_cpu_itr.evsel; - if (!counter->reset_group && !counter->errored) - continue; if (!counter->reset_group) continue; try_again_reset: -- GitLab From 433b31fa00797a2a6205a023e9345f2c5e7896b6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Mon, 26 Sep 2022 14:56:38 -0700 Subject: [PATCH 1402/2223] perf lock contention: Fix a build error on 32-bit It was reported that it failed to build the BPF lock contention skeleton on 32 bit arch due to the size of long. The lost count is used only for reporting errors due to lack of stackmap space through bad_hist which type is 'int'. Let's use int type then. 
Fixes: 6d499a6b3d90277d ("perf lock: Print the number of lost entries for BPF") Reported-by: Jiri Slaby <jirislaby@kernel.org> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Song Liu <songliubraving@fb.com> Link: http://lore.kernel.org/lkml/20220926215638.3931222-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/bpf_skel/lock_contention.bpf.c | 2 +- tools/perf/util/lock-contention.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c index e107d71f0f1ac..1bb8628e7c9f0 100644 --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -75,7 +75,7 @@ int has_task; int stack_skip; /* error stat */ -unsigned long lost; +int lost; static inline int can_record(void) { diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h index 67db311fc9dfc..b8cb8830b7bc5 100644 --- a/tools/perf/util/lock-contention.h +++ b/tools/perf/util/lock-contention.h @@ -114,7 +114,7 @@ struct lock_contention { struct machine *machine; struct hlist_head *result; unsigned long map_nr_entries; - unsigned long lost; + int lost; int max_stack; int stack_skip; }; -- GitLab From dae09ffca00df015db96ffe3819777525cd26170 Mon Sep 17 00:00:00 2001 From: Yuan Can <yuancan@huawei.com> Date: Tue, 27 Sep 2022 01:39:27 +0000 Subject: [PATCH 1403/2223] perf machine: Remove unused struct process_args After commit a93f0e551af9 ("perf symbols: Get kernel start address by symbol name"), no one uses struct process_args any more, so remove it. 
Signed-off-by: Yuan Can <yuancan@huawei.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/linux-perf-users/20220927013931.110475-2-yuancan@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/machine.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 2a16cae284074..76316e459c3de 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1128,10 +1128,6 @@ static struct dso *machine__get_kernel(struct machine *machine) return kernel; } -struct process_args { - u64 start; -}; - void machine__get_kallsyms_filename(struct machine *machine, char *buf, size_t bufsz) { -- GitLab From 8d9b1734c7372390428346860f47b11652639fb2 Mon Sep 17 00:00:00 2001 From: Yuan Can <yuancan@huawei.com> Date: Tue, 27 Sep 2022 01:39:28 +0000 Subject: [PATCH 1404/2223] perf annotate: Remove unused struct disasm_line_samples After commit 3ab6db8d0f3b ("perf annotate browser: Use samples data from struct annotation_line"), no one uses struct disasm_line_samples, so remove it.
Signed-off-by: Yuan Can <yuancan@huawei.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/linux-perf-users/20220927013931.110475-3-yuancan@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/ui/browsers/annotate.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 725662e21b23e..c03fa76c02ffe 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -19,11 +19,6 @@ #include <sys/ttydefaults.h> #include <asm/bug.h> -struct disasm_line_samples { - double percent; - struct sym_hist_entry he; -}; - struct arch; struct annotate_browser { -- GitLab From 18f224ee8170137b80bb99c4bb36a7817a9433e3 Mon Sep 17 00:00:00 2001 From: Yuan Can <yuancan@huawei.com> Date: Tue, 27 Sep 2022 01:39:29 +0000 Subject: [PATCH 1405/2223] perf metric: Remove unused struct metric_ref_node After commit 46bdc0bf8d21 ("perf metric: Simplify metric_refs calculation"), no one uses struct metric_ref_node, so remove it.
Signed-off-by: Yuan Can <yuancan@huawei.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/linux-perf-users/20220927013931.110475-4-yuancan@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/metricgroup.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index b18da1a62a555..4c98ac29ee13f 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -109,17 +109,6 @@ void metricgroup__rblist_exit(struct rblist *metric_events) rblist__exit(metric_events); } -/* - * A node in the list of referenced metrics. metric_expr - * is held as a convenience to avoid a search through the - * metric list. - */ -struct metric_ref_node { - const char *metric_name; - const char *metric_expr; - struct list_head list; -}; - /** * The metric under construction. The data held here will be placed in a * metric_expr. -- GitLab From d28a8fd3c0f82c29dec7225a2e33f3801d9ec026 Mon Sep 17 00:00:00 2001 From: Yuan Can <yuancan@huawei.com> Date: Tue, 27 Sep 2022 01:39:30 +0000 Subject: [PATCH 1406/2223] perf jit: Remove unused struct debug_line_info The struct debug_line_info is never used, remove it. 
Signed-off-by: Yuan Can <yuancan@huawei.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/linux-perf-users/20220927013931.110475-5-yuancan@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/jitdump.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index 4e66322037044..0e033278fa127 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -56,13 +56,6 @@ struct jit_buf_desc { char dir[PATH_MAX]; }; -struct debug_line_info { - unsigned long vma; - unsigned int lineno; - /* The filename format is unspecified, absolute path, relative etc. */ - char const filename[]; -}; - struct jit_tool { struct perf_tool tool; struct perf_data output; -- GitLab From 20b2194eeee3ff8df8f2bbf7631e7278fced404a Mon Sep 17 00:00:00 2001 From: Yuan Can <yuancan@huawei.com> Date: Tue, 27 Sep 2022 01:39:31 +0000 Subject: [PATCH 1407/2223] perf lock: Remove unused struct lock_contention_key The struct lock_contention_key is never used, remove it. 
Signed-off-by: Yuan Can <yuancan@huawei.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/linux-perf-users/20220927013931.110475-6-yuancan@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/bpf_lock_contention.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c index efe5b9968e774..fc4d613cb979a 100644 --- a/tools/perf/util/bpf_lock_contention.c +++ b/tools/perf/util/bpf_lock_contention.c @@ -15,11 +15,6 @@ static struct lock_contention_bpf *skel; -/* should be same as bpf_skel/lock_contention.bpf.c */ -struct lock_contention_key { - s32 stack_id; -}; - struct lock_contention_data { u64 total_time; u64 min_time; -- GitLab From 81935f10e694e390c7d23055952ebe0ac2173d1d Mon Sep 17 00:00:00 2001 From: Will Chandler <wfc@wfchandler.org> Date: Fri, 30 Sep 2022 11:11:57 -0400 Subject: [PATCH 1408/2223] perf tools: Fix empty version number when building outside of a git repo When perf is built in a full source tree that is not a git repository, e.g. from a kernel source tarball, `perf version` will print empty tag and commit strings: $ perf version perf version Currently the tag version is only generated from the root Makefile when building in a git repository. If PERF-VERSION-FILE has not been generated and the source tree is not in a git repository, then PERF-VERSION-GEN will return an empty version. The problem can be reproduced with the following steps: $ wget https://git.kernel.org/torvalds/t/linux-6.0-rc7.tar.gz $ tar -xf linux-6.0-rc7.tar.gz && cd linux-6.0-rc7 $ make -C tools/perf $ tools/perf/perf -v perf version Builds from tarballs generated with `make perf-tar-src-pkg` are not impacted by this issue as PERF-VERSION-FILE is included in the archive. 
The perf RPM provided by Fedora for 5.18+ is experiencing this problem. Package build logs[0] show that the build is attempting to fall back on PERF-VERSION-FILE, but it is not present. To resolve this, revert back to the previous logic of using the kernel Makefile version if not in a git repository and PERF-VERSION-FILE does not exist. [0] https://kojipkgs.fedoraproject.org/packages/kernel-tools/5.19.4/200.fc36/data/logs/x86_64/build.log Fixes: 7572733b84997d23 ("perf tools: Fix version kernel tag") Reviewed-by: John Garry <john.garry@huawei.com> Signed-off-by: Will Chandler <wfc@wfchandler.org> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220930151157.529674-1-wfc@wfchandler.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/PERF-VERSION-GEN | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/PERF-VERSION-GEN b/tools/perf/util/PERF-VERSION-GEN index 0ee5af529238a..3cc42821d9b39 100755 --- a/tools/perf/util/PERF-VERSION-GEN +++ b/tools/perf/util/PERF-VERSION-GEN @@ -11,7 +11,8 @@ LF=' ' # -# Always try first to get the version from the kernel Makefile +# Use version from kernel Makefile unless not in a git repository and +# PERF-VERSION-FILE exists # CID= TAG= @@ -19,9 +20,14 @@ if test -d ../../.git -o -f ../../.git then TAG=$(MAKEFLAGS= make -sC ../.. kernelversion) CID=$(git log -1 --abbrev=12 --pretty=format:"%h" 2>/dev/null) && CID="-g$CID" -else +elif test -f ../../PERF-VERSION-FILE +then TAG=$(cut -d' ' -f3 ../../PERF-VERSION-FILE | sed -e 's/\"//g') fi +if test -z "$TAG" +then + TAG=$(MAKEFLAGS= make -sC ../.. 
kernelversion) +fi VN="$TAG$CID" if test -n "$CID" -- GitLab From 30b842d27dfa90046c46bbfa884113885e742279 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin <chenzhongjin@huawei.com> Date: Tue, 4 Oct 2022 08:59:25 -0300 Subject: [PATCH 1409/2223] perf parse-events: Remove unused macros __PERF_EVENT_FIELD() Unused macros reported by [-Wunused-macros]. This macros were introduced as __PERF_COUNTER_FIELD and used for reading the bit in config. cdd6c482c9ff9c55 ("perf: Do the big rename: Performance Counters -> Performance Events") Changes it to __PERF_EVENT_FIELD but at this commit there is already nowhere else using these macros, also no macros called PERF_EVENT_##name##_MASK/SHIFT. Now we are not reading type or id from config. These macros are useless and incomplete. So removing them for code cleaning. Signed-off-by: Chen Zhongjin <chenzhongjin@huawei.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lore.kernel.org/lkml/20220926031440.28275-5-chenzhongjin@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/parse-events.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index f3b2c2a87456b..437389dacf483 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -150,14 +150,6 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = { }, }; -#define __PERF_EVENT_FIELD(config, name) \ - ((config & PERF_EVENT_##name##_MASK) >> PERF_EVENT_##name##_SHIFT) - -#define PERF_EVENT_RAW(config) __PERF_EVENT_FIELD(config, RAW) -#define PERF_EVENT_CONFIG(config) 
__PERF_EVENT_FIELD(config, CONFIG) -#define PERF_EVENT_TYPE(config) __PERF_EVENT_FIELD(config, TYPE) -#define PERF_EVENT_ID(config) __PERF_EVENT_FIELD(config, EVENT) - bool is_event_supported(u8 type, u64 config) { bool ret = true; -- GitLab From 4b65fc7bca1299de12ceeed1de31f252a185ed47 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:15:50 -0700 Subject: [PATCH 1410/2223] perf expr: Allow a double if expression Some TMA metrics have double if expressions like: ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) if #core_wide < 1 else ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD This currently fails to parse as the left hand side if expression needs to be in parentheses. By allowing the if expression to have a right hand side that is an if expression we can parse the expression above, with left to right evaluation order that matches languages like Python. Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-2-irogers@google.com 
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/expr.c | 4 ++++ tools/perf/util/expr.y | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c index 8bd7197668140..6512f5e22045a 100644 --- a/tools/perf/tests/expr.c +++ b/tools/perf/tests/expr.c @@ -95,6 +95,10 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u ret |= test(ctx, "min(1,2) + 1", 2); ret |= test(ctx, "max(1,2) + 1", 3); ret |= test(ctx, "1+1 if 3*4 else 0", 2); + ret |= test(ctx, "100 if 1 else 200 if 1 else 300", 100); + ret |= test(ctx, "100 if 0 else 200 if 1 else 300", 200); + ret |= test(ctx, "100 if 1 else 200 if 0 else 300", 100); + ret |= test(ctx, "100 if 0 else 200 if 0 else 300", 300); ret |= test(ctx, "1.1 + 2.1", 3.2); ret |= test(ctx, ".1 + 2.", 2.1); ret |= test(ctx, "d_ratio(1, 2)", 0.5); diff --git a/tools/perf/util/expr.y b/tools/perf/util/expr.y index a30b825adb7ba..635e562350c5c 100644 --- a/tools/perf/util/expr.y +++ b/tools/perf/util/expr.y @@ -156,7 +156,7 @@ start: if_expr } ; -if_expr: expr IF expr ELSE expr +if_expr: expr IF expr ELSE if_expr { if (fpclassify($3.val) == FP_ZERO) { /* -- GitLab From 0e4079154ea2ed4434c960df74551cb14de32324 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:15:51 -0700 Subject: [PATCH 1411/2223] perf test: Adjust case of test metrics Icelake and later architectures have slots events and SLOTS metrics meaning case sensitivity is important. Make the test metrics case agree with the name of the metrics. 
Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/pmu-events/arch/test/test_soc/cpu/metrics.json | 6 +++--- tools/perf/pmu-events/empty-pmu-events.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/pmu-events/arch/test/test_soc/cpu/metrics.json b/tools/perf/pmu-events/arch/test/test_soc/cpu/metrics.json index 42d9b5242fd7d..70ec8caaaf6f0 100644 --- a/tools/perf/pmu-events/arch/test/test_soc/cpu/metrics.json +++ b/tools/perf/pmu-events/arch/test/test_soc/cpu/metrics.json @@ -34,15 +34,15 @@ "MetricName": "DCache_L2_All_Miss" }, { - "MetricExpr": "dcache_l2_all_hits + dcache_l2_all_miss", + "MetricExpr": "DCache_L2_All_Hits + DCache_L2_All_Miss", "MetricName": "DCache_L2_All" }, { - "MetricExpr": "d_ratio(dcache_l2_all_hits, dcache_l2_all)", + "MetricExpr": "d_ratio(DCache_L2_All_Hits, DCache_L2_All)", "MetricName": "DCache_L2_Hits" }, { - "MetricExpr": "d_ratio(dcache_l2_all_miss, dcache_l2_all)", + "MetricExpr": 
"d_ratio(DCache_L2_All_Miss, DCache_L2_All)", "MetricName": "DCache_L2_Misses" }, { diff --git a/tools/perf/pmu-events/empty-pmu-events.c b/tools/perf/pmu-events/empty-pmu-events.c index 5ed8c0aa48175..480e8f0d30c83 100644 --- a/tools/perf/pmu-events/empty-pmu-events.c +++ b/tools/perf/pmu-events/empty-pmu-events.c @@ -142,15 +142,15 @@ static const struct pmu_event pme_test_soc_cpu[] = { .metric_name = "DCache_L2_All_Miss", }, { - .metric_expr = "dcache_l2_all_hits + dcache_l2_all_miss", + .metric_expr = "DCache_L2_All_Hits + DCache_L2_All_Miss", .metric_name = "DCache_L2_All", }, { - .metric_expr = "d_ratio(dcache_l2_all_hits, dcache_l2_all)", + .metric_expr = "d_ratio(DCache_L2_All_Hits, DCache_L2_All)", .metric_name = "DCache_L2_Hits", }, { - .metric_expr = "d_ratio(dcache_l2_all_miss, dcache_l2_all)", + .metric_expr = "d_ratio(DCache_L2_All_Miss, DCache_L2_All)", .metric_name = "DCache_L2_Misses", }, { -- GitLab From 715b824f4a1f21e3eeb78076efa6215421bb8f98 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:15:52 -0700 Subject: [PATCH 1412/2223] perf expr: Remove jevents case workaround jevents.py no longer lowercases metrics and altering the case can cause hashmap lookups to fail, so remove. 
Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/expr.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c index c6827900f8d31..aaacf514dc09c 100644 --- a/tools/perf/util/expr.c +++ b/tools/perf/util/expr.c @@ -182,7 +182,7 @@ int expr__add_ref(struct expr_parse_ctx *ctx, struct metric_ref *ref) { struct expr_id_data *data_ptr = NULL, *old_data = NULL; char *old_key = NULL; - char *name, *p; + char *name; int ret; data_ptr = zalloc(sizeof(*data_ptr)); @@ -195,15 +195,6 @@ int expr__add_ref(struct expr_parse_ctx *ctx, struct metric_ref *ref) return -ENOMEM; } - /* - * The jevents tool converts all metric expressions - * to lowercase, including metric references, hence - * we need to add lowercase name for metric, so it's - * properly found. 
- */ - for (p = name; *p; p++) - *p = tolower(*p); - /* * Intentionally passing just const char pointers, * originally from 'struct pmu_event' object. -- GitLab From 8cff7490fc05333f163c0130ec6c64e7a433a4a0 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:15:53 -0700 Subject: [PATCH 1413/2223] perf metrics: Don't scale counts going into metrics Counts are scaled prior to going into saved_value; reverse the scaling so that metrics don't double-scale values. Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/stat-shadow.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 9e1eddeff21bd..b5cedd37588fb 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -865,11 +865,16 @@ static int prepare_metric(struct evsel **metric_events, if (!v) break; stats = &v->stats; - scale = 1.0; + /* + * If an event
was scaled during stat gathering, reverse + * the scale before computing the metric. + */ + scale = 1.0 / metric_events[i]->scale; + source_count = evsel__source_count(metric_events[i]); if (v->metric_other) - metric_total = v->metric_total; + metric_total = v->metric_total * scale; } n = strdup(evsel__metric_id(metric_events[i])); if (!n) -- GitLab From 313b2f384be160b83a72328cbeab8a53902aaef4 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:15:54 -0700 Subject: [PATCH 1414/2223] perf vendor events: Update Intel skylakex Events remain at v1.28, and the metrics are based on TMA 4.4 full. Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Removal of ScaleUnit from uncore events by Zhengjun Xing <zhengjun.xing@linux.intel.com>. - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. - _SMT suffix metrics are dropped as the #SMT_On and #EBS_Mode are correctly expanded in the single main metric. - Addition of all 6 levels of TMA metrics. Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. 
- Latest metrics from: https://github.com/intel/perfmon-metrics Tested on a skylakex manually and with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok 93: perf all metricgroups test : Ok 94: perf all metrics test : Skip 95: perf all PMU test : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../arch/x86/skylakex/skx-metrics.json | 1262 ++++++++++------- .../arch/x86/skylakex/uncore-memory.json | 18 +- .../arch/x86/skylakex/uncore-other.json | 19 +- 3 files changed, 782 insertions(+), 517 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json index 6a6764e1504b2..bc8e42554096c 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json @@ -1,148 +1,726 
@@ [ { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Frontend_Bound", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound." + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. 
For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Frontend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / SLOTS", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. 
For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", + "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / CLKS", + "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "ICACHE_64B.IFTAG_STALL / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / CLKS + tma_unknown_branches", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. 
For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", + "MetricExpr": "(1 - (BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT))) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS", + "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_clears_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", + "MetricExpr": "9 * BACLEARS.ANY / CLKS", + "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_unknown_branches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. 
These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). Sample with: BACLEARS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "ILD_STALL.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. 
#Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "2 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "tma_frontend_bound - tma_fetch_latency", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. 
Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / CORE_CLKS", + "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_mite_group", + "MetricName": "tma_decoder0_alone", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_dsb", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. 
For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Bad_Speculation", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example." + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. 
Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Bad_Speculation_SMT", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Backend_Bound", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound." 
+ "MetricExpr": "1 - tma_frontend_bound - (UOPS_ISSUED.ANY + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "((CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * tma_backend_bound", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. 
This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "min(9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. 
TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the (first level) DTLB was missed by load accesses, that later on hit in second-level TLB (STLB)", + "MetricExpr": "tma_dtlb_load - tma_load_stlb_miss", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_hit", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / CLKS", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_miss", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). 
However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * (11 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / CLKS", + "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. 
Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_4k_aliasing", + "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "Load_Miss_Real_Latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CLKS", + "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). 
Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "((MEM_LOAD_RETIRED.L2_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / ((MEM_LOAD_RETIRED.L2_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@)) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. 
Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", + "MetricExpr": "((44 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + (44 * Average_Frequency) * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_contested_accesses", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "(44 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD)))) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "(17 * Average_Frequency) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. 
Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "((OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2) if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / CORE_CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L3_MISS / CLKS + ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS) - tma_l2_bound)", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). 
The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", + "MetricExpr": "(59.5 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Server;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_local_dram", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. 
Sample with: MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", + "MetricExpr": "(127 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Server;Snoop;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_remote_dram", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", + "MetricExpr": "((89.5 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + (89.5 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_remote_cache", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues. This is caused often due to non-optimal NUMA allocations. 
#link to NUMA article Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM_PS;MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_INST_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "((L2_RQSTS.RFO_HIT * 11 * (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES))) + (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_store_latency", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. 
contention on L1D fill-buffer entries - see FB_Full)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "((110 * Average_Frequency) * (OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS.REMOTE_HITM + OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS.REMOTE_HITM) + (47.5 * Average_Frequency) * (OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / CORE_CLKS", + "MetricGroup": "TopdownL4;tma_store_bound_group", + "MetricName": "tma_split_stores", + "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "(9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / CORE_CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. 
As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the TLB was missed by store accesses, hitting in the second-level TLB (STLB)", + "MetricExpr": "tma_dtlb_store - tma_store_stlb_miss", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_hit", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / CORE_CLKS", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_miss", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck", + "MetricExpr": "tma_backend_bound - tma_memory_bound", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. 
FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "ARITH.DIVIDER_ACTIVE / CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "(EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CLKS if (ARITH.DIVIDER_ACTIVE < (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY)) else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. 
For example; when there are too many multiply operations.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", + "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / CLKS", + "MetricGroup": "TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_serializing_operation", + "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: PARTIAL_RAT_STALLS.SCOREBOARD", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Backend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "MetricExpr": "CLKS * UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY", + "MetricGroup": "TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_mixing_vectors", + "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. 
Read more in Appendix B1 of the Optimizations Guide for this topic.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_1 - UOPS_EXECUTED.CORE_CYCLES_GE_2) / 2 if #SMT_on else EXE_ACTIVITY.1_PORTS_UTIL) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or over oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_2 - UOPS_EXECUTED.CORE_CYCLES_GE_3) / 2 if #SMT_on else EXE_ACTIVITY.2_PORTS_UTIL) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). 
Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", + "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / (4 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED_PORT.PORT_0", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / CORE_CLKS", + "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_0", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_1", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_1", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU) Sample with: UOPS_DISPATCHED.PORT_5", + 
"MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_5", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_6", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_6", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_2", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_3", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": 
"TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data) Sample with: UOPS_DISPATCHED_PORT.PORT_4", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_4", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address) Sample with: UOPS_DISPATCHED_PORT.PORT_7", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_7", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Retiring", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. " + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. 
Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Retiring_SMT", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "tma_retiring - tma_heavy_operations", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%" }, { - "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", - "MetricExpr": "100 * ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) )", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "Mispredictions" + "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", + "MetricGroup": "HPC;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fp_arith", + 
"PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "tma_retiring * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_scalar", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_vector", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_128b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_256b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_512b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", + "MetricExpr": "tma_light_operations * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_memory_operations", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", + "MetricExpr": "tma_light_operations * UOPS_RETIRED.MACRO_FUSED / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fused_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. 
The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", + "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - UOPS_RETIRED.MACRO_FUSED) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_non_fused_branches", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", + "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_nop_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. 
May undercount due to FMA double counting", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_other_light_ops", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY) / SLOTS", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops", + "MetricExpr": "tma_heavy_operations - tma_microcode_sequencer", + "MetricGroup": "TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_few_uops_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops. 
This highly-correlates with the number of uops in such instructions.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / SLOTS", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. 
Sample with: OTHER_ASSISTS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_cisc", + "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.", + "ScaleUnit": "100%" }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", - "MetricExpr": "100 * ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) )", - "MetricGroup": "Bad;BadSpec;BrMispredicts_SMT", - "MetricName": "Mispredictions_SMT" + "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + 
tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "Mispredictions" }, { "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) * ( ( (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (min( CPU_CLK_UNHALTED.THREAD , cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@ ) / CPU_CLK_UNHALTED.THREAD) / #(CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / 
MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) ) + ( (( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (OFFCORE_REQUESTS_BUFFER.SQ_FULL / CPU_CLK_UNHALTED.THREAD) / #(( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) ) ) + ( (max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( ((L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )) * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CPU_CLK_UNHALTED.THREAD) / #(max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) ) ", + "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + 
tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) ", "MetricGroup": "Mem;MemoryBW;Offcore", "MetricName": "Memory_Bandwidth" }, - { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) * ( ( (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - 
(IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (min( CPU_CLK_UNHALTED.THREAD , cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@ ) / CPU_CLK_UNHALTED.THREAD) / #(CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) ) + ( (( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (( OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / #(( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) ) ) + ( (max( ( 
CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( ((L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )) * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CPU_CLK_UNHALTED.THREAD) / #(max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) ) ", - "MetricGroup": "Mem;MemoryBW;Offcore_SMT", - "MetricName": "Memory_Bandwidth_SMT" - }, { "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) * ( ( (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + 
(MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (min( CPU_CLK_UNHALTED.THREAD , OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD ) / CPU_CLK_UNHALTED.THREAD - (min( CPU_CLK_UNHALTED.THREAD , cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@ ) / CPU_CLK_UNHALTED.THREAD)) / #(CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) ) + ( (( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (( (20.5 * ((CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time)) - (3.5 * ((CPU_CLK_UNHALTED.THREAD / 
CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time)) ) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CPU_CLK_UNHALTED.THREAD) / #(( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) ) + ( (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD)) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) )", + "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + (tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)))", "MetricGroup": "Mem;MemoryLat;Offcore", "MetricName": "Memory_Latency" }, - { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + 
CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) * ( ( (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (min( CPU_CLK_UNHALTED.THREAD , OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD ) / CPU_CLK_UNHALTED.THREAD - (min( CPU_CLK_UNHALTED.THREAD , cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@ ) / CPU_CLK_UNHALTED.THREAD)) / #(CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - 
CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) ) + ( (( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (( (20.5 * ((CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time)) - (3.5 * ((CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time)) ) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CPU_CLK_UNHALTED.THREAD) / #(( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) ) + ( (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD)) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + 
EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) )", - "MetricGroup": "Mem;MemoryLat;Offcore_SMT", - "MetricName": "Memory_Latency_SMT" - }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) * ( ( (max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) / ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (min( 9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE , max( CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS , 0 ) ) / CPU_CLK_UNHALTED.THREAD) / (max( ( 
CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) ) + ( (EXE_ACTIVITY.BOUND_ON_STORES / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (( 9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE ) / CPU_CLK_UNHALTED.THREAD) / #(EXE_ACTIVITY.BOUND_ON_STORES / CPU_CLK_UNHALTED.THREAD) ) ) ", + "MetricExpr": "100 * tma_memory_bound * ((tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency))) ", "MetricGroup": "Mem;MemoryTLB;Offcore", "MetricName": "Memory_Data_TLBs" }, - { - "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( 
INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) * ( ( (max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) / ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (min( 9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE , max( CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS , 0 ) ) / CPU_CLK_UNHALTED.THREAD) / (max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) ) + ( (EXE_ACTIVITY.BOUND_ON_STORES / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (( 9 * 
cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / #(EXE_ACTIVITY.BOUND_ON_STORES / CPU_CLK_UNHALTED.THREAD) ) ) ", - "MetricGroup": "Mem;MemoryTLB;Offcore_SMT", - "MetricName": "Memory_Data_TLBs_SMT" - }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * (( BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - ( BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN ) - 2 * BR_INST_RETIRED.NEAR_CALL) ) / (4 * CPU_CLK_UNHALTED.THREAD))", + "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / SLOTS)", "MetricGroup": "Ret", "MetricName": "Branching_Overhead" }, - { - "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * (( BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - ( BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN ) - 2 * BR_INST_RETIRED.NEAR_CALL) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))", - "MetricGroup": "Ret_SMT", - "MetricName": "Branching_Overhead_SMT" - }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", - "MetricExpr": "100 * (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * ( (ICACHE_64B.IFTAG_STALL / CPU_CLK_UNHALTED.THREAD) + (( ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ ) / CPU_CLK_UNHALTED.THREAD) + (9 * 
BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) ) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD))", + "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "Big_Code" }, - { - "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", - "MetricExpr": "100 * (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * ( (ICACHE_64B.IFTAG_STALL / CPU_CLK_UNHALTED.THREAD) + (( ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ ) / CPU_CLK_UNHALTED.THREAD) + (9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) ) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))", - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB_SMT", - "MetricName": "Big_Code_SMT" - }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", - "MetricExpr": "100 * ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) ) - (100 * (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * ( (ICACHE_64B.IFTAG_STALL / CPU_CLK_UNHALTED.THREAD) + (( ICACHE_16B.IFDATA_STALL + 2 * 
cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ ) / CPU_CLK_UNHALTED.THREAD) + (9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) ) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)))", + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - Big_Code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "Instruction_Fetch_BW" }, - { - "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", - "MetricExpr": "100 * ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) ) - (100 * (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * ( (ICACHE_64B.IFTAG_STALL / CPU_CLK_UNHALTED.THREAD) + (( ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ ) / CPU_CLK_UNHALTED.THREAD) + (9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) ) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))))", - "MetricGroup": "Fed;FetchBW;Frontend_SMT", - "MetricName": "Instruction_Fetch_BW_SMT" - }, { "BriefDescription": "Instructions Per Cycle (per 
Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC" }, @@ -158,6 +736,12 @@ "MetricGroup": "Branches;Fed;FetchBW", "MetricName": "UpTB" }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", + "MetricName": "CPI" + }, { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "CPU_CLK_UNHALTED.THREAD", @@ -166,16 +750,10 @@ }, { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "TmaL1", + "MetricExpr": "4 * CORE_CLKS", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS" }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "TmaL1_SMT", - "MetricName": "SLOTS_SMT" - }, { "BriefDescription": "The ratio of Executed- by Issued-Uops", "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", @@ -185,63 +763,38 @@ }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC" }, - { - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;SMT;TmaL1_SMT", - "MetricName": "CoreIPC_SMT" - }, { "BriefDescription": "Floating Point Operations Per 
Cycle", - "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;Flops", + "MetricExpr": "(1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / CORE_CLKS", + "MetricGroup": "Flops;Ret", "MetricName": "FLOPc" }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;Flops_SMT", - "MetricName": "FLOPc_SMT" - }, { "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + 
FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) ) / ( 2 * CPU_CLK_UNHALTED.THREAD )", + "MetricExpr": "((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)) / (2 * CORE_CLKS)", "MetricGroup": "Cor;Flops;HPC", "MetricName": "FP_Arith_Utilization", "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, - { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) ) / ( 2 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) )", - "MetricGroup": "Cor;Flops;HPC_SMT", - "MetricName": "FP_Arith_Utilization_SMT", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common). SMT version; use when SMT is enabled and measuring per logical CPU." 
- }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 ) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ((UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", - "MetricExpr": "( 1 - ((1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) - ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)))) / ((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ( ARITH.DIVIDER_ACTIVE < ( CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY ) ) else (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if ((1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) - ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - 
(IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)))) < ((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ( ARITH.DIVIDER_ACTIVE < ( CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY ) ) else (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1 ) if 0 > 0.5 else 0", + "MetricExpr": "(1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if SMT_2T_Utilization > 0.5 else 0", "MetricGroup": "Cor;SMT", "MetricName": "Core_Bound_Likely" }, - { - "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", - "MetricExpr": "( 1 - ((1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))))) / ((EXE_ACTIVITY.EXE_BOUND_0_PORTS + 
(EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ( ARITH.DIVIDER_ACTIVE < ( CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY ) ) else (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if ((1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))))) < ((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ( ARITH.DIVIDER_ACTIVE < ( CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY ) ) else (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / 
(4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1 ) if (1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 )) > 0.5 else 0", - "MetricGroup": "Cor;SMT_SMT", - "MetricName": "Core_Bound_Likely_SMT" - }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "((CPU_CLK_UNHALTED.THREAD / 2) * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK)) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2) if #SMT_on else CLKS", "MetricGroup": "SMT", "MetricName": "CORE_CLKS" }, @@ -283,13 +836,13 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "IpFLOP" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + 
FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) )", + "MetricExpr": "INST_RETIRED.ANY / ((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE))", "MetricGroup": "Flops;InsType", "MetricName": "IpArith", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." @@ -310,21 +863,21 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX128", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
}, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX256", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX512", "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
@@ -336,9 +889,9 @@ "MetricName": "IpSWPF" }, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions" }, { @@ -373,16 +926,10 @@ }, { "BriefDescription": "Total penalty related to DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", - "MetricExpr": "100 * ( (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) + ((IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD))) * (( IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS ) / CPU_CLK_UNHALTED.THREAD / 2) / #((IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD))) )", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", "MetricGroup": "DSBmiss;Fed", "MetricName": "DSB_Misses" }, - { - "BriefDescription": "Total penalty related to DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", - "MetricExpr": "100 * ( (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + 
CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + ((IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) * (( IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) / 2) / #((IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) )", - "MetricGroup": "DSBmiss;Fed_SMT", - "MetricName": "DSB_Misses_SMT" - }, { "BriefDescription": "Number of Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", @@ -397,16 +944,10 @@ }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) ) * (4 * CPU_CLK_UNHALTED.THREAD) / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": " (tma_branch_mispredicts + 
tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BrMispredicts", "MetricName": "Branch_Misprediction_Cost" }, - { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) ) * (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts_SMT", - "MetricName": "Branch_Misprediction_Cost_SMT" - }, { "BriefDescription": "Fraction of branches that are non-taken conditionals", "MetricExpr": "BR_INST_RETIRED.NOT_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", @@ -415,101 +956,95 @@ }, { "BriefDescription": "Fraction of branches that are taken conditionals", - "MetricExpr": "( BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN ) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": 
"Bad;Branches;CodeGen;PGO", "MetricName": "Cond_TK" }, { "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "( BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN ) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", "MetricName": "CallRet" }, { "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - ( BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN ) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", "MetricName": "Jump" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", "MetricGroup": "Mem;MemoryBound;MemoryLat", "MetricName": "Load_Miss_Real_Latency" }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. 
Per-Logical Processor)", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBound;MemoryBW", + "MetricGroup": "Mem;MemoryBW;MemoryBound", "MetricName": "MLP" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI_Load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;Backend;CacheMisses", + "MetricGroup": "Backend;CacheMisses;Mem", "MetricName": "L2MPKI" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses;Offcore", + "MetricGroup": "CacheMisses;Mem;Offcore", "MetricName": "L2MPKI_All" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2MPKI_Load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", - "MetricExpr": "1000 * ( L2_RQSTS.REFERENCES - L2_RQSTS.MISS ) / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricExpr": "1000 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_All" }, { 
"BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_Load" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L3MPKI" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1000 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "FB_HPKI" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * CPU_CLK_UNHALTED.THREAD )", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING) / (2 * CORE_CLKS)", "MetricGroup": "Mem;MemoryTLB", "MetricName": "Page_Walks_Utilization" }, - { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) )", - "MetricGroup": "Mem;MemoryTLB_SMT", - "MetricName": "Page_Walks_Utilization_SMT" - }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * 
L1D.REPLACEMENT / 1000000000 / duration_time", @@ -536,37 +1071,37 @@ }, { "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", - "MetricExpr": "1000 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY", + "MetricExpr": "1000 * L2_LINES_OUT.SILENT / Instructions", "MetricGroup": "L2Evicts;Mem;Server", "MetricName": "L2_Evictions_Silent_PKI" }, { "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", - "MetricExpr": "1000 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY", + "MetricExpr": "1000 * L2_LINES_OUT.NON_SILENT / Instructions", "MetricGroup": "L2Evicts;Mem;Server", "MetricName": "L2_Evictions_NonSilent_PKI" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricExpr": "L1D_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L1D_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricExpr": "L2_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L2_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L3_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Access_BW", "MetricGroup": "Mem;MemoryBW;Offcore", "MetricName": "L3_Cache_Access_BW_1T" }, @@ -578,68 +1113,47 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": 
"(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / 1000000000 ) / duration_time", + "MetricExpr": "((1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1000000000) / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "GFLOPs", "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." 
}, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization" }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0", - "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / 2 / CORE_CLKS if #SMT_on else CORE_POWER.LVL0_TURBO_LICENSE / CORE_CLKS", "MetricGroup": "Power", "MetricName": "Power_License0_Utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes." }, - { - "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / 2 / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Power_SMT", - "MetricName": "Power_License0_Utilization_SMT", - "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes. SMT version; use when SMT is enabled and measuring per logical CPU." 
- }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1", - "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / 2 / CORE_CLKS if #SMT_on else CORE_POWER.LVL1_TURBO_LICENSE / CORE_CLKS", "MetricGroup": "Power", "MetricName": "Power_License1_Utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions." }, - { - "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / 2 / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Power_SMT", - "MetricName": "Power_License1_Utilization_SMT", - "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions. SMT version; use when SMT is enabled and measuring per logical CPU." - }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)", - "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / 2 / CORE_CLKS if #SMT_on else CORE_POWER.LVL2_TURBO_LICENSE / CORE_CLKS", "MetricGroup": "Power", "MetricName": "Power_License2_Utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). This includes high current AVX 512-bit instructions." 
}, - { - "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / 2 / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Power_SMT", - "MetricName": "Power_License2_Utilization_SMT", - "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). This includes high current AVX 512-bit instructions. SMT version; use when SMT is enabled and measuring per logical CPU." - }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", - "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0", "MetricGroup": "SMT", "MetricName": "SMT_2T_Utilization" }, @@ -657,13 +1171,13 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "( 64 * ( uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@ ) / 1000000000 ) / duration_time", + "MetricExpr": "(64 * (uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@) / 1000000000) / duration_time", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use" }, { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds). 
Accounts for demand loads and L1/L2 prefetches", - "MetricExpr": "1000000000 * ( cha@event\\=0x36\\,umask\\=0x21\\,config\\=0x40433@ / cha@event\\=0x35\\,umask\\=0x21\\,config\\=0x40433@ ) / ( cha_0@event\\=0x0@ / duration_time )", + "MetricExpr": "1000000000 * (cha@event\\=0x36\\,umask\\=0x21\\,config\\=0x40433@ / cha@event\\=0x35\\,umask\\=0x21\\,config\\=0x40433@) / (Socket_CLKS / duration_time)", "MetricGroup": "Mem;MemoryLat;SoC", "MetricName": "MEM_Read_Latency" }, @@ -675,20 +1189,20 @@ }, { "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches", - "MetricExpr": "1000000000 * ( UNC_M_RPQ_OCCUPANCY / UNC_M_RPQ_INSERTS ) / imc_0@event\\=0x0@", - "MetricGroup": "Mem;MemoryLat;SoC;Server", + "MetricExpr": "1000000000 * (UNC_M_RPQ_OCCUPANCY / UNC_M_RPQ_INSERTS) / imc_0@event\\=0x0@", + "MetricGroup": "Mem;MemoryLat;Server;SoC", "MetricName": "MEM_DRAM_Read_Latency" }, { "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", - "MetricExpr": "( UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3 ) * 4 / 1000000000 / duration_time", - "MetricGroup": "IoBW;Mem;SoC;Server", + "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1000000000 / duration_time", + "MetricGroup": "IoBW;Mem;Server;SoC", "MetricName": "IO_Write_BW" }, { "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]", - "MetricExpr": "( UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3 ) * 4 / 1000000000 / duration_time", - "MetricGroup": "IoBW;Mem;SoC;Server", + "MetricExpr": 
"(UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3) * 4 / 1000000000 / duration_time", + "MetricGroup": "IoBW;Mem;Server;SoC", "MetricName": "IO_Read_BW" }, { @@ -697,12 +1211,6 @@ "MetricGroup": "SoC", "MetricName": "Socket_CLKS" }, - { - "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "cha_0@event\\=0x0@ / #num_dies / duration_time / 1000000000", - "MetricGroup": "SoC", - "MetricName": "UNCORE_FREQ" - }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", @@ -752,11 +1260,10 @@ "MetricName": "C7_Pkg_Residency" }, { - "BriefDescription": "Percentage of time spent in the active CPU power state C0", - "MetricExpr": "100 * CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "", - "MetricName": "cpu_utilization_percent", - "ScaleUnit": "1%" + "BriefDescription": "Uncore frequency per die [GHZ]", + "MetricExpr": "Socket_CLKS / #num_dies / duration_time / 1000000000", + "MetricGroup": "SoC", + "MetricName": "UNCORE_FREQ" }, { "BriefDescription": "CPU operating frequency (in GHz)", @@ -765,13 +1272,6 @@ "MetricName": "cpu_operating_frequency", "ScaleUnit": "1GHz" }, - { - "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY", - "MetricGroup": "", - "MetricName": "cpi", - "ScaleUnit": "1per_instr" - }, { "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions", "MetricExpr": "MEM_INST_RETIRED.ALL_LOADS / INST_RETIRED.ANY", @@ -790,7 +1290,7 @@ "BriefDescription": "Ratio of number of requests missing L1 data cache (includes 
data+rfo w/ prefetches) to the total number of completed instructions", "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "l1d_mpi_includes_data_plus_rfo_with_prefetches", + "MetricName": "l1d_mpi", "ScaleUnit": "1per_instr" }, { @@ -818,7 +1318,7 @@ "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions", "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "l2_mpi_includes_code_plus_data_plus_rfo_with_prefetches", + "MetricName": "l2_mpi", "ScaleUnit": "1per_instr" }, { @@ -849,58 +1349,79 @@ "MetricName": "llc_code_read_mpi_demand_plus_prefetch", "ScaleUnit": "1per_instr" }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) in nano seconds", + "MetricExpr": "( 1000000000 * ( cha@unc_cha_tor_occupancy.ia_miss\\,config1\\=0x4043300000000@ / cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043300000000@ ) / ( UNC_CHA_CLOCKTICKS / ( #num_cores / #num_packages * #num_packages ) ) ) * duration_time", + "MetricGroup": "", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to local memory in nano seconds", + "MetricExpr": "( 1000000000 * ( cha@unc_cha_tor_occupancy.ia_miss\\,config1\\=0x4043200000000@ / cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043200000000@ ) / ( UNC_CHA_CLOCKTICKS / ( #num_cores / #num_packages * #num_packages ) ) ) * duration_time", + "MetricGroup": "", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_local_requests", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to remote memory in nano seconds", 
+ "MetricExpr": "( 1000000000 * ( cha@unc_cha_tor_occupancy.ia_miss\\,config1\\=0x4043100000000@ / cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043100000000@ ) / ( UNC_CHA_CLOCKTICKS / ( #num_cores / #num_packages * #num_packages ) ) ) * duration_time", + "MetricGroup": "", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_remote_requests", + "ScaleUnit": "1ns" + }, { "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB.", "MetricExpr": "ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "itlb_2nd_level_mpi", + "MetricName": "itlb_mpi", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the Instruction Translation Lookaside Buffer (ITLB) and further levels of TLB.", "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "itlb_2nd_level_large_page_mpi", + "MetricName": "itlb_large_page_mpi", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "dtlb_2nd_level_load_mpi", + "MetricName": "dtlb_load_mpi", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions. 
This implies it missed in the Data Translation Lookaside Buffer (DTLB) and further levels of TLB.", "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "dtlb_2nd_level_2mb_large_page_load_mpi", + "MetricName": "dtlb_2mb_large_page_load_mpi", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", "MetricExpr": "DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "dtlb_2nd_level_store_mpi", + "MetricName": "dtlb_store_mpi", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", "MetricExpr": "100 * cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043200000000@ / ( cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043200000000@ + cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043100000000@ )", "MetricGroup": "", - "MetricName": "numa_percent_reads_addressed_to_local_dram", + "MetricName": "numa_reads_addressed_to_local_dram", "ScaleUnit": "1%" }, { "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", "MetricExpr": "100 * cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043100000000@ / ( cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043200000000@ + cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043100000000@ )", "MetricGroup": "", - "MetricName": "numa_percent_reads_addressed_to_remote_dram", + "MetricName": "numa_reads_addressed_to_remote_dram", "ScaleUnit": "1%" }, { "BriefDescription": "Uncore operating frequency in GHz", - "MetricExpr": "( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_CLOCKTICKS) * #num_packages ) 
/ 1000000000) / duration_time", + "MetricExpr": "( UNC_CHA_CLOCKTICKS / ( #num_cores / #num_packages * #num_packages ) / 1000000000) / duration_time", "MetricGroup": "", "MetricName": "uncore_frequency", "ScaleUnit": "1GHz" @@ -909,7 +1430,7 @@ "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)", "MetricExpr": "( UNC_UPI_TxL_FLITS.ALL_DATA * (64 / 9.0) / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "upi_data_transmit_bw_only_data", + "MetricName": "upi_data_transmit_bw", "ScaleUnit": "1MB/s" }, { @@ -937,35 +1458,35 @@ "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.", "MetricExpr": "(( UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3 ) * 4 / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "io_bandwidth_read", + "MetricName": "io_bandwidth_disk_or_network_writes", "ScaleUnit": "1MB/s" }, { "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.", "MetricExpr": "(( UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART0 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART1 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART2 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART3 ) * 4 / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "io_bandwidth_write", + "MetricName": "io_bandwidth_disk_or_network_reads", "ScaleUnit": "1MB/s" }, { "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue", "MetricExpr": "100 * ( IDQ.DSB_UOPS / UOPS_ISSUED.ANY )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_from_decoded_icache_dsb", + "MetricName": "percent_uops_delivered_from_decoded_icache", "ScaleUnit": "1%" }, { "BriefDescription": "Uops delivered from legacy 
decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue", "MetricExpr": "100 * ( IDQ.MITE_UOPS / UOPS_ISSUED.ANY )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline_mite", + "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline", "ScaleUnit": "1%" }, { "BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue", "MetricExpr": "100 * ( IDQ.MS_UOPS / UOPS_ISSUED.ANY )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_from_microcode_sequencer_ms", + "MetricName": "percent_uops_delivered_from_microcode_sequencer", "ScaleUnit": "1%" }, { @@ -988,250 +1509,5 @@ "MetricGroup": "", "MetricName": "llc_miss_remote_memory_bandwidth_read", "ScaleUnit": "1MB/s" - }, - { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. 
For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", - "MetricExpr": "100 * ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "TmaL1;PGO", - "MetricName": "tma_frontend_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period.", - "MetricExpr": "100 * ( ( 4 ) * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "Frontend;TmaL2;m_tma_frontend_bound_percent", - "MetricName": "tma_fetch_latency_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", - "MetricExpr": "100 * ( ( ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@ ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_icache_misses_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses.", - "MetricExpr": "100 * ( ICACHE_64B.IFTAG_STALL / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_itlb_misses_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. 
Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings.", - "MetricExpr": "100 * ( INT_MISC.CLEAR_RESTEER_CYCLES / ( CPU_CLK_UNHALTED.THREAD ) + ( ( 9 ) * BACLEARS.ANY / ( CPU_CLK_UNHALTED.THREAD ) ) )", - "MetricGroup": "FetchLat;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_branch_resteers_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.", - "MetricExpr": "100 * ( DSB2MITE_SWITCHES.PENALTY_CYCLES / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "DSBmiss;FetchLat;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_dsb_switches_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", - "MetricExpr": "100 * ( ILD_STALL.LCP / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "FetchLat;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_lcp_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). 
Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals.", - "MetricExpr": "100 * ( ( 2 ) * IDQ.MS_SWITCHES / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "FetchLat;MicroSeq;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_ms_switches_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.", - "MetricExpr": "100 * ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( 4 ) * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )", - "MetricGroup": "FetchBW;Frontend;TmaL2;m_tma_frontend_bound_percent", - "MetricName": "tma_fetch_bandwidth_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. 
For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", - "MetricExpr": "100 * ( ( IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS ) / ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) / 2 )", - "MetricGroup": "DSBmiss;FetchBW;TmaL3;m_tma_fetch_bandwidth_percent", - "MetricName": "tma_mite_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", - "MetricExpr": "100 * ( ( IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS ) / ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) / 2 )", - "MetricGroup": "DSB;FetchBW;TmaL3;m_tma_fetch_bandwidth_percent", - "MetricName": "tma_dsb_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", - "MetricExpr": "100 * ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "TmaL1", - "MetricName": "tma_bad_speculation_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. 
These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path.", - "MetricExpr": "100 * ( ( BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT ) ) * ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )", - "MetricGroup": "BadSpec;BrMispredicts;TmaL2;m_tma_bad_speculation_percent", - "MetricName": "tma_branch_mispredicts_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. 
Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.", - "MetricExpr": "100 * ( ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT ) ) * ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) )", - "MetricGroup": "BadSpec;MachineClears;TmaL2;m_tma_bad_speculation_percent", - "MetricName": "tma_machine_clears_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. 
Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", - "MetricExpr": "100 * ( 1 - ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( UOPS_ISSUED.ANY + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "TmaL1", - "MetricName": "tma_backend_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", - "MetricExpr": "100 * ( ( ( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / ( CYCLE_ACTIVITY.STALLS_TOTAL + ( EXE_ACTIVITY.1_PORTS_UTIL + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) * EXE_ACTIVITY.2_PORTS_UTIL ) + EXE_ACTIVITY.BOUND_ON_STORES ) ) * ( 1 - ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( UOPS_ISSUED.ANY + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )", - "MetricGroup": "Backend;TmaL2;m_tma_backend_bound_percent", - "MetricName": "tma_memory_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was 
stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache.", - "MetricExpr": "100 * ( max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) , 0 ) )", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_l1_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance.", - "MetricExpr": "100 * ( ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) / ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=0x1@ ) ) * ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) )", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_l2_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. 
L2 misses/L3 hits) can improve the latency and increase performance.", - "MetricExpr": "100 * ( ( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_l3_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance.", - "MetricExpr": "100 * ( min( ( ( CYCLE_ACTIVITY.STALLS_L3_MISS / ( CPU_CLK_UNHALTED.THREAD ) + ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) - ( ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) / ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=0x1@ ) ) * ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) , ( 1 ) ) )", - "MetricGroup": "MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_dram_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck.", - "MetricExpr": "100 * ( EXE_ACTIVITY.BOUND_ON_STORES / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_store_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. 
Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", - "MetricExpr": "100 * ( ( 1 - ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( UOPS_ISSUED.ANY + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / ( CYCLE_ACTIVITY.STALLS_TOTAL + ( EXE_ACTIVITY.1_PORTS_UTIL + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) * EXE_ACTIVITY.2_PORTS_UTIL ) + EXE_ACTIVITY.BOUND_ON_STORES ) ) * ( 1 - ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( UOPS_ISSUED.ANY + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) )", - "MetricGroup": "Backend;TmaL2;Compute;m_tma_backend_bound_percent", - "MetricName": "tma_core_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active. 
Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication.", - "MetricExpr": "100 * ( ARITH.DIVIDER_ACTIVE / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "TmaL3;m_tma_core_bound_percent", - "MetricName": "tma_divider_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.", - "MetricExpr": "100 * ( ( EXE_ACTIVITY.EXE_BOUND_0_PORTS + ( EXE_ACTIVITY.1_PORTS_UTIL + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) * EXE_ACTIVITY.2_PORTS_UTIL ) ) / ( CPU_CLK_UNHALTED.THREAD ) if ( ARITH.DIVIDER_ACTIVE < ( CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY ) ) else ( EXE_ACTIVITY.1_PORTS_UTIL + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) * EXE_ACTIVITY.2_PORTS_UTIL ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "PortsUtil;TmaL3;m_tma_core_bound_percent", - "MetricName": "tma_ports_utilization_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. 
Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. ", - "MetricExpr": "100 * ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "TmaL1", - "MetricName": "tma_retiring_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved.", - "MetricExpr": "100 * ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )", - "MetricGroup": "Retire;TmaL2;m_tma_retiring_percent", - "MetricName": "tma_light_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). 
Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", - "MetricExpr": "100 * ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD ) + ( ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) / ( UOPS_RETIRED.RETIRE_SLOTS ) ) + ( min( ( ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / ( UOPS_RETIRED.RETIRE_SLOTS ) ) , ( 1 ) ) ) )", - "MetricGroup": "HPC;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_fp_arith_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", - "MetricExpr": "100 * ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_memory_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. 
The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.", - "MetricExpr": "100 * ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) * UOPS_RETIRED.MACRO_FUSED / ( UOPS_RETIRED.RETIRE_SLOTS ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_fused_instructions_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.", - "MetricExpr": "100 * ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) * ( BR_INST_RETIRED.ALL_BRANCHES - UOPS_RETIRED.MACRO_FUSED ) / ( UOPS_RETIRED.RETIRE_SLOTS ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_non_fused_branches_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. 
start address of a function or loop body.", - "MetricExpr": "100 * ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) * INST_RETIRED.NOP / ( UOPS_RETIRED.RETIRE_SLOTS ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_nop_instructions_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. May undercount due to FMA double counting", - "MetricExpr": "100 * ( max( 0 , ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) - ( ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD ) + ( ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) / ( UOPS_RETIRED.RETIRE_SLOTS ) ) + ( min( ( ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / ( UOPS_RETIRED.RETIRE_SLOTS ) ) , ( 1 ) ) ) ) + ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( 
CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY ) + ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) * UOPS_RETIRED.MACRO_FUSED / ( UOPS_RETIRED.RETIRE_SLOTS ) ) + ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) * ( BR_INST_RETIRED.ALL_BRANCHES - UOPS_RETIRED.MACRO_FUSED ) / ( UOPS_RETIRED.RETIRE_SLOTS ) ) + ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) * INST_RETIRED.NOP / ( UOPS_RETIRED.RETIRE_SLOTS ) ) ) ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_other_light_ops_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. 
This highly-correlates with the uop length of these instructions/sequences.", - "MetricExpr": "100 * ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "Retire;TmaL2;m_tma_retiring_percent", - "MetricName": "tma_heavy_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that that are decoder into two or up to ([SNB+] four; [ADL+] five) uops. This highly-correlates with the number of uops in such instructions.", - "MetricExpr": "100 * ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )", - "MetricGroup": "TmaL3;m_tma_heavy_operations_percent", - "MetricName": "tma_few_uops_instructions_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). 
These cases can often be avoided.", - "MetricExpr": "100 * ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "MicroSeq;TmaL3;m_tma_heavy_operations_percent", - "MetricName": "tma_microcode_sequencer_percent", - "ScaleUnit": "1%" } ] diff --git a/tools/perf/pmu-events/arch/x86/skylakex/uncore-memory.json b/tools/perf/pmu-events/arch/x86/skylakex/uncore-memory.json index 0746fcf2ebd97..62941146e3967 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/uncore-memory.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/uncore-memory.json @@ -27,20 +27,19 @@ "Unit": "iMC" }, { - "BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd", + "BriefDescription": "All DRAM Read CAS Commands issued (including underfills)", "Counter": "0,1,2,3", "EventCode": "0x4", - "EventName": "LLC_MISSES.MEM_READ", + "EventName": "UNC_M_CAS_COUNT.RD", "PerPkg": "1", - "ScaleUnit": "64Bytes", "UMask": "0x3", "Unit": "iMC" }, { - "BriefDescription": "read requests to memory controller", + "BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd", "Counter": "0,1,2,3", "EventCode": "0x4", - "EventName": "UNC_M_CAS_COUNT.RD", + "EventName": "LLC_MISSES.MEM_READ", "PerPkg": "1", "ScaleUnit": "64Bytes", "UMask": "0x3", @@ -56,20 +55,19 @@ "Unit": "iMC" }, { - "BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr", + "BriefDescription": "All DRAM Write CAS commands issued", "Counter": "0,1,2,3", "EventCode": "0x4", - "EventName": "LLC_MISSES.MEM_WRITE", + "EventName": "UNC_M_CAS_COUNT.WR", "PerPkg": "1", - "ScaleUnit": "64Bytes", "UMask": "0xC", "Unit": "iMC" }, { - "BriefDescription": "write requests to memory controller", + "BriefDescription": "write requests to memory controller. 
Derived from unc_m_cas_count.wr", "Counter": "0,1,2,3", "EventCode": "0x4", - "EventName": "UNC_M_CAS_COUNT.WR", + "EventName": "LLC_MISSES.MEM_WRITE", "PerPkg": "1", "ScaleUnit": "64Bytes", "UMask": "0xC", diff --git a/tools/perf/pmu-events/arch/x86/skylakex/uncore-other.json b/tools/perf/pmu-events/arch/x86/skylakex/uncore-other.json index f55aeadc630f2..0d106fe7aae35 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/uncore-other.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/uncore-other.json @@ -1089,7 +1089,6 @@ "FCMask": "0x07", "PerPkg": "1", "PortMask": "0x01", - "ScaleUnit": "4Bytes", "UMask": "0x01", "Unit": "IIO" }, @@ -1101,7 +1100,6 @@ "FCMask": "0x07", "PerPkg": "1", "PortMask": "0x02", - "ScaleUnit": "4Bytes", "UMask": "0x01", "Unit": "IIO" }, @@ -1113,7 +1111,6 @@ "FCMask": "0x07", "PerPkg": "1", "PortMask": "0x04", - "ScaleUnit": "4Bytes", "UMask": "0x01", "Unit": "IIO" }, @@ -1125,7 +1122,6 @@ "FCMask": "0x07", "PerPkg": "1", "PortMask": "0x08", - "ScaleUnit": "4Bytes", "UMask": "0x01", "Unit": "IIO" }, @@ -1196,7 +1192,6 @@ "FCMask": "0x07", "PerPkg": "1", "PortMask": "0x01", - "ScaleUnit": "4Bytes", "UMask": "0x04", "Unit": "IIO" }, @@ -1208,7 +1203,6 @@ "FCMask": "0x07", "PerPkg": "1", "PortMask": "0x02", - "ScaleUnit": "4Bytes", "UMask": "0x04", "Unit": "IIO" }, @@ -1220,7 +1214,6 @@ "FCMask": "0x07", "PerPkg": "1", "PortMask": "0x04", - "ScaleUnit": "4Bytes", "UMask": "0x04", "Unit": "IIO" }, @@ -1232,7 +1225,6 @@ "FCMask": "0x07", "PerPkg": "1", "PortMask": "0x08", - "ScaleUnit": "4Bytes", "UMask": "0x04", "Unit": "IIO" }, @@ -1974,20 +1966,19 @@ "Unit": "UPI LL" }, { - "BriefDescription": "UPI interconnect send bandwidth for payload. 
Derived from unc_upi_txl_flits.all_data", + "BriefDescription": "Valid data FLITs transmitted via any slot", "Counter": "0,1,2,3", "EventCode": "0x2", - "EventName": "UPI_DATA_BANDWIDTH_TX", + "EventName": "UNC_UPI_TxL_FLITS.ALL_DATA", "PerPkg": "1", - "ScaleUnit": "7.11E-06Bytes", - "UMask": "0xf", + "UMask": "0x0F", "Unit": "UPI LL" }, { - "BriefDescription": "UPI interconnect send bandwidth for payload", + "BriefDescription": "UPI interconnect send bandwidth for payload. Derived from unc_upi_txl_flits.all_data", "Counter": "0,1,2,3", "EventCode": "0x2", - "EventName": "UNC_UPI_TxL_FLITS.ALL_DATA", + "EventName": "UPI_DATA_BANDWIDTH_TX", "PerPkg": "1", "ScaleUnit": "7.11E-06Bytes", "UMask": "0xf", -- GitLab From a80de06698a7c7dc4f875bd3118bc9e650c18c14 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:15:55 -0700 Subject: [PATCH 1415/2223] perf vendor events: Update Intel alderlake Events are updated to v1.15, the core metrics are based on TMA 4.4 full and the atom metrics on E-core TMA 2.2. Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. - Addition of all 6 levels of TMA metrics. Previously metrics involving topdown events were dropped. Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. - Update mapfile.csv CPUIDs to match 01.org. 
Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../arch/x86/alderlake/adl-metrics.json | 1353 ++++++++++++++++- .../pmu-events/arch/x86/alderlake/cache.json | 129 +- .../arch/x86/alderlake/frontend.json | 12 + .../pmu-events/arch/x86/alderlake/memory.json | 22 + .../pmu-events/arch/x86/alderlake/other.json | 22 + .../arch/x86/alderlake/pipeline.json | 14 +- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 7 files changed, 1460 insertions(+), 94 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json index 095dd8c7f1619..e06d26ad51385 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json +++ 
b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json @@ -1,22 +1,852 @@ [ + { + "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", + "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "(topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / SLOTS)", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. 
For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", + "MetricExpr": "ICACHE_DATA.STALLS / CLKS", + "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "ICACHE_TAG.STALLS / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / CLKS + tma_unknown_branches", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. 
For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", + "MetricExpr": "(tma_branch_mispredicts / tma_bad_speculation) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", + "MetricExpr": "(1 - (tma_branch_mispredicts / tma_bad_speculation)) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS", + "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_clears_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", + "MetricExpr": "INT_MISC.UNKNOWN_BRANCH_CYCLES / CLKS", + "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_unknown_branches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. 
These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "DECODE.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. 
#Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "3 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: FRONTEND_RETIRED.MS_FLOWS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. 
Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / CORE_CLKS / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", + "MetricExpr": "(cpu_core@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu_core@INST_DECODED.DECODERS\\,cmask\\=2@) / CORE_CLKS", + "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_mite_group", + "MetricName": "tma_decoder0_alone", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", + "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / CORE_CLKS / 2", + "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_dsb", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. 
For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit", + "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / CORE_CLKS / 2", + "MetricGroup": "FetchBW;LSD;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_lsd", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", + "MetricExpr": "max(1 - (tma_frontend_bound + tma_backend_bound + tma_retiring), 0)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. 
Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "max(0, tma_bad_speculation - tma_branch_mispredicts)", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. 
Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. 
This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((EXE_ACTIVITY.BOUND_ON_LOADS - MEMORY_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "min(7 * cpu_core@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - MEMORY_ACTIVITY.CYCLES_L1D_MISS, 0)) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. 
TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the (first level) DTLB was missed by load accesses, that later on hit in second-level TLB (STLB)", + "MetricExpr": "tma_dtlb_load - tma_load_stlb_miss", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_hit", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / CLKS", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_miss", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. 
To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / CLKS", + "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. 
Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "L1D_PEND_MISS.FB_FULL / CLKS", + "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L1D_MISS - MEMORY_ACTIVITY.STALLS_L2_MISS) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L2_MISS - MEMORY_ACTIVITY.STALLS_L3_MISS) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. 
L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", + "MetricExpr": "((25 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + (24 * Average_Frequency) * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_contested_accesses", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "(24 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD)))) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "(9 * Average_Frequency) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. 
Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "(XQ.FULL_CYCLES + L1D_PEND_MISS.L2_STALLS) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L3_MISS / CLKS)", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). 
The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. 
Sample with: MEM_INST_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "((MEM_STORE_RETIRED.L2_HIT * 10 * (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES))) + (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_store_latency", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full)", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "(28 * Average_Frequency) * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / CORE_CLKS", + "MetricGroup": "TopdownL4;tma_store_bound_group", + "MetricName": "tma_split_stores", + "PublicDescription": "This metric represents rate of split store accesses. 
Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores", + "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_streaming_stores", + "PublicDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should Streaming stores be a bottleneck. Sample with: OCR.STREAMING_WR.ANY_RESPONSE", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "(7 * cpu_core@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / CORE_CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. 
Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the TLB was missed by store accesses, hitting in the second-level TLB (STLB)", + "MetricExpr": "tma_dtlb_store - tma_store_stlb_miss", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_hit", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / CORE_CLKS", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_miss", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck", + "MetricExpr": "max(0, tma_backend_bound - tma_memory_bound)", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. 
FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "ARITH.DIVIDER_ACTIVE / CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "(cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / CLKS if (ARITH.DIVIDER_ACTIVE < (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS)) else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. 
For example; when there are too many multiply operations.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / CLKS + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", + "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / CLKS", + "MetricGroup": "TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_serializing_operation", + "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", + "MetricExpr": "CPU_CLK_UNHALTED.PAUSE / CLKS", + "MetricGroup": "TopdownL6;tma_serializing_operation_group", + "MetricName": "tma_slow_pause", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. 
Sample with: CPU_CLK_UNHALTED.PAUSE_INST", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", + "MetricExpr": "13 * MISC2_RETIRED.LFENCE / CLKS", + "MetricGroup": "TopdownL6;tma_serializing_operation_group", + "MetricName": "tma_memory_fence", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "MetricExpr": "160 * ASSISTS.SSE_AVX_MIX / CLKS", + "MetricGroup": "TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_mixing_vectors", + "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful.
Sample with: EXE_ACTIVITY.1_PORTS_UTIL", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Sample with: EXE_ACTIVITY.2_PORTS_UTIL", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). 
Sample with: UOPS_EXECUTED.CYCLES_GE_3", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5_11 + UOPS_DISPATCHED.PORT_6) / (5 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED.PORT_0", + "MetricExpr": "UOPS_DISPATCHED.PORT_0 / CORE_CLKS", + "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_0", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED.PORT_1", + "MetricExpr": "UOPS_DISPATCHED.PORT_1 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_1", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED.PORT_6", + "MetricExpr": "UOPS_DISPATCHED.PORT_6 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_6", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3_10", + "MetricExpr": "UOPS_DISPATCHED.PORT_2_3_10 / (3 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "ScaleUnit": 
"100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations Sample with: UOPS_DISPATCHED.PORT_7_8", + "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. 
Sample with: UOPS_RETIRED.SLOTS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "max(0, tma_retiring - tma_heavy_operations)", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", + "MetricGroup": "HPC;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fp_arith", + "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). 
Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "tma_retiring * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_scalar", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_vector", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_128b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_256b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents overall Integer (Int) select operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_int_vector_128b + tma_int_vector_256b + tma_shuffles", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_int_operations", + "PublicDescription": "This metric represents overall Integer (Int) select operations fraction the CPU has executed (retired). Vector/Matrix Int operations and shuffles are counted. 
Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents 128-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired.", + "MetricExpr": "(INT_VEC_RETIRED.ADD_128 + INT_VEC_RETIRED.VNNI_128) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_int_operations_group", + "MetricName": "tma_int_vector_128b", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired.", + "MetricExpr": "(INT_VEC_RETIRED.ADD_256 + INT_VEC_RETIRED.MUL_256 + INT_VEC_RETIRED.VNNI_256) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_int_operations_group", + "MetricName": "tma_int_vector_256b", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Shuffle (cross \"vector lane\" data transfers) uops fraction the CPU has retired.", + "MetricExpr": "INT_VEC_RETIRED.SHUFFLES / (tma_retiring * SLOTS)", + "MetricGroup": "HPC;Pipeline;TopdownL4;tma_int_operations_group", + "MetricName": "tma_shuffles", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", + "MetricExpr": "tma_light_operations * MEM_UOP_RETIRED.ANY / (tma_retiring * SLOTS)", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_memory_operations", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", + "MetricExpr": "tma_light_operations 
* INST_RETIRED.MACRO_FUSED / (tma_retiring * SLOTS)", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fused_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", + "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - INST_RETIRED.MACRO_FUSED) / (tma_retiring * SLOTS)", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_non_fused_branches", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", + "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * SLOTS)", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_nop_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. 
May undercount due to FMA double counting", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_other_light_ops", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences. Sample with: UOPS_RETIRED.HEAVY", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops", + "MetricExpr": "tma_heavy_operations - tma_microcode_sequencer", + "MetricGroup": "TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_few_uops_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops.
This highly-correlates with the number of uops in such instructions.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "UOPS_RETIRED.MS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: UOPS_RETIRED.MS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "100 * cpu_core@ASSISTS.ANY\\,umask\\=0x1B@ / SLOTS", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. 
Sample with: ASSISTS.ANY", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handling Page Faults", + "MetricExpr": "99 * ASSISTS.PAGE_FAULT / SLOTS", + "MetricGroup": "TopdownL5;tma_assists_group", + "MetricName": "tma_page_faults", + "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handling Page Faults. A Page Fault may apply on first application access to a memory page. Note operating system handling of page faults accounts for the majority of its cost.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handling Floating Point (FP) Assists", + "MetricExpr": "30 * ASSISTS.FP / SLOTS", + "MetricGroup": "HPC;TopdownL5;tma_assists_group", + "MetricName": "tma_fp_assists", + "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handling Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called denormals).", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops as a result of handling SSE to AVX* or AVX* to SSE transition Assists.
", + "MetricExpr": "63 * ASSISTS.SSE_AVX_MIX / SLOTS", + "MetricGroup": "HPC;TopdownL5;tma_assists_group", + "MetricName": "tma_avx_assists", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_cisc", + "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources. Sample with: FRONTEND_RETIRED.MS_FLOWS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "Mispredictions", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", + "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + 
tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) ", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "Memory_Bandwidth", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + (tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)))", + "MetricGroup": "Mem;MemoryLat;Offcore", + "MetricName": "Memory_Latency", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricExpr": "100 * tma_memory_bound * ((tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores))) ", + "MetricGroup": "Mem;MemoryTLB;Offcore", + "MetricName": "Memory_Data_TLBs", + "Unit": "cpu_core" + }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * (( BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) ) 
/ TOPDOWN.SLOTS)", + "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / SLOTS)", "MetricGroup": "Ret", "MetricName": "Branching_Overhead", "Unit": "cpu_core" }, + { + "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", + "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", + "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB", + "MetricName": "Big_Code", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - Big_Code", + "MetricGroup": "Fed;FetchBW;Frontend", + "MetricName": "Instruction_Fetch_BW", + "Unit": "cpu_core" + }, { "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC", "Unit": "cpu_core" }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "(tma_retiring * SLOTS) / INST_RETIRED.ANY", + "MetricGroup": "Pipeline;Ret;Retire", + "MetricName": "UPI", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instruction per taken branch", + "MetricExpr": "(tma_retiring * SLOTS) / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;FetchBW", + "MetricName": "UpTB", + "Unit": "cpu_core" + }, { "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / (INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD)", - 
"MetricGroup": "Pipeline;Mem", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", "MetricName": "CPI", "Unit": "cpu_core" }, @@ -30,14 +860,14 @@ { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", "MetricExpr": "TOPDOWN.SLOTS", - "MetricGroup": "TmaL1", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS", "Unit": "cpu_core" }, { "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", - "MetricExpr": "TOPDOWN.SLOTS / ( TOPDOWN.SLOTS / 2 ) if #SMT_on else 1", - "MetricGroup": "SMT;TmaL1", + "MetricExpr": "SLOTS / (TOPDOWN.SLOTS / 2) if #SMT_on else 1", + "MetricGroup": "SMT;tma_L1_group", "MetricName": "Slots_Utilization", "Unit": "cpu_core" }, @@ -51,21 +881,21 @@ }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.DISTRIBUTED", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC", "Unit": "cpu_core" }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / CPU_CLK_UNHALTED.DISTRIBUTED", - "MetricGroup": "Ret;Flops", + "MetricExpr": "(1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / CORE_CLKS", + "MetricGroup": "Flops;Ret", "MetricName": "FLOPc", "Unit": "cpu_core" }, { "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units 
(regardless of precision or vector-width)", - "MetricExpr": "( FP_ARITH_DISPATCHED.PORT_0 + FP_ARITH_DISPATCHED.PORT_1 + FP_ARITH_DISPATCHED.PORT_5 ) / ( 2 * CPU_CLK_UNHALTED.DISTRIBUTED )", + "MetricExpr": "(FP_ARITH_DISPATCHED.PORT_0 + FP_ARITH_DISPATCHED.PORT_1 + FP_ARITH_DISPATCHED.PORT_5) / (2 * CORE_CLKS)", "MetricGroup": "Cor;Flops;HPC", "MetricName": "FP_Arith_Utilization", "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common).", @@ -73,11 +903,18 @@ }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 ) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ((UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP", "Unit": "cpu_core" }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricExpr": "(1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if SMT_2T_Utilization > 0.5 else 0", + "MetricGroup": "Cor;SMT", + "MetricName": "Core_Bound_Likely", + "Unit": "cpu_core" + }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", @@ -129,14 +966,14 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( 
FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "IpFLOP", "Unit": "cpu_core" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) )", + "MetricExpr": "INST_RETIRED.ANY / ((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE))", "MetricGroup": "Flops;InsType", "MetricName": "IpArith", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. 
Approximated prior to BDW.", @@ -160,7 +997,7 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX128", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting.", @@ -168,7 +1005,7 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX256", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). 
May undercount due to FMA double counting.", @@ -182,12 +1019,19 @@ "Unit": "cpu_core" }, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions", "Unit": "cpu_core" }, + { + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricExpr": "(tma_retiring * SLOTS) / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "Retire", + "Unit": "cpu_core" + }, { "BriefDescription": "Estimated fraction of retirement-cycles dealing with repeat instructions", "MetricExpr": "INST_RETIRED.REP_ITERATION / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@", @@ -237,6 +1081,13 @@ "MetricName": "DSB_Switch_Cost", "Unit": "cpu_core" }, + { + "BriefDescription": "Total penalty related to DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))", + "MetricGroup": "DSBmiss;Fed", + "MetricName": "DSB_Misses", + "Unit": "cpu_core" + }, { "BriefDescription": "Number of Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", @@ -251,6 +1102,13 @@ "MetricName": "IpMispredict", "Unit": "cpu_core" }, + { + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricExpr": " (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + 
tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "Branch_Misprediction_Cost", + "Unit": "cpu_core" + }, { "BriefDescription": "Fraction of branches that are non-taken conditionals", "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", @@ -267,7 +1125,7 @@ }, { "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "( BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN ) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", "MetricName": "CallRet", "Unit": "cpu_core" @@ -281,7 +1139,7 @@ }, { "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", - "MetricExpr": "1 - ( (BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES) + (BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES) + (( BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN ) / BR_INST_RETIRED.ALL_BRANCHES) + ((BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES) )", + "MetricExpr": "1 - (Cond_NT + Cond_TK + CallRet + Jump)", "MetricGroup": "Bad;Branches", "MetricName": "Other_Branches", "Unit": "cpu_core" @@ -296,77 +1154,77 @@ { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. 
Per-Logical Processor)", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBound;MemoryBW", + "MetricGroup": "Mem;MemoryBW;MemoryBound", "MetricName": "MLP", "Unit": "cpu_core" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI", "Unit": "cpu_core" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI_Load", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;Backend;CacheMisses", + "MetricGroup": "Backend;CacheMisses;Mem", "MetricName": "L2MPKI", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses;Offcore", + "MetricGroup": "CacheMisses;Mem;Offcore", "MetricName": "L2MPKI_All", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2MPKI_Load", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", - "MetricExpr": "1000 * ( L2_RQSTS.REFERENCES - L2_RQSTS.MISS ) / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricExpr": "1000 * 
(L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_All", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_Load", "Unit": "cpu_core" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L3MPKI", "Unit": "cpu_core" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1000 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "FB_HPKI", "Unit": "cpu_core" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING ) / ( 4 * CPU_CLK_UNHALTED.DISTRIBUTED )", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (4 * CORE_CLKS)", "MetricGroup": "Mem;MemoryTLB", "MetricName": "Page_Walks_Utilization", "Unit": "cpu_core" @@ -401,28 +1259,28 @@ }, { "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricExpr": "L1D_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L1D_Cache_Fill_BW_1T", "Unit": "cpu_core" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L2 
cache [GB / sec]", - "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricExpr": "L2_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L2_Cache_Fill_BW_1T", "Unit": "cpu_core" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L3_Cache_Fill_BW_1T", "Unit": "cpu_core" }, { "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Access_BW", "MetricGroup": "Mem;MemoryBW;Offcore", "MetricName": "L3_Cache_Access_BW_1T", "Unit": "cpu_core" @@ -436,14 +1294,14 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency", "Unit": "cpu_core" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 ) / duration_time", + "MetricExpr": "((1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1000000000) / duration_time", "MetricGroup": "Cor;Flops;HPC", 
"MetricName": "GFLOPs", "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine.", @@ -451,7 +1309,7 @@ }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization", "Unit": "cpu_core" @@ -479,7 +1337,7 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * ( arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@ ) / 1000000 / duration_time / 1000", + "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1000000 / duration_time / 1000", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use", "Unit": "cpu_core" @@ -500,41 +1358,408 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to frontend stalls.", - "MetricExpr": "TOPDOWN_FE_BOUND.ALL / (5 * CPU_CLK_UNHALTED.CORE)", + "MetricExpr": "TOPDOWN_FE_BOUND.ALL / SLOTS", "MetricGroup": "TopdownL1", - "MetricName": "Frontend_Bound", + "MetricName": "tma_frontend_bound", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.", + "MetricExpr": "TOPDOWN_FE_BOUND.FRONTEND_LATENCY / SLOTS", + "MetricGroup": "TopdownL2;tma_frontend_bound_group", + "MetricName": "tma_frontend_latency", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to instruction cache misses.", + "MetricExpr": "TOPDOWN_FE_BOUND.ICACHE / SLOTS", + "MetricGroup": 
"TopdownL3;tma_frontend_latency_group", + "MetricName": "tma_icache", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to Instruction Table Lookaside Buffer (ITLB) misses.", + "MetricExpr": "TOPDOWN_FE_BOUND.ITLB / SLOTS", + "MetricGroup": "TopdownL3;tma_frontend_latency_group", + "MetricName": "tma_itlb", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend", + "MetricExpr": "TOPDOWN_FE_BOUND.BRANCH_DETECT / SLOTS", + "MetricGroup": "TopdownL3;tma_frontend_latency_group", + "MetricName": "tma_branch_detect", + "PublicDescription": "Counts the number of issue slots that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend. 
Includes BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to BTCLEARS, which occurs when the Branch Target Buffer (BTB) predicts a taken branch.", + "MetricExpr": "TOPDOWN_FE_BOUND.BRANCH_RESTEER / SLOTS", + "MetricGroup": "TopdownL3;tma_frontend_latency_group", + "MetricName": "tma_branch_resteer", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.", + "MetricExpr": "TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH / SLOTS", + "MetricGroup": "TopdownL2;tma_frontend_bound_group", + "MetricName": "tma_frontend_bandwidth", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to the microcode sequencer (MS).", + "MetricExpr": "TOPDOWN_FE_BOUND.CISC / SLOTS", + "MetricGroup": "TopdownL3;tma_frontend_bandwidth_group", + "MetricName": "tma_cisc", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to decode stalls.", + "MetricExpr": "TOPDOWN_FE_BOUND.DECODE / SLOTS", + "MetricGroup": "TopdownL3;tma_frontend_bandwidth_group", + "MetricName": "tma_decode", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to wrong predecodes.", + "MetricExpr": "TOPDOWN_FE_BOUND.PREDECODE / SLOTS", + "MetricGroup": "TopdownL3;tma_frontend_bandwidth_group", + "MetricName": "tma_predecode", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that 
were not delivered by the frontend due to other common frontend stalls not categorized.", + "MetricExpr": "TOPDOWN_FE_BOUND.OTHER / SLOTS", + "MetricGroup": "TopdownL3;tma_frontend_bandwidth_group", + "MetricName": "tma_other_fb", + "ScaleUnit": "100%", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear", - "MetricExpr": "TOPDOWN_BAD_SPECULATION.ALL / (5 * CPU_CLK_UNHALTED.CORE)", + "MetricExpr": "(SLOTS - (TOPDOWN_FE_BOUND.ALL + TOPDOWN_BE_BOUND.ALL + TOPDOWN_RETIRING.ALL)) / SLOTS", "MetricGroup": "TopdownL1", - "MetricName": "Bad_Speculation", + "MetricName": "tma_bad_speculation", "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the instruction queue (IQ). 
Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to branch mispredicts.", + "MetricExpr": "TOPDOWN_BAD_SPECULATION.MISPREDICT / SLOTS", + "MetricGroup": "TopdownL2;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation.", + "MetricExpr": "TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS / SLOTS", + "MetricGroup": "TopdownL2;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to a machine clear (slow nuke).", + "MetricExpr": "TOPDOWN_BAD_SPECULATION.NUKE / SLOTS", + "MetricGroup": "TopdownL3;tma_machine_clears_group", + "MetricName": "tma_nuke", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to SMC. ", + "MetricExpr": "tma_nuke * (MACHINE_CLEARS.SMC / MACHINE_CLEARS.SLOW)", + "MetricGroup": "TopdownL4;tma_nuke_group", + "MetricName": "tma_smc", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to memory ordering. 
", + "MetricExpr": "tma_nuke * (MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.SLOW)", + "MetricGroup": "TopdownL4;tma_nuke_group", + "MetricName": "tma_memory_ordering", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to FP assists. ", + "MetricExpr": "tma_nuke * (MACHINE_CLEARS.FP_ASSIST / MACHINE_CLEARS.SLOW)", + "MetricGroup": "TopdownL4;tma_nuke_group", + "MetricName": "tma_fp_assist", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to memory disambiguation. ", + "MetricExpr": "tma_nuke * (MACHINE_CLEARS.DISAMBIGUATION / MACHINE_CLEARS.SLOW)", + "MetricGroup": "TopdownL4;tma_nuke_group", + "MetricName": "tma_disambiguation", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to page faults. 
", + "MetricExpr": "tma_nuke * (MACHINE_CLEARS.PAGE_FAULT / MACHINE_CLEARS.SLOW)", + "MetricGroup": "TopdownL4;tma_nuke_group", + "MetricName": "tma_page_fault", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to a machine clear classified as a fast nuke due to memory ordering, memory disambiguation and memory renaming.", + "MetricExpr": "TOPDOWN_BAD_SPECULATION.FASTNUKE / SLOTS", + "MetricGroup": "TopdownL3;tma_machine_clears_group", + "MetricName": "tma_fast_nuke", + "ScaleUnit": "100%", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls", - "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "TOPDOWN_BE_BOUND.ALL / (5 * CPU_CLK_UNHALTED.CORE)", + "MetricExpr": "TOPDOWN_BE_BOUND.ALL / SLOTS", "MetricGroup": "TopdownL1", - "MetricName": "Backend_Bound", + "MetricName": "tma_backend_bound", "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. The rest of these subevents count backend stalls, in cycles, due to an outstanding request which is memory bound vs core bound. The subevents are not slot based events and therefore can not be precisely added or subtracted from the Backend_Bound_Aux subevents which are slot based.", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles due to backend bound stalls that are core execution bound and not attributed to outstanding demand load or store stalls. 
", + "MetricExpr": "max(0, tma_backend_bound - tma_load_store_bound)", + "MetricGroup": "TopdownL2;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles the core is stalled due to stores or loads. ", + "MetricExpr": "min((TOPDOWN_BE_BOUND.ALL / SLOTS), (LD_HEAD.ANY_AT_RET / CLKS) + tma_store_bound)", + "MetricGroup": "TopdownL2;tma_backend_bound_group", + "MetricName": "tma_load_store_bound", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles the core is stalled due to store buffer full.", + "MetricExpr": "tma_mem_scheduler * (MEM_SCHEDULER_BLOCK.ST_BUF / MEM_SCHEDULER_BLOCK.ALL)", + "MetricGroup": "TopdownL3;tma_load_store_bound_group", + "MetricName": "tma_store_bound", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a load block.", + "MetricExpr": "LD_HEAD.L1_BOUND_AT_RET / CLKS", + "MetricGroup": "TopdownL3;tma_load_store_bound_group", + "MetricName": "tma_l1_bound", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a store forward block.", + "MetricExpr": "LD_HEAD.ST_ADDR_AT_RET / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_store_fwd", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a first level TLB miss.", + "MetricExpr": "LD_HEAD.DTLB_MISS_AT_RET / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_stlb_hit", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at 
retirement due to a second level TLB miss requiring a page walk.", + "MetricExpr": "LD_HEAD.PGWALK_AT_RET / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_stlb_miss", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a number of other load blocks.", + "MetricExpr": "LD_HEAD.OTHER_AT_RET / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_other_l1", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the L2 Cache.", + "MetricExpr": "(MEM_BOUND_STALLS.LOAD_L2_HIT / CLKS) - (MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_BOUND_STALLS.LOAD)", + "MetricGroup": "TopdownL3;tma_load_store_bound_group", + "MetricName": "tma_l2_bound", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the Last Level Cache (LLC) or other core with HITE/F/M.", + "MetricExpr": "(MEM_BOUND_STALLS.LOAD_LLC_HIT / CLKS) - (MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_LLC_HIT / MEM_BOUND_STALLS.LOAD)", + "MetricGroup": "TopdownL3;tma_load_store_bound_group", + "MetricName": "tma_l3_bound", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hit in DRAM or MMIO (Non-DRAM).", + "MetricExpr": "(MEM_BOUND_STALLS.LOAD_DRAM_HIT / CLKS) - (MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_DRAM_HIT / MEM_BOUND_STALLS.LOAD)", + "MetricGroup": "TopdownL3;tma_load_store_bound_group", + "MetricName": "tma_dram_bound", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hits 
in the L2, LLC, DRAM or MMIO (Non-DRAM) but could not be correctly attributed or cycles in which the load miss is waiting on a request buffer.", + "MetricExpr": "max(0, tma_load_store_bound - (tma_store_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_dram_bound))", + "MetricGroup": "TopdownL3;tma_load_store_bound_group", + "MetricName": "tma_other_load_store", + "ScaleUnit": "100%", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls", - "MetricExpr": "(TOPDOWN_BE_BOUND.ALL / (5 * CPU_CLK_UNHALTED.CORE))", + "MetricExpr": "tma_backend_bound", "MetricGroup": "TopdownL1", - "MetricName": "Backend_Bound_Aux", + "MetricName": "tma_backend_bound_aux", "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that UOPS must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. All of these subevents count backend stalls, in slots, due to a resource limitation. These are not cycle based events and therefore can not be precisely added or subtracted from the Backend_Bound subevents which are cycle based. These subevents are supplementary to Backend_Bound and can be used to analyze results from a resource perspective at allocation. ", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls", + "MetricExpr": "tma_backend_bound", + "MetricGroup": "TopdownL2;tma_backend_bound_aux_group", + "MetricName": "tma_resource_bound", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. 
", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to memory reservation stalls in which a scheduler is not able to accept uops.", + "MetricExpr": "TOPDOWN_BE_BOUND.MEM_SCHEDULER / SLOTS", + "MetricGroup": "TopdownL3;tma_resource_bound_group", + "MetricName": "tma_mem_scheduler", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles, relative to the number of mem_scheduler slots, in which uops are blocked due to store buffer full", + "MetricExpr": "tma_mem_scheduler * (MEM_SCHEDULER_BLOCK.ST_BUF / MEM_SCHEDULER_BLOCK.ALL)", + "MetricGroup": "TopdownL4;tma_mem_scheduler_group", + "MetricName": "tma_st_buffer", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles, relative to the number of mem_scheduler slots, in which uops are blocked due to load buffer full", + "MetricExpr": "tma_mem_scheduler * MEM_SCHEDULER_BLOCK.LD_BUF / MEM_SCHEDULER_BLOCK.ALL", + "MetricGroup": "TopdownL4;tma_mem_scheduler_group", + "MetricName": "tma_ld_buffer", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles, relative to the number of mem_scheduler slots, in which uops are blocked due to RSV full relative ", + "MetricExpr": "tma_mem_scheduler * MEM_SCHEDULER_BLOCK.RSV / MEM_SCHEDULER_BLOCK.ALL", + "MetricGroup": "TopdownL4;tma_mem_scheduler_group", + "MetricName": "tma_rsv", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to IEC or FPC RAT stalls, which can be due to FIQ or IEC reservation stalls in which the integer, floating point or SIMD scheduler is not able to accept uops.", + "MetricExpr": "TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER / SLOTS", + "MetricGroup": "TopdownL3;tma_resource_bound_group", + "MetricName": "tma_non_mem_scheduler", 
+ "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to the physical register file unable to accept an entry (marble stalls).", + "MetricExpr": "TOPDOWN_BE_BOUND.REGISTER / SLOTS", + "MetricGroup": "TopdownL3;tma_resource_bound_group", + "MetricName": "tma_register", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to the reorder buffer being full (ROB stalls).", + "MetricExpr": "TOPDOWN_BE_BOUND.REORDER_BUFFER / SLOTS", + "MetricGroup": "TopdownL3;tma_resource_bound_group", + "MetricName": "tma_reorder_buffer", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to certain allocation restrictions.", + "MetricExpr": "TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS / SLOTS", + "MetricGroup": "TopdownL3;tma_resource_bound_group", + "MetricName": "tma_alloc_restriction", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to scoreboards from the instruction queue (IQ), jump execution unit (JEU), or microcode sequencer (MS).", + "MetricExpr": "TOPDOWN_BE_BOUND.SERIALIZATION / SLOTS", + "MetricGroup": "TopdownL3;tma_resource_bound_group", + "MetricName": "tma_serialization", + "ScaleUnit": "100%", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the numer of issue slots that result in retirement slots. ", - "MetricExpr": "TOPDOWN_RETIRING.ALL / (5 * CPU_CLK_UNHALTED.CORE)", + "MetricExpr": "TOPDOWN_RETIRING.ALL / SLOTS", "MetricGroup": "TopdownL1", - "MetricName": "Retiring", + "MetricName": "tma_retiring", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of uops that are not from the microsequencer. 
", + "MetricExpr": "(TOPDOWN_RETIRING.ALL - UOPS_RETIRED.MS) / SLOTS", + "MetricGroup": "TopdownL2;tma_retiring_group", + "MetricName": "tma_base", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of floating point operations per uop with all default weighting.", + "MetricExpr": "UOPS_RETIRED.FPDIV / SLOTS", + "MetricGroup": "TopdownL3;tma_base_group", + "MetricName": "tma_fp_uops", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of uops retired excluding ms and fp div uops.", + "MetricExpr": "(TOPDOWN_RETIRING.ALL - UOPS_RETIRED.MS - UOPS_RETIRED.FPDIV) / SLOTS", + "MetricGroup": "TopdownL3;tma_base_group", + "MetricName": "tma_other_ret", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS)", + "MetricExpr": "UOPS_RETIRED.MS / SLOTS", + "MetricGroup": "TopdownL2;tma_retiring_group", + "MetricName": "tma_ms_uops", + "PublicDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS). 
This includes uops from flows due to complex instructions, faults, assists, and inserted flows.", + "ScaleUnit": "100%", "Unit": "cpu_atom" }, { @@ -551,19 +1776,19 @@ }, { "BriefDescription": "", - "MetricExpr": "5 * CPU_CLK_UNHALTED.CORE", + "MetricExpr": "5 * CLKS", "MetricName": "SLOTS", "Unit": "cpu_atom" }, { "BriefDescription": "Instructions Per Cycle", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.CORE", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricName": "IPC", "Unit": "cpu_atom" }, { "BriefDescription": "Cycles Per Instruction", - "MetricExpr": "CPU_CLK_UNHALTED.CORE / INST_RETIRED.ANY", + "MetricExpr": "CLKS / INST_RETIRED.ANY", "MetricName": "CPI", "Unit": "cpu_atom" }, @@ -623,7 +1848,7 @@ }, { "BriefDescription": "Instructions per Far Branch", - "MetricExpr": "INST_RETIRED.ANY / ( BR_INST_RETIRED.FAR_BRANCH / 2 )", + "MetricExpr": "INST_RETIRED.ANY / (BR_INST_RETIRED.FAR_BRANCH / 2)", "MetricName": "IpFarBranch", "Unit": "cpu_atom" }, @@ -665,7 +1890,7 @@ }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.CORE / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricName": "Turbo_Utilization", "Unit": "cpu_atom" }, @@ -681,12 +1906,6 @@ "MetricName": "CPU_Utilization", "Unit": "cpu_atom" }, - { - "BriefDescription": "Estimated Pause cost. 
In percent", - "MetricExpr": "100 * SERIALIZATION.NON_C01_MS_SCB / (5 * CPU_CLK_UNHALTED.CORE)", - "MetricName": "Estimated_Pause_Cost", - "Unit": "cpu_atom" - }, { "BriefDescription": "Cycle cost per L2 hit", "MetricExpr": "MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_LOAD_UOPS_RETIRED.L2_HIT", @@ -707,19 +1926,19 @@ }, { "BriefDescription": "Percent of instruction miss cost that hit in the L2", - "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_L2_HIT / ( MEM_BOUND_STALLS.IFETCH )", + "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_L2_HIT / (MEM_BOUND_STALLS.IFETCH)", "MetricName": "Inst_Miss_Cost_L2Hit_Percent", "Unit": "cpu_atom" }, { "BriefDescription": "Percent of instruction miss cost that hit in the L3", - "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_LLC_HIT / ( MEM_BOUND_STALLS.IFETCH )", + "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_LLC_HIT / (MEM_BOUND_STALLS.IFETCH)", "MetricName": "Inst_Miss_Cost_L3Hit_Percent", "Unit": "cpu_atom" }, { "BriefDescription": "Percent of instruction miss cost that hit in DRAM", - "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_DRAM_HIT / ( MEM_BOUND_STALLS.IFETCH )", + "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_DRAM_HIT / (MEM_BOUND_STALLS.IFETCH)", "MetricName": "Inst_Miss_Cost_DRAMHit_Percent", "Unit": "cpu_atom" }, diff --git a/tools/perf/pmu-events/arch/x86/alderlake/cache.json b/tools/perf/pmu-events/arch/x86/alderlake/cache.json index 887dce4dfebac..2cc62d2779d20 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/cache.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/cache.json @@ -1,4 +1,28 @@ [ + { + "BriefDescription": "Counts the number of cacheable memory requests that miss in the LLC. 
Counts on a per core basis.", + "CollectPEBSRecord": "2", + "Counter": "0,1,2,3,4,5", + "EventCode": "0x2e", + "EventName": "LONGEST_LAT_CACHE.MISS", + "PEBScounters": "0,1,2,3,4,5", + "SampleAfterValue": "200003", + "Speculative": "1", + "UMask": "0x41", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cacheable memory requests that access the LLC. Counts on a per core basis.", + "CollectPEBSRecord": "2", + "Counter": "0,1,2,3,4,5", + "EventCode": "0x2e", + "EventName": "LONGEST_LAT_CACHE.REFERENCE", + "PEBScounters": "0,1,2,3,4,5", + "SampleAfterValue": "200003", + "Speculative": "1", + "UMask": "0x4f", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2, LLC, DRAM or MMIO (Non-DRAM).", "CollectPEBSRecord": "2", @@ -210,8 +234,8 @@ }, { "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 128 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.", - "CollectPEBSRecord": "3", - "Counter": "0,1,2,3,4,5", + "CollectPEBSRecord": "2", + "Counter": "0,1", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128", @@ -219,7 +243,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x80", "PEBS": "2", - "PEBScounters": "0,1,2,3,4,5", + "PEBScounters": "0,1", "SampleAfterValue": "1000003", "TakenAlone": "1", "UMask": "0x5", @@ -227,8 +251,8 @@ }, { "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 16 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). 
Only counts with PEBS enabled.", - "CollectPEBSRecord": "3", - "Counter": "0,1,2,3,4,5", + "CollectPEBSRecord": "2", + "Counter": "0,1", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16", @@ -236,7 +260,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x10", "PEBS": "2", - "PEBScounters": "0,1,2,3,4,5", + "PEBScounters": "0,1", "SampleAfterValue": "1000003", "TakenAlone": "1", "UMask": "0x5", @@ -244,8 +268,8 @@ }, { "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 256 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.", - "CollectPEBSRecord": "3", - "Counter": "0,1,2,3,4,5", + "CollectPEBSRecord": "2", + "Counter": "0,1", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256", @@ -253,7 +277,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x100", "PEBS": "2", - "PEBScounters": "0,1,2,3,4,5", + "PEBScounters": "0,1", "SampleAfterValue": "1000003", "TakenAlone": "1", "UMask": "0x5", @@ -261,8 +285,8 @@ }, { "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 32 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.", - "CollectPEBSRecord": "3", - "Counter": "0,1,2,3,4,5", + "CollectPEBSRecord": "2", + "Counter": "0,1", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32", @@ -270,7 +294,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x20", "PEBS": "2", - "PEBScounters": "0,1,2,3,4,5", + "PEBScounters": "0,1", "SampleAfterValue": "1000003", "TakenAlone": "1", "UMask": "0x5", @@ -278,8 +302,8 @@ }, { "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 4 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). 
Only counts with PEBS enabled.", - "CollectPEBSRecord": "3", - "Counter": "0,1,2,3,4,5", + "CollectPEBSRecord": "2", + "Counter": "0,1", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4", @@ -287,7 +311,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x4", "PEBS": "2", - "PEBScounters": "0,1,2,3,4,5", + "PEBScounters": "0,1", "SampleAfterValue": "1000003", "TakenAlone": "1", "UMask": "0x5", @@ -295,8 +319,8 @@ }, { "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 512 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.", - "CollectPEBSRecord": "3", - "Counter": "0,1,2,3,4,5", + "CollectPEBSRecord": "2", + "Counter": "0,1", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512", @@ -304,7 +328,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x200", "PEBS": "2", - "PEBScounters": "0,1,2,3,4,5", + "PEBScounters": "0,1", "SampleAfterValue": "1000003", "TakenAlone": "1", "UMask": "0x5", @@ -312,8 +336,8 @@ }, { "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 64 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.", - "CollectPEBSRecord": "3", - "Counter": "0,1,2,3,4,5", + "CollectPEBSRecord": "2", + "Counter": "0,1", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64", @@ -321,7 +345,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x40", "PEBS": "2", - "PEBScounters": "0,1,2,3,4,5", + "PEBScounters": "0,1", "SampleAfterValue": "1000003", "TakenAlone": "1", "UMask": "0x5", @@ -329,8 +353,8 @@ }, { "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 8 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). 
Only counts with PEBS enabled.", - "CollectPEBSRecord": "3", - "Counter": "0,1,2,3,4,5", + "CollectPEBSRecord": "2", + "Counter": "0,1", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8", @@ -338,7 +362,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x8", "PEBS": "2", - "PEBScounters": "0,1,2,3,4,5", + "PEBScounters": "0,1", "SampleAfterValue": "1000003", "TakenAlone": "1", "UMask": "0x5", @@ -359,7 +383,7 @@ }, { "BriefDescription": "Counts the number of stores uops retired. Counts with or without PEBS enabled.", - "CollectPEBSRecord": "3", + "CollectPEBSRecord": "2", "Counter": "0,1,2,3,4,5", "Data_LA": "1", "EventCode": "0xd0", @@ -371,6 +395,61 @@ "UMask": "0x6", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts demand data reads that were supplied by the L3 cache.", + "Counter": "0,1,2,3,4,5", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x3F803C0001", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.", + "Counter": "0,1,2,3,4,5", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x10003C0001", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, but no data was forwarded.", + "Counter": "0,1,2,3,4,5", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_NO_FWD", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x4003C0001", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, and non-modified data 
was forwarded.", + "Counter": "0,1,2,3,4,5", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x8003C0001", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache.", + "Counter": "0,1,2,3,4,5", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_RFO.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x3F803C0002", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.", "Counter": "0,1,2,3,4,5", diff --git a/tools/perf/pmu-events/arch/x86/alderlake/frontend.json b/tools/perf/pmu-events/arch/x86/alderlake/frontend.json index 2cfa70b2d5e1f..da1a7ba0e5681 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/frontend.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/frontend.json @@ -47,6 +47,18 @@ "UMask": "0x1", "Unit": "cpu_core" }, + { + "BriefDescription": "Cycles the Microcode Sequencer is busy.", + "CollectPEBSRecord": "2", + "Counter": "0,1,2,3", + "EventCode": "0x87", + "EventName": "DECODE.MS_BUSY", + "PEBScounters": "0,1,2,3", + "SampleAfterValue": "500009", + "Speculative": "1", + "UMask": "0x2", + "Unit": "cpu_core" + }, { "BriefDescription": "DSB-to-MITE switch true penalty cycles.", "CollectPEBSRecord": "2", diff --git a/tools/perf/pmu-events/arch/x86/alderlake/memory.json b/tools/perf/pmu-events/arch/x86/alderlake/memory.json index 586fb961e46dd..f894e4a0212b2 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/memory.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/memory.json @@ -82,6 +82,17 @@ "UMask": "0x1", "Unit": "cpu_atom" }, + { + 
"BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.", + "Counter": "0,1,2,3,4,5", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.L3_MISS_LOCAL", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x3F84400001", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.", "Counter": "0,1,2,3,4,5", @@ -93,6 +104,17 @@ "UMask": "0x1", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.", + "Counter": "0,1,2,3,4,5", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_RFO.L3_MISS_LOCAL", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x3F84400002", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Execution stalls while L3 cache miss demand load is outstanding.", "CollectPEBSRecord": "2", diff --git a/tools/perf/pmu-events/arch/x86/alderlake/other.json b/tools/perf/pmu-events/arch/x86/alderlake/other.json index 67a9c13cc71da..c49d8ce273100 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/other.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/other.json @@ -1,4 +1,15 @@ [ + { + "BriefDescription": "Counts modified writebacks from L1 cache and L2 cache that have any type of response.", + "Counter": "0,1,2,3,4,5", + "EventCode": "0xB7", + "EventName": "OCR.COREWB_M.ANY_RESPONSE", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x10008", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand data reads that have any type of response.", "Counter": "0,1,2,3,4,5", @@ -103,6 +114,17 @@ "UMask": "0x1", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts demand data reads that were supplied by DRAM.", + "Counter": 
"0,1,2,3,4,5,6,7", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.DEMAND_DATA_RD.DRAM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x184000001", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, { "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.", "Counter": "0,1,2,3,4,5,6,7", diff --git a/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json b/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json index d02e078a90c94..1a137f7f8b7e8 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json @@ -330,6 +330,18 @@ "UMask": "0x3", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts the number of unhalted reference clock cycles at TSC frequency.", + "CollectPEBSRecord": "2", + "Counter": "0,1,2,3,4,5", + "EventCode": "0x3c", + "EventName": "CPU_CLK_UNHALTED.REF_TSC_P", + "PEBScounters": "0,1,2,3,4,5", + "SampleAfterValue": "2000003", + "Speculative": "1", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts the number of unhalted core clock cycles. 
(Fixed event)", "CollectPEBSRecord": "2", @@ -874,7 +886,7 @@ "PEBScounters": "0,1,2,3,4,5,6,7", "SampleAfterValue": "100003", "Speculative": "1", - "UMask": "0x1f", + "UMask": "0x1b", "Unit": "cpu_core" }, { diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 7f2d777fd97f7..bc873a1e84e10 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -1,5 +1,5 @@ Family-model,Version,Filename,EventType -GenuineIntel-6-9[7A],v1.13,alderlake,core +GenuineIntel-6-(97|9A|B7|BA|BE|BF),v1.15,alderlake,core GenuineIntel-6-(1C|26|27|35|36),v4,bonnell,core GenuineIntel-6-(3D|47),v26,broadwell,core GenuineIntel-6-56,v23,broadwellde,core -- GitLab From eb4f8d7787f115a724e4ffcb8d1d659249b04f9b Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:15:56 -0700 Subject: [PATCH 1416/2223] perf vendor events: Update Intel broadwell Events remain at v26, and the metrics are based on TMA 4.4 full. Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. - _SMT suffix metrics are dropped as the #SMT_On and #EBS_Mode are correctly expanded in the single main metric. - Addition of all 6 levels of TMA metrics. Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. 
Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../arch/x86/broadwell/bdw-metrics.json | 679 +++++++++++++++--- 1 file changed, 565 insertions(+), 114 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json index d65afe3d0b062..c220b1cf1740d 100644 --- a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json @@ -1,64 +1,552 @@ [ { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Frontend_Bound", - 
"PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound." + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Frontend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / SLOTS", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. 
Sample with: RS_EVENTS.EMPTY_END", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", + "MetricExpr": "ICACHE.IFDATA_STALL / CLKS", + "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of mispredicted branches. For example; branchy code with lots of mispredictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. 
", + "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES * tma_branch_resteers / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY)", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_mispredicts_resteers", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. ", + "MetricExpr": "MACHINE_CLEARS.COUNT * tma_branch_resteers / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY)", + "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_clears_resteers", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", + "MetricExpr": "tma_branch_resteers - tma_mispredicts_resteers - tma_clears_resteers", + "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_unknown_branches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). Sample with: BACLEARS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. 
The DSB pipeline has shorter latency and delivers higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "ILD_STALL.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "2 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. 
Sample with: IDQ.MS_SWITCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "tma_frontend_bound - tma_fetch_latency", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_dsb", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. 
For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Bad_Speculation", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example." + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. 
Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Bad_Speculation_SMT", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) )", - "MetricGroup": "TopdownL1", - "MetricName": "Backend_Bound", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound." 
+ "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "((CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB) / (CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if (IPC > 1.8) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB)) * tma_backend_bound", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. 
This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_LOAD_MISSES.WALK_COMPLETED) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. 
This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_UOPS_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "(MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / CLKS", + "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. 
Sample with: MEM_UOPS_RETIRED.LOCK_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_UOPS_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_4k_aliasing", + "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "Load_Miss_Real_Latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CLKS", + "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. 
The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_MISS / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. 
Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", + "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS)))) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_contested_accesses", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / CLKS", + "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / CLKS", + "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. 
L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "((OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2) if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / CORE_CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "(1 - (MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS))) * CYCLE_ACTIVITY.STALLS_L2_MISS / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. 
Sample with: MEM_LOAD_UOPS_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). 
This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "RESOURCE_STALLS.SB / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_UOPS_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "((L2_RQSTS.RFO_HIT * 9 * (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES))) + (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_store_latency", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. 
contention on L1D fill-buffer entries - see FB_Full)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "60 * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / CORE_CLKS", + "MetricGroup": "TopdownL4;tma_store_bound_group", + "MetricName": "tma_split_stores", + "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_UOPS_RETIRED.SPLIT_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_STORE_MISSES.WALK_COMPLETED) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. 
Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_UOPS_RETIRED.STLB_MISS_STORES_PS", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) )", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Backend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck", + "MetricExpr": "tma_backend_bound - tma_memory_bound", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "ARITH.FPU_DIV_ACTIVE / CORE_CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. 
Sample with: ARITH.DIVIDER_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "((CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if (IPC > 1.8) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB) - RESOURCE_STALLS.SB - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@) / 2 if #SMT_on else (CYCLE_ACTIVITY.STALLS_TOTAL - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else 0) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). 
Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or over oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). 
Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / (4 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED_PORT.PORT_0", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / CORE_CLKS", + "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_0", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_1", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_1", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU) Sample with: UOPS_DISPATCHED.PORT_5", + 
"MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_5", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_6", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_6", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_2", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_3", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": 
"TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data) Sample with: UOPS_DISPATCHED_PORT.PORT_4", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_4", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address) Sample with: UOPS_DISPATCHED_PORT.PORT_7", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_7", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Retiring", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. " + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. 
Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "tma_retiring - tma_heavy_operations", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Retiring_SMT", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", + "MetricGroup": "HPC;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fp_arith", + "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "INST_RETIRED.X87 * UPI / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. 
It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_scalar", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_vector", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_128b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_256b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "tma_microcode_sequencer", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. 
Sample with: IDQ.MS_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / SLOTS", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. Sample with: OTHER_ASSISTS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_cisc", + "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. 
Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.", + "ScaleUnit": "100%" }, { "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC" }, @@ -76,8 +564,8 @@ }, { "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / (INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "Pipeline;Mem", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", "MetricName": "CPI" }, { @@ -88,16 +576,10 @@ }, { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "TmaL1", + "MetricExpr": "4 * CORE_CLKS", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS" }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "TmaL1_SMT", - "MetricName": "SLOTS_SMT" - }, { "BriefDescription": "The ratio of Executed- by Issued-Uops", "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", @@ -107,51 +589,32 @@ }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC" }, - { - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;SMT;TmaL1_SMT", - 
"MetricName": "CoreIPC_SMT" - }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;Flops", + "MetricExpr": "(1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / CORE_CLKS", + "MetricGroup": "Flops;Ret", "MetricName": "FLOPc" }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;Flops_SMT", - "MetricName": "FLOPc_SMT" - }, { "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * CPU_CLK_UNHALTED.THREAD )", + "MetricExpr": "((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE 
+ FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)) / (2 * CORE_CLKS)", "MetricGroup": "Cor;Flops;HPC", "MetricName": "FP_Arith_Utilization", "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, - { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) )", - "MetricGroup": "Cor;Flops;HPC_SMT", - "MetricName": "FP_Arith_Utilization_SMT", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common). SMT version; use when SMT is enabled and measuring per logical CPU." 
- }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", + "MetricExpr": "((CPU_CLK_UNHALTED.THREAD / 2) * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK)) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2) if #SMT_on else CLKS", "MetricGroup": "SMT", "MetricName": "CORE_CLKS" }, @@ -193,13 +656,13 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "IpFLOP" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) 
+ (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) )", + "MetricExpr": "INST_RETIRED.ANY / ((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE))", "MetricGroup": "Flops;InsType", "MetricName": "IpArith", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." @@ -220,22 +683,22 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX128", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX256", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
}, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions" }, { @@ -252,7 +715,7 @@ }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", - "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )", + "MetricExpr": "IDQ.DSB_UOPS / ((IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS))", "MetricGroup": "DSB;Fed;FetchBW", "MetricName": "DSB_Coverage" }, @@ -264,83 +727,71 @@ }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) ) * (4 * CPU_CLK_UNHALTED.THREAD) / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": " (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BrMispredicts", "MetricName": "Branch_Misprediction_Cost" }, - { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch 
misprediction (retired JEClear)", - "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) ) * (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts_SMT", - "MetricName": "Branch_Misprediction_Cost_SMT" - }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb)", "MetricGroup": "Mem;MemoryBound;MemoryLat", "MetricName": "Load_Miss_Real_Latency" }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. 
Per-Logical Processor)", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBound;MemoryBW", + "MetricGroup": "Mem;MemoryBW;MemoryBound", "MetricName": "MLP" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;Backend;CacheMisses", + "MetricGroup": "Backend;CacheMisses;Mem", "MetricName": "L2MPKI" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses;Offcore", + "MetricGroup": "CacheMisses;Mem;Offcore", "MetricName": "L2MPKI_All" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2MPKI_Load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", - "MetricExpr": "1000 * ( L2_RQSTS.REFERENCES - L2_RQSTS.MISS ) / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricExpr": "1000 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_All" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_Load" }, { 
"BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L3MPKI" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "(cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * (DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED)) / CORE_CLKS", "MetricGroup": "Mem;MemoryTLB", "MetricName": "Page_Walks_Utilization" }, - { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Mem;MemoryTLB_SMT", - "MetricName": "Page_Walks_Utilization_SMT" - }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time", @@ -361,19 +812,19 @@ }, { "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "(64 * 
L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricExpr": "L1D_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L1D_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricExpr": "L2_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L2_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L3_Cache_Fill_BW_1T" }, @@ -391,26 +842,26 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 ) / duration_time", + "MetricExpr": "((1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1000000000) / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "GFLOPs", "PublicDescription": "Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", - "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0", "MetricGroup": "SMT", "MetricName": "SMT_2T_Utilization" }, @@ -428,7 +879,7 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * ( arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@ ) / 1000000 / duration_time / 1000", + "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1000000 / duration_time / 1000", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use" }, -- GitLab From 5bc4e39eecb069d49060ebcebf07dada088de026 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:15:57 -0700 Subject: [PATCH 1417/2223] perf vendor events: Update Intel broadwellx Events remain at v19, and the metrics are based on TMA 4.4 full. Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Uncore event updates by Zhengjun Xing <zhengjun.xing@linux.intel.com>. - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. - _SMT suffix metrics are dropped as the #SMT_On and #EBS_Mode are correctly expanded in the single main metric. 
- Addition of all 6 levels of TMA metrics. Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. - Latest metrics from: https://github.com/intel/perfmon-metrics Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../arch/x86/broadwellx/bdx-metrics.json | 965 +++++++++++------- .../arch/x86/broadwellx/uncore-cache.json | 10 +- .../x86/broadwellx/uncore-interconnect.json | 18 +- .../arch/x86/broadwellx/uncore-memory.json | 
18 +- 4 files changed, 638 insertions(+), 373 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json index a3a15ee528417..e89fa536ca030 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json @@ -1,64 +1,576 @@ [ { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Frontend_Bound", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound." + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. 
Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Frontend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / SLOTS", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", + "MetricExpr": "ICACHE.IFDATA_STALL / CLKS", + "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. 
Sample with: ITLB_MISSES.WALK_COMPLETED", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. ", + "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES * tma_branch_resteers / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY)", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_mispredicts_resteers", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. 
", + "MetricExpr": "MACHINE_CLEARS.COUNT * tma_branch_resteers / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY)", + "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_clears_resteers", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", + "MetricExpr": "tma_branch_resteers - tma_mispredicts_resteers - tma_clears_resteers", + "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_unknown_branches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). Sample with: BACLEARS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). 
Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "ILD_STALL.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "2 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. 
Sample with: IDQ.MS_SWITCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "tma_frontend_bound - tma_fetch_latency", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_dsb", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. 
For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Bad_Speculation", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example." + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. 
Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Bad_Speculation_SMT", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) )", - "MetricGroup": "TopdownL1", - "MetricName": "Backend_Bound", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound." 
+ "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "((CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB) / (CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if (IPC > 1.8) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB)) * tma_backend_bound", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. 
This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_LOAD_MISSES.WALK_COMPLETED) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. 
This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_UOPS_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "(MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / CLKS", + "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. 
Sample with: MEM_UOPS_RETIRED.LOCK_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_UOPS_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_4k_aliasing", + "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "Load_Miss_Real_Latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CLKS", + "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. 
The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_MISS / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. 
Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", + "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_contested_accesses", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / CLKS", + "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / CLKS", + "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "((OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2) if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / CORE_CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). 
The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "(1 - (MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS))) * CYCLE_ACTIVITY.STALLS_L2_MISS / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. 
This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", + "MetricExpr": "200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / CLKS", + "MetricGroup": "Server;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_local_dram", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. 
Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM_PS", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) )", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Backend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", + "MetricExpr": "310 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / CLKS", + "MetricGroup": "Server;Snoop;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_remote_dram", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", + "MetricExpr": "(200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 180 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + 
MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / CLKS", + "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_remote_cache", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues. This is caused often due to non-optimal NUMA allocations. #link to NUMA article Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM_PS;MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "RESOURCE_STALLS.SB / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_UOPS_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "((L2_RQSTS.RFO_HIT * 9 * (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES))) + (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_store_latency", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. 
Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "(200 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM + 60 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / CORE_CLKS", + "MetricGroup": "TopdownL4;tma_store_bound_group", + "MetricName": "tma_split_stores", + "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_UOPS_RETIRED.SPLIT_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_STORE_MISSES.WALK_COMPLETED) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. 
As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_UOPS_RETIRED.STLB_MISS_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck", + "MetricExpr": "tma_backend_bound - tma_memory_bound", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "ARITH.FPU_DIV_ACTIVE / CORE_CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. 
Sample with: ARITH.DIVIDER_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "((CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if (IPC > 1.8) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB) - RESOURCE_STALLS.SB - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@) / 2 if #SMT_on else (CYCLE_ACTIVITY.STALLS_TOTAL - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else 0) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). 
Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or over oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). 
Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / (4 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED_PORT.PORT_0", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / CORE_CLKS", + "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_0", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_1", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_1", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU) Sample with: UOPS_DISPATCHED.PORT_5", + 
"MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_5", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_6", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_6", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_2", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_3", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": 
"TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data) Sample with: UOPS_DISPATCHED_PORT.PORT_4", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_4", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address) Sample with: UOPS_DISPATCHED_PORT.PORT_7", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_7", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Retiring", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. " + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. 
Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Retiring_SMT", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "tma_retiring - tma_heavy_operations", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", + "MetricGroup": "HPC;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fp_arith", + "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "INST_RETIRED.X87 * UPI / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. 
It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_scalar", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_vector", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_128b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_256b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "tma_microcode_sequencer", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. 
Sample with: IDQ.MS_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / SLOTS", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. Sample with: OTHER_ASSISTS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_cisc", + "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. 
Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.", + "ScaleUnit": "100%" }, { "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC" }, @@ -74,6 +586,12 @@ "MetricGroup": "Branches;Fed;FetchBW", "MetricName": "UpTB" }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", + "MetricName": "CPI" + }, { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "CPU_CLK_UNHALTED.THREAD", @@ -82,16 +600,10 @@ }, { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "TmaL1", + "MetricExpr": "4 * CORE_CLKS", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS" }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "TmaL1_SMT", - "MetricName": "SLOTS_SMT" - }, { "BriefDescription": "The ratio of Executed- by Issued-Uops", "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", @@ -101,51 +613,32 @@ }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC" }, - { - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + 
CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;SMT;TmaL1_SMT", - "MetricName": "CoreIPC_SMT" - }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;Flops", + "MetricExpr": "(1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / CORE_CLKS", + "MetricGroup": "Flops;Ret", "MetricName": "FLOPc" }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;Flops_SMT", - "MetricName": "FLOPc_SMT" - }, { "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * CPU_CLK_UNHALTED.THREAD )", + "MetricExpr": "((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 
(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)) / (2 * CORE_CLKS)", "MetricGroup": "Cor;Flops;HPC", "MetricName": "FP_Arith_Utilization", "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, - { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) )", - "MetricGroup": "Cor;Flops;HPC_SMT", - "MetricName": "FP_Arith_Utilization_SMT", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common). SMT version; use when SMT is enabled and measuring per logical CPU." 
- }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", + "MetricExpr": "((CPU_CLK_UNHALTED.THREAD / 2) * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK)) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2) if #SMT_on else CLKS", "MetricGroup": "SMT", "MetricName": "CORE_CLKS" }, @@ -187,13 +680,13 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "IpFLOP" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) 
+ (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) )", + "MetricExpr": "INST_RETIRED.ANY / ((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE))", "MetricGroup": "Flops;InsType", "MetricName": "IpArith", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." @@ -214,22 +707,22 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX128", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX256", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
}, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions" }, { @@ -246,7 +739,7 @@ }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", - "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )", + "MetricExpr": "IDQ.DSB_UOPS / ((IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS))", "MetricGroup": "DSB;Fed;FetchBW", "MetricName": "DSB_Coverage" }, @@ -258,83 +751,71 @@ }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) ) * (4 * CPU_CLK_UNHALTED.THREAD) / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": " (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BrMispredicts", "MetricName": "Branch_Misprediction_Cost" }, - { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch 
misprediction (retired JEClear)", - "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) ) * (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts_SMT", - "MetricName": "Branch_Misprediction_Cost_SMT" - }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb)", "MetricGroup": "Mem;MemoryBound;MemoryLat", "MetricName": "Load_Miss_Real_Latency" }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. 
Per-Logical Processor)", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBound;MemoryBW", + "MetricGroup": "Mem;MemoryBW;MemoryBound", "MetricName": "MLP" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;Backend;CacheMisses", + "MetricGroup": "Backend;CacheMisses;Mem", "MetricName": "L2MPKI" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses;Offcore", + "MetricGroup": "CacheMisses;Mem;Offcore", "MetricName": "L2MPKI_All" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2MPKI_Load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", - "MetricExpr": "1000 * ( L2_RQSTS.REFERENCES - L2_RQSTS.MISS ) / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricExpr": "1000 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_All" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_Load" }, { 
"BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L3MPKI" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / ( 2 * CPU_CLK_UNHALTED.THREAD )", + "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7 * (DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED)) / (2 * CORE_CLKS)", "MetricGroup": "Mem;MemoryTLB", "MetricName": "Page_Walks_Utilization" }, - { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / ( 2 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) )", - "MetricGroup": "Mem;MemoryTLB_SMT", - "MetricName": "Page_Walks_Utilization_SMT" - }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time", @@ -355,19 +836,19 @@ }, { "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricExpr": "L1D_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": 
"L1D_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricExpr": "L2_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L2_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L3_Cache_Fill_BW_1T" }, @@ -385,26 +866,26 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 ) / duration_time", + "MetricExpr": "((1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1000000000) / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "GFLOPs", "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." 
}, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", - "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0", "MetricGroup": "SMT", "MetricName": "SMT_2T_Utilization" }, @@ -422,13 +903,13 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "( 64 * ( uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@ ) / 1000000000 ) / duration_time", + "MetricExpr": "(64 * (uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@) / 1000000000) / duration_time", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use" }, { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds). 
Accounts for demand loads and L1/L2 prefetches", - "MetricExpr": "1000000000 * ( cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182@ / cbox@event\\=0x35\\,umask\\=0x3\\,filter_opc\\=0x182@ ) / ( cbox_0@event\\=0x0@ / duration_time )", + "MetricExpr": "1000000000 * (cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182@ / cbox@event\\=0x35\\,umask\\=0x3\\,filter_opc\\=0x182@) / (Socket_CLKS / duration_time)", "MetricGroup": "Mem;MemoryLat;SoC", "MetricName": "MEM_Read_Latency" }, @@ -444,12 +925,6 @@ "MetricGroup": "SoC", "MetricName": "Socket_CLKS" }, - { - "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "cbox_0@event\\=0x0@ / #num_dies / duration_time / 1000000000", - "MetricGroup": "SoC", - "MetricName": "UNCORE_FREQ" - }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", @@ -498,20 +973,19 @@ "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency" }, + { + "BriefDescription": "Uncore frequency per die [GHZ]", + "MetricExpr": "Socket_CLKS / #num_dies / duration_time / 1000000000", + "MetricGroup": "SoC", + "MetricName": "UNCORE_FREQ" + }, { "BriefDescription": "CPU operating frequency (in GHz)", - "MetricExpr": "( CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ ) / 1000000000", + "MetricExpr": "(( CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ ) / 1000000000) / duration_time", "MetricGroup": "", "MetricName": "cpu_operating_frequency", "ScaleUnit": "1GHz" }, - { - "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY", - "MetricGroup": "", - "MetricName": "cpi", - "ScaleUnit": "1per_instr" - }, { "BriefDescription": "The ratio of number of 
completed memory load instructions to the total number completed instructions", "MetricExpr": "MEM_UOPS_RETIRED.ALL_LOADS / INST_RETIRED.ANY", @@ -530,7 +1004,7 @@ "BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions", "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "l1d_mpi_includes_data_plus_rfo_with_prefetches", + "MetricName": "l1d_mpi", "ScaleUnit": "1per_instr" }, { @@ -558,7 +1032,7 @@ "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions", "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "l2_mpi_includes_code_plus_data_plus_rfo_with_prefetches", + "MetricName": "l2_mpi", "ScaleUnit": "1per_instr" }, { @@ -591,21 +1065,21 @@ }, { "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) in nano seconds", - "MetricExpr": "( 1000000000 * ( cbox@UNC_C_TOR_OCCUPANCY.MISS_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ ) / ( UNC_C_CLOCKTICKS / ( source_count(UNC_C_CLOCKTICKS) * #num_packages ) ) ) * duration_time", + "MetricExpr": "( 1000000000 * ( cbox@UNC_C_TOR_OCCUPANCY.MISS_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ ) / ( UNC_C_CLOCKTICKS / ( #num_cores / #num_packages * #num_packages ) ) ) * duration_time", "MetricGroup": "", "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency", "ScaleUnit": "1ns" }, { "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to local memory in nano seconds", - "MetricExpr": "( 1000000000 * ( cbox@UNC_C_TOR_OCCUPANCY.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ ) / ( 
UNC_C_CLOCKTICKS / ( source_count(UNC_C_CLOCKTICKS) * #num_packages ) ) ) * duration_time", + "MetricExpr": "( 1000000000 * ( cbox@UNC_C_TOR_OCCUPANCY.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ ) / ( UNC_C_CLOCKTICKS / ( #num_cores / #num_packages * #num_packages ) ) ) * duration_time", "MetricGroup": "", "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_local_requests", "ScaleUnit": "1ns" }, { "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to remote memory in nano seconds", - "MetricExpr": "( 1000000000 * ( cbox@UNC_C_TOR_OCCUPANCY.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ ) / ( UNC_C_CLOCKTICKS / ( source_count(UNC_C_CLOCKTICKS) * #num_packages ) ) ) * duration_time", + "MetricExpr": "( 1000000000 * ( cbox@UNC_C_TOR_OCCUPANCY.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ ) / ( UNC_C_CLOCKTICKS / ( #num_cores / #num_packages * #num_packages ) ) ) * duration_time", "MetricGroup": "", "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_remote_requests", "ScaleUnit": "1ns" @@ -640,21 +1114,21 @@ }, { "BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", - "MetricExpr": "100 * cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ / ( cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ )", + "MetricExpr": "100 * cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ / ( cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ )", "MetricGroup": "", - "MetricName": "numa_percent_reads_addressed_to_local_dram", + 
"MetricName": "numa_reads_addressed_to_local_dram", "ScaleUnit": "1%" }, { "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", - "MetricExpr": "100 * cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ / ( cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ )", + "MetricExpr": "100 * cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ / ( cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ )", "MetricGroup": "", - "MetricName": "numa_percent_reads_addressed_to_remote_dram", + "MetricName": "numa_reads_addressed_to_remote_dram", "ScaleUnit": "1%" }, { "BriefDescription": "Uncore operating frequency in GHz", - "MetricExpr": "UNC_C_CLOCKTICKS / ( source_count(UNC_C_CLOCKTICKS) * #num_packages ) / 1000000000", + "MetricExpr": "( UNC_C_CLOCKTICKS / ( #num_cores / #num_packages * #num_packages ) / 1000000000) / duration_time", "MetricGroup": "", "MetricName": "uncore_frequency", "ScaleUnit": "1GHz" @@ -663,7 +1137,7 @@ "BriefDescription": "Intel(R) Quick Path Interconnect (QPI) data transmit bandwidth (MB/sec)", "MetricExpr": "( UNC_Q_TxL_FLITS_G0.DATA * 8 / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "qpi_data_transmit_bw_only_data", + "MetricName": "qpi_data_transmit_bw", "ScaleUnit": "1MB/s" }, { @@ -691,245 +1165,42 @@ "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.", "MetricExpr": "( cbox@UNC_C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x19e@ * 64 / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "io_bandwidth_read", + "MetricName": "io_bandwidth_disk_or_network_writes", "ScaleUnit": "1MB/s" }, { "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers 
that are writing memory to the CPU.", "MetricExpr": "(( cbox@UNC_C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x1c8\\,filter_tid\\=0x3e@ + cbox@UNC_C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x180\\,filter_tid\\=0x3e@ ) * 64 / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "io_bandwidth_write", + "MetricName": "io_bandwidth_disk_or_network_reads", "ScaleUnit": "1MB/s" }, { "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue", "MetricExpr": "100 * ( IDQ.DSB_UOPS / UOPS_ISSUED.ANY )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_from_decoded_icache_dsb", + "MetricName": "percent_uops_delivered_from_decoded_icache", "ScaleUnit": "1%" }, { "BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue", "MetricExpr": "100 * ( IDQ.MITE_UOPS / UOPS_ISSUED.ANY )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline_mite", + "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline", "ScaleUnit": "1%" }, { "BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue", "MetricExpr": "100 * ( IDQ.MS_UOPS / UOPS_ISSUED.ANY )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_from_microcode_sequencer_ms", + "MetricName": "percent_uops_delivered_from_microcode_sequencer", "ScaleUnit": "1%" }, { "BriefDescription": "Uops delivered from loop stream detector(LSD) as a percent of total uops delivered to Instruction Decode Queue", "MetricExpr": "100 * ( LSD.UOPS / UOPS_ISSUED.ANY )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_from_loop_stream_detector_lsd", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. 
Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", - "MetricExpr": "100 * ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "TmaL1;PGO", - "MetricName": "tma_frontend_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. 
In such cases; the Frontend eventually delivers no uops for some period.", - "MetricExpr": "100 * ( ( 4 ) * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "Frontend;TmaL2;m_tma_frontend_bound_percent", - "MetricName": "tma_fetch_latency_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", - "MetricExpr": "100 * ( ICACHE.IFDATA_STALL / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_icache_misses_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses.", - "MetricExpr": "100 * ( ( 14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=0x1@ + 7 * ITLB_MISSES.WALK_COMPLETED ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_itlb_misses_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. 
Note the value of this node may overlap with its siblings.", - "MetricExpr": "100 * ( ( 12 ) * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "FetchLat;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_branch_resteers_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.", - "MetricExpr": "100 * ( DSB2MITE_SWITCHES.PENALTY_CYCLES / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "DSBmiss;FetchLat;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_dsb_switches_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", - "MetricExpr": "100 * ( ILD_STALL.LCP / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "FetchLat;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_lcp_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). 
Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals.", - "MetricExpr": "100 * ( ( 2 ) * IDQ.MS_SWITCHES / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "FetchLat;MicroSeq;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_ms_switches_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.", - "MetricExpr": "100 * ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( 4 ) * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )", - "MetricGroup": "FetchBW;Frontend;TmaL2;m_tma_frontend_bound_percent", - "MetricName": "tma_fetch_bandwidth_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. 
For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", - "MetricExpr": "100 * ( ( IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS ) / ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) / 2 )", - "MetricGroup": "DSBmiss;FetchBW;TmaL3;m_tma_fetch_bandwidth_percent", - "MetricName": "tma_mite_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", - "MetricExpr": "100 * ( ( IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS ) / ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) / 2 )", - "MetricGroup": "DSB;FetchBW;TmaL3;m_tma_fetch_bandwidth_percent", - "MetricName": "tma_dsb_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", - "MetricExpr": "100 * ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "TmaL1", - "MetricName": "tma_bad_speculation_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. 
These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path.", - "MetricExpr": "100 * ( ( BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT ) ) * ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )", - "MetricGroup": "BadSpec;BrMispredicts;TmaL2;m_tma_bad_speculation_percent", - "MetricName": "tma_branch_mispredicts_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. 
Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.", - "MetricExpr": "100 * ( ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT ) ) * ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) )", - "MetricGroup": "BadSpec;MachineClears;TmaL2;m_tma_bad_speculation_percent", - "MetricName": "tma_machine_clears_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. 
Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", - "MetricExpr": "100 * ( 1 - ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) )", - "MetricGroup": "TmaL1", - "MetricName": "tma_backend_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", - "MetricExpr": "100 * ( ( ( CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB ) / ( ( CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - ( UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if ( ( INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD ) ) > 1.8 ) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC ) - ( RS_EVENTS.EMPTY_CYCLES if ( ( ( 4 ) * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) > 0.1 ) else 0 ) + RESOURCE_STALLS.SB ) ) ) * ( 1 - ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( 
( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) ) )", - "MetricGroup": "Backend;TmaL2;m_tma_backend_bound_percent", - "MetricName": "tma_memory_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache.", - "MetricExpr": "100 * ( max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) , 0 ) )", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_l1_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance.", - "MetricExpr": "100 * ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_l2_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. 
L2 misses/L3 hits) can improve the latency and increase performance.", - "MetricExpr": "100 * ( ( MEM_LOAD_UOPS_RETIRED.L3_HIT / ( MEM_LOAD_UOPS_RETIRED.L3_HIT + ( 7 ) * MEM_LOAD_UOPS_RETIRED.L3_MISS ) ) * CYCLE_ACTIVITY.STALLS_L2_MISS / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_l3_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance.", - "MetricExpr": "100 * ( min( ( ( 1 - ( MEM_LOAD_UOPS_RETIRED.L3_HIT / ( MEM_LOAD_UOPS_RETIRED.L3_HIT + ( 7 ) * MEM_LOAD_UOPS_RETIRED.L3_MISS ) ) ) * CYCLE_ACTIVITY.STALLS_L2_MISS / ( CPU_CLK_UNHALTED.THREAD ) ) , ( 1 ) ) )", - "MetricGroup": "MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_dram_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck.", - "MetricExpr": "100 * ( RESOURCE_STALLS.SB / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_store_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. 
Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", - "MetricExpr": "100 * ( ( 1 - ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) ) - ( ( ( CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB ) / ( ( CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - ( UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if ( ( INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD ) ) > 1.8 ) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC ) - ( RS_EVENTS.EMPTY_CYCLES if ( ( ( 4 ) * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) > 0.1 ) else 0 ) + RESOURCE_STALLS.SB ) ) ) * ( 1 - ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) ) ) )", - "MetricGroup": "Backend;TmaL2;Compute;m_tma_backend_bound_percent", - "MetricName": "tma_core_bound_percent", - "ScaleUnit": "1%" - }, - { - 
"BriefDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication.", - "MetricExpr": "100 * ( ARITH.FPU_DIV_ACTIVE / ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) )", - "MetricGroup": "TmaL3;m_tma_core_bound_percent", - "MetricName": "tma_divider_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.", - "MetricExpr": "100 * ( ( ( ( CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - ( UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if ( ( INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD ) ) > 1.8 ) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC ) - ( RS_EVENTS.EMPTY_CYCLES if ( ( ( 4 ) * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) > 0.1 ) else 0 ) + RESOURCE_STALLS.SB ) ) - RESOURCE_STALLS.SB - CYCLE_ACTIVITY.STALLS_MEM_ANY ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "PortsUtil;TmaL3;m_tma_core_bound_percent", - "MetricName": "tma_ports_utilization_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. 
Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. ", - "MetricExpr": "100 * ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "TmaL1", - "MetricName": "tma_retiring_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved.", - "MetricExpr": "100 * ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) )", - "MetricGroup": "Retire;TmaL2;m_tma_retiring_percent", - "MetricName": "tma_light_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). 
Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", - "MetricExpr": "100 * ( ( INST_RETIRED.X87 * ( ( UOPS_RETIRED.RETIRE_SLOTS ) / INST_RETIRED.ANY ) / ( UOPS_RETIRED.RETIRE_SLOTS ) ) + ( ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) / ( UOPS_RETIRED.RETIRE_SLOTS ) ) + ( min( ( ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / ( UOPS_RETIRED.RETIRE_SLOTS ) ) , ( 1 ) ) ) )", - "MetricGroup": "HPC;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_fp_arith_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.", - "MetricExpr": "100 * ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )", - "MetricGroup": "Retire;TmaL2;m_tma_retiring_percent", - "MetricName": "tma_heavy_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). 
These cases can often be avoided.", - "MetricExpr": "100 * ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "MicroSeq;TmaL3;m_tma_heavy_operations_percent", - "MetricName": "tma_microcode_sequencer_percent", + "MetricName": "percent_uops_delivered_from_loop_stream_detector", "ScaleUnit": "1%" } ] diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-cache.json b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-cache.json index abee6f773c1fd..449fa723d0aa7 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-cache.json @@ -947,21 +947,19 @@ "Unit": "CBO" }, { - "BriefDescription": "LLC misses - demand and prefetch data reads - excludes LLC prefetches. Derived from unc_c_tor_inserts.miss_opcode", + "BriefDescription": "TOR Inserts; Miss Opcode Match", "Counter": "0,1,2,3", "EventCode": "0x35", - "EventName": "LLC_MISSES.DATA_READ", - "Filter": "filter_opc=0x182", + "EventName": "UNC_C_TOR_INSERTS.MISS_OPCODE", "PerPkg": "1", - "ScaleUnit": "64Bytes", "UMask": "0x3", "Unit": "CBO" }, { - "BriefDescription": "LLC misses - demand and prefetch data reads - excludes LLC prefetches", + "BriefDescription": "LLC misses - demand and prefetch data reads - excludes LLC prefetches. 
Derived from unc_c_tor_inserts.miss_opcode", "Counter": "0,1,2,3", "EventCode": "0x35", - "EventName": "UNC_C_TOR_INSERTS.MISS_OPCODE", + "EventName": "LLC_MISSES.DATA_READ", "Filter": "filter_opc=0x182", "PerPkg": "1", "ScaleUnit": "64Bytes", diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-interconnect.json index 071ce45620d24..cb1916f526074 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-interconnect.json @@ -685,36 +685,34 @@ "Unit": "QPI LL" }, { - "BriefDescription": "Number of data flits transmitted . Derived from unc_q_txl_flits_g0.data", + "BriefDescription": "Flits Transferred - Group 0; Data Tx Flits", "Counter": "0,1,2,3", - "EventName": "QPI_DATA_BANDWIDTH_TX", + "EventName": "UNC_Q_TxL_FLITS_G0.DATA", "PerPkg": "1", - "ScaleUnit": "8Bytes", "UMask": "0x2", "Unit": "QPI LL" }, { - "BriefDescription": "Number of data flits transmitted ", + "BriefDescription": "Number of data flits transmitted . Derived from unc_q_txl_flits_g0.data", "Counter": "0,1,2,3", - "EventName": "UNC_Q_TxL_FLITS_G0.DATA", + "EventName": "QPI_DATA_BANDWIDTH_TX", "PerPkg": "1", "ScaleUnit": "8Bytes", "UMask": "0x2", "Unit": "QPI LL" }, { - "BriefDescription": "Number of non data (control) flits transmitted . Derived from unc_q_txl_flits_g0.non_data", + "BriefDescription": "Flits Transferred - Group 0; Non-Data protocol Tx Flits", "Counter": "0,1,2,3", - "EventName": "QPI_CTL_BANDWIDTH_TX", + "EventName": "UNC_Q_TxL_FLITS_G0.NON_DATA", "PerPkg": "1", - "ScaleUnit": "8Bytes", "UMask": "0x4", "Unit": "QPI LL" }, { - "BriefDescription": "Number of non data (control) flits transmitted ", + "BriefDescription": "Number of non data (control) flits transmitted . 
Derived from unc_q_txl_flits_g0.non_data", "Counter": "0,1,2,3", - "EventName": "UNC_Q_TxL_FLITS_G0.NON_DATA", + "EventName": "QPI_CTL_BANDWIDTH_TX", "PerPkg": "1", "ScaleUnit": "8Bytes", "UMask": "0x4", diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-memory.json b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-memory.json index 302e956a82ed1..05fab7d2723ea 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-memory.json +++ b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-memory.json @@ -72,20 +72,19 @@ "Unit": "iMC" }, { - "BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd", + "BriefDescription": "DRAM RD_CAS and WR_CAS Commands.; All DRAM Reads (RD_CAS + Underfills)", "Counter": "0,1,2,3", "EventCode": "0x4", - "EventName": "LLC_MISSES.MEM_READ", + "EventName": "UNC_M_CAS_COUNT.RD", "PerPkg": "1", - "ScaleUnit": "64Bytes", "UMask": "0x3", "Unit": "iMC" }, { - "BriefDescription": "read requests to memory controller", + "BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd", "Counter": "0,1,2,3", "EventCode": "0x4", - "EventName": "UNC_M_CAS_COUNT.RD", + "EventName": "LLC_MISSES.MEM_READ", "PerPkg": "1", "ScaleUnit": "64Bytes", "UMask": "0x3", @@ -110,20 +109,19 @@ "Unit": "iMC" }, { - "BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr", + "BriefDescription": "DRAM RD_CAS and WR_CAS Commands.; All DRAM WR_CAS (both Modes)", "Counter": "0,1,2,3", "EventCode": "0x4", - "EventName": "LLC_MISSES.MEM_WRITE", + "EventName": "UNC_M_CAS_COUNT.WR", "PerPkg": "1", - "ScaleUnit": "64Bytes", "UMask": "0xC", "Unit": "iMC" }, { - "BriefDescription": "write requests to memory controller", + "BriefDescription": "write requests to memory controller. 
Derived from unc_m_cas_count.wr", "Counter": "0,1,2,3", "EventCode": "0x4", - "EventName": "UNC_M_CAS_COUNT.WR", + "EventName": "LLC_MISSES.MEM_WRITE", "PerPkg": "1", "ScaleUnit": "64Bytes", "UMask": "0xC", -- GitLab From 55b201a833664bf6bd4dc17ac3e75882a34daacf Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:15:58 -0700 Subject: [PATCH 1418/2223] perf vendor events: Update Intel cascadelakex Events remain at v1.16, and the metrics are based on TMA 4.4 full. Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Removal of ScaleUnit from uncore events by Zhengjun Xing <zhengjun.xing@linux.intel.com>. - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. - _SMT suffix metrics are dropped as the #SMT_On and #EBS_Mode are correctly expanded in the single main metric. - Addition of all 6 levels of TMA metrics. Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. 
- Latest metrics from: https://github.com/intel/perfmon-metrics Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-10-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../arch/x86/cascadelakex/clx-metrics.json | 1285 ++++++++++------- .../arch/x86/cascadelakex/uncore-memory.json | 18 +- .../arch/x86/cascadelakex/uncore-other.json | 10 +- 3 files changed, 787 insertions(+), 526 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json index 46613504b816b..81de1149297da 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json @@ -1,148 +1,742 @@ [ { "BriefDescription": "This category represents fraction of slots where the processor's 
Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Frontend_Bound", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound." + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. 
Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Frontend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / SLOTS", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. 
In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", + "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / CLKS", + "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "ICACHE_64B.IFTAG_STALL / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / CLKS + tma_unknown_branches", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. 
Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", + "MetricExpr": "(1 - (BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT))) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS", + "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_clears_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", + "MetricExpr": "9 * BACLEARS.ANY / CLKS", + "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_unknown_branches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). 
Sample with: BACLEARS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "ILD_STALL.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "2 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). 
Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "tma_frontend_bound - tma_fetch_latency", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. 
For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / CORE_CLKS", + "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_mite_group", + "MetricName": "tma_decoder0_alone", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_dsb", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Bad_Speculation", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example." 
+ "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This includes slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to mispredicted branches is categorized under the Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Bad_Speculation_SMT", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. 
Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Backend_Bound", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound." + "MetricExpr": "1 - tma_frontend_bound - (UOPS_ISSUED.ANY + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. 
Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "((CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * tma_backend_bound", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. 
These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "min(9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. 
Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the (first level) DTLB was missed by load accesses, that later on hit in second-level TLB (STLB)", + "MetricExpr": "tma_dtlb_load - tma_load_stlb_miss", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_hit", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / CLKS", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_miss", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. 
For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * (11 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / CLKS", + "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_4k_aliasing", + "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. 
However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "Load_Miss_Real_Latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CLKS", + "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "((MEM_LOAD_RETIRED.L2_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / ((MEM_LOAD_RETIRED.L2_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@)) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. 
Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to load accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to load accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", + "MetricExpr": "((44 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD))) + (44 * Average_Frequency) * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_contested_accesses", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "(44 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD)))) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "(17 * Average_Frequency) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. 
Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "((OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2) if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / CORE_CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "((CYCLE_ACTIVITY.STALLS_L3_MISS / CLKS + ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS) - tma_l2_bound) - tma_pmm_bound)", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. 
Sample with: MEM_LOAD_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). 
This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", + "MetricExpr": "(59.5 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Server;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_local_dram", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", + "MetricExpr": "(127 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Server;Snoop;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_remote_dram", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. 
#link to NUMA article Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", + "MetricExpr": "((89.5 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + (89.5 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_remote_cache", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues. This is caused often due to non-optimal NUMA allocations. #link to NUMA article Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM_PS;MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a", + "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + 10 * ((MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))) / ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + 10 * ((MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + 
(MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))))) * (CYCLE_ACTIVITY.STALLS_L3_MISS / CLKS + ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS) - tma_l2_bound)) if (1000000 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS) else 0)", + "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_pmm_bound", + "PublicDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a. IXP) memory by loads, PMM stands for Persistent Memory Module. ", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. 
Sample with: MEM_INST_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "((L2_RQSTS.RFO_HIT * 11 * (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES))) + (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_store_latency", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "((110 * Average_Frequency) * (OCR.DEMAND_RFO.L3_MISS.REMOTE_HITM + OCR.PF_L2_RFO.L3_MISS.REMOTE_HITM) + (47.5 * Average_Frequency) * (OCR.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OCR.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / CORE_CLKS", + "MetricGroup": "TopdownL4;tma_store_bound_group", + "MetricName": "tma_split_stores", + "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "(9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / CORE_CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Backend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric roughly estimates the fraction of cycles where the TLB was missed by store accesses, hitting in the second-level TLB (STLB)", + "MetricExpr": "tma_dtlb_store - tma_store_stlb_miss", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_hit", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / CORE_CLKS", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_miss", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck", + "MetricExpr": "tma_backend_bound - tma_memory_bound", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "ARITH.DIVIDER_ACTIVE / CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. 
Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "(EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CLKS if (ARITH.DIVIDER_ACTIVE < (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY)) else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). 
Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", + "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / CLKS", + "MetricGroup": "TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_serializing_operation", + "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: PARTIAL_RAT_STALLS.SCOREBOARD", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", + "MetricExpr": "40 * ROB_MISC_EVENTS.PAUSE_INST / CLKS", + "MetricGroup": "TopdownL6;tma_serializing_operation_group", + "MetricName": "tma_slow_pause", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: MISC_RETIRED.PAUSE_INST", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "MetricExpr": "CLKS * UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY", + "MetricGroup": "TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_mixing_vectors", + "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. 
Read more in Appendix B1 of the Optimizations Guide for this topic.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_1 - UOPS_EXECUTED.CORE_CYCLES_GE_2) / 2 if #SMT_on else EXE_ACTIVITY.1_PORTS_UTIL) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_2 - UOPS_EXECUTED.CORE_CYCLES_GE_3) / 2 if #SMT_on else EXE_ACTIVITY.2_PORTS_UTIL) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). 
Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", + "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / (4 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED_PORT.PORT_0", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / CORE_CLKS", + "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_0", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_1", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_1", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU) Sample with: UOPS_DISPATCHED.PORT_5", + 
"MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_5", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_6", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_6", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_2", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_3", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": 
"TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data) Sample with: UOPS_DISPATCHED_PORT.PORT_4", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_4", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address) Sample with: UOPS_DISPATCHED_PORT.PORT_7", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_7", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Retiring", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. " + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. 
Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Retiring_SMT", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "tma_retiring - tma_heavy_operations", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%" }, { - "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", - "MetricExpr": "100 * ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) )", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "Mispredictions" + "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", + "MetricGroup": "HPC;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fp_arith", + 
"PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "tma_retiring * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_scalar", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_vector", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_128b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_256b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_512b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", + "MetricExpr": "tma_light_operations * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_memory_operations", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", + "MetricExpr": "tma_light_operations * UOPS_RETIRED.MACRO_FUSED / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fused_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. 
The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", + "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - UOPS_RETIRED.MACRO_FUSED) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_non_fused_branches", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", + "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_nop_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. 
May undercount due to FMA double counting", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_other_light_ops", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY) / SLOTS", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops", + "MetricExpr": "tma_heavy_operations - tma_microcode_sequencer", + "MetricGroup": "TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_few_uops_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops. 
This highly-correlates with the number of uops in such instructions.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / SLOTS", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. 
Sample with: OTHER_ASSISTS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_cisc", + "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.", + "ScaleUnit": "100%" }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", - "MetricExpr": "100 * ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) )", - "MetricGroup": "Bad;BadSpec;BrMispredicts_SMT", - "MetricName": "Mispredictions_SMT" + "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + 
tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "Mispredictions" }, { "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) * ( ( ( (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) - ( ( ( 1 - ( ( 19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + 10 * ( (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) ) ) / ( ( 19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + 10 * ( (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / 
MEM_LOAD_RETIRED.L1_MISS) )) ) ) + ( 25 * ( MEM_LOAD_RETIRED.LOCAL_PMM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) ) ) + 33 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) ) ) ) ) ) ) * (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) ) if ( 1000000 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM ) > MEM_LOAD_RETIRED.L1_MISS ) else 0 ) ) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (min( CPU_CLK_UNHALTED.THREAD , cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@ ) / CPU_CLK_UNHALTED.THREAD) / #( (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) - ( ( ( 1 - ( ( 19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / 
MEM_LOAD_RETIRED.L1_MISS) )) + 10 * ( (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) ) ) / ( ( 19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + 10 * ( (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) ) ) + ( 25 * ( MEM_LOAD_RETIRED.LOCAL_PMM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) ) ) + 33 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) ) ) ) ) ) ) * (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) ) if ( 1000000 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM ) > MEM_LOAD_RETIRED.L1_MISS ) else 0 ) ) ) + ( (( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - 
(IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (OFFCORE_REQUESTS_BUFFER.SQ_FULL / CPU_CLK_UNHALTED.THREAD) / #(( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) ) ) + ( (max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( ((L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )) * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CPU_CLK_UNHALTED.THREAD) / #(max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) ) ", + "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) ", "MetricGroup": "Mem;MemoryBW;Offcore", "MetricName": "Memory_Bandwidth" }, - { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricExpr": "100 * ((( 
CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) * ( ( ( (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) - ( ( ( 1 - ( ( 19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + 10 * ( (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) ) ) / ( ( 19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + 10 * ( (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + 
(MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) ) ) + ( 25 * ( MEM_LOAD_RETIRED.LOCAL_PMM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) ) ) + 33 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) ) ) ) ) ) ) * (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) ) if ( 1000000 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM ) > MEM_LOAD_RETIRED.L1_MISS ) else 0 ) ) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (min( CPU_CLK_UNHALTED.THREAD , cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@ ) / CPU_CLK_UNHALTED.THREAD) / #( (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + 
(MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) - ( ( ( 1 - ( ( 19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + 10 * ( (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) ) ) / ( ( 19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + 10 * ( (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) ) ) + ( 25 * ( MEM_LOAD_RETIRED.LOCAL_PMM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) ) ) + 33 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) ) ) ) ) ) ) * (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) ) if ( 1000000 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM ) > MEM_LOAD_RETIRED.L1_MISS ) else 0 ) ) ) + ( (( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / 
CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (( OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / #(( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) ) ) + ( (max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( ((L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )) * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CPU_CLK_UNHALTED.THREAD) / #(max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / 
CPU_CLK_UNHALTED.THREAD , 0 )) ) ", - "MetricGroup": "Mem;MemoryBW;Offcore_SMT", - "MetricName": "Memory_Bandwidth_SMT" - }, { "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) * ( ( ( (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) - ( ( ( 1 - ( ( 19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + 10 * ( (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) ) ) / ( ( 19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + 10 * ( (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + 
(MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) ) ) + ( 25 * ( MEM_LOAD_RETIRED.LOCAL_PMM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) ) ) + 33 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) ) ) ) ) ) ) * (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) ) if ( 1000000 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM ) > MEM_LOAD_RETIRED.L1_MISS ) else 0 ) ) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (min( CPU_CLK_UNHALTED.THREAD , OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD ) / CPU_CLK_UNHALTED.THREAD - (min( CPU_CLK_UNHALTED.THREAD , cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@ ) / CPU_CLK_UNHALTED.THREAD)) / #( (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS 
) / CPU_CLK_UNHALTED.THREAD))) - ( ( ( 1 - ( ( 19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + 10 * ( (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) ) ) / ( ( 19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + 10 * ( (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) ) ) + ( 25 * ( MEM_LOAD_RETIRED.LOCAL_PMM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) ) ) + 33 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) ) ) ) ) ) ) * (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) ) if ( 1000000 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM ) > MEM_LOAD_RETIRED.L1_MISS ) else 0 ) ) ) + ( (( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + 
(UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (( (20.5 * ((CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time)) - (3.5 * ((CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time)) ) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CPU_CLK_UNHALTED.THREAD) / #(( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) ) + ( (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD)) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) )", + "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + (tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + 
tma_store_bound)))", "MetricGroup": "Mem;MemoryLat;Offcore", "MetricName": "Memory_Latency" }, - { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) * ( ( ( (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) - ( ( ( 1 - ( ( 19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + 10 * ( (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) ) ) / ( ( 19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + 10 * ( 
(MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) ) ) + ( 25 * ( MEM_LOAD_RETIRED.LOCAL_PMM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) ) ) + 33 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) ) ) ) ) ) ) * (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) ) if ( 1000000 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM ) > MEM_LOAD_RETIRED.L1_MISS ) else 0 ) ) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (min( CPU_CLK_UNHALTED.THREAD , OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD ) / CPU_CLK_UNHALTED.THREAD - (min( CPU_CLK_UNHALTED.THREAD , 
cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@ ) / CPU_CLK_UNHALTED.THREAD)) / #( (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) - ( ( ( 1 - ( ( 19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + 10 * ( (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) ) ) / ( ( 19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + 10 * ( (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) ) ) + ( 25 * ( MEM_LOAD_RETIRED.LOCAL_PMM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) ) ) + 33 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) ) ) ) ) ) ) * (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / 
MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) ) if ( 1000000 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM ) > MEM_LOAD_RETIRED.L1_MISS ) else 0 ) ) ) + ( (( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (( (20.5 * ((CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time)) - (3.5 * ((CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time)) ) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CPU_CLK_UNHALTED.THREAD) / #(( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) ) + ( (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD)) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + 
(UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) )", - "MetricGroup": "Mem;MemoryLat;Offcore_SMT", - "MetricName": "Memory_Latency_SMT" - }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) * ( ( (max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) / ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (min( 9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE , max( CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS , 0 ) ) / CPU_CLK_UNHALTED.THREAD) / (max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 
0 )) ) + ( (EXE_ACTIVITY.BOUND_ON_STORES / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (( 9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE ) / CPU_CLK_UNHALTED.THREAD) / #(EXE_ACTIVITY.BOUND_ON_STORES / CPU_CLK_UNHALTED.THREAD) ) ) ", + "MetricExpr": "100 * tma_memory_bound * ((tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency))) ", "MetricGroup": "Mem;MemoryTLB;Offcore", "MetricName": "Memory_Data_TLBs" }, - { - "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( 
CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) * ( ( (max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) / ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (min( 9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE , max( CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS , 0 ) ) / CPU_CLK_UNHALTED.THREAD) / (max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) ) + ( (EXE_ACTIVITY.BOUND_ON_STORES / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (( 9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + 
DTLB_STORE_MISSES.WALK_ACTIVE ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / #(EXE_ACTIVITY.BOUND_ON_STORES / CPU_CLK_UNHALTED.THREAD) ) ) ", - "MetricGroup": "Mem;MemoryTLB;Offcore_SMT", - "MetricName": "Memory_Data_TLBs_SMT" - }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * (( BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - ( BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN ) - 2 * BR_INST_RETIRED.NEAR_CALL) ) / (4 * CPU_CLK_UNHALTED.THREAD))", + "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / SLOTS)", "MetricGroup": "Ret", "MetricName": "Branching_Overhead" }, - { - "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * (( BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - ( BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN ) - 2 * BR_INST_RETIRED.NEAR_CALL) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))", - "MetricGroup": "Ret_SMT", - "MetricName": "Branching_Overhead_SMT" - }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", - "MetricExpr": "100 * (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * ( (ICACHE_64B.IFTAG_STALL / CPU_CLK_UNHALTED.THREAD) + (( ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ ) / CPU_CLK_UNHALTED.THREAD) + (9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) ) / #(4 * 
IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD))", + "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "Big_Code" }, - { - "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", - "MetricExpr": "100 * (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * ( (ICACHE_64B.IFTAG_STALL / CPU_CLK_UNHALTED.THREAD) + (( ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ ) / CPU_CLK_UNHALTED.THREAD) + (9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) ) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))", - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB_SMT", - "MetricName": "Big_Code_SMT" - }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", - "MetricExpr": "100 * ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) ) - (100 * (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * ( (ICACHE_64B.IFTAG_STALL / CPU_CLK_UNHALTED.THREAD) + (( ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ ) / CPU_CLK_UNHALTED.THREAD) 
+ (9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) ) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)))", + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - Big_Code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "Instruction_Fetch_BW" }, - { - "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", - "MetricExpr": "100 * ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) ) - (100 * (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * ( (ICACHE_64B.IFTAG_STALL / CPU_CLK_UNHALTED.THREAD) + (( ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ ) / CPU_CLK_UNHALTED.THREAD) + (9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) ) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))))", - "MetricGroup": "Fed;FetchBW;Frontend_SMT", - "MetricName": "Instruction_Fetch_BW_SMT" - }, { "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / 
CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC" }, @@ -158,6 +752,12 @@ "MetricGroup": "Branches;Fed;FetchBW", "MetricName": "UpTB" }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", + "MetricName": "CPI" + }, { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "CPU_CLK_UNHALTED.THREAD", @@ -166,16 +766,10 @@ }, { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "TmaL1", + "MetricExpr": "4 * CORE_CLKS", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS" }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "TmaL1_SMT", - "MetricName": "SLOTS_SMT" - }, { "BriefDescription": "The ratio of Executed- by Issued-Uops", "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", @@ -185,63 +779,38 @@ }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC" }, - { - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;SMT;TmaL1_SMT", - "MetricName": "CoreIPC_SMT" - }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( 
FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;Flops", + "MetricExpr": "(1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / CORE_CLKS", + "MetricGroup": "Flops;Ret", "MetricName": "FLOPc" }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;Flops_SMT", - "MetricName": "FLOPc_SMT" - }, { "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) ) / ( 2 * 
CPU_CLK_UNHALTED.THREAD )", + "MetricExpr": "((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)) / (2 * CORE_CLKS)", "MetricGroup": "Cor;Flops;HPC", "MetricName": "FP_Arith_Utilization", "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, - { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) ) / ( 2 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) )", - "MetricGroup": "Cor;Flops;HPC_SMT", - "MetricName": "FP_Arith_Utilization_SMT", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common). SMT version; use when SMT is enabled and measuring per logical CPU." 
- }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 ) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ((UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", - "MetricExpr": "( 1 - ((1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) - ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)))) / ((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ( ARITH.DIVIDER_ACTIVE < ( CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY ) ) else (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if ((1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) - ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - 
(IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)))) < ((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ( ARITH.DIVIDER_ACTIVE < ( CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY ) ) else (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1 ) if 0 > 0.5 else 0", + "MetricExpr": "(1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if SMT_2T_Utilization > 0.5 else 0", "MetricGroup": "Cor;SMT", "MetricName": "Core_Bound_Likely" }, - { - "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", - "MetricExpr": "( 1 - ((1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))))) / ((EXE_ACTIVITY.EXE_BOUND_0_PORTS + 
(EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ( ARITH.DIVIDER_ACTIVE < ( CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY ) ) else (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if ((1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))))) < ((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ( ARITH.DIVIDER_ACTIVE < ( CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY ) ) else (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / 
(4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1 ) if (1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 )) > 0.5 else 0", - "MetricGroup": "Cor;SMT_SMT", - "MetricName": "Core_Bound_Likely_SMT" - }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "((CPU_CLK_UNHALTED.THREAD / 2) * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK)) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2) if #SMT_on else CLKS", "MetricGroup": "SMT", "MetricName": "CORE_CLKS" }, @@ -283,13 +852,13 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "IpFLOP" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + 
FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) )", + "MetricExpr": "INST_RETIRED.ANY / ((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE))", "MetricGroup": "Flops;InsType", "MetricName": "IpArith", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." @@ -310,21 +879,21 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX128", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
}, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX256", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX512", "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
@@ -336,9 +905,9 @@ "MetricName": "IpSWPF" }, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions" }, { @@ -373,16 +942,10 @@ }, { "BriefDescription": "Total penalty related to DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", - "MetricExpr": "100 * ( (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) + ((IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD))) * (( IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS ) / CPU_CLK_UNHALTED.THREAD / 2) / #((IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD))) )", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", "MetricGroup": "DSBmiss;Fed", "MetricName": "DSB_Misses" }, - { - "BriefDescription": "Total penalty related to DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", - "MetricExpr": "100 * ( (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + 
CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + ((IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) * (( IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) / 2) / #((IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) )", - "MetricGroup": "DSBmiss;Fed_SMT", - "MetricName": "DSB_Misses_SMT" - }, { "BriefDescription": "Number of Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", @@ -397,16 +960,10 @@ }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) ) * (4 * CPU_CLK_UNHALTED.THREAD) / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": " (tma_branch_mispredicts + 
tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BrMispredicts", "MetricName": "Branch_Misprediction_Cost" }, - { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) ) * (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts_SMT", - "MetricName": "Branch_Misprediction_Cost_SMT" - }, { "BriefDescription": "Fraction of branches that are non-taken conditionals", "MetricExpr": "BR_INST_RETIRED.NOT_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", @@ -415,101 +972,95 @@ }, { "BriefDescription": "Fraction of branches that are taken conditionals", - "MetricExpr": "( BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN ) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": 
"Bad;Branches;CodeGen;PGO", "MetricName": "Cond_TK" }, { "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "( BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN ) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", "MetricName": "CallRet" }, { "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - ( BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN ) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", "MetricName": "Jump" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", "MetricGroup": "Mem;MemoryBound;MemoryLat", "MetricName": "Load_Miss_Real_Latency" }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. 
Per-Logical Processor)", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBound;MemoryBW", + "MetricGroup": "Mem;MemoryBW;MemoryBound", "MetricName": "MLP" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI_Load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;Backend;CacheMisses", + "MetricGroup": "Backend;CacheMisses;Mem", "MetricName": "L2MPKI" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses;Offcore", + "MetricGroup": "CacheMisses;Mem;Offcore", "MetricName": "L2MPKI_All" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2MPKI_Load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", - "MetricExpr": "1000 * ( L2_RQSTS.REFERENCES - L2_RQSTS.MISS ) / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricExpr": "1000 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_All" }, { 
"BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_Load" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L3MPKI" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1000 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "FB_HPKI" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * CPU_CLK_UNHALTED.THREAD )", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING) / (2 * CORE_CLKS)", "MetricGroup": "Mem;MemoryTLB", "MetricName": "Page_Walks_Utilization" }, - { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) )", - "MetricGroup": "Mem;MemoryTLB_SMT", - "MetricName": "Page_Walks_Utilization_SMT" - }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * 
L1D.REPLACEMENT / 1000000000 / duration_time", @@ -536,37 +1087,37 @@ }, { "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", - "MetricExpr": "1000 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY", + "MetricExpr": "1000 * L2_LINES_OUT.SILENT / Instructions", "MetricGroup": "L2Evicts;Mem;Server", "MetricName": "L2_Evictions_Silent_PKI" }, { "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", - "MetricExpr": "1000 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY", + "MetricExpr": "1000 * L2_LINES_OUT.NON_SILENT / Instructions", "MetricGroup": "L2Evicts;Mem;Server", "MetricName": "L2_Evictions_NonSilent_PKI" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricExpr": "L1D_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L1D_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricExpr": "L2_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L2_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L3_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Access_BW", "MetricGroup": "Mem;MemoryBW;Offcore", "MetricName": "L3_Cache_Access_BW_1T" }, @@ -578,68 +1129,47 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": 
"(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / 1000000000 ) / duration_time", + "MetricExpr": "((1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1000000000) / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "GFLOPs", "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." 
}, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization" }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0", - "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / 2 / CORE_CLKS if #SMT_on else CORE_POWER.LVL0_TURBO_LICENSE / CORE_CLKS", "MetricGroup": "Power", "MetricName": "Power_License0_Utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes." }, - { - "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / 2 / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Power_SMT", - "MetricName": "Power_License0_Utilization_SMT", - "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes. SMT version; use when SMT is enabled and measuring per logical CPU." 
- }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1", - "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / 2 / CORE_CLKS if #SMT_on else CORE_POWER.LVL1_TURBO_LICENSE / CORE_CLKS", "MetricGroup": "Power", "MetricName": "Power_License1_Utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions." }, - { - "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / 2 / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Power_SMT", - "MetricName": "Power_License1_Utilization_SMT", - "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions. SMT version; use when SMT is enabled and measuring per logical CPU." - }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)", - "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / 2 / CORE_CLKS if #SMT_on else CORE_POWER.LVL2_TURBO_LICENSE / CORE_CLKS", "MetricGroup": "Power", "MetricName": "Power_License2_Utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). This includes high current AVX 512-bit instructions." 
}, - { - "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / 2 / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Power_SMT", - "MetricName": "Power_License2_Utilization_SMT", - "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). This includes high current AVX 512-bit instructions. SMT version; use when SMT is enabled and measuring per logical CPU." - }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", - "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0", "MetricGroup": "SMT", "MetricName": "SMT_2T_Utilization" }, @@ -657,13 +1187,13 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "( 64 * ( uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@ ) / 1000000000 ) / duration_time", + "MetricExpr": "(64 * (uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@) / 1000000000) / duration_time", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use" }, { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds). 
Accounts for demand loads and L1/L2 prefetches", - "MetricExpr": "1000000000 * ( cha@event\\=0x36\\,umask\\=0x21\\,config\\=0x40433@ / cha@event\\=0x35\\,umask\\=0x21\\,config\\=0x40433@ ) / ( cha_0@event\\=0x0@ / duration_time )", + "MetricExpr": "1000000000 * (cha@event\\=0x36\\,umask\\=0x21\\,config\\=0x40433@ / cha@event\\=0x35\\,umask\\=0x21\\,config\\=0x40433@) / (Socket_CLKS / duration_time)", "MetricGroup": "Mem;MemoryLat;SoC", "MetricName": "MEM_Read_Latency" }, @@ -675,38 +1205,38 @@ }, { "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches", - "MetricExpr": "( 1000000000 * ( imc@event\\=0xe0\\,umask\\=0x1@ / imc@event\\=0xe3@ ) / imc_0@event\\=0x0@ )", - "MetricGroup": "Mem;MemoryLat;SoC;Server", + "MetricExpr": "(1000000000 * (imc@event\\=0xe0\\,umask\\=0x1@ / imc@event\\=0xe3@) / imc_0@event\\=0x0@)", + "MetricGroup": "Mem;MemoryLat;Server;SoC", "MetricName": "MEM_PMM_Read_Latency" }, { "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. 
Accounts for demand loads and L1/L2 data-read prefetches", - "MetricExpr": "1000000000 * ( UNC_M_RPQ_OCCUPANCY / UNC_M_RPQ_INSERTS ) / imc_0@event\\=0x0@", - "MetricGroup": "Mem;MemoryLat;SoC;Server", + "MetricExpr": "1000000000 * (UNC_M_RPQ_OCCUPANCY / UNC_M_RPQ_INSERTS) / imc_0@event\\=0x0@", + "MetricGroup": "Mem;MemoryLat;Server;SoC", "MetricName": "MEM_DRAM_Read_Latency" }, { "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", - "MetricExpr": "( ( 64 * imc@event\\=0xe3@ / 1000000000 ) / duration_time )", - "MetricGroup": "Mem;MemoryBW;SoC;Server", + "MetricExpr": "((64 * imc@event\\=0xe3@ / 1000000000) / duration_time)", + "MetricGroup": "Mem;MemoryBW;Server;SoC", "MetricName": "PMM_Read_BW" }, { "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", - "MetricExpr": "( ( 64 * imc@event\\=0xe7@ / 1000000000 ) / duration_time )", - "MetricGroup": "Mem;MemoryBW;SoC;Server", + "MetricExpr": "((64 * imc@event\\=0xe7@ / 1000000000) / duration_time)", + "MetricGroup": "Mem;MemoryBW;Server;SoC", "MetricName": "PMM_Write_BW" }, { "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", - "MetricExpr": "( UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3 ) * 4 / 1000000000 / duration_time", - "MetricGroup": "IoBW;Mem;SoC;Server", + "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1000000000 / duration_time", + "MetricGroup": "IoBW;Mem;Server;SoC", "MetricName": "IO_Write_BW" }, { "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]", - "MetricExpr": "( UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3 
) * 4 / 1000000000 / duration_time", - "MetricGroup": "IoBW;Mem;SoC;Server", + "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3) * 4 / 1000000000 / duration_time", + "MetricGroup": "IoBW;Mem;Server;SoC", "MetricName": "IO_Read_BW" }, { @@ -715,12 +1245,6 @@ "MetricGroup": "SoC", "MetricName": "Socket_CLKS" }, - { - "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "cha_0@event\\=0x0@ / #num_dies / duration_time / 1000000000", - "MetricGroup": "SoC", - "MetricName": "UNCORE_FREQ" - }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", @@ -770,26 +1294,18 @@ "MetricName": "C7_Pkg_Residency" }, { - "BriefDescription": "Percentage of time spent in the active CPU power state C0", - "MetricExpr": "100 * CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "", - "MetricName": "cpu_utilization_percent", - "ScaleUnit": "1%" + "BriefDescription": "Uncore frequency per die [GHZ]", + "MetricExpr": "Socket_CLKS / #num_dies / duration_time / 1000000000", + "MetricGroup": "SoC", + "MetricName": "UNCORE_FREQ" }, { "BriefDescription": "CPU operating frequency (in GHz)", - "MetricExpr": "( CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ ) / 1000000000", + "MetricExpr": "(( CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ ) / 1000000000) / duration_time", "MetricGroup": "", "MetricName": "cpu_operating_frequency", "ScaleUnit": "1GHz" }, - { - "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY", - "MetricGroup": "", - "MetricName": "cpi", - 
"ScaleUnit": "1per_instr" - }, { "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions", "MetricExpr": "MEM_INST_RETIRED.ALL_LOADS / INST_RETIRED.ANY", @@ -808,7 +1324,7 @@ "BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions", "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "l1d_mpi_includes_data_plus_rfo_with_prefetches", + "MetricName": "l1d_mpi", "ScaleUnit": "1per_instr" }, { @@ -836,7 +1352,7 @@ "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions", "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "l2_mpi_includes_code_plus_data_plus_rfo_with_prefetches", + "MetricName": "l2_mpi", "ScaleUnit": "1per_instr" }, { @@ -869,21 +1385,21 @@ }, { "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) in nano seconds", - "MetricExpr": "( ( 1000000000 * ( cha@unc_cha_tor_occupancy.ia_miss\\,config1\\=0x4043300000000@ / cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043300000000@ ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_CLOCKTICKS) * #num_packages ) ) ) * duration_time )", + "MetricExpr": "( 1000000000 * ( cha@unc_cha_tor_occupancy.ia_miss\\,config1\\=0x4043300000000@ / cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043300000000@ ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_CLOCKTICKS) * #num_packages ) ) ) * duration_time", "MetricGroup": "", "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency", "ScaleUnit": "1ns" }, { "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to local memory in nano seconds", - "MetricExpr": "( ( 1000000000 * ( 
cha@unc_cha_tor_occupancy.ia_miss\\,config1\\=0x4043200000000@ / cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043200000000@ ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_CLOCKTICKS) * #num_packages ) ) ) * duration_time )", + "MetricExpr": "( 1000000000 * ( cha@unc_cha_tor_occupancy.ia_miss\\,config1\\=0x4043200000000@ / cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043200000000@ ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_CLOCKTICKS) * #num_packages ) ) ) * duration_time", "MetricGroup": "", "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_local_requests", "ScaleUnit": "1ns" }, { "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to remote memory in nano seconds", - "MetricExpr": "( ( 1000000000 * ( cha@unc_cha_tor_occupancy.ia_miss\\,config1\\=0x4043100000000@ / cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043100000000@ ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_CLOCKTICKS) * #num_packages ) ) ) * duration_time )", + "MetricExpr": "( 1000000000 * ( cha@unc_cha_tor_occupancy.ia_miss\\,config1\\=0x4043100000000@ / cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043100000000@ ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_CLOCKTICKS) * #num_packages ) ) ) * duration_time", "MetricGroup": "", "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_remote_requests", "ScaleUnit": "1ns" @@ -892,54 +1408,54 @@ "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions. 
This implies it missed in the ITLB (Instruction TLB) and further levels of TLB.", "MetricExpr": "ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "itlb_2nd_level_mpi", + "MetricName": "itlb_mpi", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the Instruction Translation Lookaside Buffer (ITLB) and further levels of TLB.", "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "itlb_2nd_level_large_page_mpi", + "MetricName": "itlb_large_page_mpi", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "dtlb_2nd_level_load_mpi", + "MetricName": "dtlb_load_mpi", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the Data Translation Lookaside Buffer (DTLB) and further levels of TLB.", "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "dtlb_2nd_level_2mb_large_page_load_mpi", + "MetricName": "dtlb_2mb_large_page_load_mpi", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions. 
This implies it missed in the DTLB and further levels of TLB.", "MetricExpr": "DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "dtlb_2nd_level_store_mpi", + "MetricName": "dtlb_store_mpi", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", "MetricExpr": "100 * cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043200000000@ / ( cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043200000000@ + cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043100000000@ )", "MetricGroup": "", - "MetricName": "numa_percent_reads_addressed_to_local_dram", + "MetricName": "numa_reads_addressed_to_local_dram", "ScaleUnit": "1%" }, { "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", "MetricExpr": "100 * cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043100000000@ / ( cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043200000000@ + cha@unc_cha_tor_inserts.ia_miss\\,config1\\=0x4043100000000@ )", "MetricGroup": "", - "MetricName": "numa_percent_reads_addressed_to_remote_dram", + "MetricName": "numa_reads_addressed_to_remote_dram", "ScaleUnit": "1%" }, { "BriefDescription": "Uncore operating frequency in GHz", - "MetricExpr": "UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_CLOCKTICKS) * #num_packages ) / 1000000000", + "MetricExpr": "( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_CLOCKTICKS) * #num_packages ) / 1000000000) / duration_time", "MetricGroup": "", "MetricName": "uncore_frequency", "ScaleUnit": "1GHz" @@ -948,7 +1464,7 @@ "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)", "MetricExpr": "( UNC_UPI_TxL_FLITS.ALL_DATA * (64 / 9.0) / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "upi_data_transmit_bw_only_data", + 
"MetricName": "upi_data_transmit_bw", "ScaleUnit": "1MB/s" }, { @@ -997,35 +1513,35 @@ "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.", "MetricExpr": "(( UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3 ) * 4 / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "io_bandwidth_read", + "MetricName": "io_bandwidth_disk_or_network_writes", "ScaleUnit": "1MB/s" }, { "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.", "MetricExpr": "(( UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART0 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART1 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART2 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART3 ) * 4 / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "io_bandwidth_write", + "MetricName": "io_bandwidth_disk_or_network_reads", "ScaleUnit": "1MB/s" }, { "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue", "MetricExpr": "100 * ( IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS ) )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_from_decoded_icache_dsb", + "MetricName": "percent_uops_delivered_from_decoded_icache", "ScaleUnit": "1%" }, { "BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue", "MetricExpr": "100 * ( IDQ.MITE_UOPS / ( IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS ) )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline_mite", + "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline", "ScaleUnit": "1%" }, { "BriefDescription": "Uops delivered from 
microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue", "MetricExpr": "100 * ( IDQ.MS_UOPS / ( IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS ) )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_from_microcode_sequencer_ms", + "MetricName": "percent_uops_delivered_from_microcode_sequencer", "ScaleUnit": "1%" }, { @@ -1050,255 +1566,10 @@ "ScaleUnit": "1MB/s" }, { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", - "MetricExpr": "100 * ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "TmaL1;PGO", - "MetricName": "tma_frontend_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. 
In such cases; the Frontend eventually delivers no uops for some period.", - "MetricExpr": "100 * ( ( 4 ) * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "Frontend;TmaL2;m_tma_frontend_bound_percent", - "MetricName": "tma_fetch_latency_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", - "MetricExpr": "100 * ( ( ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@ ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_icache_misses_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses.", - "MetricExpr": "100 * ( ICACHE_64B.IFTAG_STALL / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_itlb_misses_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings.", - "MetricExpr": "100 * ( INT_MISC.CLEAR_RESTEER_CYCLES / ( CPU_CLK_UNHALTED.THREAD ) + ( ( 9 ) * BACLEARS.ANY / ( CPU_CLK_UNHALTED.THREAD ) ) )", - "MetricGroup": "FetchLat;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_branch_resteers_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. 
The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.", - "MetricExpr": "100 * ( DSB2MITE_SWITCHES.PENALTY_CYCLES / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "DSBmiss;FetchLat;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_dsb_switches_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", - "MetricExpr": "100 * ( ILD_STALL.LCP / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "FetchLat;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_lcp_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. 
The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals.", - "MetricExpr": "100 * ( ( 2 ) * IDQ.MS_SWITCHES / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "FetchLat;MicroSeq;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_ms_switches_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.", - "MetricExpr": "100 * ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( 4 ) * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )", - "MetricGroup": "FetchBW;Frontend;TmaL2;m_tma_frontend_bound_percent", - "MetricName": "tma_fetch_bandwidth_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. 
For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", - "MetricExpr": "100 * ( ( IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS ) / ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) / 2 )", - "MetricGroup": "DSBmiss;FetchBW;TmaL3;m_tma_fetch_bandwidth_percent", - "MetricName": "tma_mite_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", - "MetricExpr": "100 * ( ( IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS ) / ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) / 2 )", - "MetricGroup": "DSB;FetchBW;TmaL3;m_tma_fetch_bandwidth_percent", - "MetricName": "tma_dsb_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", - "MetricExpr": "100 * ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "TmaL1", - "MetricName": "tma_bad_speculation_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. 
These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path.", - "MetricExpr": "100 * ( ( BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT ) ) * ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )", - "MetricGroup": "BadSpec;BrMispredicts;TmaL2;m_tma_bad_speculation_percent", - "MetricName": "tma_branch_mispredicts_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. 
Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.", - "MetricExpr": "100 * ( ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT ) ) * ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) )", - "MetricGroup": "BadSpec;MachineClears;TmaL2;m_tma_bad_speculation_percent", - "MetricName": "tma_machine_clears_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. 
Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", - "MetricExpr": "100 * ( 1 - ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( UOPS_ISSUED.ANY + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "TmaL1", - "MetricName": "tma_backend_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", - "MetricExpr": "100 * ( ( ( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / ( CYCLE_ACTIVITY.STALLS_TOTAL + ( EXE_ACTIVITY.1_PORTS_UTIL + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) * EXE_ACTIVITY.2_PORTS_UTIL ) + EXE_ACTIVITY.BOUND_ON_STORES ) ) * ( 1 - ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( UOPS_ISSUED.ANY + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )", - "MetricGroup": "Backend;TmaL2;m_tma_backend_bound_percent", - "MetricName": "tma_memory_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was 
stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache.", - "MetricExpr": "100 * ( max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) , 0 ) )", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_l1_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance.", - "MetricExpr": "100 * ( ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) / ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=0x1@ ) ) * ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) )", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_l2_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. 
L2 misses/L3 hits) can improve the latency and increase performance.", - "MetricExpr": "100 * ( ( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_l3_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance.", - "MetricExpr": "100 * ( min( ( ( ( CYCLE_ACTIVITY.STALLS_L3_MISS / ( CPU_CLK_UNHALTED.THREAD ) + ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) - ( ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) / ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=0x1@ ) ) * ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( min( ( ( ( ( 1 - ( ( ( 19 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + 10 * ( ( MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) / ( ( 19 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + 10 * ( ( MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + ( 
MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) + ( 25 * ( ( MEM_LOAD_RETIRED.LOCAL_PMM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) + 33 * ( ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) ) ) ) ) * ( CYCLE_ACTIVITY.STALLS_L3_MISS / ( CPU_CLK_UNHALTED.THREAD ) + ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) - ( ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) / ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=0x1@ ) ) * ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) if ( ( 1000000 ) * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM ) > MEM_LOAD_RETIRED.L1_MISS ) else 0 ) ) , ( 1 ) ) ) ) ) , ( 1 ) ) )", - "MetricGroup": "MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_dram_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a. IXP) memory by loads, PMM stands for Persistent Memory Module. 
", - "MetricExpr": "100 * ( min( ( ( ( ( 1 - ( ( ( 19 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + 10 * ( ( MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) / ( ( 19 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + 10 * ( ( MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) + ( 25 * ( ( MEM_LOAD_RETIRED.LOCAL_PMM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) + 33 * ( ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) ) ) ) ) * ( CYCLE_ACTIVITY.STALLS_L3_MISS / ( CPU_CLK_UNHALTED.THREAD ) + ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) - ( ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) / ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=0x1@ ) ) * ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) if ( ( 1000000 ) * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM ) > MEM_LOAD_RETIRED.L1_MISS ) else 0 ) ) , ( 1 ) ) )", - "MetricGroup": "MemoryBound;Server;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": 
"tma_pmm_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck.", - "MetricExpr": "100 * ( EXE_ACTIVITY.BOUND_ON_STORES / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_store_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. 
FP-chained long-latency arithmetic operations).", - "MetricExpr": "100 * ( ( 1 - ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( UOPS_ISSUED.ANY + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / ( CYCLE_ACTIVITY.STALLS_TOTAL + ( EXE_ACTIVITY.1_PORTS_UTIL + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) * EXE_ACTIVITY.2_PORTS_UTIL ) + EXE_ACTIVITY.BOUND_ON_STORES ) ) * ( 1 - ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( UOPS_ISSUED.ANY + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) )", - "MetricGroup": "Backend;TmaL2;Compute;m_tma_backend_bound_percent", - "MetricName": "tma_core_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication.", - "MetricExpr": "100 * ( ARITH.DIVIDER_ACTIVE / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "TmaL3;m_tma_core_bound_percent", - "MetricName": "tma_divider_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). 
Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.", - "MetricExpr": "100 * ( ( EXE_ACTIVITY.EXE_BOUND_0_PORTS + ( EXE_ACTIVITY.1_PORTS_UTIL + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) * EXE_ACTIVITY.2_PORTS_UTIL ) ) / ( CPU_CLK_UNHALTED.THREAD ) if ( ARITH.DIVIDER_ACTIVE < ( CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY ) ) else ( EXE_ACTIVITY.1_PORTS_UTIL + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) * EXE_ACTIVITY.2_PORTS_UTIL ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "PortsUtil;TmaL3;m_tma_core_bound_percent", - "MetricName": "tma_ports_utilization_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. 
", - "MetricExpr": "100 * ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "TmaL1", - "MetricName": "tma_retiring_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved.", - "MetricExpr": "100 * ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )", - "MetricGroup": "Retire;TmaL2;m_tma_retiring_percent", - "MetricName": "tma_light_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). 
Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", - "MetricExpr": "100 * ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD ) + ( ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) / ( UOPS_RETIRED.RETIRE_SLOTS ) ) + ( min( ( ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / ( UOPS_RETIRED.RETIRE_SLOTS ) ) , ( 1 ) ) ) )", - "MetricGroup": "HPC;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_fp_arith_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", - "MetricExpr": "100 * ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_memory_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. 
The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.", - "MetricExpr": "100 * ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) * UOPS_RETIRED.MACRO_FUSED / ( UOPS_RETIRED.RETIRE_SLOTS ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_fused_instructions_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.", - "MetricExpr": "100 * ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) * ( BR_INST_RETIRED.ALL_BRANCHES - UOPS_RETIRED.MACRO_FUSED ) / ( UOPS_RETIRED.RETIRE_SLOTS ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_non_fused_branches_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. 
start address of a function or loop body.", - "MetricExpr": "100 * ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) * INST_RETIRED.NOP / ( UOPS_RETIRED.RETIRE_SLOTS ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_nop_instructions_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. May undercount due to FMA double counting", - "MetricExpr": "100 * ( max( 0 , ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) - ( ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD ) + ( ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) / ( UOPS_RETIRED.RETIRE_SLOTS ) ) + ( min( ( ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / ( UOPS_RETIRED.RETIRE_SLOTS ) ) , ( 1 ) ) ) ) + ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( 
CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY ) + ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) * UOPS_RETIRED.MACRO_FUSED / ( UOPS_RETIRED.RETIRE_SLOTS ) ) + ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) * ( BR_INST_RETIRED.ALL_BRANCHES - UOPS_RETIRED.MACRO_FUSED ) / ( UOPS_RETIRED.RETIRE_SLOTS ) ) + ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) * INST_RETIRED.NOP / ( UOPS_RETIRED.RETIRE_SLOTS ) ) ) ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_other_light_ops_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. 
This highly-correlates with the uop length of these instructions/sequences.", - "MetricExpr": "100 * ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "Retire;TmaL2;m_tma_retiring_percent", - "MetricName": "tma_heavy_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that that are decoder into two or up to ([SNB+] four; [ADL+] five) uops. This highly-correlates with the number of uops in such instructions.", - "MetricExpr": "100 * ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )", - "MetricGroup": "TmaL3;m_tma_heavy_operations_percent", - "MetricName": "tma_few_uops_instructions_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). 
These cases can often be avoided.", - "MetricExpr": "100 * ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "MicroSeq;TmaL3;m_tma_heavy_operations_percent", - "MetricName": "tma_microcode_sequencer_percent", + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.", + "MetricExpr": "100 * ( ( LSD.CYCLES_ACTIVE - LSD.CYCLES_4_UOPS ) / ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) / 2 )", + "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", + "MetricName": "tma_lsd", "ScaleUnit": "1%" } ] diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-memory.json b/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-memory.json index 6facfb244cd32..326b674045c68 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-memory.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-memory.json @@ -27,20 +27,19 @@ "Unit": "iMC" }, { - "BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd", + "BriefDescription": "All DRAM Read CAS Commands issued (including underfills)", "Counter": "0,1,2,3", "EventCode": "0x4", - "EventName": "LLC_MISSES.MEM_READ", + "EventName": "UNC_M_CAS_COUNT.RD", "PerPkg": "1", - "ScaleUnit": "64Bytes", "UMask": "0x3", "Unit": "iMC" }, { - "BriefDescription": "read requests to memory controller", + "BriefDescription": "read requests to memory controller. 
Derived from unc_m_cas_count.rd", "Counter": "0,1,2,3", "EventCode": "0x4", - "EventName": "UNC_M_CAS_COUNT.RD", + "EventName": "LLC_MISSES.MEM_READ", "PerPkg": "1", "ScaleUnit": "64Bytes", "UMask": "0x3", @@ -56,20 +55,19 @@ "Unit": "iMC" }, { - "BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr", + "BriefDescription": "All DRAM Write CAS commands issued", "Counter": "0,1,2,3", "EventCode": "0x4", - "EventName": "LLC_MISSES.MEM_WRITE", + "EventName": "UNC_M_CAS_COUNT.WR", "PerPkg": "1", - "ScaleUnit": "64Bytes", "UMask": "0xC", "Unit": "iMC" }, { - "BriefDescription": "write requests to memory controller", + "BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr", "Counter": "0,1,2,3", "EventCode": "0x4", - "EventName": "UNC_M_CAS_COUNT.WR", + "EventName": "LLC_MISSES.MEM_WRITE", "PerPkg": "1", "ScaleUnit": "64Bytes", "UMask": "0xC", diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-other.json b/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-other.json index a29bba230f496..e10530c21ef8b 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-other.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-other.json @@ -1477,7 +1477,6 @@ "FCMask": "0x07", "PerPkg": "1", "PortMask": "0x01", - "ScaleUnit": "4Bytes", "UMask": "0x01", "Unit": "IIO" }, @@ -1489,7 +1488,6 @@ "FCMask": "0x07", "PerPkg": "1", "PortMask": "0x02", - "ScaleUnit": "4Bytes", "UMask": "0x01", "Unit": "IIO" }, @@ -1501,7 +1499,6 @@ "FCMask": "0x07", "PerPkg": "1", "PortMask": "0x04", - "ScaleUnit": "4Bytes", "UMask": "0x01", "Unit": "IIO" }, @@ -1513,7 +1510,6 @@ "FCMask": "0x07", "PerPkg": "1", "PortMask": "0x08", - "ScaleUnit": "4Bytes", "UMask": "0x01", "Unit": "IIO" }, @@ -1584,7 +1580,6 @@ "FCMask": "0x07", "PerPkg": "1", "PortMask": "0x01", - "ScaleUnit": "4Bytes", "UMask": "0x04", "Unit": "IIO" }, @@ -1596,7 +1591,6 @@ "FCMask": "0x07", "PerPkg": "1", "PortMask": "0x02", - "ScaleUnit": 
"4Bytes", "UMask": "0x04", "Unit": "IIO" }, @@ -1608,7 +1602,6 @@ "FCMask": "0x07", "PerPkg": "1", "PortMask": "0x04", - "ScaleUnit": "4Bytes", "UMask": "0x04", "Unit": "IIO" }, @@ -1620,7 +1613,6 @@ "FCMask": "0x07", "PerPkg": "1", "PortMask": "0x08", - "ScaleUnit": "4Bytes", "UMask": "0x04", "Unit": "IIO" }, @@ -2254,7 +2246,7 @@ "Unit": "UPI LL" }, { - "BriefDescription": "FLITs received which bypassed the Slot0 Receive Buffer", + "BriefDescription": "FLITs received which bypassed the Slot0 Recieve Buffer", "Counter": "0,1,2,3", "EventCode": "0x31", "EventName": "UNC_UPI_RxL_BYPASSED.SLOT2", -- GitLab From 5ed4fc264c2becdae2d2fed4db94eb2dc45668fe Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:15:59 -0700 Subject: [PATCH 1419/2223] perf vendor events: Update elkhartlake cpuids Add cpuid that was added to https://download.01.org/perfmon/mapfile.csv Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-11-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- 
tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index bc873a1e84e10..6c8188404db89 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -5,7 +5,7 @@ GenuineIntel-6-(3D|47),v26,broadwell,core GenuineIntel-6-56,v23,broadwellde,core GenuineIntel-6-4F,v19,broadwellx,core GenuineIntel-6-55-[56789ABCDEF],v1.16,cascadelakex,core -GenuineIntel-6-96,v1.03,elkhartlake,core +GenuineIntel-6-9[6C],v1.03,elkhartlake,core GenuineIntel-6-5[CF],v13,goldmont,core GenuineIntel-6-7A,v1.01,goldmontplus,core GenuineIntel-6-(3C|45|46),v31,haswell,core -- GitLab From dd7aae2c2d651c34f3a006313fa1f46ce9bf48a0 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:16:00 -0700 Subject: [PATCH 1420/2223] perf vendor events: Update Intel haswell Events are updated to v32, the core metrics are based on TMA 4.4 full. Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. - _SMT suffix metrics are dropped as the #SMT_On and #EBS_Mode are correctly expanded in the single main metric. - Addition of all 6 levels of TMA metrics. Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. 
Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-12-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../pmu-events/arch/x86/haswell/cache.json | 4 +- .../pmu-events/arch/x86/haswell/frontend.json | 12 +- .../arch/x86/haswell/hsw-metrics.json | 570 +++++++++++++++--- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 4 files changed, 498 insertions(+), 90 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/haswell/cache.json b/tools/perf/pmu-events/arch/x86/haswell/cache.json index 3b0f3a2642469..719b8e622f596 100644 --- a/tools/perf/pmu-events/arch/x86/haswell/cache.json +++ b/tools/perf/pmu-events/arch/x86/haswell/cache.json @@ -20,7 +20,7 @@ "UMask": "0x2" }, { - "BriefDescription": "L1D miss oustandings duration in cycles", + "BriefDescription": "L1D miss outstanding duration in cycles", 
"Counter": "2", "CounterHTOff": "2", "EventCode": "0x48", @@ -655,7 +655,7 @@ "UMask": "0x8" }, { - "BriefDescription": "Cacheable and noncachaeble code read requests", + "BriefDescription": "Cacheable and noncacheable code read requests", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0xB0", diff --git a/tools/perf/pmu-events/arch/x86/haswell/frontend.json b/tools/perf/pmu-events/arch/x86/haswell/frontend.json index c45a09abe5d3f..18a993297108c 100644 --- a/tools/perf/pmu-events/arch/x86/haswell/frontend.json +++ b/tools/perf/pmu-events/arch/x86/haswell/frontend.json @@ -161,7 +161,7 @@ "UMask": "0x4" }, { - "BriefDescription": "Cycles when uops are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy", + "BriefDescription": "Cycles when uops are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "CounterMask": "1", @@ -172,7 +172,7 @@ "UMask": "0x30" }, { - "BriefDescription": "Cycles when uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy.", + "BriefDescription": "Cycles when uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy.", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "CounterMask": "1", @@ -182,7 +182,7 @@ "UMask": "0x10" }, { - "BriefDescription": "Deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while Microcode Sequenser (MS) is busy.", + "BriefDescription": "Deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while Microcode Sequencer (MS) is busy.", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "CounterMask": "1", @@ -193,7 +193,7 @@ "UMask": "0x10" }, { - "BriefDescription": "Uops initiated by Decode Stream Buffer (DSB) that are being 
delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy", + "BriefDescription": "Uops initiated by Decode Stream Buffer (DSB) that are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0x79", @@ -203,7 +203,7 @@ "UMask": "0x10" }, { - "BriefDescription": "Uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy", + "BriefDescription": "Uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0x79", @@ -224,7 +224,7 @@ "UMask": "0x30" }, { - "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy", + "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0x79", diff --git a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json index 75dc6dd9a7bcb..6cb6603efbd8f 100644 --- a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json +++ b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json @@ -1,64 +1,490 @@ [ { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Frontend_Bound", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. 
Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound." + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Frontend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / SLOTS", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. 
Sample with: RS_EVENTS.EMPTY_END", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", + "MetricExpr": "ICACHE.IFDATA_STALL / CLKS", + "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. 
Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivers higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "ILD_STALL.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "2 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS).
Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "tma_frontend_bound - tma_fetch_latency", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. 
For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_dsb", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Bad_Speculation", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example." + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. 
This includes slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches is categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Bad_Speculation_SMT", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation.
For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) )", - "MetricGroup": "TopdownL1", - "MetricName": "Backend_Bound", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. 
For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound." + "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if (IPC > 1.8) else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB) if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if (IPC > 1.8) else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB)) * tma_backend_bound", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This 
metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / CLKS, 0)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. 
TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_UOPS_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. 
For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "(MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / CLKS", + "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_UOPS_RETIRED.LOCK_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_UOPS_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_4k_aliasing", + "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. 
However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "Load_Miss_Real_Latency * cpu@L1D_PEND_MISS.REQUEST_FB_FULL\\,cmask\\=1@ / CLKS", + "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. 
Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", + "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS)))) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_contested_accesses", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. 
Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / CLKS", + "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / CLKS", + "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "((OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2) if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / CORE_CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). 
The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "(1 - (MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS))) * CYCLE_ACTIVITY.STALLS_L2_PENDING / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. 
This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "RESOURCE_STALLS.SB / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. 
Sample with: MEM_UOPS_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "((L2_RQSTS.RFO_HIT * 9 * (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES))) + (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_store_latency", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full)", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) )", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Backend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. 
Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "60 * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / CORE_CLKS", + "MetricGroup": "TopdownL4;tma_store_bound_group", + "MetricName": "tma_split_stores", + "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. 
Sample with: MEM_UOPS_RETIRED.SPLIT_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + DTLB_STORE_MISSES.WALK_DURATION) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_UOPS_RETIRED.STLB_MISS_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck", + "MetricExpr": "tma_backend_bound - tma_memory_bound", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. 
FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "10 * ARITH.DIVIDER_UOPS / CORE_CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if (IPC > 1.8) else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB) if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if (IPC > 1.8) else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB) - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). 
Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@) / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else 0) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). 
This can be due to heavy data-dependency among software instructions; or over oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). 
Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / (4 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED_PORT.PORT_0", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / CORE_CLKS", + "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_0", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_1", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_1", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU) Sample with: UOPS_DISPATCHED.PORT_5", + 
"MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_5", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_6", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_6", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_2", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_3", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": 
"TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data) Sample with: UOPS_DISPATCHED_PORT.PORT_4", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_4", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address) Sample with: UOPS_DISPATCHED_PORT.PORT_7", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_7", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Retiring", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. " + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. 
Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "tma_retiring - tma_heavy_operations", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Retiring_SMT", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "INST_RETIRED.X87 * UPI / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. 
See Tip under Tuning Hint.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "tma_microcode_sequencer", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / SLOTS", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. 
Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. Sample with: OTHER_ASSISTS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_cisc", + "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. 
Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.", + "ScaleUnit": "100%" }, { "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC" }, @@ -76,8 +502,8 @@ }, { "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / (INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "Pipeline;Mem", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", "MetricName": "CPI" }, { @@ -88,37 +514,25 @@ }, { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "TmaL1", + "MetricExpr": "4 * CORE_CLKS", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS" }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "TmaL1_SMT", - "MetricName": "SLOTS_SMT" - }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC" }, - { - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;SMT;TmaL1_SMT", - "MetricName": "CoreIPC_SMT" - }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) 
per-core", - "MetricExpr": "( UOPS_EXECUTED.CORE / 2 / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) ) if #SMT_on else UOPS_EXECUTED.CORE / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@)", + "MetricExpr": "(UOPS_EXECUTED.CORE / 2 / ((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@)) if #SMT_on else UOPS_EXECUTED.CORE / ((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", + "MetricExpr": "((CPU_CLK_UNHALTED.THREAD / 2) * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK)) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2) if #SMT_on else CLKS", "MetricGroup": "SMT", "MetricName": "CORE_CLKS" }, @@ -159,9 +573,9 @@ "MetricName": "BpTkBranch" }, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions" }, { @@ -172,7 +586,7 @@ }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", - "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )", + "MetricExpr": "IDQ.DSB_UOPS / ((IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS))", "MetricGroup": "DSB;Fed;FetchBW", "MetricName": "DSB_Coverage" }, @@ -184,47 +598,41 @@ }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load 
operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb)", "MetricGroup": "Mem;MemoryBound;MemoryLat", "MetricName": "Load_Miss_Real_Latency" }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBound;MemoryBW", + "MetricGroup": "Mem;MemoryBW;MemoryBound", "MetricName": "MLP" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;Backend;CacheMisses", + "MetricGroup": "Backend;CacheMisses;Mem", "MetricName": "L2MPKI" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L3MPKI" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION) / CORE_CLKS", "MetricGroup": "Mem;MemoryTLB", "MetricName": "Page_Walks_Utilization" }, - { - 
"BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Mem;MemoryTLB_SMT", - "MetricName": "Page_Walks_Utilization_SMT" - }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time", @@ -245,19 +653,19 @@ }, { "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricExpr": "L1D_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L1D_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricExpr": "L2_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L2_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L3_Cache_Fill_BW_1T" }, @@ -275,19 +683,19 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / 
CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", - "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0", "MetricGroup": "SMT", "MetricName": "SMT_2T_Utilization" }, @@ -305,7 +713,7 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * ( arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@ ) / 1000000 / duration_time / 1000", + "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1000000 / duration_time / 1000", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use" }, diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 6c8188404db89..63a0e98fd1162 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -8,7 +8,7 @@ GenuineIntel-6-55-[56789ABCDEF],v1.16,cascadelakex,core GenuineIntel-6-9[6C],v1.03,elkhartlake,core GenuineIntel-6-5[CF],v13,goldmont,core GenuineIntel-6-7A,v1.01,goldmontplus,core -GenuineIntel-6-(3C|45|46),v31,haswell,core +GenuineIntel-6-(3C|45|46),v32,haswell,core GenuineIntel-6-3F,v25,haswellx,core GenuineIntel-6-(7D|7E|A7),v1.14,icelake,core GenuineIntel-6-6[AC],v1.15,icelakex,core -- GitLab From 08ce57dd1b89f0e125c8d0cb03c9578ecae348c1 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:16:01 -0700 Subject: [PATCH 1421/2223] perf vendor events: Update Intel haswellx Events are updated to v26, the core metrics are based on TMA 4.4 full. 
Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Uncore event updates by Zhengjun Xing <zhengjun.xing@linux.intel.com>. - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. - _SMT suffix metrics are dropped as the #SMT_On and #EBS_Mode are correctly expanded in the single main metric. - Addition of all 6 levels of TMA metrics. Previously metrics involving topdown events were dropped. Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. - Latest metrics from: https://github.com/intel/perfmon-metrics Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: 
Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-13-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../pmu-events/arch/x86/haswellx/cache.json | 2 +- .../arch/x86/haswellx/frontend.json | 12 +- .../arch/x86/haswellx/hsx-metrics.json | 919 +++++++++++------- .../x86/haswellx/uncore-interconnect.json | 18 +- .../arch/x86/haswellx/uncore-memory.json | 18 +- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 6 files changed, 615 insertions(+), 356 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/haswellx/cache.json b/tools/perf/pmu-events/arch/x86/haswellx/cache.json index 7557a203a1b66..427c949bed6ed 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/cache.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/cache.json @@ -691,7 +691,7 @@ "UMask": "0x8" }, { - "BriefDescription": "Cacheable and noncachaeble code read requests", + "BriefDescription": "Cacheable and noncacheable code read requests", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0xB0", diff --git a/tools/perf/pmu-events/arch/x86/haswellx/frontend.json b/tools/perf/pmu-events/arch/x86/haswellx/frontend.json index c45a09abe5d3f..18a993297108c 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/frontend.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/frontend.json @@ -161,7 +161,7 @@ "UMask": "0x4" }, { - "BriefDescription": "Cycles when uops are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy", + "BriefDescription": "Cycles when uops are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "CounterMask": "1", @@ -172,7 +172,7 @@ "UMask": "0x30" }, { - "BriefDescription": "Cycles when uops initiated by Decode Stream Buffer (DSB) 
are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy.", + "BriefDescription": "Cycles when uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy.", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "CounterMask": "1", @@ -182,7 +182,7 @@ "UMask": "0x10" }, { - "BriefDescription": "Deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while Microcode Sequenser (MS) is busy.", + "BriefDescription": "Deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while Microcode Sequencer (MS) is busy.", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "CounterMask": "1", @@ -193,7 +193,7 @@ "UMask": "0x10" }, { - "BriefDescription": "Uops initiated by Decode Stream Buffer (DSB) that are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy", + "BriefDescription": "Uops initiated by Decode Stream Buffer (DSB) that are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0x79", @@ -203,7 +203,7 @@ "UMask": "0x10" }, { - "BriefDescription": "Uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy", + "BriefDescription": "Uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0x79", @@ -224,7 +224,7 @@ "UMask": "0x30" }, { - "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy", + "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0x79", diff --git 
a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json index d31d76db9d84d..2cd86750986af 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json @@ -1,64 +1,514 @@ [ { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Frontend_Bound", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound." + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). 
Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Frontend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / SLOTS", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", + "MetricExpr": "ICACHE.IFDATA_STALL / CLKS", + "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. 
Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of mispredicted branches. For example; branchy code with lots of mispredictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivers higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "ILD_STALL.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. 
#Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "2 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "tma_frontend_bound - tma_fetch_latency", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. 
In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_dsb", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Bad_Speculation", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. 
This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example." + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Bad_Speculation_SMT", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. 
Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. 
Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) )", - "MetricGroup": "TopdownL1", - "MetricName": "Backend_Bound", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound." + "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. 
Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if (IPC > 1.8) else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB) if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if (IPC > 1.8) else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB)) * tma_backend_bound", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. 
This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / CLKS, 0)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. 
This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_UOPS_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "(MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / CLKS", + "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. 
Sample with: MEM_UOPS_RETIRED.LOCK_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_UOPS_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_4k_aliasing", + "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "Load_Miss_Real_Latency * cpu@L1D_PEND_MISS.REQUEST_FB_FULL\\,cmask\\=1@ / CLKS", + "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. 
The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. 
Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", + "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_contested_accesses", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / CLKS", + "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / CLKS", + "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "((OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2) if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / CORE_CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). 
The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "(1 - (MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS))) * CYCLE_ACTIVITY.STALLS_L2_PENDING / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. 
This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", + "MetricExpr": "200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / CLKS", + "MetricGroup": "Server;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_local_dram", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. 
Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", + "MetricExpr": "310 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / CLKS", + "MetricGroup": "Server;Snoop;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_remote_dram", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM_PS", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) )", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Backend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", + "MetricExpr": "(200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 180 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / CLKS", + "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_remote_cache", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues. This is caused often due to non-optimal NUMA allocations. 
#link to NUMA article Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM_PS;MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "RESOURCE_STALLS.SB / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_UOPS_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "((L2_RQSTS.RFO_HIT * 9 * (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES))) + (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_store_latency", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. 
contention on L1D fill-buffer entries - see FB_Full)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "(200 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM + 60 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / CORE_CLKS", + "MetricGroup": "TopdownL4;tma_store_bound_group", + "MetricName": "tma_split_stores", + "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_UOPS_RETIRED.SPLIT_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + DTLB_STORE_MISSES.WALK_DURATION) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. 
Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_UOPS_RETIRED.STLB_MISS_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck", + "MetricExpr": "tma_backend_bound - tma_memory_bound", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "10 * ARITH.DIVIDER_UOPS / CORE_CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. 
Sample with: ARITH.DIVIDER_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if (IPC > 1.8) else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB) if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if (IPC > 1.8) else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB) - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. 
For example; when there are too many multiply operations.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@) / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else 0) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or over oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. 
walking a linked list) - looking at the assembly can be helpful.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / (4 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on 
execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED_PORT.PORT_0", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / CORE_CLKS", + "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_0", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_1", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_1", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU) Sample with: UOPS_DISPATCHED.PORT_5", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_5", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_6", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_6", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: 
UOPS_DISPATCHED_PORT.PORT_2", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_3", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data) Sample with: UOPS_DISPATCHED_PORT.PORT_4", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_4", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address) Sample with: UOPS_DISPATCHED_PORT.PORT_7", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_7", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Retiring", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. 
issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. " + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "tma_retiring - tma_heavy_operations", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. 
A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "INST_RETIRED.X87 * UPI / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "tma_microcode_sequencer", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Retiring_SMT", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. 
Sample with: IDQ.MS_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / SLOTS", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. Sample with: OTHER_ASSISTS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_cisc", + "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. 
Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.", + "ScaleUnit": "100%" }, { "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC" }, @@ -74,6 +524,12 @@ "MetricGroup": "Branches;Fed;FetchBW", "MetricName": "UpTB" }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", + "MetricName": "CPI" + }, { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "CPU_CLK_UNHALTED.THREAD", @@ -82,37 +538,25 @@ }, { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "TmaL1", + "MetricExpr": "4 * CORE_CLKS", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS" }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "TmaL1_SMT", - "MetricName": "SLOTS_SMT" - }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC" }, - { - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;SMT;TmaL1_SMT", - "MetricName": "CoreIPC_SMT" - }, { 
"BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "( UOPS_EXECUTED.CORE / 2 / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) ) if #SMT_on else UOPS_EXECUTED.CORE / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@)", + "MetricExpr": "(UOPS_EXECUTED.CORE / 2 / ((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@)) if #SMT_on else UOPS_EXECUTED.CORE / ((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", + "MetricExpr": "((CPU_CLK_UNHALTED.THREAD / 2) * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK)) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2) if #SMT_on else CLKS", "MetricGroup": "SMT", "MetricName": "CORE_CLKS" }, @@ -153,9 +597,9 @@ "MetricName": "BpTkBranch" }, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions" }, { @@ -166,7 +610,7 @@ }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", - "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )", + "MetricExpr": "IDQ.DSB_UOPS / ((IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS))", "MetricGroup": "DSB;Fed;FetchBW", "MetricName": "DSB_Coverage" }, @@ 
-178,47 +622,41 @@ }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb)", "MetricGroup": "Mem;MemoryBound;MemoryLat", "MetricName": "Load_Miss_Real_Latency" }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBound;MemoryBW", + "MetricGroup": "Mem;MemoryBW;MemoryBound", "MetricName": "MLP" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;Backend;CacheMisses", + "MetricGroup": "Backend;CacheMisses;Mem", "MetricName": "L2MPKI" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L3MPKI" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION) / 
CORE_CLKS", "MetricGroup": "Mem;MemoryTLB", "MetricName": "Page_Walks_Utilization" }, - { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Mem;MemoryTLB_SMT", - "MetricName": "Page_Walks_Utilization_SMT" - }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time", @@ -239,19 +677,19 @@ }, { "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricExpr": "L1D_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L1D_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricExpr": "L2_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L2_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L3_Cache_Fill_BW_1T" }, @@ -269,19 +707,19 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency" }, { "BriefDescription": "Average Frequency Utilization 
relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", - "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0", "MetricGroup": "SMT", "MetricName": "SMT_2T_Utilization" }, @@ -299,13 +737,13 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "( 64 * ( uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@ ) / 1000000000 ) / duration_time", + "MetricExpr": "(64 * (uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@) / 1000000000) / duration_time", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use" }, { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds). 
Accounts for demand loads and L1/L2 prefetches", - "MetricExpr": "1000000000 * ( cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182@ / cbox@event\\=0x35\\,umask\\=0x3\\,filter_opc\\=0x182@ ) / ( cbox_0@event\\=0x0@ / duration_time )", + "MetricExpr": "1000000000 * (cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182@ / cbox@event\\=0x35\\,umask\\=0x3\\,filter_opc\\=0x182@) / (Socket_CLKS / duration_time)", "MetricGroup": "Mem;MemoryLat;SoC", "MetricName": "MEM_Read_Latency" }, @@ -321,12 +759,6 @@ "MetricGroup": "SoC", "MetricName": "Socket_CLKS" }, - { - "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "cbox_0@event\\=0x0@ / #num_dies / duration_time / 1000000000", - "MetricGroup": "SoC", - "MetricName": "UNCORE_FREQ" - }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", @@ -375,403 +807,234 @@ "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency" }, + { + "BriefDescription": "Uncore frequency per die [GHZ]", + "MetricExpr": "Socket_CLKS / #num_dies / duration_time / 1000000000", + "MetricGroup": "SoC", + "MetricName": "UNCORE_FREQ" + }, { "BriefDescription": "CPU operating frequency (in GHz)", - "MetricExpr": "( CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ ) / 1000000000", + "MetricExpr": "(( CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ ) / 1000000000) / duration_time", "MetricGroup": "", "MetricName": "cpu_operating_frequency", "ScaleUnit": "1GHz" }, - { - "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.", - "MetricExpr": " CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY ", - "MetricGroup": "", - "MetricName": "cpi", - "ScaleUnit": "1per_instr" - }, { "BriefDescription": "The ratio of number of 
completed memory load instructions to the total number completed instructions", - "MetricExpr": " MEM_UOPS_RETIRED.ALL_LOADS / INST_RETIRED.ANY ", + "MetricExpr": "MEM_UOPS_RETIRED.ALL_LOADS / INST_RETIRED.ANY", "MetricGroup": "", "MetricName": "loads_per_instr", "ScaleUnit": "1per_instr" }, { "BriefDescription": "The ratio of number of completed memory store instructions to the total number completed instructions", - "MetricExpr": " MEM_UOPS_RETIRED.ALL_STORES / INST_RETIRED.ANY ", + "MetricExpr": "MEM_UOPS_RETIRED.ALL_STORES / INST_RETIRED.ANY", "MetricGroup": "", "MetricName": "stores_per_instr", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions", - "MetricExpr": " L1D.REPLACEMENT / INST_RETIRED.ANY ", + "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "l1d_mpi_includes_data_plus_rfo_with_prefetches", + "MetricName": "l1d_mpi", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of demand load requests hitting in L1 data cache to the total number of completed instructions", - "MetricExpr": " MEM_LOAD_UOPS_RETIRED.L1_HIT / INST_RETIRED.ANY ", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L1_HIT / INST_RETIRED.ANY", "MetricGroup": "", "MetricName": "l1d_demand_data_read_hits_per_instr", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of code read requests missing in L1 instruction cache (includes prefetches) to the total number of completed instructions", - "MetricExpr": " L2_RQSTS.ALL_CODE_RD / INST_RETIRED.ANY ", + "MetricExpr": "L2_RQSTS.ALL_CODE_RD / INST_RETIRED.ANY", "MetricGroup": "", "MetricName": "l1_i_code_read_misses_with_prefetches_per_instr", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of completed demand load requests hitting in L2 cache to the total number of completed instructions", - "MetricExpr": " MEM_LOAD_UOPS_RETIRED.L2_HIT 
/ INST_RETIRED.ANY ", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L2_HIT / INST_RETIRED.ANY", "MetricGroup": "", "MetricName": "l2_demand_data_read_hits_per_instr", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions", - "MetricExpr": " L2_LINES_IN.ALL / INST_RETIRED.ANY ", + "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "l2_mpi_includes_code_plus_data_plus_rfo_with_prefetches", + "MetricName": "l2_mpi", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of completed data read request missing L2 cache to the total number of completed instructions", - "MetricExpr": " MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY ", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", "MetricGroup": "", "MetricName": "l2_demand_data_read_mpi", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of code read request missing L2 cache to the total number of completed instructions", - "MetricExpr": " L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY ", + "MetricExpr": "L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", "MetricGroup": "", "MetricName": "l2_demand_code_mpi", "ScaleUnit": "1per_instr" }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) in nano seconds", + "MetricExpr": "( 1000000000 * ( cbox@UNC_C_TOR_OCCUPANCY.MISS_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ ) / ( UNC_C_CLOCKTICKS / ( #num_cores / #num_packages * #num_packages ) ) ) * duration_time", + "MetricGroup": "", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to local memory in nano seconds", + "MetricExpr": "( 
1000000000 * ( cbox@UNC_C_TOR_OCCUPANCY.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ ) / ( UNC_C_CLOCKTICKS / ( #num_cores / #num_packages * #num_packages ) ) ) * duration_time", + "MetricGroup": "", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_local_requests", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to remote memory in nano seconds", + "MetricExpr": "( 1000000000 * ( cbox@UNC_C_TOR_OCCUPANCY.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ ) / ( UNC_C_CLOCKTICKS / ( #num_cores / #num_packages * #num_packages ) ) ) * duration_time", + "MetricGroup": "", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_remote_requests", + "ScaleUnit": "1ns" + }, { "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB.", - "MetricExpr": " ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY ", + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "", "MetricName": "itlb_mpi", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions. 
This implies it missed in the Instruction Translation Lookaside Buffer (ITLB) and further levels of TLB.", - "MetricExpr": " ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY ", + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", "MetricGroup": "", "MetricName": "itlb_large_page_mpi", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", - "MetricExpr": " DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY ", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "", "MetricName": "dtlb_load_mpi", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", - "MetricExpr": " DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY ", + "MetricExpr": "DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "", "MetricName": "dtlb_store_mpi", "ScaleUnit": "1per_instr" }, + { + "BriefDescription": "Uncore operating frequency in GHz", + "MetricExpr": "( UNC_C_CLOCKTICKS / ( #num_cores / #num_packages * #num_packages ) / 1000000000) / duration_time", + "MetricGroup": "", + "MetricName": "uncore_frequency", + "ScaleUnit": "1GHz" + }, { "BriefDescription": "Intel(R) Quick Path Interconnect (QPI) data transmit bandwidth (MB/sec)", - "MetricExpr": "( UNC_Q_TxL_FLITS_G0.DATA * 8 / 1000000) / duration_time", + "MetricExpr": "( UNC_Q_TxL_FLITS_G0.DATA * 8 / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "qpi_data_transmit_bw_only_data", + "MetricName": "qpi_data_transmit_bw", "ScaleUnit": "1MB/s" }, { "BriefDescription": "DDR memory read bandwidth (MB/sec)", - "MetricExpr": "( UNC_M_CAS_COUNT.RD * 64 / 1000000) / 
duration_time", + "MetricExpr": "( UNC_M_CAS_COUNT.RD * 64 / 1000000) / duration_time", "MetricGroup": "", "MetricName": "memory_bandwidth_read", "ScaleUnit": "1MB/s" }, { "BriefDescription": "DDR memory write bandwidth (MB/sec)", - "MetricExpr": "( UNC_M_CAS_COUNT.WR * 64 / 1000000) / duration_time", + "MetricExpr": "( UNC_M_CAS_COUNT.WR * 64 / 1000000) / duration_time", "MetricGroup": "", "MetricName": "memory_bandwidth_write", "ScaleUnit": "1MB/s" }, { "BriefDescription": "DDR memory bandwidth (MB/sec)", - "MetricExpr": "(( UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR ) * 64 / 1000000) / duration_time", + "MetricExpr": "(( UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR ) * 64 / 1000000) / duration_time", "MetricGroup": "", "MetricName": "memory_bandwidth_total", "ScaleUnit": "1MB/s" }, { "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.", - "MetricExpr": "( cbox@UNC_C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x19e@ * 64 / 1000000) / duration_time", + "MetricExpr": "( cbox@UNC_C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x19e@ * 64 / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "io_bandwidth_read", + "MetricName": "io_bandwidth_disk_or_network_writes", "ScaleUnit": "1MB/s" }, { "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.", - "MetricExpr": "( cbox@UNC_C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x1c8\\,filter_tid\\=0x3e@ * 64 / 1000000) / duration_time", + "MetricExpr": "( cbox@UNC_C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x1c8\\,filter_tid\\=0x3e@ * 64 / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "io_bandwidth_write", + "MetricName": "io_bandwidth_disk_or_network_reads", "ScaleUnit": "1MB/s" }, { "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue", - "MetricExpr": "100 * ( IDQ.DSB_UOPS / 
UOPS_ISSUED.ANY )", + "MetricExpr": "100 * ( IDQ.DSB_UOPS / UOPS_ISSUED.ANY )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_frodecoded_icache_dsb", + "MetricName": "percent_uops_delivered_from_decoded_icache", "ScaleUnit": "1%" }, { "BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue", - "MetricExpr": "100 * ( IDQ.MITE_UOPS / UOPS_ISSUED.ANY )", + "MetricExpr": "100 * ( IDQ.MITE_UOPS / UOPS_ISSUED.ANY )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_frolegacy_decode_pipeline_mite", + "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline", "ScaleUnit": "1%" }, { "BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue", - "MetricExpr": "100 * ( IDQ.MS_UOPS / UOPS_ISSUED.ANY )", + "MetricExpr": "100 * ( IDQ.MS_UOPS / UOPS_ISSUED.ANY )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_fromicrocode_sequencer_ms", + "MetricName": "percent_uops_delivered_from_microcode_sequencer", "ScaleUnit": "1%" }, { "BriefDescription": "Uops delivered from loop stream detector(LSD) as a percent of total uops delivered to Instruction Decode Queue", - "MetricExpr": "100 * ( UOPS_ISSUED.ANY - IDQ.MITE_UOPS - IDQ.MS_UOPS - IDQ.DSB_UOPS ) / UOPS_ISSUED.ANY ", + "MetricExpr": "100 * ( UOPS_ISSUED.ANY - IDQ.MITE_UOPS - IDQ.MS_UOPS - IDQ.DSB_UOPS ) / UOPS_ISSUED.ANY", "MetricGroup": "", - "MetricName": "percent_uops_delivered_froloop_streadetector_lsd", + "MetricName": "percent_uops_delivered_from_loop_stream_detector", "ScaleUnit": "1%" }, { "BriefDescription": "Ratio of number of data read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", - "MetricExpr": "( cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x192@ ) / 
INST_RETIRED.ANY ", + "MetricExpr": "( cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x192@ ) / INST_RETIRED.ANY", "MetricGroup": "", "MetricName": "llc_data_read_mpi_demand_plus_prefetch", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Ratio of number of code read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", - "MetricExpr": "( cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x181@ + cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x191@ ) / INST_RETIRED.ANY ", + "MetricExpr": "( cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x181@ + cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x191@ ) / INST_RETIRED.ANY", "MetricGroup": "", "MetricName": "llc_code_read_mpi_demand_plus_prefetch", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", - "MetricExpr": "100 * cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ / ( cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ )", + "MetricExpr": "100 * cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ / ( cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ )", "MetricGroup": "", - "MetricName": "numa_percent_reads_addressed_to_local_dram", + "MetricName": "numa_reads_addressed_to_local_dram", "ScaleUnit": "1%" }, { "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", - "MetricExpr": "100 * cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ / ( cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ + 
cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ )", - "MetricGroup": "", - "MetricName": "numa_percent_reads_addressed_to_remote_dram", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", - "MetricExpr": "100 * ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "TmaL1, PGO", - "MetricName": "tma_frontend_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. 
In such cases; the Frontend eventually delivers no uops for some period.", - "MetricExpr": "100 * ( ( 4 ) * ( min( CPU_CLK_UNHALTED.THREAD , IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "Frontend, TmaL2", - "MetricName": "tma_fetch_latency_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", - "MetricExpr": "100 * ( ICACHE.IFDATA_STALL / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "BigFoot, FetchLat, IcMiss", - "MetricName": "tma_icache_misses_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses.", - "MetricExpr": "100 * ( ( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "BigFoot, FetchLat, MemoryTLB", - "MetricName": "tma_itlb_misses_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings.", - "MetricExpr": "100 * ( ( 12 ) * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "FetchLat", - "MetricName": "tma_branch_resteers_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. 
The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.", - "MetricExpr": "100 * ( DSB2MITE_SWITCHES.PENALTY_CYCLES / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "DSBmiss, FetchLat", - "MetricName": "tma_dsb_switches_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", - "MetricExpr": "100 * ( ILD_STALL.LCP / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "FetchLat", - "MetricName": "tma_lcp_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals.", - "MetricExpr": "100 * ( ( 2 ) * IDQ.MS_SWITCHES / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "FetchLat, MicroSeq", - "MetricName": "tma_ms_switches_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. 
In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.", - "MetricExpr": "100 * ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( 4 ) * ( min( CPU_CLK_UNHALTED.THREAD , IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )", - "MetricGroup": "FetchBW, Frontend, TmaL2", - "MetricName": "tma_fetch_bandwidth_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", - "MetricExpr": "100 * ( ( IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS ) / ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) / 2 )", - "MetricGroup": "DSBmiss, FetchBW", - "MetricName": "tma_mite_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", - "MetricExpr": "100 * ( ( IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS ) / ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) / 2 )", - "MetricGroup": "DSB, FetchBW", - "MetricName": "tma_dsb_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. 
This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", - "MetricExpr": "100 * ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "TmaL1", - "MetricName": "tma_bad_speculation_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path.", - "MetricExpr": "100 * ( ( BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT ) ) * ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )", - "MetricGroup": "BadSpec, BrMispredicts, TmaL2", - "MetricName": "tma_branch_mispredicts_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. 
Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.", - "MetricExpr": "100 * ( ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT ) ) * ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) )", - "MetricGroup": "BadSpec, MachineClears, TmaL2", - "MetricName": "tma_machine_clears_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. 
Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", - "MetricExpr": "100 * ( 1 - ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) )", - "MetricGroup": "TmaL1", - "MetricName": "tma_backend_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", - "MetricExpr": "100 * ( ( ( ( min( CPU_CLK_UNHALTED.THREAD , CYCLE_ACTIVITY.STALLS_LDM_PENDING ) ) + RESOURCE_STALLS.SB ) / ( ( ( min( CPU_CLK_UNHALTED.THREAD , CYCLE_ACTIVITY.CYCLES_NO_EXECUTE ) ) + ( cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x1@ - ( cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x3@ if ( ( INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD ) ) > 1.8 ) else cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x2@ ) ) / 2 - ( RS_EVENTS.EMPTY_CYCLES if ( ( ( 4 ) * ( min( CPU_CLK_UNHALTED.THREAD , IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) > 0.1 ) else 0 ) + RESOURCE_STALLS.SB ) if #SMT_on else ( ( min( CPU_CLK_UNHALTED.THREAD , 
CYCLE_ACTIVITY.CYCLES_NO_EXECUTE ) ) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x1@ - ( cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x3@ if ( ( INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD ) ) > 1.8 ) else cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x2@ ) - ( RS_EVENTS.EMPTY_CYCLES if ( ( ( 4 ) * ( min( CPU_CLK_UNHALTED.THREAD , IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) > 0.1 ) else 0 ) + RESOURCE_STALLS.SB ) ) ) * ( 1 - ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) ) )", - "MetricGroup": "Backend, TmaL2", - "MetricName": "tma_memory_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. 
These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache.", - "MetricExpr": "100 * ( max( ( ( min( CPU_CLK_UNHALTED.THREAD , CYCLE_ACTIVITY.STALLS_LDM_PENDING ) ) - CYCLE_ACTIVITY.STALLS_L1D_PENDING ) / ( CPU_CLK_UNHALTED.THREAD ) , 0 ) )", - "MetricGroup": "CacheMisses, MemoryBound, TmaL3mem", - "MetricName": "tma_l1_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance.", - "MetricExpr": "100 * ( ( CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "CacheMisses, MemoryBound, TmaL3mem", - "MetricName": "tma_l2_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance.", - "MetricExpr": "100 * ( ( MEM_LOAD_UOPS_RETIRED.L3_HIT / ( MEM_LOAD_UOPS_RETIRED.L3_HIT + ( 7 ) * MEM_LOAD_UOPS_RETIRED.L3_MISS ) ) * CYCLE_ACTIVITY.STALLS_L2_PENDING / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "CacheMisses, MemoryBound, TmaL3mem", - "MetricName": "tma_l3_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. 
Better caching can improve the latency and increase performance.", - "MetricExpr": "100 * ( min( ( ( 1 - ( MEM_LOAD_UOPS_RETIRED.L3_HIT / ( MEM_LOAD_UOPS_RETIRED.L3_HIT + ( 7 ) * MEM_LOAD_UOPS_RETIRED.L3_MISS ) ) ) * CYCLE_ACTIVITY.STALLS_L2_PENDING / ( CPU_CLK_UNHALTED.THREAD ) ) , ( 1 ) ) )", - "MetricGroup": "MemoryBound, TmaL3mem", - "MetricName": "tma_drabound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck.", - "MetricExpr": "100 * ( RESOURCE_STALLS.SB / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "MemoryBound, TmaL3mem", - "MetricName": "tma_store_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. 
FP-chained long-latency arithmetic operations).", - "MetricExpr": "100 * ( ( 1 - ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) ) - ( ( ( ( min( CPU_CLK_UNHALTED.THREAD , CYCLE_ACTIVITY.STALLS_LDM_PENDING ) ) + RESOURCE_STALLS.SB ) / ( ( ( min( CPU_CLK_UNHALTED.THREAD , CYCLE_ACTIVITY.CYCLES_NO_EXECUTE ) ) + ( cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x1@ - ( cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x3@ if ( ( INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD ) ) > 1.8 ) else cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x2@ ) ) / 2 - ( RS_EVENTS.EMPTY_CYCLES if ( ( ( 4 ) * ( min( CPU_CLK_UNHALTED.THREAD , IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) > 0.1 ) else 0 ) + RESOURCE_STALLS.SB ) if #SMT_on else ( ( min( CPU_CLK_UNHALTED.THREAD , CYCLE_ACTIVITY.CYCLES_NO_EXECUTE ) ) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x1@ - ( cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x3@ if ( ( INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD ) ) > 1.8 ) else cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x2@ ) - ( RS_EVENTS.EMPTY_CYCLES if ( ( ( 4 ) * ( min( CPU_CLK_UNHALTED.THREAD , IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) > 0.1 ) else 0 ) + RESOURCE_STALLS.SB ) ) ) * ( 1 - ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( 
( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) ) ) )", - "MetricGroup": "Backend, TmaL2, Compute", - "MetricName": "tma_core_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication.", - "MetricExpr": "100 * ( 10 * ARITH.DIVIDER_UOPS / ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) )", + "MetricExpr": "100 * cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ / ( cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ )", "MetricGroup": "", - "MetricName": "tma_divider_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. 
For example; when there are too many multiply operations.", - "MetricExpr": "100 * ( ( ( ( ( min( CPU_CLK_UNHALTED.THREAD , CYCLE_ACTIVITY.CYCLES_NO_EXECUTE ) ) + ( cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x1@ - ( cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x3@ if ( ( INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD ) ) > 1.8 ) else cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x2@ ) ) / 2 - ( RS_EVENTS.EMPTY_CYCLES if ( ( ( 4 ) * ( min( CPU_CLK_UNHALTED.THREAD , IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) > 0.1 ) else 0 ) + RESOURCE_STALLS.SB ) if #SMT_on else ( ( min( CPU_CLK_UNHALTED.THREAD , CYCLE_ACTIVITY.CYCLES_NO_EXECUTE ) ) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x1@ - ( cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x3@ if ( ( INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD ) ) > 1.8 ) else cpu@UOPS_EXECUTED.CORE\\,cmask\\=0x2@ ) - ( RS_EVENTS.EMPTY_CYCLES if ( ( ( 4 ) * ( min( CPU_CLK_UNHALTED.THREAD , IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) > 0.1 ) else 0 ) + RESOURCE_STALLS.SB ) ) - RESOURCE_STALLS.SB - ( min( CPU_CLK_UNHALTED.THREAD , CYCLE_ACTIVITY.STALLS_LDM_PENDING ) ) ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "PortsUtil", - "MetricName": "tma_ports_utilization_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. 
They often indicate suboptimal performance and can often be optimized or avoided. ", - "MetricExpr": "100 * ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "TmaL1", - "MetricName": "tma_retiring_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved.", - "MetricExpr": "100 * ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) )", - "MetricGroup": "Retire, TmaL2", - "MetricName": "tma_light_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. 
This highly-correlates with the uop length of these instructions/sequences.", - "MetricExpr": "100 * ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )", - "MetricGroup": "Retire, TmaL2", - "MetricName": "tma_heavy_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided.", - "MetricExpr": "100 * ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )", - "MetricGroup": "MicroSeq", - "MetricName": "tma_microcode_sequencer_percent", + "MetricName": "numa_reads_addressed_to_remote_dram", "ScaleUnit": "1%" } ] diff --git a/tools/perf/pmu-events/arch/x86/haswellx/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/haswellx/uncore-interconnect.json index 3e48ff3516b0f..eb0a05fbb7048 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/uncore-interconnect.json @@ -981,36 +981,34 @@ "Unit": "QPI LL" }, { - "BriefDescription": "Number of data flits transmitted . Derived from unc_q_txl_flits_g0.data", + "BriefDescription": "Flits Transferred - Group 0; Data Tx Flits", "Counter": "0,1,2,3", - "EventName": "QPI_DATA_BANDWIDTH_TX", + "EventName": "UNC_Q_TxL_FLITS_G0.DATA", "PerPkg": "1", - "ScaleUnit": "8Bytes", "UMask": "0x2", "Unit": "QPI LL" }, { - "BriefDescription": "Number of data flits transmitted ", + "BriefDescription": "Number of data flits transmitted . 
Derived from unc_q_txl_flits_g0.data", "Counter": "0,1,2,3", - "EventName": "UNC_Q_TxL_FLITS_G0.DATA", + "EventName": "QPI_DATA_BANDWIDTH_TX", "PerPkg": "1", "ScaleUnit": "8Bytes", "UMask": "0x2", "Unit": "QPI LL" }, { - "BriefDescription": "Number of non data (control) flits transmitted . Derived from unc_q_txl_flits_g0.non_data", + "BriefDescription": "Flits Transferred - Group 0; Non-Data protocol Tx Flits", "Counter": "0,1,2,3", - "EventName": "QPI_CTL_BANDWIDTH_TX", + "EventName": "UNC_Q_TxL_FLITS_G0.NON_DATA", "PerPkg": "1", - "ScaleUnit": "8Bytes", "UMask": "0x4", "Unit": "QPI LL" }, { - "BriefDescription": "Number of non data (control) flits transmitted ", + "BriefDescription": "Number of non data (control) flits transmitted . Derived from unc_q_txl_flits_g0.non_data", "Counter": "0,1,2,3", - "EventName": "UNC_Q_TxL_FLITS_G0.NON_DATA", + "EventName": "QPI_CTL_BANDWIDTH_TX", "PerPkg": "1", "ScaleUnit": "8Bytes", "UMask": "0x4", diff --git a/tools/perf/pmu-events/arch/x86/haswellx/uncore-memory.json b/tools/perf/pmu-events/arch/x86/haswellx/uncore-memory.json index db3418db312e1..c003daa9ed8cf 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/uncore-memory.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/uncore-memory.json @@ -72,20 +72,19 @@ "Unit": "iMC" }, { - "BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd", + "BriefDescription": "DRAM RD_CAS and WR_CAS Commands.; All DRAM Reads (RD_CAS + Underfills)", "Counter": "0,1,2,3", "EventCode": "0x4", - "EventName": "LLC_MISSES.MEM_READ", + "EventName": "UNC_M_CAS_COUNT.RD", "PerPkg": "1", - "ScaleUnit": "64Bytes", "UMask": "0x3", "Unit": "iMC" }, { - "BriefDescription": "read requests to memory controller", + "BriefDescription": "read requests to memory controller. 
Derived from unc_m_cas_count.rd", "Counter": "0,1,2,3", "EventCode": "0x4", - "EventName": "UNC_M_CAS_COUNT.RD", + "EventName": "LLC_MISSES.MEM_READ", "PerPkg": "1", "ScaleUnit": "64Bytes", "UMask": "0x3", @@ -110,20 +109,19 @@ "Unit": "iMC" }, { - "BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr", + "BriefDescription": "DRAM RD_CAS and WR_CAS Commands.; All DRAM WR_CAS (both Modes)", "Counter": "0,1,2,3", "EventCode": "0x4", - "EventName": "LLC_MISSES.MEM_WRITE", + "EventName": "UNC_M_CAS_COUNT.WR", "PerPkg": "1", - "ScaleUnit": "64Bytes", "UMask": "0xC", "Unit": "iMC" }, { - "BriefDescription": "write requests to memory controller", + "BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr", "Counter": "0,1,2,3", "EventCode": "0x4", - "EventName": "UNC_M_CAS_COUNT.WR", + "EventName": "LLC_MISSES.MEM_WRITE", "PerPkg": "1", "ScaleUnit": "64Bytes", "UMask": "0xC", diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 63a0e98fd1162..ddc9fc8b7171a 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -9,7 +9,7 @@ GenuineIntel-6-9[6C],v1.03,elkhartlake,core GenuineIntel-6-5[CF],v13,goldmont,core GenuineIntel-6-7A,v1.01,goldmontplus,core GenuineIntel-6-(3C|45|46),v32,haswell,core -GenuineIntel-6-3F,v25,haswellx,core +GenuineIntel-6-3F,v26,haswellx,core GenuineIntel-6-(7D|7E|A7),v1.14,icelake,core GenuineIntel-6-6[AC],v1.15,icelakex,core GenuineIntel-6-3A,v22,ivybridge,core -- GitLab From 8fb4ddf499ebbdeaa1bafb16f2c0b6818325d981 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:16:02 -0700 Subject: [PATCH 1422/2223] perf vendor events: Update Intel icelake Events are updated to v1.15, the metrics are based on TMA 4.4 full. 
Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. - Addition of all 6 levels of TMA metrics. Previously metrics involving topdown events were dropped. Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-14-irogers@google.com 
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../pmu-events/arch/x86/icelake/cache.json | 6 +- .../arch/x86/icelake/icl-metrics.json | 808 +++++++++++++++++- .../pmu-events/arch/x86/icelake/pipeline.json | 2 +- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 4 files changed, 766 insertions(+), 52 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/icelake/cache.json b/tools/perf/pmu-events/arch/x86/icelake/cache.json index b4f28f24ee63d..0f6b918484d50 100644 --- a/tools/perf/pmu-events/arch/x86/icelake/cache.json +++ b/tools/perf/pmu-events/arch/x86/icelake/cache.json @@ -18,13 +18,13 @@ "EventCode": "0x48", "EventName": "L1D_PEND_MISS.FB_FULL", "PEBScounters": "0,1,2,3", - "PublicDescription": "Counts number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailablability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", + "PublicDescription": "Counts number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "Speculative": "1", "UMask": "0x2" }, { - "BriefDescription": "Number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailablability.", + "BriefDescription": "Number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability.", "CollectPEBSRecord": "2", "Counter": "0,1,2,3", "CounterMask": "1", @@ -32,7 +32,7 @@ "EventCode": "0x48", "EventName": "L1D_PEND_MISS.FB_FULL_PERIODS", "PEBScounters": "0,1,2,3", - "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailablability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", + "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability. 
Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "Speculative": "1", "UMask": "0x2" diff --git a/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json b/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json index f0356d66a9271..3b5ef09eb8efc 100644 --- a/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json @@ -1,26 +1,716 @@ [ + { + "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", + "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. 
Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "(5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / SLOTS", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", + "MetricExpr": "ICACHE_16B.IFDATA_STALL / CLKS", + "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "ICACHE_64B.IFTAG_STALL / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. 
Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / CLKS + tma_unknown_branches", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. 
Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", + "MetricExpr": "(1 - (BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT))) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS", + "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_clears_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", + "MetricExpr": "10 * BACLEARS.ANY / CLKS", + "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_unknown_branches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). Sample with: BACLEARS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivers higher bandwidth than the MITE (legacy instruction decode pipeline).
Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "ILD_STALL.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "3 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. 
Sample with: IDQ.MS_SWITCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / CORE_CLKS / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. 
Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / CORE_CLKS", + "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_mite_group", + "MetricName": "tma_decoder0_alone", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where (only) 4 uops were delivered by the MITE pipeline", + "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / CLKS", + "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_mite_group", + "MetricName": "tma_mite_4wide", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", + "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / CORE_CLKS / 2", + "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_dsb", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit", + "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / CORE_CLKS / 2", + "MetricGroup": "FetchBW;LSD;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_lsd", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop supply. 
However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", + "MetricExpr": "max(1 - (tma_frontend_bound + tma_backend_bound + tma_retiring), 0)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This includes slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path.
Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "max(0, tma_bad_speculation - tma_branch_mispredicts)", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + (5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. 
Sample with: TOPDOWN.BACKEND_BOUND_SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "((CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * tma_backend_bound", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads that miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache.
Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. 
Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the (first level) DTLB was missed by load accesses, that later on hit in second-level TLB (STLB)", + "MetricExpr": "tma_dtlb_load - tma_load_stlb_miss", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_hit", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / CLKS", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_miss", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. 
For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / CLKS", + "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_4k_aliasing", + "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. 
However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "L1D_PEND_MISS.FB_FULL / CLKS", + "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "((MEM_LOAD_RETIRED.L2_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / ((MEM_LOAD_RETIRED.L2_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + L1D_PEND_MISS.FB_FULL_PERIODS)) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. 
Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", + "MetricExpr": "((29 * Average_Frequency) * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + (23.5 * Average_Frequency) * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_contested_accesses", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "(23.5 * Average_Frequency) * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "(9 * Average_Frequency) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. 
Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "L1D_PEND_MISS.L2_STALL / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L3_MISS / CLKS + ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS) - tma_l2_bound)", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). 
The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. 
Sample with: MEM_INST_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "((L2_RQSTS.RFO_HIT * 10 * (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES))) + (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_store_latency", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually have less impact on out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "(32.5 * Average_Frequency) * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / CORE_CLKS", + "MetricGroup": "TopdownL4;tma_store_bound_group", + "MetricName": "tma_split_stores", + "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. 
Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores", + "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_streaming_stores", + "PublicDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should Streaming stores be a bottleneck. Sample with: OCR.STREAMING_WR.ANY_RESPONSE", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / CORE_CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. 
Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the TLB was missed by store accesses, hitting in the second-level TLB (STLB)", + "MetricExpr": "tma_dtlb_store - tma_store_stlb_miss", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_hit", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / CORE_CLKS", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_miss", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were a bottleneck", + "MetricExpr": "max(0, tma_backend_bound - tma_memory_bound)", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "ARITH.DIVIDER_ACTIVE / CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. 
Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "(cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CLKS if (ARITH.DIVIDER_ACTIVE < (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY)) else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. 
For example; when there are too many multiply operations.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / CLKS + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", + "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / CLKS", + "MetricGroup": "TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_serializing_operation", + "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", + "MetricExpr": "140 * MISC_RETIRED.PAUSE_INST / CLKS", + "MetricGroup": "TopdownL6;tma_serializing_operation_group", + "MetricName": "tma_slow_pause", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. 
Sample with: MISC_RETIRED.PAUSE_INST", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "MetricExpr": "CLKS * UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY", + "MetricGroup": "TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_mixing_vectors", + "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful. 
Sample with: EXE_ACTIVITY.1_PORTS_UTIL", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Sample with: EXE_ACTIVITY.2_PORTS_UTIL", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). 
Sample with: UOPS_EXECUTED.CYCLES_GE_3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED.PORT_0", + "MetricExpr": "UOPS_DISPATCHED.PORT_0 / CORE_CLKS", + "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_0", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED.PORT_1", + "MetricExpr": "UOPS_DISPATCHED.PORT_1 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_1", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU) Sample with: UOPS_DISPATCHED.PORT_5", + "MetricExpr": "UOPS_DISPATCHED.PORT_5 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_5", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED.PORT_6", + "MetricExpr": "UOPS_DISPATCHED.PORT_6 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_6", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 
for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3", + "MetricExpr": "UOPS_DISPATCHED.PORT_2_3 / (2 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations Sample with: UOPS_DISPATCHED.PORT_7_8", + "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessarily mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. 
Sample with: UOPS_RETIRED.SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "max(0, tma_retiring - tma_heavy_operations)", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", + "MetricGroup": "HPC;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fp_arith", + "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "tma_retiring * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. 
It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_scalar", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_vector", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_128b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_256b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_512b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", + "MetricExpr": "tma_light_operations * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_memory_operations", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.", + "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * SLOTS)", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_branch_instructions", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", + "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * SLOTS)", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_nop_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. 
May undercount due to FMA double counting", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions + tma_nop_instructions))", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_other_light_ops", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "tma_microcode_sequencer + tma_retiring * (UOPS_DECODED.DEC0 - cpu@UOPS_DECODED.DEC0\\,cmask\\=1@) / IDQ.MITE_UOPS", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops", + "MetricExpr": "tma_heavy_operations - tma_microcode_sequencer", + "MetricGroup": "TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_few_uops_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops. 
This highly-correlates with the number of uops in such instructions.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "((tma_retiring * SLOTS) / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "100 * ASSISTS.ANY / SLOTS", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. 
Sample with: ASSISTS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_cisc", + "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "Mispredictions" + }, + { + "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", + "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) ", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "Memory_Bandwidth" + }, 
+ { + "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + (tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)))", + "MetricGroup": "Mem;MemoryLat;Offcore", + "MetricName": "Memory_Latency" + }, + { + "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricExpr": "100 * tma_memory_bound * ((tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores))) ", + "MetricGroup": "Mem;MemoryTLB;Offcore", + "MetricName": "Memory_Data_TLBs" + }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * (( BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) ) / TOPDOWN.SLOTS)", + "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / SLOTS)", "MetricGroup": "Ret", "MetricName": 
"Branching_Overhead" }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", - "MetricExpr": "100 * (( 5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING ) / TOPDOWN.SLOTS) * ( (ICACHE_64B.IFTAG_STALL / CPU_CLK_UNHALTED.THREAD) + (ICACHE_16B.IFDATA_STALL / CPU_CLK_UNHALTED.THREAD) + (10 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) ) / #(( 5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING ) / TOPDOWN.SLOTS)", + "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "Big_Code" }, + { + "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - Big_Code", + "MetricGroup": "Fed;FetchBW;Frontend", + "MetricName": "Instruction_Fetch_BW" + }, { "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC" }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "(tma_retiring * SLOTS) / INST_RETIRED.ANY", + "MetricGroup": "Pipeline;Ret;Retire", + "MetricName": "UPI" + }, + { + "BriefDescription": "Instruction per taken branch", + "MetricExpr": "(tma_retiring * SLOTS) / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;FetchBW", + "MetricName": "UpTB" + }, { "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / (INST_RETIRED.ANY / 
CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "Pipeline;Mem", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", "MetricName": "CPI" }, { @@ -32,13 +722,13 @@ { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", "MetricExpr": "TOPDOWN.SLOTS", - "MetricGroup": "TmaL1", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS" }, { "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", - "MetricExpr": "TOPDOWN.SLOTS / ( TOPDOWN.SLOTS / 2 ) if #SMT_on else 1", - "MetricGroup": "SMT;TmaL1", + "MetricExpr": "SLOTS / (TOPDOWN.SLOTS / 2) if #SMT_on else 1", + "MetricGroup": "SMT;tma_L1_group", "MetricName": "Slots_Utilization" }, { @@ -50,29 +740,35 @@ }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.DISTRIBUTED", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC" }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / CPU_CLK_UNHALTED.DISTRIBUTED", - "MetricGroup": "Ret;Flops", + "MetricExpr": "(1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / CORE_CLKS", + "MetricGroup": 
"Flops;Ret", "MetricName": "FLOPc" }, { "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) ) / ( 2 * CPU_CLK_UNHALTED.DISTRIBUTED )", + "MetricExpr": "((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)) / (2 * CORE_CLKS)", "MetricGroup": "Cor;Flops;HPC", "MetricName": "FP_Arith_Utilization", "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." 
}, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 ) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ((UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricExpr": "(1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if SMT_2T_Utilization > 0.5 else 0", + "MetricGroup": "Cor;SMT", + "MetricName": "Core_Bound_Likely" + }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", @@ -117,13 +813,13 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "IpFLOP" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower 
number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) )", + "MetricExpr": "INST_RETIRED.ANY / ((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE))", "MetricGroup": "Flops;InsType", "MetricName": "IpArith", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." @@ -144,21 +840,21 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX128", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
}, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX256", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX512", "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
@@ -170,11 +866,17 @@ "MetricName": "IpSWPF" }, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions" }, + { + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricExpr": "(tma_retiring * SLOTS) / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "Retire" + }, { "BriefDescription": "", "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", @@ -205,6 +907,12 @@ "MetricGroup": "DSBmiss", "MetricName": "DSB_Switch_Cost" }, + { + "BriefDescription": "Total penalty related to DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))", + "MetricGroup": "DSBmiss;Fed", + "MetricName": "DSB_Misses" + }, { "BriefDescription": "Number of Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", @@ -217,6 +925,12 @@ "MetricGroup": "Bad;BadSpec;BrMispredicts", "MetricName": "IpMispredict" }, + { + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricExpr": " (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": 
"Branch_Misprediction_Cost" + }, { "BriefDescription": "Fraction of branches that are non-taken conditionals", "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", @@ -231,7 +945,7 @@ }, { "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "( BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN ) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", "MetricName": "CallRet" }, @@ -243,74 +957,74 @@ }, { "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", - "MetricExpr": "1 - ( (BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES) + (BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES) + (( BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN ) / BR_INST_RETIRED.ALL_BRANCHES) + ((BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES) )", + "MetricExpr": "1 - (Cond_NT + Cond_TK + CallRet + Jump)", "MetricGroup": "Bad;Branches", "MetricName": "Other_Branches" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", "MetricGroup": "Mem;MemoryBound;MemoryLat", "MetricName": "Load_Miss_Real_Latency" }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. 
Per-Logical Processor)", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBound;MemoryBW", + "MetricGroup": "Mem;MemoryBW;MemoryBound", "MetricName": "MLP" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI_Load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;Backend;CacheMisses", + "MetricGroup": "Backend;CacheMisses;Mem", "MetricName": "L2MPKI" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", - "MetricExpr": "1000 * ( ( OFFCORE_REQUESTS.ALL_DATA_RD - OFFCORE_REQUESTS.DEMAND_DATA_RD ) + L2_RQSTS.ALL_DEMAND_MISS + L2_RQSTS.SWPF_MISS ) / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses;Offcore", + "MetricExpr": "1000 * ((OFFCORE_REQUESTS.ALL_DATA_RD - OFFCORE_REQUESTS.DEMAND_DATA_RD) + L2_RQSTS.ALL_DEMAND_MISS + L2_RQSTS.SWPF_MISS) / Instructions", + "MetricGroup": "CacheMisses;Mem;Offcore", "MetricName": "L2MPKI_All" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2MPKI_Load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", 
"MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_Load" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L3MPKI" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1000 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "FB_HPKI" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING ) / ( 2 * CPU_CLK_UNHALTED.DISTRIBUTED )", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * CORE_CLKS)", "MetricGroup": "Mem;MemoryTLB", "MetricName": "Page_Walks_Utilization" }, @@ -340,25 +1054,25 @@ }, { "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricExpr": "L1D_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L1D_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricExpr": "L2_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L2_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / 
duration_time)", + "MetricExpr": "L3_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L3_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Access_BW", "MetricGroup": "Mem;MemoryBW;Offcore", "MetricName": "L3_Cache_Access_BW_1T" }, @@ -370,40 +1084,40 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / 1000000000 ) / duration_time", + "MetricExpr": "((1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1000000000) / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "GFLOPs", "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." 
}, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization" }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0", - "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / CPU_CLK_UNHALTED.DISTRIBUTED", + "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / CORE_CLKS", "MetricGroup": "Power", "MetricName": "Power_License0_Utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes." }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1", - "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / CPU_CLK_UNHALTED.DISTRIBUTED", + "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / CORE_CLKS", "MetricGroup": "Power", "MetricName": "Power_License1_Utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions." }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)", - "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / CPU_CLK_UNHALTED.DISTRIBUTED", + "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / CORE_CLKS", "MetricGroup": "Power", "MetricName": "Power_License2_Utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). This includes high current AVX 512-bit instructions." 
@@ -428,7 +1142,7 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * ( arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@ ) / 1000000 / duration_time / 1000", + "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1000000 / duration_time / 1000", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use" }, diff --git a/tools/perf/pmu-events/arch/x86/icelake/pipeline.json b/tools/perf/pmu-events/arch/x86/icelake/pipeline.json index a017a47270506..c74a7369cff35 100644 --- a/tools/perf/pmu-events/arch/x86/icelake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/icelake/pipeline.json @@ -167,7 +167,7 @@ "UMask": "0x10" }, { - "BriefDescription": "number of branch instructions retired that were mispredicted and taken. Non PEBS", + "BriefDescription": "number of branch instructions retired that were mispredicted and taken.", "CollectPEBSRecord": "2", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc5", diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index ddc9fc8b7171a..1f5dbe176b3a8 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -10,7 +10,7 @@ GenuineIntel-6-5[CF],v13,goldmont,core GenuineIntel-6-7A,v1.01,goldmontplus,core GenuineIntel-6-(3C|45|46),v32,haswell,core GenuineIntel-6-3F,v26,haswellx,core -GenuineIntel-6-(7D|7E|A7),v1.14,icelake,core +GenuineIntel-6-(7D|7E|A7),v1.15,icelake,core GenuineIntel-6-6[AC],v1.15,icelakex,core GenuineIntel-6-3A,v22,ivybridge,core GenuineIntel-6-3E,v21,ivytown,core -- GitLab From bd035250c5e80ed24aa709b12a7e12402c0037b5 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:16:03 -0700 Subject: [PATCH 1423/2223] perf vendor events: Update Intel icelakex Events are updated to v1.16 the metrics are based on TMA 4.4 full. 
Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. - Addition of all 6 levels of TMA metrics. Previously metrics involving topdown events were dropped. Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. - Latest metrics from: https://github.com/intel/perfmon-metrics Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: 
https://lore.kernel.org/r/20221004021612.325521-15-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../pmu-events/arch/x86/icelakex/cache.json | 6 +- .../arch/x86/icelakex/icx-metrics.json | 1155 ++++++++++++----- .../arch/x86/icelakex/pipeline.json | 2 +- .../arch/x86/icelakex/uncore-other.json | 2 +- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 5 files changed, 833 insertions(+), 334 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/icelakex/cache.json b/tools/perf/pmu-events/arch/x86/icelakex/cache.json index 775190bdd0632..e4035b3e55caa 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/cache.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/cache.json @@ -18,13 +18,13 @@ "EventCode": "0x48", "EventName": "L1D_PEND_MISS.FB_FULL", "PEBScounters": "0,1,2,3", - "PublicDescription": "Counts number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailablability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", + "PublicDescription": "Counts number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "Speculative": "1", "UMask": "0x2" }, { - "BriefDescription": "Number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailablability.", + "BriefDescription": "Number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability.", "CollectPEBSRecord": "2", "Counter": "0,1,2,3", "CounterMask": "1", @@ -32,7 +32,7 @@ "EventCode": "0x48", "EventName": "L1D_PEND_MISS.FB_FULL_PERIODS", "PEBScounters": "0,1,2,3", - "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailablability. 
Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", + "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "Speculative": "1", "UMask": "0x2" diff --git a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json index e905458b34b8d..b52afc34a1694 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json @@ -1,22 +1,742 @@ [ + { + "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", + "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. 
Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "(5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / SLOTS", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", + "MetricExpr": "ICACHE_16B.IFDATA_STALL / CLKS", + "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "ICACHE_64B.IFTAG_STALL / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. 
Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / CLKS + tma_unknown_branches", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. 
Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", + "MetricExpr": "(1 - (BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT))) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS", + "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_clears_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", + "MetricExpr": "10 * BACLEARS.ANY / CLKS", + "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_unknown_branches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). Sample with: BACLEARS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). 
Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "ILD_STALL.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "3 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. 
Sample with: IDQ.MS_SWITCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / CORE_CLKS / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. 
Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / CORE_CLKS", + "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_mite_group", + "MetricName": "tma_decoder0_alone", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where (only) 4 uops were delivered by the MITE pipeline", + "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / CLKS", + "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_mite_group", + "MetricName": "tma_mite_4wide", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", + "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / CORE_CLKS / 2", + "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_dsb", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", + "MetricExpr": "max(1 - (tma_frontend_bound + tma_backend_bound + tma_retiring), 0)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. 
For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "max(0, tma_bad_speculation - tma_branch_mispredicts)", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. 
Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + (5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "((CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * tma_backend_bound", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. 
This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. 
TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the (first level) DTLB was missed by load accesses, that later on hit in second-level TLB (STLB)", + "MetricExpr": "tma_dtlb_load - tma_load_stlb_miss", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_hit", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / CLKS", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_miss", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). 
However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / CLKS", + "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. 
Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_4k_aliasing", + "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "L1D_PEND_MISS.FB_FULL / CLKS", + "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). 
Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "((MEM_LOAD_RETIRED.L2_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / ((MEM_LOAD_RETIRED.L2_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + L1D_PEND_MISS.FB_FULL_PERIODS)) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. 
Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", + "MetricExpr": "((44 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + (43.5 * Average_Frequency) * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_contested_accesses", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "(43.5 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD)))) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. 
Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "(19 * Average_Frequency) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "L1D_PEND_MISS.L2_STALL / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). 
The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "((CYCLE_ACTIVITY.STALLS_L3_MISS / CLKS + ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS) - tma_l2_bound) - tma_pmm_bound)", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. 
This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", + "MetricExpr": "(43.5 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Server;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_local_dram", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. 
Sample with: MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", + "MetricExpr": "(108 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Server;Snoop;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_remote_dram", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", + "MetricExpr": "((97 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + (97 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_remote_cache", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues. This is caused often due to non-optimal NUMA allocations. 
#link to NUMA article Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM_PS;MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a", + "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + 10 * ((MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))) / ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + 10 * ((MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))))) * (CYCLE_ACTIVITY.STALLS_L3_MISS / CLKS + ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS) - tma_l2_bound)) if (1000000 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS) else 0)", + "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_pmm_bound", + "PublicDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a. IXP) memory by loads, PMM stands for Persistent Memory Module. 
", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_INST_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "((L2_RQSTS.RFO_HIT * 10 * (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES))) + (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_store_latency", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. 
contention on L1D fill-buffer entries - see FB_Full)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "(48 * Average_Frequency) * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / CORE_CLKS", + "MetricGroup": "TopdownL4;tma_store_bound_group", + "MetricName": "tma_split_stores", + "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores", + "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_streaming_stores", + "PublicDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should Streaming stores be a bottleneck. 
Sample with: OCR.STREAMING_WR.ANY_RESPONSE", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / CORE_CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the TLB was missed by store accesses, hitting in the second-level TLB (STLB)", + "MetricExpr": "tma_dtlb_store - tma_store_stlb_miss", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_hit", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / CORE_CLKS", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_miss", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck", + "MetricExpr": "max(0, tma_backend_bound - tma_memory_bound)", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": "This metric represents fraction of slots where Core non-memory 
issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "ARITH.DIVIDER_ACTIVE / CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "(cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CLKS if (ARITH.DIVIDER_ACTIVE < (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY)) else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). 
(2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / CLKS + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", + "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / CLKS", + "MetricGroup": "TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_serializing_operation", + "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", + "MetricExpr": "37 * MISC_RETIRED.PAUSE_INST / CLKS", + "MetricGroup": "TopdownL6;tma_serializing_operation_group", + "MetricName": "tma_slow_pause", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. 
Sample with: MISC_RETIRED.PAUSE_INST", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "MetricExpr": "CLKS * UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY", + "MetricGroup": "TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_mixing_vectors", + "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful. 
Sample with: EXE_ACTIVITY.1_PORTS_UTIL", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Sample with: EXE_ACTIVITY.2_PORTS_UTIL", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). 
Sample with: UOPS_EXECUTED.CYCLES_GE_3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED.PORT_0", + "MetricExpr": "UOPS_DISPATCHED.PORT_0 / CORE_CLKS", + "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_0", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED.PORT_1", + "MetricExpr": "UOPS_DISPATCHED.PORT_1 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_1", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU) Sample with: UOPS_DISPATCHED.PORT_5", + "MetricExpr": "UOPS_DISPATCHED.PORT_5 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_5", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED.PORT_6", + "MetricExpr": "UOPS_DISPATCHED.PORT_6 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_6", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 
for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3", + "MetricExpr": "UOPS_DISPATCHED.PORT_2_3 / (2 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations Sample with: UOPS_DISPATCHED.PORT_7_8", + "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessarily mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. 
Sample with: UOPS_RETIRED.SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "max(0, tma_retiring - tma_heavy_operations)", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", + "MetricGroup": "HPC;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fp_arith", + "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "tma_retiring * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. 
It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_scalar", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_vector", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_128b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_256b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_512b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", + "MetricExpr": "tma_light_operations * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_memory_operations", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.", + "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * SLOTS)", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_branch_instructions", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", + "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * SLOTS)", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_nop_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. 
May undercount due to FMA double counting", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions + tma_nop_instructions))", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_other_light_ops", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "tma_microcode_sequencer + tma_retiring * (UOPS_DECODED.DEC0 - cpu@UOPS_DECODED.DEC0\\,cmask\\=1@) / IDQ.MITE_UOPS", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops", + "MetricExpr": "tma_heavy_operations - tma_microcode_sequencer", + "MetricGroup": "TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_few_uops_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops. 
This highly-correlates with the number of uops in such instructions.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "((tma_retiring * SLOTS) / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "100 * ASSISTS.ANY / SLOTS", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. 
Sample with: ASSISTS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_cisc", + "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "Mispredictions" + }, + { + "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", + "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) ", + "MetricGroup": 
"Mem;MemoryBW;Offcore", + "MetricName": "Memory_Bandwidth" + }, + { + "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + (tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)))", + "MetricGroup": "Mem;MemoryLat;Offcore", + "MetricName": "Memory_Latency" + }, + { + "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricExpr": "100 * tma_memory_bound * ((tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores))) ", + "MetricGroup": "Mem;MemoryTLB;Offcore", + "MetricName": "Memory_Data_TLBs" + }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * (( BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) ) / TOPDOWN.SLOTS)", + "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + 
(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / SLOTS)", "MetricGroup": "Ret", "MetricName": "Branching_Overhead" }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", - "MetricExpr": "100 * (( 5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING ) / TOPDOWN.SLOTS) * ( (ICACHE_64B.IFTAG_STALL / CPU_CLK_UNHALTED.THREAD) + (ICACHE_16B.IFDATA_STALL / CPU_CLK_UNHALTED.THREAD) + (10 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) ) / #(( 5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING ) / TOPDOWN.SLOTS)", + "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "Big_Code" }, + { + "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - Big_Code", + "MetricGroup": "Fed;FetchBW;Frontend", + "MetricName": "Instruction_Fetch_BW" + }, { "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC" }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "(tma_retiring * SLOTS) / INST_RETIRED.ANY", + "MetricGroup": "Pipeline;Ret;Retire", + "MetricName": "UPI" + }, + { + "BriefDescription": "Instruction per taken branch", + "MetricExpr": "(tma_retiring * SLOTS) / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;FetchBW", + "MetricName": "UpTB" + 
}, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", + "MetricName": "CPI" + }, { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "CPU_CLK_UNHALTED.THREAD", @@ -26,13 +746,13 @@ { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", "MetricExpr": "TOPDOWN.SLOTS", - "MetricGroup": "TmaL1", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS" }, { "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", - "MetricExpr": "TOPDOWN.SLOTS / ( TOPDOWN.SLOTS / 2 ) if #SMT_on else 1", - "MetricGroup": "SMT;TmaL1", + "MetricExpr": "SLOTS / (TOPDOWN.SLOTS / 2) if #SMT_on else 1", + "MetricGroup": "SMT;tma_L1_group", "MetricName": "Slots_Utilization" }, { @@ -44,29 +764,35 @@ }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.DISTRIBUTED", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC" }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / CPU_CLK_UNHALTED.DISTRIBUTED", - "MetricGroup": "Ret;Flops", + "MetricExpr": "(1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * 
(FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / CORE_CLKS", + "MetricGroup": "Flops;Ret", "MetricName": "FLOPc" }, { "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) ) / ( 2 * CPU_CLK_UNHALTED.DISTRIBUTED )", + "MetricExpr": "((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)) / (2 * CORE_CLKS)", "MetricGroup": "Cor;Flops;HPC", "MetricName": "FP_Arith_Utilization", "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." 
}, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 ) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ((UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricExpr": "(1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if SMT_2T_Utilization > 0.5 else 0", + "MetricGroup": "Cor;SMT", + "MetricName": "Core_Bound_Likely" + }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", @@ -111,13 +837,13 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "IpFLOP" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower 
number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) )", + "MetricExpr": "INST_RETIRED.ANY / ((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE))", "MetricGroup": "Flops;InsType", "MetricName": "IpArith", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." @@ -138,21 +864,21 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX128", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
}, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX256", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX512", "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
@@ -164,11 +890,17 @@ "MetricName": "IpSWPF" }, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions" }, + { + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricExpr": "(tma_retiring * SLOTS) / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "Retire" + }, { "BriefDescription": "", "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", @@ -193,6 +925,12 @@ "MetricGroup": "DSBmiss", "MetricName": "DSB_Switch_Cost" }, + { + "BriefDescription": "Total penalty related to DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", + "MetricGroup": "DSBmiss;Fed", + "MetricName": "DSB_Misses" + }, { "BriefDescription": "Number of Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", @@ -205,6 +943,12 @@ "MetricGroup": "Bad;BadSpec;BrMispredicts", "MetricName": "IpMispredict" }, + { + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricExpr": " (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "Branch_Misprediction_Cost" + }, 
{ "BriefDescription": "Fraction of branches that are non-taken conditionals", "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", @@ -219,7 +963,7 @@ }, { "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "( BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN ) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", "MetricName": "CallRet" }, @@ -231,74 +975,74 @@ }, { "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", - "MetricExpr": "1 - ( (BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES) + (BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES) + (( BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN ) / BR_INST_RETIRED.ALL_BRANCHES) + ((BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES) )", + "MetricExpr": "1 - (Cond_NT + Cond_TK + CallRet + Jump)", "MetricGroup": "Bad;Branches", "MetricName": "Other_Branches" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", "MetricGroup": "Mem;MemoryBound;MemoryLat", "MetricName": "Load_Miss_Real_Latency" }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. 
Per-Logical Processor)", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBound;MemoryBW", + "MetricGroup": "Mem;MemoryBW;MemoryBound", "MetricName": "MLP" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI_Load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;Backend;CacheMisses", + "MetricGroup": "Backend;CacheMisses;Mem", "MetricName": "L2MPKI" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", - "MetricExpr": "1000 * ( ( OFFCORE_REQUESTS.ALL_DATA_RD - OFFCORE_REQUESTS.DEMAND_DATA_RD ) + L2_RQSTS.ALL_DEMAND_MISS + L2_RQSTS.SWPF_MISS ) / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses;Offcore", + "MetricExpr": "1000 * ((OFFCORE_REQUESTS.ALL_DATA_RD - OFFCORE_REQUESTS.DEMAND_DATA_RD) + L2_RQSTS.ALL_DEMAND_MISS + L2_RQSTS.SWPF_MISS) / Instructions", + "MetricGroup": "CacheMisses;Mem;Offcore", "MetricName": "L2MPKI_All" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2MPKI_Load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", 
"MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_Load" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L3MPKI" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1000 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "FB_HPKI" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING ) / ( 2 * CPU_CLK_UNHALTED.DISTRIBUTED )", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * CORE_CLKS)", "MetricGroup": "Mem;MemoryTLB", "MetricName": "Page_Walks_Utilization" }, @@ -328,37 +1072,37 @@ }, { "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", - "MetricExpr": "1000 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY", + "MetricExpr": "1000 * L2_LINES_OUT.SILENT / Instructions", "MetricGroup": "L2Evicts;Mem;Server", "MetricName": "L2_Evictions_Silent_PKI" }, { "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", - "MetricExpr": "1000 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY", + "MetricExpr": "1000 * L2_LINES_OUT.NON_SILENT / Instructions", "MetricGroup": "L2Evicts;Mem;Server", "MetricName": "L2_Evictions_NonSilent_PKI" }, { "BriefDescription": 
"Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricExpr": "L1D_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L1D_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricExpr": "L2_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L2_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L3_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Access_BW", "MetricGroup": "Mem;MemoryBW;Offcore", "MetricName": "L3_Cache_Access_BW_1T" }, @@ -370,40 +1114,40 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / 1000000000 ) / duration_time", + "MetricExpr": 
"((1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1000000000) / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "GFLOPs", "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization" }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0", - "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / CPU_CLK_UNHALTED.DISTRIBUTED", + "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / CORE_CLKS", "MetricGroup": "Power", "MetricName": "Power_License0_Utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes." }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1", - "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / CPU_CLK_UNHALTED.DISTRIBUTED", + "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / CORE_CLKS", "MetricGroup": "Power", "MetricName": "Power_License1_Utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions." 
}, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)", - "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / CPU_CLK_UNHALTED.DISTRIBUTED", + "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / CORE_CLKS", "MetricGroup": "Power", "MetricName": "Power_License2_Utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). This includes high current AVX 512-bit instructions." @@ -428,13 +1172,13 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "( 64 * ( uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@ ) / 1000000000 ) / duration_time", + "MetricExpr": "(64 * (uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@) / 1000000000) / duration_time", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use" }, { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches", - "MetricExpr": "1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD ) / ( cha_0@event\\=0x0@ / duration_time )", + "MetricExpr": "1000000000 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (Socket_CLKS / duration_time)", "MetricGroup": "Mem;MemoryLat;SoC", "MetricName": "MEM_Read_Latency" }, @@ -446,38 +1190,38 @@ }, { "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. 
Accounts for demand loads and L1/L2 data-read prefetches", - "MetricExpr": "( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM ) / cha_0@event\\=0x0@ )", - "MetricGroup": "Mem;MemoryLat;SoC;Server", + "MetricExpr": "(1000000000 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / cha_0@event\\=0x0@)", + "MetricGroup": "Mem;MemoryLat;Server;SoC", "MetricName": "MEM_PMM_Read_Latency" }, { "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches", - "MetricExpr": " 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR ) / cha_0@event\\=0x0@", - "MetricGroup": "Mem;MemoryLat;SoC;Server", + "MetricExpr": " 1000000000 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR) / cha_0@event\\=0x0@", + "MetricGroup": "Mem;MemoryLat;Server;SoC", "MetricName": "MEM_DRAM_Read_Latency" }, { "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", - "MetricExpr": "( ( 64 * imc@event\\=0xe3@ / 1000000000 ) / duration_time )", - "MetricGroup": "Mem;MemoryBW;SoC;Server", + "MetricExpr": "((64 * imc@event\\=0xe3@ / 1000000000) / duration_time)", + "MetricGroup": "Mem;MemoryBW;Server;SoC", "MetricName": "PMM_Read_BW" }, { "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", - "MetricExpr": "( ( 64 * imc@event\\=0xe7@ / 1000000000 ) / duration_time )", - "MetricGroup": "Mem;MemoryBW;SoC;Server", + "MetricExpr": "((64 * imc@event\\=0xe7@ / 1000000000) / duration_time)", + "MetricGroup": "Mem;MemoryBW;Server;SoC", "MetricName": "PMM_Write_BW" }, { "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1000000000 / duration_time", - "MetricGroup": "IoBW;Mem;SoC;Server", + "MetricGroup": "IoBW;Mem;Server;SoC", "MetricName": 
"IO_Write_BW" }, { "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]", - "MetricExpr": "( UNC_CHA_TOR_INSERTS.IO_HIT_ITOM + UNC_CHA_TOR_INSERTS.IO_MISS_ITOM + UNC_CHA_TOR_INSERTS.IO_HIT_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR ) * 64 / 1000000000 / duration_time", - "MetricGroup": "IoBW;Mem;SoC;Server", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_HIT_ITOM + UNC_CHA_TOR_INSERTS.IO_MISS_ITOM + UNC_CHA_TOR_INSERTS.IO_HIT_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR) * 64 / 1000000000 / duration_time", + "MetricGroup": "IoBW;Mem;Server;SoC", "MetricName": "IO_Read_BW" }, { @@ -486,12 +1230,6 @@ "MetricGroup": "SoC", "MetricName": "Socket_CLKS" }, - { - "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "cha_0@event\\=0x0@ / #num_dies / duration_time / 1000000000", - "MetricGroup": "SoC", - "MetricName": "UNCORE_FREQ" - }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", @@ -523,11 +1261,10 @@ "MetricName": "C6_Pkg_Residency" }, { - "BriefDescription": "Percentage of time spent in the active CPU power state C0", - "MetricExpr": "100 * CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "", - "MetricName": "cpu_utilization_percent", - "ScaleUnit": "1%" + "BriefDescription": "Uncore frequency per die [GHZ]", + "MetricExpr": "Socket_CLKS / #num_dies / duration_time / 1000000000", + "MetricGroup": "SoC", + "MetricName": "UNCORE_FREQ" }, { "BriefDescription": "CPU operating frequency (in GHz)", @@ -536,13 +1273,6 @@ "MetricName": "cpu_operating_frequency", "ScaleUnit": "1GHz" }, - { - "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY", - 
"MetricGroup": "", - "MetricName": "cpi", - "ScaleUnit": "1per_instr" - }, { "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions", "MetricExpr": "MEM_INST_RETIRED.ALL_LOADS / INST_RETIRED.ANY", @@ -561,7 +1291,7 @@ "BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions", "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "l1d_mpi_includes_data_plus_rfo_with_prefetches", + "MetricName": "l1d_mpi", "ScaleUnit": "1per_instr" }, { @@ -589,7 +1319,7 @@ "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions", "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "l2_mpi_includes_code_plus_data_plus_rfo_with_prefetches", + "MetricName": "l2_mpi", "ScaleUnit": "1per_instr" }, { @@ -615,42 +1345,42 @@ }, { "BriefDescription": "Ratio of number of code read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", - "MetricExpr": "( UNC_CHA_TOR_INSERTS.IA_MISS_CRD ) / INST_RETIRED.ANY", + "MetricExpr": "( UNC_CHA_TOR_INSERTS.IA_MISS_CRD + UNC_CHA_TOR_INSERTS.IA_MISS_CRD_PREF ) / INST_RETIRED.ANY", "MetricGroup": "", "MetricName": "llc_code_read_mpi_demand_plus_prefetch", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) in nano seconds", - "MetricExpr": "( ( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD) * #num_packages ) ) ) * duration_time )", + "MetricExpr": "( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD ) / ( UNC_CHA_CLOCKTICKS / ( 
source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD) * #num_packages ) ) ) * duration_time", "MetricGroup": "", "MetricName": "llc_demand_data_read_miss_latency", "ScaleUnit": "1ns" }, { "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to local memory in nano seconds", - "MetricExpr": "( ( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL) * #num_packages ) ) ) * duration_time )", + "MetricExpr": "( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL) * #num_packages ) ) ) * duration_time", "MetricGroup": "", "MetricName": "llc_demand_data_read_miss_latency_for_local_requests", "ScaleUnit": "1ns" }, { "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to remote memory in nano seconds", - "MetricExpr": "( ( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE) * #num_packages ) ) ) * duration_time )", + "MetricExpr": "( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE) * #num_packages ) ) ) * duration_time", "MetricGroup": "", "MetricName": "llc_demand_data_read_miss_latency_for_remote_requests", "ScaleUnit": "1ns" }, { "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to Intel(R) Optane(TM) Persistent Memory(PMEM) in nano seconds", - "MetricExpr": "( ( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM ) / ( UNC_CHA_CLOCKTICKS / ( 
source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM) * #num_packages ) ) ) * duration_time )", + "MetricExpr": "( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM) * #num_packages ) ) ) * duration_time", "MetricGroup": "", "MetricName": "llc_demand_data_read_miss_to_pmem_latency", "ScaleUnit": "1ns" }, { "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to DRAM in nano seconds", - "MetricExpr": "( ( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR) * #num_packages ) ) ) * duration_time )", + "MetricExpr": "( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR) * #num_packages ) ) ) * duration_time", "MetricGroup": "", "MetricName": "llc_demand_data_read_miss_to_dram_latency", "ScaleUnit": "1ns" @@ -694,14 +1424,14 @@ "BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", "MetricExpr": "100 * ( UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL ) / ( UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE )", "MetricGroup": "", - "MetricName": "numa_percent_reads_addressed_to_local_dram", + "MetricName": "numa_reads_addressed_to_local_dram", "ScaleUnit": "1%" }, { "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", "MetricExpr": "100 * ( 
UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE ) / ( UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE )", "MetricGroup": "", - "MetricName": "numa_percent_reads_addressed_to_remote_dram", + "MetricName": "numa_reads_addressed_to_remote_dram", "ScaleUnit": "1%" }, { @@ -715,7 +1445,7 @@ "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)", "MetricExpr": "( UNC_UPI_TxL_FLITS.ALL_DATA * (64 / 9.0) / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "upi_data_transmit_bw_only_data", + "MetricName": "upi_data_transmit_bw", "ScaleUnit": "1MB/s" }, { @@ -764,35 +1494,35 @@ "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.", "MetricExpr": "(( UNC_CHA_TOR_INSERTS.IO_HIT_PCIRDCUR + UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR ) * 64 / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "io_bandwidth_read", + "MetricName": "io_bandwidth_disk_or_network_writes", "ScaleUnit": "1MB/s" }, { "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.", "MetricExpr": "(( UNC_CHA_TOR_INSERTS.IO_HIT_ITOM + UNC_CHA_TOR_INSERTS.IO_MISS_ITOM + UNC_CHA_TOR_INSERTS.IO_HIT_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR ) * 64 / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "io_bandwidth_write", + "MetricName": "io_bandwidth_disk_or_network_reads", "ScaleUnit": "1MB/s" }, { "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue", "MetricExpr": "100 * ( IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS ) )", "MetricGroup": "", - "MetricName": 
"percent_uops_delivered_from_decoded_icache_dsb", + "MetricName": "percent_uops_delivered_from_decoded_icache", "ScaleUnit": "1%" }, { "BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue", "MetricExpr": "100 * ( IDQ.MITE_UOPS / ( IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS ) )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline_mite", + "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline", "ScaleUnit": "1%" }, { "BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue", "MetricExpr": "100 * ( IDQ.MS_UOPS / ( IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS ) )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_from_microcode_sequencer_ms", + "MetricName": "percent_uops_delivered_from_microcode_sequencer", "ScaleUnit": "1%" }, { @@ -824,241 +1554,10 @@ "ScaleUnit": "1MB/s" }, { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. 
For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", - "MetricExpr": "100 * ( topdown\\-fe\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) - INT_MISC.UOP_DROPPING / ( slots ) )", - "MetricGroup": "TmaL1;PGO", - "MetricName": "tma_frontend_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period.", - "MetricExpr": "100 * ( ( ( 5 ) * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING ) / ( slots ) )", - "MetricGroup": "Frontend;TmaL2;m_tma_frontend_bound_percent", - "MetricName": "tma_fetch_latency_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", - "MetricExpr": "100 * ( ICACHE_16B.IFDATA_STALL / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_icache_misses_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses.", - "MetricExpr": "100 * ( ICACHE_64B.IFTAG_STALL / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_itlb_misses_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. 
For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings.", - "MetricExpr": "100 * ( INT_MISC.CLEAR_RESTEER_CYCLES / ( CPU_CLK_UNHALTED.THREAD ) + ( ( 10 ) * BACLEARS.ANY / ( CPU_CLK_UNHALTED.THREAD ) ) )", - "MetricGroup": "FetchLat;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_branch_resteers_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.", - "MetricExpr": "100 * ( DSB2MITE_SWITCHES.PENALTY_CYCLES / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "DSBmiss;FetchLat;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_dsb_switches_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", - "MetricExpr": "100 * ( ILD_STALL.LCP / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "FetchLat;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_lcp_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. 
Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals.", - "MetricExpr": "100 * ( ( 3 ) * IDQ.MS_SWITCHES / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "FetchLat;MicroSeq;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_ms_switches_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.", - "MetricExpr": "100 * ( max( 0 , ( topdown\\-fe\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) - INT_MISC.UOP_DROPPING / ( slots ) ) - ( ( ( 5 ) * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING ) / ( slots ) ) ) )", - "MetricGroup": "FetchBW;Frontend;TmaL2;m_tma_frontend_bound_percent", - "MetricName": "tma_fetch_bandwidth_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. 
For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", - "MetricExpr": "100 * ( ( IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK ) / ( CPU_CLK_UNHALTED.DISTRIBUTED ) / 2 )", - "MetricGroup": "DSBmiss;FetchBW;TmaL3;m_tma_fetch_bandwidth_percent", - "MetricName": "tma_mite_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", - "MetricExpr": "100 * ( ( IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK ) / ( CPU_CLK_UNHALTED.DISTRIBUTED ) / 2 )", - "MetricGroup": "DSB;FetchBW;TmaL3;m_tma_fetch_bandwidth_percent", - "MetricName": "tma_dsb_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. 
Incorrect data speculation followed by Memory Ordering Nukes is another example.", - "MetricExpr": "100 * ( max( 1 - ( ( topdown\\-fe\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) - INT_MISC.UOP_DROPPING / ( slots ) ) + ( topdown\\-be\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) + ( ( 5 ) * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=0x1\\,edge\\=0x1@ ) / ( slots ) ) + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) , 0 ) )", - "MetricGroup": "TmaL1", - "MetricName": "tma_bad_speculation_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path.", - "MetricExpr": "100 * ( ( BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT ) ) * ( max( 1 - ( ( topdown\\-fe\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) - INT_MISC.UOP_DROPPING / ( slots ) ) + ( topdown\\-be\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) + ( ( 5 ) * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=0x1\\,edge\\=0x1@ ) / ( slots ) ) + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) , 0 ) ) )", - "MetricGroup": "BadSpec;BrMispredicts;TmaL2;m_tma_bad_speculation_percent", - "MetricName": "tma_branch_mispredicts_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. 
These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.", - "MetricExpr": "100 * ( max( 0 , ( max( 1 - ( ( topdown\\-fe\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) - INT_MISC.UOP_DROPPING / ( slots ) ) + ( topdown\\-be\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) + ( ( 5 ) * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=0x1\\,edge\\=0x1@ ) / ( slots ) ) + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) , 0 ) ) - ( ( BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT ) ) * ( max( 1 - ( ( topdown\\-fe\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) - INT_MISC.UOP_DROPPING / ( slots ) ) + ( topdown\\-be\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) + ( ( 5 ) * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=0x1\\,edge\\=0x1@ ) / ( slots ) ) + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) , 0 ) ) ) ) )", - "MetricGroup": "BadSpec;MachineClears;TmaL2;m_tma_bad_speculation_percent", - "MetricName": "tma_machine_clears_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. 
For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", - "MetricExpr": "100 * ( topdown\\-be\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) + ( ( 5 ) * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=0x1\\,edge\\=0x1@ ) / ( slots ) )", - "MetricGroup": "TmaL1", - "MetricName": "tma_backend_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", - "MetricExpr": "100 * ( ( ( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / ( CYCLE_ACTIVITY.STALLS_TOTAL + ( EXE_ACTIVITY.1_PORTS_UTIL + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * EXE_ACTIVITY.2_PORTS_UTIL ) + EXE_ACTIVITY.BOUND_ON_STORES ) ) * ( topdown\\-be\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) + ( ( 5 ) * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=0x1\\,edge\\=0x1@ ) / ( slots ) ) )", - "MetricGroup": "Backend;TmaL2;m_tma_backend_bound_percent", - "MetricName": "tma_memory_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. 
However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache.", - "MetricExpr": "100 * ( max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) , 0 ) )", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_l1_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance.", - "MetricExpr": "100 * ( ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) / ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + L1D_PEND_MISS.FB_FULL_PERIODS ) ) * ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) )", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_l2_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. 
L2 misses/L3 hits) can improve the latency and increase performance.", - "MetricExpr": "100 * ( ( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_l3_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance.", - "MetricExpr": "100 * ( min( ( ( ( CYCLE_ACTIVITY.STALLS_L3_MISS / ( CPU_CLK_UNHALTED.THREAD ) + ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) - ( ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) / ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + L1D_PEND_MISS.FB_FULL_PERIODS ) ) * ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( min( ( ( ( ( 1 - ( ( ( 19 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + 10 * ( ( MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) / ( ( 19 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + 10 * ( ( MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT 
/ ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) + ( 25 * ( ( MEM_LOAD_RETIRED.LOCAL_PMM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) + 33 * ( ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) ) ) ) ) * ( CYCLE_ACTIVITY.STALLS_L3_MISS / ( CPU_CLK_UNHALTED.THREAD ) + ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) - ( ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) / ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + L1D_PEND_MISS.FB_FULL_PERIODS ) ) * ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) if ( ( 1000000 ) * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM ) > MEM_LOAD_RETIRED.L1_MISS ) else 0 ) ) , ( 1 ) ) ) ) ) , ( 1 ) ) )", - "MetricGroup": "MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_dram_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a. IXP) memory by loads, PMM stands for Persistent Memory Module. 
", - "MetricExpr": "100 * ( min( ( ( ( ( 1 - ( ( ( 19 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + 10 * ( ( MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) / ( ( 19 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + 10 * ( ( MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) + ( 25 * ( ( MEM_LOAD_RETIRED.LOCAL_PMM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) + 33 * ( ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) ) ) ) ) * ( CYCLE_ACTIVITY.STALLS_L3_MISS / ( CPU_CLK_UNHALTED.THREAD ) + ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) - ( ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) / ( ( MEM_LOAD_RETIRED.L2_HIT * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + L1D_PEND_MISS.FB_FULL_PERIODS ) ) * ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) if ( ( 1000000 ) * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM ) > MEM_LOAD_RETIRED.L1_MISS ) else 0 ) ) , ( 1 ) ) )", - "MetricGroup": "MemoryBound;Server;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_pmm_bound_percent", 
- "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck.", - "MetricExpr": "100 * ( EXE_ACTIVITY.BOUND_ON_STORES / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_store_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. 
FP-chained long-latency arithmetic operations).", - "MetricExpr": "100 * ( max( 0 , ( topdown\\-be\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) + ( ( 5 ) * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=0x1\\,edge\\=0x1@ ) / ( slots ) ) - ( ( ( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / ( CYCLE_ACTIVITY.STALLS_TOTAL + ( EXE_ACTIVITY.1_PORTS_UTIL + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * EXE_ACTIVITY.2_PORTS_UTIL ) + EXE_ACTIVITY.BOUND_ON_STORES ) ) * ( topdown\\-be\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) + ( ( 5 ) * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=0x1\\,edge\\=0x1@ ) / ( slots ) ) ) ) )", - "MetricGroup": "Backend;TmaL2;Compute;m_tma_backend_bound_percent", - "MetricName": "tma_core_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication.", - "MetricExpr": "100 * ( ARITH.DIVIDER_ACTIVE / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "TmaL3;m_tma_core_bound_percent", - "MetricName": "tma_divider_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. 
They often indicate suboptimal performance and can often be optimized or avoided. ", - "MetricExpr": "( 100 * ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) + ( 0 * slots )", - "MetricGroup": "TmaL1", - "MetricName": "tma_retiring_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved.", - "MetricExpr": "100 * ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( ( ( ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( slots ) ) + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( UOPS_DECODED.DEC0 - cpu@UOPS_DECODED.DEC0\\,cmask\\=0x1@ ) / IDQ.MITE_UOPS ) ) )", - "MetricGroup": "Retire;TmaL2;m_tma_retiring_percent", - "MetricName": "tma_light_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). 
Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", - "MetricExpr": "100 * ( ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD ) + ( ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) + ( min( ( ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) , ( 1 ) ) ) )", - "MetricGroup": "HPC;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_fp_arith_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", - "MetricExpr": "100 * ( ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( ( ( ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( slots ) ) + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( UOPS_DECODED.DEC0 - cpu@UOPS_DECODED.DEC0\\,cmask\\=0x1@ ) / IDQ.MITE_UOPS ) ) ) * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_memory_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric 
represents fraction of slots where the CPU was retiring branch instructions.", - "MetricExpr": "100 * ( ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( ( ( ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( slots ) ) + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( UOPS_DECODED.DEC0 - cpu@UOPS_DECODED.DEC0\\,cmask\\=0x1@ ) / IDQ.MITE_UOPS ) ) ) * BR_INST_RETIRED.ALL_BRANCHES / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_branch_instructions_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. 
start address of a function or loop body.", - "MetricExpr": "100 * ( ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( ( ( ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( slots ) ) + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( UOPS_DECODED.DEC0 - cpu@UOPS_DECODED.DEC0\\,cmask\\=0x1@ ) / IDQ.MITE_UOPS ) ) ) * INST_RETIRED.NOP / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_nop_instructions_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. 
May undercount due to FMA double counting", - "MetricExpr": "100 * ( max( 0 , ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( ( ( ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( slots ) ) + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( UOPS_DECODED.DEC0 - cpu@UOPS_DECODED.DEC0\\,cmask\\=0x1@ ) / IDQ.MITE_UOPS ) ) ) - ( ( ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD ) + ( ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) + ( min( ( ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) , ( 1 ) ) ) ) + ( ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( ( ( ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( slots ) ) + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( UOPS_DECODED.DEC0 - cpu@UOPS_DECODED.DEC0\\,cmask\\=0x1@ ) / IDQ.MITE_UOPS ) ) ) * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY ) + ( ( max( 0 , ( 
topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( ( ( ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( slots ) ) + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( UOPS_DECODED.DEC0 - cpu@UOPS_DECODED.DEC0\\,cmask\\=0x1@ ) / IDQ.MITE_UOPS ) ) ) * BR_INST_RETIRED.ALL_BRANCHES / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) + ( ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( ( ( ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( slots ) ) + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( UOPS_DECODED.DEC0 - cpu@UOPS_DECODED.DEC0\\,cmask\\=0x1@ ) / IDQ.MITE_UOPS ) ) ) * INST_RETIRED.NOP / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) ) ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_other_light_ops_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. 
This highly-correlates with the uop length of these instructions/sequences.", - "MetricExpr": "100 * ( ( ( ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( slots ) ) + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( UOPS_DECODED.DEC0 - cpu@UOPS_DECODED.DEC0\\,cmask\\=0x1@ ) / IDQ.MITE_UOPS )", - "MetricGroup": "Retire;TmaL2;m_tma_retiring_percent", - "MetricName": "tma_heavy_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that that are decoder into two or up to ([SNB+] four; [ADL+] five) uops. This highly-correlates with the number of uops in such instructions.", - "MetricExpr": "100 * ( ( ( ( ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( slots ) ) + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( UOPS_DECODED.DEC0 - cpu@UOPS_DECODED.DEC0\\,cmask\\=0x1@ ) / IDQ.MITE_UOPS ) - ( ( ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( slots ) ) )", - "MetricGroup": "TmaL3;m_tma_heavy_operations_percent", - "MetricName": "tma_few_uops_instructions_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). 
These cases can often be avoided.", - "MetricExpr": "100 * ( ( ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( slots ) )", - "MetricGroup": "MicroSeq;TmaL3;m_tma_heavy_operations_percent", - "MetricName": "tma_microcode_sequencer_percent", + "BriefDescription": "%", + "MetricExpr": "100 * ( ( LSD.CYCLES_ACTIVE - LSD.CYCLES_OK ) / ( CPU_CLK_UNHALTED.DISTRIBUTED ) / 2 )", + "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", + "MetricName": "tma_lsd", "ScaleUnit": "1%" } ] diff --git a/tools/perf/pmu-events/arch/x86/icelakex/pipeline.json b/tools/perf/pmu-events/arch/x86/icelakex/pipeline.json index 396868f700040..52fba238bf1fd 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/pipeline.json @@ -167,7 +167,7 @@ "UMask": "0x10" }, { - "BriefDescription": "number of branch instructions retired that were mispredicted and taken. 
Non PEBS", + "BriefDescription": "number of branch instructions retired that were mispredicted and taken.", "CollectPEBSRecord": "2", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc5", diff --git a/tools/perf/pmu-events/arch/x86/icelakex/uncore-other.json b/tools/perf/pmu-events/arch/x86/icelakex/uncore-other.json index 7783aa2ef5d18..03e99b8aed93e 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/uncore-other.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/uncore-other.json @@ -11779,7 +11779,7 @@ "Unit": "M3UPI" }, { - "BriefDescription": "Flit Gen - Header 1 : Acumullate", + "BriefDescription": "Flit Gen - Header 1 : Accumulate", "Counter": "0,1,2,3", "CounterType": "PGMABLE", "EventCode": "0x51", diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 1f5dbe176b3a8..84535179d1287 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -11,7 +11,7 @@ GenuineIntel-6-7A,v1.01,goldmontplus,core GenuineIntel-6-(3C|45|46),v32,haswell,core GenuineIntel-6-3F,v26,haswellx,core GenuineIntel-6-(7D|7E|A7),v1.15,icelake,core -GenuineIntel-6-6[AC],v1.15,icelakex,core +GenuineIntel-6-6[AC],v1.16,icelakex,core GenuineIntel-6-3A,v22,ivybridge,core GenuineIntel-6-3E,v21,ivytown,core GenuineIntel-6-2D,v21,jaketown,core -- GitLab From 3bd2d21171b72754628957385c78f0307662b394 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:16:04 -0700 Subject: [PATCH 1424/2223] perf vendor events: Update Intel ivybridge Events remain at v22, and the metrics are based on TMA 4.4 full. Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. 
- _SMT suffix metrics are dropped as the #SMT_On and #EBS_Mode are correctly expanded in the single main metric. - Addition of all 6 levels of TMA metrics. Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-16-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../arch/x86/ivybridge/ivb-metrics.json | 594 +++++++++++++++--- 1 file changed, 503 insertions(+), 91 deletions(-) diff --git 
a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json index 3f48e75f8a86d..63db3397af0f9 100644 --- a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json +++ b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json @@ -1,64 +1,500 @@ [ { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Frontend_Bound", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound." + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). 
Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Frontend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / SLOTS", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", + "MetricExpr": "ICACHE.IFETCH_STALL / CLKS - tma_itlb_misses", + "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. 
Sample with: ITLB_MISSES.WALK_COMPLETED", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). 
Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "ILD_STALL.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "3 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. 
Sample with: IDQ.MS_SWITCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "tma_frontend_bound - tma_fetch_latency", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_dsb", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. 
For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Bad_Speculation", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example." + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. 
Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Bad_Speculation_SMT", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) )", - "MetricGroup": "TopdownL1", - "MetricName": "Backend_Bound", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound." 
+ "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if (IPC > 1.8) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB)) * tma_backend_bound", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. 
This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / CLKS, 0)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "(7 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. 
This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_UOPS_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "(MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / CLKS", + "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. 
Sample with: MEM_UOPS_RETIRED.LOCK_LOADS_PS", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) )", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Backend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "13 * LD_BLOCKS.NO_SR / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_UOPS_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_4k_aliasing", + "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "Load_Miss_Real_Latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CLKS", + "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). 
Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. 
Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", + "MetricExpr": "(60 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.LLC_MISS))) + 43 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.LLC_MISS)))) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_contested_accesses", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "43 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.LLC_MISS))) / CLKS", + "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.LLC_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.LLC_MISS))) / CLKS", + "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. 
L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "((OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2) if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / CORE_CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "(1 - (MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS))) * CYCLE_ACTIVITY.STALLS_L2_PENDING / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. 
Sample with: MEM_LOAD_UOPS_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). 
This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO stores issue a read-for-ownership request before the write", + "MetricExpr": "RESOURCE_STALLS.SB / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO stores issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_UOPS_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "((L2_RQSTS.RFO_HIT * 9 * (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES))) + (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_store_latency", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually have less impact on out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g.
contention on L1D fill-buffer entries - see FB_Full)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "60 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / CORE_CLKS", + "MetricGroup": "TopdownL4;tma_store_bound_group", + "MetricName": "tma_split_stores", + "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_UOPS_RETIRED.SPLIT_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "(7 * DTLB_STORE_MISSES.STLB_HIT + DTLB_STORE_MISSES.WALK_DURATION) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. 
Sample with: MEM_UOPS_RETIRED.STLB_MISS_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were a bottleneck", + "MetricExpr": "tma_backend_bound - tma_memory_bound", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "ARITH.FPU_DIV_ACTIVE / CORE_CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication.
Sample with: ARITH.DIVIDER_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if (IPC > 1.8) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB) - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@) / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else 0) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). 
Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).
Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5) / (3 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED_PORT.PORT_0", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / CORE_CLKS", + "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_0", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_1", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_1", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU) Sample with: UOPS_DISPATCHED.PORT_5", + "MetricExpr": 
"UOPS_DISPATCHED_PORT.PORT_5 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_5", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_2", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_3", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data) Sample with: UOPS_DISPATCHED_PORT.PORT_4", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_4", + 
"ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Retiring", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. " + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. 
Sample with: UOPS_RETIRED.RETIRE_SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "tma_retiring - tma_heavy_operations", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", + "MetricGroup": "HPC;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fp_arith", + "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS * FP_COMP_OPS_EXE.X87 / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. 
It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE) / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_scalar", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_vector", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "tma_microcode_sequencer", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. 
This highly-correlates with the uop length of these instructions/sequences.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / SLOTS", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. 
Sample with: OTHER_ASSISTS.ANY", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Retiring_SMT", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_cisc", + "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. 
Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.", + "ScaleUnit": "100%" }, { "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC" }, @@ -76,8 +512,8 @@ }, { "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / (INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "Pipeline;Mem", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", "MetricName": "CPI" }, { @@ -88,16 +524,10 @@ }, { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "TmaL1", + "MetricExpr": "4 * CORE_CLKS", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS" }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "TmaL1_SMT", - "MetricName": "SLOTS_SMT" - }, { "BriefDescription": "The ratio of Executed- by Issued-Uops", "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", @@ -107,37 +537,25 @@ }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC" }, - { - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;SMT;TmaL1_SMT", - 
"MetricName": "CoreIPC_SMT" - }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE ) / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;Flops", + "MetricExpr": "(1 * (FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / CORE_CLKS", + "MetricGroup": "Flops;Ret", "MetricName": "FLOPc" }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;Flops_SMT", - "MetricName": "FLOPc_SMT" - }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", + "MetricExpr": "((CPU_CLK_UNHALTED.THREAD / 2) * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / 
CPU_CLK_UNHALTED.REF_XCLK)) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2) if #SMT_on else CLKS", "MetricGroup": "SMT", "MetricName": "CORE_CLKS" }, @@ -179,15 +597,15 @@ }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricExpr": "1 / ( ((FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE) / UOPS_EXECUTED.THREAD) + ((FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) / UOPS_EXECUTED.THREAD) )", + "MetricExpr": "1 / (tma_fp_scalar + tma_fp_vector)", "MetricGroup": "Flops;InsType", "MetricName": "IpArith", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." }, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions" }, { @@ -204,7 +622,7 @@ }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", - "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )", + "MetricExpr": "IDQ.DSB_UOPS / ((IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS))", "MetricGroup": "DSB;Fed;FetchBW", "MetricName": "DSB_Coverage" }, @@ -216,47 +634,41 @@ }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb)", "MetricGroup": "Mem;MemoryBound;MemoryLat", "MetricName": 
"Load_Miss_Real_Latency" }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBound;MemoryBW", + "MetricGroup": "Mem;MemoryBW;MemoryBound", "MetricName": "MLP" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;Backend;CacheMisses", + "MetricGroup": "Backend;CacheMisses;Mem", "MetricName": "L2MPKI" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.LLC_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L3MPKI" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION) / CORE_CLKS", "MetricGroup": "Mem;MemoryTLB", "MetricName": "Page_Walks_Utilization" }, - { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + 
CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Mem;MemoryTLB_SMT", - "MetricName": "Page_Walks_Utilization_SMT" - }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time", @@ -277,19 +689,19 @@ }, { "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricExpr": "L1D_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L1D_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricExpr": "L2_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L2_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L3_Cache_Fill_BW_1T" }, @@ -307,26 +719,26 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( ( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE ) / 1000000000 ) / duration_time", + "MetricExpr": "((1 * (FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + 
FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1000000000) / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "GFLOPs", "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", - "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0", "MetricGroup": "SMT", "MetricName": "SMT_2T_Utilization" }, @@ -344,7 +756,7 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * ( arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@ ) / 1000000 / duration_time / 1000", + "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1000000 / duration_time / 1000", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use" }, -- GitLab From d2aaf04076ea217443707775c0dc792aeca3c641 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:16:05 -0700 Subject: [PATCH 1425/2223] perf vendor events: Update Intel ivytown Events are updated to v22 the core metrics are based on TMA 4.4 full. 
Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. - _SMT suffix metrics are dropped as the #SMT_On and #EBS_Mode are correctly expanded in the single main metric. - Addition of all 6 levels of TMA metrics. Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: 
https://lore.kernel.org/r/20221004021612.325521-17-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../pmu-events/arch/x86/ivytown/cache.json | 4 +- .../arch/x86/ivytown/floating-point.json | 2 +- .../pmu-events/arch/x86/ivytown/frontend.json | 18 +- .../arch/x86/ivytown/ivt-metrics.json | 630 +++++++++++++++--- .../arch/x86/ivytown/uncore-cache.json | 58 +- .../arch/x86/ivytown/uncore-interconnect.json | 84 +-- .../arch/x86/ivytown/uncore-memory.json | 2 +- .../arch/x86/ivytown/uncore-other.json | 6 +- .../arch/x86/ivytown/uncore-power.json | 8 +- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 10 files changed, 625 insertions(+), 189 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/ivytown/cache.json b/tools/perf/pmu-events/arch/x86/ivytown/cache.json index 27576d53b3472..d95b98c839143 100644 --- a/tools/perf/pmu-events/arch/x86/ivytown/cache.json +++ b/tools/perf/pmu-events/arch/x86/ivytown/cache.json @@ -21,7 +21,7 @@ "UMask": "0x2" }, { - "BriefDescription": "L1D miss oustandings duration in cycles", + "BriefDescription": "L1D miss outstanding duration in cycles", "Counter": "2", "CounterHTOff": "2", "EventCode": "0x48", @@ -658,7 +658,7 @@ "UMask": "0x8" }, { - "BriefDescription": "Cacheable and noncachaeble code read requests", + "BriefDescription": "Cacheable and noncacheable code read requests", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0xB0", diff --git a/tools/perf/pmu-events/arch/x86/ivytown/floating-point.json b/tools/perf/pmu-events/arch/x86/ivytown/floating-point.json index 4c2ac010cf55d..88891cba54ec8 100644 --- a/tools/perf/pmu-events/arch/x86/ivytown/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/ivytown/floating-point.json @@ -91,7 +91,7 @@ "UMask": "0x20" }, { - "BriefDescription": "Number of FP Computational Uops Executed this cycle. The number of FADD, FSUB, FCOM, FMULs, integer MULsand IMULs, FDIVs, FPREMs, FSQRTS, integer DIVs, and IDIVs. 
This event does not distinguish an FADD used in the middle of a transcendental flow from a s", + "BriefDescription": "Number of FP Computational Uops Executed this cycle. The number of FADD, FSUB, FCOM, FMULs, integer MULs and IMULs, FDIVs, FPREMs, FSQRTS, integer DIVs, and IDIVs. This event does not distinguish an FADD used in the middle of a transcendental flow from a s", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0x10", diff --git a/tools/perf/pmu-events/arch/x86/ivytown/frontend.json b/tools/perf/pmu-events/arch/x86/ivytown/frontend.json index 2b1a82dd86abc..0a295c4e093dd 100644 --- a/tools/perf/pmu-events/arch/x86/ivytown/frontend.json +++ b/tools/perf/pmu-events/arch/x86/ivytown/frontend.json @@ -176,41 +176,41 @@ "UMask": "0x4" }, { - "BriefDescription": "Cycles when uops are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy", + "BriefDescription": "Cycles when uops are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.MS_CYCLES", - "PublicDescription": "Cycles when uops are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy.", + "PublicDescription": "Cycles when uops are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy.", "SampleAfterValue": "2000003", "UMask": "0x30" }, { - "BriefDescription": "Cycles when uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy", + "BriefDescription": "Cycles when uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.MS_DSB_CYCLES", - 
"PublicDescription": "Cycles when uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy.", + "PublicDescription": "Cycles when uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy.", "SampleAfterValue": "2000003", "UMask": "0x10" }, { - "BriefDescription": "Deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while Microcode Sequenser (MS) is busy", + "BriefDescription": "Deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while Microcode Sequencer (MS) is busy", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "CounterMask": "1", "EdgeDetect": "1", "EventCode": "0x79", "EventName": "IDQ.MS_DSB_OCCUR", - "PublicDescription": "Deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while Microcode Sequenser (MS) is busy.", + "PublicDescription": "Deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while Microcode Sequencer (MS) is busy.", "SampleAfterValue": "2000003", "UMask": "0x10" }, { - "BriefDescription": "Uops initiated by Decode Stream Buffer (DSB) that are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy", + "BriefDescription": "Uops initiated by Decode Stream Buffer (DSB) that are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0x79", @@ -220,7 +220,7 @@ "UMask": "0x10" }, { - "BriefDescription": "Uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy", + "BriefDescription": "Uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", 
"EventCode": "0x79", @@ -242,7 +242,7 @@ "UMask": "0x30" }, { - "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy", + "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy", "Counter": "0,1,2,3", "CounterHTOff": "0,1,2,3,4,5,6,7", "EventCode": "0x79", diff --git a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json index 19c7f3b41102d..99a45c8d8ceeb 100644 --- a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json +++ b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json @@ -1,64 +1,524 @@ [ { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Frontend_Bound", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound." 
+ "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Frontend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. 
Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / SLOTS", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", + "MetricExpr": "ICACHE.IFETCH_STALL / CLKS - tma_itlb_misses", + "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. 
Sample with: ITLB_MISSES.WALK_COMPLETED", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). 
Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "ILD_STALL.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "3 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. 
Sample with: IDQ.MS_SWITCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "tma_frontend_bound - tma_fetch_latency", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_dsb", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. 
For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Bad_Speculation", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example." + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Bad_Speculation_SMT", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. 
Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) )", - "MetricGroup": "TopdownL1", - "MetricName": "Backend_Bound", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound." 
+ "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if (IPC > 1.8) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB)) * tma_backend_bound", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. 
This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / CLKS, 0)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "(7 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. 
This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_UOPS_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "(MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / CLKS", + "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. 
Sample with: MEM_UOPS_RETIRED.LOCK_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "13 * LD_BLOCKS.NO_SR / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_UOPS_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_4k_aliasing", + "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) )", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Backend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "Load_Miss_Real_Latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CLKS", + "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). 
Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. 
Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", + "MetricExpr": "(60 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) + 43 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD)))) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_contested_accesses", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "43 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / CLKS", + "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.LLC_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / CLKS", + "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "((OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2) if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / CORE_CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). 
The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "(1 - (MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS))) * CYCLE_ACTIVITY.STALLS_L2_PENDING / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. 
This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", + "MetricExpr": "200 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / CLKS", + "MetricGroup": "Server;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_local_dram", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. 
Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", + "MetricExpr": "310 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / CLKS", + "MetricGroup": "Server;Snoop;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_remote_dram", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", + "MetricExpr": "(200 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) + 180 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + 
MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD)))) / CLKS", + "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_remote_cache", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues. This is caused often due to non-optimal NUMA allocations. #link to NUMA article Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM_PS;MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "RESOURCE_STALLS.SB / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. 
Sample with: MEM_UOPS_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "((L2_RQSTS.RFO_HIT * 9 * (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES))) + (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_store_latency", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "(200 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM + 60 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / CORE_CLKS", + "MetricGroup": "TopdownL4;tma_store_bound_group", + "MetricName": "tma_split_stores", + "PublicDescription": "This metric represents rate of split store accesses. 
Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_UOPS_RETIRED.SPLIT_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "(7 * DTLB_STORE_MISSES.STLB_HIT + DTLB_STORE_MISSES.WALK_DURATION) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_UOPS_RETIRED.STLB_MISS_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck", + "MetricExpr": "tma_backend_bound - tma_memory_bound", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. 
FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "ARITH.FPU_DIV_ACTIVE / CORE_CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if (IPC > 1.8) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB) - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. 
For example; when there are too many multiply operations.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@) / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else 0) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or over oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. 
walking a linked list) - looking at the assembly can be helpful.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5) / (3 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] 
ALU and 2nd branch) Sample with: UOPS_DISPATCHED_PORT.PORT_0", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / CORE_CLKS", + "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_0", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_1", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_1", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU) Sample with: UOPS_DISPATCHED.PORT_5", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_5", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_2", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_3", + "MetricExpr": 
"UOPS_DISPATCHED_PORT.PORT_3 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data) Sample with: UOPS_DISPATCHED_PORT.PORT_4", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_4", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Retiring", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. " + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. 
issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "tma_retiring - tma_heavy_operations", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. 
Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", + "MetricGroup": "HPC;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fp_arith", + "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Retiring_SMT", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS * FP_COMP_OPS_EXE.X87 / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE) / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_scalar", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_vector", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "tma_microcode_sequencer", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / SLOTS", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. 
Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. Sample with: OTHER_ASSISTS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_cisc", + "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. 
Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.", + "ScaleUnit": "100%" }, { "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC" }, @@ -76,8 +536,8 @@ }, { "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / (INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "Pipeline;Mem", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", "MetricName": "CPI" }, { @@ -88,16 +548,10 @@ }, { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "TmaL1", + "MetricExpr": "4 * CORE_CLKS", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS" }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "TmaL1_SMT", - "MetricName": "SLOTS_SMT" - }, { "BriefDescription": "The ratio of Executed- by Issued-Uops", "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", @@ -107,37 +561,25 @@ }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC" }, - { - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;SMT;TmaL1_SMT", - 
"MetricName": "CoreIPC_SMT" - }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE ) / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;Flops", + "MetricExpr": "(1 * (FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / CORE_CLKS", + "MetricGroup": "Flops;Ret", "MetricName": "FLOPc" }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;Flops_SMT", - "MetricName": "FLOPc_SMT" - }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", + "MetricExpr": "((CPU_CLK_UNHALTED.THREAD / 2) * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / 
CPU_CLK_UNHALTED.REF_XCLK)) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2) if #SMT_on else CLKS", "MetricGroup": "SMT", "MetricName": "CORE_CLKS" }, @@ -179,15 +621,15 @@ }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricExpr": "1 / ( ((FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE) / UOPS_EXECUTED.THREAD) + ((FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) / UOPS_EXECUTED.THREAD) )", + "MetricExpr": "1 / (tma_fp_scalar + tma_fp_vector)", "MetricGroup": "Flops;InsType", "MetricName": "IpArith", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." }, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions" }, { @@ -204,7 +646,7 @@ }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", - "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )", + "MetricExpr": "IDQ.DSB_UOPS / ((IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS))", "MetricGroup": "DSB;Fed;FetchBW", "MetricName": "DSB_Coverage" }, @@ -216,47 +658,41 @@ }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb)", "MetricGroup": "Mem;MemoryBound;MemoryLat", "MetricName": 
"Load_Miss_Real_Latency" }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBound;MemoryBW", + "MetricGroup": "Mem;MemoryBW;MemoryBound", "MetricName": "MLP" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;Backend;CacheMisses", + "MetricGroup": "Backend;CacheMisses;Mem", "MetricName": "L2MPKI" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.LLC_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L3MPKI" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION) / CORE_CLKS", "MetricGroup": "Mem;MemoryTLB", "MetricName": "Page_Walks_Utilization" }, - { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + 
CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Mem;MemoryTLB_SMT", - "MetricName": "Page_Walks_Utilization_SMT" - }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time", @@ -277,19 +713,19 @@ }, { "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricExpr": "L1D_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L1D_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricExpr": "L2_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L2_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L3_Cache_Fill_BW_1T" }, @@ -307,26 +743,26 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( ( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE ) / 1000000000 ) / duration_time", + "MetricExpr": "((1 * (FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + 
FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1000000000) / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "GFLOPs", "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", - "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0", "MetricGroup": "SMT", "MetricName": "SMT_2T_Utilization" }, @@ -344,7 +780,7 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "( 64 * ( uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@ ) / 1000000000 ) / duration_time", + "MetricExpr": "(64 * (uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@) / 1000000000) / duration_time", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use" }, @@ -354,12 +790,6 @@ "MetricGroup": "SoC", "MetricName": "Socket_CLKS" }, - { - "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "cbox_0@event\\=0x0@ / #num_dies / duration_time / 1000000000", - "MetricGroup": "SoC", - "MetricName": "UNCORE_FREQ" - }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence 
rate]", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", @@ -407,5 +837,11 @@ "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency" + }, + { + "BriefDescription": "Uncore frequency per die [GHZ]", + "MetricExpr": "Socket_CLKS / #num_dies / duration_time / 1000000000", + "MetricGroup": "SoC", + "MetricName": "UNCORE_FREQ" } ] diff --git a/tools/perf/pmu-events/arch/x86/ivytown/uncore-cache.json b/tools/perf/pmu-events/arch/x86/ivytown/uncore-cache.json index 93e07385eeec7..c118ff54c30eb 100644 --- a/tools/perf/pmu-events/arch/x86/ivytown/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/ivytown/uncore-cache.json @@ -61,7 +61,7 @@ "EventCode": "0x34", "EventName": "UNC_C_LLC_LOOKUP.WRITE", "PerPkg": "1", - "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set filter mask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CBoGlCtrl[22:17] bits correspond to [M'FMESI] state.; Writeback transactions from L2 to the LLC This includes all write transactions -- both Cachable and UC.", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set filter mask bit 0 and select a state or states to match. Otherwise, the event will count nothing. 
CBoGlCtrl[22:17] bits correspond to [M'FMESI] state.; Writeback transactions from L2 to the LLC This includes all write transactions -- both Cacheable and UC.", "UMask": "0x5", "Unit": "CBO" }, @@ -999,7 +999,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.ALL", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; All transactions inserted into the TOR. This includes requests that reside in the TOR for a short time, such as LLC Hits that do not need to snoop cores or requests that get rejected and have to be retried through one of the ingress queues. The TOR is more commonly a bottleneck in skews with smaller core counts, where the ratio of RTIDs to TOR entries is larger. Note that there are reserved TOR entries for various request types, so it is possible that a given request type be blocked with an occupancy that is less than 20. Also note that generally requests will not be able to arbitrate into the TOR pipeline if there are no available TOR slots.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; All transactions inserted into the TOR. 
This includes requests that reside in the TOR for a short time, such as LLC Hits that do not need to snoop cores or requests that get rejected and have to be retried through one of the ingress queues. The TOR is more commonly a bottleneck in skews with smaller core counts, where the ratio of RTIDs to TOR entries is larger. Note that there are reserved TOR entries for various request types, so it is possible that a given request type be blocked with an occupancy that is less than 20. Also note that generally requests will not be able to arbitrate into the TOR pipeline if there are no available TOR slots.", "UMask": "0x8", "Unit": "CBO" }, @@ -1009,7 +1009,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.EVICTION", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Eviction transactions inserted into the TOR. Evictions can be quick, such as when the line is in the F, S, or E states and no core valid bits are set. They can also be longer if either CV bits are set (so the cores need to be snooped) and/or if there is a HitM (in which case it is necessary to write the request out to memory).", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. 
If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Eviction transactions inserted into the TOR. Evictions can be quick, such as when the line is in the F, S, or E states and no core valid bits are set. They can also be longer if either CV bits are set (so the cores need to be snooped) and/or if there is a HitM (in which case it is necessary to write the request out to memory).", "UMask": "0x4", "Unit": "CBO" }, @@ -1019,7 +1019,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.LOCAL", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; All transactions inserted into the TOR that are satisifed by locally HOMed memory.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. 
If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; All transactions inserted into the TOR that are satisfied by locally HOMed memory.", "UMask": "0x28", "Unit": "CBO" }, @@ -1029,7 +1029,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.LOCAL_OPCODE", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; All transactions, satisifed by an opcode, inserted into the TOR that are satisifed by locally HOMed memory.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; All transactions, satisfied by an opcode, inserted into the TOR that are satisfied by locally HOMed memory.", "UMask": "0x21", "Unit": "CBO" }, @@ -1039,7 +1039,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.MISS_LOCAL", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. 
Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Miss transactions inserted into the TOR that are satisifed by locally HOMed memory.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Miss transactions inserted into the TOR that are satisfied by locally HOMed memory.", "UMask": "0x2A", "Unit": "CBO" }, @@ -1049,7 +1049,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Miss transactions, satisifed by an opcode, inserted into the TOR that are satisifed by locally HOMed memory.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. 
Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Miss transactions, satisfied by an opcode, inserted into the TOR that are satisfied by locally HOMed memory.", "UMask": "0x23", "Unit": "CBO" }, @@ -1059,7 +1059,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.MISS_OPCODE", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Miss transactions inserted into the TOR that match an opcode.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Miss transactions inserted into the TOR that match an opcode.", "UMask": "0x3", "Unit": "CBO" }, @@ -1069,7 +1069,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.MISS_REMOTE", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. 
Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Miss transactions inserted into the TOR that are satisifed by remote caches or remote memory.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Miss transactions inserted into the TOR that are satisfied by remote caches or remote memory.", "UMask": "0x8A", "Unit": "CBO" }, @@ -1079,7 +1079,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Miss transactions, satisifed by an opcode, inserted into the TOR that are satisifed by remote caches or remote memory.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. 
Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Miss transactions, satisfied by an opcode, inserted into the TOR that are satisfied by remote caches or remote memory.", "UMask": "0x83", "Unit": "CBO" }, @@ -1089,7 +1089,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.NID_ALL", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; All NID matched (matches an RTID destination) transactions inserted into the TOR. The NID is programmed in Cn_MSR_PMON_BOX_FILTER.nid. In conjunction with STATE = I, it is possible to monitor misses to specific NIDs in the system.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; All NID matched (matches an RTID destination) transactions inserted into the TOR. The NID is programmed in Cn_MSR_PMON_BOX_FILTER.nid. 
In conjunction with STATE = I, it is possible to monitor misses to specific NIDs in the system.", "UMask": "0x48", "Unit": "CBO" }, @@ -1099,7 +1099,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.NID_EVICTION", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; NID matched eviction transactions inserted into the TOR.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; NID matched eviction transactions inserted into the TOR.", "UMask": "0x44", "Unit": "CBO" }, @@ -1109,7 +1109,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.NID_MISS_ALL", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. 
If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; All NID matched miss requests that were inserted into the TOR.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; All NID matched miss requests that were inserted into the TOR.", "UMask": "0x4A", "Unit": "CBO" }, @@ -1119,7 +1119,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.NID_MISS_OPCODE", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Miss transactions inserted into the TOR that match a NID and an opcode.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. 
If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Miss transactions inserted into the TOR that match a NID and an opcode.", "UMask": "0x43", "Unit": "CBO" }, @@ -1129,7 +1129,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.NID_OPCODE", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Transactions inserted into the TOR that match a NID and an opcode.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Transactions inserted into the TOR that match a NID and an opcode.", "UMask": "0x41", "Unit": "CBO" }, @@ -1139,7 +1139,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.NID_WB", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. 
If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; NID matched write transactions inserted into the TOR.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; NID matched write transactions inserted into the TOR.", "UMask": "0x50", "Unit": "CBO" }, @@ -1149,7 +1149,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.OPCODE", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Transactions inserted into the TOR that match an opcode (matched by Cn_MSR_PMON_BOX_FILTER.opc)", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. 
If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Transactions inserted into the TOR that match an opcode (matched by Cn_MSR_PMON_BOX_FILTER.opc)", "UMask": "0x1", "Unit": "CBO" }, @@ -1159,7 +1159,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.REMOTE", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; All transactions inserted into the TOR that are satisifed by remote caches or remote memory.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; All transactions inserted into the TOR that are satisfied by remote caches or remote memory.", "UMask": "0x88", "Unit": "CBO" }, @@ -1169,7 +1169,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.REMOTE_OPCODE", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. 
Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; All transactions, satisifed by an opcode, inserted into the TOR that are satisifed by remote caches or remote memory.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; All transactions, satisfied by an opcode, inserted into the TOR that are satisfied by remote caches or remote memory.", "UMask": "0x81", "Unit": "CBO" }, @@ -1179,7 +1179,7 @@ "EventCode": "0x35", "EventName": "UNC_C_TOR_INSERTS.WB", "PerPkg": "1", - "PublicDescription": "Counts the number of entries successfuly inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Write transactions inserted into the TOR. This does not include RFO, but actual operations that contain data being sent from the core.", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. 
Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182).; Write transactions inserted into the TOR. This does not include RFO, but actual operations that contain data being sent from the core.", "UMask": "0x10", "Unit": "CBO" }, @@ -1215,7 +1215,7 @@ "EventCode": "0x36", "EventName": "UNC_C_TOR_OCCUPANCY.LOCAL_OPCODE", "PerPkg": "1", - "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182); Number of outstanding transactions, satisifed by an opcode, in the TOR that are satisifed by locally HOMed memory.", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. 
If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182); Number of outstanding transactions, satisfied by an opcode, in the TOR that are satisfied by locally HOMed memory.", "UMask": "0x21", "Unit": "CBO" }, @@ -1242,7 +1242,7 @@ "EventCode": "0x36", "EventName": "UNC_C_TOR_OCCUPANCY.MISS_LOCAL_OPCODE", "PerPkg": "1", - "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182); Number of outstanding Miss transactions, satisifed by an opcode, in the TOR that are satisifed by locally HOMed memory.", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182); Number of outstanding Miss transactions, satisfied by an opcode, in the TOR that are satisfied by locally HOMed memory.", "UMask": "0x23", "Unit": "CBO" }, @@ -1269,7 +1269,7 @@ "EventCode": "0x36", "EventName": "UNC_C_TOR_OCCUPANCY.MISS_REMOTE_OPCODE", "PerPkg": "1", - "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. 
There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182); Number of outstanding Miss transactions, satisifed by an opcode, in the TOR that are satisifed by remote caches or remote memory.", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182); Number of outstanding Miss transactions, satisfied by an opcode, in the TOR that are satisfied by remote caches or remote memory.", "UMask": "0x83", "Unit": "CBO" }, @@ -1350,7 +1350,7 @@ "EventCode": "0x36", "EventName": "UNC_C_TOR_OCCUPANCY.REMOTE_OPCODE", "PerPkg": "1", - "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. 
If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182); Number of outstanding transactions, satisifed by an opcode, in the TOR that are satisifed by remote caches or remote memory.", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. There are a number of subevent 'filters' but only a subset of the subevent combinations are valid. Subevents that require an opcode or NID match require the Cn_MSR_PMON_BOX_FILTER.{opc, nid} field to be set. If, for example, one wanted to count DRD Local Misses, one should select MISS_OPC_MATCH and set Cn_MSR_PMON_BOX_FILTER.opc to DRD (0x182); Number of outstanding transactions, satisfied by an opcode, in the TOR that are satisfied by remote caches or remote memory.", "UMask": "0x81", "Unit": "CBO" }, @@ -1446,7 +1446,7 @@ "EventCode": "0x2", "EventName": "UNC_C_TxR_INSERTS.BL_CORE", "PerPkg": "1", - "PublicDescription": "Number of allocations into the Cbo Egress. The Egress is used to queue up requests destined for the ring.; Ring transactions from the Corebo destined for the BL ring. This is commonly used for transfering writeback data to the cache.", + "PublicDescription": "Number of allocations into the Cbo Egress. The Egress is used to queue up requests destined for the ring.; Ring transactions from the Corebo destined for the BL ring. This is commonly used for transferring writeback data to the cache.", "UMask": "0x40", "Unit": "CBO" }, @@ -1692,7 +1692,7 @@ "EventCode": "0xb", "EventName": "UNC_H_CONFLICT_CYCLES.LAST", "PerPkg": "1", - "PublicDescription": "Count every last conflictor in conflict chain. Can be used to compute the average conflict chain length as (#Ackcnflts/#LastConflictor)+1. 
This can be used to give a feel for the conflict chain lenghts while analyzing lock kernels.", + "PublicDescription": "Count every last conflictor in conflict chain. Can be used to compute the average conflict chain length as (#Ackcnflts/#LastConflictor)+1. This can be used to give a feel for the conflict chain lengths while analyzing lock kernels.", "UMask": "0x4", "Unit": "HA" }, @@ -1729,7 +1729,7 @@ "EventCode": "0x41", "EventName": "UNC_H_DIRECTORY_LAT_OPT", "PerPkg": "1", - "PublicDescription": "Directory Latency Optimization Data Return Path Taken. When directory mode is enabled and the directory retuned for a read is Dir=I, then data can be returned using a faster path if certain conditions are met (credits, free pipeline, etc).", + "PublicDescription": "Directory Latency Optimization Data Return Path Taken. When directory mode is enabled and the directory returned for a read is Dir=I, then data can be returned using a faster path if certain conditions are met (credits, free pipeline, etc).", "Unit": "HA" }, { @@ -2686,7 +2686,7 @@ "EventCode": "0x21", "EventName": "UNC_H_SNOOP_RESP.RSPSFWD", "PerPkg": "1", - "PublicDescription": "Counts the total number of RspI snoop responses received. Whenever a snoops are issued, one or more snoop responses will be returned depending on the topology of the system. In systems larger than 2s, when multiple snoops are returned this will count all the snoops that are received. For example, if 3 snoops were issued and returned RspI, RspS, and RspSFwd; then each of these sub-events would increment by 1.; Filters for a snoop response of RspSFwd. This is returned when a remote caching agent forwards data but holds on to its currentl copy. This is common for data and code reads that hit in a remote socket in E or F state.", + "PublicDescription": "Counts the total number of RspI snoop responses received. Whenever snoops are issued, one or more snoop responses will be returned depending on the topology of the system.
In systems larger than 2s, when multiple snoops are returned this will count all the snoops that are received. For example, if 3 snoops were issued and returned RspI, RspS, and RspSFwd; then each of these sub-events would increment by 1.; Filters for a snoop response of RspSFwd. This is returned when a remote caching agent forwards data but holds on to its current copy. This is common for data and code reads that hit in a remote socket in E or F state.", "UMask": "0x8", "Unit": "HA" }, @@ -2766,7 +2766,7 @@ "EventCode": "0x60", "EventName": "UNC_H_SNP_RESP_RECV_LOCAL.RSPSFWD", "PerPkg": "1", - "PublicDescription": "Number of snoop responses received for a Local request; Filters for a snoop response of RspSFwd. This is returned when a remote caching agent forwards data but holds on to its currentl copy. This is common for data and code reads that hit in a remote socket in E or F state.", + "PublicDescription": "Number of snoop responses received for a Local request; Filters for a snoop response of RspSFwd. This is returned when a remote caching agent forwards data but holds on to its current copy. This is common for data and code reads that hit in a remote socket in E or F state.", "UMask": "0x8", "Unit": "HA" }, diff --git a/tools/perf/pmu-events/arch/x86/ivytown/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/ivytown/uncore-interconnect.json index b3b1a08d4acf5..10ea4afeffc13 100644 --- a/tools/perf/pmu-events/arch/x86/ivytown/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/ivytown/uncore-interconnect.json @@ -24,7 +24,7 @@ "EventCode": "0x13", "EventName": "UNC_Q_DIRECT2CORE.FAILURE_CREDITS", "PerPkg": "1", - "PublicDescription": "Counts the number of DRS packets that we attempted to do direct2core on. There are 4 mutually exlusive filters. Filter [0] can be used to get successful spawns, while [1:3] provide the different failure cases. Note that this does not count packets that are not candidates for Direct2Core.
The only candidates for Direct2Core are DRS packets destined for Cbos.; The spawn failed because there were not enough Egress credits. Had there been enough credits, the spawn would have worked as the RBT bit was set and the RBT tag matched.", + "PublicDescription": "Counts the number of DRS packets that we attempted to do direct2core on. There are 4 mutually exclusive filters. Filter [0] can be used to get successful spawns, while [1:3] provide the different failure cases. Note that this does not count packets that are not candidates for Direct2Core. The only candidates for Direct2Core are DRS packets destined for Cbos.; The spawn failed because there were not enough Egress credits. Had there been enough credits, the spawn would have worked as the RBT bit was set and the RBT tag matched.", "UMask": "0x2", "Unit": "QPI LL" }, @@ -34,7 +34,7 @@ "EventCode": "0x13", "EventName": "UNC_Q_DIRECT2CORE.FAILURE_CREDITS_MISS", "PerPkg": "1", - "PublicDescription": "Counts the number of DRS packets that we attempted to do direct2core on. There are 4 mutually exlusive filters. Filter [0] can be used to get successful spawns, while [1:3] provide the different failure cases. Note that this does not count packets that are not candidates for Direct2Core. The only candidates for Direct2Core are DRS packets destined for Cbos.; The spawn failed because the RBT tag did not match and there weren't enough Egress credits. The valid bit was set.", + "PublicDescription": "Counts the number of DRS packets that we attempted to do direct2core on. There are 4 mutually exclusive filters. Filter [0] can be used to get successful spawns, while [1:3] provide the different failure cases. Note that this does not count packets that are not candidates for Direct2Core. The only candidates for Direct2Core are DRS packets destined for Cbos.; The spawn failed because the RBT tag did not match and there weren't enough Egress credits. 
The valid bit was set.", "UMask": "0x20", "Unit": "QPI LL" }, @@ -44,7 +44,7 @@ "EventCode": "0x13", "EventName": "UNC_Q_DIRECT2CORE.FAILURE_CREDITS_RBT", "PerPkg": "1", - "PublicDescription": "Counts the number of DRS packets that we attempted to do direct2core on. There are 4 mutually exlusive filters. Filter [0] can be used to get successful spawns, while [1:3] provide the different failure cases. Note that this does not count packets that are not candidates for Direct2Core. The only candidates for Direct2Core are DRS packets destined for Cbos.; The spawn failed because there were not enough Egress credits AND the RBT bit was not set, but the RBT tag matched.", + "PublicDescription": "Counts the number of DRS packets that we attempted to do direct2core on. There are 4 mutually exclusive filters. Filter [0] can be used to get successful spawns, while [1:3] provide the different failure cases. Note that this does not count packets that are not candidates for Direct2Core. The only candidates for Direct2Core are DRS packets destined for Cbos.; The spawn failed because there were not enough Egress credits AND the RBT bit was not set, but the RBT tag matched.", "UMask": "0x8", "Unit": "QPI LL" }, @@ -54,7 +54,7 @@ "EventCode": "0x13", "EventName": "UNC_Q_DIRECT2CORE.FAILURE_CREDITS_RBT_MISS", "PerPkg": "1", - "PublicDescription": "Counts the number of DRS packets that we attempted to do direct2core on. There are 4 mutually exlusive filters. Filter [0] can be used to get successful spawns, while [1:3] provide the different failure cases. Note that this does not count packets that are not candidates for Direct2Core. The only candidates for Direct2Core are DRS packets destined for Cbos.; The spawn failed because the RBT tag did not match, the valid bit was not set and there weren't enough Egress credits.", + "PublicDescription": "Counts the number of DRS packets that we attempted to do direct2core on. There are 4 mutually exclusive filters. 
Filter [0] can be used to get successful spawns, while [1:3] provide the different failure cases. Note that this does not count packets that are not candidates for Direct2Core. The only candidates for Direct2Core are DRS packets destined for Cbos.; The spawn failed because the RBT tag did not match, the valid bit was not set and there weren't enough Egress credits.", "UMask": "0x80", "Unit": "QPI LL" }, @@ -64,7 +64,7 @@ "EventCode": "0x13", "EventName": "UNC_Q_DIRECT2CORE.FAILURE_MISS", "PerPkg": "1", - "PublicDescription": "Counts the number of DRS packets that we attempted to do direct2core on. There are 4 mutually exlusive filters. Filter [0] can be used to get successful spawns, while [1:3] provide the different failure cases. Note that this does not count packets that are not candidates for Direct2Core. The only candidates for Direct2Core are DRS packets destined for Cbos.; The spawn failed because the RBT tag did not match although the valid bit was set and there were enough Egress credits.", + "PublicDescription": "Counts the number of DRS packets that we attempted to do direct2core on. There are 4 mutually exclusive filters. Filter [0] can be used to get successful spawns, while [1:3] provide the different failure cases. Note that this does not count packets that are not candidates for Direct2Core. The only candidates for Direct2Core are DRS packets destined for Cbos.; The spawn failed because the RBT tag did not match although the valid bit was set and there were enough Egress credits.", "UMask": "0x10", "Unit": "QPI LL" }, @@ -74,7 +74,7 @@ "EventCode": "0x13", "EventName": "UNC_Q_DIRECT2CORE.FAILURE_RBT_HIT", "PerPkg": "1", - "PublicDescription": "Counts the number of DRS packets that we attempted to do direct2core on. There are 4 mutually exlusive filters. Filter [0] can be used to get successful spawns, while [1:3] provide the different failure cases. Note that this does not count packets that are not candidates for Direct2Core. 
The only candidates for Direct2Core are DRS packets destined for Cbos.; The spawn failed because the route-back table (RBT) specified that the transaction should not trigger a direct2core tranaction. This is common for IO transactions. There were enough Egress credits and the RBT tag matched but the valid bit was not set.", + "PublicDescription": "Counts the number of DRS packets that we attempted to do direct2core on. There are 4 mutually exclusive filters. Filter [0] can be used to get successful spawns, while [1:3] provide the different failure cases. Note that this does not count packets that are not candidates for Direct2Core. The only candidates for Direct2Core are DRS packets destined for Cbos.; The spawn failed because the route-back table (RBT) specified that the transaction should not trigger a direct2core transaction. This is common for IO transactions. There were enough Egress credits and the RBT tag matched but the valid bit was not set.", "UMask": "0x4", "Unit": "QPI LL" }, @@ -84,7 +84,7 @@ "EventCode": "0x13", "EventName": "UNC_Q_DIRECT2CORE.FAILURE_RBT_MISS", "PerPkg": "1", - "PublicDescription": "Counts the number of DRS packets that we attempted to do direct2core on. There are 4 mutually exlusive filters. Filter [0] can be used to get successful spawns, while [1:3] provide the different failure cases. Note that this does not count packets that are not candidates for Direct2Core. The only candidates for Direct2Core are DRS packets destined for Cbos.; The spawn failed because the RBT tag did not match and the valid bit was not set although there were enough Egress credits.", + "PublicDescription": "Counts the number of DRS packets that we attempted to do direct2core on. There are 4 mutually exclusive filters. Filter [0] can be used to get successful spawns, while [1:3] provide the different failure cases. Note that this does not count packets that are not candidates for Direct2Core. 
The only candidates for Direct2Core are DRS packets destined for Cbos.; The spawn failed because the RBT tag did not match and the valid bit was not set although there were enough Egress credits.", "UMask": "0x40", "Unit": "QPI LL" }, @@ -94,7 +94,7 @@ "EventCode": "0x13", "EventName": "UNC_Q_DIRECT2CORE.SUCCESS_RBT_HIT", "PerPkg": "1", - "PublicDescription": "Counts the number of DRS packets that we attempted to do direct2core on. There are 4 mutually exlusive filters. Filter [0] can be used to get successful spawns, while [1:3] provide the different failure cases. Note that this does not count packets that are not candidates for Direct2Core. The only candidates for Direct2Core are DRS packets destined for Cbos.; The spawn was successful. There were sufficient credits, the RBT valid bit was set and there was an RBT tag match. The message was marked to spawn direct2core.", + "PublicDescription": "Counts the number of DRS packets that we attempted to do direct2core on. There are 4 mutually exclusive filters. Filter [0] can be used to get successful spawns, while [1:3] provide the different failure cases. Note that this does not count packets that are not candidates for Direct2Core. The only candidates for Direct2Core are DRS packets destined for Cbos.; The spawn was successful. There were sufficient credits, the RBT valid bit was set and there was an RBT tag match. The message was marked to spawn direct2core.", "UMask": "0x1", "Unit": "QPI LL" }, @@ -131,7 +131,7 @@ "EventCode": "0x9", "EventName": "UNC_Q_RxL_BYPASSED", "PerPkg": "1", - "PublicDescription": "Counts the number of times that an incoming flit was able to bypass the flit buffer and pass directly across the BGF and into the Egress. This is a latency optimization, and should generally be the common case. 
If this value is less than the number of flits transfered, it implies that there was queueing getting onto the ring, and thus the transactions saw higher latency.", + "PublicDescription": "Counts the number of times that an incoming flit was able to bypass the flit buffer and pass directly across the BGF and into the Egress. This is a latency optimization, and should generally be the common case. If this value is less than the number of flits transferred, it implies that there was queueing getting onto the ring, and thus the transactions saw higher latency.", "Unit": "QPI LL" }, { @@ -443,7 +443,7 @@ "EventCode": "0x1", "EventName": "UNC_Q_RxL_FLITS_G0.DATA", "PerPkg": "1", - "PublicDescription": "Counts the number of flits received from the QPI Link. It includes filters for Idle, protocol, and Data Flits. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time (for L0) or 4B instead of 8B for L0p.; Number of data flitsreceived over QPI. Each flit contains 64b of data. This includes both DRS and NCB data flits (coherent and non-coherent). This can be used to calculate the data bandwidth of the QPI link. 
One can get a good picture of the QPI-link characteristics by evaluating the protocol flits, data flits, and idle/null flits. This does not include the header flits that go in data packets.", + "PublicDescription": "Counts the number of flits received from the QPI Link. It includes filters for Idle, protocol, and Data Flits. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time (for L0) or 4B instead of 8B for L0p.; Number of data flits received over QPI. Each flit contains 64b of data. This includes both DRS and NCB data flits (coherent and non-coherent). This can be used to calculate the data bandwidth of the QPI link. One can get a good picture of the QPI-link characteristics by evaluating the protocol flits, data flits, and idle/null flits. This does not include the header flits that go in data packets.", "UMask": "0x2", "Unit": "QPI LL" }, @@ -453,7 +453,7 @@ "EventCode": "0x1", "EventName": "UNC_Q_RxL_FLITS_G0.IDLE", "PerPkg": "1", - "PublicDescription": "Counts the number of flits received from the QPI Link. It includes filters for Idle, protocol, and Data Flits. 
Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time (for L0) or 4B instead of 8B for L0p.; Number of flits received over QPI that do not hold protocol payload. When QPI is not in a power saving state, it continuously transmits flits across the link. When there are no protocol flits to send, it will send IDLE and NULL flits across. These flits sometimes do carry a payload, such as credit returns, but are generall not considered part of the QPI bandwidth.", + "PublicDescription": "Counts the number of flits received from the QPI Link. It includes filters for Idle, protocol, and Data Flits. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. 
One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time (for L0) or 4B instead of 8B for L0p.; Number of flits received over QPI that do not hold protocol payload. When QPI is not in a power saving state, it continuously transmits flits across the link. When there are no protocol flits to send, it will send IDLE and NULL flits across. These flits sometimes do carry a payload, such as credit returns, but are generally not considered part of the QPI bandwidth.", "UMask": "0x1", "Unit": "QPI LL" }, @@ -463,7 +463,7 @@ "EventCode": "0x1", "EventName": "UNC_Q_RxL_FLITS_G0.NON_DATA", "PerPkg": "1", - "PublicDescription": "Counts the number of flits received from the QPI Link. It includes filters for Idle, protocol, and Data Flits. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. 
To calculate data bandwidth, one should therefore do: data flits * 8B / time (for L0) or 4B instead of 8B for L0p.; Number of non-NULL non-data flits received across QPI. This basically tracks the protocol overhead on the QPI link. One can get a good picture of the QPI-link characteristics by evaluating the protocol flits, data flits, and idle/null flits. This includes the header flits for data packets.", + "PublicDescription": "Counts the number of flits received from the QPI Link. It includes filters for Idle, protocol, and Data Flits. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time (for L0) or 4B instead of 8B for L0p.; Number of non-NULL non-data flits received across QPI. This basically tracks the protocol overhead on the QPI link. One can get a good picture of the QPI-link characteristics by evaluating the protocol flits, data flits, and idle/null flits. This includes the header flits for data packets.", "UMask": "0x4", "Unit": "QPI LL" }, @@ -474,7 +474,7 @@ "EventName": "UNC_Q_RxL_FLITS_G1.DRS", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits received from the QPI Link. 
This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of flits received over QPI on the DRS (Data Response) channel. DRS flits are used to transmit data with coherency. This does not count data flits received over the NCB channel which transmits non-coherent data.", + "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. 
One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of flits received over QPI on the DRS (Data Response) channel. DRS flits are used to transmit data with coherency. This does not count data flits received over the NCB channel which transmits non-coherent data.", "UMask": "0x18", "Unit": "QPI LL" }, @@ -485,7 +485,7 @@ "EventName": "UNC_Q_RxL_FLITS_G1.DRS_DATA", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of data flits received over QPI on the DRS (Data Response) channel. 
DRS flits are used to transmit data with coherency. This does not count data flits received over the NCB channel which transmits non-coherent data. This includes only the data flits (not the header).", + "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of data flits received over QPI on the DRS (Data Response) channel. DRS flits are used to transmit data with coherency. This does not count data flits received over the NCB channel which transmits non-coherent data. This includes only the data flits (not the header).", "UMask": "0x8", "Unit": "QPI LL" }, @@ -496,7 +496,7 @@ "EventName": "UNC_Q_RxL_FLITS_G1.DRS_NONDATA", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. 
Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of protocol flits received over QPI on the DRS (Data Response) channel. DRS flits are used to transmit data with coherency. This does not count data flits received over the NCB channel which transmits non-coherent data. This includes only the header flits (not the data). This includes extended headers.", + "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. 
One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of protocol flits received over QPI on the DRS (Data Response) channel. DRS flits are used to transmit data with coherency. This does not count data flits received over the NCB channel which transmits non-coherent data. This includes only the header flits (not the data). This includes extended headers.", "UMask": "0x10", "Unit": "QPI LL" }, @@ -507,7 +507,7 @@ "EventName": "UNC_Q_RxL_FLITS_G1.HOM", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. 
To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the number of flits received over QPI on the home channel.", + "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the number of flits received over QPI on the home channel.", "UMask": "0x6", "Unit": "QPI LL" }, @@ -518,7 +518,7 @@ "EventName": "UNC_Q_RxL_FLITS_G1.HOM_NONREQ", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. 
When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the number of non-request flits received over QPI on the home channel. These are most commonly snoop responses, and this event can be used as a proxy for that.", + "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the number of non-request flits received over QPI on the home channel. 
These are most commonly snoop responses, and this event can be used as a proxy for that.", "UMask": "0x4", "Unit": "QPI LL" }, @@ -529,7 +529,7 @@ "EventName": "UNC_Q_RxL_FLITS_G1.HOM_REQ", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the number of data request received over QPI on the home channel. This basically counts the number of remote memory requests received over QPI. In conjunction with the local read count in the Home Agent, one can calculate the number of LLC Misses.", + "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). 
In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the number of data request received over QPI on the home channel. This basically counts the number of remote memory requests received over QPI. In conjunction with the local read count in the Home Agent, one can calculate the number of LLC Misses.", "UMask": "0x2", "Unit": "QPI LL" }, @@ -540,7 +540,7 @@ "EventName": "UNC_Q_RxL_FLITS_G1.SNP", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. 
For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the number of snoop request flits received over QPI. These requests are contained in the snoop channel. This does not include snoop responses, which are received on the home channel.", + "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the number of snoop request flits received over QPI. These requests are contained in the snoop channel. This does not include snoop responses, which are received on the home channel.", "UMask": "0x1", "Unit": "QPI LL" }, @@ -551,7 +551,7 @@ "EventName": "UNC_Q_RxL_FLITS_G2.NCB", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits received from the QPI Link. 
This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Number of Non-Coherent Bypass flits. These packets are generally used to transmit non-coherent data across QPI.", + "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. 
For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Number of Non-Coherent Bypass flits. These packets are generally used to transmit non-coherent data across QPI.", "UMask": "0xC", "Unit": "QPI LL" }, @@ -562,7 +562,7 @@ "EventName": "UNC_Q_RxL_FLITS_G2.NCB_DATA", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Number of Non-Coherent Bypass data flits. These flits are generally used to transmit non-coherent data across QPI. This does not include a count of the DRS (coherent) data flits. This only counts the data flits, not the NCB headers.", + "PublicDescription": "Counts the number of flits received from the QPI Link. 
This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Number of Non-Coherent Bypass data flits. These flits are generally used to transmit non-coherent data across QPI. This does not include a count of the DRS (coherent) data flits. This only counts the data flits, not the NCB headers.", "UMask": "0x4", "Unit": "QPI LL" }, @@ -573,7 +573,7 @@ "EventName": "UNC_Q_RxL_FLITS_G2.NCB_NONDATA", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. 
When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Number of Non-Coherent Bypass non-data flits. These packets are generally used to transmit non-coherent data across QPI, and the flits counted here are for headers and other non-data flits. This includes extended headers.", + "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Number of Non-Coherent Bypass non-data flits. 
These packets are generally used to transmit non-coherent data across QPI, and the flits counted here are for headers and other non-data flits. This includes extended headers.", "UMask": "0x8", "Unit": "QPI LL" }, @@ -584,7 +584,7 @@ "EventName": "UNC_Q_RxL_FLITS_G2.NCS", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Number of NCS (non-coherent standard) flits received over QPI. This includes extended headers.", + "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). 
In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Number of NCS (non-coherent standard) flits received over QPI. This includes extended headers.", "UMask": "0x10", "Unit": "QPI LL" }, @@ -595,7 +595,7 @@ "EventName": "UNC_Q_RxL_FLITS_G2.NDR_AD", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. 
To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of flits received over the NDR (Non-Data Response) channel. This channel is used to send a variety of protocol flits including grants and completions. This is only for NDR packets to the local socket which use the AK ring.", + "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of flits received over the NDR (Non-Data Response) channel. This channel is used to send a variety of protocol flits including grants and completions. This is only for NDR packets to the local socket which use the AK ring.", "UMask": "0x1", "Unit": "QPI LL" }, @@ -606,7 +606,7 @@ "EventName": "UNC_Q_RxL_FLITS_G2.NDR_AK", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. 
It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of flits received over the NDR (Non-Data Response) channel. This channel is used to send a variety of protocol flits including grants and completions. This is only for NDR packets destined for Route-thru to a remote socket.", + "PublicDescription": "Counts the number of flits received from the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. 
Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of flits received over the NDR (Non-Data Response) channel. This channel is used to send a variety of protocol flits including grants and completions. This is only for NDR packets destined for Route-thru to a remote socket.", "UMask": "0x2", "Unit": "QPI LL" }, @@ -1227,7 +1227,7 @@ "Counter": "0,1,2,3", "EventName": "UNC_Q_TxL_FLITS_G0.DATA", "PerPkg": "1", - "PublicDescription": "Counts the number of flits transmitted across the QPI Link. It includes filters for Idle, protocol, and Data Flits. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time (for L0) or 4B instead of 8B for L0p.; Number of data flits transmitted over QPI. Each flit contains 64b of data. This includes both DRS and NCB data flits (coherent and non-coherent). 
This can be used to calculate the data bandwidth of the QPI link. One can get a good picture of the QPI-link characteristics by evaluating the protocol flits, data flits, and idle/null flits. This does not include the header flits that go in data packets.", + "PublicDescription": "Counts the number of flits transmitted across the QPI Link. It includes filters for Idle, protocol, and Data Flits. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time (for L0) or 4B instead of 8B for L0p.; Number of data flits transmitted over QPI. Each flit contains 64b of data. This includes both DRS and NCB data flits (coherent and non-coherent). This can be used to calculate the data bandwidth of the QPI link. One can get a good picture of the QPI-link characteristics by evaluating the protocol flits, data flits, and idle/null flits. This does not include the header flits that go in data packets.", "UMask": "0x2", "Unit": "QPI LL" }, @@ -1236,7 +1236,7 @@ "Counter": "0,1,2,3", "EventName": "UNC_Q_TxL_FLITS_G0.NON_DATA", "PerPkg": "1", - "PublicDescription": "Counts the number of flits transmitted across the QPI Link. 
It includes filters for Idle, protocol, and Data Flits. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time (for L0) or 4B instead of 8B for L0p.; Number of non-NULL non-data flits transmitted across QPI. This basically tracks the protocol overhead on the QPI link. One can get a good picture of the QPI-link characteristics by evaluating the protocol flits, data flits, and idle/null flits. This includes the header flits for data packets.", + "PublicDescription": "Counts the number of flits transmitted across the QPI Link. It includes filters for Idle, protocol, and Data Flits. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. 
One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time (for L0) or 4B instead of 8B for L0p.; Number of non-NULL non-data flits transmitted across QPI. This basically tracks the protocol overhead on the QPI link. One can get a good picture of the QPI-link characteristics by evaluating the protocol flits, data flits, and idle/null flits. This includes the header flits for data packets.", "UMask": "0x4", "Unit": "QPI LL" }, @@ -1246,7 +1246,7 @@ "EventName": "UNC_Q_TxL_FLITS_G1.DRS", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits trasmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. 
To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of flits transmitted over QPI on the DRS (Data Response) channel. DRS flits are used to transmit data with coherency.", + "PublicDescription": "Counts the number of flits transmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of flits transmitted over QPI on the DRS (Data Response) channel. DRS flits are used to transmit data with coherency.", "UMask": "0x18", "Unit": "QPI LL" }, @@ -1256,7 +1256,7 @@ "EventName": "UNC_Q_TxL_FLITS_G1.DRS_DATA", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits trasmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). 
In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of data flits transmitted over QPI on the DRS (Data Response) channel. DRS flits are used to transmit data with coherency. This does not count data flits transmitted over the NCB channel which transmits non-coherent data. This includes only the data flits (not the header).", + "PublicDescription": "Counts the number of flits transmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. 
For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of data flits transmitted over QPI on the DRS (Data Response) channel. DRS flits are used to transmit data with coherency. This does not count data flits transmitted over the NCB channel which transmits non-coherent data. This includes only the data flits (not the header).", "UMask": "0x8", "Unit": "QPI LL" }, @@ -1266,7 +1266,7 @@ "EventName": "UNC_Q_TxL_FLITS_G1.DRS_NONDATA", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits trasmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of protocol flits transmitted over QPI on the DRS (Data Response) channel. DRS flits are used to transmit data with coherency. 
This does not count data flits transmitted over the NCB channel which transmits non-coherent data. This includes only the header flits (not the data). This includes extended headers.", + "PublicDescription": "Counts the number of flits transmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of protocol flits transmitted over QPI on the DRS (Data Response) channel. DRS flits are used to transmit data with coherency. This does not count data flits transmitted over the NCB channel which transmits non-coherent data. This includes only the header flits (not the data). This includes extended headers.", "UMask": "0x10", "Unit": "QPI LL" }, @@ -1276,7 +1276,7 @@ "EventName": "UNC_Q_TxL_FLITS_G1.HOM", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits trasmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. 
Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the number of flits transmitted over QPI on the home channel.", + "PublicDescription": "Counts the number of flits transmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. 
For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the number of flits transmitted over QPI on the home channel.", "UMask": "0x6", "Unit": "QPI LL" }, @@ -1286,7 +1286,7 @@ "EventName": "UNC_Q_TxL_FLITS_G1.HOM_NONREQ", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits trasmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the number of non-request flits transmitted over QPI on the home channel. These are most commonly snoop responses, and this event can be used as a proxy for that.", + "PublicDescription": "Counts the number of flits transmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. 
Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the number of non-request flits transmitted over QPI on the home channel. These are most commonly snoop responses, and this event can be used as a proxy for that.", "UMask": "0x4", "Unit": "QPI LL" }, @@ -1296,7 +1296,7 @@ "EventName": "UNC_Q_TxL_FLITS_G1.HOM_REQ", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits trasmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. 
Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the number of data request transmitted over QPI on the home channel. This basically counts the number of remote memory requests transmitted over QPI. In conjunction with the local read count in the Home Agent, one can calculate the number of LLC Misses.", + "PublicDescription": "Counts the number of flits transmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the number of data request transmitted over QPI on the home channel. This basically counts the number of remote memory requests transmitted over QPI. 
In conjunction with the local read count in the Home Agent, one can calculate the number of LLC Misses.", "UMask": "0x2", "Unit": "QPI LL" }, @@ -1306,7 +1306,7 @@ "EventName": "UNC_Q_TxL_FLITS_G1.SNP", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits trasmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the number of snoop request flits transmitted over QPI. These requests are contained in the snoop channel. This does not include snoop responses, which are transmitted on the home channel.", + "PublicDescription": "Counts the number of flits transmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for SNP, HOM, and DRS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). 
In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the number of snoop request flits transmitted over QPI. These requests are contained in the snoop channel. This does not include snoop responses, which are transmitted on the home channel.", "UMask": "0x1", "Unit": "QPI LL" }, @@ -1317,7 +1317,7 @@ "EventName": "UNC_Q_TxL_FLITS_G2.NCB", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits trasmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. 
For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Number of Non-Coherent Bypass flits. These packets are generally used to transmit non-coherent data across QPI.", + "PublicDescription": "Counts the number of flits transmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Number of Non-Coherent Bypass flits. These packets are generally used to transmit non-coherent data across QPI.", "UMask": "0xC", "Unit": "QPI LL" }, @@ -1328,7 +1328,7 @@ "EventName": "UNC_Q_TxL_FLITS_G2.NCB_DATA", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits trasmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. 
Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Number of Non-Coherent Bypass data flits. These flits are generally used to transmit non-coherent data across QPI. This does not include a count of the DRS (coherent) data flits. This only counts the data flits, not te NCB headers.", + "PublicDescription": "Counts the number of flits transmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. 
For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Number of Non-Coherent Bypass data flits. These flits are generally used to transmit non-coherent data across QPI. This does not include a count of the DRS (coherent) data flits. This only counts the data flits, not the NCB headers.", "UMask": "0x4", "Unit": "QPI LL" }, @@ -1339,7 +1339,7 @@ "EventName": "UNC_Q_TxL_FLITS_G2.NCB_NONDATA", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits trasmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Number of Non-Coherent Bypass non-data flits. These packets are generally used to transmit non-coherent data across QPI, and the flits counted here are for headers and other non-data flits. 
This includes extended headers.", + "PublicDescription": "Counts the number of flits transmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Number of Non-Coherent Bypass non-data flits. These packets are generally used to transmit non-coherent data across QPI, and the flits counted here are for headers and other non-data flits. This includes extended headers.", "UMask": "0x8", "Unit": "QPI LL" }, @@ -1350,7 +1350,7 @@ "EventName": "UNC_Q_TxL_FLITS_G2.NCS", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits trasmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). 
In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Number of NCS (non-coherent standard) flits transmitted over QPI. This includes extended headers.", + "PublicDescription": "Counts the number of flits transmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Number of NCS (non-coherent standard) flits transmitted over QPI. 
This includes extended headers.", "UMask": "0x10", "Unit": "QPI LL" }, @@ -1361,7 +1361,7 @@ "EventName": "UNC_Q_TxL_FLITS_G2.NDR_AD", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits trasmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of flits transmitted over the NDR (Non-Data Response) channel. This channel is used to send a variety of protocol flits including grants and completions. This is only for NDR packets to the local socket which use the AK ring.", + "PublicDescription": "Counts the number of flits transmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). 
In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of flits transmitted over the NDR (Non-Data Response) channel. This channel is used to send a variety of protocol flits including grants and completions. This is only for NDR packets to the local socket which use the AK ring.", "UMask": "0x1", "Unit": "QPI LL" }, @@ -1372,7 +1372,7 @@ "EventName": "UNC_Q_TxL_FLITS_G2.NDR_AK", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Counts the number of flits trasmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. 
For example, when we are transfering a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of flits transmitted over the NDR (Non-Data Response) channel. This channel is used to send a variety of protocol flits including grants and completions. This is only for NDR packets destined for Route-thru to a remote socket.", + "PublicDescription": "Counts the number of flits transmitted across the QPI Link. This is one of three groups that allow us to track flits. It includes filters for NDR, NCB, and NCS message classes. Each flit is made up of 80 bits of information (in addition to some ECC data). In full-width (L0) mode, flits are made up of four fits, each of which contains 20 bits of data (along with some additional ECC data). In half-width (L0p) mode, the fits are only 10 bits, and therefore it takes twice as many fits to transmit a flit. When one talks about QPI speed (for example, 8.0 GT/s), the transfers here refer to fits. Therefore, in L0, the system will transfer 1 flit at the rate of 1/4th the QPI speed. One can calculate the bandwidth of the link by taking: flits*80b/time. Note that this is not the same as data bandwidth. For example, when we are transferring a 64B cacheline across QPI, we will break it into 9 flits -- 1 with header information and 8 with 64 bits of actual data and an additional 16 bits of other information. To calculate data bandwidth, one should therefore do: data flits * 8B / time.; Counts the total number of flits transmitted over the NDR (Non-Data Response) channel. This channel is used to send a variety of protocol flits including grants and completions. 
This is only for NDR packets destined for Route-thru to a remote socket.", "UMask": "0x2", "Unit": "QPI LL" }, @@ -1511,7 +1511,7 @@ "EventName": "UNC_Q_TxR_AD_SNP_CREDIT_OCCUPANCY.VN0", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Occupancy event that tracks the number of link layer credits into the R3 (for transactions across the BGF) available in each cycle. Flow Control FIFO fro Snoop messages on AD.", + "PublicDescription": "Occupancy event that tracks the number of link layer credits into the R3 (for transactions across the BGF) available in each cycle. Flow Control FIFO for Snoop messages on AD.", "UMask": "0x1", "Unit": "QPI LL" }, @@ -1522,7 +1522,7 @@ "EventName": "UNC_Q_TxR_AD_SNP_CREDIT_OCCUPANCY.VN1", "ExtSel": "1", "PerPkg": "1", - "PublicDescription": "Occupancy event that tracks the number of link layer credits into the R3 (for transactions across the BGF) available in each cycle. Flow Control FIFO fro Snoop messages on AD.", + "PublicDescription": "Occupancy event that tracks the number of link layer credits into the R3 (for transactions across the BGF) available in each cycle. Flow Control FIFO for Snoop messages on AD.", "UMask": "0x2", "Unit": "QPI LL" }, diff --git a/tools/perf/pmu-events/arch/x86/ivytown/uncore-memory.json b/tools/perf/pmu-events/arch/x86/ivytown/uncore-memory.json index 63b49b712c621..ed60ebca35cb8 100644 --- a/tools/perf/pmu-events/arch/x86/ivytown/uncore-memory.json +++ b/tools/perf/pmu-events/arch/x86/ivytown/uncore-memory.json @@ -188,7 +188,7 @@ "EventCode": "0x9", "EventName": "UNC_M_ECC_CORRECTABLE_ERRORS", "PerPkg": "1", - "PublicDescription": "Counts the number of ECC errors detected and corrected by the iMC on this channel. This counter is only useful with ECC DRAM devices. This count will increment one time for each correction regardless of the number of bits corrected. 
The iMC can correct up to 4 bit errors in independent channel mode and 8 bit erros in lockstep mode.", + "PublicDescription": "Counts the number of ECC errors detected and corrected by the iMC on this channel. This counter is only useful with ECC DRAM devices. This count will increment one time for each correction regardless of the number of bits corrected. The iMC can correct up to 4 bit errors in independent channel mode and 8 bit errors in lockstep mode.", "Unit": "iMC" }, { diff --git a/tools/perf/pmu-events/arch/x86/ivytown/uncore-other.json b/tools/perf/pmu-events/arch/x86/ivytown/uncore-other.json index af289aa6c98ea..6c7ddf642fc38 100644 --- a/tools/perf/pmu-events/arch/x86/ivytown/uncore-other.json +++ b/tools/perf/pmu-events/arch/x86/ivytown/uncore-other.json @@ -2097,7 +2097,7 @@ "EventCode": "0x33", "EventName": "UNC_R3_VNA_CREDITS_ACQUIRED", "PerPkg": "1", - "PublicDescription": "Number of QPI VNA Credit acquisitions. This event can be used in conjunction with the VNA In-Use Accumulator to calculate the average lifetime of a credit holder. VNA credits are used by all message classes in order to communicate across QPI. If a packet is unable to acquire credits, it will then attempt to use credts from the VN0 pool. Note that a single packet may require multiple flit buffers (i.e. when data is being transfered). Therefore, this event will increment by the number of credits acquired in each cycle. Filtering based on message class is not provided. One can count the number of packets transfered in a given message class using an qfclk event.", + "PublicDescription": "Number of QPI VNA Credit acquisitions. This event can be used in conjunction with the VNA In-Use Accumulator to calculate the average lifetime of a credit holder. VNA credits are used by all message classes in order to communicate across QPI. If a packet is unable to acquire credits, it will then attempt to use credits from the VN0 pool. 
Note that a single packet may require multiple flit buffers (i.e. when data is being transferred). Therefore, this event will increment by the number of credits acquired in each cycle. Filtering based on message class is not provided. One can count the number of packets transferred in a given message class using an qfclk event.", "Unit": "R3QPI" }, { @@ -2106,7 +2106,7 @@ "EventCode": "0x33", "EventName": "UNC_R3_VNA_CREDITS_ACQUIRED.AD", "PerPkg": "1", - "PublicDescription": "Number of QPI VNA Credit acquisitions. This event can be used in conjunction with the VNA In-Use Accumulator to calculate the average lifetime of a credit holder. VNA credits are used by all message classes in order to communicate across QPI. If a packet is unable to acquire credits, it will then attempt to use credts from the VN0 pool. Note that a single packet may require multiple flit buffers (i.e. when data is being transfered). Therefore, this event will increment by the number of credits acquired in each cycle. Filtering based on message class is not provided. One can count the number of packets transfered in a given message class using an qfclk event.; Filter for the Home (HOM) message class. HOM is generally used to send requests, request responses, and snoop responses.", + "PublicDescription": "Number of QPI VNA Credit acquisitions. This event can be used in conjunction with the VNA In-Use Accumulator to calculate the average lifetime of a credit holder. VNA credits are used by all message classes in order to communicate across QPI. If a packet is unable to acquire credits, it will then attempt to use credits from the VN0 pool. Note that a single packet may require multiple flit buffers (i.e. when data is being transferred). Therefore, this event will increment by the number of credits acquired in each cycle. Filtering based on message class is not provided. 
One can count the number of packets transferred in a given message class using an qfclk event.; Filter for the Home (HOM) message class. HOM is generally used to send requests, request responses, and snoop responses.", "UMask": "0x1", "Unit": "R3QPI" }, @@ -2116,7 +2116,7 @@ "EventCode": "0x33", "EventName": "UNC_R3_VNA_CREDITS_ACQUIRED.BL", "PerPkg": "1", - "PublicDescription": "Number of QPI VNA Credit acquisitions. This event can be used in conjunction with the VNA In-Use Accumulator to calculate the average lifetime of a credit holder. VNA credits are used by all message classes in order to communicate across QPI. If a packet is unable to acquire credits, it will then attempt to use credts from the VN0 pool. Note that a single packet may require multiple flit buffers (i.e. when data is being transfered). Therefore, this event will increment by the number of credits acquired in each cycle. Filtering based on message class is not provided. One can count the number of packets transfered in a given message class using an qfclk event.; Filter for the Home (HOM) message class. HOM is generally used to send requests, request responses, and snoop responses.", + "PublicDescription": "Number of QPI VNA Credit acquisitions. This event can be used in conjunction with the VNA In-Use Accumulator to calculate the average lifetime of a credit holder. VNA credits are used by all message classes in order to communicate across QPI. If a packet is unable to acquire credits, it will then attempt to use credits from the VN0 pool. Note that a single packet may require multiple flit buffers (i.e. when data is being transferred). Therefore, this event will increment by the number of credits acquired in each cycle. Filtering based on message class is not provided. One can count the number of packets transferred in a given message class using an qfclk event.; Filter for the Home (HOM) message class. 
HOM is generally used to send requests, request responses, and snoop responses.", "UMask": "0x4", "Unit": "R3QPI" }, diff --git a/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json b/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json index 0ba63a97ddfa7..74c87217d75c9 100644 --- a/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json +++ b/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json @@ -601,7 +601,7 @@ "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0", "PerPkg": "1", - "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" }, { @@ -610,7 +610,7 @@ "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3", "PerPkg": "1", - "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. 
It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" }, { @@ -619,7 +619,7 @@ "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6", "PerPkg": "1", - "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" }, { @@ -637,7 +637,7 @@ "EventCode": "0x9", "EventName": "UNC_P_PROCHOT_INTERNAL_CYCLES", "PerPkg": "1", - "PublicDescription": "Counts the number of cycles that we are in Interal PROCHOT mode. This mode is triggered when a sensor on the die determines that we are too hot and must throttle to avoid damaging the chip.", + "PublicDescription": "Counts the number of cycles that we are in Internal PROCHOT mode. 
This mode is triggered when a sensor on the die determines that we are too hot and must throttle to avoid damaging the chip.", "Unit": "PCU" }, { diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 84535179d1287..81bd6f5d53540 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -13,7 +13,7 @@ GenuineIntel-6-3F,v26,haswellx,core GenuineIntel-6-(7D|7E|A7),v1.15,icelake,core GenuineIntel-6-6[AC],v1.16,icelakex,core GenuineIntel-6-3A,v22,ivybridge,core -GenuineIntel-6-3E,v21,ivytown,core +GenuineIntel-6-3E,v22,ivytown,core GenuineIntel-6-2D,v21,jaketown,core GenuineIntel-6-(57|85),v9,knightslanding,core GenuineIntel-6-AA,v1.00,meteorlake,core -- GitLab From 3405de19abf54ef891280564e381a8f277f5fc76 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:16:06 -0700 Subject: [PATCH 1426/2223] perf vendor events: Update Intel jaketown Events remain at v21, and the metrics are based on TMA 4.4 full. Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. - _SMT suffix metrics are dropped as the #SMT_On and #EBS_Mode are correctly expanded in the single main metric. - Addition of all 6 levels of TMA metrics. Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. 
Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-18-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../arch/x86/jaketown/jkt-metrics.json | 327 +++++++++++++----- 1 file changed, 246 insertions(+), 81 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json index c0fbb4f31241b..554f87c03c05f 100644 --- a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json +++ b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json @@ -1,64 +1,247 @@ [ { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Frontend_Bound", - 
"PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound." + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Frontend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / SLOTS", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. 
Sample with: RS_EVENTS.EMPTY_END", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. 
The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "ILD_STALL.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "3 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. 
Sample with: IDQ.MS_SWITCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "tma_frontend_bound - tma_fetch_latency", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Bad_Speculation", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example." + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. 
This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Bad_Speculation_SMT", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. 
These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) )", - "MetricGroup": "TopdownL1", - "MetricName": "Backend_Bound", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. 
For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound." + "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_L1D_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_DISPATCH) + cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=1@ - cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=3@ if (IPC > 1.8) else cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=2@ - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB)) * tma_backend_bound", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. 
This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "(7 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_UOPS_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. 
Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "(1 - (MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS))) * CYCLE_ACTIVITY.STALLS_L2_PENDING / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. 
This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "RESOURCE_STALLS.SB / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. 
Sample with: MEM_UOPS_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck", + "MetricExpr": "tma_backend_bound - tma_memory_bound", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "ARITH.FPU_DIV_ACTIVE / CORE_CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_UOPS", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) )", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Backend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_DISPATCH) + cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=1@ - cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=3@ if (IPC > 1.8) else cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=2@ - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB) - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_L1D_PENDING)) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Retiring", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. 
For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. " + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Retiring_SMT", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. 
They often indicate suboptimal performance and can often be optimized or avoided. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "tma_retiring - tma_heavy_operations", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", + "MetricGroup": "HPC;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fp_arith", + "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). 
Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS * FP_COMP_OPS_EXE.X87 / UOPS_DISPATCHED.THREAD", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE) / UOPS_DISPATCHED.THREAD", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_scalar", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) / UOPS_DISPATCHED.THREAD", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_vector", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "tma_microcode_sequencer", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. 
Sample with: IDQ.MS_UOPS", + "ScaleUnit": "100%" }, { "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC" }, @@ -70,8 +253,8 @@ }, { "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / (INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "Pipeline;Mem", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", "MetricName": "CPI" }, { @@ -82,16 +265,10 @@ }, { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "TmaL1", + "MetricExpr": "4 * CORE_CLKS", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS" }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "TmaL1_SMT", - "MetricName": "SLOTS_SMT" - }, { "BriefDescription": "The ratio of Executed- by Issued-Uops", "MetricExpr": "UOPS_DISPATCHED.THREAD / UOPS_ISSUED.ANY", @@ -101,44 +278,32 @@ }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC" }, - { - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;SMT;TmaL1_SMT", - "MetricName": "CoreIPC_SMT" - }, { "BriefDescription": "Floating Point Operations Per Cycle", 
- "MetricExpr": "( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE ) / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;Flops", + "MetricExpr": "(1 * (FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / CORE_CLKS", + "MetricGroup": "Flops;Ret", "MetricName": "FLOPc" }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;Flops_SMT", - "MetricName": "FLOPc_SMT" - }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_DISPATCHED.THREAD / (( cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)", + "MetricExpr": "UOPS_DISPATCHED.THREAD / ((cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", + "MetricExpr": "((CPU_CLK_UNHALTED.THREAD / 2) * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK)) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2) if #SMT_on 
else CLKS", "MetricGroup": "SMT", "MetricName": "CORE_CLKS" }, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions" }, { @@ -149,7 +314,7 @@ }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", - "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )", + "MetricExpr": "IDQ.DSB_UOPS / ((IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS))", "MetricGroup": "DSB;Fed;FetchBW", "MetricName": "DSB_Coverage" }, @@ -161,26 +326,26 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( ( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE ) / 1000000000 ) / duration_time", + "MetricExpr": "((1 * (FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1000000000) / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "GFLOPs", "PublicDescription": "Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", - "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0", "MetricGroup": "SMT", "MetricName": "SMT_2T_Utilization" }, @@ -198,7 +363,7 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "( 64 * ( uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@ ) / 1000000000 ) / duration_time", + "MetricExpr": "(64 * (uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@) / 1000000000) / duration_time", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use" }, @@ -208,12 +373,6 @@ "MetricGroup": "SoC", "MetricName": "Socket_CLKS" }, - { - "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "cbox_0@event\\=0x0@ / #num_dies / duration_time / 1000000000", - "MetricGroup": "SoC", - "MetricName": "UNCORE_FREQ" - }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", @@ -261,5 +420,11 @@ "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency" + }, + { + "BriefDescription": "Uncore frequency per die [GHZ]", + "MetricExpr": "Socket_CLKS / #num_dies / duration_time / 
1000000000", + "MetricGroup": "SoC", + "MetricName": "UNCORE_FREQ" } ] -- GitLab From db35c1dc0b5567dcaef68809b789ee25bf088647 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:16:07 -0700 Subject: [PATCH 1427/2223] perf vendor events: Update Intel sandybridge Events remain at v17, and the metrics are based on TMA 4.4 full. Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. - _SMT suffix metrics are dropped as the #SMT_On and #EBS_Mode are correctly expanded in the single main metric. - Addition of all 6 levels of TMA metrics. Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. 
Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-19-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../arch/x86/sandybridge/snb-metrics.json | 315 +++++++++++++----- 1 file changed, 240 insertions(+), 75 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json index ae7ed267b2a22..5d5a6d6f3bdab 100644 --- a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json @@ -1,64 +1,247 @@ [ { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": 
"Frontend_Bound", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound." + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Frontend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / SLOTS", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. 
Sample with: RS_EVENTS.EMPTY_END", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. 
The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "ILD_STALL.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "3 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. 
Sample with: IDQ.MS_SWITCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "tma_frontend_bound - tma_fetch_latency", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Bad_Speculation", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example." + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. 
This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Bad_Speculation_SMT", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. 
For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) )", - "MetricGroup": "TopdownL1", - "MetricName": "Backend_Bound", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. 
For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound." + "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) )", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Backend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_L1D_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_DISPATCH) + cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=1@ - cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=3@ if (IPC > 1.8) else cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=2@ - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB)) * tma_backend_bound", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "(7 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. 
This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_UOPS_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "(1 - (MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS))) * CYCLE_ACTIVITY.STALLS_L2_PENDING / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. 
Sample with: MEM_LOAD_UOPS_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). 
This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO stores issue a read-for-ownership request before the write", + "MetricExpr": "RESOURCE_STALLS.SB / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO stores issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_UOPS_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were a bottleneck", + "MetricExpr": "tma_backend_bound - tma_memory_bound", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g.
FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "ARITH.FPU_DIV_ACTIVE / CORE_CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_DISPATCH) + cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=1@ - cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=3@ if (IPC > 1.8) else cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=2@ - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB) - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_L1D_PENDING)) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. 
issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Retiring", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. " + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessarily mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided.
Sample with: UOPS_RETIRED.RETIRE_SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "tma_retiring - tma_heavy_operations", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", + "MetricGroup": "HPC;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fp_arith", + "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS * FP_COMP_OPS_EXE.X87 / UOPS_DISPATCHED.THREAD", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. 
It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE) / UOPS_DISPATCHED.THREAD", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_scalar", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Retiring_SMT", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. SMT version; use when SMT is enabled and measuring per logical CPU." 
+ "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) / UOPS_DISPATCHED.THREAD", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_vector", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "tma_microcode_sequencer", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). 
These cases can often be avoided. Sample with: IDQ.MS_UOPS", + "ScaleUnit": "100%" }, { "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC" }, @@ -70,8 +253,8 @@ }, { "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / (INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "Pipeline;Mem", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", "MetricName": "CPI" }, { @@ -82,16 +265,10 @@ }, { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "TmaL1", + "MetricExpr": "4 * CORE_CLKS", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS" }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "TmaL1_SMT", - "MetricName": "SLOTS_SMT" - }, { "BriefDescription": "The ratio of Executed- by Issued-Uops", "MetricExpr": "UOPS_DISPATCHED.THREAD / UOPS_ISSUED.ANY", @@ -101,44 +278,32 @@ }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC" }, - { - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;SMT;TmaL1_SMT", - "MetricName": "CoreIPC_SMT" - }, { "BriefDescription": 
"Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE ) / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;Flops", + "MetricExpr": "(1 * (FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / CORE_CLKS", + "MetricGroup": "Flops;Ret", "MetricName": "FLOPc" }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;Flops_SMT", - "MetricName": "FLOPc_SMT" - }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_DISPATCHED.THREAD / (( cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)", + "MetricExpr": "UOPS_DISPATCHED.THREAD / ((cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", + "MetricExpr": "((CPU_CLK_UNHALTED.THREAD / 2) * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK)) if #core_wide < 1 else 
(CPU_CLK_UNHALTED.THREAD_ANY / 2) if #SMT_on else CLKS", "MetricGroup": "SMT", "MetricName": "CORE_CLKS" }, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions" }, { @@ -149,7 +314,7 @@ }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", - "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )", + "MetricExpr": "IDQ.DSB_UOPS / ((IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS))", "MetricGroup": "DSB;Fed;FetchBW", "MetricName": "DSB_Coverage" }, @@ -161,26 +326,26 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( ( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE ) / 1000000000 ) / duration_time", + "MetricExpr": "((1 * (FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1000000000) / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "GFLOPs", "PublicDescription": "Giga Floating Point Operations Per Second.
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", - "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0", "MetricGroup": "SMT", "MetricName": "SMT_2T_Utilization" }, @@ -198,7 +363,7 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * ( arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@ ) / 1000000 / duration_time / 1000", + "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1000000 / duration_time / 1000", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use" }, -- GitLab From 9a1b4aa4c9b25a40b17c29f9e198f67e185bfe44 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:16:08 -0700 Subject: [PATCH 1428/2223] perf vendor events: Update Intel sapphirerapids Events are updated to v1.06 the core metrics are based on TMA 4.4 full. Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. - Addition of all 6 levels of TMA metrics. Previously metrics involving topdown events were dropped. 
Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. - Latest metrics from: https://github.com/intel/perfmon-metrics Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-20-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- .../arch/x86/sapphirerapids/cache.json | 4 +- .../arch/x86/sapphirerapids/frontend.json | 11 + .../arch/x86/sapphirerapids/pipeline.json | 4 +- .../arch/x86/sapphirerapids/spr-metrics.json | 1249 
++++++++++++----- 5 files changed, 917 insertions(+), 353 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 81bd6f5d53540..c2354e368586c 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -20,7 +20,7 @@ GenuineIntel-6-AA,v1.00,meteorlake,core GenuineIntel-6-1[AEF],v3,nehalemep,core GenuineIntel-6-2E,v3,nehalemex,core GenuineIntel-6-2A,v17,sandybridge,core -GenuineIntel-6-8F,v1.04,sapphirerapids,core +GenuineIntel-6-8F,v1.06,sapphirerapids,core GenuineIntel-6-(37|4C|4D),v14,silvermont,core GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v53,skylake,core GenuineIntel-6-55-[01234],v1.28,skylakex,core diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json index 348476ce8107f..c05c741e22db1 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json @@ -35,7 +35,7 @@ "UMask": "0x2" }, { - "BriefDescription": "Number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailablability.", + "BriefDescription": "Number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability.", "CollectPEBSRecord": "2", "Counter": "0,1,2,3", "CounterMask": "1", @@ -43,7 +43,7 @@ "EventCode": "0x48", "EventName": "L1D_PEND_MISS.FB_FULL_PERIODS", "PEBScounters": "0,1,2,3", - "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailablability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", + "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability. 
Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "Speculative": "1", "UMask": "0x2" diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json index 44ecf38ad970e..ff0d47ce8e9a1 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json @@ -11,6 +11,17 @@ "Speculative": "1", "UMask": "0x1" }, + { + "BriefDescription": "Cycles the Microcode Sequencer is busy.", + "CollectPEBSRecord": "2", + "Counter": "0,1,2,3", + "EventCode": "0x87", + "EventName": "DECODE.MS_BUSY", + "PEBScounters": "0,1,2,3", + "SampleAfterValue": "500009", + "Speculative": "1", + "UMask": "0x2" + }, { "BriefDescription": "DSB-to-MITE switch true penalty cycles.", "CollectPEBSRecord": "2", diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json index df4f3d714e6e0..b2f0d9393d3ca 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json @@ -80,10 +80,10 @@ "EventCode": "0xc1", "EventName": "ASSISTS.ANY", "PEBScounters": "0,1,2,3,4,5,6,7", - "PublicDescription": "Counts the number of occurrences where a microcode assist is invoked by hardware Examples include AD (page Access Dirty), FP and AVX related assists.", + "PublicDescription": "Counts the number of occurrences where a microcode assist is invoked by hardware. 
Examples include AD (page Access Dirty), FP and AVX related assists.", "SampleAfterValue": "100003", "Speculative": "1", - "UMask": "0x1f" + "UMask": "0x1b" }, { "BriefDescription": "All branch instructions retired.", diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json index e194dfc5c25b5..9ec42a68c160a 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json @@ -1,16 +1,818 @@ [ + { + "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", + "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. 
Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "(topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / SLOTS)", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", + "MetricExpr": "ICACHE_DATA.STALLS / CLKS", + "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "ICACHE_TAG.STALLS / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. 
Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / CLKS + tma_unknown_branches", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", + "MetricExpr": "(tma_branch_mispredicts / tma_bad_speculation) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. 
Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", + "MetricExpr": "(1 - (tma_branch_mispredicts / tma_bad_speculation)) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS", + "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_clears_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", + "MetricExpr": "INT_MISC.UNKNOWN_BRANCH_CYCLES / CLKS", + "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_unknown_branches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). 
Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "DECODE.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "3 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. 
Sample with: FRONTEND_RETIRED.MS_FLOWS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / CORE_CLKS / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. 
Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / CORE_CLKS", + "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_mite_group", + "MetricName": "tma_decoder0_alone", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", + "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / CORE_CLKS / 2", + "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_dsb", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", + "MetricExpr": "max(1 - (tma_frontend_bound + tma_backend_bound + tma_retiring), 0)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This includes slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to mispredicted branches is categorized under the Bad Speculation category. 
Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "max(0, tma_bad_speculation - tma_branch_mispredicts)", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. 
Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. 
This accounts mainly for (1) non-completed in-flight memory demand loads which coincide with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((EXE_ACTIVITY.BOUND_ON_LOADS - MEMORY_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads that miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - MEMORY_ACTIVITY.CYCLES_L1D_MISS, 0)) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. 
TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the (first level) DTLB was missed by load accesses, that later on hit in second-level TLB (STLB)", + "MetricExpr": "tma_dtlb_load - tma_load_stlb_miss", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_hit", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / CLKS", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_miss", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). 
However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / CLKS", + "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. 
Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "L1D_PEND_MISS.FB_FULL / CLKS", + "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). It often hints at approaching bandwidth limits (to L2 cache; L3 cache or external memory).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L1D_MISS - MEMORY_ACTIVITY.STALLS_L2_MISS) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to load accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L2_MISS - MEMORY_ACTIVITY.STALLS_L3_MISS) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to load accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. 
L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", + "MetricExpr": "((25 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + (24 * Average_Frequency) * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_contested_accesses", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "(24 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD)))) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "(9 * Average_Frequency) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. 
Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "(XQ.FULL_CYCLES + L1D_PEND_MISS.L2_STALLS) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "((MEMORY_ACTIVITY.STALLS_L3_MISS / CLKS) - tma_pmm_bound)", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. 
This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to memory bandwidth Allocation feature (RDT's memory bandwidth throttling).", + "MetricExpr": "INT_MISC.MBA_STALLS / CLKS", + "MetricGroup": "MemoryBW;Offcore;Server;TopdownL5;tma_mem_bandwidth_group", + "MetricName": "tma_mba_stalls", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", + "MetricExpr": "(54.5 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Server;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_local_dram", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. 
Sample with: MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", + "MetricExpr": "(119 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Server;Snoop;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_remote_dram", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", + "MetricExpr": "((108 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + (108 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_mem_latency_group", + "MetricName": "tma_remote_cache", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues. This is caused often due to non-optimal NUMA allocations. 
#link to NUMA article Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM_PS;MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a", + "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + 10 * ((MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))) / ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + 10 * ((MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))))) * (MEMORY_ACTIVITY.STALLS_L3_MISS / CLKS)) if (1000000 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS) else 0)", + "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_pmm_bound", + "PublicDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a. IXP) memory by loads, PMM stands for Persistent Memory Module. 
", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_INST_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "((MEM_STORE_RETIRED.L2_HIT * 10 * (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES))) + (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_store_latency", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. 
contention on L1D fill-buffer entries - see FB_Full)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "(28 * Average_Frequency) * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / CORE_CLKS", + "MetricGroup": "TopdownL4;tma_store_bound_group", + "MetricName": "tma_split_stores", + "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores", + "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_streaming_stores", + "PublicDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should Streaming stores be a bottleneck. 
Sample with: OCR.STREAMING_WR.ANY_RESPONSE", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / CORE_CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the TLB was missed by store accesses, hitting in the second-level TLB (STLB)", + "MetricExpr": "tma_dtlb_store - tma_store_stlb_miss", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_hit", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / CORE_CLKS", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_miss", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck", + "MetricExpr": "max(0, tma_backend_bound - tma_memory_bound)", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": "This metric represents fraction of slots where Core non-memory 
issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "ARITH.DIVIDER_ACTIVE / CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "(cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / CLKS if (ARITH.DIVIDER_ACTIVE < (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS)) else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). 
Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / CLKS + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", + "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / CLKS", + "MetricGroup": "TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_serializing_operation", + "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. 
Sample with: RESOURCE_STALLS.SCOREBOARD", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", + "MetricExpr": "CPU_CLK_UNHALTED.PAUSE / CLKS", + "MetricGroup": "TopdownL6;tma_serializing_operation_group", + "MetricName": "tma_slow_pause", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: CPU_CLK_UNHALTED.PAUSE_INST", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", + "MetricExpr": "13 * MISC2_RETIRED.LFENCE / CLKS", + "MetricGroup": "TopdownL6;tma_serializing_operation_group", + "MetricName": "tma_memory_fence", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "MetricExpr": "160 * ASSISTS.SSE_AVX_MIX / CLKS", + "MetricGroup": "TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_mixing_vectors", + "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. 
Read more in Appendix B1 of the Optimizations Guide for this topic.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the Advanced Matrix Extensions (AMX) execution engine was busy with tile (arithmetic) operations", + "MetricExpr": "EXE.AMX_BUSY / CORE_CLKS", + "MetricGroup": "Compute;HPC;Server;TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_amx_busy", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful. 
Sample with: EXE_ACTIVITY.1_PORTS_UTIL", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Sample with: EXE_ACTIVITY.2_PORTS_UTIL", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). 
Sample with: UOPS_EXECUTED.CYCLES_GE_3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5_11 + UOPS_DISPATCHED.PORT_6) / (5 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED.PORT_0", + "MetricExpr": "UOPS_DISPATCHED.PORT_0 / CORE_CLKS", + "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_0", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED.PORT_1", + "MetricExpr": "UOPS_DISPATCHED.PORT_1 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_1", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED.PORT_6", + "MetricExpr": "UOPS_DISPATCHED.PORT_6 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_6", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3_10", + "MetricExpr": "UOPS_DISPATCHED.PORT_2_3_10 / (3 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on 
execution port for Store operations Sample with: UOPS_DISPATCHED.PORT_7_8", + "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessarily mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "max(0, tma_retiring - tma_heavy_operations)", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. 
A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector + tma_fp_amx", + "MetricGroup": "HPC;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fp_arith", + "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "tma_retiring * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + FP_ARITH_INST_RETIRED2.SCALAR) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_scalar", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.VECTOR) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_vector", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.128B_PACKED_HALF) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_128b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.256B_PACKED_HALF) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_256b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.512B_PACKED_HALF) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_512b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) matrix uops fraction the CPU has retired (aggregated across all supported FP datatypes in AMX engine)", + "MetricExpr": "cpu@AMX_OPS_RETIRED.BF16\\,cmask\\=1@ / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;HPC;Pipeline;Server;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_amx", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) matrix uops fraction the CPU has retired (aggregated across all supported FP datatypes in AMX engine). 
Refer to AMX_Busy and GFLOPs metrics for actual AMX utilization and FP performance, resp.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents overall Integer (Int) select operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_int_vector_128b + tma_int_vector_256b + tma_shuffles", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_int_operations", + "PublicDescription": "This metric represents overall Integer (Int) select operations fraction the CPU has executed (retired). Vector/Matrix Int operations and shuffles are counted. Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents 128-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired.", + "MetricExpr": "(INT_VEC_RETIRED.ADD_128 + INT_VEC_RETIRED.VNNI_128) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_int_operations_group", + "MetricName": "tma_int_vector_128b", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired.", + "MetricExpr": "(INT_VEC_RETIRED.ADD_256 + INT_VEC_RETIRED.MUL_256 + INT_VEC_RETIRED.VNNI_256) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_int_operations_group", + "MetricName": "tma_int_vector_256b", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic Integer (Int) matrix uops fraction the CPU has retired (aggregated across all supported Int datatypes in AMX engine)", + "MetricExpr": "cpu@AMX_OPS_RETIRED.INT8\\,cmask\\=1@ / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;HPC;IntVector;Pipeline;Server;TopdownL4;tma_int_operations_group", + "MetricName": "tma_int_amx", + "PublicDescription": "This metric 
approximates arithmetic Integer (Int) matrix uops fraction the CPU has retired (aggregated across all supported Int datatypes in AMX engine). Refer to AMX_Busy and TIOPs metrics for actual AMX utilization and Int performance, resp.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Shuffle (cross \"vector lane\" data transfers) uops fraction the CPU has retired.", + "MetricExpr": "INT_VEC_RETIRED.SHUFFLES / (tma_retiring * SLOTS)", + "MetricGroup": "HPC;Pipeline;TopdownL4;tma_int_operations_group", + "MetricName": "tma_shuffles", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", + "MetricExpr": "tma_light_operations * MEM_UOP_RETIRED.ANY / (tma_retiring * SLOTS)", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_memory_operations", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", + "MetricExpr": "tma_light_operations * INST_RETIRED.MACRO_FUSED / (tma_retiring * SLOTS)", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fused_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. 
The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", + "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - INST_RETIRED.MACRO_FUSED) / (tma_retiring * SLOTS)", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_non_fused_branches", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", + "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * SLOTS)", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_nop_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. 
May undercount due to FMA double counting", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_other_light_ops", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences. Sample with: UOPS_RETIRED.HEAVY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops", + "MetricExpr": "tma_heavy_operations - tma_microcode_sequencer", + "MetricGroup": "TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_few_uops_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops.
This highly-correlates with the number of uops in such instructions.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "UOPS_RETIRED.MS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: UOPS_RETIRED.MS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "100 * cpu@ASSISTS.ANY\\,umask\\=0x1B@ / SLOTS", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. 
Sample with: ASSISTS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handling Page Faults", + "MetricExpr": "99 * ASSISTS.PAGE_FAULT / SLOTS", + "MetricGroup": "TopdownL5;tma_assists_group", + "MetricName": "tma_page_faults", + "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handling Page Faults. A Page Fault may apply on first application access to a memory page. Note operating system handling of page faults accounts for the majority of its cost.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handling Floating Point (FP) Assists", + "MetricExpr": "30 * ASSISTS.FP / SLOTS", + "MetricGroup": "HPC;TopdownL5;tma_assists_group", + "MetricName": "tma_fp_assists", + "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handling Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called denormals).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops as a result of handling SSE to AVX* or AVX* to SSE transition Assists. ", + "MetricExpr": "63 * ASSISTS.SSE_AVX_MIX / SLOTS", + "MetricGroup": "HPC;TopdownL5;tma_assists_group", + "MetricName": "tma_avx_assists", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_cisc", + "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction.
A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources. Sample with: FRONTEND_RETIRED.MS_FLOWS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "Mispredictions" + }, + { + "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", + "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) ", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "Memory_Bandwidth" + }, + { + "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + 
tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + (tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)))", + "MetricGroup": "Mem;MemoryLat;Offcore", + "MetricName": "Memory_Latency" + }, + { + "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricExpr": "100 * tma_memory_bound * ((tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores))) ", + "MetricGroup": "Mem;MemoryTLB;Offcore", + "MetricName": "Memory_Data_TLBs" + }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * (( BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) ) / TOPDOWN.SLOTS)", + "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / SLOTS)", "MetricGroup": "Ret", "MetricName": "Branching_Overhead" }, + { + "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", + "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + 
tma_ms_switches)", + "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB", + "MetricName": "Big_Code" + }, + { + "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - Big_Code", + "MetricGroup": "Fed;FetchBW;Frontend", + "MetricName": "Instruction_Fetch_BW" + }, { "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC" }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "(tma_retiring * SLOTS) / INST_RETIRED.ANY", + "MetricGroup": "Pipeline;Ret;Retire", + "MetricName": "UPI" + }, + { + "BriefDescription": "Uops per taken branch", + "MetricExpr": "(tma_retiring * SLOTS) / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;FetchBW", + "MetricName": "UpTB" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", + "MetricName": "CPI" + }, { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "CPU_CLK_UNHALTED.THREAD", @@ -20,13 +822,13 @@ { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", "MetricExpr": "TOPDOWN.SLOTS", - "MetricGroup": "TmaL1", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS" }, { "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", - "MetricExpr": "TOPDOWN.SLOTS / ( TOPDOWN.SLOTS / 2 ) if #SMT_on else 1", - "MetricGroup": "SMT;TmaL1", + "MetricExpr": "SLOTS / (TOPDOWN.SLOTS / 2) if #SMT_on else 1", + "MetricGroup": "SMT;tma_L1_group", "MetricName":
"Slots_Utilization" }, { @@ -38,29 +840,35 @@ }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.DISTRIBUTED", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC" }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + FP_ARITH_INST_RETIRED2.SCALAR_HALF ) + 2 * ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF ) + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED2.128B_PACKED_HALF + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * ( FP_ARITH_INST_RETIRED2.256B_PACKED_HALF + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) + 32 * FP_ARITH_INST_RETIRED2.512B_PACKED_HALF + 4 * AMX_OPS_RETIRED.BF16 ) / CPU_CLK_UNHALTED.DISTRIBUTED", - "MetricGroup": "Ret;Flops", + "MetricExpr": "(1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + FP_ARITH_INST_RETIRED2.SCALAR_HALF) + 2 * (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF) + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED2.128B_PACKED_HALF + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * (FP_ARITH_INST_RETIRED2.256B_PACKED_HALF + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) + 32 * FP_ARITH_INST_RETIRED2.512B_PACKED_HALF + 4 * AMX_OPS_RETIRED.BF16) / CORE_CLKS", + "MetricGroup": "Flops;Ret", "MetricName": "FLOPc" }, { "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricExpr": "( FP_ARITH_DISPATCHED.PORT_0 + 
FP_ARITH_DISPATCHED.PORT_1 + FP_ARITH_DISPATCHED.PORT_5 ) / ( 2 * CPU_CLK_UNHALTED.DISTRIBUTED )", + "MetricExpr": "(FP_ARITH_DISPATCHED.PORT_0 + FP_ARITH_DISPATCHED.PORT_1 + FP_ARITH_DISPATCHED.PORT_5) / (2 * CORE_CLKS)", "MetricGroup": "Cor;Flops;HPC", "MetricName": "FP_Arith_Utilization", "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 ) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ((UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricExpr": "(1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if SMT_2T_Utilization > 0.5 else 0", + "MetricGroup": "Cor;SMT", + "MetricName": "Core_Bound_Likely" + }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", @@ -105,13 +913,13 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + FP_ARITH_INST_RETIRED2.SCALAR_HALF ) + 2 * ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF ) + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 
8 * ( FP_ARITH_INST_RETIRED2.128B_PACKED_HALF + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * ( FP_ARITH_INST_RETIRED2.256B_PACKED_HALF + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) + 32 * FP_ARITH_INST_RETIRED2.512B_PACKED_HALF + 4 * AMX_OPS_RETIRED.BF16 )", + "MetricExpr": "INST_RETIRED.ANY / (1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + FP_ARITH_INST_RETIRED2.SCALAR_HALF) + 2 * (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF) + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED2.128B_PACKED_HALF + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * (FP_ARITH_INST_RETIRED2.256B_PACKED_HALF + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) + 32 * FP_ARITH_INST_RETIRED2.512B_PACKED_HALF + 4 * AMX_OPS_RETIRED.BF16)", "MetricGroup": "Flops;InsType", "MetricName": "IpFLOP" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + FP_ARITH_INST_RETIRED2.SCALAR) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.VECTOR) )", + "MetricExpr": "INST_RETIRED.ANY / ((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + FP_ARITH_INST_RETIRED2.SCALAR) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + 
FP_ARITH_INST_RETIRED2.VECTOR))", "MetricGroup": "Flops;InsType", "MetricName": "IpArith", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." @@ -132,21 +940,21 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.128B_PACKED_HALF )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.128B_PACKED_HALF)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX128", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.256B_PACKED_HALF )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.256B_PACKED_HALF)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX256", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
}, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.512B_PACKED_HALF )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.512B_PACKED_HALF)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX512", "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." @@ -161,7 +969,7 @@ { "BriefDescription": "Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / AMX_OPS_RETIRED.INT8", - "MetricGroup": "IntVector;InsType;Server", + "MetricGroup": "InsType;IntVector;Server", "MetricName": "IpArith_AMX_Int8", "PublicDescription": "Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions." 
}, @@ -172,11 +980,17 @@ "MetricName": "IpSWPF" }, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions" }, + { + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricExpr": "(tma_retiring * SLOTS) / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "Retire" + }, { "BriefDescription": "Estimated fraction of retirement-cycles dealing with repeat instructions", "MetricExpr": "INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", @@ -213,6 +1027,12 @@ "MetricGroup": "DSBmiss", "MetricName": "DSB_Switch_Cost" }, + { + "BriefDescription": "Total penalty related to DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", + "MetricGroup": "DSBmiss;Fed", + "MetricName": "DSB_Misses" + }, { "BriefDescription": "Number of Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", @@ -225,6 +1045,12 @@ "MetricGroup": "Bad;BadSpec;BrMispredicts", "MetricName": "IpMispredict" }, + { + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricExpr": " (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES", +
"MetricGroup": "Bad;BrMispredicts", + "MetricName": "Branch_Misprediction_Cost" + }, { "BriefDescription": "Fraction of branches that are non-taken conditionals", "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", @@ -239,7 +1065,7 @@ }, { "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "( BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN ) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", "MetricName": "CallRet" }, @@ -251,7 +1077,7 @@ }, { "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", - "MetricExpr": "1 - ( (BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES) + (BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES) + (( BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN ) / BR_INST_RETIRED.ALL_BRANCHES) + ((BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES) )", + "MetricExpr": "1 - (Cond_NT + Cond_TK + CallRet + Jump)", "MetricGroup": "Bad;Branches", "MetricName": "Other_Branches" }, @@ -264,67 +1090,67 @@ { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. 
Per-Logical Processor)", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBound;MemoryBW", + "MetricGroup": "Mem;MemoryBW;MemoryBound", "MetricName": "MLP" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI_Load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;Backend;CacheMisses", + "MetricGroup": "Backend;CacheMisses;Mem", "MetricName": "L2MPKI" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses;Offcore", + "MetricGroup": "CacheMisses;Mem;Offcore", "MetricName": "L2MPKI_All" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2MPKI_Load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", - "MetricExpr": "1000 * ( L2_RQSTS.REFERENCES - L2_RQSTS.MISS ) / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricExpr": "1000 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_All" }, { 
"BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_Load" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L3MPKI" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1000 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "FB_HPKI" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING ) / ( 4 * CPU_CLK_UNHALTED.DISTRIBUTED )", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (4 * CORE_CLKS)", "MetricGroup": "Mem;MemoryTLB", "MetricName": "Page_Walks_Utilization" }, @@ -354,37 +1180,37 @@ }, { "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", - "MetricExpr": "1000 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY", + "MetricExpr": "1000 * L2_LINES_OUT.SILENT / Instructions", "MetricGroup": "L2Evicts;Mem;Server", "MetricName": "L2_Evictions_Silent_PKI" }, { "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", - "MetricExpr": "1000 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY", + "MetricExpr": "1000 * L2_LINES_OUT.NON_SILENT / Instructions", 
"MetricGroup": "L2Evicts;Mem;Server", "MetricName": "L2_Evictions_NonSilent_PKI" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricExpr": "L1D_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L1D_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricExpr": "L2_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L2_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L3_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Access_BW", "MetricGroup": "Mem;MemoryBW;Offcore", "MetricName": "L3_Cache_Access_BW_1T" }, @@ -396,26 +1222,26 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + FP_ARITH_INST_RETIRED2.SCALAR_HALF ) + 2 * ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF ) + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 
8 * ( FP_ARITH_INST_RETIRED2.128B_PACKED_HALF + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * ( FP_ARITH_INST_RETIRED2.256B_PACKED_HALF + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) + 32 * FP_ARITH_INST_RETIRED2.512B_PACKED_HALF + 4 * AMX_OPS_RETIRED.BF16 ) / 1000000000 ) / duration_time", + "MetricExpr": "((1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + FP_ARITH_INST_RETIRED2.SCALAR_HALF) + 2 * (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF) + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED2.128B_PACKED_HALF + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * (FP_ARITH_INST_RETIRED2.256B_PACKED_HALF + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) + 32 * FP_ARITH_INST_RETIRED2.512B_PACKED_HALF + 4 * AMX_OPS_RETIRED.BF16) / 1000000000) / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "GFLOPs", "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." 
}, { "BriefDescription": "Tera Integer (matrix) Operations Per Second", - "MetricExpr": "( 8 * AMX_OPS_RETIRED.INT8 / 1000000000000 ) / duration_time", + "MetricExpr": "(8 * AMX_OPS_RETIRED.INT8 / 1e12) / duration_time", "MetricGroup": "Cor;HPC;IntVector;Server", "MetricName": "TIOPS" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization" }, @@ -439,13 +1265,13 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "( 64 * ( uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@ ) / 1000000000 ) / duration_time", + "MetricExpr": "(64 * (uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@) / 1000000000) / duration_time", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use" }, { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches", - "MetricExpr": "1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD ) / ( uncore_cha_0@event\\=0x1@ / duration_time )", + "MetricExpr": "1000000000 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (Socket_CLKS / duration_time)", "MetricGroup": "Mem;MemoryLat;SoC", "MetricName": "MEM_Read_Latency" }, @@ -457,32 +1283,32 @@ }, { "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. 
Accounts for demand loads and L1/L2 data-read prefetches", - "MetricExpr": "( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM ) / uncore_cha_0@event\\=0x1@ )", - "MetricGroup": "Mem;MemoryLat;SoC;Server", + "MetricExpr": "(1000000000 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / uncore_cha_0@event\\=0x1@)", + "MetricGroup": "Mem;MemoryLat;Server;SoC", "MetricName": "MEM_PMM_Read_Latency" }, { "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches", - "MetricExpr": " 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR ) / uncore_cha_0@event\\=0x1@", - "MetricGroup": "Mem;MemoryLat;SoC;Server", + "MetricExpr": " 1000000000 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR) / uncore_cha_0@event\\=0x1@", + "MetricGroup": "Mem;MemoryLat;Server;SoC", "MetricName": "MEM_DRAM_Read_Latency" }, { "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", - "MetricExpr": "( ( 64 * UNC_M_PMM_RPQ_INSERTS / 1000000000 ) / duration_time )", - "MetricGroup": "Mem;MemoryBW;SoC;Server", + "MetricExpr": "((64 * UNC_M_PMM_RPQ_INSERTS / 1000000000) / duration_time)", + "MetricGroup": "Mem;MemoryBW;Server;SoC", "MetricName": "PMM_Read_BW" }, { "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", - "MetricExpr": "( ( 64 * UNC_M_PMM_WPQ_INSERTS / 1000000000 ) / duration_time )", - "MetricGroup": "Mem;MemoryBW;SoC;Server", + "MetricExpr": "((64 * UNC_M_PMM_WPQ_INSERTS / 1000000000) / duration_time)", + "MetricGroup": "Mem;MemoryBW;Server;SoC", "MetricName": "PMM_Write_BW" }, { "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1000000000 / duration_time", - "MetricGroup": "IoBW;Mem;SoC;Server", + "MetricGroup": 
"IoBW;Mem;Server;SoC", "MetricName": "IO_Write_BW" }, { @@ -491,12 +1317,6 @@ "MetricGroup": "SoC", "MetricName": "Socket_CLKS" }, - { - "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "uncore_cha_0@event\\=0x1@ / #num_dies / duration_time / 1000000000", - "MetricGroup": "SoC", - "MetricName": "UNCORE_FREQ" - }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", @@ -528,11 +1348,10 @@ "MetricName": "C6_Pkg_Residency" }, { - "BriefDescription": "Percentage of time spent in the active CPU power state C0", - "MetricExpr": "100 * CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "", - "MetricName": "cpu_utilization_percent", - "ScaleUnit": "1%" + "BriefDescription": "Uncore frequency per die [GHZ]", + "MetricExpr": "Socket_CLKS / #num_dies / duration_time / 1000000000", + "MetricGroup": "SoC", + "MetricName": "UNCORE_FREQ" }, { "BriefDescription": "CPU operating frequency (in GHz)", @@ -541,13 +1360,6 @@ "MetricName": "cpu_operating_frequency", "ScaleUnit": "1GHz" }, - { - "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY", - "MetricGroup": "", - "MetricName": "cpi", - "ScaleUnit": "1per_instr" - }, { "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions", "MetricExpr": "MEM_INST_RETIRED.ALL_LOADS / INST_RETIRED.ANY", @@ -566,7 +1378,7 @@ "BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions", "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "l1d_mpi_includes_data_plus_rfo_with_prefetches", + 
"MetricName": "l1d_mpi", "ScaleUnit": "1per_instr" }, { @@ -594,7 +1406,7 @@ "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions", "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY", "MetricGroup": "", - "MetricName": "l2_mpi_includes_code_plus_data_plus_rfo_with_prefetches", + "MetricName": "l2_mpi", "ScaleUnit": "1per_instr" }, { @@ -620,42 +1432,42 @@ }, { "BriefDescription": "Ratio of number of code read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", - "MetricExpr": "( UNC_CHA_TOR_INSERTS.IA_MISS_CRD ) / INST_RETIRED.ANY", + "MetricExpr": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD / INST_RETIRED.ANY", "MetricGroup": "", "MetricName": "llc_code_read_mpi_demand_plus_prefetch", "ScaleUnit": "1per_instr" }, { "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) in nano seconds", - "MetricExpr": "( ( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD) * #num_packages ) ) ) * duration_time )", + "MetricExpr": "( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD) * #num_packages ) ) ) * duration_time", "MetricGroup": "", "MetricName": "llc_demand_data_read_miss_latency", "ScaleUnit": "1ns" }, { "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to local memory in nano seconds", - "MetricExpr": "( ( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL) * #num_packages ) ) ) * duration_time )", + "MetricExpr": "( 1000000000 * ( 
UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL) * #num_packages ) ) ) * duration_time", "MetricGroup": "", "MetricName": "llc_demand_data_read_miss_latency_for_local_requests", "ScaleUnit": "1ns" }, { "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to remote memory in nano seconds", - "MetricExpr": "( ( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE) * #num_packages ) ) ) * duration_time )", + "MetricExpr": "( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE) * #num_packages ) ) ) * duration_time", "MetricGroup": "", "MetricName": "llc_demand_data_read_miss_latency_for_remote_requests", "ScaleUnit": "1ns" }, { "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to Intel(R) Optane(TM) Persistent Memory(PMEM) in nano seconds", - "MetricExpr": "( ( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM) * #num_packages ) ) ) * duration_time )", + "MetricExpr": "( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM) * #num_packages ) ) ) * duration_time", "MetricGroup": "", "MetricName": "llc_demand_data_read_miss_to_pmem_latency", "ScaleUnit": "1ns" }, { "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to DRAM in nano seconds", - "MetricExpr": "( ( 1000000000 * ( 
UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR) * #num_packages ) ) ) * duration_time )", + "MetricExpr": "( 1000000000 * ( UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR ) / ( UNC_CHA_CLOCKTICKS / ( source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR) * #num_packages ) ) ) * duration_time", "MetricGroup": "", "MetricName": "llc_demand_data_read_miss_to_dram_latency", "ScaleUnit": "1ns" @@ -699,14 +1511,14 @@ "BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", "MetricExpr": "100 * ( UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL ) / ( UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE )", "MetricGroup": "", - "MetricName": "numa_percent_reads_addressed_to_local_dram", + "MetricName": "numa_reads_addressed_to_local_dram", "ScaleUnit": "1%" }, { "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", "MetricExpr": "100 * ( UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE ) / ( UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE )", "MetricGroup": "", - "MetricName": "numa_percent_reads_addressed_to_remote_dram", + "MetricName": "numa_reads_addressed_to_remote_dram", "ScaleUnit": "1%" }, { @@ -720,7 +1532,7 @@ "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)", "MetricExpr": "( UNC_UPI_TxL_FLITS.ALL_DATA * (64 / 9.0) / 1000000) / duration_time", 
"MetricGroup": "", - "MetricName": "upi_data_transmit_bw_only_data", + "MetricName": "upi_data_transmit_bw", "ScaleUnit": "1MB/s" }, { @@ -769,35 +1581,35 @@ "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.", "MetricExpr": "( UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "io_bandwidth_read", + "MetricName": "io_bandwidth_disk_or_network_writes", "ScaleUnit": "1MB/s" }, { "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.", "MetricExpr": "(( UNC_CHA_TOR_INSERTS.IO_ITOM + UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR ) * 64 / 1000000) / duration_time", "MetricGroup": "", - "MetricName": "io_bandwidth_write", + "MetricName": "io_bandwidth_disk_or_network_reads", "ScaleUnit": "1MB/s" }, { "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue", "MetricExpr": "100 * ( IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS ) )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_from_decoded_icache_dsb", + "MetricName": "percent_uops_delivered_from_decoded_icache", "ScaleUnit": "1%" }, { "BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue", "MetricExpr": "100 * ( IDQ.MITE_UOPS / ( IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS ) )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline_mite", + "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline", "ScaleUnit": "1%" }, { "BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue", "MetricExpr": "100 * ( IDQ.MS_UOPS / ( IDQ.DSB_UOPS + IDQ.MITE_UOPS + 
IDQ.MS_UOPS + LSD.UOPS ) )", "MetricGroup": "", - "MetricName": "percent_uops_delivered_from_microcode_sequencer_ms", + "MetricName": "percent_uops_delivered_from_microcode_sequencer", "ScaleUnit": "1%" }, { @@ -827,264 +1639,5 @@ "MetricGroup": "", "MetricName": "llc_miss_remote_memory_bandwidth_write", "ScaleUnit": "1MB/s" - }, - { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", - "MetricExpr": "100 * ( topdown\\-fe\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) - INT_MISC.UOP_DROPPING / ( slots ) )", - "MetricGroup": "TmaL1;PGO", - "MetricName": "tma_frontend_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. 
In such cases; the Frontend eventually delivers no uops for some period.", - "MetricExpr": "100 * ( ( topdown\\-fetch\\-lat / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) - INT_MISC.UOP_DROPPING / ( slots ) ) )", - "MetricGroup": "Frontend;TmaL2;m_tma_frontend_bound_percent", - "MetricName": "tma_fetch_latency_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", - "MetricExpr": "100 * ( ICACHE_DATA.STALLS / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_icache_misses_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses.", - "MetricExpr": "100 * ( ICACHE_TAG.STALLS / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_itlb_misses_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings.", - "MetricExpr": "100 * ( INT_MISC.CLEAR_RESTEER_CYCLES / ( CPU_CLK_UNHALTED.THREAD ) + ( INT_MISC.UNKNOWN_BRANCH_CYCLES / ( CPU_CLK_UNHALTED.THREAD ) ) )", - "MetricGroup": "FetchLat;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_branch_resteers_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. 
The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.", - "MetricExpr": "100 * ( DSB2MITE_SWITCHES.PENALTY_CYCLES / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "DSBmiss;FetchLat;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_dsb_switches_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", - "MetricExpr": "100 * ( DECODE.LCP / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "FetchLat;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_lcp_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. 
The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals.", - "MetricExpr": "100 * ( ( 3 ) * IDQ.MS_SWITCHES / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "FetchLat;MicroSeq;TmaL3;m_tma_fetch_latency_percent", - "MetricName": "tma_ms_switches_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.", - "MetricExpr": "100 * ( max( 0 , ( topdown\\-fe\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) - INT_MISC.UOP_DROPPING / ( slots ) ) - ( ( topdown\\-fetch\\-lat / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) - INT_MISC.UOP_DROPPING / ( slots ) ) ) ) )", - "MetricGroup": "FetchBW;Frontend;TmaL2;m_tma_frontend_bound_percent", - "MetricName": "tma_fetch_bandwidth_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. 
For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", - "MetricExpr": "100 * ( ( IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK ) / ( CPU_CLK_UNHALTED.DISTRIBUTED ) / 2 )", - "MetricGroup": "DSBmiss;FetchBW;TmaL3;m_tma_fetch_bandwidth_percent", - "MetricName": "tma_mite_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", - "MetricExpr": "100 * ( ( IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK ) / ( CPU_CLK_UNHALTED.DISTRIBUTED ) / 2 )", - "MetricGroup": "DSB;FetchBW;TmaL3;m_tma_fetch_bandwidth_percent", - "MetricName": "tma_dsb_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. 
Incorrect data speculation followed by Memory Ordering Nukes is another example.", - "MetricExpr": "100 * ( max( 1 - ( ( topdown\\-fe\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) - INT_MISC.UOP_DROPPING / ( slots ) ) + ( topdown\\-be\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) , 0 ) )", - "MetricGroup": "TmaL1", - "MetricName": "tma_bad_speculation_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path.", - "MetricExpr": "( 100 * ( topdown\\-br\\-mispredict / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) + ( 0 * slots )", - "MetricGroup": "BadSpec;BrMispredicts;TmaL2;m_tma_bad_speculation_percent", - "MetricName": "tma_branch_mispredicts_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. 
Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.", - "MetricExpr": "100 * ( max( 0 , ( max( 1 - ( ( topdown\\-fe\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) - INT_MISC.UOP_DROPPING / ( slots ) ) + ( topdown\\-be\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) , 0 ) ) - ( topdown\\-br\\-mispredict / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) )", - "MetricGroup": "BadSpec;MachineClears;TmaL2;m_tma_bad_speculation_percent", - "MetricName": "tma_machine_clears_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", - "MetricExpr": "( 100 * ( topdown\\-be\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) + ( 0 * slots )", - "MetricGroup": "TmaL1", - "MetricName": "tma_backend_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. 
This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", - "MetricExpr": "( 100 * ( topdown\\-mem\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) + ( 0 * slots )", - "MetricGroup": "Backend;TmaL2;m_tma_backend_bound_percent", - "MetricName": "tma_memory_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache.", - "MetricExpr": "100 * ( max( ( EXE_ACTIVITY.BOUND_ON_LOADS - MEMORY_ACTIVITY.STALLS_L1D_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) , 0 ) )", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_l1_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. 
L1 misses/L2 hits) can improve the latency and increase performance.", - "MetricExpr": "100 * ( ( MEMORY_ACTIVITY.STALLS_L1D_MISS - MEMORY_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_l2_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance.", - "MetricExpr": "100 * ( ( MEMORY_ACTIVITY.STALLS_L2_MISS - MEMORY_ACTIVITY.STALLS_L3_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_l3_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance.", - "MetricExpr": "100 * ( min( ( ( ( MEMORY_ACTIVITY.STALLS_L3_MISS / ( CPU_CLK_UNHALTED.THREAD ) ) - ( min( ( ( ( ( 1 - ( ( ( 19 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + 10 * ( ( MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) / ( ( 19 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + 10 * ( ( MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) 
) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) + ( 25 * ( ( MEM_LOAD_RETIRED.LOCAL_PMM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) + 33 * ( ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) ) ) ) ) * ( MEMORY_ACTIVITY.STALLS_L3_MISS / ( CPU_CLK_UNHALTED.THREAD ) ) ) if ( ( 1000000 ) * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM ) > MEM_LOAD_RETIRED.L1_MISS ) else 0 ) ) , ( 1 ) ) ) ) ) , ( 1 ) ) )", - "MetricGroup": "MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_dram_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a. IXP) memory by loads, PMM stands for Persistent Memory Module. ", - "MetricExpr": "100 * ( min( ( ( ( ( 1 - ( ( ( 19 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + 10 * ( ( MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) / ( ( 19 * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + 10 * ( ( MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) + ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) + ( 25 * ( ( MEM_LOAD_RETIRED.LOCAL_PMM * ( 1 + ( 
MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) + 33 * ( ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * ( 1 + ( MEM_LOAD_RETIRED.FB_HIT / ( MEM_LOAD_RETIRED.L1_MISS ) ) ) ) ) ) ) ) ) ) * ( MEMORY_ACTIVITY.STALLS_L3_MISS / ( CPU_CLK_UNHALTED.THREAD ) ) ) if ( ( 1000000 ) * ( MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM ) > MEM_LOAD_RETIRED.L1_MISS ) else 0 ) ) , ( 1 ) ) )", - "MetricGroup": "MemoryBound;Server;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_pmm_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck.", - "MetricExpr": "100 * ( EXE_ACTIVITY.BOUND_ON_STORES / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent", - "MetricName": "tma_store_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. 
FP-chained long-latency arithmetic operations).", - "MetricExpr": "( 100 * ( max( 0 , ( topdown\\-be\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( topdown\\-mem\\-bound / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) ) ) + ( 0 * slots )", - "MetricGroup": "Backend;TmaL2;Compute;m_tma_backend_bound_percent", - "MetricName": "tma_core_bound_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication.", - "MetricExpr": "100 * ( ARITH.DIVIDER_ACTIVE / ( CPU_CLK_UNHALTED.THREAD ) )", - "MetricGroup": "TmaL3;m_tma_core_bound_percent", - "MetricName": "tma_divider_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. 
For example; when there are too many multiply operations.", - "MetricExpr": "( 100 * ( ( EXE_ACTIVITY.EXE_BOUND_0_PORTS + ( EXE_ACTIVITY.1_PORTS_UTIL + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@ ) ) / ( CPU_CLK_UNHALTED.THREAD ) if ( ARITH.DIVIDER_ACTIVE < ( CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS ) ) else ( EXE_ACTIVITY.1_PORTS_UTIL + ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@ ) / ( CPU_CLK_UNHALTED.THREAD ) ) ) + ( 0 * slots )", - "MetricGroup": "PortsUtil;TmaL3;m_tma_core_bound_percent", - "MetricName": "tma_ports_utilization_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. ", - "MetricExpr": "( 100 * ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) + ( 0 * slots )", - "MetricGroup": "TmaL1", - "MetricName": "tma_retiring_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. 
A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved.", - "MetricExpr": "( 100 * ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( topdown\\-heavy\\-ops / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) ) ) + ( 0 * slots )", - "MetricGroup": "Retire;TmaL2;m_tma_retiring_percent", - "MetricName": "tma_light_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", - "MetricExpr": "100 * ( ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD ) + ( ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + FP_ARITH_INST_RETIRED2.SCALAR ) / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) + ( min( ( ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.VECTOR ) / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) , ( 1 ) ) ) + ( cpu@AMX_OPS_RETIRED.BF16\\,cmask\\=0x1@ / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + 
topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) )", - "MetricGroup": "HPC;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_fp_arith_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents overall Integer (Int) select operations fraction the CPU has executed (retired). Vector/Matrix Int operations and shuffles are counted. Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain.", - "MetricExpr": "100 * ( ( ( INT_VEC_RETIRED.ADD_128 + INT_VEC_RETIRED.VNNI_128 ) / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) + ( ( INT_VEC_RETIRED.ADD_256 + INT_VEC_RETIRED.MUL_256 + INT_VEC_RETIRED.VNNI_256 ) / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) + ( INT_VEC_RETIRED.SHUFFLES / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_int_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", - "MetricExpr": "100 * ( ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( topdown\\-heavy\\-ops / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) ) * MEM_UOP_RETIRED.ANY / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_memory_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents 
fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.", - "MetricExpr": "100 * ( ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( topdown\\-heavy\\-ops / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) ) * INST_RETIRED.MACRO_FUSED / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_fused_instructions_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.", - "MetricExpr": "100 * ( ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( topdown\\-heavy\\-ops / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) ) * ( BR_INST_RETIRED.ALL_BRANCHES - INST_RETIRED.MACRO_FUSED ) / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_non_fused_branches_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. 
start address of a function or loop body.", - "MetricExpr": "100 * ( ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( topdown\\-heavy\\-ops / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) ) * INST_RETIRED.NOP / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_nop_instructions_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. May undercount due to FMA double counting", - "MetricExpr": "100 * ( max( 0 , ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( topdown\\-heavy\\-ops / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) ) - ( ( ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD ) + ( ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + FP_ARITH_INST_RETIRED2.SCALAR ) / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) + ( min( ( ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.VECTOR ) / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) , ( 1 ) ) ) + ( 
cpu@AMX_OPS_RETIRED.BF16\\,cmask\\=0x1@ / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) ) + ( ( ( INT_VEC_RETIRED.ADD_128 + INT_VEC_RETIRED.VNNI_128 ) / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) + ( ( INT_VEC_RETIRED.ADD_256 + INT_VEC_RETIRED.MUL_256 + INT_VEC_RETIRED.VNNI_256 ) / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) + ( INT_VEC_RETIRED.SHUFFLES / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) ) + ( ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( topdown\\-heavy\\-ops / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) ) * MEM_UOP_RETIRED.ANY / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) + ( ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( topdown\\-heavy\\-ops / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) ) * INST_RETIRED.MACRO_FUSED / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) + ( ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( topdown\\-heavy\\-ops / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) ) * ( BR_INST_RETIRED.ALL_BRANCHES - INST_RETIRED.MACRO_FUSED ) / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec 
+ topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) + ( ( max( 0 , ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( topdown\\-heavy\\-ops / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) ) * INST_RETIRED.NOP / ( ( topdown\\-retiring / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) * ( slots ) ) ) ) ) )", - "MetricGroup": "Pipeline;TmaL3;m_tma_light_operations_percent", - "MetricName": "tma_other_light_ops_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.", - "MetricExpr": "( 100 * ( topdown\\-heavy\\-ops / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) ) + ( 0 * slots )", - "MetricGroup": "Retire;TmaL2;m_tma_retiring_percent", - "MetricName": "tma_heavy_operations_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that that are decoder into two or up to ([SNB+] four; [ADL+] five) uops. This highly-correlates with the number of uops in such instructions.", - "MetricExpr": "100 * ( ( topdown\\-heavy\\-ops / ( topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound ) ) - ( UOPS_RETIRED.MS / ( slots ) ) )", - "MetricGroup": "TmaL3;m_tma_heavy_operations_percent", - "MetricName": "tma_few_uops_instructions_percent", - "ScaleUnit": "1%" - }, - { - "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. 
The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided.", - "MetricExpr": "100 * ( UOPS_RETIRED.MS / ( slots ) )", - "MetricGroup": "MicroSeq;TmaL3;m_tma_heavy_operations_percent", - "MetricName": "tma_microcode_sequencer_percent", - "ScaleUnit": "1%" } ] -- GitLab From e762a998e71cc579487cf478d0a3b56634189ffa Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:16:09 -0700 Subject: [PATCH 1429/2223] perf vendor events: Update silvermont cpuids Add cpuid that was added to https://download.01.org/perfmon/mapfile.csv Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-21-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv 
b/tools/perf/pmu-events/arch/x86/mapfile.csv index c2354e368586c..5e609b8767903 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -21,7 +21,7 @@ GenuineIntel-6-1[AEF],v3,nehalemep,core GenuineIntel-6-2E,v3,nehalemex,core GenuineIntel-6-2A,v17,sandybridge,core GenuineIntel-6-8F,v1.06,sapphirerapids,core -GenuineIntel-6-(37|4C|4D),v14,silvermont,core +GenuineIntel-6-(37|4A|4C|4D|5A),v14,silvermont,core GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v53,skylake,core GenuineIntel-6-55-[01234],v1.28,skylakex,core GenuineIntel-6-86,v1.20,snowridgex,core -- GitLab From aac53e8f0730e921e56be6d362aee7e1d004b6c6 Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:16:10 -0700 Subject: [PATCH 1430/2223] perf vendor events: Update Intel skylake Events remain at v53, and the metrics are based on TMA 4.4 full. Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. - _SMT suffix metrics are dropped as the #SMT_On and #EBS_Mode are correctly expanded in the single main metric. - Addition of all 6 levels of TMA metrics. Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. 
Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-22-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../arch/x86/skylake/skl-metrics.json | 861 ++++++++++++++---- 1 file changed, 679 insertions(+), 182 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json index 73fa72d3dcb15..f138b9836b514 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json @@ -1,148 +1,694 @@ [ { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Frontend_Bound", - 
"PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound." + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Frontend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / SLOTS", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. 
Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", + "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / CLKS", + "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "ICACHE_64B.IFTAG_STALL / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / CLKS + tma_unknown_branches", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. 
Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", + "MetricExpr": "(1 - (BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT))) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS", + "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_clears_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", + "MetricExpr": "9 * BACLEARS.ANY / CLKS", + "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_unknown_branches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). 
Sample with: BACLEARS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "ILD_STALL.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "2 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). 
Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "tma_frontend_bound - tma_fetch_latency", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. 
For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / CORE_CLKS", + "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_mite_group", + "MetricName": "tma_decoder0_alone", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_dsb", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Bad_Speculation", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example." 
+ "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Bad_Speculation_SMT", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. 
Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Backend_Bound", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound." + "MetricExpr": "1 - tma_frontend_bound - (UOPS_ISSUED.ANY + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. 
Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "((CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * tma_backend_bound", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. 
These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "min(9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. 
Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the (first level) DTLB was missed by load accesses, that later on hit in second-level TLB (STLB)", + "MetricExpr": "tma_dtlb_load - tma_load_stlb_miss", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_hit", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / CLKS", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_miss", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. 
For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * (9 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / CLKS", + "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_4k_aliasing", + "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. 
However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "Load_Miss_Real_Latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CLKS", + "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Backend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. 
Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "((MEM_LOAD_RETIRED.L2_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / ((MEM_LOAD_RETIRED.L2_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@)) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. 
Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", + "MetricExpr": "((18.5 * Average_Frequency) * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + (16.5 * Average_Frequency) * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_contested_accesses", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "(16.5 * Average_Frequency) * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "(6.5 * Average_Frequency) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "((OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2) if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / CORE_CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). 
The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L3_MISS / CLKS + ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS) - tma_l2_bound)", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. 
This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. 
Sample with: MEM_INST_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "((L2_RQSTS.RFO_HIT * 9 * (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES))) + (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_store_latency", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "(22 * Average_Frequency) * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / CORE_CLKS", + "MetricGroup": "TopdownL4;tma_store_bound_group", + "MetricName": "tma_split_stores", + "PublicDescription": "This metric represents rate of split store accesses. 
Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "(9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / CORE_CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the TLB was missed by store accesses, hitting in the second-level TLB (STLB)", + "MetricExpr": "tma_dtlb_store - tma_store_stlb_miss", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_hit", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / CORE_CLKS", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_miss", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck", + "MetricExpr": "tma_backend_bound - tma_memory_bound", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": 
"This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "ARITH.DIVIDER_ACTIVE / CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "(EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CLKS if (ARITH.DIVIDER_ACTIVE < (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY)) else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). 
Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", + "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / CLKS", + "MetricGroup": "TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_serializing_operation", + "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. 
Sample with: PARTIAL_RAT_STALLS.SCOREBOARD", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "MetricExpr": "CLKS * UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY", + "MetricGroup": "TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_mixing_vectors", + "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_1 - UOPS_EXECUTED.CORE_CYCLES_GE_2) / 2 if #SMT_on else EXE_ACTIVITY.1_PORTS_UTIL) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or over oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. 
walking a linked list) - looking at the assembly can be helpful.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_2 - UOPS_EXECUTED.CORE_CYCLES_GE_3) / 2 if #SMT_on else EXE_ACTIVITY.2_PORTS_UTIL) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", + "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / (4 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: 
UOPS_DISPATCHED_PORT.PORT_0", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / CORE_CLKS", + "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_0", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_1", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_1", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU) Sample with: UOPS_DISPATCHED.PORT_5", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_5", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_6", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_6", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_2", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / 
CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_3", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data) Sample with: UOPS_DISPATCHED_PORT.PORT_4", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_4", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address) Sample with: UOPS_DISPATCHED_PORT.PORT_7", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_7", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Retiring", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. 
Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. " + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessarily mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Retiring_SMT", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. 
Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "tma_retiring - tma_heavy_operations", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. 
Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%" }, { - "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", - "MetricExpr": "100 * ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) )", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "Mispredictions" + "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", + "MetricGroup": "HPC;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fp_arith", + "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "tma_retiring * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. 
See Tip under Tuning Hint.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_scalar", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_vector", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_128b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_256b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", + "MetricExpr": "tma_light_operations * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_memory_operations", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", + "MetricExpr": "tma_light_operations * UOPS_RETIRED.MACRO_FUSED / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fused_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. 
The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", + "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - UOPS_RETIRED.MACRO_FUSED) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_non_fused_branches", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", + "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_nop_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. 
May undercount due to FMA double counting", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_other_light_ops", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY) / SLOTS", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops", + "MetricExpr": "tma_heavy_operations - tma_microcode_sequencer", + "MetricGroup": "TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_few_uops_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops. 
This highly-correlates with the number of uops in such instructions.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / SLOTS", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. 
Sample with: OTHER_ASSISTS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_cisc", + "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.", + "ScaleUnit": "100%" }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", - "MetricExpr": "100 * ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) )", - "MetricGroup": "Bad;BadSpec;BrMispredicts_SMT", - "MetricName": "Mispredictions_SMT" + "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + 
tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "Mispredictions" }, { "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) * ( ( (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (min( CPU_CLK_UNHALTED.THREAD , cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@ ) / CPU_CLK_UNHALTED.THREAD) / #(CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / 
MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) ) + ( (( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (OFFCORE_REQUESTS_BUFFER.SQ_FULL / CPU_CLK_UNHALTED.THREAD) / #(( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) ) ) + ( (max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( ((L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )) * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CPU_CLK_UNHALTED.THREAD) / #(max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) ) ", + "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + 
tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) ", "MetricGroup": "Mem;MemoryBW;Offcore", "MetricName": "Memory_Bandwidth" }, - { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) * ( ( (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - 
(IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (min( CPU_CLK_UNHALTED.THREAD , cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@ ) / CPU_CLK_UNHALTED.THREAD) / #(CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) ) + ( (( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (( OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / #(( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) ) ) + ( (max( ( 
CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( ((L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )) * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CPU_CLK_UNHALTED.THREAD) / #(max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) ) ", - "MetricGroup": "Mem;MemoryBW;Offcore_SMT", - "MetricName": "Memory_Bandwidth_SMT" - }, { "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) * ( ( (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + 
(MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (min( CPU_CLK_UNHALTED.THREAD , OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD ) / CPU_CLK_UNHALTED.THREAD - (min( CPU_CLK_UNHALTED.THREAD , cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@ ) / CPU_CLK_UNHALTED.THREAD)) / #(CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) ) + ( (( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (( (10 * ((CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time)) - (3.5 * ((CPU_CLK_UNHALTED.THREAD / 
CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time)) ) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CPU_CLK_UNHALTED.THREAD) / #(( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) ) + ( (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD)) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) )", + "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + (tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)))", "MetricGroup": "Mem;MemoryLat;Offcore", "MetricName": "Memory_Latency" }, - { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + 
CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) * ( ( (CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (min( CPU_CLK_UNHALTED.THREAD , OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD ) / CPU_CLK_UNHALTED.THREAD - (min( CPU_CLK_UNHALTED.THREAD , cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@ ) / CPU_CLK_UNHALTED.THREAD)) / #(CYCLE_ACTIVITY.STALLS_L3_MISS / CPU_CLK_UNHALTED.THREAD + (( CYCLE_ACTIVITY.STALLS_L1D_MISS - 
CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD) - (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD))) ) + ( (( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (( (10 * ((CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time)) - (3.5 * ((CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time)) ) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CPU_CLK_UNHALTED.THREAD) / #(( CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS ) / CPU_CLK_UNHALTED.THREAD) ) + ( (( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) / ( (MEM_LOAD_RETIRED.L2_HIT * ( 1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) )) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ ) ) * (( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / CPU_CLK_UNHALTED.THREAD)) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + 
EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) )", - "MetricGroup": "Mem;MemoryLat;Offcore_SMT", - "MetricName": "Memory_Latency_SMT" - }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) * ( ( (max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) / ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (min( 9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE , max( CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS , 0 ) ) / CPU_CLK_UNHALTED.THREAD) / (max( ( 
CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) ) + ( (EXE_ACTIVITY.BOUND_ON_STORES / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) ) * ( (( 9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE ) / CPU_CLK_UNHALTED.THREAD) / #(EXE_ACTIVITY.BOUND_ON_STORES / CPU_CLK_UNHALTED.THREAD) ) ) ", + "MetricExpr": "100 * tma_memory_bound * ((tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency))) ", "MetricGroup": "Mem;MemoryTLB;Offcore", "MetricName": "Memory_Data_TLBs" }, - { - "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricExpr": "100 * ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( 
INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) * ( ( (max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) / ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (min( 9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE , max( CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS , 0 ) ) / CPU_CLK_UNHALTED.THREAD) / (max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / CPU_CLK_UNHALTED.THREAD , 0 )) ) + ( (EXE_ACTIVITY.BOUND_ON_STORES / CPU_CLK_UNHALTED.THREAD) / #((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * ( (( 9 * 
cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / #(EXE_ACTIVITY.BOUND_ON_STORES / CPU_CLK_UNHALTED.THREAD) ) ) ", - "MetricGroup": "Mem;MemoryTLB;Offcore_SMT", - "MetricName": "Memory_Data_TLBs_SMT" - }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * (( BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - ( BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN ) - 2 * BR_INST_RETIRED.NEAR_CALL) ) / (4 * CPU_CLK_UNHALTED.THREAD))", + "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / SLOTS)", "MetricGroup": "Ret", "MetricName": "Branching_Overhead" }, - { - "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * (( BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - ( BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN ) - 2 * BR_INST_RETIRED.NEAR_CALL) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))", - "MetricGroup": "Ret_SMT", - "MetricName": "Branching_Overhead_SMT" - }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", - "MetricExpr": "100 * (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * ( (ICACHE_64B.IFTAG_STALL / CPU_CLK_UNHALTED.THREAD) + (( ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ ) / CPU_CLK_UNHALTED.THREAD) + (9 * 
BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) ) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD))", + "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "Big_Code" }, - { - "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", - "MetricExpr": "100 * (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * ( (ICACHE_64B.IFTAG_STALL / CPU_CLK_UNHALTED.THREAD) + (( ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ ) / CPU_CLK_UNHALTED.THREAD) + (9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) ) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))", - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB_SMT", - "MetricName": "Big_Code_SMT" - }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", - "MetricExpr": "100 * ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) ) - (100 * (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * ( (ICACHE_64B.IFTAG_STALL / CPU_CLK_UNHALTED.THREAD) + (( ICACHE_16B.IFDATA_STALL + 2 * 
cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ ) / CPU_CLK_UNHALTED.THREAD) + (9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) ) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)))", + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - Big_Code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "Instruction_Fetch_BW" }, - { - "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", - "MetricExpr": "100 * ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) ) - (100 * (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * ( (ICACHE_64B.IFTAG_STALL / CPU_CLK_UNHALTED.THREAD) + (( ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ ) / CPU_CLK_UNHALTED.THREAD) + (9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) ) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))))", - "MetricGroup": "Fed;FetchBW;Frontend_SMT", - "MetricName": "Instruction_Fetch_BW_SMT" - }, { "BriefDescription": "Instructions Per Cycle (per 
Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC" }, @@ -160,8 +706,8 @@ }, { "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / (INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "Pipeline;Mem", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", "MetricName": "CPI" }, { @@ -172,16 +718,10 @@ }, { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "TmaL1", + "MetricExpr": "4 * CORE_CLKS", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS" }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "TmaL1_SMT", - "MetricName": "SLOTS_SMT" - }, { "BriefDescription": "The ratio of Executed- by Issued-Uops", "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", @@ -191,63 +731,38 @@ }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC" }, - { - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;SMT;TmaL1_SMT", - "MetricName": "CoreIPC_SMT" - }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 
2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;Flops", + "MetricExpr": "(1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / CORE_CLKS", + "MetricGroup": "Flops;Ret", "MetricName": "FLOPc" }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;Flops_SMT", - "MetricName": "FLOPc_SMT" - }, { "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * CPU_CLK_UNHALTED.THREAD )", + "MetricExpr": "((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)) / (2 * CORE_CLKS)", "MetricGroup": "Cor;Flops;HPC", "MetricName": "FP_Arith_Utilization", "PublicDescription": "Actual per-core usage of the Floating 
Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, - { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) )", - "MetricGroup": "Cor;Flops;HPC_SMT", - "MetricName": "FP_Arith_Utilization_SMT", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common). SMT version; use when SMT is enabled and measuring per logical CPU." 
- }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 ) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ((UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", - "MetricExpr": "( 1 - ((1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) - ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)))) / ((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ( ARITH.DIVIDER_ACTIVE < ( CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY ) ) else (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if ((1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) - ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - 
(IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)))) < ((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ( ARITH.DIVIDER_ACTIVE < ( CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY ) ) else (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1 ) if 0 > 0.5 else 0", + "MetricExpr": "(1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if SMT_2T_Utilization > 0.5 else 0", "MetricGroup": "Cor;SMT", "MetricName": "Core_Bound_Likely" }, - { - "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", - "MetricExpr": "( 1 - ((1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))))) / ((EXE_ACTIVITY.EXE_BOUND_0_PORTS + 
(EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ( ARITH.DIVIDER_ACTIVE < ( CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY ) ) else (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if ((1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ((( CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES ) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * (1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))))) < ((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ( ARITH.DIVIDER_ACTIVE < ( CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY ) ) else (EXE_ACTIVITY.1_PORTS_UTIL + (UOPS_RETIRED.RETIRE_SLOTS / 
(4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1 ) if (1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 )) > 0.5 else 0", - "MetricGroup": "Cor;SMT_SMT", - "MetricName": "Core_Bound_Likely_SMT" - }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "((CPU_CLK_UNHALTED.THREAD / 2) * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK)) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2) if #SMT_on else CLKS", "MetricGroup": "SMT", "MetricName": "CORE_CLKS" }, @@ -289,13 +804,13 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "IpFLOP" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + 
FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) )", + "MetricExpr": "INST_RETIRED.ANY / ((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE))", "MetricGroup": "Flops;InsType", "MetricName": "IpArith", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." @@ -316,14 +831,14 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX128", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX256", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
@@ -335,9 +850,9 @@ "MetricName": "IpSWPF" }, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions" }, { @@ -372,16 +887,10 @@ }, { "BriefDescription": "Total penalty related to DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", - "MetricExpr": "100 * ( (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) + ((IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD))) * (( IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS ) / CPU_CLK_UNHALTED.THREAD / 2) / #((IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD))) )", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", "MetricGroup": "DSBmiss;Fed", "MetricName": "DSB_Misses" }, - { - "BriefDescription": "Total penalty related to DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", - "MetricExpr": "100 * ( (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + 
CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + ((IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) * (( IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) / 2) / #((IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) )", - "MetricGroup": "DSBmiss;Fed_SMT", - "MetricName": "DSB_Misses_SMT" - }, { "BriefDescription": "Number of Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", @@ -396,16 +905,10 @@ }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) ) * (4 * CPU_CLK_UNHALTED.THREAD) / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": " (tma_branch_mispredicts + 
tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BrMispredicts", "MetricName": "Branch_Misprediction_Cost" }, - { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) ) * (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts_SMT", - "MetricName": "Branch_Misprediction_Cost_SMT" - }, { "BriefDescription": "Fraction of branches that are non-taken conditionals", "MetricExpr": "BR_INST_RETIRED.NOT_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", @@ -414,101 +917,95 @@ }, { "BriefDescription": "Fraction of branches that are taken conditionals", - "MetricExpr": "( BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN ) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": 
"Bad;Branches;CodeGen;PGO", "MetricName": "Cond_TK" }, { "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "( BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN ) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", "MetricName": "CallRet" }, { "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - ( BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN ) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", "MetricName": "Jump" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", "MetricGroup": "Mem;MemoryBound;MemoryLat", "MetricName": "Load_Miss_Real_Latency" }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. 
Per-Logical Processor)", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBound;MemoryBW", + "MetricGroup": "Mem;MemoryBW;MemoryBound", "MetricName": "MLP" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI_Load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;Backend;CacheMisses", + "MetricGroup": "Backend;CacheMisses;Mem", "MetricName": "L2MPKI" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses;Offcore", + "MetricGroup": "CacheMisses;Mem;Offcore", "MetricName": "L2MPKI_All" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2MPKI_Load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", - "MetricExpr": "1000 * ( L2_RQSTS.REFERENCES - L2_RQSTS.MISS ) / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricExpr": "1000 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_All" }, { 
"BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_Load" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L3MPKI" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1000 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "FB_HPKI" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * CPU_CLK_UNHALTED.THREAD )", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING) / (2 * CORE_CLKS)", "MetricGroup": "Mem;MemoryTLB", "MetricName": "Page_Walks_Utilization" }, - { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) )", - "MetricGroup": "Mem;MemoryTLB_SMT", - "MetricName": "Page_Walks_Utilization_SMT" - }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * 
L1D.REPLACEMENT / 1000000000 / duration_time", @@ -535,25 +1032,25 @@ }, { "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricExpr": "L1D_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L1D_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricExpr": "L2_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L2_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L3_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Access_BW", "MetricGroup": "Mem;MemoryBW;Offcore", "MetricName": "L3_Cache_Access_BW_1T" }, @@ -565,26 +1062,26 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 ) / duration_time", + "MetricExpr": 
"((1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1000000000) / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "GFLOPs", "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", - "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0", "MetricGroup": "SMT", "MetricName": "SMT_2T_Utilization" }, @@ -602,7 +1099,7 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * ( arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@ ) / 1000000 / duration_time / 1000", + "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1000000 / duration_time / 1000", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use" }, -- GitLab From a7c1aaa639e08e3f29035a863e2169bfb2cf592e Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:16:11 -0700 Subject: [PATCH 1431/2223] perf vendor events: Update Intel tigerlake Events remain at v1.07, and the metrics are based on TMA 4.4 full. 
Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. - Addition of all 6 levels of TMA metrics. Previously metrics involving topdown events were dropped. Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-23-irogers@google.com 
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../arch/x86/tigerlake/tgl-metrics.json | 810 ++++++++++++++++-- 1 file changed, 762 insertions(+), 48 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json b/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json index 03c97bd74ad94..79b8b101b68fc 100644 --- a/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json @@ -1,26 +1,716 @@ [ + { + "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", + "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. 
Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "(5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / SLOTS", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", + "MetricExpr": "ICACHE_16B.IFDATA_STALL / CLKS", + "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "ICACHE_64B.IFTAG_STALL / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. 
Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / CLKS + tma_unknown_branches", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. 
Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", + "MetricExpr": "(1 - (BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT))) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS", + "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_clears_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", + "MetricExpr": "10 * BACLEARS.ANY / CLKS", + "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_unknown_branches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). Sample with: BACLEARS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). 
Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "ILD_STALL.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "3 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. 
Sample with: IDQ.MS_SWITCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / CORE_CLKS / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. 
Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / CORE_CLKS", + "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_mite_group", + "MetricName": "tma_decoder0_alone", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where (only) 4 uops were delivered by the MITE pipeline", + "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / CLKS", + "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_mite_group", + "MetricName": "tma_mite_4wide", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", + "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / CORE_CLKS / 2", + "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_dsb", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit", + "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / CORE_CLKS / 2", + "MetricGroup": "FetchBW;LSD;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_lsd", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop supply. 
However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", + "MetricExpr": "max(1 - (tma_frontend_bound + tma_backend_bound + tma_retiring), 0)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. 
Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "max(0, tma_bad_speculation - tma_branch_mispredicts)", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + (5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. 
Sample with: TOPDOWN.BACKEND_BOUND_SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "((CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * tma_backend_bound", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. 
Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. 
Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the (first level) DTLB was missed by load accesses, that later on hit in second-level TLB (STLB)", + "MetricExpr": "tma_dtlb_load - tma_load_stlb_miss", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_hit", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / CLKS", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_miss", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. 
For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / CLKS", + "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_4k_aliasing", + "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. 
However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "L1D_PEND_MISS.FB_FULL / CLKS", + "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "((MEM_LOAD_RETIRED.L2_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / ((MEM_LOAD_RETIRED.L2_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + L1D_PEND_MISS.FB_FULL_PERIODS)) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. 
Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", + "MetricExpr": "((49 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + (48 * Average_Frequency) * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_contested_accesses", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "(48 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD)))) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "(17.5 * Average_Frequency) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS", + "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. 
Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "L1D_PEND_MISS.L2_STALL / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L3_MISS / CLKS + ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS) - tma_l2_bound)", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). 
The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. 
Sample with: MEM_INST_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "((L2_RQSTS.RFO_HIT * 10 * (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES))) + (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_store_latency", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually have less impact on out-of-order core performance; however; holding resources for a longer time can lead to undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "(54 * Average_Frequency) * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / CORE_CLKS", + "MetricGroup": "TopdownL4;tma_store_bound_group", + "MetricName": "tma_split_stores", + "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. 
Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores", + "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_streaming_stores", + "PublicDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should Streaming stores be a bottleneck. Sample with: OCR.STREAMING_WR.ANY_RESPONSE", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / CORE_CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. 
Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the TLB was missed by store accesses, hitting in the second-level TLB (STLB)", + "MetricExpr": "tma_dtlb_store - tma_store_stlb_miss", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_hit", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / CORE_CLKS", + "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_miss", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck", + "MetricExpr": "max(0, tma_backend_bound - tma_memory_bound)", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "ARITH.DIVIDER_ACTIVE / CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. 
Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "(cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CLKS if (ARITH.DIVIDER_ACTIVE < (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY)) else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. 
For example; when there are too many multiply operations.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / CLKS + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", + "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / CLKS", + "MetricGroup": "TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_serializing_operation", + "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", + "MetricExpr": "140 * MISC_RETIRED.PAUSE_INST / CLKS", + "MetricGroup": "TopdownL6;tma_serializing_operation_group", + "MetricName": "tma_slow_pause", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. 
Sample with: MISC_RETIRED.PAUSE_INST", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "MetricExpr": "CLKS * UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY", + "MetricGroup": "TopdownL5;tma_ports_utilized_0_group", + "MetricName": "tma_mixing_vectors", + "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful. 
Sample with: EXE_ACTIVITY.1_PORTS_UTIL", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Sample with: EXE_ACTIVITY.2_PORTS_UTIL", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). 
Sample with: UOPS_EXECUTED.CYCLES_GE_3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED.PORT_0", + "MetricExpr": "UOPS_DISPATCHED.PORT_0 / CORE_CLKS", + "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_0", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED.PORT_1", + "MetricExpr": "UOPS_DISPATCHED.PORT_1 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_1", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU) Sample with: UOPS_DISPATCHED.PORT_5", + "MetricExpr": "UOPS_DISPATCHED.PORT_5 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_5", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED.PORT_6", + "MetricExpr": "UOPS_DISPATCHED.PORT_6 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_6", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 
for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3", + "MetricExpr": "UOPS_DISPATCHED.PORT_2_3 / (2 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations Sample with: UOPS_DISPATCHED.PORT_7_8", + "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessarily mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. 
Sample with: UOPS_RETIRED.SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "max(0, tma_retiring - tma_heavy_operations)", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", + "MetricGroup": "HPC;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fp_arith", + "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "tma_retiring * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. 
It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_scalar", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_vector", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_128b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_256b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * SLOTS)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_512b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", + "MetricExpr": "tma_light_operations * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_memory_operations", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.", + "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * SLOTS)", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_branch_instructions", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", + "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * SLOTS)", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_nop_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. 
May undercount due to FMA double counting", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions + tma_nop_instructions))", + "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group", + "MetricName": "tma_other_light_ops", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "tma_microcode_sequencer + tma_retiring * (UOPS_DECODED.DEC0 - cpu@UOPS_DECODED.DEC0\\,cmask\\=1@) / IDQ.MITE_UOPS", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops", + "MetricExpr": "tma_heavy_operations - tma_microcode_sequencer", + "MetricGroup": "TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_few_uops_instructions", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops.
This highly-correlates with the number of uops in such instructions.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "((tma_retiring * SLOTS) / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "100 * ASSISTS.ANY / SLOTS", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. 
Sample with: ASSISTS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_cisc", + "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "Mispredictions" + }, + { + "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", + "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) ", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "Memory_Bandwidth" + }, 
+ { + "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + (tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)))", + "MetricGroup": "Mem;MemoryLat;Offcore", + "MetricName": "Memory_Latency" + }, + { + "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricExpr": "100 * tma_memory_bound * ((tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores))) ", + "MetricGroup": "Mem;MemoryTLB;Offcore", + "MetricName": "Memory_Data_TLBs" + }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * (( BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) ) / TOPDOWN.SLOTS)", + "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / SLOTS)", "MetricGroup": "Ret", "MetricName": 
"Branching_Overhead" }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", - "MetricExpr": "100 * (( 5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING ) / TOPDOWN.SLOTS) * ( (ICACHE_64B.IFTAG_STALL / CPU_CLK_UNHALTED.THREAD) + (ICACHE_16B.IFDATA_STALL / CPU_CLK_UNHALTED.THREAD) + (10 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) ) / #(( 5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING ) / TOPDOWN.SLOTS)", + "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "Big_Code" }, + { + "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - Big_Code", + "MetricGroup": "Fed;FetchBW;Frontend", + "MetricName": "Instruction_Fetch_BW" + }, { "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC" }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "(tma_retiring * SLOTS) / INST_RETIRED.ANY", + "MetricGroup": "Pipeline;Ret;Retire", + "MetricName": "UPI" + }, + { + "BriefDescription": "Instruction per taken branch", + "MetricExpr": "(tma_retiring * SLOTS) / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;FetchBW", + "MetricName": "UpTB" + }, { "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / (INST_RETIRED.ANY / 
CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "Pipeline;Mem", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", "MetricName": "CPI" }, { @@ -32,13 +722,13 @@ { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", "MetricExpr": "TOPDOWN.SLOTS", - "MetricGroup": "TmaL1", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS" }, { "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", - "MetricExpr": "TOPDOWN.SLOTS / ( TOPDOWN.SLOTS / 2 ) if #SMT_on else 1", - "MetricGroup": "SMT;TmaL1", + "MetricExpr": "SLOTS / (TOPDOWN.SLOTS / 2) if #SMT_on else 1", + "MetricGroup": "SMT;tma_L1_group", "MetricName": "Slots_Utilization" }, { @@ -50,29 +740,35 @@ }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.DISTRIBUTED", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC" }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / CPU_CLK_UNHALTED.DISTRIBUTED", - "MetricGroup": "Ret;Flops", + "MetricExpr": "(1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / CORE_CLKS", + "MetricGroup": 
"Flops;Ret", "MetricName": "FLOPc" }, { "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) ) / ( 2 * CPU_CLK_UNHALTED.DISTRIBUTED )", + "MetricExpr": "((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)) / (2 * CORE_CLKS)", "MetricGroup": "Cor;Flops;HPC", "MetricName": "FP_Arith_Utilization", "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." 
}, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 ) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ((UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricExpr": "(1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if SMT_2T_Utilization > 0.5 else 0", + "MetricGroup": "Cor;SMT", + "MetricName": "Core_Bound_Likely" + }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", @@ -117,13 +813,13 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "IpFLOP" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower 
number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) )", + "MetricExpr": "INST_RETIRED.ANY / ((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE))", "MetricGroup": "Flops;InsType", "MetricName": "IpArith", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." @@ -144,21 +840,21 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX128", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
}, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX256", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX512", "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
@@ -170,11 +866,17 @@ "MetricName": "IpSWPF" }, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions" }, + { + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricExpr": "(tma_retiring * SLOTS) / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "Retire" + }, { "BriefDescription": "", "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", @@ -205,6 +907,12 @@ "MetricGroup": "DSBmiss", "MetricName": "DSB_Switch_Cost" }, + { + "BriefDescription": "Total penalty related to DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))", + "MetricGroup": "DSBmiss;Fed", + "MetricName": "DSB_Misses" + }, { "BriefDescription": "Number of Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", @@ -217,6 +925,12 @@ "MetricGroup": "Bad;BadSpec;BrMispredicts", "MetricName": "IpMispredict" }, + { + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricExpr": " (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": 
"Branch_Misprediction_Cost" + }, { "BriefDescription": "Fraction of branches that are non-taken conditionals", "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", @@ -231,7 +945,7 @@ }, { "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "( BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN ) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", "MetricName": "CallRet" }, @@ -243,80 +957,80 @@ }, { "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", - "MetricExpr": "1 - ( (BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES) + (BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES) + (( BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN ) / BR_INST_RETIRED.ALL_BRANCHES) + ((BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES) )", + "MetricExpr": "1 - (Cond_NT + Cond_TK + CallRet + Jump)", "MetricGroup": "Bad;Branches", "MetricName": "Other_Branches" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", "MetricGroup": "Mem;MemoryBound;MemoryLat", "MetricName": "Load_Miss_Real_Latency" }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. 
Per-Logical Processor)", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBound;MemoryBW", + "MetricGroup": "Mem;MemoryBW;MemoryBound", "MetricName": "MLP" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI_Load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;Backend;CacheMisses", + "MetricGroup": "Backend;CacheMisses;Mem", "MetricName": "L2MPKI" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses;Offcore", + "MetricGroup": "CacheMisses;Mem;Offcore", "MetricName": "L2MPKI_All" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2MPKI_Load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", - "MetricExpr": "1000 * ( L2_RQSTS.REFERENCES - L2_RQSTS.MISS ) / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricExpr": "1000 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_All" }, { 
"BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_Load" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L3MPKI" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1000 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "FB_HPKI" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING ) / ( 2 * CPU_CLK_UNHALTED.DISTRIBUTED )", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * CORE_CLKS)", "MetricGroup": "Mem;MemoryTLB", "MetricName": "Page_Walks_Utilization" }, @@ -346,25 +1060,25 @@ }, { "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricExpr": "L1D_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L1D_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricExpr": "L2_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L2_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill 
bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L3_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Access_BW", "MetricGroup": "Mem;MemoryBW;Offcore", "MetricName": "L3_Cache_Access_BW_1T" }, @@ -376,40 +1090,40 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / 1000000000 ) / duration_time", + "MetricExpr": "((1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1000000000) / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "GFLOPs", "PublicDescription": "Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization" }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0", - "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / CPU_CLK_UNHALTED.DISTRIBUTED", + "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / CORE_CLKS", "MetricGroup": "Power", "MetricName": "Power_License0_Utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes." }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1", - "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / CPU_CLK_UNHALTED.DISTRIBUTED", + "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / CORE_CLKS", "MetricGroup": "Power", "MetricName": "Power_License1_Utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions." }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)", - "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / CPU_CLK_UNHALTED.DISTRIBUTED", + "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / CORE_CLKS", "MetricGroup": "Power", "MetricName": "Power_License2_Utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). 
This includes high current AVX 512-bit instructions." @@ -434,7 +1148,7 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * ( arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@ ) / 1000000 / duration_time / 1000", + "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1000000 / duration_time / 1000", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use" }, -- GitLab From d7184d9487e9791a89f3304bd015da9569b1099b Mon Sep 17 00:00:00 2001 From: Ian Rogers <irogers@google.com> Date: Mon, 3 Oct 2022 19:16:12 -0700 Subject: [PATCH 1432/2223] perf vendor events: Update Intel broadwellde Events remain at v23, and the metrics are based on TMA 4.4 full. Use script at: https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py with updates at: https://github.com/captain5050/event-converter-for-linux-perf Updates include: - Switch for core metrics from BDX to BDW. - Switch for Page_Walks_Utilization to BDX version. - Rename of topdown TMA metrics from Frontend_Bound to tma_frontend_bound. - Addition of all 6 levels of TMA metrics. Child metrics are placed in a group named after their parent allowing children of a metric to be easily measured using the metric name with a _group suffix. - ## and ##? operators are correctly expanded. - The locate-with column is added to the long description describing a sampling event. - Metrics are written in terms of other metrics to reduce the expression size and increase readability. 
Tested with 'perf test': 10: PMU events : 10.1: PMU event table sanity : Ok 10.2: PMU event map aliases : Ok 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok Signed-off-by: Ian Rogers <irogers@google.com> Cc: Ahmad Yasin <ahmad.yasin@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Florian Fischer <florian.fischer@muhq.space> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Miaoqian Lin <linmq006@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Samantha Alt <samantha.alt@intel.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Richter <tmricht@linux.ibm.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221004021612.325521-24-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../arch/x86/broadwellde/bdwde-metrics.json | 711 ++++++++++++++---- 1 file changed, 577 insertions(+), 134 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json index b6fdf5ba2c9ae..5a074cf7c77da 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json @@ -1,64 +1,556 @@ [ { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": 
"Frontend_Bound", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound." + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS", + "MetricGroup": "PGO;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Frontend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / SLOTS", + "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_latency", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. 
Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", + "MetricExpr": "ICACHE.IFDATA_STALL / CLKS", + "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / CLKS", + "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_branch_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. 
Sample with: BR_MISP_RETIRED.ALL_BRANCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", + "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES * tma_branch_resteers / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY)", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", + "MetricExpr": "MACHINE_CLEARS.COUNT * tma_branch_resteers / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY)", + "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_clears_resteers", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", + "MetricExpr": "tma_branch_resteers - tma_mispredicts_resteers - tma_clears_resteers", + "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group", + "MetricName": "tma_unknown_branches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). 
Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS", + "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_dsb_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "ILD_STALL.LCP / CLKS", + "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_lcp", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "2 * IDQ.MS_SWITCHES / CLKS", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group", + "MetricName": "tma_ms_switches", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). 
Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "MetricExpr": "tma_frontend_bound - tma_fetch_latency", + "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_fetch_bandwidth", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. 
For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / CORE_CLKS / 2", + "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group", + "MetricName": "tma_dsb", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Bad_Speculation", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example." + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. 
This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Bad_Speculation_SMT", - "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation", + "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. 
These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts", + "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT", + "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) )", - "MetricGroup": "TopdownL1", - "MetricName": "Backend_Bound", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. 
For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound." + "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "((CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB) / (CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if (IPC > 1.8) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB)) * tma_backend_bound", + "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. 
This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", + "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_LOAD_MISSES.WALK_COMPLETED) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_dtlb_load", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. 
This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "(MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / CLKS", + "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. 
Sample with: MEM_INST_RETIRED.LOCK_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS", + "MetricGroup": "TopdownL4;tma_l1_bound_group", + "MetricName": "tma_4k_aliasing", + "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "Load_Miss_Real_Latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CLKS", + "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. 
The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", + "MetricExpr": "(MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_MISS / CLKS", + "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. 
Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", + "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS)))) / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_contested_accesses", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. 
Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / CLKS", + "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / CLKS", + "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. 
L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "((OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2) if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / CORE_CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", + "MetricExpr": "(1 - (MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS))) * CYCLE_ACTIVITY.STALLS_L2_MISS / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_dram_bound", + "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. 
Sample with: MEM_LOAD_RETIRED.L3_MISS_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_bandwidth", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group", + "MetricName": "tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). 
This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "RESOURCE_STALLS.SB / CLKS", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_INST_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "((L2_RQSTS.RFO_HIT * 9 * (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES))) + (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS", + "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group", + "MetricName": "tma_store_latency", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. 
contention on L1D fill-buffer entries - see FB_Full)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "60 * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM / CLKS", + "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / CORE_CLKS", + "MetricGroup": "TopdownL4;tma_store_bound_group", + "MetricName": "tma_split_stores", + "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_STORE_MISSES.WALK_COMPLETED) / CLKS", + "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. 
Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck", + "MetricExpr": "tma_backend_bound - tma_memory_bound", + "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) )", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Backend_Bound_SMT", - "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. 
Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "ARITH.FPU_DIV_ACTIVE / CORE_CLKS", + "MetricGroup": "TopdownL3;tma_core_bound_group", + "MetricName": "tma_divider", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "((CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if (IPC > 1.8) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB) - RESOURCE_STALLS.SB - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CLKS", + "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). 
Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@) / 2 if #SMT_on else (CYCLE_ACTIVITY.STALLS_TOTAL - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else 0) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or over oversubscribing a particular hardware resource. 
In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful. Sample with: EXE_ACTIVITY.1_PORTS_UTIL", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Sample with: EXE_ACTIVITY.2_PORTS_UTIL", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / CORE_CLKS", + "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). 
Sample with: UOPS_EXECUTED.CYCLES_GE_3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / (4 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED.PORT_0", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / CORE_CLKS", + "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_0", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED.PORT_1", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_1", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_5", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED.PORT_6", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_alu_op_utilization_group", + "MetricName": "tma_port_6", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution 
port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3_10", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * CORE_CLKS)", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_load_op_utilization_group", + "MetricName": "tma_port_3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations Sample with: UOPS_DISPATCHED.PORT_7_8", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data)", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_4", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address)", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / CORE_CLKS", + "MetricGroup": "TopdownL6;tma_store_op_utilization_group", + "MetricName": "tma_port_7", + 
"ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "TopdownL1", - "MetricName": "Retiring", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. " + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / SLOTS", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS", + "ScaleUnit": "100%" }, { - "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. 
SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))", - "MetricGroup": "TopdownL1_SMT", - "MetricName": "Retiring_SMT", - "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. SMT version; use when SMT is enabled and measuring per logical CPU." + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "tma_retiring - tma_heavy_operations", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. 
Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", + "MetricGroup": "HPC;TopdownL3;tma_light_operations_group", + "MetricName": "tma_fp_arith", + "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "INST_RETIRED.X87 * UPI / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_scalar", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group", + "MetricName": "tma_fp_vector", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_128b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS", + "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group", + "MetricName": "tma_fp_vector_256b", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. 
May overcount due to FMA double counting.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences", + "MetricExpr": "tma_microcode_sequencer", + "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS", + "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group", + "MetricName": "tma_microcode_sequencer", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: UOPS_RETIRED.MS", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / SLOTS", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. 
Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. Sample with: ASSISTS.ANY", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", + "MetricGroup": "TopdownL4;tma_microcode_sequencer_group", + "MetricName": "tma_cisc", + "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. 
Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.", + "ScaleUnit": "100%" }, { "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "INST_RETIRED.ANY / CLKS", "MetricGroup": "Ret;Summary", "MetricName": "IPC" }, @@ -76,8 +568,8 @@ }, { "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / (INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD)", - "MetricGroup": "Pipeline;Mem", + "MetricExpr": "1 / IPC", + "MetricGroup": "Mem;Pipeline", "MetricName": "CPI" }, { @@ -88,16 +580,10 @@ }, { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "TmaL1", + "MetricExpr": "4 * CORE_CLKS", + "MetricGroup": "tma_L1_group", "MetricName": "SLOTS" }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "TmaL1_SMT", - "MetricName": "SLOTS_SMT" - }, { "BriefDescription": "The ratio of Executed- by Issued-Uops", "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", @@ -107,51 +593,32 @@ }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;SMT;TmaL1", + "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS", + "MetricGroup": "Ret;SMT;tma_L1_group", "MetricName": "CoreIPC" }, - { - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;SMT;TmaL1_SMT", - 
"MetricName": "CoreIPC_SMT" - }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Ret;Flops", + "MetricExpr": "(1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / CORE_CLKS", + "MetricGroup": "Flops;Ret", "MetricName": "FLOPc" }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Ret;Flops_SMT", - "MetricName": "FLOPc_SMT" - }, { "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * CPU_CLK_UNHALTED.THREAD )", + "MetricExpr": "((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE 
+ FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)) / (2 * CORE_CLKS)", "MetricGroup": "Cor;Flops;HPC", "MetricName": "FP_Arith_Utilization", "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, - { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). SMT version; use when SMT is enabled and measuring per logical CPU.", - "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) )", - "MetricGroup": "Cor;Flops;HPC_SMT", - "MetricName": "FP_Arith_Utilization_SMT", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common). SMT version; use when SMT is enabled and measuring per logical CPU." 
- }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", + "MetricExpr": "UOPS_EXECUTED.THREAD / ((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "ILP" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", + "MetricExpr": "((CPU_CLK_UNHALTED.THREAD / 2) * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK)) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2) if #SMT_on else CLKS", "MetricGroup": "SMT", "MetricName": "CORE_CLKS" }, @@ -193,13 +660,13 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "IpFLOP" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) 
+ (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) )", + "MetricExpr": "INST_RETIRED.ANY / ((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE))", "MetricGroup": "Flops;InsType", "MetricName": "IpArith", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." @@ -220,22 +687,22 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX128", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "IpArith_AVX256", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
}, { - "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST", + "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST", "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1", + "MetricGroup": "Summary;tma_L1_group", "MetricName": "Instructions" }, { @@ -252,7 +719,7 @@ }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", - "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )", + "MetricExpr": "IDQ.DSB_UOPS / ((IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS))", "MetricGroup": "DSB;Fed;FetchBW", "MetricName": "DSB_Coverage" }, @@ -264,83 +731,71 @@ }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) ) * (4 * CPU_CLK_UNHALTED.THREAD) / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": " (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BrMispredicts", "MetricName": "Branch_Misprediction_Cost" }, - { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch 
misprediction (retired JEClear)", - "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) ) * (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts_SMT", - "MetricName": "Branch_Misprediction_Cost_SMT" - }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb)", "MetricGroup": "Mem;MemoryBound;MemoryLat", "MetricName": "Load_Miss_Real_Latency" }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. 
Per-Logical Processor)", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBound;MemoryBW", + "MetricGroup": "Mem;MemoryBW;MemoryBound", "MetricName": "MLP" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L1MPKI" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;Backend;CacheMisses", + "MetricGroup": "Backend;CacheMisses;Mem", "MetricName": "L2MPKI" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses;Offcore", + "MetricGroup": "CacheMisses;Mem;Offcore", "MetricName": "L2MPKI_All" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2MPKI_Load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", - "MetricExpr": "1000 * ( L2_RQSTS.REFERENCES - L2_RQSTS.MISS ) / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricExpr": "1000 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_All" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L2HPKI_Load" }, { 
"BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "Mem;CacheMisses", + "MetricGroup": "CacheMisses;Mem", "MetricName": "L3MPKI" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricConstraint": "NO_NMI_WATCHDOG", - "MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / ( 2 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) if #core_wide < 1 else ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD) )", "MetricGroup": "Mem;MemoryTLB", "MetricName": "Page_Walks_Utilization" }, - { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )", - "MetricGroup": "Mem;MemoryTLB_SMT", - "MetricName": "Page_Walks_Utilization_SMT" - }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / 
duration_time", @@ -361,19 +816,19 @@ }, { "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)", + "MetricExpr": "L1D_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L1D_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)", + "MetricExpr": "L2_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L2_Cache_Fill_BW_1T" }, { "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)", + "MetricExpr": "L3_Cache_Fill_BW", "MetricGroup": "Mem;MemoryBW", "MetricName": "L3_Cache_Fill_BW_1T" }, @@ -391,26 +846,26 @@ }, { "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time", - "MetricGroup": "Summary;Power", + "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time", + "MetricGroup": "Power;Summary", "MetricName": "Average_Frequency" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 ) / duration_time", + "MetricExpr": "((1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1000000000) / duration_time", "MetricGroup": 
"Cor;Flops;HPC", "MetricName": "GFLOPs", "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "Turbo_Utilization" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", - "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0", + "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0", "MetricGroup": "SMT", "MetricName": "SMT_2T_Utilization" }, @@ -428,33 +883,21 @@ }, { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "( 64 * ( uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@ ) / 1000000000 ) / duration_time", + "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1000000 / duration_time / 1000", "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "DRAM_BW_Use" }, { - "BriefDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches", - "MetricExpr": "1000000000 * ( cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182@ / cbox@event\\=0x35\\,umask\\=0x3\\,filter_opc\\=0x182@ ) / ( cbox_0@event\\=0x0@ / duration_time )", - "MetricGroup": "Mem;MemoryLat;SoC", - "MetricName": "MEM_Read_Latency" - }, - { - "BriefDescription": "Average number of parallel data read requests to external memory. 
Accounts for demand loads and L1/L2 prefetches", - "MetricExpr": "cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182@ / cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182\\,thresh\\=1@", - "MetricGroup": "Mem;MemoryBW;SoC", - "MetricName": "MEM_Parallel_Reads" - }, - { - "BriefDescription": "Socket actual clocks when any core is active on that socket", - "MetricExpr": "cbox_0@event\\=0x0@", - "MetricGroup": "SoC", - "MetricName": "Socket_CLKS" + "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", + "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / arb@event\\=0x81\\,umask\\=0x1@", + "MetricGroup": "Mem;SoC", + "MetricName": "MEM_Request_Latency" }, { - "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "cbox_0@event\\=0x0@ / #num_dies / duration_time / 1000000000", - "MetricGroup": "SoC", - "MetricName": "UNCORE_FREQ" + "BriefDescription": "Average number of parallel requests to external memory. Accounts for all requests", + "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / arb@event\\=0x81\\,umask\\=0x1@", + "MetricGroup": "Mem;SoC", + "MetricName": "MEM_Parallel_Requests" }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", -- GitLab From 06b552ee378193a3a67d7124f3f0e76989881fed Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Mon, 3 Oct 2022 13:46:43 -0700 Subject: [PATCH 1433/2223] libperf: Populate system-wide evsel maps Setting proper cpu and thread maps for system wide evsels regardless of user requested cpu in __perf_evlist__propagate_maps(). Those evsels need to be active on all cpus always. Do it in the libperf so that we can guarantee it has proper maps. 
Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20221003204647.1481128-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/lib/perf/evlist.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c index 0e7347d1583dc..19eaea99aa4f1 100644 --- a/tools/lib/perf/evlist.c +++ b/tools/lib/perf/evlist.c @@ -40,11 +40,11 @@ static void __perf_evlist__propagate_maps(struct perf_evlist *evlist, * We already have cpus for evsel (via PMU sysfs) so * keep it, if there's no target cpu list defined. */ - if (!evsel->own_cpus || - (!evsel->system_wide && evlist->has_user_cpus) || - (!evsel->system_wide && - !evsel->requires_cpu && - perf_cpu_map__empty(evlist->user_requested_cpus))) { + if (evsel->system_wide) { + perf_cpu_map__put(evsel->cpus); + evsel->cpus = perf_cpu_map__new(NULL); + } else if (!evsel->own_cpus || evlist->has_user_cpus || + (!evsel->requires_cpu && perf_cpu_map__empty(evlist->user_requested_cpus))) { perf_cpu_map__put(evsel->cpus); evsel->cpus = perf_cpu_map__get(evlist->user_requested_cpus); } else if (evsel->cpus != evsel->own_cpus) { @@ -52,7 +52,10 @@ static void __perf_evlist__propagate_maps(struct perf_evlist *evlist, evsel->cpus = perf_cpu_map__get(evsel->own_cpus); } - if (!evsel->system_wide) { + if (evsel->system_wide) { + perf_thread_map__put(evsel->threads); + evsel->threads = perf_thread_map__new_dummy(); + } else { perf_thread_map__put(evsel->threads); evsel->threads = perf_thread_map__get(evlist->threads); } -- GitLab From 7e2450bb756c84cdc2b2668b1036ac105453ed5f Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> 
Date: Mon, 3 Oct 2022 13:46:44 -0700 Subject: [PATCH 1434/2223] libperf: Propagate maps only if necessary The current code propagates evsel's cpu map settings to evlist when it's added to an evlist. But the evlist->all_cpus and each evsel's cpus will be updated in perf_evlist__set_maps() later. No need to do it before evlist's cpus are set actually. In fact it discards this intermediate all_cpus map at the beginning of perf_evlist__set_maps(). Let's not do this. It's only needed when an evsel is added after the evlist cpu/thread maps are set. Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20221003204647.1481128-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/lib/perf/evlist.c | 11 ++++------- tools/lib/perf/include/internal/evlist.h | 1 + 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c index 19eaea99aa4f1..61b637f29b827 100644 --- a/tools/lib/perf/evlist.c +++ b/tools/lib/perf/evlist.c @@ -67,9 +67,7 @@ static void perf_evlist__propagate_maps(struct perf_evlist *evlist) { struct perf_evsel *evsel; - /* Recomputing all_cpus, so start with a blank slate.
*/ - perf_cpu_map__put(evlist->all_cpus); - evlist->all_cpus = NULL; + evlist->needs_map_propagation = true; perf_evlist__for_each_evsel(evlist, evsel) __perf_evlist__propagate_maps(evlist, evsel); @@ -81,7 +79,9 @@ void perf_evlist__add(struct perf_evlist *evlist, evsel->idx = evlist->nr_entries; list_add_tail(&evsel->node, &evlist->entries); evlist->nr_entries += 1; - __perf_evlist__propagate_maps(evlist, evsel); + + if (evlist->needs_map_propagation) + __perf_evlist__propagate_maps(evlist, evsel); } void perf_evlist__remove(struct perf_evlist *evlist, @@ -177,9 +177,6 @@ void perf_evlist__set_maps(struct perf_evlist *evlist, evlist->threads = perf_thread_map__get(threads); } - if (!evlist->all_cpus && cpus) - evlist->all_cpus = perf_cpu_map__get(cpus); - perf_evlist__propagate_maps(evlist); } diff --git a/tools/lib/perf/include/internal/evlist.h b/tools/lib/perf/include/internal/evlist.h index 6f89aec3e6084..850f07070036c 100644 --- a/tools/lib/perf/include/internal/evlist.h +++ b/tools/lib/perf/include/internal/evlist.h @@ -19,6 +19,7 @@ struct perf_evlist { int nr_entries; int nr_groups; bool has_user_cpus; + bool needs_map_propagation; /** * The cpus passed from the command line or all online CPUs by * default. -- GitLab From 60ea006f72512fd7c36f16cdbe91f4fc284f8115 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Mon, 3 Oct 2022 13:46:45 -0700 Subject: [PATCH 1435/2223] perf tools: Get rid of evlist__add_on_all_cpus() The cpu and thread maps are properly handled in libperf now. No need to do it in the perf tools anymore. Let's remove the logic. 
Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20221003204647.1481128-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/evlist.c | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index fcfe5bcc0bcff..dcf57b271ff13 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -268,28 +268,6 @@ int evlist__add_dummy(struct evlist *evlist) return 0; } -static void evlist__add_on_all_cpus(struct evlist *evlist, struct evsel *evsel) -{ - evsel->core.system_wide = true; - - /* - * All CPUs. - * - * Note perf_event_open() does not accept CPUs that are not online, so - * in fact this CPU list will include only all online CPUs. 
- */ - perf_cpu_map__put(evsel->core.own_cpus); - evsel->core.own_cpus = perf_cpu_map__new(NULL); - perf_cpu_map__put(evsel->core.cpus); - evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus); - - /* No threads */ - perf_thread_map__put(evsel->core.threads); - evsel->core.threads = perf_thread_map__new_dummy(); - - evlist__add(evlist, evsel); -} - struct evsel *evlist__add_aux_dummy(struct evlist *evlist, bool system_wide) { struct evsel *evsel = evlist__dummy_event(evlist); @@ -302,14 +280,11 @@ struct evsel *evlist__add_aux_dummy(struct evlist *evlist, bool system_wide) evsel->core.attr.exclude_hv = 1; evsel->core.attr.freq = 0; evsel->core.attr.sample_period = 1; + evsel->core.system_wide = system_wide; evsel->no_aux_samples = true; evsel->name = strdup("dummy:u"); - if (system_wide) - evlist__add_on_all_cpus(evlist, evsel); - else - evlist__add(evlist, evsel); - + evlist__add(evlist, evsel); return evsel; } -- GitLab From 182bb594e0678b3ceac99f4ec3daa5d22ea3d0ce Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Mon, 3 Oct 2022 13:46:46 -0700 Subject: [PATCH 1436/2223] perf tools: Add evlist__add_sched_switch() Add a helper to create a system-wide sched_switch event. One merit is that it sets the system-wide bit before adding it to evlist so that the libperf can handle the cpu and thread maps correctly.
Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20221003204647.1481128-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/arch/x86/util/intel-pt.c | 15 +++++---------- tools/perf/tests/switch-tracking.c | 15 +++++---------- tools/perf/util/evlist.c | 17 +++++++++++++++++ tools/perf/util/evlist.h | 1 + 4 files changed, 28 insertions(+), 20 deletions(-) diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 13933020a79eb..793b35f2221aa 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -11,6 +11,7 @@ #include <linux/bitops.h> #include <linux/log2.h> #include <linux/zalloc.h> +#include <linux/err.h> #include <cpuid.h> #include "../../../util/session.h" @@ -426,20 +427,14 @@ static int intel_pt_track_switches(struct evlist *evlist) if (!evlist__can_select_event(evlist, sched_switch)) return -EPERM; - err = parse_event(evlist, sched_switch); - if (err) { - pr_debug2("%s: failed to parse %s, error %d\n", + evsel = evlist__add_sched_switch(evlist, true); + if (IS_ERR(evsel)) { + err = PTR_ERR(evsel); + pr_debug2("%s: failed to create %s, error = %d\n", __func__, sched_switch, err); return err; } - evsel = evlist__last(evlist); - - evsel__set_sample_bit(evsel, CPU); - evsel__set_sample_bit(evsel, TIME); - - evsel->core.system_wide = true; - evsel->no_aux_samples = true; evsel->immediate = true; return 0; diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index 2d46af9ef9357..87f565c7f650d 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -6,6 +6,7 @@ #include <time.h> 
#include <stdlib.h> #include <linux/zalloc.h> +#include <linux/err.h> #include <perf/cpumap.h> #include <perf/evlist.h> #include <perf/mmap.h> @@ -398,19 +399,13 @@ static int test__switch_tracking(struct test_suite *test __maybe_unused, int sub goto out; } - err = parse_event(evlist, sched_switch); - if (err) { - pr_debug("Failed to parse event %s\n", sched_switch); + switch_evsel = evlist__add_sched_switch(evlist, true); + if (IS_ERR(switch_evsel)) { + err = PTR_ERR(switch_evsel); + pr_debug("Failed to create event %s\n", sched_switch); goto out_err; } - switch_evsel = evlist__last(evlist); - - evsel__set_sample_bit(switch_evsel, CPU); - evsel__set_sample_bit(switch_evsel, TIME); - - switch_evsel->core.system_wide = true; - switch_evsel->no_aux_samples = true; switch_evsel->immediate = true; /* Test moving an event to the front */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index dcf57b271ff13..6612b00949e70 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -288,6 +288,23 @@ struct evsel *evlist__add_aux_dummy(struct evlist *evlist, bool system_wide) return evsel; } +struct evsel *evlist__add_sched_switch(struct evlist *evlist, bool system_wide) +{ + struct evsel *evsel = evsel__newtp_idx("sched", "sched_switch", 0); + + if (IS_ERR(evsel)) + return evsel; + + evsel__set_sample_bit(evsel, CPU); + evsel__set_sample_bit(evsel, TIME); + + evsel->core.system_wide = system_wide; + evsel->no_aux_samples = true; + + evlist__add(evlist, evsel); + return evsel; +}; + int evlist__add_attrs(struct evlist *evlist, struct perf_event_attr *attrs, size_t nr_attrs) { struct evsel *evsel, *n; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 9d967fe3953a8..16734c6756b3c 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -127,6 +127,7 @@ static inline struct evsel *evlist__add_dummy_on_all_cpus(struct evlist *evlist) { return evlist__add_aux_dummy(evlist, true); } +struct evsel 
*evlist__add_sched_switch(struct evlist *evlist, bool system_wide); int evlist__add_sb_event(struct evlist *evlist, struct perf_event_attr *attr, evsel__sb_cb_t cb, void *data); -- GitLab From 1337b9dcb03b1c81448eed1b70296148f62730b8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Mon, 3 Oct 2022 13:46:47 -0700 Subject: [PATCH 1437/2223] perf tools: Remove special handling of system-wide evsel For system-wide evsels, the thread map should be dummy - i.e. it has a single entry of -1. But the code guarantees such a thread map, so no need to handle it specially. No functional change intended. Reviewed-by: Adrian Hunter <adrian.hunter@intel.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20221003204647.1481128-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/lib/perf/evsel.c | 3 --- tools/perf/builtin-script.c | 3 --- tools/perf/util/evsel.c | 12 ++---------- tools/perf/util/stat.c | 3 --- 4 files changed, 2 insertions(+), 19 deletions(-) diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c index 8ce5bbd096666..8b51b008a81f1 100644 --- a/tools/lib/perf/evsel.c +++ b/tools/lib/perf/evsel.c @@ -515,9 +515,6 @@ int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads) if (ncpus == 0 || nthreads == 0) return 0; - if (evsel->system_wide) - nthreads = 1; - evsel->sample_id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id)); if (evsel->sample_id == NULL) return -ENOMEM; diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 886f53cfa2574..7fa467ed91dc7 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2243,9 +2243,6 @@ static void __process_stat(struct evsel *counter, 
u64 tstamp) struct perf_cpu cpu; static int header_printed; - if (counter->core.system_wide) - nthreads = 1; - if (!header_printed) { printf("%3s %8s %15s %15s %15s %15s %s\n", "CPU", "THREAD", "VAL", "ENA", "RUN", "TIME", "EVENT"); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index a27092339b81a..76605fde35078 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1813,7 +1813,7 @@ static struct perf_thread_map *empty_thread_map; static int __evsel__prepare_open(struct evsel *evsel, struct perf_cpu_map *cpus, struct perf_thread_map *threads) { - int nthreads; + int nthreads = perf_thread_map__nr(threads); if ((perf_missing_features.write_backward && evsel->core.attr.write_backward) || (perf_missing_features.aux_output && evsel->core.attr.aux_output)) @@ -1839,11 +1839,6 @@ static int __evsel__prepare_open(struct evsel *evsel, struct perf_cpu_map *cpus, threads = empty_thread_map; } - if (evsel->core.system_wide) - nthreads = 1; - else - nthreads = threads->nr; - if (evsel->core.fd == NULL && perf_evsel__alloc_fd(&evsel->core, perf_cpu_map__nr(cpus), nthreads) < 0) return -ENOMEM; @@ -2061,10 +2056,7 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, if (threads == NULL) threads = empty_thread_map; - if (evsel->core.system_wide) - nthreads = 1; - else - nthreads = threads->nr; + nthreads = perf_thread_map__nr(threads); if (evsel->cgrp) pid = evsel->cgrp->fd; diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index ce5e9e372fc4c..cef943377ad7f 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -420,9 +420,6 @@ static int process_counter_maps(struct perf_stat_config *config, int ncpus = evsel__nr_cpus(counter); int idx, thread; - if (counter->core.system_wide) - nthreads = 1; - for (thread = 0; thread < nthreads; thread++) { for (idx = 0; idx < ncpus; idx++) { if (process_counter_values(config, counter, idx, thread, -- GitLab From 66b76e30ee36c0c58836bf91b8602f5f2c94093a Mon Sep 
17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Fri, 30 Sep 2022 13:21:04 -0700 Subject: [PATCH 1438/2223] perf stat: Convert perf_stat_evsel.res_stats array It uses only one member, no need to have it as an array. Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20220930202110.845199-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/stat-display.c | 2 +- tools/perf/util/stat.c | 10 +++------- tools/perf/util/stat.h | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index b82844cb0ce77..234491f43c36b 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -67,7 +67,7 @@ static void print_noise(struct perf_stat_config *config, return; ps = evsel->stats; - print_noise_pct(config, stddev_stats(&ps->res_stats[0]), avg); + print_noise_pct(config, stddev_stats(&ps->res_stats), avg); } static void print_cgroup(struct perf_stat_config *config, struct evsel *evsel) diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index cef943377ad7f..8d27ba77f8ab0 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -132,12 +132,9 @@ static void perf_stat_evsel_id_init(struct evsel *evsel) static void evsel__reset_stat_priv(struct evsel *evsel) { - int i; struct perf_stat_evsel *ps = evsel->stats; - for (i = 0; i < 3; i++) - init_stats(&ps->res_stats[i]); - + init_stats(&ps->res_stats); perf_stat_evsel_id_init(evsel); } @@ -437,7 +434,7 @@ int 
perf_stat_process_counter(struct perf_stat_config *config, struct perf_counts_values *aggr = &counter->counts->aggr; struct perf_stat_evsel *ps = counter->stats; u64 *count = counter->counts->aggr.values; - int i, ret; + int ret; aggr->val = aggr->ena = aggr->run = 0; @@ -455,8 +452,7 @@ int perf_stat_process_counter(struct perf_stat_config *config, evsel__compute_deltas(counter, -1, -1, aggr); perf_counts_values__scale(aggr, config->scale, &counter->counts->scaled); - for (i = 0; i < 3; i++) - update_stats(&ps->res_stats[i], count[i]); + update_stats(&ps->res_stats, *count); if (verbose > 0) { fprintf(config->output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 72713b344b792..3eba38a1a1499 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -43,7 +43,7 @@ enum perf_stat_evsel_id { }; struct perf_stat_evsel { - struct stats res_stats[3]; + struct stats res_stats; enum perf_stat_evsel_id id; u64 *group_data; }; -- GitLab From 429b8e84517b0ccdb3feace4b264c74ab61b16b0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Fri, 30 Sep 2022 13:21:05 -0700 Subject: [PATCH 1439/2223] perf stat: Don't call perf_stat_evsel_id_init() repeatedly evsel__reset_stat_priv() is called more than once if user gave -r option for multiple runs. But it doesn't need to re-initialize the id. 
Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20220930202110.845199-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 8d27ba77f8ab0..7e9543cff31cf 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -135,7 +135,6 @@ static void evsel__reset_stat_priv(struct evsel *evsel) struct perf_stat_evsel *ps = evsel->stats; init_stats(&ps->res_stats); - perf_stat_evsel_id_init(evsel); } static int evsel__alloc_stat_priv(struct evsel *evsel) @@ -143,6 +142,7 @@ static int evsel__alloc_stat_priv(struct evsel *evsel) evsel->stats = zalloc(sizeof(struct perf_stat_evsel)); if (evsel->stats == NULL) return -ENOMEM; + perf_stat_evsel_id_init(evsel); evsel__reset_stat_priv(evsel); return 0; } -- GitLab From dfca2d692d035a276811d050cb0c4e4e825b3415 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Fri, 30 Sep 2022 13:21:06 -0700 Subject: [PATCH 1440/2223] perf stat: Rename saved_value->cpu_map_idx The cpu_map_idx field is just to differentiate values from other entries. It doesn't need to be strictly a cpu map index. Actually we can pass a thread map index or an aggr map index. So rename the fields first. No functional change intended.
Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20220930202110.845199-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/stat-shadow.c | 308 +++++++++++++++++----------------- tools/perf/util/stat.h | 6 +- 2 files changed, 157 insertions(+), 157 deletions(-) diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index b5cedd37588fb..48634b95669e8 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -33,7 +33,7 @@ struct saved_value { struct evsel *evsel; enum stat_type type; int ctx; - int cpu_map_idx; + int map_idx; /* cpu map index */ struct cgroup *cgrp; struct runtime_stat *stat; struct stats stats; @@ -48,8 +48,8 @@ static int saved_value_cmp(struct rb_node *rb_node, const void *entry) rb_node); const struct saved_value *b = entry; - if (a->cpu_map_idx != b->cpu_map_idx) - return a->cpu_map_idx - b->cpu_map_idx; + if (a->map_idx != b->map_idx) + return a->map_idx - b->map_idx; /* * Previously the rbtree was used to link generic metrics. 
@@ -106,7 +106,7 @@ static void saved_value_delete(struct rblist *rblist __maybe_unused, } static struct saved_value *saved_value_lookup(struct evsel *evsel, - int cpu_map_idx, + int map_idx, bool create, enum stat_type type, int ctx, @@ -116,7 +116,7 @@ static struct saved_value *saved_value_lookup(struct evsel *evsel, struct rblist *rblist; struct rb_node *nd; struct saved_value dm = { - .cpu_map_idx = cpu_map_idx, + .map_idx = map_idx, .evsel = evsel, .type = type, .ctx = ctx, @@ -215,10 +215,10 @@ struct runtime_stat_data { static void update_runtime_stat(struct runtime_stat *st, enum stat_type type, - int cpu_map_idx, u64 count, + int map_idx, u64 count, struct runtime_stat_data *rsd) { - struct saved_value *v = saved_value_lookup(NULL, cpu_map_idx, true, type, + struct saved_value *v = saved_value_lookup(NULL, map_idx, true, type, rsd->ctx, st, rsd->cgrp); if (v) @@ -231,7 +231,7 @@ static void update_runtime_stat(struct runtime_stat *st, * instruction rates, etc: */ void perf_stat__update_shadow_stats(struct evsel *counter, u64 count, - int cpu_map_idx, struct runtime_stat *st) + int map_idx, struct runtime_stat *st) { u64 count_ns = count; struct saved_value *v; @@ -243,88 +243,88 @@ void perf_stat__update_shadow_stats(struct evsel *counter, u64 count, count *= counter->scale; if (evsel__is_clock(counter)) - update_runtime_stat(st, STAT_NSECS, cpu_map_idx, count_ns, &rsd); + update_runtime_stat(st, STAT_NSECS, map_idx, count_ns, &rsd); else if (evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) - update_runtime_stat(st, STAT_CYCLES, cpu_map_idx, count, &rsd); + update_runtime_stat(st, STAT_CYCLES, map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, CYCLES_IN_TX)) - update_runtime_stat(st, STAT_CYCLES_IN_TX, cpu_map_idx, count, &rsd); + update_runtime_stat(st, STAT_CYCLES_IN_TX, map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, TRANSACTION_START)) - update_runtime_stat(st, STAT_TRANSACTION, cpu_map_idx, count, &rsd); + 
update_runtime_stat(st, STAT_TRANSACTION, map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, ELISION_START)) - update_runtime_stat(st, STAT_ELISION, cpu_map_idx, count, &rsd); + update_runtime_stat(st, STAT_ELISION, map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS)) update_runtime_stat(st, STAT_TOPDOWN_TOTAL_SLOTS, - cpu_map_idx, count, &rsd); + map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED)) update_runtime_stat(st, STAT_TOPDOWN_SLOTS_ISSUED, - cpu_map_idx, count, &rsd); + map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED)) update_runtime_stat(st, STAT_TOPDOWN_SLOTS_RETIRED, - cpu_map_idx, count, &rsd); + map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES)) update_runtime_stat(st, STAT_TOPDOWN_FETCH_BUBBLES, - cpu_map_idx, count, &rsd); + map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES)) update_runtime_stat(st, STAT_TOPDOWN_RECOVERY_BUBBLES, - cpu_map_idx, count, &rsd); + map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_RETIRING)) update_runtime_stat(st, STAT_TOPDOWN_RETIRING, - cpu_map_idx, count, &rsd); + map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_BAD_SPEC)) update_runtime_stat(st, STAT_TOPDOWN_BAD_SPEC, - cpu_map_idx, count, &rsd); + map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_FE_BOUND)) update_runtime_stat(st, STAT_TOPDOWN_FE_BOUND, - cpu_map_idx, count, &rsd); + map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_BE_BOUND)) update_runtime_stat(st, STAT_TOPDOWN_BE_BOUND, - cpu_map_idx, count, &rsd); + map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_HEAVY_OPS)) update_runtime_stat(st, STAT_TOPDOWN_HEAVY_OPS, - cpu_map_idx, count, &rsd); + map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_BR_MISPREDICT)) update_runtime_stat(st, STAT_TOPDOWN_BR_MISPREDICT, - 
cpu_map_idx, count, &rsd); + map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_LAT)) update_runtime_stat(st, STAT_TOPDOWN_FETCH_LAT, - cpu_map_idx, count, &rsd); + map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_MEM_BOUND)) update_runtime_stat(st, STAT_TOPDOWN_MEM_BOUND, - cpu_map_idx, count, &rsd); + map_idx, count, &rsd); else if (evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) update_runtime_stat(st, STAT_STALLED_CYCLES_FRONT, - cpu_map_idx, count, &rsd); + map_idx, count, &rsd); else if (evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) update_runtime_stat(st, STAT_STALLED_CYCLES_BACK, - cpu_map_idx, count, &rsd); + map_idx, count, &rsd); else if (evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) - update_runtime_stat(st, STAT_BRANCHES, cpu_map_idx, count, &rsd); + update_runtime_stat(st, STAT_BRANCHES, map_idx, count, &rsd); else if (evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) - update_runtime_stat(st, STAT_CACHEREFS, cpu_map_idx, count, &rsd); + update_runtime_stat(st, STAT_CACHEREFS, map_idx, count, &rsd); else if (evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) - update_runtime_stat(st, STAT_L1_DCACHE, cpu_map_idx, count, &rsd); + update_runtime_stat(st, STAT_L1_DCACHE, map_idx, count, &rsd); else if (evsel__match(counter, HW_CACHE, HW_CACHE_L1I)) - update_runtime_stat(st, STAT_L1_ICACHE, cpu_map_idx, count, &rsd); + update_runtime_stat(st, STAT_L1_ICACHE, map_idx, count, &rsd); else if (evsel__match(counter, HW_CACHE, HW_CACHE_LL)) - update_runtime_stat(st, STAT_LL_CACHE, cpu_map_idx, count, &rsd); + update_runtime_stat(st, STAT_LL_CACHE, map_idx, count, &rsd); else if (evsel__match(counter, HW_CACHE, HW_CACHE_DTLB)) - update_runtime_stat(st, STAT_DTLB_CACHE, cpu_map_idx, count, &rsd); + update_runtime_stat(st, STAT_DTLB_CACHE, map_idx, count, &rsd); else if (evsel__match(counter, HW_CACHE, HW_CACHE_ITLB)) - update_runtime_stat(st, STAT_ITLB_CACHE, cpu_map_idx, count, 
&rsd); + update_runtime_stat(st, STAT_ITLB_CACHE, map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, SMI_NUM)) - update_runtime_stat(st, STAT_SMI_NUM, cpu_map_idx, count, &rsd); + update_runtime_stat(st, STAT_SMI_NUM, map_idx, count, &rsd); else if (perf_stat_evsel__is(counter, APERF)) - update_runtime_stat(st, STAT_APERF, cpu_map_idx, count, &rsd); + update_runtime_stat(st, STAT_APERF, map_idx, count, &rsd); if (counter->collect_stat) { - v = saved_value_lookup(counter, cpu_map_idx, true, STAT_NONE, 0, st, + v = saved_value_lookup(counter, map_idx, true, STAT_NONE, 0, st, rsd.cgrp); update_stats(&v->stats, count); if (counter->metric_leader) v->metric_total += count; } else if (counter->metric_leader) { v = saved_value_lookup(counter->metric_leader, - cpu_map_idx, true, STAT_NONE, 0, st, rsd.cgrp); + map_idx, true, STAT_NONE, 0, st, rsd.cgrp); v->metric_total += count; v->metric_other++; } @@ -466,12 +466,12 @@ void perf_stat__collect_metric_expr(struct evlist *evsel_list) } static double runtime_stat_avg(struct runtime_stat *st, - enum stat_type type, int cpu_map_idx, + enum stat_type type, int map_idx, struct runtime_stat_data *rsd) { struct saved_value *v; - v = saved_value_lookup(NULL, cpu_map_idx, false, type, rsd->ctx, st, rsd->cgrp); + v = saved_value_lookup(NULL, map_idx, false, type, rsd->ctx, st, rsd->cgrp); if (!v) return 0.0; @@ -479,12 +479,12 @@ static double runtime_stat_avg(struct runtime_stat *st, } static double runtime_stat_n(struct runtime_stat *st, - enum stat_type type, int cpu_map_idx, + enum stat_type type, int map_idx, struct runtime_stat_data *rsd) { struct saved_value *v; - v = saved_value_lookup(NULL, cpu_map_idx, false, type, rsd->ctx, st, rsd->cgrp); + v = saved_value_lookup(NULL, map_idx, false, type, rsd->ctx, st, rsd->cgrp); if (!v) return 0.0; @@ -492,7 +492,7 @@ static double runtime_stat_n(struct runtime_stat *st, } static void print_stalled_cycles_frontend(struct perf_stat_config *config, - int cpu_map_idx, double 
avg, + int map_idx, double avg, struct perf_stat_output_ctx *out, struct runtime_stat *st, struct runtime_stat_data *rsd) @@ -500,7 +500,7 @@ static void print_stalled_cycles_frontend(struct perf_stat_config *config, double total, ratio = 0.0; const char *color; - total = runtime_stat_avg(st, STAT_CYCLES, cpu_map_idx, rsd); + total = runtime_stat_avg(st, STAT_CYCLES, map_idx, rsd); if (total) ratio = avg / total * 100.0; @@ -515,7 +515,7 @@ static void print_stalled_cycles_frontend(struct perf_stat_config *config, } static void print_stalled_cycles_backend(struct perf_stat_config *config, - int cpu_map_idx, double avg, + int map_idx, double avg, struct perf_stat_output_ctx *out, struct runtime_stat *st, struct runtime_stat_data *rsd) @@ -523,7 +523,7 @@ static void print_stalled_cycles_backend(struct perf_stat_config *config, double total, ratio = 0.0; const char *color; - total = runtime_stat_avg(st, STAT_CYCLES, cpu_map_idx, rsd); + total = runtime_stat_avg(st, STAT_CYCLES, map_idx, rsd); if (total) ratio = avg / total * 100.0; @@ -534,7 +534,7 @@ static void print_stalled_cycles_backend(struct perf_stat_config *config, } static void print_branch_misses(struct perf_stat_config *config, - int cpu_map_idx, double avg, + int map_idx, double avg, struct perf_stat_output_ctx *out, struct runtime_stat *st, struct runtime_stat_data *rsd) @@ -542,7 +542,7 @@ static void print_branch_misses(struct perf_stat_config *config, double total, ratio = 0.0; const char *color; - total = runtime_stat_avg(st, STAT_BRANCHES, cpu_map_idx, rsd); + total = runtime_stat_avg(st, STAT_BRANCHES, map_idx, rsd); if (total) ratio = avg / total * 100.0; @@ -553,7 +553,7 @@ static void print_branch_misses(struct perf_stat_config *config, } static void print_l1_dcache_misses(struct perf_stat_config *config, - int cpu_map_idx, double avg, + int map_idx, double avg, struct perf_stat_output_ctx *out, struct runtime_stat *st, struct runtime_stat_data *rsd) @@ -561,7 +561,7 @@ static void 
print_l1_dcache_misses(struct perf_stat_config *config, double total, ratio = 0.0; const char *color; - total = runtime_stat_avg(st, STAT_L1_DCACHE, cpu_map_idx, rsd); + total = runtime_stat_avg(st, STAT_L1_DCACHE, map_idx, rsd); if (total) ratio = avg / total * 100.0; @@ -572,7 +572,7 @@ static void print_l1_dcache_misses(struct perf_stat_config *config, } static void print_l1_icache_misses(struct perf_stat_config *config, - int cpu_map_idx, double avg, + int map_idx, double avg, struct perf_stat_output_ctx *out, struct runtime_stat *st, struct runtime_stat_data *rsd) @@ -580,7 +580,7 @@ static void print_l1_icache_misses(struct perf_stat_config *config, double total, ratio = 0.0; const char *color; - total = runtime_stat_avg(st, STAT_L1_ICACHE, cpu_map_idx, rsd); + total = runtime_stat_avg(st, STAT_L1_ICACHE, map_idx, rsd); if (total) ratio = avg / total * 100.0; @@ -590,7 +590,7 @@ static void print_l1_icache_misses(struct perf_stat_config *config, } static void print_dtlb_cache_misses(struct perf_stat_config *config, - int cpu_map_idx, double avg, + int map_idx, double avg, struct perf_stat_output_ctx *out, struct runtime_stat *st, struct runtime_stat_data *rsd) @@ -598,7 +598,7 @@ static void print_dtlb_cache_misses(struct perf_stat_config *config, double total, ratio = 0.0; const char *color; - total = runtime_stat_avg(st, STAT_DTLB_CACHE, cpu_map_idx, rsd); + total = runtime_stat_avg(st, STAT_DTLB_CACHE, map_idx, rsd); if (total) ratio = avg / total * 100.0; @@ -608,7 +608,7 @@ static void print_dtlb_cache_misses(struct perf_stat_config *config, } static void print_itlb_cache_misses(struct perf_stat_config *config, - int cpu_map_idx, double avg, + int map_idx, double avg, struct perf_stat_output_ctx *out, struct runtime_stat *st, struct runtime_stat_data *rsd) @@ -616,7 +616,7 @@ static void print_itlb_cache_misses(struct perf_stat_config *config, double total, ratio = 0.0; const char *color; - total = runtime_stat_avg(st, STAT_ITLB_CACHE, cpu_map_idx, rsd); 
+ total = runtime_stat_avg(st, STAT_ITLB_CACHE, map_idx, rsd); if (total) ratio = avg / total * 100.0; @@ -626,7 +626,7 @@ static void print_itlb_cache_misses(struct perf_stat_config *config, } static void print_ll_cache_misses(struct perf_stat_config *config, - int cpu_map_idx, double avg, + int map_idx, double avg, struct perf_stat_output_ctx *out, struct runtime_stat *st, struct runtime_stat_data *rsd) @@ -634,7 +634,7 @@ static void print_ll_cache_misses(struct perf_stat_config *config, double total, ratio = 0.0; const char *color; - total = runtime_stat_avg(st, STAT_LL_CACHE, cpu_map_idx, rsd); + total = runtime_stat_avg(st, STAT_LL_CACHE, map_idx, rsd); if (total) ratio = avg / total * 100.0; @@ -692,61 +692,61 @@ static double sanitize_val(double x) return x; } -static double td_total_slots(int cpu_map_idx, struct runtime_stat *st, +static double td_total_slots(int map_idx, struct runtime_stat *st, struct runtime_stat_data *rsd) { - return runtime_stat_avg(st, STAT_TOPDOWN_TOTAL_SLOTS, cpu_map_idx, rsd); + return runtime_stat_avg(st, STAT_TOPDOWN_TOTAL_SLOTS, map_idx, rsd); } -static double td_bad_spec(int cpu_map_idx, struct runtime_stat *st, +static double td_bad_spec(int map_idx, struct runtime_stat *st, struct runtime_stat_data *rsd) { double bad_spec = 0; double total_slots; double total; - total = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_ISSUED, cpu_map_idx, rsd) - - runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED, cpu_map_idx, rsd) + - runtime_stat_avg(st, STAT_TOPDOWN_RECOVERY_BUBBLES, cpu_map_idx, rsd); + total = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_ISSUED, map_idx, rsd) - + runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED, map_idx, rsd) + + runtime_stat_avg(st, STAT_TOPDOWN_RECOVERY_BUBBLES, map_idx, rsd); - total_slots = td_total_slots(cpu_map_idx, st, rsd); + total_slots = td_total_slots(map_idx, st, rsd); if (total_slots) bad_spec = total / total_slots; return sanitize_val(bad_spec); } -static double td_retiring(int cpu_map_idx, struct 
runtime_stat *st, +static double td_retiring(int map_idx, struct runtime_stat *st, struct runtime_stat_data *rsd) { double retiring = 0; - double total_slots = td_total_slots(cpu_map_idx, st, rsd); + double total_slots = td_total_slots(map_idx, st, rsd); double ret_slots = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED, - cpu_map_idx, rsd); + map_idx, rsd); if (total_slots) retiring = ret_slots / total_slots; return retiring; } -static double td_fe_bound(int cpu_map_idx, struct runtime_stat *st, +static double td_fe_bound(int map_idx, struct runtime_stat *st, struct runtime_stat_data *rsd) { double fe_bound = 0; - double total_slots = td_total_slots(cpu_map_idx, st, rsd); + double total_slots = td_total_slots(map_idx, st, rsd); double fetch_bub = runtime_stat_avg(st, STAT_TOPDOWN_FETCH_BUBBLES, - cpu_map_idx, rsd); + map_idx, rsd); if (total_slots) fe_bound = fetch_bub / total_slots; return fe_bound; } -static double td_be_bound(int cpu_map_idx, struct runtime_stat *st, +static double td_be_bound(int map_idx, struct runtime_stat *st, struct runtime_stat_data *rsd) { - double sum = (td_fe_bound(cpu_map_idx, st, rsd) + - td_bad_spec(cpu_map_idx, st, rsd) + - td_retiring(cpu_map_idx, st, rsd)); + double sum = (td_fe_bound(map_idx, st, rsd) + + td_bad_spec(map_idx, st, rsd) + + td_retiring(map_idx, st, rsd)); if (sum == 0) return 0; return sanitize_val(1.0 - sum); @@ -757,15 +757,15 @@ static double td_be_bound(int cpu_map_idx, struct runtime_stat *st, * the ratios we need to recreate the sum. 
*/ -static double td_metric_ratio(int cpu_map_idx, enum stat_type type, +static double td_metric_ratio(int map_idx, enum stat_type type, struct runtime_stat *stat, struct runtime_stat_data *rsd) { - double sum = runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, cpu_map_idx, rsd) + - runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, cpu_map_idx, rsd) + - runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, cpu_map_idx, rsd) + - runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, cpu_map_idx, rsd); - double d = runtime_stat_avg(stat, type, cpu_map_idx, rsd); + double sum = runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, map_idx, rsd) + + runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, map_idx, rsd) + + runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, map_idx, rsd) + + runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, map_idx, rsd); + double d = runtime_stat_avg(stat, type, map_idx, rsd); if (sum) return d / sum; @@ -777,23 +777,23 @@ static double td_metric_ratio(int cpu_map_idx, enum stat_type type, * We allow two missing. 
*/ -static bool full_td(int cpu_map_idx, struct runtime_stat *stat, +static bool full_td(int map_idx, struct runtime_stat *stat, struct runtime_stat_data *rsd) { int c = 0; - if (runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, cpu_map_idx, rsd) > 0) + if (runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, map_idx, rsd) > 0) c++; - if (runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, cpu_map_idx, rsd) > 0) + if (runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, map_idx, rsd) > 0) c++; - if (runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, cpu_map_idx, rsd) > 0) + if (runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, map_idx, rsd) > 0) c++; - if (runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, cpu_map_idx, rsd) > 0) + if (runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, map_idx, rsd) > 0) c++; return c >= 2; } -static void print_smi_cost(struct perf_stat_config *config, int cpu_map_idx, +static void print_smi_cost(struct perf_stat_config *config, int map_idx, struct perf_stat_output_ctx *out, struct runtime_stat *st, struct runtime_stat_data *rsd) @@ -801,9 +801,9 @@ static void print_smi_cost(struct perf_stat_config *config, int cpu_map_idx, double smi_num, aperf, cycles, cost = 0.0; const char *color = NULL; - smi_num = runtime_stat_avg(st, STAT_SMI_NUM, cpu_map_idx, rsd); - aperf = runtime_stat_avg(st, STAT_APERF, cpu_map_idx, rsd); - cycles = runtime_stat_avg(st, STAT_CYCLES, cpu_map_idx, rsd); + smi_num = runtime_stat_avg(st, STAT_SMI_NUM, map_idx, rsd); + aperf = runtime_stat_avg(st, STAT_APERF, map_idx, rsd); + cycles = runtime_stat_avg(st, STAT_CYCLES, map_idx, rsd); if ((cycles == 0) || (aperf == 0)) return; @@ -820,7 +820,7 @@ static void print_smi_cost(struct perf_stat_config *config, int cpu_map_idx, static int prepare_metric(struct evsel **metric_events, struct metric_ref *metric_refs, struct expr_parse_ctx *pctx, - int cpu_map_idx, + int map_idx, struct runtime_stat *st) { double scale; @@ -859,7 +859,7 @@ static int prepare_metric(struct evsel **metric_events, abort(); } } else 
{ - v = saved_value_lookup(metric_events[i], cpu_map_idx, false, + v = saved_value_lookup(metric_events[i], map_idx, false, STAT_NONE, 0, st, metric_events[i]->cgrp); if (!v) @@ -902,7 +902,7 @@ static void generic_metric(struct perf_stat_config *config, const char *metric_name, const char *metric_unit, int runtime, - int cpu_map_idx, + int map_idx, struct perf_stat_output_ctx *out, struct runtime_stat *st) { @@ -920,7 +920,7 @@ static void generic_metric(struct perf_stat_config *config, pctx->sctx.user_requested_cpu_list = strdup(config->user_requested_cpu_list); pctx->sctx.runtime = runtime; pctx->sctx.system_wide = config->system_wide; - i = prepare_metric(metric_events, metric_refs, pctx, cpu_map_idx, st); + i = prepare_metric(metric_events, metric_refs, pctx, map_idx, st); if (i < 0) { expr__ctx_free(pctx); return; @@ -965,7 +965,7 @@ static void generic_metric(struct perf_stat_config *config, expr__ctx_free(pctx); } -double test_generic_metric(struct metric_expr *mexp, int cpu_map_idx, struct runtime_stat *st) +double test_generic_metric(struct metric_expr *mexp, int map_idx, struct runtime_stat *st) { struct expr_parse_ctx *pctx; double ratio = 0.0; @@ -974,7 +974,7 @@ double test_generic_metric(struct metric_expr *mexp, int cpu_map_idx, struct run if (!pctx) return NAN; - if (prepare_metric(mexp->metric_events, mexp->metric_refs, pctx, cpu_map_idx, st) < 0) + if (prepare_metric(mexp->metric_events, mexp->metric_refs, pctx, map_idx, st) < 0) goto out; if (expr__parse(&ratio, pctx, mexp->metric_expr)) @@ -987,7 +987,7 @@ out: void perf_stat__print_shadow_stats(struct perf_stat_config *config, struct evsel *evsel, - double avg, int cpu_map_idx, + double avg, int map_idx, struct perf_stat_output_ctx *out, struct rblist *metric_events, struct runtime_stat *st) @@ -1006,7 +1006,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, if (config->iostat_run) { iostat_print_metric(config, evsel, out); } else if (evsel__match(evsel, HARDWARE, 
HW_INSTRUCTIONS)) { - total = runtime_stat_avg(st, STAT_CYCLES, cpu_map_idx, &rsd); + total = runtime_stat_avg(st, STAT_CYCLES, map_idx, &rsd); if (total) { ratio = avg / total; @@ -1016,11 +1016,11 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, print_metric(config, ctxp, NULL, NULL, "insn per cycle", 0); } - total = runtime_stat_avg(st, STAT_STALLED_CYCLES_FRONT, cpu_map_idx, &rsd); + total = runtime_stat_avg(st, STAT_STALLED_CYCLES_FRONT, map_idx, &rsd); total = max(total, runtime_stat_avg(st, STAT_STALLED_CYCLES_BACK, - cpu_map_idx, &rsd)); + map_idx, &rsd)); if (total && avg) { out->new_line(config, ctxp); @@ -1030,8 +1030,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, ratio); } } else if (evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) { - if (runtime_stat_n(st, STAT_BRANCHES, cpu_map_idx, &rsd) != 0) - print_branch_misses(config, cpu_map_idx, avg, out, st, &rsd); + if (runtime_stat_n(st, STAT_BRANCHES, map_idx, &rsd) != 0) + print_branch_misses(config, map_idx, avg, out, st, &rsd); else print_metric(config, ctxp, NULL, NULL, "of all branches", 0); } else if ( @@ -1040,8 +1040,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { - if (runtime_stat_n(st, STAT_L1_DCACHE, cpu_map_idx, &rsd) != 0) - print_l1_dcache_misses(config, cpu_map_idx, avg, out, st, &rsd); + if (runtime_stat_n(st, STAT_L1_DCACHE, map_idx, &rsd) != 0) + print_l1_dcache_misses(config, map_idx, avg, out, st, &rsd); else print_metric(config, ctxp, NULL, NULL, "of all L1-dcache accesses", 0); } else if ( @@ -1050,8 +1050,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { - if (runtime_stat_n(st, STAT_L1_ICACHE, cpu_map_idx, &rsd) != 0) - print_l1_icache_misses(config, cpu_map_idx, avg, out, st, &rsd); + if (runtime_stat_n(st, 
STAT_L1_ICACHE, map_idx, &rsd) != 0) + print_l1_icache_misses(config, map_idx, avg, out, st, &rsd); else print_metric(config, ctxp, NULL, NULL, "of all L1-icache accesses", 0); } else if ( @@ -1060,8 +1060,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { - if (runtime_stat_n(st, STAT_DTLB_CACHE, cpu_map_idx, &rsd) != 0) - print_dtlb_cache_misses(config, cpu_map_idx, avg, out, st, &rsd); + if (runtime_stat_n(st, STAT_DTLB_CACHE, map_idx, &rsd) != 0) + print_dtlb_cache_misses(config, map_idx, avg, out, st, &rsd); else print_metric(config, ctxp, NULL, NULL, "of all dTLB cache accesses", 0); } else if ( @@ -1070,8 +1070,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { - if (runtime_stat_n(st, STAT_ITLB_CACHE, cpu_map_idx, &rsd) != 0) - print_itlb_cache_misses(config, cpu_map_idx, avg, out, st, &rsd); + if (runtime_stat_n(st, STAT_ITLB_CACHE, map_idx, &rsd) != 0) + print_itlb_cache_misses(config, map_idx, avg, out, st, &rsd); else print_metric(config, ctxp, NULL, NULL, "of all iTLB cache accesses", 0); } else if ( @@ -1080,27 +1080,27 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { - if (runtime_stat_n(st, STAT_LL_CACHE, cpu_map_idx, &rsd) != 0) - print_ll_cache_misses(config, cpu_map_idx, avg, out, st, &rsd); + if (runtime_stat_n(st, STAT_LL_CACHE, map_idx, &rsd) != 0) + print_ll_cache_misses(config, map_idx, avg, out, st, &rsd); else print_metric(config, ctxp, NULL, NULL, "of all LL-cache accesses", 0); } else if (evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) { - total = runtime_stat_avg(st, STAT_CACHEREFS, cpu_map_idx, &rsd); + total = runtime_stat_avg(st, STAT_CACHEREFS, map_idx, &rsd); if (total) ratio = avg * 100 / total; - if 
(runtime_stat_n(st, STAT_CACHEREFS, cpu_map_idx, &rsd) != 0) + if (runtime_stat_n(st, STAT_CACHEREFS, map_idx, &rsd) != 0) print_metric(config, ctxp, NULL, "%8.3f %%", "of all cache refs", ratio); else print_metric(config, ctxp, NULL, NULL, "of all cache refs", 0); } else if (evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) { - print_stalled_cycles_frontend(config, cpu_map_idx, avg, out, st, &rsd); + print_stalled_cycles_frontend(config, map_idx, avg, out, st, &rsd); } else if (evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { - print_stalled_cycles_backend(config, cpu_map_idx, avg, out, st, &rsd); + print_stalled_cycles_backend(config, map_idx, avg, out, st, &rsd); } else if (evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { - total = runtime_stat_avg(st, STAT_NSECS, cpu_map_idx, &rsd); + total = runtime_stat_avg(st, STAT_NSECS, map_idx, &rsd); if (total) { ratio = avg / total; @@ -1109,7 +1109,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, print_metric(config, ctxp, NULL, NULL, "Ghz", 0); } } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) { - total = runtime_stat_avg(st, STAT_CYCLES, cpu_map_idx, &rsd); + total = runtime_stat_avg(st, STAT_CYCLES, map_idx, &rsd); if (total) print_metric(config, ctxp, NULL, @@ -1119,8 +1119,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, print_metric(config, ctxp, NULL, NULL, "transactional cycles", 0); } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) { - total = runtime_stat_avg(st, STAT_CYCLES, cpu_map_idx, &rsd); - total2 = runtime_stat_avg(st, STAT_CYCLES_IN_TX, cpu_map_idx, &rsd); + total = runtime_stat_avg(st, STAT_CYCLES, map_idx, &rsd); + total2 = runtime_stat_avg(st, STAT_CYCLES_IN_TX, map_idx, &rsd); if (total2 < avg) total2 = avg; @@ -1130,19 +1130,19 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, else print_metric(config, ctxp, NULL, NULL, "aborted cycles", 0); } else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) 
{ - total = runtime_stat_avg(st, STAT_CYCLES_IN_TX, cpu_map_idx, &rsd); + total = runtime_stat_avg(st, STAT_CYCLES_IN_TX, map_idx, &rsd); if (avg) ratio = total / avg; - if (runtime_stat_n(st, STAT_CYCLES_IN_TX, cpu_map_idx, &rsd) != 0) + if (runtime_stat_n(st, STAT_CYCLES_IN_TX, map_idx, &rsd) != 0) print_metric(config, ctxp, NULL, "%8.0f", "cycles / transaction", ratio); else print_metric(config, ctxp, NULL, NULL, "cycles / transaction", 0); } else if (perf_stat_evsel__is(evsel, ELISION_START)) { - total = runtime_stat_avg(st, STAT_CYCLES_IN_TX, cpu_map_idx, &rsd); + total = runtime_stat_avg(st, STAT_CYCLES_IN_TX, map_idx, &rsd); if (avg) ratio = total / avg; @@ -1155,28 +1155,28 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, else print_metric(config, ctxp, NULL, NULL, "CPUs utilized", 0); } else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) { - double fe_bound = td_fe_bound(cpu_map_idx, st, &rsd); + double fe_bound = td_fe_bound(map_idx, st, &rsd); if (fe_bound > 0.2) color = PERF_COLOR_RED; print_metric(config, ctxp, color, "%8.1f%%", "frontend bound", fe_bound * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) { - double retiring = td_retiring(cpu_map_idx, st, &rsd); + double retiring = td_retiring(map_idx, st, &rsd); if (retiring > 0.7) color = PERF_COLOR_GREEN; print_metric(config, ctxp, color, "%8.1f%%", "retiring", retiring * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) { - double bad_spec = td_bad_spec(cpu_map_idx, st, &rsd); + double bad_spec = td_bad_spec(map_idx, st, &rsd); if (bad_spec > 0.1) color = PERF_COLOR_RED; print_metric(config, ctxp, color, "%8.1f%%", "bad speculation", bad_spec * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) { - double be_bound = td_be_bound(cpu_map_idx, st, &rsd); + double be_bound = td_be_bound(map_idx, st, &rsd); const char *name = "backend bound"; static int have_recovery_bubbles = -1; @@ -1189,14 +1189,14 @@ void 
perf_stat__print_shadow_stats(struct perf_stat_config *config, if (be_bound > 0.2) color = PERF_COLOR_RED; - if (td_total_slots(cpu_map_idx, st, &rsd) > 0) + if (td_total_slots(map_idx, st, &rsd) > 0) print_metric(config, ctxp, color, "%8.1f%%", name, be_bound * 100.); else print_metric(config, ctxp, NULL, NULL, name, 0); } else if (perf_stat_evsel__is(evsel, TOPDOWN_RETIRING) && - full_td(cpu_map_idx, st, &rsd)) { - double retiring = td_metric_ratio(cpu_map_idx, + full_td(map_idx, st, &rsd)) { + double retiring = td_metric_ratio(map_idx, STAT_TOPDOWN_RETIRING, st, &rsd); if (retiring > 0.7) @@ -1204,8 +1204,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, print_metric(config, ctxp, color, "%8.1f%%", "Retiring", retiring * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_FE_BOUND) && - full_td(cpu_map_idx, st, &rsd)) { - double fe_bound = td_metric_ratio(cpu_map_idx, + full_td(map_idx, st, &rsd)) { + double fe_bound = td_metric_ratio(map_idx, STAT_TOPDOWN_FE_BOUND, st, &rsd); if (fe_bound > 0.2) @@ -1213,8 +1213,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, print_metric(config, ctxp, color, "%8.1f%%", "Frontend Bound", fe_bound * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_BE_BOUND) && - full_td(cpu_map_idx, st, &rsd)) { - double be_bound = td_metric_ratio(cpu_map_idx, + full_td(map_idx, st, &rsd)) { + double be_bound = td_metric_ratio(map_idx, STAT_TOPDOWN_BE_BOUND, st, &rsd); if (be_bound > 0.2) @@ -1222,8 +1222,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, print_metric(config, ctxp, color, "%8.1f%%", "Backend Bound", be_bound * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_BAD_SPEC) && - full_td(cpu_map_idx, st, &rsd)) { - double bad_spec = td_metric_ratio(cpu_map_idx, + full_td(map_idx, st, &rsd)) { + double bad_spec = td_metric_ratio(map_idx, STAT_TOPDOWN_BAD_SPEC, st, &rsd); if (bad_spec > 0.1) @@ -1231,11 +1231,11 @@ void perf_stat__print_shadow_stats(struct 
perf_stat_config *config, print_metric(config, ctxp, color, "%8.1f%%", "Bad Speculation", bad_spec * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_HEAVY_OPS) && - full_td(cpu_map_idx, st, &rsd) && (config->topdown_level > 1)) { - double retiring = td_metric_ratio(cpu_map_idx, + full_td(map_idx, st, &rsd) && (config->topdown_level > 1)) { + double retiring = td_metric_ratio(map_idx, STAT_TOPDOWN_RETIRING, st, &rsd); - double heavy_ops = td_metric_ratio(cpu_map_idx, + double heavy_ops = td_metric_ratio(map_idx, STAT_TOPDOWN_HEAVY_OPS, st, &rsd); double light_ops = retiring - heavy_ops; @@ -1251,11 +1251,11 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, print_metric(config, ctxp, color, "%8.1f%%", "Light Operations", light_ops * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_BR_MISPREDICT) && - full_td(cpu_map_idx, st, &rsd) && (config->topdown_level > 1)) { - double bad_spec = td_metric_ratio(cpu_map_idx, + full_td(map_idx, st, &rsd) && (config->topdown_level > 1)) { + double bad_spec = td_metric_ratio(map_idx, STAT_TOPDOWN_BAD_SPEC, st, &rsd); - double br_mis = td_metric_ratio(cpu_map_idx, + double br_mis = td_metric_ratio(map_idx, STAT_TOPDOWN_BR_MISPREDICT, st, &rsd); double m_clears = bad_spec - br_mis; @@ -1271,11 +1271,11 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, print_metric(config, ctxp, color, "%8.1f%%", "Machine Clears", m_clears * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_LAT) && - full_td(cpu_map_idx, st, &rsd) && (config->topdown_level > 1)) { - double fe_bound = td_metric_ratio(cpu_map_idx, + full_td(map_idx, st, &rsd) && (config->topdown_level > 1)) { + double fe_bound = td_metric_ratio(map_idx, STAT_TOPDOWN_FE_BOUND, st, &rsd); - double fetch_lat = td_metric_ratio(cpu_map_idx, + double fetch_lat = td_metric_ratio(map_idx, STAT_TOPDOWN_FETCH_LAT, st, &rsd); double fetch_bw = fe_bound - fetch_lat; @@ -1291,11 +1291,11 @@ void perf_stat__print_shadow_stats(struct 
perf_stat_config *config, print_metric(config, ctxp, color, "%8.1f%%", "Fetch Bandwidth", fetch_bw * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_MEM_BOUND) && - full_td(cpu_map_idx, st, &rsd) && (config->topdown_level > 1)) { - double be_bound = td_metric_ratio(cpu_map_idx, + full_td(map_idx, st, &rsd) && (config->topdown_level > 1)) { + double be_bound = td_metric_ratio(map_idx, STAT_TOPDOWN_BE_BOUND, st, &rsd); - double mem_bound = td_metric_ratio(cpu_map_idx, + double mem_bound = td_metric_ratio(map_idx, STAT_TOPDOWN_MEM_BOUND, st, &rsd); double core_bound = be_bound - mem_bound; @@ -1313,12 +1313,12 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, } else if (evsel->metric_expr) { generic_metric(config, evsel->metric_expr, evsel->metric_events, NULL, evsel->name, evsel->metric_name, NULL, 1, - cpu_map_idx, out, st); - } else if (runtime_stat_n(st, STAT_NSECS, cpu_map_idx, &rsd) != 0) { + map_idx, out, st); + } else if (runtime_stat_n(st, STAT_NSECS, map_idx, &rsd) != 0) { char unit = ' '; char unit_buf[10] = "/sec"; - total = runtime_stat_avg(st, STAT_NSECS, cpu_map_idx, &rsd); + total = runtime_stat_avg(st, STAT_NSECS, map_idx, &rsd); if (total) ratio = convert_unit_double(1000000000.0 * avg / total, &unit); @@ -1326,7 +1326,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit); print_metric(config, ctxp, NULL, "%8.3f", unit_buf, ratio); } else if (perf_stat_evsel__is(evsel, SMI_NUM)) { - print_smi_cost(config, cpu_map_idx, out, st, &rsd); + print_smi_cost(config, map_idx, out, st, &rsd); } else { num = 0; } @@ -1340,7 +1340,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, generic_metric(config, mexp->metric_expr, mexp->metric_events, mexp->metric_refs, evsel->name, mexp->metric_name, mexp->metric_unit, mexp->runtime, - cpu_map_idx, out, st); + map_idx, out, st); } } if (num == 0) diff --git a/tools/perf/util/stat.h 
b/tools/perf/util/stat.h index 3eba38a1a1499..93f6ca0d9761e 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -234,7 +234,7 @@ void perf_stat__init_shadow_stats(void); void perf_stat__reset_shadow_stats(void); void perf_stat__reset_shadow_per_stat(struct runtime_stat *st); void perf_stat__update_shadow_stats(struct evsel *counter, u64 count, - int cpu_map_idx, struct runtime_stat *st); + int map_idx, struct runtime_stat *st); struct perf_stat_output_ctx { void *ctx; print_metric_t print_metric; @@ -244,7 +244,7 @@ struct perf_stat_output_ctx { void perf_stat__print_shadow_stats(struct perf_stat_config *config, struct evsel *evsel, - double avg, int cpu, + double avg, int map_idx, struct perf_stat_output_ctx *out, struct rblist *metric_events, struct runtime_stat *st); @@ -279,5 +279,5 @@ void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *conf struct target *_target, struct timespec *ts, int argc, const char **argv); struct metric_expr; -double test_generic_metric(struct metric_expr *mexp, int cpu_map_idx, struct runtime_stat *st); +double test_generic_metric(struct metric_expr *mexp, int map_idx, struct runtime_stat *st); #endif -- GitLab From 87ae87fd6c61c93a93b79c6c6c8ec5f47e4839dd Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Fri, 30 Sep 2022 13:21:07 -0700 Subject: [PATCH 1441/2223] perf stat: Use thread map index for shadow stat When AGGR_THREAD is active, it aggregates the values for each thread. Previously it used cpu map index which is invalid for AGGR_THREAD so it had to use separate runtime stats with index 0. But it can just use the rt_stat with thread_map_index. Rename the first_shadow_map_idx() and make it return the thread index. 
Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20220930202110.845199-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/stat-display.c | 20 +++++++++----------- tools/perf/util/stat-shadow.c | 2 +- tools/perf/util/stat.c | 8 ++------ 3 files changed, 12 insertions(+), 18 deletions(-) diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 234491f43c36b..570e2c04d47d2 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -442,7 +442,7 @@ static void print_metric_header(struct perf_stat_config *config, fprintf(os->fh, "%*s ", config->metric_only_len, unit); } -static int first_shadow_cpu_map_idx(struct perf_stat_config *config, +static int first_shadow_map_idx(struct perf_stat_config *config, struct evsel *evsel, const struct aggr_cpu_id *id) { struct perf_cpu_map *cpus = evsel__cpus(evsel); @@ -452,6 +452,9 @@ static int first_shadow_cpu_map_idx(struct perf_stat_config *config, if (config->aggr_mode == AGGR_NONE) return perf_cpu_map__idx(cpus, id->cpu); + if (config->aggr_mode == AGGR_THREAD) + return id->thread; + if (!config->aggr_get_id) return 0; @@ -646,7 +649,7 @@ static void printout(struct perf_stat_config *config, struct aggr_cpu_id id, int } perf_stat__print_shadow_stats(config, counter, uval, - first_shadow_cpu_map_idx(config, counter, &id), + first_shadow_map_idx(config, counter, &id), &out, &config->metric_events, st); if (!config->csv_output && !config->metric_only && !config->json_output) { print_noise(config, counter, 
noise); @@ -676,7 +679,7 @@ static void aggr_update_shadow(struct perf_stat_config *config, val += perf_counts(counter->counts, idx, 0)->val; } perf_stat__update_shadow_stats(counter, val, - first_shadow_cpu_map_idx(config, counter, &id), + first_shadow_map_idx(config, counter, &id), &rt_stat); } } @@ -979,14 +982,9 @@ static void print_aggr_thread(struct perf_stat_config *config, fprintf(output, "%s", prefix); id = buf[thread].id; - if (config->stats) - printout(config, id, 0, buf[thread].counter, buf[thread].uval, - prefix, buf[thread].run, buf[thread].ena, 1.0, - &config->stats[id.thread]); - else - printout(config, id, 0, buf[thread].counter, buf[thread].uval, - prefix, buf[thread].run, buf[thread].ena, 1.0, - &rt_stat); + printout(config, id, 0, buf[thread].counter, buf[thread].uval, + prefix, buf[thread].run, buf[thread].ena, 1.0, + &rt_stat); fputc('\n', output); } diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 48634b95669e8..60c8709fb53c8 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -33,7 +33,7 @@ struct saved_value { struct evsel *evsel; enum stat_type type; int ctx; - int map_idx; /* cpu map index */ + int map_idx; /* cpu or thread map index */ struct cgroup *cgrp; struct runtime_stat *stat; struct stats stats; diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 7e9543cff31cf..8ec8bb4a99129 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -389,12 +389,8 @@ process_counter_values(struct perf_stat_config *config, struct evsel *evsel, } if (config->aggr_mode == AGGR_THREAD) { - if (config->stats) - perf_stat__update_shadow_stats(evsel, - count->val, 0, &config->stats[thread]); - else - perf_stat__update_shadow_stats(evsel, - count->val, 0, &rt_stat); + perf_stat__update_shadow_stats(evsel, count->val, + thread, &rt_stat); } break; case AGGR_GLOBAL: -- GitLab From f407aac4056c9ce52ea9ec7a8dabbd0f553684c2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim 
<namhyung@kernel.org> Date: Fri, 30 Sep 2022 13:21:08 -0700 Subject: [PATCH 1442/2223] perf stat: Kill unused per-thread runtime stats Now it's using the global rt_stat, no need to use per-thread stats. Let's get rid of them. Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20220930202110.845199-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-stat.c | 54 --------------------------------------- tools/perf/util/stat.h | 2 -- 2 files changed, 56 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 1677546b2ea2d..265b051579726 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -292,13 +292,8 @@ static inline void diff_timespec(struct timespec *r, struct timespec *a, static void perf_stat__reset_stats(void) { - int i; - evlist__reset_stats(evsel_list); perf_stat__reset_shadow_stats(); - - for (i = 0; i < stat_config.stats_num; i++) - perf_stat__reset_shadow_per_stat(&stat_config.stats[i]); } static int process_synthesized_event(struct perf_tool *tool __maybe_unused, @@ -489,46 +484,6 @@ static void read_counters(struct timespec *rs) } } -static int runtime_stat_new(struct perf_stat_config *config, int nthreads) -{ - int i; - - config->stats = calloc(nthreads, sizeof(struct runtime_stat)); - if (!config->stats) - return -1; - - config->stats_num = nthreads; - - for (i = 0; i < nthreads; i++) - runtime_stat__init(&config->stats[i]); - - return 0; -} - -static void runtime_stat_delete(struct perf_stat_config *config) -{ - int i; - - if
(!config->stats) - return; - - for (i = 0; i < config->stats_num; i++) - runtime_stat__exit(&config->stats[i]); - - zfree(&config->stats); -} - -static void runtime_stat_reset(struct perf_stat_config *config) -{ - int i; - - if (!config->stats) - return; - - for (i = 0; i < config->stats_num; i++) - perf_stat__reset_shadow_per_stat(&config->stats[i]); -} - static void process_interval(void) { struct timespec ts, rs; @@ -537,7 +492,6 @@ static void process_interval(void) diff_timespec(&rs, &ts, &ref_time); perf_stat__reset_shadow_per_stat(&rt_stat); - runtime_stat_reset(&stat_config); read_counters(&rs); if (STAT_RECORD) { @@ -1014,7 +968,6 @@ try_again_reset: evlist__copy_prev_raw_counts(evsel_list); evlist__reset_prev_raw_counts(evsel_list); - runtime_stat_reset(&stat_config); perf_stat__reset_shadow_per_stat(&rt_stat); } else { update_stats(&walltime_nsecs_stats, t1 - t0); @@ -2510,12 +2463,6 @@ int cmd_stat(int argc, const char **argv) */ if (stat_config.aggr_mode == AGGR_THREAD) { thread_map__read_comms(evsel_list->core.threads); - if (target.system_wide) { - if (runtime_stat_new(&stat_config, - perf_thread_map__nr(evsel_list->core.threads))) { - goto out; - } - } } if (stat_config.aggr_mode == AGGR_NODE) @@ -2656,7 +2603,6 @@ out: evlist__delete(evsel_list); metricgroup__rblist_exit(&stat_config.metric_events); - runtime_stat_delete(&stat_config); evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack, &stat_config.ctl_fd_close); return status; diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 93f6ca0d9761e..b0899c6e002f5 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -153,8 +153,6 @@ struct perf_stat_config { int run_count; int print_free_counters_hint; int print_mixed_hw_group_error; - struct runtime_stat *stats; - int stats_num; const char *csv_sep; struct stats *walltime_nsecs_stats; struct rusage ru_data; -- GitLab From 01b8957b738f42f96a130079bc951b3cc78c5b8a Mon Sep 17 00:00:00 2001 From: Namhyung Kim 
<namhyung@kernel.org> Date: Fri, 30 Sep 2022 13:21:09 -0700 Subject: [PATCH 1443/2223] perf stat: Don't compare runtime stat for shadow stats Now it always uses the global rt_stat. Let's get rid of the field from the saved_value. When both evsels are NULL, it'd return 0 so remove the block in the saved_value_cmp. Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20220930202110.845199-7-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/stat-shadow.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 60c8709fb53c8..07b29fe272c79 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -35,7 +35,6 @@ struct saved_value { int ctx; int map_idx; /* cpu or thread map index */ struct cgroup *cgrp; - struct runtime_stat *stat; struct stats stats; u64 metric_total; int metric_other; @@ -67,16 +66,6 @@ static int saved_value_cmp(struct rb_node *rb_node, const void *entry) if (a->cgrp != b->cgrp) return (char *)a->cgrp < (char *)b->cgrp ?
-1 : +1; - if (a->evsel == NULL && b->evsel == NULL) { - if (a->stat == b->stat) - return 0; - - if ((char *)a->stat < (char *)b->stat) - return -1; - - return 1; - } - if (a->evsel == b->evsel) return 0; if ((char *)a->evsel < (char *)b->evsel) @@ -120,7 +109,6 @@ static struct saved_value *saved_value_lookup(struct evsel *evsel, .evsel = evsel, .type = type, .ctx = ctx, - .stat = st, .cgrp = cgrp, }; -- GitLab From fa2edc07b4643f9dc1db80b2c51ef81f62b26614 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Fri, 30 Sep 2022 13:21:10 -0700 Subject: [PATCH 1444/2223] perf stat: Rename to aggr_cpu_id.thread_idx The aggr_cpu_id has a thread value but it's actually an index to the thread_map. To reduce possible confusion, rename it to thread_idx. Suggested-by: Ian Rogers <irogers@google.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20220930202110.845199-8-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/topology.c | 10 +++++----- tools/perf/util/cpumap.c | 8 ++++---- tools/perf/util/cpumap.h | 2 +- tools/perf/util/stat-display.c | 12 ++++++------ 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 0b4f61b6cc6b8..c4630cfc80ea2 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -147,7 +147,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) TEST_ASSERT_VAL("Cpu map - Die ID doesn't match", session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die); TEST_ASSERT_VAL("Cpu map - Node 
ID is set", id.node == -1); - TEST_ASSERT_VAL("Cpu map - Thread is set", id.thread == -1); + TEST_ASSERT_VAL("Cpu map - Thread IDX is set", id.thread_idx == -1); } // Test that core ID contains socket, die and core @@ -163,7 +163,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) TEST_ASSERT_VAL("Core map - Die ID doesn't match", session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die); TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1); - TEST_ASSERT_VAL("Core map - Thread is set", id.thread == -1); + TEST_ASSERT_VAL("Core map - Thread IDX is set", id.thread_idx == -1); } // Test that die ID contains socket and die @@ -179,7 +179,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Die map - Core is set", id.core == -1); TEST_ASSERT_VAL("Die map - CPU is set", id.cpu.cpu == -1); - TEST_ASSERT_VAL("Die map - Thread is set", id.thread == -1); + TEST_ASSERT_VAL("Die map - Thread IDX is set", id.thread_idx == -1); } // Test that socket ID contains only socket @@ -193,7 +193,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1); TEST_ASSERT_VAL("Socket map - Core is set", id.core == -1); TEST_ASSERT_VAL("Socket map - CPU is set", id.cpu.cpu == -1); - TEST_ASSERT_VAL("Socket map - Thread is set", id.thread == -1); + TEST_ASSERT_VAL("Socket map - Thread IDX is set", id.thread_idx == -1); } // Test that node ID contains only node @@ -205,7 +205,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1); TEST_ASSERT_VAL("Node map - Core is set", id.core == -1); TEST_ASSERT_VAL("Node map - CPU is set", id.cpu.cpu == -1); - TEST_ASSERT_VAL("Node map - Thread is set", id.thread == -1); + TEST_ASSERT_VAL("Node map - Thread IDX is set", id.thread_idx == -1); } 
perf_session__delete(session); diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 2389bd3e19b86..8486ca3bec75f 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -229,7 +229,7 @@ static int aggr_cpu_id__cmp(const void *a_pointer, const void *b_pointer) else if (a->core != b->core) return a->core - b->core; else - return a->thread - b->thread; + return a->thread_idx - b->thread_idx; } struct cpu_aggr_map *cpu_aggr_map__new(const struct perf_cpu_map *cpus, @@ -667,7 +667,7 @@ const struct perf_cpu_map *cpu_map__online(void) /* thread unsafe */ bool aggr_cpu_id__equal(const struct aggr_cpu_id *a, const struct aggr_cpu_id *b) { - return a->thread == b->thread && + return a->thread_idx == b->thread_idx && a->node == b->node && a->socket == b->socket && a->die == b->die && @@ -677,7 +677,7 @@ bool aggr_cpu_id__equal(const struct aggr_cpu_id *a, const struct aggr_cpu_id *b bool aggr_cpu_id__is_empty(const struct aggr_cpu_id *a) { - return a->thread == -1 && + return a->thread_idx == -1 && a->node == -1 && a->socket == -1 && a->die == -1 && @@ -688,7 +688,7 @@ bool aggr_cpu_id__is_empty(const struct aggr_cpu_id *a) struct aggr_cpu_id aggr_cpu_id__empty(void) { struct aggr_cpu_id ret = { - .thread = -1, + .thread_idx = -1, .node = -1, .socket = -1, .die = -1, diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index fa8a5acdcae12..4a6d029576eeb 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -10,7 +10,7 @@ /** Identify where counts are aggregated, -1 implies not to aggregate. */ struct aggr_cpu_id { /** A value in the range 0 to number of threads. */ - int thread; + int thread_idx; /** The numa node X as read from /sys/devices/system/node/nodeX. 
*/ int node; /** diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 570e2c04d47d2..df26fb5eb072b 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -189,14 +189,14 @@ static void aggr_printout(struct perf_stat_config *config, case AGGR_THREAD: if (config->json_output) { fprintf(config->output, "\"thread\" : \"%s-%d\", ", - perf_thread_map__comm(evsel->core.threads, id.thread), - perf_thread_map__pid(evsel->core.threads, id.thread)); + perf_thread_map__comm(evsel->core.threads, id.thread_idx), + perf_thread_map__pid(evsel->core.threads, id.thread_idx)); } else { fprintf(config->output, "%*s-%*d%s", config->csv_output ? 0 : 16, - perf_thread_map__comm(evsel->core.threads, id.thread), + perf_thread_map__comm(evsel->core.threads, id.thread_idx), config->csv_output ? 0 : -8, - perf_thread_map__pid(evsel->core.threads, id.thread), + perf_thread_map__pid(evsel->core.threads, id.thread_idx), config->csv_sep); } break; @@ -453,7 +453,7 @@ static int first_shadow_map_idx(struct perf_stat_config *config, return perf_cpu_map__idx(cpus, id->cpu); if (config->aggr_mode == AGGR_THREAD) - return id->thread; + return id->thread_idx; if (!config->aggr_get_id) return 0; @@ -946,7 +946,7 @@ static struct perf_aggr_thread_value *sort_aggr_thread( buf[i].counter = counter; buf[i].id = aggr_cpu_id__empty(); - buf[i].id.thread = thread; + buf[i].id.thread_idx = thread; buf[i].uval = uval; buf[i].val = val; buf[i].run = run; -- GitLab From 07d2872bf4c864eb83d034263c155746a2fb7a3b Mon Sep 17 00:00:00 2001 From: Avri Altman <avri.altman@wdc.com> Date: Wed, 28 Sep 2022 12:57:44 +0300 Subject: [PATCH 1445/2223] mmc: core: Add SD card quirk for broken discard Some SD-cards from Sandisk that are SDA-6.0 compliant report that they support discard, while they actually don't. This might cause mk2fs to fail while trying to format the card and revert it to a read-only mode.
To fix this problem, let's add a card quirk (MMC_QUIRK_BROKEN_SD_DISCARD) to indicate that we shall fall-back to use the legacy erase command instead. Signed-off-by: Avri Altman <avri.altman@wdc.com> Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220928095744.16455-1-avri.altman@wdc.com [Ulf: Updated the commit message] Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org> --- drivers/mmc/core/block.c | 6 +++++- drivers/mmc/core/card.h | 6 ++++++ drivers/mmc/core/quirks.h | 6 ++++++ include/linux/mmc/card.h | 1 + 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index ce89611a136e9..54cd009aee50e 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1140,8 +1140,12 @@ static void mmc_blk_issue_discard_rq(struct mmc_queue *mq, struct request *req) { struct mmc_blk_data *md = mq->blkdata; struct mmc_card *card = md->queue.card; + unsigned int arg = card->erase_arg; - mmc_blk_issue_erase_rq(mq, req, MMC_BLK_DISCARD, card->erase_arg); + if (mmc_card_broken_sd_discard(card)) + arg = SD_ERASE_ARG; + + mmc_blk_issue_erase_rq(mq, req, MMC_BLK_DISCARD, arg); } static void mmc_blk_issue_secdiscard_rq(struct mmc_queue *mq, diff --git a/drivers/mmc/core/card.h b/drivers/mmc/core/card.h index 99045e138ba48..cfdd1ff40b865 100644 --- a/drivers/mmc/core/card.h +++ b/drivers/mmc/core/card.h @@ -73,6 +73,7 @@ struct mmc_fixup { #define EXT_CSD_REV_ANY (-1u) #define CID_MANFID_SANDISK 0x2 +#define CID_MANFID_SANDISK_SD 0x3 #define CID_MANFID_ATP 0x9 #define CID_MANFID_TOSHIBA 0x11 #define CID_MANFID_MICRON 0x13 @@ -258,4 +259,9 @@ static inline int mmc_card_broken_hpi(const struct mmc_card *c) return c->quirks & MMC_QUIRK_BROKEN_HPI; } +static inline int mmc_card_broken_sd_discard(const struct mmc_card *c) +{ + return c->quirks & MMC_QUIRK_BROKEN_SD_DISCARD; +} + #endif diff --git a/drivers/mmc/core/quirks.h b/drivers/mmc/core/quirks.h index be43939880868..29b9497936df9 100644 --- 
a/drivers/mmc/core/quirks.h +++ b/drivers/mmc/core/quirks.h @@ -100,6 +100,12 @@ static const struct mmc_fixup __maybe_unused mmc_blk_fixups[] = { MMC_FIXUP("V10016", CID_MANFID_KINGSTON, CID_OEMID_ANY, add_quirk_mmc, MMC_QUIRK_TRIM_BROKEN), + /* + * Some SD cards report discard support while they don't + */ + MMC_FIXUP(CID_NAME_ANY, CID_MANFID_SANDISK_SD, 0x5344, add_quirk_sd, + MMC_QUIRK_BROKEN_SD_DISCARD), + END_FIXUP }; diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 8a30de08e9139..c726ea7812552 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -293,6 +293,7 @@ struct mmc_card { #define MMC_QUIRK_BROKEN_IRQ_POLLING (1<<11) /* Polling SDIO_CCCR_INTx could create a fake interrupt */ #define MMC_QUIRK_TRIM_BROKEN (1<<12) /* Skip trim */ #define MMC_QUIRK_BROKEN_HPI (1<<13) /* Disable broken HPI support */ +#define MMC_QUIRK_BROKEN_SD_DISCARD (1<<14) /* Disable broken SD discard support */ bool reenable_cmdq; /* Re-enable Command Queue */ -- GitLab From 340e134727c9adaefadc7e79b765c038e18e55c3 Mon Sep 17 00:00:00 2001 From: Deming Wang <wangdeming@inspur.com> Date: Thu, 6 Oct 2022 04:44:50 -0400 Subject: [PATCH 1446/2223] block: Remove the repeat word 'can' Remove the repeat word 'can' from the comments of bio_kmalloc. Signed-off-by: Deming Wang <wangdeming@inspur.com> Link: https://lore.kernel.org/r/20221006084450.1513-1-wangdeming@inspur.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 7cb7d2ff139ba..6c470a50a36d9 100644 --- a/block/bio.c +++ b/block/bio.c @@ -567,7 +567,7 @@ EXPORT_SYMBOL(bio_alloc_bioset); * be reused by calling bio_uninit() before calling bio_init() again. * * Note that unlike bio_alloc() or bio_alloc_bioset() allocations from this - * function are not backed by a mempool can can fail. Do not use this function + * function are not backed by a mempool and can fail.
Do not use this function * for allocations in the file system I/O path. * * Returns: Pointer to new bio on success, NULL on failure. -- GitLab From 39494194f93bed7926d4b3bd03a6a76ba23e612b Mon Sep 17 00:00:00 2001 From: Trond Myklebust <trond.myklebust@hammerspace.com> Date: Wed, 5 Oct 2022 15:57:35 -0400 Subject: [PATCH 1447/2223] SUNRPC: Fix races with rpc_killall_tasks() Ensure that we immediately call rpc_exit_task() after waking up, and that the tk_rpc_status cannot get clobbered by some other function. Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- include/linux/sunrpc/sched.h | 1 + net/sunrpc/clnt.c | 6 ++---- net/sunrpc/sched.c | 40 ++++++++++++++++++++++-------------- net/sunrpc/xprtsock.c | 3 +-- 4 files changed, 29 insertions(+), 21 deletions(-) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index acc62647317c6..647247040ef94 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -209,6 +209,7 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *); struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req); void rpc_put_task(struct rpc_task *); void rpc_put_task_async(struct rpc_task *); +bool rpc_task_set_rpc_status(struct rpc_task *task, int rpc_status); void rpc_signal_task(struct rpc_task *); void rpc_exit_task(struct rpc_task *); void rpc_exit(struct rpc_task *, int); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 4d8665f15dd7e..a8c341e435101 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1642,7 +1642,7 @@ static void __rpc_call_rpcerror(struct rpc_task *task, int tk_status, int rpc_status) { trace_rpc_call_rpcerror(task, tk_status, rpc_status); - task->tk_rpc_status = rpc_status; + rpc_task_set_rpc_status(task, rpc_status); rpc_exit(task, tk_status); } @@ -2435,10 +2435,8 @@ rpc_check_timeout(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - if (RPC_SIGNALLED(task)) { - 
rpc_call_rpcerror(task, -ERESTARTSYS); + if (RPC_SIGNALLED(task)) return; - } if (xprt_adjust_timeout(task->tk_rqstp) == 0) return; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 25b9221950ffb..f388bfaf6ff03 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -65,6 +65,13 @@ gfp_t rpc_task_gfp_mask(void) } EXPORT_SYMBOL_GPL(rpc_task_gfp_mask); +bool rpc_task_set_rpc_status(struct rpc_task *task, int rpc_status) +{ + if (cmpxchg(&task->tk_rpc_status, 0, rpc_status) == 0) + return true; + return false; +} + unsigned long rpc_task_timeout(const struct rpc_task *task) { @@ -855,12 +862,14 @@ void rpc_signal_task(struct rpc_task *task) if (!RPC_IS_ACTIVATED(task)) return; + if (!rpc_task_set_rpc_status(task, -ERESTARTSYS)) + return; trace_rpc_task_signalled(task, task->tk_action); set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); smp_mb__after_atomic(); queue = READ_ONCE(task->tk_waitqueue); if (queue) - rpc_wake_up_queued_task_set_status(queue, task, -ERESTARTSYS); + rpc_wake_up_queued_task(queue, task); } void rpc_exit(struct rpc_task *task, int status) @@ -907,10 +916,16 @@ static void __rpc_execute(struct rpc_task *task) * Perform the next FSM step or a pending callback. * * tk_action may be NULL if the task has been killed. - * In particular, note that rpc_killall_tasks may - * do this at any time, so beware when dereferencing. */ do_action = task->tk_action; + /* Tasks with an RPC error status should exit */ + if (do_action != rpc_exit_task && + (status = READ_ONCE(task->tk_rpc_status)) != 0) { + task->tk_status = status; + if (do_action != NULL) + do_action = rpc_exit_task; + } + /* Callbacks override all actions */ if (task->tk_callback) { do_action = task->tk_callback; task->tk_callback = NULL; @@ -932,14 +947,6 @@ static void __rpc_execute(struct rpc_task *task) continue; } - /* - * Signalled tasks should exit rather than sleep. 
- */ - if (RPC_SIGNALLED(task)) { - task->tk_rpc_status = -ERESTARTSYS; - rpc_exit(task, -ERESTARTSYS); - } - /* * The queue->lock protects against races with * rpc_make_runnable(). @@ -955,6 +962,12 @@ static void __rpc_execute(struct rpc_task *task) spin_unlock(&queue->lock); continue; } + /* Wake up any task that has an exit status */ + if (READ_ONCE(task->tk_rpc_status) != 0) { + rpc_wake_up_task_queue_locked(queue, task); + spin_unlock(&queue->lock); + continue; + } rpc_clear_running(task); spin_unlock(&queue->lock); if (task_is_async) @@ -972,10 +985,7 @@ static void __rpc_execute(struct rpc_task *task) * clean up after sleeping on some queue, we don't * break the loop here, but go around once more. */ - trace_rpc_task_signalled(task, task->tk_action); - set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); - task->tk_rpc_status = -ERESTARTSYS; - rpc_exit(task, -ERESTARTSYS); + rpc_signal_task(task); } trace_rpc_task_sync_wake(task, task->tk_action); } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index b3341c202ea07..f34d5427b66ce 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1978,8 +1978,7 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) * we'll need to figure out how to pass a namespace to * connect. */ - task->tk_rpc_status = -ENOTCONN; - rpc_exit(task, -ENOTCONN); + rpc_task_set_rpc_status(task, -ENOTCONN); goto out_wake; } ret = xs_local_setup_socket(transport); -- GitLab From f8423909ecca208834a9d704e58409800f8b5f21 Mon Sep 17 00:00:00 2001 From: Trond Myklebust <trond.myklebust@hammerspace.com> Date: Wed, 5 Oct 2022 15:57:36 -0400 Subject: [PATCH 1448/2223] SUNRPC: Add a helper to allow pNFS drivers to selectively cancel RPC calls Add the helper rpc_cancel_tasks(), which uses a caller-defined selection function to define a set of in-flight RPC calls to cancel. 
This is mainly intended for pNFS drivers which are subject to a layout recall, and which may therefore want to cancel all pending I/O using that layout in order to redrive it after the layout recall has been satisfied. Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- include/linux/sunrpc/sched.h | 5 +++++ net/sunrpc/clnt.c | 37 ++++++++++++++++++++++++++++++++++++ net/sunrpc/sched.c | 11 +++++++++++ 3 files changed, 53 insertions(+) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 647247040ef94..cdcf0fe56a6fe 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -210,11 +210,16 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req); void rpc_put_task(struct rpc_task *); void rpc_put_task_async(struct rpc_task *); bool rpc_task_set_rpc_status(struct rpc_task *task, int rpc_status); +void rpc_task_try_cancel(struct rpc_task *task, int error); void rpc_signal_task(struct rpc_task *); void rpc_exit_task(struct rpc_task *); void rpc_exit(struct rpc_task *, int); void rpc_release_calldata(const struct rpc_call_ops *, void *); void rpc_killall_tasks(struct rpc_clnt *); +unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error, + bool (*fnmatch)(const struct rpc_task *, + const void *), + const void *data); void rpc_execute(struct rpc_task *); void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *); void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index a8c341e435101..57677517b4747 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -873,6 +873,43 @@ void rpc_killall_tasks(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_killall_tasks); +/** + * rpc_cancel_tasks - try to cancel a set of RPC tasks + * @clnt: Pointer to RPC client + * @error: RPC task error value to set + * @fnmatch: Pointer to selector function + * @data: User data + * + * Uses 
@fnmatch to define a set of RPC tasks that are to be cancelled. + * The argument @error must be a negative error value. + */ +unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error, + bool (*fnmatch)(const struct rpc_task *, + const void *), + const void *data) +{ + struct rpc_task *task; + unsigned long count = 0; + + if (list_empty(&clnt->cl_tasks)) + return 0; + /* + * Spin lock all_tasks to prevent changes... + */ + spin_lock(&clnt->cl_lock); + list_for_each_entry(task, &clnt->cl_tasks, tk_task) { + if (!RPC_IS_ACTIVATED(task)) + continue; + if (!fnmatch(task, data)) + continue; + rpc_task_try_cancel(task, error); + count++; + } + spin_unlock(&clnt->cl_lock); + return count; +} +EXPORT_SYMBOL_GPL(rpc_cancel_tasks); + /* * Properly shut down an RPC client, terminating all outstanding * requests. diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index f388bfaf6ff03..de912e02371ba 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -872,6 +872,17 @@ void rpc_signal_task(struct rpc_task *task) rpc_wake_up_queued_task(queue, task); } +void rpc_task_try_cancel(struct rpc_task *task, int error) +{ + struct rpc_wait_queue *queue; + + if (!rpc_task_set_rpc_status(task, error)) + return; + queue = READ_ONCE(task->tk_waitqueue); + if (queue) + rpc_wake_up_queued_task(queue, task); +} + void rpc_exit(struct rpc_task *task, int status) { task->tk_status = status; -- GitLab From dc4c4304855a5721d214e2a53e17df5152dd5f34 Mon Sep 17 00:00:00 2001 From: Trond Myklebust <trond.myklebust@hammerspace.com> Date: Wed, 5 Oct 2022 15:57:37 -0400 Subject: [PATCH 1449/2223] SUNRPC: Add API to force the client to disconnect Allow the caller to force a disconnection of the RPC client so that we can clear any pending requests that are buffered in the socket. 
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- include/linux/sunrpc/clnt.h | 1 + net/sunrpc/clnt.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 75eea5ebb179b..770ef2cb57752 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -246,6 +246,7 @@ void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *, struct rpc_xprt *); bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, const struct sockaddr *sap); void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt); +void rpc_clnt_disconnect(struct rpc_clnt *clnt); void rpc_cleanup_clids(void); static inline int rpc_reply_expected(struct rpc_task *task) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 57677517b4747..993acf38af870 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -910,6 +910,20 @@ unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error, } EXPORT_SYMBOL_GPL(rpc_cancel_tasks); +static int rpc_clnt_disconnect_xprt(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, void *dummy) +{ + if (xprt_connected(xprt)) + xprt_force_disconnect(xprt); + return 0; +} + +void rpc_clnt_disconnect(struct rpc_clnt *clnt) +{ + rpc_clnt_iterate_for_each_xprt(clnt, rpc_clnt_disconnect_xprt, NULL); +} +EXPORT_SYMBOL_GPL(rpc_clnt_disconnect); + /* * Properly shut down an RPC client, terminating all outstanding * requests. -- GitLab From b739a5bd9d9f18cc69dced8db128ef7206e000cd Mon Sep 17 00:00:00 2001 From: Trond Myklebust <trond.myklebust@hammerspace.com> Date: Wed, 5 Oct 2022 15:57:38 -0400 Subject: [PATCH 1450/2223] NFSv4/flexfiles: Cancel I/O if the layout is recalled or revoked If the layout is recalled or revoked, we want to cancel I/O as quickly as possible so that we can return the layout. 
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> --- fs/nfs/flexfilelayout/flexfilelayout.c | 84 +++++++++++++++++++++++++- fs/nfs/pnfs.c | 9 ++- fs/nfs/pnfs.h | 9 +++ 3 files changed, 97 insertions(+), 5 deletions(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 1443330ae9985..1ec79ccf89ad2 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1379,6 +1379,11 @@ static int ff_layout_read_prepare_common(struct rpc_task *task, return -EIO; } + if (!pnfs_is_valid_lseg(hdr->lseg)) { + rpc_exit(task, -EAGAIN); + return -EAGAIN; + } + ff_layout_read_record_layoutstats_start(task, hdr); return 0; } @@ -1559,6 +1564,11 @@ static int ff_layout_write_prepare_common(struct rpc_task *task, return -EIO; } + if (!pnfs_is_valid_lseg(hdr->lseg)) { + rpc_exit(task, -EAGAIN); + return -EAGAIN; + } + ff_layout_write_record_layoutstats_start(task, hdr); return 0; } @@ -1651,15 +1661,23 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags); } -static void ff_layout_commit_prepare_common(struct rpc_task *task, - struct nfs_commit_data *cdata) +static int ff_layout_commit_prepare_common(struct rpc_task *task, + struct nfs_commit_data *cdata) { + if (!pnfs_is_valid_lseg(cdata->lseg)) { + rpc_exit(task, -EAGAIN); + return -EAGAIN; + } + ff_layout_commit_record_layoutstats_start(task, cdata); + return 0; } static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) { - ff_layout_commit_prepare_common(task, data); + if (ff_layout_commit_prepare_common(task, data)) + return; + rpc_call_start(task); } @@ -1955,6 +1973,65 @@ ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, ff_layout_initiate_commit); } +static bool ff_layout_match_rw(const struct rpc_task *task, + const struct nfs_pgio_header *hdr, + 
const struct pnfs_layout_segment *lseg) +{ + return hdr->lseg == lseg; +} + +static bool ff_layout_match_commit(const struct rpc_task *task, + const struct nfs_commit_data *cdata, + const struct pnfs_layout_segment *lseg) +{ + return cdata->lseg == lseg; +} + +static bool ff_layout_match_io(const struct rpc_task *task, const void *data) +{ + const struct rpc_call_ops *ops = task->tk_ops; + + if (ops == &ff_layout_read_call_ops_v3 || + ops == &ff_layout_read_call_ops_v4 || + ops == &ff_layout_write_call_ops_v3 || + ops == &ff_layout_write_call_ops_v4) + return ff_layout_match_rw(task, task->tk_calldata, data); + if (ops == &ff_layout_commit_call_ops_v3 || + ops == &ff_layout_commit_call_ops_v4) + return ff_layout_match_commit(task, task->tk_calldata, data); + return false; +} + +static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_ff_layout_ds *mirror_ds; + struct nfs4_pnfs_ds *ds; + struct nfs_client *ds_clp; + struct rpc_clnt *clnt; + u32 idx; + + for (idx = 0; idx < flseg->mirror_array_cnt; idx++) { + mirror = flseg->mirror_array[idx]; + mirror_ds = mirror->mirror_ds; + if (!mirror_ds) + continue; + ds = mirror->mirror_ds->ds; + if (!ds) + continue; + ds_clp = ds->ds_clp; + if (!ds_clp) + continue; + clnt = ds_clp->cl_rpcclient; + if (!clnt) + continue; + if (!rpc_cancel_tasks(clnt, -EAGAIN, ff_layout_match_io, lseg)) + continue; + rpc_clnt_disconnect(clnt); + } +} + static struct pnfs_ds_commit_info * ff_layout_get_ds_info(struct inode *inode) { @@ -2512,6 +2589,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .prepare_layoutreturn = ff_layout_prepare_layoutreturn, .sync = pnfs_nfs_generic_sync, .prepare_layoutstats = ff_layout_prepare_layoutstats, + .cancel_io = ff_layout_cancel_io, }; static int __init nfs4flexfilelayout_init(void) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2613b7e36eb95..d41fc1558e915 
100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -710,6 +710,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, u32 seq) { struct pnfs_layout_segment *lseg, *next; + struct nfs_server *server = NFS_SERVER(lo->plh_inode); int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); @@ -722,8 +723,10 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_seq, lseg->pls_range.offset, lseg->pls_range.length); - if (!mark_lseg_invalid(lseg, tmp_list)) - remaining++; + if (mark_lseg_invalid(lseg, tmp_list)) + continue; + remaining++; + pnfs_lseg_cancel_io(server, lseg); } dprintk("%s:Return %i\n", __func__, remaining); return remaining; @@ -2485,6 +2488,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, u32 seq) { struct pnfs_layout_segment *lseg, *next; + struct nfs_server *server = NFS_SERVER(lo->plh_inode); int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); @@ -2507,6 +2511,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, continue; remaining++; set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); + pnfs_lseg_cancel_io(server, lseg); } if (remaining) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f331f067691b0..e3e6a41f19de6 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -169,6 +169,8 @@ struct pnfs_layoutdriver_type { void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args); int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); + + void (*cancel_io)(struct pnfs_layout_segment *lseg); }; struct pnfs_commit_ops { @@ -685,6 +687,13 @@ pnfs_lseg_request_intersecting(struct pnfs_layout_segment *lseg, struct nfs_page req_offset(req), req_last); } +static inline void pnfs_lseg_cancel_io(struct nfs_server *server, + struct pnfs_layout_segment *lseg) +{ + if (server->pnfs_curr_ld->cancel_io) + server->pnfs_curr_ld->cancel_io(lseg); +} + extern 
unsigned int layoutstats_timer; #ifdef NFS_DEBUG -- GitLab From fd643afc8f605bcbb4181a2ad5eacf3233a47187 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Tue, 20 Sep 2022 15:28:22 -0700 Subject: [PATCH 1451/2223] perf record: Save DSO build-ID for synthesizing When synthesizing MMAP2 with build-id, it'd read the same file repeatedly as it has no idea if it's done already. Maintain a dsos to check that and skip the file access if possible. Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20220920222822.2171056-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/synthetic-events.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 289ea17ac5f7f..cccd293b53124 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -364,11 +364,14 @@ static bool read_proc_maps_line(struct io *io, __u64 *start, __u64 *end, } static void perf_record_mmap2__read_build_id(struct perf_record_mmap2 *event, + struct machine *machine, bool is_kernel) { struct build_id bid; struct nsinfo *nsi; struct nscookie nc; + struct dso *dso = NULL; + struct dso_id id; int rc; if (is_kernel) { @@ -376,6 +379,18 @@ static void perf_record_mmap2__read_build_id(struct perf_record_mmap2 *event, goto out; } + id.maj = event->maj; + id.min = event->min; + id.ino = event->ino; + id.ino_generation = event->ino_generation; + + dso = dsos__findnew_id(&machine->dsos, event->filename, &id); + if (dso && dso->has_build_id) { + bid = dso->bid; + rc = 0; + goto out; + } + nsi = nsinfo__new(event->pid); nsinfo__mountns_enter(nsi, &nc); @@ -391,12 +406,16 @@ out: 
event->header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID; event->__reserved_1 = 0; event->__reserved_2 = 0; + + if (dso && !dso->has_build_id) + dso__set_build_id(dso, &bid); } else { if (event->filename[0] == '/') { pr_debug2("Failed to read build ID for %s\n", event->filename); } } + dso__put(dso); } int perf_event__synthesize_mmap_events(struct perf_tool *tool, @@ -507,7 +526,7 @@ out: event->mmap2.tid = pid; if (symbol_conf.buildid_mmap2) - perf_record_mmap2__read_build_id(&event->mmap2, false); + perf_record_mmap2__read_build_id(&event->mmap2, machine, false); if (perf_tool__process_synth_event(tool, event, machine, process) != 0) { rc = -1; @@ -690,7 +709,7 @@ int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t memcpy(event->mmap2.filename, pos->dso->long_name, pos->dso->long_name_len + 1); - perf_record_mmap2__read_build_id(&event->mmap2, false); + perf_record_mmap2__read_build_id(&event->mmap2, machine, false); } else { size = PERF_ALIGN(pos->dso->long_name_len + 1, sizeof(u64)); event->mmap.header.type = PERF_RECORD_MMAP; @@ -1126,7 +1145,7 @@ static int __perf_event__synthesize_kernel_mmap(struct perf_tool *tool, event->mmap2.len = map->end - event->mmap.start; event->mmap2.pid = machine->pid; - perf_record_mmap2__read_build_id(&event->mmap2, true); + perf_record_mmap2__read_build_id(&event->mmap2, machine, true); } else { size = snprintf(event->mmap.filename, sizeof(event->mmap.filename), "%s%s", machine->mmap_name, kmap->ref_reloc_sym->name) + 1; -- GitLab From 60abedb8aa902b0692025d41351e8938991e3062 Mon Sep 17 00:00:00 2001 From: Leo Yan <leo.yan@linaro.org> Date: Thu, 6 Oct 2022 18:10:39 +0800 Subject: [PATCH 1452/2223] perf test: Introduce script for data symbol testing The test is designed with a data structure with 64-byte alignment, it has two fields "data1" and "data2", and other fields are reserved. 
Using the "perf mem" command, we can record and report memory samples for a self-contained workload with 1 second duration. If no samples are obtained for the data structure "buf1", it reports failure; and by checking the offset in structure "buf1", if the memory samples aren't for the "data1" and "data2" fields, it means wrong data symbol parsing and returns failure. Committer testing: [root@quaco ~]# grep -m1 "model name" /proc/cpuinfo model name : Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz [root@quaco ~]# [root@quaco ~]# perf test -v "data symbol" 104: Test data symbol : --- start --- test child forked, pid 192318 Compiling test program... Recording workload... [ perf record: Woken up 2 times to write data ] [ perf record: Captured and wrote 0.389 MB /tmp/__perf_test.perf.data.LIuQl (5570 samples) ] Cleaning up files... test child finished with 0 ---- end ---- Test data symbol: Ok [root@quaco ~]# Signed-off-by: Leo Yan <leo.yan@linaro.org> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Link: https://lore.kernel.org/r/20221006101039.47870-1-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_data_symbol.sh | 93 ++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100755 tools/perf/tests/shell/test_data_symbol.sh diff --git a/tools/perf/tests/shell/test_data_symbol.sh b/tools/perf/tests/shell/test_data_symbol.sh new file mode 100755 index 0000000000000..cd6eb54d235d8 --- /dev/null +++ b/tools/perf/tests/shell/test_data_symbol.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# Test data symbol + +# SPDX-License-Identifier: GPL-2.0 +# Leo Yan <leo.yan@linaro.org>, 2022 + +skip_if_no_mem_event() { + perf mem record -e 
list 2>&1 | egrep -q 'available' && return 0 + return 2 +} + +skip_if_no_mem_event || exit 2 + +# skip if there's no compiler +if ! [ -x "$(command -v cc)" ]; then + echo "skip: no compiler, install gcc" + exit 2 +fi + +TEST_PROGRAM=$(mktemp /tmp/__perf_test.program.XXXXX) +PERF_DATA=$(mktemp /tmp/__perf_test.perf.data.XXXXX) + +check_result() { + # The memory report format is as below: + # 99.92% ... [.] buf1+0x38 + result=$(perf mem report -i ${PERF_DATA} -s symbol_daddr -q 2>&1 | + awk '/buf1/ { print $4 }') + + # Testing is failed if has no any sample for "buf1" + [ -z "$result" ] && return 1 + + while IFS= read -r line; do + # The "data1" and "data2" fields in structure "buf1" have + # offset "0x0" and "0x38", returns failure if detect any + # other offset value. + if [ "$line" != "buf1+0x0" ] && [ "$line" != "buf1+0x38" ]; then + return 1 + fi + done <<< "$result" + + return 0 +} + +cleanup_files() +{ + echo "Cleaning up files..." + rm -f ${PERF_DATA} + rm -f ${TEST_PROGRAM} +} + +trap cleanup_files exit term int + +# compile test program +echo "Compiling test program..." +cat << EOF | cc -o ${TEST_PROGRAM} -x c - +typedef struct _buf { + char data1; + char reserved[55]; + char data2; +} buf __attribute__((aligned(64))); + +static buf buf1; + +int main(void) { + for (;;) { + buf1.data1++; + buf1.data2 += buf1.data1; + } + return 0; +} +EOF + +echo "Recording workload..." + +# perf mem/c2c internally uses IBS PMU on AMD CPU which doesn't support +# user/kernel filtering and per-process monitoring, spin program on +# specific CPU and test in per-CPU mode. +is_amd=$(egrep -c 'vendor_id.*AuthenticAMD' /proc/cpuinfo) +if (($is_amd >= 1)); then + perf mem record -o ${PERF_DATA} -C 0 -- taskset -c 0 $TEST_PROGRAM & +else + perf mem record --all-user -o ${PERF_DATA} -- $TEST_PROGRAM & +fi + +PERFPID=$! + +sleep 1 + +kill $PERFPID +wait $PERFPID + +check_result +exit $? 
-- GitLab From c63317ab14b0319690495b98d638c93ba81f1fb1 Mon Sep 17 00:00:00 2001 From: Carsten Haitzler <carsten.haitzler@arm.com> Date: Fri, 9 Sep 2022 16:27:51 +0100 Subject: [PATCH 1453/2223] perf test: Add CoreSight shell lib shared code for future tests This adds a library of shell "code" to be shared and used by future tests that target quality testing for Arm CoreSight support in perf and the Linux kernel. Signed-off-by: Carsten Haitzler <carsten.haitzler@arm.com> Reviewed-by: James Clark <james.clark@arm.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: coresight@lists.linaro.org Link: http://lore.kernel.org/lkml/20220909152803.2317006-2-carsten.haitzler@foss.arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/lib/coresight.sh | 132 ++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 tools/perf/tests/shell/lib/coresight.sh diff --git a/tools/perf/tests/shell/lib/coresight.sh b/tools/perf/tests/shell/lib/coresight.sh new file mode 100644 index 0000000000000..45a1477256b64 --- /dev/null +++ b/tools/perf/tests/shell/lib/coresight.sh @@ -0,0 +1,132 @@ +# SPDX-License-Identifier: GPL-2.0 +# Carsten Haitzler <carsten.haitzler@arm.com>, 2021 + +# This is sourced from a driver script so no need for #!/bin... etc. at the +# top - the assumption below is that it runs as part of sourcing after the +# test sets up some basic env vars to say what it is. + +# This currently works with ETMv4 / ETF not any other packet types at this +# point. This will need changes if that changes. + +# perf record options for the perf tests to use +PERFRECMEM="-m ,16M" +PERFRECOPT="$PERFRECMEM -e cs_etm//u" + +TOOLS=$(dirname $0) +DIR="$TOOLS/$TEST" +BIN="$DIR/$TEST" +# If the test tool/binary does not exist or is not executable then skip the test +if ! test -x "$BIN"; then exit 2; fi +DATD="."
+# If the data dir env is set then make the data dir use that instead of ./ +if test -n "$PERF_TEST_CORESIGHT_DATADIR"; then + DATD="$PERF_TEST_CORESIGHT_DATADIR"; +fi +# If the stat dir env is set then make the stat dir use that instead of ./ +STATD="." +if test -n "$PERF_TEST_CORESIGHT_STATDIR"; then + STATD="$PERF_TEST_CORESIGHT_STATDIR"; +fi + +# Called if the test fails - error code 1 +err() { + echo "$1" + exit 1 +} + +# Check that some statistics from our perf +check_val_min() { + STATF="$4" + if test "$2" -lt "$3"; then + echo ", FAILED" >> "$STATF" + err "Sanity check number of $1 is too low ($2 < $3)" + fi +} + +perf_dump_aux_verify() { + # Some basic checking that the AUX chunk contains some sensible data + # to see that we are recording something and at least a minimum + # amount of it. We should almost always see Fn packets in just about + # anything but certainly we will see some trace info and async + # packets + DUMP="$DATD/perf-tmp-aux-dump.txt" + perf report --stdio --dump -i "$1" | \ + grep -o -e I_ATOM_F -e I_ASYNC -e I_TRACE_INFO > "$DUMP" + # Simply count how many of these packets we find to see that we are + # producing a reasonable amount of data - exact checks are not sane + # as this is a lossy process where we may lose some blocks and the + # compiler may produce different code depending on the compiler and + # optimization options, so this is rough just to see if we're + # either missing almost all the data or all of it + ATOM_FX_NUM=`grep I_ATOM_F "$DUMP" | wc -l` + ASYNC_NUM=`grep I_ASYNC "$DUMP" | wc -l` + TRACE_INFO_NUM=`grep I_TRACE_INFO "$DUMP" | wc -l` + rm -f "$DUMP" + + # Arguments provide minimums for a pass + CHECK_FX_MIN="$2" + CHECK_ASYNC_MIN="$3" + CHECK_TRACE_INFO_MIN="$4" + + # Write out statistics, so over time you can track results to see if + # there is a pattern - for example we have less "noisy" results that + # produce more consistent amounts of data each run, to see if over + # time any techniques to minimize data
loss are having an effect or + # not + STATF="$STATD/stats-$TEST-$DATV.csv" + if ! test -f "$STATF"; then + echo "ATOM Fx Count, Minimum, ASYNC Count, Minimum, TRACE INFO Count, Minimum" > "$STATF" + fi + echo -n "$ATOM_FX_NUM, $CHECK_FX_MIN, $ASYNC_NUM, $CHECK_ASYNC_MIN, $TRACE_INFO_NUM, $CHECK_TRACE_INFO_MIN" >> "$STATF" + + # Actually check to see if we passed or failed. + check_val_min "ATOM_FX" "$ATOM_FX_NUM" "$CHECK_FX_MIN" "$STATF" + check_val_min "ASYNC" "$ASYNC_NUM" "$CHECK_ASYNC_MIN" "$STATF" + check_val_min "TRACE_INFO" "$TRACE_INFO_NUM" "$CHECK_TRACE_INFO_MIN" "$STATF" + echo ", Ok" >> "$STATF" +} + +perf_dump_aux_tid_verify() { + # Specifically crafted test will produce a list of Tread ID's to + # stdout that need to be checked to see that they have had trace + # info collected in AUX blocks in the perf data. This will go + # through all the TID's that are listed as CID=0xabcdef and see + # that all the Thread IDs the test tool reports are in the perf + # data AUX chunks + + # The TID test tools will print a TID per stdout line that are being + # tested + TIDS=`cat "$2"` + # Scan the perf report to find the TIDs that are actually CID in hex + # and build a list of the ones found + FOUND_TIDS=`perf report --stdio --dump -i "$1" | \ + grep -o "CID=0x[0-9a-z]\+" | sed 's/CID=//g' | \ + uniq | sort | uniq` + # No CID=xxx found - maybe your kernel is reporting these as + # VMID=xxx so look there + if test -z "$FOUND_TIDS"; then + FOUND_TIDS=`perf report --stdio --dump -i "$1" | \ + grep -o "VMID=0x[0-9a-z]\+" | sed 's/VMID=//g' | \ + uniq | sort | uniq` + fi + + # Iterate over the list of TIDs that the test says it has and find + # them in the TIDs found in the perf report + MISSING="" + for TID2 in $TIDS; do + FOUND="" + for TIDHEX in $FOUND_TIDS; do + TID=`printf "%i" $TIDHEX` + if test "$TID" -eq "$TID2"; then + FOUND="y" + break + fi + done + if test -z "$FOUND"; then + MISSING="$MISSING $TID" + fi + done + if test -n "$MISSING"; then + err "Thread IDs 
$MISSING not found in perf AUX data" + fi +} -- GitLab From 91954c6c904b515baafaee6a1f35c94409a3bb68 Mon Sep 17 00:00:00 2001 From: Daniel Gomez <daniel@qtec.com> Date: Sun, 25 Sep 2022 23:53:13 +0200 Subject: [PATCH 1454/2223] drm/amd/display: Fix mutex lock in dcn10 Removal of DC_FP_* wrappers from dml (9696679bf7ac) provokes a mutex lock [2] on the amdgpu driver. Re-arrange the dcn10 code to avoid locking the mutex by placing the DC_FP_* wrappers around the proper functions. This fixes the following WARN/stacktrace: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:283 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 227, name: systemd-udevd preempt_count: 1, expected: 0 CPU: 4 PID: 227 Comm: systemd-udevd Not tainted 6.0.0-rc6-qtec-standard #2 Hardware name: Qtechnology A/S QT5222/QT5221, BIOS v1.0.1 06/07/2021 Call Trace: <TASK> dump_stack_lvl+0x33/0x42 __might_resched.cold.172+0xa5/0xb3 mutex_lock+0x1a/0x40 amdgpu_dpm_get_clock_by_type_with_voltage+0x38/0x70 [amdgpu] dm_pp_get_clock_levels_by_type_with_voltage+0x64/0xa0 [amdgpu] dcn_bw_update_from_pplib+0x70/0x340 [amdgpu] dcn10_create_resource_pool+0x8c8/0xd20 [amdgpu] ? __kmalloc+0x1c7/0x4a0 dc_create_resource_pool+0xe7/0x190 [amdgpu] dc_create+0x212/0x5d0 [amdgpu] amdgpu_dm_init+0x246/0x370 [amdgpu] ? schedule_hrtimeout_range_clock+0x93/0x120 ? phm_wait_for_register_unequal.part.1+0x4a/0x80 [amdgpu] dm_hw_init+0xe/0x20 [amdgpu] amdgpu_device_init.cold.56+0x1324/0x1653 [amdgpu] ? pci_bus_read_config_word+0x43/0x80 amdgpu_driver_load_kms+0x15/0x120 [amdgpu] amdgpu_pci_probe+0x116/0x320 [amdgpu] pci_device_probe+0x97/0x110 really_probe+0xdd/0x340 __driver_probe_device+0x80/0x170 driver_probe_device+0x1f/0x90 __driver_attach+0xdc/0x180 ? __device_attach_driver+0x100/0x100 ? __device_attach_driver+0x100/0x100 bus_for_each_dev+0x74/0xc0 bus_add_driver+0x19e/0x210 ? kset_find_obj+0x30/0xa0 ? 0xffffffffa0a5b000 driver_register+0x6b/0xc0 ? 
0xffffffffa0a5b000 do_one_initcall+0x4a/0x1f0 ? __vunmap+0x28e/0x2f0 ? __cond_resched+0x15/0x30 ? kmem_cache_alloc_trace+0x3d/0x440 do_init_module+0x4a/0x1e0 load_module+0x1cba/0x1e10 ? __do_sys_finit_module+0xb7/0x120 __do_sys_finit_module+0xb7/0x120 do_syscall_64+0x3c/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7ff2b5f5422d Code: 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 ab 0e 00 f7 d8 64 89 01 48 RSP: 002b:00007ffc44ab28e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 RAX: ffffffffffffffda RBX: 0000555c566a9240 RCX: 00007ff2b5f5422d RDX: 0000000000000000 RSI: 00007ff2b60bb353 RDI: 0000000000000019 RBP: 00007ff2b60bb353 R08: 0000000000000000 R09: 0000555c566a9240 R10: 0000000000000019 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000020000 R14: 0000000000000000 R15: 0000000000000000 </TASK> Fixes: 9696679bf7ac ("drm/amd/display: remove DC_FP_* wrapper from dml folder") Reviewed-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Signed-off-by: Daniel Gomez <daniel@qtec.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../amd/display/dc/dcn10/dcn10_hw_sequencer.c | 12 +- .../drm/amd/display/dc/dcn10/dcn10_resource.c | 66 +++++++++- .../drm/amd/display/dc/dml/calcs/dcn_calcs.c | 118 ++++++++---------- .../gpu/drm/amd/display/dc/inc/dcn_calcs.h | 19 ++- 4 files changed, 138 insertions(+), 77 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c index 72521749c01d9..4390f6d7050fc 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c @@ -3005,6 +3005,7 @@ void dcn10_prepare_bandwidth( { struct dce_hwseq *hws = dc->hwseq; struct hubbub *hubbub = dc->res_pool->hubbub; + int min_fclk_khz, 
min_dcfclk_khz, socclk_khz; if (dc->debug.sanity_checks) hws->funcs.verify_allow_pstate_change_high(dc); @@ -3027,8 +3028,11 @@ void dcn10_prepare_bandwidth( if (dc->debug.pplib_wm_report_mode == WM_REPORT_OVERRIDE) { DC_FP_START(); - dcn_bw_notify_pplib_of_wm_ranges(dc); + dcn_get_soc_clks( + dc, &min_fclk_khz, &min_dcfclk_khz, &socclk_khz); DC_FP_END(); + dcn_bw_notify_pplib_of_wm_ranges( + dc, min_fclk_khz, min_dcfclk_khz, socclk_khz); } if (dc->debug.sanity_checks) @@ -3041,6 +3045,7 @@ void dcn10_optimize_bandwidth( { struct dce_hwseq *hws = dc->hwseq; struct hubbub *hubbub = dc->res_pool->hubbub; + int min_fclk_khz, min_dcfclk_khz, socclk_khz; if (dc->debug.sanity_checks) hws->funcs.verify_allow_pstate_change_high(dc); @@ -3064,8 +3069,11 @@ void dcn10_optimize_bandwidth( if (dc->debug.pplib_wm_report_mode == WM_REPORT_OVERRIDE) { DC_FP_START(); - dcn_bw_notify_pplib_of_wm_ranges(dc); + dcn_get_soc_clks( + dc, &min_fclk_khz, &min_dcfclk_khz, &socclk_khz); DC_FP_END(); + dcn_bw_notify_pplib_of_wm_ranges( + dc, min_fclk_khz, min_dcfclk_khz, socclk_khz); } if (dc->debug.sanity_checks) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c index 831080b9eb873..56d30baf12df2 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c @@ -1336,6 +1336,21 @@ static noinline void dcn10_resource_construct_fp( } } +static bool verify_clock_values(struct dm_pp_clock_levels_with_voltage *clks) +{ + int i; + + if (clks->num_levels == 0) + return false; + + for (i = 0; i < clks->num_levels; i++) + /* Ensure that the result is sane */ + if (clks->data[i].clocks_in_khz == 0) + return false; + + return true; +} + static bool dcn10_resource_construct( uint8_t num_virtual_links, struct dc *dc, @@ -1345,6 +1360,9 @@ static bool dcn10_resource_construct( int j; struct dc_context *ctx = dc->ctx; uint32_t pipe_fuses = read_pipe_fuses(ctx); + struct 
dm_pp_clock_levels_with_voltage fclks = {0}, dcfclks = {0}; + int min_fclk_khz, min_dcfclk_khz, socclk_khz; + bool res; ctx->dc_bios->regs = &bios_regs; @@ -1523,15 +1541,53 @@ static bool dcn10_resource_construct( && pool->base.pp_smu->rv_funcs.set_pme_wa_enable != NULL) dc->debug.az_endpoint_mute_only = false; - DC_FP_START(); - if (!dc->debug.disable_pplib_clock_request) - dcn_bw_update_from_pplib(dc); + + if (!dc->debug.disable_pplib_clock_request) { + /* + * TODO: This is not the proper way to obtain + * fabric_and_dram_bandwidth, should be min(fclk, memclk). + */ + res = dm_pp_get_clock_levels_by_type_with_voltage( + ctx, DM_PP_CLOCK_TYPE_FCLK, &fclks); + + DC_FP_START(); + + if (res) + res = verify_clock_values(&fclks); + + if (res) + dcn_bw_update_from_pplib_fclks(dc, &fclks); + else + BREAK_TO_DEBUGGER(); + + DC_FP_END(); + + res = dm_pp_get_clock_levels_by_type_with_voltage( + ctx, DM_PP_CLOCK_TYPE_DCFCLK, &dcfclks); + + DC_FP_START(); + + if (res) + res = verify_clock_values(&dcfclks); + + if (res) + dcn_bw_update_from_pplib_dcfclks(dc, &dcfclks); + else + BREAK_TO_DEBUGGER(); + + DC_FP_END(); + } + dcn_bw_sync_calcs_and_dml(dc); if (!dc->debug.disable_pplib_wm_range) { dc->res_pool = &pool->base; - dcn_bw_notify_pplib_of_wm_ranges(dc); + DC_FP_START(); + dcn_get_soc_clks( + dc, &min_fclk_khz, &min_dcfclk_khz, &socclk_khz); + DC_FP_END(); + dcn_bw_notify_pplib_of_wm_ranges( + dc, min_fclk_khz, min_dcfclk_khz, socclk_khz); } - DC_FP_END(); { struct irq_service_init_data init_data; diff --git a/drivers/gpu/drm/amd/display/dc/dml/calcs/dcn_calcs.c b/drivers/gpu/drm/amd/display/dc/dml/calcs/dcn_calcs.c index d46adc849d2aa..e73f089c84bb6 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/calcs/dcn_calcs.c +++ b/drivers/gpu/drm/amd/display/dc/dml/calcs/dcn_calcs.c @@ -1444,81 +1444,67 @@ unsigned int dcn_find_dcfclk_suits_all( return dcf_clk; } -static bool verify_clock_values(struct dm_pp_clock_levels_with_voltage *clks) +void dcn_bw_update_from_pplib_fclks( + 
struct dc *dc, + struct dm_pp_clock_levels_with_voltage *fclks) { - int i; - - if (clks->num_levels == 0) - return false; - - for (i = 0; i < clks->num_levels; i++) - /* Ensure that the result is sane */ - if (clks->data[i].clocks_in_khz == 0) - return false; + unsigned vmin0p65_idx, vmid0p72_idx, vnom0p8_idx, vmax0p9_idx; - return true; + ASSERT(fclks->num_levels); + + vmin0p65_idx = 0; + vmid0p72_idx = fclks->num_levels - + (fclks->num_levels > 2 ? 3 : (fclks->num_levels > 1 ? 2 : 1)); + vnom0p8_idx = fclks->num_levels - (fclks->num_levels > 1 ? 2 : 1); + vmax0p9_idx = fclks->num_levels - 1; + + dc->dcn_soc->fabric_and_dram_bandwidth_vmin0p65 = + 32 * (fclks->data[vmin0p65_idx].clocks_in_khz / 1000.0) / 1000.0; + dc->dcn_soc->fabric_and_dram_bandwidth_vmid0p72 = + dc->dcn_soc->number_of_channels * + (fclks->data[vmid0p72_idx].clocks_in_khz / 1000.0) + * ddr4_dram_factor_single_Channel / 1000.0; + dc->dcn_soc->fabric_and_dram_bandwidth_vnom0p8 = + dc->dcn_soc->number_of_channels * + (fclks->data[vnom0p8_idx].clocks_in_khz / 1000.0) + * ddr4_dram_factor_single_Channel / 1000.0; + dc->dcn_soc->fabric_and_dram_bandwidth_vmax0p9 = + dc->dcn_soc->number_of_channels * + (fclks->data[vmax0p9_idx].clocks_in_khz / 1000.0) + * ddr4_dram_factor_single_Channel / 1000.0; } -void dcn_bw_update_from_pplib(struct dc *dc) +void dcn_bw_update_from_pplib_dcfclks( + struct dc *dc, + struct dm_pp_clock_levels_with_voltage *dcfclks) { - struct dc_context *ctx = dc->ctx; - struct dm_pp_clock_levels_with_voltage fclks = {0}, dcfclks = {0}; - bool res; - unsigned vmin0p65_idx, vmid0p72_idx, vnom0p8_idx, vmax0p9_idx; - - /* TODO: This is not the proper way to obtain fabric_and_dram_bandwidth, should be min(fclk, memclk) */ - res = dm_pp_get_clock_levels_by_type_with_voltage( - ctx, DM_PP_CLOCK_TYPE_FCLK, &fclks); - - if (res) - res = verify_clock_values(&fclks); - - if (res) { - ASSERT(fclks.num_levels); - - vmin0p65_idx = 0; - vmid0p72_idx = fclks.num_levels - - (fclks.num_levels > 2 ? 
3 : (fclks.num_levels > 1 ? 2 : 1)); - vnom0p8_idx = fclks.num_levels - (fclks.num_levels > 1 ? 2 : 1); - vmax0p9_idx = fclks.num_levels - 1; - - dc->dcn_soc->fabric_and_dram_bandwidth_vmin0p65 = - 32 * (fclks.data[vmin0p65_idx].clocks_in_khz / 1000.0) / 1000.0; - dc->dcn_soc->fabric_and_dram_bandwidth_vmid0p72 = - dc->dcn_soc->number_of_channels * - (fclks.data[vmid0p72_idx].clocks_in_khz / 1000.0) - * ddr4_dram_factor_single_Channel / 1000.0; - dc->dcn_soc->fabric_and_dram_bandwidth_vnom0p8 = - dc->dcn_soc->number_of_channels * - (fclks.data[vnom0p8_idx].clocks_in_khz / 1000.0) - * ddr4_dram_factor_single_Channel / 1000.0; - dc->dcn_soc->fabric_and_dram_bandwidth_vmax0p9 = - dc->dcn_soc->number_of_channels * - (fclks.data[vmax0p9_idx].clocks_in_khz / 1000.0) - * ddr4_dram_factor_single_Channel / 1000.0; - } else - BREAK_TO_DEBUGGER(); - - res = dm_pp_get_clock_levels_by_type_with_voltage( - ctx, DM_PP_CLOCK_TYPE_DCFCLK, &dcfclks); - - if (res) - res = verify_clock_values(&dcfclks); + if (dcfclks->num_levels >= 3) { + dc->dcn_soc->dcfclkv_min0p65 = dcfclks->data[0].clocks_in_khz / 1000.0; + dc->dcn_soc->dcfclkv_mid0p72 = dcfclks->data[dcfclks->num_levels - 3].clocks_in_khz / 1000.0; + dc->dcn_soc->dcfclkv_nom0p8 = dcfclks->data[dcfclks->num_levels - 2].clocks_in_khz / 1000.0; + dc->dcn_soc->dcfclkv_max0p9 = dcfclks->data[dcfclks->num_levels - 1].clocks_in_khz / 1000.0; + } +} - if (res && dcfclks.num_levels >= 3) { - dc->dcn_soc->dcfclkv_min0p65 = dcfclks.data[0].clocks_in_khz / 1000.0; - dc->dcn_soc->dcfclkv_mid0p72 = dcfclks.data[dcfclks.num_levels - 3].clocks_in_khz / 1000.0; - dc->dcn_soc->dcfclkv_nom0p8 = dcfclks.data[dcfclks.num_levels - 2].clocks_in_khz / 1000.0; - dc->dcn_soc->dcfclkv_max0p9 = dcfclks.data[dcfclks.num_levels - 1].clocks_in_khz / 1000.0; - } else - BREAK_TO_DEBUGGER(); +void dcn_get_soc_clks( + struct dc *dc, + int *min_fclk_khz, + int *min_dcfclk_khz, + int *socclk_khz) +{ + *min_fclk_khz = dc->dcn_soc->fabric_and_dram_bandwidth_vmin0p65 * 
1000000 / 32; + *min_dcfclk_khz = dc->dcn_soc->dcfclkv_min0p65 * 1000; + *socclk_khz = dc->dcn_soc->socclk * 1000; } -void dcn_bw_notify_pplib_of_wm_ranges(struct dc *dc) +void dcn_bw_notify_pplib_of_wm_ranges( + struct dc *dc, + int min_fclk_khz, + int min_dcfclk_khz, + int socclk_khz) { struct pp_smu_funcs_rv *pp = NULL; struct pp_smu_wm_range_sets ranges = {0}; - int min_fclk_khz, min_dcfclk_khz, socclk_khz; const int overdrive = 5000000; /* 5 GHz to cover Overdrive */ if (dc->res_pool->pp_smu) @@ -1526,10 +1512,6 @@ void dcn_bw_notify_pplib_of_wm_ranges(struct dc *dc) if (!pp || !pp->set_wm_ranges) return; - min_fclk_khz = dc->dcn_soc->fabric_and_dram_bandwidth_vmin0p65 * 1000000 / 32; - min_dcfclk_khz = dc->dcn_soc->dcfclkv_min0p65 * 1000; - socclk_khz = dc->dcn_soc->socclk * 1000; - /* Now notify PPLib/SMU about which Watermarks sets they should select * depending on DPM state they are in. And update BW MGR GFX Engine and * Memory clock member variables for Watermarks calculations for each diff --git a/drivers/gpu/drm/amd/display/dc/inc/dcn_calcs.h b/drivers/gpu/drm/amd/display/dc/inc/dcn_calcs.h index 806f3041db141..9e4ddc9852406 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/dcn_calcs.h +++ b/drivers/gpu/drm/amd/display/dc/inc/dcn_calcs.h @@ -628,8 +628,23 @@ unsigned int dcn_find_dcfclk_suits_all( const struct dc *dc, struct dc_clocks *clocks); -void dcn_bw_update_from_pplib(struct dc *dc); -void dcn_bw_notify_pplib_of_wm_ranges(struct dc *dc); +void dcn_get_soc_clks( + struct dc *dc, + int *min_fclk_khz, + int *min_dcfclk_khz, + int *socclk_khz); + +void dcn_bw_update_from_pplib_fclks( + struct dc *dc, + struct dm_pp_clock_levels_with_voltage *fclks); +void dcn_bw_update_from_pplib_dcfclks( + struct dc *dc, + struct dm_pp_clock_levels_with_voltage *dcfclks); +void dcn_bw_notify_pplib_of_wm_ranges( + struct dc *dc, + int min_fclk_khz, + int min_dcfclk_khz, + int socclk_khz); void dcn_bw_sync_calcs_and_dml(struct dc *dc); enum source_macro_tile_size 
swizzle_mode_to_macro_tile_size(enum swizzle_mode_values sw_mode); -- GitLab From 8ab1d7a27eff87001ebd0977db600e4187f63f78 Mon Sep 17 00:00:00 2001 From: Alvin Lee <Alvin.Lee2@amd.com> Date: Wed, 17 Aug 2022 10:47:59 -0400 Subject: [PATCH 1455/2223] drm/amd/display: Only commit SubVP state after pipe programming [Description] We only want to commit the SubVP config to DMCUB after the main and phantom pipe programming has completed. Committing the state early can cause issues such as P-State being allowed by the HW early which causes the SubVP state machine to go into a bad state Reviewed-by: Jun Lei <Jun.Lei@amd.com> Acked-by: Brian Chang <Brian.Chang@amd.com> Signed-off-by: Alvin Lee <Alvin.Lee2@amd.com> Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/core/dc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 258ba5a872b11..ccaa43d071cf3 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -3495,6 +3495,9 @@ static void commit_planes_for_stream(struct dc *dc, if (update_type != UPDATE_TYPE_FAST) dc->hwss.post_unlock_program_front_end(dc, context); + if (update_type != UPDATE_TYPE_FAST) + if (dc->hwss.commit_subvp_config) + dc->hwss.commit_subvp_config(dc, context); if (update_type != UPDATE_TYPE_FAST) if (dc->hwss.commit_subvp_config) -- GitLab From d37f379ad04dcc21ebd1d2380c3bc979d54f7c46 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Thu, 29 Sep 2022 17:02:00 +0800 Subject: [PATCH 1456/2223] drm/amd/display: change to enc314_stream_encoder_dp_blank static enc314_stream_encoder_dp_blank is only used in dcn314_dio_stream_encoder.c now, change it to static.
Fixes: c55bf690fe79 ("drm/amd/display: Add explicit FIFO disable for DP blank") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../gpu/drm/amd/display/dc/dcn314/dcn314_dio_stream_encoder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dio_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dio_stream_encoder.c index 0d2ffb692957f..7e773bf7b895f 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dio_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dio_stream_encoder.c @@ -262,7 +262,7 @@ static bool is_two_pixels_per_containter(const struct dc_crtc_timing *timing) return two_pix; } -void enc314_stream_encoder_dp_blank( +static void enc314_stream_encoder_dp_blank( struct dc_link *link, struct stream_encoder *enc) { -- GitLab From 8abbc4f768ddc5c2190ab8966e529cec42b4b2d4 Mon Sep 17 00:00:00 2001 From: Li Zhong <floridsleeves@gmail.com> Date: Sat, 24 Sep 2022 15:19:39 -0700 Subject: [PATCH 1457/2223] drivers/amd/pm: check the return value of amdgpu_bo_kmap amdgpu_bo_kmap() returns error when fails to map buffer object. Add the error check and propagate the error. 
Signed-off-by: Li Zhong <floridsleeves@gmail.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c index 8fd0782a2b206..f5e08b60f66ef 100644 --- a/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c @@ -1384,13 +1384,16 @@ static int kv_dpm_enable(struct amdgpu_device *adev) static void kv_dpm_disable(struct amdgpu_device *adev) { struct kv_power_info *pi = kv_get_pi(adev); + int err; amdgpu_irq_put(adev, &adev->pm.dpm.thermal.irq, AMDGPU_THERMAL_IRQ_LOW_TO_HIGH); amdgpu_irq_put(adev, &adev->pm.dpm.thermal.irq, AMDGPU_THERMAL_IRQ_HIGH_TO_LOW); - amdgpu_kv_smc_bapm_enable(adev, false); + err = amdgpu_kv_smc_bapm_enable(adev, false); + if (err) + DRM_ERROR("amdgpu_kv_smc_bapm_enable failed\n"); if (adev->asic_type == CHIP_MULLINS) kv_enable_nb_dpm(adev, false); -- GitLab From f7367b5fe0e38af02d6915a355f2ee63b172c9ac Mon Sep 17 00:00:00 2001 From: Dillon Varone <Dillon.Varone@amd.com> Date: Mon, 19 Sep 2022 10:29:24 -0400 Subject: [PATCH 1458/2223] drm/amd/display: Program SubVP in dc_commit_state_no_check [Why?] Currently SubVP programming is only done in commit_planes_for_stream, as it was expected only this call would add/remove planes from a display. [How?] Add SubVP programming to dc_commit_state_no_check. 
Reviewed-by: Alvin Lee <Alvin.Lee2@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Dillon Varone <Dillon.Varone@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/core/dc.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index ccaa43d071cf3..4ba2c1f95dcbd 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -1734,10 +1734,20 @@ static enum dc_status dc_commit_state_no_check(struct dc *dc, struct dc_state *c int i, k, l; struct dc_stream_state *dc_streams[MAX_STREAMS] = {0}; struct dc_state *old_state; + bool subvp_prev_use = false; dc_z10_restore(dc); dc_allow_idle_optimizations(dc, false); + for (i = 0; i < dc->res_pool->pipe_count; i++) { + struct pipe_ctx *old_pipe = &dc->current_state->res_ctx.pipe_ctx[i]; + + /* Check old context for SubVP */ + subvp_prev_use |= (old_pipe->stream && old_pipe->stream->mall_stream_config.type == SUBVP_PHANTOM); + if (subvp_prev_use) + break; + } + for (i = 0; i < context->stream_count; i++) dc_streams[i] = context->streams[i]; @@ -1777,6 +1787,9 @@ static enum dc_status dc_commit_state_no_check(struct dc *dc, struct dc_state *c dc->hwss.wait_for_mpcc_disconnect(dc, dc->res_pool, pipe); } + if (dc->hwss.subvp_pipe_control_lock) + dc->hwss.subvp_pipe_control_lock(dc, context, true, true, NULL, subvp_prev_use); + result = dc->hwss.apply_ctx_to_hw(dc, context); if (result != DC_OK) { @@ -1794,6 +1807,12 @@ static enum dc_status dc_commit_state_no_check(struct dc *dc, struct dc_state *c dc->hwss.interdependent_update_lock(dc, context, false); dc->hwss.post_unlock_program_front_end(dc, context); } + + if (dc->hwss.commit_subvp_config) + dc->hwss.commit_subvp_config(dc, context); + if (dc->hwss.subvp_pipe_control_lock) + dc->hwss.subvp_pipe_control_lock(dc, context, false, true, NULL, subvp_prev_use); + for 
(i = 0; i < context->stream_count; i++) { const struct dc_link *link = context->streams[i]->link; -- GitLab From c1969fbaa57d88ddef626bb8ae313d38478d8631 Mon Sep 17 00:00:00 2001 From: Dillon Varone <Dillon.Varone@amd.com> Date: Mon, 19 Sep 2022 13:14:02 -0400 Subject: [PATCH 1459/2223] drm/amd/display: Reorder FCLK P-state switch sequence for DCN32 [WHY?] In some cases, DCFCLK hardmin requests are not acknowledged by SMU as the requested clock does not have a compatible ratio with current FCLK, and it cannot be changed as FCLK P-state is not allowed. [HOW?] Allow FCLK p-state change prior to changing DCFCLK hardmin. Reviewed-by: Alvin Lee <Alvin.Lee2@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Dillon Varone <Dillon.Varone@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c | 44 ++++++++++--------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c index f0f3f66629cc0..96d5e0d5b3ce0 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c @@ -333,6 +333,21 @@ static void dcn32_update_clocks(struct clk_mgr *clk_mgr_base, if (enter_display_off == safe_to_lower) dcn30_smu_set_num_of_displays(clk_mgr, display_count); + clk_mgr_base->clks.fclk_prev_p_state_change_support = clk_mgr_base->clks.fclk_p_state_change_support; + + total_plane_count = clk_mgr_helper_get_active_plane_cnt(dc, context); + fclk_p_state_change_support = new_clocks->fclk_p_state_change_support || (total_plane_count == 0); + + if (should_update_pstate_support(safe_to_lower, fclk_p_state_change_support, clk_mgr_base->clks.fclk_p_state_change_support)) { + clk_mgr_base->clks.fclk_p_state_change_support = fclk_p_state_change_support; + + /* To enable FCLK P-state switching, send 
FCLK_PSTATE_NOTSUPPORTED message to PMFW */ + if (clk_mgr_base->ctx->dce_version != DCN_VERSION_3_21 && clk_mgr_base->clks.fclk_p_state_change_support && update_fclk) { + /* Handle the code for sending a message to PMFW that FCLK P-state change is supported */ + dcn32_smu_send_fclk_pstate_message(clk_mgr, FCLK_PSTATE_SUPPORTED); + } + } + if (dc->debug.force_min_dcfclk_mhz > 0) new_clocks->dcfclk_khz = (new_clocks->dcfclk_khz > (dc->debug.force_min_dcfclk_mhz * 1000)) ? new_clocks->dcfclk_khz : (dc->debug.force_min_dcfclk_mhz * 1000); @@ -352,7 +367,6 @@ static void dcn32_update_clocks(struct clk_mgr *clk_mgr_base, clk_mgr_base->clks.socclk_khz = new_clocks->socclk_khz; clk_mgr_base->clks.prev_p_state_change_support = clk_mgr_base->clks.p_state_change_support; - clk_mgr_base->clks.fclk_prev_p_state_change_support = clk_mgr_base->clks.fclk_p_state_change_support; clk_mgr_base->clks.prev_num_ways = clk_mgr_base->clks.num_ways; if (clk_mgr_base->clks.num_ways != new_clocks->num_ways && @@ -361,9 +375,8 @@ static void dcn32_update_clocks(struct clk_mgr *clk_mgr_base, dcn32_smu_send_cab_for_uclk_message(clk_mgr, clk_mgr_base->clks.num_ways); } - total_plane_count = clk_mgr_helper_get_active_plane_cnt(dc, context); + p_state_change_support = new_clocks->p_state_change_support || (total_plane_count == 0); - fclk_p_state_change_support = new_clocks->fclk_p_state_change_support || (total_plane_count == 0); if (should_update_pstate_support(safe_to_lower, p_state_change_support, clk_mgr_base->clks.p_state_change_support)) { clk_mgr_base->clks.p_state_change_support = p_state_change_support; @@ -373,15 +386,14 @@ static void dcn32_update_clocks(struct clk_mgr *clk_mgr_base, clk_mgr_base->bw_params->clk_table.entries[clk_mgr_base->bw_params->clk_table.num_entries - 1].memclk_mhz); } - if (should_update_pstate_support(safe_to_lower, fclk_p_state_change_support, clk_mgr_base->clks.fclk_p_state_change_support) && - clk_mgr_base->ctx->dce_version != DCN_VERSION_3_21) { - 
clk_mgr_base->clks.fclk_p_state_change_support = fclk_p_state_change_support; + /* Always update saved value, even if new value not set due to P-State switching unsupported. Also check safe_to_lower for FCLK */ + if (safe_to_lower && (clk_mgr_base->clks.fclk_p_state_change_support != clk_mgr_base->clks.fclk_prev_p_state_change_support)) { + update_fclk = true; + } - /* To disable FCLK P-state switching, send FCLK_PSTATE_NOTSUPPORTED message to PMFW */ - if (clk_mgr_base->ctx->dce_version != DCN_VERSION_3_21 && !clk_mgr_base->clks.fclk_p_state_change_support) { - /* Handle code for sending a message to PMFW that FCLK P-state change is not supported */ - dcn32_smu_send_fclk_pstate_message(clk_mgr, FCLK_PSTATE_NOTSUPPORTED); - } + if (clk_mgr_base->ctx->dce_version != DCN_VERSION_3_21 && !clk_mgr_base->clks.fclk_p_state_change_support && update_fclk) { + /* Handle code for sending a message to PMFW that FCLK P-state change is not supported */ + dcn32_smu_send_fclk_pstate_message(clk_mgr, FCLK_PSTATE_NOTSUPPORTED); } /* Always update saved value, even if new value not set due to P-State switching unsupported */ @@ -390,21 +402,11 @@ static void dcn32_update_clocks(struct clk_mgr *clk_mgr_base, update_uclk = true; } - /* Always update saved value, even if new value not set due to P-State switching unsupported. 
Also check safe_to_lower for FCLK */ - if (safe_to_lower && (clk_mgr_base->clks.fclk_p_state_change_support != clk_mgr_base->clks.fclk_prev_p_state_change_support)) { - update_fclk = true; - } - /* set UCLK to requested value if P-State switching is supported, or to re-enable P-State switching */ if (clk_mgr_base->clks.p_state_change_support && (update_uclk || !clk_mgr_base->clks.prev_p_state_change_support)) dcn32_smu_set_hard_min_by_freq(clk_mgr, PPCLK_UCLK, khz_to_mhz_ceil(clk_mgr_base->clks.dramclk_khz)); - if (clk_mgr_base->ctx->dce_version != DCN_VERSION_3_21 && clk_mgr_base->clks.fclk_p_state_change_support && update_fclk) { - /* Handle the code for sending a message to PMFW that FCLK P-state change is supported */ - dcn32_smu_send_fclk_pstate_message(clk_mgr, FCLK_PSTATE_SUPPORTED); - } - if (clk_mgr_base->clks.num_ways != new_clocks->num_ways && clk_mgr_base->clks.num_ways > new_clocks->num_ways) { clk_mgr_base->clks.num_ways = new_clocks->num_ways; -- GitLab From ab5220bb5a910246c61512a9e29a4e2406cb1ecd Mon Sep 17 00:00:00 2001 From: Dmytro Laktyushkin <Dmytro.Laktyushkin@amd.com> Date: Thu, 8 Sep 2022 11:37:58 -0400 Subject: [PATCH 1460/2223] drm/amd/display: fix dcn315 dml detile overestimation DML does not take into account the fact that dcn315 does not have enough detile buffer to max all pipes. This change adds a workaround to apply the same logic DC does when calculating detile buffer size in DML.
Reviewed-by: Charlene Liu <Charlene.Liu@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Dmytro Laktyushkin <Dmytro.Laktyushkin@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c | 2 +- .../display/dc/dml/dcn31/display_mode_vba_31.c | 15 +++++++++++++++ .../gpu/drm/amd/display/dc/dml/display_mode_lib.c | 1 + .../gpu/drm/amd/display/dc/dml/display_mode_lib.h | 1 + 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c index b6e99eefe869e..94b0842cd89b5 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c @@ -739,7 +739,7 @@ void dcn315_update_bw_bounding_box(struct dc *dc, struct clk_bw_params *bw_param } if (!IS_FPGA_MAXIMUS_DC(dc->ctx->dce_environment)) - dml_init_instance(&dc->dml, &dcn3_15_soc, &dcn3_15_ip, DML_PROJECT_DCN31); + dml_init_instance(&dc->dml, &dcn3_15_soc, &dcn3_15_ip, DML_PROJECT_DCN315); else dml_init_instance(&dc->dml, &dcn3_15_soc, &dcn3_15_ip, DML_PROJECT_DCN31_FPGA); } diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c b/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c index 8dfe639b65087..b612edb144172 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c @@ -43,6 +43,8 @@ #define BPP_BLENDED_PIPE 0xffffffff #define DCN31_MAX_DSC_IMAGE_WIDTH 5184 #define DCN31_MAX_FMT_420_BUFFER_WIDTH 4096 +#define DCN3_15_MIN_COMPBUF_SIZE_KB 128 +#define DCN3_15_MAX_DET_SIZE 384 // For DML-C changes that hasn't been propagated to VBA yet //#define __DML_VBA_ALLOW_DELTA__ @@ -3775,6 +3777,17 @@ static noinline void CalculatePrefetchSchedulePerPlane( &v->VReadyOffsetPix[k]); } +static void PatchDETBufferSizeInKByte(unsigned int NumberOfActivePlanes, int 
NoOfDPPThisState[], unsigned int config_return_buffer_size_in_kbytes, unsigned int *DETBufferSizeInKByte) +{ + int i, total_pipes = 0; + for (i = 0; i < NumberOfActivePlanes; i++) + total_pipes += NoOfDPPThisState[i]; + *DETBufferSizeInKByte = ((config_return_buffer_size_in_kbytes - DCN3_15_MIN_COMPBUF_SIZE_KB) / 64 / total_pipes) * 64; + if (*DETBufferSizeInKByte > DCN3_15_MAX_DET_SIZE) + *DETBufferSizeInKByte = DCN3_15_MAX_DET_SIZE; +} + + void dml31_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_lib) { struct vba_vars_st *v = &mode_lib->vba; @@ -4533,6 +4546,8 @@ void dml31_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_l v->ODMCombineEnableThisState[k] = v->ODMCombineEnablePerState[i][k]; } + if (v->NumberOfActivePlanes > 1 && mode_lib->project == DML_PROJECT_DCN315) + PatchDETBufferSizeInKByte(v->NumberOfActivePlanes, v->NoOfDPPThisState, v->ip.config_return_buffer_size_in_kbytes, &v->DETBufferSizeInKByte[0]); CalculateSwathAndDETConfiguration( false, v->NumberOfActivePlanes, diff --git a/drivers/gpu/drm/amd/display/dc/dml/display_mode_lib.c b/drivers/gpu/drm/amd/display/dc/dml/display_mode_lib.c index f5400eda07a53..4125d3d111d15 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/display_mode_lib.c +++ b/drivers/gpu/drm/amd/display/dc/dml/display_mode_lib.c @@ -114,6 +114,7 @@ void dml_init_instance(struct display_mode_lib *lib, break; case DML_PROJECT_DCN31: case DML_PROJECT_DCN31_FPGA: + case DML_PROJECT_DCN315: lib->funcs = dml31_funcs; break; case DML_PROJECT_DCN314: diff --git a/drivers/gpu/drm/amd/display/dc/dml/display_mode_lib.h b/drivers/gpu/drm/amd/display/dc/dml/display_mode_lib.h index b1878a1440e2b..3d643d50c3eb5 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/display_mode_lib.h +++ b/drivers/gpu/drm/amd/display/dc/dml/display_mode_lib.h @@ -40,6 +40,7 @@ enum dml_project { DML_PROJECT_DCN21, DML_PROJECT_DCN30, DML_PROJECT_DCN31, + DML_PROJECT_DCN315, DML_PROJECT_DCN31_FPGA, DML_PROJECT_DCN314, 
DML_PROJECT_DCN32, -- GitLab From d35e8b7ae01430b1e722547b2ef40f42dc30520f Mon Sep 17 00:00:00 2001 From: Alvin Lee <Alvin.Lee2@amd.com> Date: Tue, 20 Sep 2022 10:46:18 -0400 Subject: [PATCH 1461/2223] drm/amd/display: Block SubVP if rotation being used [Description] - SubVP rotation support is not explicitly implemented, so block SubVP in rotation cases to avoid unexpected behaviors Reviewed-by: Nevenko Stupar <Nevenko.Stupar@amd.com> Reviewed-by: Jun Lei <Jun.Lei@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Alvin Lee <Alvin.Lee2@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../drm/amd/display/dc/dcn32/dcn32_resource.h | 2 ++ .../display/dc/dcn32/dcn32_resource_helpers.c | 17 +++++++++++++++++ .../drm/amd/display/dc/dml/dcn32/dcn32_fpu.c | 3 ++- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.h b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.h index 55945cca2260d..a24f538bdc4cc 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.h +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.h @@ -108,6 +108,8 @@ bool dcn32_subvp_in_use(struct dc *dc, bool dcn32_mpo_in_use(struct dc_state *context); +bool dcn32_any_surfaces_rotated(struct dc *dc, struct dc_state *context); + struct pipe_ctx *dcn32_acquire_idle_pipe_for_head_pipe_in_layer( struct dc_state *state, const struct resource_pool *pool, diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c index a2a70a1572b7f..7f318ced5dee4 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c @@ -233,6 +233,23 @@ bool dcn32_mpo_in_use(struct dc_state *context) return false; } + +bool dcn32_any_surfaces_rotated(struct dc *dc, struct dc_state *context) +{ + uint32_t i; + + for (i = 0; i < dc->res_pool->pipe_count; i++) { + 
struct pipe_ctx *pipe = &context->res_ctx.pipe_ctx[i]; + + if (!pipe->stream) + continue; + + if (pipe->plane_state && pipe->plane_state->rotation != ROTATION_ANGLE_0) + return true; + } + return false; +} + /** * ******************************************************************************************* * dcn32_determine_det_override: Determine DET allocation for each pipe diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c index 0571700f53f93..a56ee04f7df93 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c @@ -1115,7 +1115,8 @@ static void dcn32_full_validate_bw_helper(struct dc *dc, * 5. (Config doesn't support MCLK in VACTIVE/VBLANK || dc->debug.force_subvp_mclk_switch) */ if (!dc->debug.force_disable_subvp && dcn32_all_pipes_have_stream_and_plane(dc, context) && - !dcn32_mpo_in_use(context) && (*vlevel == context->bw_ctx.dml.soc.num_states || + !dcn32_mpo_in_use(context) && !dcn32_any_surfaces_rotated(dc, context) && + (*vlevel == context->bw_ctx.dml.soc.num_states || vba->DRAMClockChangeSupport[*vlevel][vba->maxMpcComb] == dm_dram_clock_change_unsupported || dc->debug.force_subvp_mclk_switch)) { -- GitLab From 96ab3cb3b0f862308a03046d01d66c7b4154846b Mon Sep 17 00:00:00 2001 From: Aric Cyr <aric.cyr@amd.com> Date: Mon, 19 Sep 2022 17:42:22 -0400 Subject: [PATCH 1462/2223] Revert "drm/amd/display: correct hostvm flag" This reverts commit 796d6a37ff5ffaf9f2dc0f3f4bf9f4a1034c00de. 4K144 resolution isn't available on DCN31. 
Reviewed-by: Sherry Wang <Yao.Wang1@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Aric Cyr <aric.cyr@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c index 8c1a6fb36306a..8745132d6374c 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c @@ -890,7 +890,7 @@ static const struct dc_debug_options debug_defaults_drv = { .disable_z10 = true, .optimize_edp_link_rate = true, .enable_z9_disable_interface = true, /* Allow support for the PMFW interface for disable Z9*/ - .dml_hostvm_override = DML_HOSTVM_NO_OVERRIDE, + .dml_hostvm_override = DML_HOSTVM_OVERRIDE_FALSE, }; static const struct dc_debug_options debug_defaults_diags = { -- GitLab From dfb3367bd082ccf52d3c13ff62257f08407dffcf Mon Sep 17 00:00:00 2001 From: Charlene Liu <Charlene.Liu@amd.com> Date: Tue, 20 Sep 2022 09:23:06 -0400 Subject: [PATCH 1463/2223] drm/amd/display: prevent S4 test from failing [why] limit the vm prefetch check for now, until the feature is fully verified. 
Reviewed-by: Hansen Dsouza <Hansen.Dsouza@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Charlene Liu <Charlene.Liu@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn21/dcn21_hubbub.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_hubbub.c b/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_hubbub.c index 5752271f22dfe..c5e200d09038f 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_hubbub.c +++ b/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_hubbub.c @@ -67,15 +67,9 @@ static uint32_t convert_and_clamp( void dcn21_dchvm_init(struct hubbub *hubbub) { struct dcn20_hubbub *hubbub1 = TO_DCN20_HUBBUB(hubbub); - uint32_t riommu_active, prefetch_done; + uint32_t riommu_active; int i; - REG_GET(DCHVM_RIOMMU_STAT0, HOSTVM_PREFETCH_DONE, &prefetch_done); - - if (prefetch_done) { - hubbub->riommu_active = true; - return; - } //Init DCHVM block REG_UPDATE(DCHVM_CTRL0, HOSTVM_INIT_REQ, 1); -- GitLab From 40169e2f37127b7fe60736045b1f9fc04f76b471 Mon Sep 17 00:00:00 2001 From: Alvin Lee <Alvin.Lee2@amd.com> Date: Tue, 20 Sep 2022 19:26:27 -0400 Subject: [PATCH 1464/2223] drm/amd/display: Disable GSL when enabling phantom pipe [Description] When enabling phantom pipe on a pipe that was previously using immediate flip, we have to disable GSL or this will prevent the update from taking place right away on the phantom pipe when we enable it in FW Reviewed-by: Jun Lei <Jun.Lei@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Alvin Lee <Alvin.Lee2@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubp.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubp.c index 2038cbda33f74..830562f4139dc 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubp.c +++ 
b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubp.c @@ -79,6 +79,8 @@ void hubp32_phantom_hubp_post_enable(struct hubp *hubp) uint32_t reg_val; struct dcn20_hubp *hubp2 = TO_DCN20_HUBP(hubp); + /* For phantom pipe enable, disable GSL */ + REG_UPDATE(DCSURF_FLIP_CONTROL2, SURFACE_GSL_ENABLE, 0); REG_UPDATE(DCHUBP_CNTL, HUBP_BLANK_EN, 1); reg_val = REG_READ(DCHUBP_CNTL); if (reg_val) { -- GitLab From 283e0a673cdf59fe103707ac0466492b315c81a2 Mon Sep 17 00:00:00 2001 From: Wenjing Liu <wenjing.liu@amd.com> Date: Thu, 22 Sep 2022 14:22:04 -0400 Subject: [PATCH 1465/2223] drm/amd/display: fix integer overflow during MSA V_Freq calculation [why] Analyzer shows incorrect V freq in MSA for some large timing. [how] Cast a 32-bit integer to uint64_t before multiplication to avoid integer overflow for a very large timing. Reviewed-by: Ariel Bernstein <Eric.Bernstein@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Wenjing Liu <wenjing.liu@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../drm/amd/display/dc/dcn31/dcn31_hpo_dp_stream_encoder.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hpo_dp_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hpo_dp_stream_encoder.c index 52fb2bf3d5781..d71d89268a07a 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hpo_dp_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hpo_dp_stream_encoder.c @@ -197,7 +197,7 @@ static void dcn31_hpo_dp_stream_enc_set_stream_attribute( uint32_t h_back_porch; uint32_t h_width; uint32_t v_height; - unsigned long long v_freq; + uint64_t v_freq; uint8_t misc0 = 0; uint8_t misc1 = 0; uint8_t hsp; @@ -360,7 +360,7 @@ static void dcn31_hpo_dp_stream_enc_set_stream_attribute( v_height = hw_crtc_timing.v_border_top + hw_crtc_timing.v_addressable + hw_crtc_timing.v_border_bottom; hsp = hw_crtc_timing.flags.HSYNC_POSITIVE_POLARITY ?
0 : 0x80; vsp = hw_crtc_timing.flags.VSYNC_POSITIVE_POLARITY ? 0 : 0x80; - v_freq = hw_crtc_timing.pix_clk_100hz * 100; + v_freq = (uint64_t)hw_crtc_timing.pix_clk_100hz * 100; /* MSA Packet Mapping to 32-bit Link Symbols - DP2 spec, section 2.7.4.1 * -- GitLab From 749b6c2ac9d9a7a4d8f4c2e4dc6fa830fd6c6ac7 Mon Sep 17 00:00:00 2001 From: "Leo (Hanghong) Ma" <hanghong.ma@amd.com> Date: Tue, 20 Sep 2022 15:23:42 -0400 Subject: [PATCH 1466/2223] drm/amd/display: AUX tracing cleanup [Why && How] Remove the unnecessary AUX trace and use one trace for AUX failure. Reviewed-by: Martin Leung <Martin.Leung@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Leo (Hanghong) Ma <hanghong.ma@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dce/dce_aux.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dce/dce_aux.c b/drivers/gpu/drm/amd/display/dc/dce/dce_aux.c index 32782ef9ef778..140297c8ff555 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dce_aux.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dce_aux.c @@ -942,10 +942,6 @@ bool dce_aux_transfer_with_retries(struct ddc_service *ddc, case AUX_RET_ERROR_ENGINE_ACQUIRE: case AUX_RET_ERROR_UNKNOWN: default: - DC_TRACE_LEVEL_MESSAGE(DAL_TRACE_LEVEL_INFORMATION, - LOG_FLAG_I2cAux_DceAux, - "dce_aux_transfer_with_retries: Failure: operation_result=%d", - (int)operation_result); goto fail; } } @@ -953,14 +949,11 @@ bool dce_aux_transfer_with_retries(struct ddc_service *ddc, fail: DC_TRACE_LEVEL_MESSAGE(DAL_TRACE_LEVEL_ERROR, LOG_FLAG_Error_I2cAux, - "dce_aux_transfer_with_retries: FAILURE"); + "%s: Failure: operation_result=%d", + __func__, + (int)operation_result); if (!payload_reply) payload->reply = NULL; - DC_TRACE_LEVEL_MESSAGE(DAL_TRACE_LEVEL_ERROR, - WPP_BIT_FLAG_DC_ERROR, - "AUX transaction failed. 
Result: %d", - operation_result); - return false; } -- GitLab From 7aeb2e47e43d5acd4638c64b4c0c01ad90feea51 Mon Sep 17 00:00:00 2001 From: Iswara Nagulendran <Iswara.Nagulendran@amd.com> Date: Mon, 19 Sep 2022 15:53:56 -0400 Subject: [PATCH 1467/2223] drm/amd/display: Allow PSR exit when panel is disconnected [HOW&WHY] Fixed check to only avoid PSR entry when panel is disconnected. PSR exit can be permitted to restore the HW to its non-PSR state. Reviewed-by: Jayendran Ramani <Jayendran.Ramani@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Iswara Nagulendran <Iswara.Nagulendran@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/core/dc_link.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c b/drivers/gpu/drm/amd/display/dc/core/dc_link.c index 3d19fb92333be..895c6e6bfeb84 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c @@ -3143,7 +3143,7 @@ bool dc_link_set_psr_allow_active(struct dc_link *link, const bool *allow_active if (!dc_get_edp_link_panel_inst(dc, link, &panel_inst)) return false; - if (allow_active && link->type == dc_connection_none) { + if ((allow_active != NULL) && (*allow_active == true) && (link->type == dc_connection_none)) { // Don't enter PSR if panel is not connected return false; } -- GitLab From 1178ac68dc2869a2f4192600b701de3d853272d2 Mon Sep 17 00:00:00 2001 From: Ian Chen <ian.chen@amd.com> Date: Tue, 23 Aug 2022 17:26:51 +0800 Subject: [PATCH 1468/2223] drm/amd/display: Refactor edp ILR caps codes We split out ILR config from "global" to "per-panel" config settings.
Reviewed-by: Anthony Koo <Anthony.Koo@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Ian Chen <ian.chen@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/core/dc_link.c | 5 ++++- drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c | 4 ++-- drivers/gpu/drm/amd/display/dc/dc.h | 1 - drivers/gpu/drm/amd/display/dc/dc_link.h | 4 ++++ .../gpu/drm/amd/display/dc/dcn21/dcn21_resource.c | 13 ++++++++++++- .../gpu/drm/amd/display/dc/dcn31/dcn31_resource.c | 13 ++++++++++++- .../gpu/drm/amd/display/dc/dcn314/dcn314_resource.c | 13 ++++++++++++- .../gpu/drm/amd/display/dc/dcn315/dcn315_resource.c | 13 ++++++++++++- .../gpu/drm/amd/display/dc/dcn316/dcn316_resource.c | 13 ++++++++++++- drivers/gpu/drm/amd/display/dc/inc/core_types.h | 1 + 10 files changed, 71 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c b/drivers/gpu/drm/amd/display/dc/core/dc_link.c index 895c6e6bfeb84..c4daef1e708c9 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c @@ -1307,7 +1307,10 @@ static bool detect_link_and_local_sink(struct dc_link *link, } if (link->connector_signal == SIGNAL_TYPE_EDP) { - // Init dc_panel_config + /* Init dc_panel_config by HW config */ + if (dc_ctx->dc->res_pool->funcs->get_panel_config_defaults) + dc_ctx->dc->res_pool->funcs->get_panel_config_defaults(&link->panel_config); + /* Pickup base DM settings */ dm_helpers_init_panel_settings(dc_ctx, &link->panel_config, sink); // Override dc_panel_config if system has specific settings dm_helpers_override_panel_settings(dc_ctx, &link->panel_config); diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c index c57df45e83ff5..70456580eecc7 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c @@ -5795,7 +5795,7 @@ void 
detect_edp_sink_caps(struct dc_link *link) * Per VESA eDP spec, "The DPCD revision for eDP v1.4 is 13h" */ if (link->dpcd_caps.dpcd_rev.raw >= DPCD_REV_13 && - (link->dc->debug.optimize_edp_link_rate || + (link->panel_config.ilr.optimize_edp_link_rate || link->reported_link_cap.link_rate == LINK_RATE_UNKNOWN)) { // Read DPCD 00010h - 0001Fh 16 bytes at one shot core_link_read_dpcd(link, DP_SUPPORTED_LINK_RATES, @@ -6744,7 +6744,7 @@ bool is_edp_ilr_optimization_required(struct dc_link *link, struct dc_crtc_timin ASSERT(link || crtc_timing); // invalid input if (link->dpcd_caps.edp_supported_link_rates_count == 0 || - !link->dc->debug.optimize_edp_link_rate) + !link->panel_config.ilr.optimize_edp_link_rate) return false; diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 2ecf36e6329bd..458a4f431ac6e 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -821,7 +821,6 @@ struct dc_debug_options { /* Enable dmub aux for legacy ddc */ bool enable_dmub_aux_for_legacy_ddc; bool disable_fams; - bool optimize_edp_link_rate; /* eDP ILR */ /* FEC/PSR1 sequence enable delay in 100us */ uint8_t fec_enable_delay_in100us; bool enable_driver_sequence_debug; diff --git a/drivers/gpu/drm/amd/display/dc/dc_link.h b/drivers/gpu/drm/amd/display/dc/dc_link.h index bf5f9e2773bc0..caf0c7af2d0b9 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_link.h +++ b/drivers/gpu/drm/amd/display/dc/dc_link.h @@ -138,6 +138,10 @@ struct dc_panel_config { bool disable_dsc_edp; unsigned int force_dsc_edp_policy; } dsc; + /* eDP ILR */ + struct ilr { + bool optimize_edp_link_rate; /* eDP ILR */ + } ilr; }; /* * A link contains one or more sinks and their connected status. 
diff --git a/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c b/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c index 7cb35bb1c0f15..887081472c0d8 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c @@ -657,7 +657,6 @@ static const struct dc_debug_options debug_defaults_drv = { .usbc_combo_phy_reset_wa = true, .dmub_command_table = true, .use_max_lb = true, - .optimize_edp_link_rate = true }; static const struct dc_debug_options debug_defaults_diags = { @@ -677,6 +676,12 @@ static const struct dc_debug_options debug_defaults_diags = { .use_max_lb = true }; +static const struct dc_panel_config panel_config_defaults = { + .ilr = { + .optimize_edp_link_rate = true, + }, +}; + enum dcn20_clk_src_array_id { DCN20_CLK_SRC_PLL0, DCN20_CLK_SRC_PLL1, @@ -1367,6 +1372,11 @@ static struct panel_cntl *dcn21_panel_cntl_create(const struct panel_cntl_init_d return &panel_cntl->base; } +static void dcn21_get_panel_config_defaults(struct dc_panel_config *panel_config) +{ + *panel_config = panel_config_defaults; +} + #define CTX ctx #define REG(reg_name) \ @@ -1408,6 +1418,7 @@ static const struct resource_funcs dcn21_res_pool_funcs = { .set_mcif_arb_params = dcn20_set_mcif_arb_params, .find_first_free_match_stream_enc_for_link = dcn10_find_first_free_match_stream_enc_for_link, .update_bw_bounding_box = dcn21_update_bw_bounding_box, + .get_panel_config_defaults = dcn21_get_panel_config_defaults, }; static bool dcn21_resource_construct( diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c index 8745132d6374c..fddc21a5a04c4 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c @@ -888,7 +888,6 @@ static const struct dc_debug_options debug_defaults_drv = { } }, .disable_z10 = true, - .optimize_edp_link_rate = true, .enable_z9_disable_interface = true, /* Allow 
support for the PMFW interface for disable Z9*/ .dml_hostvm_override = DML_HOSTVM_OVERRIDE_FALSE, }; @@ -911,6 +910,12 @@ static const struct dc_debug_options debug_defaults_diags = { .use_max_lb = true }; +static const struct dc_panel_config panel_config_defaults = { + .ilr = { + .optimize_edp_link_rate = true, + }, +}; + static void dcn31_dpp_destroy(struct dpp **dpp) { kfree(TO_DCN20_DPP(*dpp)); @@ -1803,6 +1808,11 @@ validate_out: return out; } +static void dcn31_get_panel_config_defaults(struct dc_panel_config *panel_config) +{ + *panel_config = panel_config_defaults; +} + static struct dc_cap_funcs cap_funcs = { .get_dcc_compression_cap = dcn20_get_dcc_compression_cap }; @@ -1829,6 +1839,7 @@ static struct resource_funcs dcn31_res_pool_funcs = { .release_post_bldn_3dlut = dcn30_release_post_bldn_3dlut, .update_bw_bounding_box = dcn31_update_bw_bounding_box, .patch_unknown_plane_state = dcn20_patch_unknown_plane_state, + .get_panel_config_defaults = dcn31_get_panel_config_defaults, }; static struct clock_source *dcn30_clock_source_create( diff --git a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c index 24ec71cbd3e3e..70b647b9b4d37 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c @@ -914,7 +914,6 @@ static const struct dc_debug_options debug_defaults_drv = { .afmt = true, } }, - .optimize_edp_link_rate = true, .seamless_boot_odm_combine = true }; @@ -936,6 +935,12 @@ static const struct dc_debug_options debug_defaults_diags = { .use_max_lb = true }; +static const struct dc_panel_config panel_config_defaults = { + .ilr = { + .optimize_edp_link_rate = true, + }, +}; + static void dcn31_dpp_destroy(struct dpp **dpp) { kfree(TO_DCN20_DPP(*dpp)); @@ -1675,6 +1680,11 @@ static void dcn314_update_bw_bounding_box(struct dc *dc, struct clk_bw_params *b DC_FP_END(); } +static void dcn314_get_panel_config_defaults(struct 
dc_panel_config *panel_config) +{ + *panel_config = panel_config_defaults; +} + static struct resource_funcs dcn314_res_pool_funcs = { .destroy = dcn314_destroy_resource_pool, .link_enc_create = dcn31_link_encoder_create, @@ -1697,6 +1707,7 @@ static struct resource_funcs dcn314_res_pool_funcs = { .release_post_bldn_3dlut = dcn30_release_post_bldn_3dlut, .update_bw_bounding_box = dcn314_update_bw_bounding_box, .patch_unknown_plane_state = dcn20_patch_unknown_plane_state, + .get_panel_config_defaults = dcn314_get_panel_config_defaults, }; static struct clock_source *dcn30_clock_source_create( diff --git a/drivers/gpu/drm/amd/display/dc/dcn315/dcn315_resource.c b/drivers/gpu/drm/amd/display/dc/dcn315/dcn315_resource.c index eebb42c9ddd60..0f71bb86dc9a2 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn315/dcn315_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn315/dcn315_resource.c @@ -885,7 +885,6 @@ static const struct dc_debug_options debug_defaults_drv = { .afmt = true, } }, - .optimize_edp_link_rate = true, .psr_power_use_phy_fsm = 0, }; @@ -907,6 +906,12 @@ static const struct dc_debug_options debug_defaults_diags = { .use_max_lb = true }; +static const struct dc_panel_config panel_config_defaults = { + .ilr = { + .optimize_edp_link_rate = true, + }, +}; + static void dcn31_dpp_destroy(struct dpp **dpp) { kfree(TO_DCN20_DPP(*dpp)); @@ -1708,6 +1713,11 @@ static int dcn315_populate_dml_pipes_from_context( return pipe_cnt; } +static void dcn315_get_panel_config_defaults(struct dc_panel_config *panel_config) +{ + *panel_config = panel_config_defaults; +} + static struct dc_cap_funcs cap_funcs = { .get_dcc_compression_cap = dcn20_get_dcc_compression_cap }; @@ -1734,6 +1744,7 @@ static struct resource_funcs dcn315_res_pool_funcs = { .release_post_bldn_3dlut = dcn30_release_post_bldn_3dlut, .update_bw_bounding_box = dcn315_update_bw_bounding_box, .patch_unknown_plane_state = dcn20_patch_unknown_plane_state, + .get_panel_config_defaults = 
dcn315_get_panel_config_defaults, }; static bool dcn315_resource_construct( diff --git a/drivers/gpu/drm/amd/display/dc/dcn316/dcn316_resource.c b/drivers/gpu/drm/amd/display/dc/dcn316/dcn316_resource.c index f4b52a35ad84f..6b40a11ac83a9 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn316/dcn316_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn316/dcn316_resource.c @@ -885,7 +885,6 @@ static const struct dc_debug_options debug_defaults_drv = { .afmt = true, } }, - .optimize_edp_link_rate = true, }; static const struct dc_debug_options debug_defaults_diags = { @@ -906,6 +905,12 @@ static const struct dc_debug_options debug_defaults_diags = { .use_max_lb = true }; +static const struct dc_panel_config panel_config_defaults = { + .ilr = { + .optimize_edp_link_rate = true, + }, +}; + static void dcn31_dpp_destroy(struct dpp **dpp) { kfree(TO_DCN20_DPP(*dpp)); @@ -1710,6 +1715,11 @@ static int dcn316_populate_dml_pipes_from_context( return pipe_cnt; } +static void dcn316_get_panel_config_defaults(struct dc_panel_config *panel_config) +{ + *panel_config = panel_config_defaults; +} + static struct dc_cap_funcs cap_funcs = { .get_dcc_compression_cap = dcn20_get_dcc_compression_cap }; @@ -1736,6 +1746,7 @@ static struct resource_funcs dcn316_res_pool_funcs = { .release_post_bldn_3dlut = dcn30_release_post_bldn_3dlut, .update_bw_bounding_box = dcn316_update_bw_bounding_box, .patch_unknown_plane_state = dcn20_patch_unknown_plane_state, + .get_panel_config_defaults = dcn316_get_panel_config_defaults, }; static bool dcn316_resource_construct( diff --git a/drivers/gpu/drm/amd/display/dc/inc/core_types.h b/drivers/gpu/drm/amd/display/dc/inc/core_types.h index 8919a2092ac50..4ff1392633a75 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/core_types.h +++ b/drivers/gpu/drm/amd/display/dc/inc/core_types.h @@ -232,6 +232,7 @@ struct resource_funcs { unsigned int index); bool (*remove_phantom_pipes)(struct dc *dc, struct dc_state *context); + void (*get_panel_config_defaults)(struct 
dc_panel_config *panel_config); }; struct audio_support{ -- GitLab From 380202c84454e89d29a9abc670f09b9145617d58 Mon Sep 17 00:00:00 2001 From: Alvin Lee <Alvin.Lee2@amd.com> Date: Wed, 21 Sep 2022 12:04:25 -0400 Subject: [PATCH 1469/2223] drm/amd/display: For SubVP pipe split case use min transition into MPO [Description] - For SubVP pipe split case we need to use a minimal transition when opening MPO video since we are transitioning from 4 pipes to 3 pipes where an OPP for a previous MPCC will change - Also save and restore mall config when doing fast_validate in case there was a shallow copy of the dc->current_state Reviewed-by: Jun Lei <Jun.Lei@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Alvin Lee <Alvin.Lee2@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/core/dc.c | 36 ++++++++++ .../drm/amd/display/dc/dcn32/dcn32_resource.c | 18 +++++ .../drm/amd/display/dc/dcn32/dcn32_resource.h | 20 ++++++ .../display/dc/dcn32/dcn32_resource_helpers.c | 71 +++++++++++++++++++ 4 files changed, 145 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 4ba2c1f95dcbd..a81a61f006c4b 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -3564,6 +3564,7 @@ static bool could_mpcc_tree_change_for_active_pipes(struct dc *dc, struct dc_stream_status *cur_stream_status = stream_get_status(dc->current_state, stream); bool force_minimal_pipe_splitting = false; + uint32_t i; *is_plane_addition = false; @@ -3595,6 +3596,36 @@ static bool could_mpcc_tree_change_for_active_pipes(struct dc *dc, } } + /* For SubVP pipe split case when adding MPO video + * we need to add a minimal transition. In this case + * there will be 2 streams (1 main stream, 1 phantom + * stream).
+ */ + if (cur_stream_status && + dc->current_state->stream_count == 2 && + stream->mall_stream_config.type == SUBVP_MAIN) { + bool is_pipe_split = false; + + for (i = 0; i < dc->res_pool->pipe_count; i++) { + if (dc->current_state->res_ctx.pipe_ctx[i].stream == stream && + (dc->current_state->res_ctx.pipe_ctx[i].bottom_pipe || + dc->current_state->res_ctx.pipe_ctx[i].next_odm_pipe)) { + is_pipe_split = true; + break; + } + } + + /* determine if minimal transition is required due to SubVP*/ + if (surface_count > 0 && is_pipe_split) { + if (cur_stream_status->plane_count > surface_count) { + force_minimal_pipe_splitting = true; + } else if (cur_stream_status->plane_count < surface_count) { + force_minimal_pipe_splitting = true; + *is_plane_addition = true; + } + } + } + return force_minimal_pipe_splitting; } @@ -3604,6 +3635,7 @@ static bool commit_minimal_transition_state(struct dc *dc, struct dc_state *transition_context = dc_create_state(dc); enum pipe_split_policy tmp_mpc_policy; bool temp_dynamic_odm_policy; + bool temp_subvp_policy; enum dc_status ret = DC_ERROR_UNEXPECTED; unsigned int i, j; @@ -3618,6 +3650,9 @@ static bool commit_minimal_transition_state(struct dc *dc, temp_dynamic_odm_policy = dc->debug.enable_single_display_2to1_odm_policy; dc->debug.enable_single_display_2to1_odm_policy = false; + temp_subvp_policy = dc->debug.force_disable_subvp; + dc->debug.force_disable_subvp = true; + dc_resource_state_copy_construct(transition_base_context, transition_context); //commit minimal state @@ -3646,6 +3681,7 @@ static bool commit_minimal_transition_state(struct dc *dc, dc->debug.pipe_split_policy = tmp_mpc_policy; dc->debug.enable_single_display_2to1_odm_policy = temp_dynamic_odm_policy; + dc->debug.force_disable_subvp = temp_subvp_policy; if (ret != DC_OK) { /*this should never happen*/ diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c index 05de97ea855f1..752a4accb116d 100644 --- 
a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c @@ -1798,14 +1798,32 @@ bool dcn32_validate_bandwidth(struct dc *dc, int vlevel = 0; int pipe_cnt = 0; display_e2e_pipe_params_st *pipes = kzalloc(dc->res_pool->pipe_count * sizeof(display_e2e_pipe_params_st), GFP_KERNEL); + struct mall_temp_config mall_temp_config; DC_LOGGER_INIT(dc->ctx->logger); + /* For fast validation, there are situations where a shallow copy of + * of the dc->current_state is created for the validation. In this case + * we want to save and restore the mall config because we always + * teardown subvp at the beginning of validation (and don't attempt + * to add it back if it's fast validation). If we don't restore the + * subvp config in cases of fast validation + shallow copy of the + * dc->current_state, the dc->current_state will have a partially + * removed subvp state when we did not intend to remove it. + */ + if (fast_validate) { + memset(&mall_temp_config, 0, sizeof(mall_temp_config)); + dcn32_save_mall_state(dc, context, &mall_temp_config); + } + BW_VAL_TRACE_COUNT(); DC_FP_START(); out = dcn32_internal_validate_bw(dc, context, pipes, &pipe_cnt, &vlevel, fast_validate); DC_FP_END(); + if (fast_validate) + dcn32_restore_mall_state(dc, context, &mall_temp_config); + if (pipe_cnt == 0) goto validate_out; diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.h b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.h index a24f538bdc4cc..f76120e67c16a 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.h +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.h @@ -45,6 +45,17 @@ extern struct _vcs_dpi_ip_params_st dcn3_2_ip; extern struct _vcs_dpi_soc_bounding_box_st dcn3_2_soc; +/* Temp struct used to save and restore MALL config + * during validation. + * + * TODO: Move MALL config into dc_state instead of stream struct + * to avoid needing to save/restore. 
+ */ +struct mall_temp_config { + struct mall_stream_config mall_stream_config[MAX_PIPES]; + bool is_phantom_plane[MAX_PIPES]; +}; + struct dcn32_resource_pool { struct resource_pool base; }; @@ -122,6 +133,15 @@ void dcn32_determine_det_override(struct dc *dc, void dcn32_set_det_allocations(struct dc *dc, struct dc_state *context, display_e2e_pipe_params_st *pipes); + +void dcn32_save_mall_state(struct dc *dc, + struct dc_state *context, + struct mall_temp_config *temp_config); + +void dcn32_restore_mall_state(struct dc *dc, + struct dc_state *context, + struct mall_temp_config *temp_config); + /* definitions for run time init of reg offsets */ /* CLK SRC */ diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c index 7f318ced5dee4..d51d0c40ae5bc 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c @@ -380,3 +380,74 @@ void dcn32_set_det_allocations(struct dc *dc, struct dc_state *context, } else dcn32_determine_det_override(dc, context, pipes); } + +/** + * ******************************************************************************************* + * dcn32_save_mall_state: Save MALL (SubVP) state for fast validation cases + * + * This function saves the MALL (SubVP) case for fast validation cases. For fast validation, + * there are situations where a shallow copy of the dc->current_state is created for the + * validation. In this case we want to save and restore the mall config because we always + * teardown subvp at the beginning of validation (and don't attempt to add it back if it's + * fast validation). If we don't restore the subvp config in cases of fast validation + + * shallow copy of the dc->current_state, the dc->current_state will have a partially + * removed subvp state when we did not intend to remove it. 
+ * + * NOTE: This function ONLY works if the streams are not moved to a different pipe in the + * validation. We don't expect this to happen in fast_validation=1 cases. + * + * @param [in]: dc: Current DC state + * @param [in]: context: New DC state to be programmed + * @param [out]: temp_config: struct used to cache the existing MALL state + * + * @return: void + * + * ******************************************************************************************* + */ +void dcn32_save_mall_state(struct dc *dc, + struct dc_state *context, + struct mall_temp_config *temp_config) +{ + uint32_t i; + + for (i = 0; i < dc->res_pool->pipe_count; i++) { + struct pipe_ctx *pipe = &context->res_ctx.pipe_ctx[i]; + + if (pipe->stream) + temp_config->mall_stream_config[i] = pipe->stream->mall_stream_config; + + if (pipe->plane_state) + temp_config->is_phantom_plane[i] = pipe->plane_state->is_phantom; + } +} + +/** + * ******************************************************************************************* + * dcn32_restore_mall_state: Restore MALL (SubVP) state for fast validation cases + * + * Restore the MALL state based on the previously saved state from dcn32_save_mall_state + * + * @param [in]: dc: Current DC state + * @param [in/out]: context: New DC state to be programmed, restore MALL state into here + * @param [in]: temp_config: struct that has the cached MALL state + * + * @return: void + * + * ******************************************************************************************* + */ +void dcn32_restore_mall_state(struct dc *dc, + struct dc_state *context, + struct mall_temp_config *temp_config) +{ + uint32_t i; + + for (i = 0; i < dc->res_pool->pipe_count; i++) { + struct pipe_ctx *pipe = &context->res_ctx.pipe_ctx[i]; + + if (pipe->stream) + pipe->stream->mall_stream_config = temp_config->mall_stream_config[i]; + + if (pipe->plane_state) + pipe->plane_state->is_phantom = temp_config->is_phantom_plane[i]; + } +} -- GitLab From 
345d6493476615494bd79a8fe77661918ea7c61a Mon Sep 17 00:00:00 2001 From: Leo Chen <sancchen@amd.com> Date: Fri, 16 Sep 2022 14:13:11 -0400 Subject: [PATCH 1470/2223] drm/amd/display: Add log for LTTPR [Why & How] Adding log for LTTPR to facilitate debugging. Reviewed-by: Charlene Liu <Charlene.Liu@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Leo Chen <sancchen@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../gpu/drm/amd/display/dc/core/dc_link_dp.c | 29 +++++++++++++++---- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c index 70456580eecc7..eb32e99fbde16 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c @@ -5090,6 +5090,7 @@ bool dp_retrieve_lttpr_cap(struct dc_link *link) (dp_convert_to_count(link->dpcd_caps.lttpr_caps.phy_repeater_cnt) == 0)) { ASSERT(0); link->dpcd_caps.lttpr_caps.phy_repeater_cnt = 0x80; + DC_LOG_DC("lttpr_caps forced phy_repeater_cnt = %d\n", link->dpcd_caps.lttpr_caps.phy_repeater_cnt); } /* Attempt to train in LTTPR transparent mode if repeater count exceeds 8. 
*/ @@ -5098,6 +5099,7 @@ bool dp_retrieve_lttpr_cap(struct dc_link *link) if (is_lttpr_present) CONN_DATA_DETECT(link, lttpr_dpcd_data, sizeof(lttpr_dpcd_data), "LTTPR Caps: "); + DC_LOG_DC("is_lttpr_present = %d\n", is_lttpr_present); return is_lttpr_present; } @@ -5134,6 +5136,7 @@ void dp_get_lttpr_mode_override(struct dc_link *link, enum lttpr_mode *override) } else if (link->dc->debug.lttpr_mode_override == LTTPR_MODE_NON_LTTPR) { *override = LTTPR_MODE_NON_LTTPR; } + DC_LOG_DC("lttpr_mode_override chose LTTPR_MODE = %d\n", (uint8_t)(*override)); } enum lttpr_mode dp_decide_8b_10b_lttpr_mode(struct dc_link *link) @@ -5146,22 +5149,34 @@ enum lttpr_mode dp_decide_8b_10b_lttpr_mode(struct dc_link *link) return LTTPR_MODE_NON_LTTPR; if (vbios_lttpr_aware) { - if (vbios_lttpr_force_non_transparent) + if (vbios_lttpr_force_non_transparent) { + DC_LOG_DC("chose LTTPR_MODE_NON_TRANSPARENT due to VBIOS DCE_INFO_CAPS_LTTPR_SUPPORT_ENABLE set to 1.\n"); return LTTPR_MODE_NON_TRANSPARENT; - else + } else { + DC_LOG_DC("chose LTTPR_MODE_NON_TRANSPARENT by default due to VBIOS not set DCE_INFO_CAPS_LTTPR_SUPPORT_ENABLE set to 1.\n"); return LTTPR_MODE_TRANSPARENT; + } } if (link->dc->config.allow_lttpr_non_transparent_mode.bits.DP1_4A && - link->dc->caps.extended_aux_timeout_support) + link->dc->caps.extended_aux_timeout_support) { + DC_LOG_DC("chose LTTPR_MODE_NON_TRANSPARENT by default and dc->config.allow_lttpr_non_transparent_mode.bits.DP1_4A set to 1.\n"); return LTTPR_MODE_NON_TRANSPARENT; + } + DC_LOG_DC("chose LTTPR_MODE_NON_LTTPR.\n"); return LTTPR_MODE_NON_LTTPR; } enum lttpr_mode dp_decide_128b_132b_lttpr_mode(struct dc_link *link) { - return dp_is_lttpr_present(link) ? 
LTTPR_MODE_NON_TRANSPARENT : LTTPR_MODE_NON_LTTPR; + enum lttpr_mode mode = LTTPR_MODE_NON_LTTPR; + + if (dp_is_lttpr_present(link)) + mode = LTTPR_MODE_NON_TRANSPARENT; + + DC_LOG_DC("128b_132b chose LTTPR_MODE %d.\n", mode); + return mode; } static bool get_usbc_cable_id(struct dc_link *link, union dp_cable_id *cable_id) @@ -5179,9 +5194,10 @@ static bool get_usbc_cable_id(struct dc_link *link, union dp_cable_id *cable_id) cmd.cable_id.data.input.phy_inst = resource_transmitter_to_phy_idx( link->dc, link->link_enc->transmitter); if (dc_dmub_srv_cmd_with_reply_data(link->ctx->dmub_srv, &cmd) && - cmd.cable_id.header.ret_status == 1) + cmd.cable_id.header.ret_status == 1) { cable_id->raw = cmd.cable_id.data.output_raw; - + DC_LOG_DC("usbc_cable_id = %d.\n", cable_id->raw); + } return cmd.cable_id.header.ret_status == 1; } @@ -5228,6 +5244,7 @@ static enum dc_status wa_try_to_wake_dprx(struct dc_link *link, uint64_t timeout lttpr_present = dp_is_lttpr_present(link) || (!vbios_lttpr_interop || !link->dc->caps.extended_aux_timeout_support); + DC_LOG_DC("lttpr_present = %d.\n", lttpr_present ? 1 : 0); /* Issue an AUX read to test DPRX responsiveness. If LTTPR is supported the first read is expected to * be to determine LTTPR capabilities. Otherwise trying to read power state should be an innocuous AUX read. -- GitLab From e4e481e4d838f30985dd46d43ed195110ed265f5 Mon Sep 17 00:00:00 2001 From: Zhikai Zhai <zhikai.zhai@amd.com> Date: Tue, 20 Sep 2022 18:51:02 +0800 Subject: [PATCH 1471/2223] drm/amd/display: skip commit minimal transition state [WHY] Now dynamic ODM will now be disabled when MPO is required safe transitions to avoid underflow, but we are triggering the way of minimal transition too often. Commit state of dc with no check will do pipeline setup which may re-initialize the component with no need such as audio. [HOW] Just do the minimal transition when all of pipes are in use, otherwise return true to skip. 
Reviewed-by: Dillon Varone <Dillon.Varone@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Zhikai Zhai <zhikai.zhai@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/core/dc.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index a81a61f006c4b..6216ceb790b4b 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -3638,10 +3638,30 @@ static bool commit_minimal_transition_state(struct dc *dc, bool temp_subvp_policy; enum dc_status ret = DC_ERROR_UNEXPECTED; unsigned int i, j; + unsigned int pipe_in_use = 0; if (!transition_context) return false; + /* check current pipes in use*/ + for (i = 0; i < dc->res_pool->pipe_count; i++) { + struct pipe_ctx *pipe = &transition_base_context->res_ctx.pipe_ctx[i]; + + if (pipe->plane_state) + pipe_in_use++; + } + + /* When the OS add a new surface if we have been used all of pipes with odm combine + * and mpc split feature, it need use commit_minimal_transition_state to transition safely. + * After OS exit MPO, it will back to use odm and mpc split with all of pipes, we need + * call it again. Otherwise return true to skip. + * + * Reduce the scenarios to use dc_commit_state_no_check in the stage of flip. Especially + * enter/exit MPO when DCN still have enough resources. + */ + if (pipe_in_use != dc->res_pool->pipe_count) + return true; + if (!dc->config.is_vmin_only_asic) { tmp_mpc_policy = dc->debug.pipe_split_policy; dc->debug.pipe_split_policy = MPC_SPLIT_AVOID; -- GitLab From 4931ce22eca6ed5f8a3a3820fd13e586011ac219 Mon Sep 17 00:00:00 2001 From: Dmytro Laktyushkin <Dmytro.Laktyushkin@amd.com> Date: Fri, 16 Sep 2022 15:55:55 -0400 Subject: [PATCH 1472/2223] drm/amd/display: add dummy pstate workaround to dcn315 DCN315 has to always allow pstate change or SMU will hang. 
This workaround achieves this by applying a low pstate change latency to be used when pstate is calculated to be unsupported. This lower latency only accounts for memory retraining; a previous change handles locking in the highest available pstate allowing us to minimize required latency hiding to only account for memory retraining. Reviewed-by: Charlene Liu <Charlene.Liu@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Dmytro Laktyushkin <Dmytro.Laktyushkin@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../drm/amd/display/dc/dcn30/dcn30_resource.c | 4 + .../amd/display/dc/dcn315/dcn315_resource.c | 2 +- .../drm/amd/display/dc/dml/dcn31/dcn31_fpu.c | 89 +++++-------------- .../drm/amd/display/dc/dml/dcn31/dcn31_fpu.h | 1 + 4 files changed, 27 insertions(+), 69 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.c index 3a3b2ac791c78..020f512e9690e 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.c @@ -1655,6 +1655,9 @@ noinline bool dcn30_internal_validate_bw( if (!pipes) return false; + context->bw_ctx.dml.vba.maxMpcComb = 0; + context->bw_ctx.dml.vba.VoltageLevel = 0; + context->bw_ctx.dml.vba.DRAMClockChangeSupport[0][0] = dm_dram_clock_change_vactive; dc->res_pool->funcs->update_soc_for_wm_a(dc, context); pipe_cnt = dc->res_pool->funcs->populate_dml_pipes(dc, context, pipes, fast_validate); @@ -1873,6 +1876,7 @@ noinline bool dcn30_internal_validate_bw( if (repopulate_pipes) pipe_cnt = dc->res_pool->funcs->populate_dml_pipes(dc, context, pipes, fast_validate); + context->bw_ctx.dml.vba.VoltageLevel = vlevel; *vlevel_out = vlevel; *pipe_cnt_out = pipe_cnt; diff --git a/drivers/gpu/drm/amd/display/dc/dcn315/dcn315_resource.c b/drivers/gpu/drm/amd/display/dc/dcn315/dcn315_resource.c index 0f71bb86dc9a2..58746c437554f 100644 --- 
a/drivers/gpu/drm/amd/display/dc/dcn315/dcn315_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn315/dcn315_resource.c @@ -1731,7 +1731,7 @@ static struct resource_funcs dcn315_res_pool_funcs = { .panel_cntl_create = dcn31_panel_cntl_create, .validate_bandwidth = dcn31_validate_bandwidth, .calculate_wm_and_dlg = dcn31_calculate_wm_and_dlg, - .update_soc_for_wm_a = dcn31_update_soc_for_wm_a, + .update_soc_for_wm_a = dcn315_update_soc_for_wm_a, .populate_dml_pipes = dcn315_populate_dml_pipes_from_context, .acquire_idle_pipe_for_layer = dcn20_acquire_idle_pipe_for_layer, .add_stream_to_ctx = dcn30_add_stream_to_ctx, diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c index 94b0842cd89b5..87bfc42bdaaf1 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c @@ -292,6 +292,7 @@ static struct _vcs_dpi_soc_bounding_box_st dcn3_15_soc = { .urgent_latency_adjustment_fabric_clock_component_us = 0, .urgent_latency_adjustment_fabric_clock_reference_mhz = 0, .num_chans = 4, + .dummy_pstate_latency_us = 10.0 }; struct _vcs_dpi_ip_params_st dcn3_16_ip = { @@ -459,6 +460,23 @@ void dcn31_update_soc_for_wm_a(struct dc *dc, struct dc_state *context) } } +void dcn315_update_soc_for_wm_a(struct dc *dc, struct dc_state *context) +{ + dc_assert_fp_enabled(); + + if (dc->clk_mgr->bw_params->wm_table.entries[WM_A].valid) { + /* For 315 pstate change is only supported if possible in vactive */ + if (context->bw_ctx.dml.vba.DRAMClockChangeSupport[context->bw_ctx.dml.vba.VoltageLevel][context->bw_ctx.dml.vba.maxMpcComb] != dm_dram_clock_change_vactive) + context->bw_ctx.dml.soc.dram_clock_change_latency_us = context->bw_ctx.dml.soc.dummy_pstate_latency_us; + else + context->bw_ctx.dml.soc.dram_clock_change_latency_us = dc->clk_mgr->bw_params->wm_table.entries[WM_A].pstate_latency_us; + context->bw_ctx.dml.soc.sr_enter_plus_exit_time_us = + 
dc->clk_mgr->bw_params->wm_table.entries[WM_A].sr_enter_plus_exit_time_us; + context->bw_ctx.dml.soc.sr_exit_time_us = + dc->clk_mgr->bw_params->wm_table.entries[WM_A].sr_exit_time_us; + } +} + void dcn31_calculate_wm_and_dlg_fp( struct dc *dc, struct dc_state *context, display_e2e_pipe_params_st *pipes, @@ -486,72 +504,6 @@ void dcn31_calculate_wm_and_dlg_fp( pipes[0].clks_cfg.dcfclk_mhz = dcfclk; pipes[0].clks_cfg.socclk_mhz = context->bw_ctx.dml.soc.clock_limits[vlevel].socclk_mhz; -#if 0 // TODO - /* Set B: - * TODO - */ - if (dc->clk_mgr->bw_params->wm_table.nv_entries[WM_B].valid) { - if (vlevel == 0) { - pipes[0].clks_cfg.voltage = 1; - pipes[0].clks_cfg.dcfclk_mhz = context->bw_ctx.dml.soc.clock_limits[0].dcfclk_mhz; - } - context->bw_ctx.dml.soc.dram_clock_change_latency_us = dc->clk_mgr->bw_params->wm_table.nv_entries[WM_B].dml_input.pstate_latency_us; - context->bw_ctx.dml.soc.sr_enter_plus_exit_time_us = dc->clk_mgr->bw_params->wm_table.nv_entries[WM_B].dml_input.sr_enter_plus_exit_time_us; - context->bw_ctx.dml.soc.sr_exit_time_us = dc->clk_mgr->bw_params->wm_table.nv_entries[WM_B].dml_input.sr_exit_time_us; - } - context->bw_ctx.bw.dcn.watermarks.b.urgent_ns = get_wm_urgent(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.b.cstate_pstate.cstate_enter_plus_exit_ns = get_wm_stutter_enter_exit(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.b.cstate_pstate.cstate_exit_ns = get_wm_stutter_exit(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.b.cstate_pstate.cstate_enter_plus_exit_z8_ns = get_wm_z8_stutter_enter_exit(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.b.cstate_pstate.cstate_exit_z8_ns = get_wm_z8_stutter_exit(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.b.cstate_pstate.pstate_change_ns = get_wm_dram_clock_change(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - 
context->bw_ctx.bw.dcn.watermarks.b.pte_meta_urgent_ns = get_wm_memory_trip(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.b.frac_urg_bw_nom = get_fraction_of_urgent_bandwidth(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.b.frac_urg_bw_flip = get_fraction_of_urgent_bandwidth_imm_flip(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.b.urgent_latency_ns = get_urgent_latency(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - - pipes[0].clks_cfg.voltage = vlevel; - pipes[0].clks_cfg.dcfclk_mhz = dcfclk; - - /* Set C: - * TODO - */ - if (dc->clk_mgr->bw_params->wm_table.nv_entries[WM_C].valid) { - context->bw_ctx.dml.soc.dram_clock_change_latency_us = dc->clk_mgr->bw_params->wm_table.nv_entries[WM_C].dml_input.pstate_latency_us; - context->bw_ctx.dml.soc.sr_enter_plus_exit_time_us = dc->clk_mgr->bw_params->wm_table.nv_entries[WM_C].dml_input.sr_enter_plus_exit_time_us; - context->bw_ctx.dml.soc.sr_exit_time_us = dc->clk_mgr->bw_params->wm_table.nv_entries[WM_C].dml_input.sr_exit_time_us; - } - context->bw_ctx.bw.dcn.watermarks.c.urgent_ns = get_wm_urgent(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.c.cstate_pstate.cstate_enter_plus_exit_ns = get_wm_stutter_enter_exit(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.c.cstate_pstate.cstate_exit_ns = get_wm_stutter_exit(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.c.cstate_pstate.cstate_enter_plus_exit_z8_ns = get_wm_z8_stutter_enter_exit(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.c.cstate_pstate.cstate_exit_z8_ns = get_wm_z8_stutter_exit(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.c.cstate_pstate.pstate_change_ns = get_wm_dram_clock_change(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - 
context->bw_ctx.bw.dcn.watermarks.c.pte_meta_urgent_ns = get_wm_memory_trip(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.c.frac_urg_bw_nom = get_fraction_of_urgent_bandwidth(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.c.frac_urg_bw_flip = get_fraction_of_urgent_bandwidth_imm_flip(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.c.urgent_latency_ns = get_urgent_latency(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - - /* Set D: - * TODO - */ - if (dc->clk_mgr->bw_params->wm_table.nv_entries[WM_D].valid) { - context->bw_ctx.dml.soc.dram_clock_change_latency_us = dc->clk_mgr->bw_params->wm_table.nv_entries[WM_D].dml_input.pstate_latency_us; - context->bw_ctx.dml.soc.sr_enter_plus_exit_time_us = dc->clk_mgr->bw_params->wm_table.nv_entries[WM_D].dml_input.sr_enter_plus_exit_time_us; - context->bw_ctx.dml.soc.sr_exit_time_us = dc->clk_mgr->bw_params->wm_table.nv_entries[WM_D].dml_input.sr_exit_time_us; - } - context->bw_ctx.bw.dcn.watermarks.d.urgent_ns = get_wm_urgent(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.d.cstate_pstate.cstate_enter_plus_exit_ns = get_wm_stutter_enter_exit(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.d.cstate_pstate.cstate_exit_ns = get_wm_stutter_exit(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.d.cstate_pstate.pstate_change_ns = get_wm_dram_clock_change(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.d.cstate_pstate.cstate_enter_plus_exit_z8_ns = get_wm_z8_stutter_enter_exit(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.d.cstate_pstate.cstate_exit_z8_ns = get_wm_z8_stutter_exit(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.d.pte_meta_urgent_ns = get_wm_memory_trip(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - 
context->bw_ctx.bw.dcn.watermarks.d.frac_urg_bw_nom = get_fraction_of_urgent_bandwidth(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.d.frac_urg_bw_flip = get_fraction_of_urgent_bandwidth_imm_flip(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - context->bw_ctx.bw.dcn.watermarks.d.urgent_latency_ns = get_urgent_latency(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; -#endif - /* Set A: * All clocks min required * @@ -568,11 +520,9 @@ void dcn31_calculate_wm_and_dlg_fp( context->bw_ctx.bw.dcn.watermarks.a.frac_urg_bw_nom = get_fraction_of_urgent_bandwidth(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; context->bw_ctx.bw.dcn.watermarks.a.frac_urg_bw_flip = get_fraction_of_urgent_bandwidth_imm_flip(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; context->bw_ctx.bw.dcn.watermarks.a.urgent_latency_ns = get_urgent_latency(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; - /* TODO: remove: */ context->bw_ctx.bw.dcn.watermarks.b = context->bw_ctx.bw.dcn.watermarks.a; context->bw_ctx.bw.dcn.watermarks.c = context->bw_ctx.bw.dcn.watermarks.a; context->bw_ctx.bw.dcn.watermarks.d = context->bw_ctx.bw.dcn.watermarks.a; - /* end remove*/ for (i = 0, pipe_idx = 0; i < dc->res_pool->pipe_count; i++) { if (!context->res_ctx.pipe_ctx[i].stream) @@ -594,6 +544,9 @@ void dcn31_calculate_wm_and_dlg_fp( } dcn20_calculate_dlg_params(dc, context, pipes, pipe_cnt, vlevel); + /* For 31x apu pstate change is only supported if possible in vactive */ + context->bw_ctx.bw.dcn.clk.p_state_change_support = + context->bw_ctx.dml.vba.DRAMClockChangeSupport[vlevel][context->bw_ctx.dml.vba.maxMpcComb] == dm_dram_clock_change_vactive; } void dcn31_update_bw_bounding_box(struct dc *dc, struct clk_bw_params *bw_params) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.h b/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.h index 4372f17b55d4e..fd58b2561ec9e 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.h +++ 
b/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.h @@ -35,6 +35,7 @@ void dcn31_zero_pipe_dcc_fraction(display_e2e_pipe_params_st *pipes, int pipe_cnt); void dcn31_update_soc_for_wm_a(struct dc *dc, struct dc_state *context); +void dcn315_update_soc_for_wm_a(struct dc *dc, struct dc_state *context); void dcn31_calculate_wm_and_dlg_fp( struct dc *dc, struct dc_state *context, -- GitLab From 8cab4ef0ad9521030e1ae4bd294a1e2e6a04659f Mon Sep 17 00:00:00 2001 From: Lewis Huang <Lewis.Huang@amd.com> Date: Fri, 23 Sep 2022 10:42:40 +0800 Subject: [PATCH 1473/2223] drm/amd/display: Keep OTG on when Z10 is disable [Why] Disable OTG when PSRSU with z10 even if z10 is disable [How] Reverse condition to keep OTG on when Z10 is disable Reviewed-by: Robin Chen <po-tchen@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Lewis Huang <Lewis.Huang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/core/dc_link.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c b/drivers/gpu/drm/amd/display/dc/core/dc_link.c index c4daef1e708c9..d7b1ace6328a0 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c @@ -3378,8 +3378,8 @@ bool dc_link_setup_psr(struct dc_link *link, case FAMILY_YELLOW_CARP: case AMDGPU_FAMILY_GC_10_3_6: case AMDGPU_FAMILY_GC_11_0_1: - if(!dc->debug.disable_z10) - psr_context->psr_level.bits.SKIP_CRTC_DISABLE = false; + if (dc->debug.disable_z10) + psr_context->psr_level.bits.SKIP_CRTC_DISABLE = true; break; default: psr_context->psr_level.bits.SKIP_CRTC_DISABLE = true; -- GitLab From b808a7eb30b02e05023b505fe6db590ba799683f Mon Sep 17 00:00:00 2001 From: Dillon Varone <Dillon.Varone@amd.com> Date: Tue, 20 Sep 2022 20:50:49 -0400 Subject: [PATCH 1474/2223] drm/amd/display: Increase compbuf size prior to updating clocks [WHY?] 
Clocks are updating based on the incoming context's support, however the new compbuf size is not programmed prior to updating clocks, which can result in P-State hangs. [HOW?] Increase compbuf size prior to updating clocks. Reviewed-by: Alvin Lee <Alvin.Lee2@amd.com> Reviewed-by: Martin Leung <Martin.Leung@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Dillon Varone <Dillon.Varone@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c index e1d271fe9e641..7de511fd004b5 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c @@ -2018,6 +2018,10 @@ void dcn20_optimize_bandwidth( context->bw_ctx.bw.dcn.clk.dramclk_khz <= dc->clk_mgr->bw_params->dc_mode_softmax_memclk * 1000) dc->clk_mgr->funcs->set_max_memclk(dc->clk_mgr, dc->clk_mgr->bw_params->dc_mode_softmax_memclk); + /* increase compbuf size */ + if (hubbub->funcs->program_compbuf_size) + hubbub->funcs->program_compbuf_size(hubbub, context->bw_ctx.bw.dcn.compbuf_size_kb, true); + dc->clk_mgr->funcs->update_clocks( dc->clk_mgr, context, @@ -2033,9 +2037,6 @@ void dcn20_optimize_bandwidth( pipe_ctx->dlg_regs.optimized_min_dst_y_next_start); } } - /* increase compbuf size */ - if (hubbub->funcs->program_compbuf_size) - hubbub->funcs->program_compbuf_size(hubbub, context->bw_ctx.bw.dcn.compbuf_size_kb, true); } bool dcn20_update_bandwidth( -- GitLab From baec651f4160f4c3f029edf84bbc18b4fcba9cf5 Mon Sep 17 00:00:00 2001 From: Wenjing Liu <wenjing.liu@amd.com> Date: Thu, 22 Sep 2022 14:36:32 -0400 Subject: [PATCH 1475/2223] drm/amd/display: write all 4 bytes of FFE_PRESET dpcd value [why] According to specs, it expects us to write all 4 bytes even if current lane count is less than 4.
Reviewed-by: George Shen <George.Shen@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Wenjing Liu <wenjing.liu@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../gpu/drm/amd/display/dc/core/dc_link_dp.c | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c index eb32e99fbde16..1254d38f1778a 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c @@ -944,6 +944,23 @@ enum dc_status dp_get_lane_status_and_lane_adjust( return status; } +static enum dc_status dpcd_128b_132b_set_lane_settings( + struct dc_link *link, + const struct link_training_settings *link_training_setting) +{ + enum dc_status status = core_link_write_dpcd(link, + DP_TRAINING_LANE0_SET, + (uint8_t *)(link_training_setting->dpcd_lane_settings), + sizeof(link_training_setting->dpcd_lane_settings)); + + DC_LOG_HW_LINK_TRAINING("%s:\n 0x%X TX_FFE_PRESET_VALUE = %x\n", + __func__, + DP_TRAINING_LANE0_SET, + link_training_setting->dpcd_lane_settings[0].tx_ffe.PRESET_VALUE); + return status; +} + + enum dc_status dpcd_set_lane_settings( struct dc_link *link, const struct link_training_settings *link_training_setting, @@ -964,16 +981,6 @@ enum dc_status dpcd_set_lane_settings( link_training_setting->link_settings.lane_count); if (is_repeater(link_training_setting, offset)) { - if (dp_get_link_encoding_format(&link_training_setting->link_settings) == - DP_128b_132b_ENCODING) - DC_LOG_HW_LINK_TRAINING("%s:\n LTTPR Repeater ID: %d\n" - " 0x%X TX_FFE_PRESET_VALUE = %x\n", - __func__, - offset, - lane0_set_address, - link_training_setting->dpcd_lane_settings[0].tx_ffe.PRESET_VALUE); - else if (dp_get_link_encoding_format(&link_training_setting->link_settings) == - DP_8b_10b_ENCODING) DC_LOG_HW_LINK_TRAINING("%s\n LTTPR Repeater ID: %d\n" " 0x%X VS set = %x PE set = %x max VS 
Reached = %x max PE Reached = %x\n", __func__, @@ -985,14 +992,6 @@ enum dc_status dpcd_set_lane_settings( link_training_setting->dpcd_lane_settings[0].bits.MAX_PRE_EMPHASIS_REACHED); } else { - if (dp_get_link_encoding_format(&link_training_setting->link_settings) == - DP_128b_132b_ENCODING) - DC_LOG_HW_LINK_TRAINING("%s:\n 0x%X TX_FFE_PRESET_VALUE = %x\n", - __func__, - lane0_set_address, - link_training_setting->dpcd_lane_settings[0].tx_ffe.PRESET_VALUE); - else if (dp_get_link_encoding_format(&link_training_setting->link_settings) == - DP_8b_10b_ENCODING) DC_LOG_HW_LINK_TRAINING("%s\n 0x%X VS set = %x PE set = %x max VS Reached = %x max PE Reached = %x\n", __func__, lane0_set_address, @@ -2023,7 +2022,7 @@ static enum link_training_result dp_perform_128b_132b_channel_eq_done_sequence( result = DP_128b_132b_LT_FAILED; } else { dp_set_hw_lane_settings(link, link_res, lt_settings, DPRX); - dpcd_set_lane_settings(link, lt_settings, DPRX); + dpcd_128b_132b_set_lane_settings(link, lt_settings); } loop_count++; } -- GitLab From 51619c671316e96d7adaf2b6ea94ce245b81b6dd Mon Sep 17 00:00:00 2001 From: Aric Cyr <aric.cyr@amd.com> Date: Fri, 23 Sep 2022 17:09:54 -0400 Subject: [PATCH 1476/2223] drm/amd/display: Fix vupdate and vline position calculation [why] Large deltas for periodic interrupts could result in the interrupt not being programmed properly and thus not firing. [how] Add proper wrap-around support for calculating VUPDATE and VLINE positions.
Reviewed-by: Jun Lei <Jun.Lei@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Aric Cyr <aric.cyr@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../amd/display/dc/dcn10/dcn10_hw_sequencer.c | 60 ++++++++----------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c index 4390f6d7050fc..f4b3ec32a331b 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c @@ -3818,28 +3818,14 @@ void dcn10_calc_vupdate_position( uint32_t *start_line, uint32_t *end_line) { - const struct dc_crtc_timing *dc_crtc_timing = &pipe_ctx->stream->timing; - int vline_int_offset_from_vupdate = - pipe_ctx->stream->periodic_interrupt.lines_offset; - int vupdate_offset_from_vsync = dc->hwss.get_vupdate_offset_from_vsync(pipe_ctx); - int start_position; - - if (vline_int_offset_from_vupdate > 0) - vline_int_offset_from_vupdate--; - else if (vline_int_offset_from_vupdate < 0) - vline_int_offset_from_vupdate++; + const struct dc_crtc_timing *timing = &pipe_ctx->stream->timing; + int vupdate_pos = dc->hwss.get_vupdate_offset_from_vsync(pipe_ctx); - start_position = vline_int_offset_from_vupdate + vupdate_offset_from_vsync; - - if (start_position >= 0) - *start_line = start_position; + if (vupdate_pos >= 0) + *start_line = vupdate_pos - ((vupdate_pos / timing->v_total) * timing->v_total); else - *start_line = dc_crtc_timing->v_total + start_position - 1; - - *end_line = *start_line + 2; - - if (*end_line >= dc_crtc_timing->v_total) - *end_line = 2; + *start_line = vupdate_pos + ((-vupdate_pos / timing->v_total) + 1) * timing->v_total - 1; + *end_line = (*start_line + 2) % timing->v_total; } static void dcn10_cal_vline_position( @@ -3848,23 +3834,27 @@ static void dcn10_cal_vline_position( uint32_t *start_line, uint32_t *end_line) { - switch 
(pipe_ctx->stream->periodic_interrupt.ref_point) { - case START_V_UPDATE: - dcn10_calc_vupdate_position( - dc, - pipe_ctx, - start_line, - end_line); - break; - case START_V_SYNC: + const struct dc_crtc_timing *timing = &pipe_ctx->stream->timing; + int vline_pos = pipe_ctx->stream->periodic_interrupt.lines_offset; + + if (pipe_ctx->stream->periodic_interrupt.ref_point == START_V_UPDATE) { + if (vline_pos > 0) + vline_pos--; + else if (vline_pos < 0) + vline_pos++; + + vline_pos += dc->hwss.get_vupdate_offset_from_vsync(pipe_ctx); + if (vline_pos >= 0) + *start_line = vline_pos - ((vline_pos / timing->v_total) * timing->v_total); + else + *start_line = vline_pos + ((-vline_pos / timing->v_total) + 1) * timing->v_total - 1; + *end_line = (*start_line + 2) % timing->v_total; + } else if (pipe_ctx->stream->periodic_interrupt.ref_point == START_V_SYNC) { // vsync is line 0 so start_line is just the requested line offset - *start_line = pipe_ctx->stream->periodic_interrupt.lines_offset; - *end_line = *start_line + 2; - break; - default: + *start_line = vline_pos; + *end_line = (*start_line + 2) % timing->v_total; + } else ASSERT(0); - break; - } } void dcn10_setup_periodic_interrupt( -- GitLab From 2d550a159c55ac836a554fd605545b0feb5f7266 Mon Sep 17 00:00:00 2001 From: Martin Leung <Martin.Leung@amd.com> Date: Fri, 23 Sep 2022 17:55:14 -0400 Subject: [PATCH 1477/2223] drm/amd/display: block odd h_total timings from halving pixel rate why: when dynamic odm was turned on, there is also logic to halve the pixelclk this still turned on when we avoided odm in the case of odd h_total timings how: block the pixel clk mechanism also in the case of odd h_total timings Reviewed-by: Jun Lei <Jun.Lei@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Martin Leung <Martin.Leung@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../dc/dcn32/dcn32_dio_stream_encoder.c | 35 ++++++++++++++++++- .../drm/amd/display/dc/dcn32/dcn32_hwseq.c | 9 ++--- 2 
files changed, 39 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.c index 0e9dce4146418..3195be9d38f58 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.c @@ -243,6 +243,39 @@ static bool is_two_pixels_per_containter(const struct dc_crtc_timing *timing) return two_pix; } +static bool is_h_timing_divisible_by_2(const struct dc_crtc_timing *timing) +{ + /* math borrowed from function of same name in inc/resource + * checks if h_timing is divisible by 2 + */ + + bool divisible = false; + uint16_t h_blank_start = 0; + uint16_t h_blank_end = 0; + + if (timing) { + h_blank_start = timing->h_total - timing->h_front_porch; + h_blank_end = h_blank_start - timing->h_addressable; + + /* HTOTAL, Hblank start/end, and Hsync start/end all must be + * divisible by 2 in order for the horizontal timing params + * to be considered divisible by 2. Hsync start is always 0. 
+ */ + divisible = (timing->h_total % 2 == 0) && + (h_blank_start % 2 == 0) && + (h_blank_end % 2 == 0) && + (timing->h_sync_width % 2 == 0); + } + return divisible; +} + +static bool is_dp_dig_pixel_rate_div_policy(struct dc *dc, const struct dc_crtc_timing *timing) +{ + /* should be functionally the same as dcn32_is_dp_dig_pixel_rate_div_policy for DP encoders*/ + return is_h_timing_divisible_by_2(timing) && + dc->debug.enable_dp_dig_pixel_rate_div_policy; +} + static void enc32_stream_encoder_dp_unblank( struct dc_link *link, struct stream_encoder *enc, @@ -259,7 +292,7 @@ static void enc32_stream_encoder_dp_unblank( /* YCbCr 4:2:0 : Computed VID_M will be 2X the input rate */ if (is_two_pixels_per_containter(&param->timing) || param->opp_cnt > 1 - || dc->debug.enable_dp_dig_pixel_rate_div_policy) { + || is_dp_dig_pixel_rate_div_policy(dc, &param->timing)) { /*this logic should be the same in get_pixel_clock_parameters() */ n_multiply = 1; } diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c index a750343ca5211..8012a48859b59 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c @@ -1161,7 +1161,6 @@ unsigned int dcn32_calculate_dccg_k1_k2_values(struct pipe_ctx *pipe_ctx, unsign { struct dc_stream_state *stream = pipe_ctx->stream; unsigned int odm_combine_factor = 0; - struct dc *dc = pipe_ctx->stream->ctx->dc; bool two_pix_per_container = false; // For phantom pipes, use the same programming as the main pipes @@ -1189,7 +1188,7 @@ unsigned int dcn32_calculate_dccg_k1_k2_values(struct pipe_ctx *pipe_ctx, unsign } else { *k1_div = PIXEL_RATE_DIV_BY_1; *k2_div = PIXEL_RATE_DIV_BY_4; - if ((odm_combine_factor == 2) || dc->debug.enable_dp_dig_pixel_rate_div_policy) + if ((odm_combine_factor == 2) || dcn32_is_dp_dig_pixel_rate_div_policy(pipe_ctx)) *k2_div = PIXEL_RATE_DIV_BY_2; } } @@ -1226,7 +1225,6 @@ void dcn32_unblank_stream(struct
pipe_ctx *pipe_ctx, struct dc_link *link = stream->link; struct dce_hwseq *hws = link->dc->hwseq; struct pipe_ctx *odm_pipe; - struct dc *dc = pipe_ctx->stream->ctx->dc; uint32_t pix_per_cycle = 1; params.opp_cnt = 1; @@ -1245,7 +1243,7 @@ void dcn32_unblank_stream(struct pipe_ctx *pipe_ctx, pipe_ctx->stream_res.tg->inst); } else if (dc_is_dp_signal(pipe_ctx->stream->signal)) { if (optc2_is_two_pixels_per_containter(&stream->timing) || params.opp_cnt > 1 - || dc->debug.enable_dp_dig_pixel_rate_div_policy) { + || dcn32_is_dp_dig_pixel_rate_div_policy(pipe_ctx)) { params.timing.pix_clk_100hz /= 2; pix_per_cycle = 2; } @@ -1262,6 +1260,9 @@ bool dcn32_is_dp_dig_pixel_rate_div_policy(struct pipe_ctx *pipe_ctx) { struct dc *dc = pipe_ctx->stream->ctx->dc; + if (!is_h_timing_divisible_by_2(pipe_ctx->stream)) + return false; + if (dc_is_dp_signal(pipe_ctx->stream->signal) && !is_dp_128b_132b_signal(pipe_ctx) && dc->debug.enable_dp_dig_pixel_rate_div_policy) return true; -- GitLab From a2909ff460a8e02168b3658372ebc897f7ab2315 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Date: Tue, 20 Sep 2022 13:06:50 -0400 Subject: [PATCH 1478/2223] drm/amd/display: Drop unused code for DCN32/321 Under DCN32/321 we identified some code paths that DC never executes. This commit removes those unused codes to avoid distractions when debugging issues. 
Reviewed-by: Aurabindo Pillai <aurabindo.pillai@amd.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../display/dc/dcn32/dcn32_dio_link_encoder.c | 7 ------- .../display/dc/dcn32/dcn32_dio_link_encoder.h | 4 ---- .../dc/dcn32/dcn32_dio_stream_encoder.c | 20 ------------------- .../gpu/drm/amd/display/dc/dcn32/dcn32_hubp.c | 3 +-- .../dc/dcn321/dcn321_dio_link_encoder.c | 1 - .../amd/display/dc/dcn321/dcn321_resource.c | 2 -- 6 files changed, 1 insertion(+), 36 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.c index fdae6aa899082..076969d928afa 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.c @@ -150,12 +150,6 @@ static void dcn32_link_encoder_get_max_link_cap(struct link_encoder *enc, } -void enc32_set_dig_output_mode(struct link_encoder *enc, uint8_t pix_per_container) -{ - struct dcn10_link_encoder *enc10 = TO_DCN10_LINK_ENC(enc); - REG_UPDATE(DIG_FIFO_CTRL0, DIG_FIFO_OUTPUT_PIXEL_MODE, pix_per_container); -} - static const struct link_encoder_funcs dcn32_link_enc_funcs = { .read_state = link_enc2_read_state, .validate_output_with_stream = @@ -186,7 +180,6 @@ static const struct link_encoder_funcs dcn32_link_enc_funcs = { .is_in_alt_mode = dcn32_link_encoder_is_in_alt_mode, .get_max_link_cap = dcn32_link_encoder_get_max_link_cap, .set_dio_phy_mux = dcn31_link_encoder_set_dio_phy_mux, - .set_dig_output_mode = enc32_set_dig_output_mode, }; void dcn32_link_encoder_construct( diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.h b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.h index 749a1e8cb8113..bbcfce06bec01 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.h +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.h @@ -53,8 
+53,4 @@ void dcn32_link_encoder_enable_dp_output( const struct dc_link_settings *link_settings, enum clock_source_id clock_source); -void enc32_set_dig_output_mode( - struct link_encoder *enc, - uint8_t pix_per_container); - #endif /* __DC_LINK_ENCODER__DCN32_H__ */ diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.c index 3195be9d38f58..40e713c4e172d 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.c @@ -411,24 +411,6 @@ static void enc32_read_state(struct stream_encoder *enc, struct enc_state *s) } } -static void enc32_stream_encoder_reset_fifo(struct stream_encoder *enc) -{ - struct dcn10_stream_encoder *enc1 = DCN10STRENC_FROM_STRENC(enc); - uint32_t fifo_enabled; - - REG_GET(DIG_FIFO_CTRL0, DIG_FIFO_ENABLE, &fifo_enabled); - - if (fifo_enabled == 0) { - /* reset DIG resync FIFO */ - REG_UPDATE(DIG_FIFO_CTRL0, DIG_FIFO_RESET, 1); - /* TODO: fix timeout when wait for DIG_FIFO_RESET_DONE */ - //REG_WAIT(DIG_FIFO_CTRL0, DIG_FIFO_RESET_DONE, 1, 1, 100); - udelay(1); - REG_UPDATE(DIG_FIFO_CTRL0, DIG_FIFO_RESET, 0); - REG_WAIT(DIG_FIFO_CTRL0, DIG_FIFO_RESET_DONE, 0, 1, 100); - } -} - static void enc32_set_dig_input_mode(struct stream_encoder *enc, unsigned int pix_per_container) { struct dcn10_stream_encoder *enc1 = DCN10STRENC_FROM_STRENC(enc); @@ -458,8 +440,6 @@ static const struct stream_encoder_funcs dcn32_str_enc_funcs = { enc3_stream_encoder_update_dp_info_packets, .stop_dp_info_packets = enc1_stream_encoder_stop_dp_info_packets, - .reset_fifo = - enc32_stream_encoder_reset_fifo, .dp_blank = enc1_stream_encoder_dp_blank, .dp_unblank = diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubp.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubp.c index 830562f4139dc..f4b901d393ebc 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubp.c +++ 
b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubp.c @@ -185,8 +185,7 @@ static struct hubp_funcs dcn32_hubp_funcs = { .hubp_update_force_pstate_disallow = hubp32_update_force_pstate_disallow, .phantom_hubp_post_enable = hubp32_phantom_hubp_post_enable, .hubp_update_mall_sel = hubp32_update_mall_sel, - .hubp_prepare_subvp_buffering = hubp32_prepare_subvp_buffering, - .hubp_set_flip_int = hubp1_set_flip_int + .hubp_prepare_subvp_buffering = hubp32_prepare_subvp_buffering }; bool hubp32_construct( diff --git a/drivers/gpu/drm/amd/display/dc/dcn321/dcn321_dio_link_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn321/dcn321_dio_link_encoder.c index 49682a31ecbd7..fa9b6603cfd37 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn321/dcn321_dio_link_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn321/dcn321_dio_link_encoder.c @@ -91,7 +91,6 @@ static const struct link_encoder_funcs dcn321_link_enc_funcs = { .is_in_alt_mode = dcn20_link_encoder_is_in_alt_mode, .get_max_link_cap = dcn20_link_encoder_get_max_link_cap, .set_dio_phy_mux = dcn31_link_encoder_set_dio_phy_mux, - .set_dig_output_mode = enc32_set_dig_output_mode, }; void dcn321_link_encoder_construct( diff --git a/drivers/gpu/drm/amd/display/dc/dcn321/dcn321_resource.c b/drivers/gpu/drm/amd/display/dc/dcn321/dcn321_resource.c index aed0f689cbbfa..910b63d874d50 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn321/dcn321_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn321/dcn321_resource.c @@ -94,8 +94,6 @@ #include "dcn20/dcn20_vmid.h" #define DC_LOGGER_INIT(logger) -#define fixed16_to_double(x) (((double)x) / ((double) (1 << 16))) -#define fixed16_to_double_to_cpu(x) fixed16_to_double(le32_to_cpu(x)) enum dcn321_clk_src_array_id { DCN321_CLK_SRC_PLL0, -- GitLab From 47b7dd9f68c12e7d33a0dfd3d9a5bed755097de0 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Date: Tue, 20 Sep 2022 13:29:19 -0400 Subject: [PATCH 1479/2223] drm/amd/display: Update DCN321 hook that deals with pipe aquire DCN provides 
a hook to check if we can have a new pipe allocation based on some DC constraints. If the current configuration supports the new pipe request, DC updates its context; otherwise, it will keep the same configuration. This behavior is similar across multiple ASICs, and for this reason, we reused DCN20 on DCN321. However, this DCN32x has some peculiarities which require its function to avoid weird pipe split issues. This commit update this issue by using dcn32_acquire_idle_pipe_for_head_pipe_in_layer instead of dcn20_acquire_idle_pipe_for_layer. Reviewed-by: Aurabindo Pillai <aurabindo.pillai@amd.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn321/dcn321_resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn321/dcn321_resource.c b/drivers/gpu/drm/amd/display/dc/dcn321/dcn321_resource.c index 910b63d874d50..6658849d5b4e8 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn321/dcn321_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn321/dcn321_resource.c @@ -1604,7 +1604,7 @@ static struct resource_funcs dcn321_res_pool_funcs = { .validate_bandwidth = dcn32_validate_bandwidth, .calculate_wm_and_dlg = dcn32_calculate_wm_and_dlg, .populate_dml_pipes = dcn32_populate_dml_pipes_from_context, - .acquire_idle_pipe_for_layer = dcn20_acquire_idle_pipe_for_layer, + .acquire_idle_pipe_for_head_pipe_in_layer = dcn32_acquire_idle_pipe_for_head_pipe_in_layer, .add_stream_to_ctx = dcn30_add_stream_to_ctx, .add_dsc_to_stream_resource = dcn20_add_dsc_to_stream_resource, .remove_stream_from_ctx = dcn20_remove_stream_from_ctx, -- GitLab From 9114b55fabae5522b7124af4f16ea6ce6378aa19 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Date: Tue, 20 Sep 2022 14:27:04 -0400 Subject: [PATCH 1480/2223] drm/amd/display: Fix SubVP control flow in the MPO context SubVP has some issues related to how we allocate and 
enable it. This commit fixes this behavior by adding the proper check and configuration to the SubVP code path. Reviewed-by: Aurabindo Pillai <aurabindo.pillai@amd.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/core/dc.c | 16 ++++++++++++++-- .../gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c | 18 ------------------ .../drm/amd/display/dc/dcn32/dcn32_resource.c | 6 ++++++ 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 6216ceb790b4b..40a34b600c8ee 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -2946,6 +2946,12 @@ static bool update_planes_and_stream_state(struct dc *dc, dc_resource_state_copy_construct( dc->current_state, context); + /* For each full update, remove all existing phantom pipes first. + * Ensures that we have enough pipes for newly added MPO planes + */ + if (dc->res_pool->funcs->remove_phantom_pipes) + dc->res_pool->funcs->remove_phantom_pipes(dc, context); + /*remove old surfaces from context */ if (!dc_rem_all_planes_for_stream(dc, stream, context)) { @@ -3353,8 +3359,14 @@ static void commit_planes_for_stream(struct dc *dc, /* Since phantom pipe programming is moved to post_unlock_program_front_end, * move the SubVP lock to after the phantom pipes have been setup */ - if (dc->hwss.subvp_pipe_control_lock) - dc->hwss.subvp_pipe_control_lock(dc, context, false, should_lock_all_pipes, NULL, subvp_prev_use); + if (should_lock_all_pipes && dc->hwss.interdependent_update_lock) { + if (dc->hwss.subvp_pipe_control_lock) + dc->hwss.subvp_pipe_control_lock(dc, context, false, should_lock_all_pipes, NULL, subvp_prev_use); + } else { + if (dc->hwss.subvp_pipe_control_lock) + dc->hwss.subvp_pipe_control_lock(dc, context, false, should_lock_all_pipes, NULL, subvp_prev_use); + } + return; } diff 
--git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c index 7de511fd004b5..d732b6f031a12 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c @@ -1860,24 +1860,6 @@ void dcn20_post_unlock_program_front_end( } } - for (i = 0; i < dc->res_pool->pipe_count; i++) { - struct pipe_ctx *pipe = &context->res_ctx.pipe_ctx[i]; - struct pipe_ctx *mpcc_pipe; - - if (pipe->vtp_locked) { - dc->hwseq->funcs.wait_for_blank_complete(pipe->stream_res.opp); - pipe->plane_res.hubp->funcs->set_blank(pipe->plane_res.hubp, true); - pipe->vtp_locked = false; - - for (mpcc_pipe = pipe->bottom_pipe; mpcc_pipe; mpcc_pipe = mpcc_pipe->bottom_pipe) - mpcc_pipe->plane_res.hubp->funcs->set_blank(mpcc_pipe->plane_res.hubp, true); - - for (i = 0; i < dc->res_pool->pipe_count; i++) - if (context->res_ctx.pipe_ctx[i].update_flags.bits.disable) - dc->hwss.disable_plane(dc, &dc->current_state->res_ctx.pipe_ctx[i]); - } - } - for (i = 0; i < dc->res_pool->pipe_count; i++) { struct pipe_ctx *pipe = &context->res_ctx.pipe_ctx[i]; struct pipe_ctx *old_pipe = &dc->current_state->res_ctx.pipe_ctx[i]; diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c index 752a4accb116d..9585b25f10e52 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c @@ -1680,6 +1680,8 @@ static void dcn32_enable_phantom_plane(struct dc *dc, phantom_plane->clip_rect.y = 0; phantom_plane->clip_rect.height = phantom_stream->timing.v_addressable; + phantom_plane->is_phantom = true; + dc_add_plane_to_context(dc, phantom_stream, phantom_plane, context); curr_pipe = curr_pipe->bottom_pipe; @@ -1749,6 +1751,10 @@ bool dcn32_remove_phantom_pipes(struct dc *dc, struct dc_state *context) pipe->stream->mall_stream_config.type = SUBVP_NONE; 
pipe->stream->mall_stream_config.paired_stream = NULL; } + + if (pipe->plane_state) { + pipe->plane_state->is_phantom = false; + } } return removed_pipe; } -- GitLab From b33cd65df18f1cf60b066a02c09df92b4763bb31 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Date: Wed, 21 Sep 2022 08:47:44 -0400 Subject: [PATCH 1481/2223] drm/amd/display: Remove OPTC lock check At some point, we decided to blank HUBP during pixel data blank, and to handle that, we added some OPTC lock checks. Later, we realized that this change caused multiple regression, and we removed it. Nevertheless, we still have some leftovers that might affect some ASIC behavior, and this commit drops those changes to keep the code consistent. Reviewed-by: Aurabindo Pillai <aurabindo.pillai@amd.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c | 11 ----------- drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.h | 1 - drivers/gpu/drm/amd/display/dc/dcn30/dcn30_optc.c | 1 - drivers/gpu/drm/amd/display/dc/dcn31/dcn31_optc.c | 1 - drivers/gpu/drm/amd/display/dc/inc/core_types.h | 1 - .../gpu/drm/amd/display/dc/inc/hw/timing_generator.h | 1 - 6 files changed, 16 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c index ea77392551190..143a900d4d3d3 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c @@ -679,16 +679,6 @@ void optc1_unlock(struct timing_generator *optc) OTG_MASTER_UPDATE_LOCK, 0); } -bool optc1_is_locked(struct timing_generator *optc) -{ - struct optc *optc1 = DCN10TG_FROM_TG(optc); - uint32_t locked; - - REG_GET(OTG_MASTER_UPDATE_LOCK, UPDATE_LOCK_STATUS, &locked); - - return (locked == 1); -} - void optc1_get_position(struct timing_generator *optc, struct crtc_position *position) { @@ -1583,7 +1573,6 @@ static 
const struct timing_generator_funcs dcn10_tg_funcs = { .enable_crtc_reset = optc1_enable_crtc_reset, .disable_reset_trigger = optc1_disable_reset_trigger, .lock = optc1_lock, - .is_locked = optc1_is_locked, .unlock = optc1_unlock, .enable_optc_clock = optc1_enable_optc_clock, .set_drr = optc1_set_drr, diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.h b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.h index 6323ca6dc3b33..88ac5f6f4c96c 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.h +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.h @@ -654,7 +654,6 @@ void optc1_set_blank(struct timing_generator *optc, bool enable_blanking); bool optc1_is_blanked(struct timing_generator *optc); -bool optc1_is_locked(struct timing_generator *optc); void optc1_program_blank_color( struct timing_generator *optc, diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_optc.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_optc.c index 1782b9c26cf4b..02459a64ee211 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_optc.c +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_optc.c @@ -319,7 +319,6 @@ static struct timing_generator_funcs dcn30_tg_funcs = { .enable_crtc_reset = optc1_enable_crtc_reset, .disable_reset_trigger = optc1_disable_reset_trigger, .lock = optc3_lock, - .is_locked = optc1_is_locked, .unlock = optc1_unlock, .lock_doublebuffer_enable = optc3_lock_doublebuffer_enable, .lock_doublebuffer_disable = optc3_lock_doublebuffer_disable, diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_optc.c b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_optc.c index 2f7404a974790..d873def1a8f93 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_optc.c +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_optc.c @@ -260,7 +260,6 @@ static struct timing_generator_funcs dcn31_tg_funcs = { .enable_crtc_reset = optc1_enable_crtc_reset, .disable_reset_trigger = optc1_disable_reset_trigger, .lock = optc3_lock, - .is_locked = optc1_is_locked, .unlock = 
optc1_unlock, .lock_doublebuffer_enable = optc3_lock_doublebuffer_enable, .lock_doublebuffer_disable = optc3_lock_doublebuffer_disable, diff --git a/drivers/gpu/drm/amd/display/dc/inc/core_types.h b/drivers/gpu/drm/amd/display/dc/inc/core_types.h index 4ff1392633a75..1fd7ad8532107 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/core_types.h +++ b/drivers/gpu/drm/amd/display/dc/inc/core_types.h @@ -439,7 +439,6 @@ struct pipe_ctx { union pipe_update_flags update_flags; struct dwbc *dwbc; struct mcif_wb *mcif_wb; - bool vtp_locked; }; /* Data used for dynamic link encoder assignment. diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/timing_generator.h b/drivers/gpu/drm/amd/display/dc/inc/hw/timing_generator.h index 72eef7a5ed83a..25a1df45b2641 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/timing_generator.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/timing_generator.h @@ -209,7 +209,6 @@ struct timing_generator_funcs { void (*set_blank)(struct timing_generator *tg, bool enable_blanking); bool (*is_blanked)(struct timing_generator *tg); - bool (*is_locked)(struct timing_generator *tg); void (*set_overscan_blank_color) (struct timing_generator *tg, const struct tg_color *color); void (*set_blank_color)(struct timing_generator *tg, const struct tg_color *color); void (*set_colors)(struct timing_generator *tg, -- GitLab From f1b47f0004cfff051441aa93b7115d756d5eebb7 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Date: Tue, 20 Sep 2022 15:34:55 -0400 Subject: [PATCH 1482/2223] drm/amd/display: Adding missing HDMI ACP SEND register Add HDMI ACP bit field definition for DCN32. 
Reviewed-by: Aurabindo Pillai <aurabindo.pillai@amd.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.h b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.h index 250d9a341cf66..e80dd2b925037 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.h +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.h @@ -106,6 +106,7 @@ SE_SF(DIG0_HDMI_VBI_PACKET_CONTROL, HDMI_GC_CONT, mask_sh),\ SE_SF(DIG0_HDMI_VBI_PACKET_CONTROL, HDMI_GC_SEND, mask_sh),\ SE_SF(DIG0_HDMI_VBI_PACKET_CONTROL, HDMI_NULL_SEND, mask_sh),\ + SE_SF(DIG0_HDMI_VBI_PACKET_CONTROL, HDMI_ACP_SEND, mask_sh),\ SE_SF(DIG0_HDMI_INFOFRAME_CONTROL0, HDMI_AUDIO_INFO_SEND, mask_sh),\ SE_SF(DIG0_HDMI_INFOFRAME_CONTROL1, HDMI_AUDIO_INFO_LINE, mask_sh),\ SE_SF(DIG0_HDMI_GC, HDMI_GC_AVMUTE, mask_sh),\ -- GitLab From 3f4dee59253a6882acde98a2a027e55f1330ae86 Mon Sep 17 00:00:00 2001 From: Dillon Varone <Dillon.Varone@amd.com> Date: Fri, 23 Sep 2022 14:00:09 -0400 Subject: [PATCH 1483/2223] drm/amd/display: Fix merging dynamic ODM+MPO configs on DCN32 [WHY&HOW?] When merging ODM pipes that are using MPO, we must copy the stream_res from the new top pipe to the bottom pipe so that the overlayed plane is not pointing to the wrong stream assets. 
Reviewed-by: Martin Leung <Martin.Leung@amd.com> Reviewed-by: Jun Lei <Jun.Lei@amd.com> Acked-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Dillon Varone <Dillon.Varone@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c index a56ee04f7df93..f3f98e9a0ce65 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c @@ -1598,6 +1598,9 @@ bool dcn32_internal_validate_bw(struct dc *dc, /*MPC split rules will handle this case*/ pipe->bottom_pipe->top_pipe = NULL; } else { + /* when merging an ODM pipes, the bottom MPC pipe must now point to + * the previous ODM pipe and its associated stream assets + */ if (pipe->prev_odm_pipe->bottom_pipe) { /* 3 plane MPO*/ pipe->bottom_pipe->top_pipe = pipe->prev_odm_pipe->bottom_pipe; @@ -1607,6 +1610,8 @@ bool dcn32_internal_validate_bw(struct dc *dc, pipe->bottom_pipe->top_pipe = pipe->prev_odm_pipe; pipe->prev_odm_pipe->bottom_pipe = pipe->bottom_pipe; } + + memcpy(&pipe->bottom_pipe->stream_res, &pipe->bottom_pipe->top_pipe->stream_res, sizeof(struct stream_resource)); } } -- GitLab From fe674c0b6f5382b7c377ca2c418c26dd78b428b4 Mon Sep 17 00:00:00 2001 From: Eric Bernstein <eric.bernstein@amd.com> Date: Tue, 25 Jan 2022 14:42:12 -0500 Subject: [PATCH 1484/2223] drm/amd/display: Fix disable DSC logic in the DIO code [Why] In DIO stream encoder, definition of DP_DSC_MODE is changed (only enable/disable) In OPTC, OTG_SET_V_TOTAL_MIN_MASK_EN is removed (same as DCN3.1) [How] In DIO stream encoder, update enc32_dp_set_dsc_config(). 
In OPTC, use DCN3.1 version for function interfaces .set_vrr_m_const and .set_drr Reviewed-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Eric Bernstein <eric.bernstein@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.c index 40e713c4e172d..d19fc93dbc75d 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.c @@ -388,7 +388,7 @@ static void enc32_dp_set_dsc_config(struct stream_encoder *enc, { struct dcn10_stream_encoder *enc1 = DCN10STRENC_FROM_STRENC(enc); - REG_UPDATE(DP_DSC_CNTL, DP_DSC_MODE, dsc_mode); + REG_UPDATE(DP_DSC_CNTL, DP_DSC_MODE, dsc_mode == OPTC_DSC_DISABLED ? 0 : 1); } /* this function read dsc related register fields to be logged later in dcn10_log_hw_state -- GitLab From f638fe27b817c755e017b8a6ae4b9b4224461941 Mon Sep 17 00:00:00 2001 From: George Shen <George.Shen@amd.com> Date: Thu, 2 Jun 2022 11:10:25 -0400 Subject: [PATCH 1485/2223] drm/amd/display: Add missing SDP registers to DCN32 reglist [Why] Certain features require the additional DP SDP configuration registers DP_SEC_CNTL1 and DP_SEC_CNTL5 in order to function correctly. The DCN32 DIO stream encoder reglist is currently missing these two registers. [How] Add the missing registers to the DCN32 DIO stream encoder reglist. 
Reviewed-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: George Shen <George.Shen@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.h b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.h index e80dd2b925037..20e5f016a45a3 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.h +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.h @@ -71,7 +71,9 @@ SRI(DP_MSE_RATE_UPDATE, DP, id), \ SRI(DP_PIXEL_FORMAT, DP, id), \ SRI(DP_SEC_CNTL, DP, id), \ + SRI(DP_SEC_CNTL1, DP, id), \ SRI(DP_SEC_CNTL2, DP, id), \ + SRI(DP_SEC_CNTL5, DP, id), \ SRI(DP_SEC_CNTL6, DP, id), \ SRI(DP_STEER_FIFO, DP, id), \ SRI(DP_VID_M, DP, id), \ -- GitLab From 46c87432e3d4cea8e1a7ac6e9e3ebd2462f47617 Mon Sep 17 00:00:00 2001 From: Wenjing Liu <wenjing.liu@amd.com> Date: Thu, 23 Jun 2022 16:09:25 -0400 Subject: [PATCH 1486/2223] drm/amd/display: Add missing mask sh for SYM32_TP_SQ_PULSE register There is a missing register mask in dcn32 causing the hardware programming is not executed when programming SQ_num test pattern for DP2. 
Reviewed-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Wenjing Liu <wenjing.liu@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hpo_dp_link_encoder.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hpo_dp_link_encoder.h b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hpo_dp_link_encoder.h index 9db1323e19337..176b1537d2a13 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hpo_dp_link_encoder.h +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hpo_dp_link_encoder.h @@ -47,6 +47,7 @@ SE_SF(DP_DPHY_SYM320_DP_DPHY_SYM32_TP_CONFIG, TP_PRBS_SEL1, mask_sh),\ SE_SF(DP_DPHY_SYM320_DP_DPHY_SYM32_TP_CONFIG, TP_PRBS_SEL2, mask_sh),\ SE_SF(DP_DPHY_SYM320_DP_DPHY_SYM32_TP_CONFIG, TP_PRBS_SEL3, mask_sh),\ + SE_SF(DP_DPHY_SYM320_DP_DPHY_SYM32_TP_SQ_PULSE, TP_SQ_PULSE_WIDTH, mask_sh),\ SE_SF(DP_DPHY_SYM320_DP_DPHY_SYM32_SAT_VC0, SAT_STREAM_SOURCE, mask_sh),\ SE_SF(DP_DPHY_SYM320_DP_DPHY_SYM32_SAT_VC0, SAT_SLOT_COUNT, mask_sh),\ SE_SF(DP_DPHY_SYM320_DP_DPHY_SYM32_VC_RATE_CNTL0, STREAM_VC_RATE_X, mask_sh),\ -- GitLab From e626d9b9c6e038a6918aad1b5affd38f6b9deaed Mon Sep 17 00:00:00 2001 From: Sonny Jiang <sonny.jiang@amd.com> Date: Fri, 30 Sep 2022 16:23:32 -0400 Subject: [PATCH 1487/2223] drm/amdgpu: Enable VCN PG on GC11_0_1 Enable VCN PG on GC11_0_1 Signed-off-by: Sonny Jiang <sonny.jiang@amd.com> Reviewed-by: James Zhu <James.Zhu@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org # 6.0.x --- drivers/gpu/drm/amd/amdgpu/soc21.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 16b757664a35e..795706b3b092f 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -629,6 +629,7 @@ static int soc21_common_early_init(void *handle) AMD_CG_SUPPORT_JPEG_MGCG; adev->pg_flags = AMD_PG_SUPPORT_GFX_PG | + 
AMD_PG_SUPPORT_VCN | AMD_PG_SUPPORT_VCN_DPG | AMD_PG_SUPPORT_JPEG; adev->external_rev_id = adev->rev_id + 0x1; -- GitLab From 11895d32ffddb50152f0a1e671d36b7f60e4daba Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Date: Tue, 20 Sep 2022 15:46:58 -0400 Subject: [PATCH 1488/2223] drm/amd/display: Add PState change high hook for DCN32 For some reason, we missed the PState check for DCN32 which may cause issues for clock transition. This commit add that required hook. Reviewed-by: Aurabindo Pillai <aurabindo.pillai@amd.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubbub.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubbub.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubbub.c index f6d3da475835b..9fbb72369c10e 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubbub.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubbub.c @@ -936,6 +936,7 @@ static const struct hubbub_funcs hubbub32_funcs = { .program_watermarks = hubbub32_program_watermarks, .allow_self_refresh_control = hubbub1_allow_self_refresh_control, .is_allow_self_refresh_enabled = hubbub1_is_allow_self_refresh_enabled, + .verify_allow_pstate_change_high = hubbub1_verify_allow_pstate_change_high, .force_wm_propagate_to_pipes = hubbub32_force_wm_propagate_to_pipes, .force_pstate_change_control = hubbub3_force_pstate_change_control, .init_watermarks = hubbub32_init_watermarks, -- GitLab From 54fae65ff469a79fc0ca46f480c4e7fce50f3963 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Date: Tue, 20 Sep 2022 16:06:36 -0400 Subject: [PATCH 1489/2223] drm/amd/display: Enable 2 to 1 ODM policy if supported If the current configuration supports 2 to 1 ODM policy, let's also enable the windowed MPO feature. 
Reviewed-by: Aurabindo Pillai <aurabindo.pillai@amd.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org # 6.0.x --- drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c index 8012a48859b59..218927d6ecb20 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c @@ -991,6 +991,10 @@ void dcn32_init_hw(struct dc *dc) dc_dmub_srv_query_caps_cmd(dc->ctx->dmub_srv->dmub); dc->caps.dmub_caps.psr = dc->ctx->dmub_srv->dmub->feature_caps.psr; } + + /* Enable support for ODM and windowed MPO if policy flag is set */ + if (dc->debug.enable_single_display_2to1_odm_policy) + dc->config.enable_windowed_mpo_odm = true; } static int calc_mpc_flow_ctrl_cnt(const struct dc_stream_state *stream, -- GitLab From 36939c94689ae7e6aaa9a0fa37e5c41616f76665 Mon Sep 17 00:00:00 2001 From: Aric Cyr <aric.cyr@amd.com> Date: Mon, 26 Sep 2022 10:21:48 -0400 Subject: [PATCH 1490/2223] drm/amd/display: 3.2.206 This version brings along the following: - ILR improvements - PSR fixes - DCN315 fixes - DCN32 fixes - ODM fixes - DSC fixes - SubVP fixes Reviewed-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Signed-off-by: Aric Cyr <aric.cyr@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 458a4f431ac6e..66b7482d2e729 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -47,7 +47,7 @@ struct aux_payload; struct set_config_cmd_payload; struct dmub_notification; -#define DC_VER "3.2.205" +#define DC_VER "3.2.206" #define MAX_SURFACES 3 #define MAX_PLANES 6 -- 
GitLab From 9691a7a776302c85c10294f1a92c15c7f57a5947 Mon Sep 17 00:00:00 2001 From: Martin Leung <Martin.Leung@amd.com> Date: Mon, 23 May 2022 14:57:30 -0400 Subject: [PATCH 1491/2223] drm/amd/display: unblock mcm_luts why and how: needed to fix bad assumption for enable mcm_luts Reviewed-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Martin Leung <Martin.Leung@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c index 218927d6ecb20..33bdf56b2b3a6 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c @@ -630,10 +630,9 @@ bool dcn32_set_input_transfer_func(struct dc *dc, params = &dpp_base->degamma_params; } - result = dpp_base->funcs->dpp_program_gamcor_lut(dpp_base, params); + dpp_base->funcs->dpp_program_gamcor_lut(dpp_base, params); - if (result && - pipe_ctx->stream_res.opp && + if (pipe_ctx->stream_res.opp && pipe_ctx->stream_res.opp->ctx && hws->funcs.set_mcm_luts) result = hws->funcs.set_mcm_luts(pipe_ctx, plane_state); -- GitLab From 07ebc18c047adcd72905619e72ae7c48db28ab48 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Date: Thu, 22 Sep 2022 09:24:12 -0400 Subject: [PATCH 1492/2223] drm/amd/display: Disconnect DSC for unused pipes during ODM transition [Why] During transition from ODM combine to ODM bypass, if DSC is enabled need to disconnect the DSC mux for pipes no longer in use. [How] During ODM update, detect pipes with DSC that are no longer being used for new state and call new DSC interface to disconnect. 
Add new DSC interface to disconnect from pipe Reviewed-by: Alvin Lee <Alvin.Lee2@amd.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../drm/amd/display/dc/dcn32/dcn32_hwseq.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c index 33bdf56b2b3a6..955ca273cfe1e 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c @@ -1148,16 +1148,19 @@ void dcn32_update_odm(struct dc *dc, struct dc_state *context, struct pipe_ctx * true); } - // Don't program pixel clock after link is already enabled -/* if (false == pipe_ctx->clock_source->funcs->program_pix_clk( - pipe_ctx->clock_source, - &pipe_ctx->stream_res.pix_clk_params, - &pipe_ctx->pll_settings)) { - BREAK_TO_DEBUGGER(); - }*/ + if (pipe_ctx->stream_res.dsc) { + struct pipe_ctx *current_pipe_ctx = &dc->current_state->res_ctx.pipe_ctx[pipe_ctx->pipe_idx]; - if (pipe_ctx->stream_res.dsc) update_dsc_on_stream(pipe_ctx, pipe_ctx->stream->timing.flags.DSC); + + /* Check if no longer using pipe for ODM, then need to disconnect DSC for that pipe */ + if (!pipe_ctx->next_odm_pipe && current_pipe_ctx->next_odm_pipe && + current_pipe_ctx->next_odm_pipe->stream_res.dsc) { + struct display_stream_compressor *dsc = current_pipe_ctx->next_odm_pipe->stream_res.dsc; + /* disconnect DSC block from stream */ + dsc->funcs->dsc_disconnect(dsc); + } + } } unsigned int dcn32_calculate_dccg_k1_k2_values(struct pipe_ctx *pipe_ctx, unsigned int *k1_div, unsigned int *k2_div) -- GitLab From a3daede47576037ff7bbbe9cbd36e52a71d92bc8 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Date: Thu, 22 Sep 2022 10:06:48 -0400 Subject: [PATCH 1493/2223] drm/amd/display: update DSC for DCN32 Update DSC checks in the DCN32 VBA. 
Reviewed-by: Aurabindo Pillai <aurabindo.pillai@amd.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c index 75be1e1ce543f..8316b1b914c67 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c @@ -2252,9 +2252,8 @@ void dml32_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_l for (k = 0; k <= mode_lib->vba.NumberOfActiveSurfaces - 1; k++) { if (!(mode_lib->vba.DSCInputBitPerComponent[k] == 12.0 || mode_lib->vba.DSCInputBitPerComponent[k] == 10.0 - || mode_lib->vba.DSCInputBitPerComponent[k] == 8.0 - || mode_lib->vba.DSCInputBitPerComponent[k] > - mode_lib->vba.MaximumDSCBitsPerComponent)) { + || mode_lib->vba.DSCInputBitPerComponent[k] == 8.0) + || mode_lib->vba.DSCInputBitPerComponent[k] > mode_lib->vba.MaximumDSCBitsPerComponent) { mode_lib->vba.NonsupportedDSCInputBPC = true; } } @@ -2330,16 +2329,15 @@ void dml32_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_l if (mode_lib->vba.OutputMultistreamId[k] == k && mode_lib->vba.ForcedOutputLinkBPP[k] == 0) mode_lib->vba.BPPForMultistreamNotIndicated = true; for (j = 0; j < mode_lib->vba.NumberOfActiveSurfaces; ++j) { - if (mode_lib->vba.OutputMultistreamId[k] == j && mode_lib->vba.OutputMultistreamEn[k] + if (mode_lib->vba.OutputMultistreamId[k] == j && mode_lib->vba.ForcedOutputLinkBPP[k] == 0) mode_lib->vba.BPPForMultistreamNotIndicated = true; } } if ((mode_lib->vba.Output[k] == dm_edp || mode_lib->vba.Output[k] == dm_hdmi)) { - if (mode_lib->vba.OutputMultistreamId[k] == k && mode_lib->vba.OutputMultistreamEn[k]) + if (mode_lib->vba.OutputMultistreamEn[k] == 
true && mode_lib->vba.OutputMultistreamId[k] == k) mode_lib->vba.MultistreamWithHDMIOreDP = true; - for (j = 0; j < mode_lib->vba.NumberOfActiveSurfaces; ++j) { if (mode_lib->vba.OutputMultistreamEn[k] == true && mode_lib->vba.OutputMultistreamId[k] == j) mode_lib->vba.MultistreamWithHDMIOreDP = true; -- GitLab From 7e6d5cf8e3e3f8050de52a28236d5a172caf2da9 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Date: Thu, 22 Sep 2022 10:08:11 -0400 Subject: [PATCH 1494/2223] drm/amd/display: Minor code style change This commit adds some minor code style changes just to reduce the merge conflicts we have when we upstream some of the VBA code. Reviewed-by: Aurabindo Pillai <aurabindo.pillai@amd.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c index 8316b1b914c67..11d5750e15afe 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c @@ -2476,8 +2476,6 @@ void dml32_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_l mode_lib->vba.PixelClock[k], mode_lib->vba.PixelClockBackEnd[k]); } - m = 0; - for (k = 0; k <= mode_lib->vba.NumberOfActiveSurfaces - 1; k++) { for (m = 0; m <= mode_lib->vba.NumberOfActiveSurfaces - 1; m++) { for (j = 0; j <= mode_lib->vba.NumberOfActiveSurfaces - 1; j++) { @@ -2854,8 +2852,6 @@ void dml32_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_l } } - m = 0; - //Calculate Return BW for (i = 0; i < (int) v->soc.num_states; ++i) { for (j = 0; j <= 1; ++j) { @@ -3616,11 +3612,10 @@ void dml32_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_l 
mode_lib->vba.ModeIsSupported = mode_lib->vba.ModeSupport[i][0] == true || mode_lib->vba.ModeSupport[i][1] == true; - if (mode_lib->vba.ModeSupport[i][0] == true) { + if (mode_lib->vba.ModeSupport[i][0] == true) MaximumMPCCombine = 0; - } else { + else MaximumMPCCombine = 1; - } } } -- GitLab From 95c985ffc63e2a7d8f6aa18f9351f5010a8d1adb Mon Sep 17 00:00:00 2001 From: Yang Li <yang.lee@linux.alibaba.com> Date: Fri, 30 Sep 2022 13:38:58 +0800 Subject: [PATCH 1495/2223] drm/amd/display: clean up one inconsistent indenting clean up one inconsistent indenting Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2238 Reported-by: Abaci Robot <abaci@linux.alibaba.com> Signed-off-by: Yang Li <yang.lee@linux.alibaba.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn321/dcn321_resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn321/dcn321_resource.c b/drivers/gpu/drm/amd/display/dc/dcn321/dcn321_resource.c index 6658849d5b4e8..61087f2385a96 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn321/dcn321_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn321/dcn321_resource.c @@ -1654,7 +1654,7 @@ static bool dcn321_resource_construct( #undef REG_STRUCT #define REG_STRUCT dccg_regs - dccg_regs_init(); + dccg_regs_init(); ctx->dc_bios->regs = &bios_regs; -- GitLab From 8c39634d28fa460869702b9801d2efe06671b342 Mon Sep 17 00:00:00 2001 From: Yang Li <yang.lee@linux.alibaba.com> Date: Fri, 30 Sep 2022 13:38:59 +0800 Subject: [PATCH 1496/2223] drm/amd/display: clean up one inconsistent indenting clean up one inconsistent indenting Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2321 Reported-by: Abaci Robot <abaci@linux.alibaba.com> Signed-off-by: Yang Li <yang.lee@linux.alibaba.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c index 559e563d5bc16..f04595b750abc 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c @@ -852,7 +852,7 @@ static struct hubbub *dcn301_hubbub_create(struct dc_context *ctx) vmid->masks = &vmid_masks; } - hubbub3->num_vmid = res_cap_dcn301.num_vmid; + hubbub3->num_vmid = res_cap_dcn301.num_vmid; return &hubbub3->base; } -- GitLab From 525530ad9a7ec9aa34266e1429cc5ef9acb58e6c Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Thu, 29 Sep 2022 22:20:15 +0800 Subject: [PATCH 1497/2223] drm/amdgpu/sdma: add missing release_firmware() in amdgpu_sdma_init_microcode() In some error paths in amdgpu_sdma_init_microcode(), release_firmware() is not called, so the memory allocated by request_firmware() is leaked. Call amdgpu_sdma_destroy_inst_ctx(), which calls release_firmware(), on those paths to avoid the memory leak.
Fixes: 15aa13056d11da ("drm/amdgpu: add function to init SDMA microcode") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index 3949b7e3907f0..43cf8632cc1ac 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -222,8 +222,10 @@ int amdgpu_sdma_init_microcode(struct amdgpu_device *adev, adev->sdma.instance[instance].fw->data; version_major = le16_to_cpu(header->header_version_major); - if ((duplicate && instance) || (!duplicate && version_major > 1)) - return -EINVAL; + if ((duplicate && instance) || (!duplicate && version_major > 1)) { + err = -EINVAL; + goto out; + } err = amdgpu_sdma_init_inst_ctx(&adev->sdma.instance[instance]); if (err) @@ -272,7 +274,7 @@ int amdgpu_sdma_init_microcode(struct amdgpu_device *adev, ALIGN(le32_to_cpu(sdma_hdr->ctl_ucode_size_bytes), PAGE_SIZE); break; default: - return -EINVAL; + err = -EINVAL; } } -- GitLab From 21a550de5faf9f54013334c9a6a7643b8fd80b36 Mon Sep 17 00:00:00 2001 From: Ruili Ji <ruiliji2@amd.com> Date: Mon, 3 Oct 2022 17:39:45 +0800 Subject: [PATCH 1498/2223] drm/amdgpu: Enable F32_WPTR_POLL_ENABLE in mqd This patch is to fix the SDMA user queue doorbell missing issue on SDMA 6.0. F32_WPTR_POLL_ENABLE has to be set if doorbell mode is used. Otherwise ringing SDMA user queue doorbell can't wake up system from gfxoff. 
Signed-off-by: Ruili Ji <ruiliji2@amd.com> Reviewed-by: Yifan Zhang <yifan1.zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org # 6.0.x --- drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 3 ++- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c index db51230163c5c..0150f66a5ae6d 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c @@ -846,7 +846,8 @@ static int sdma_v6_0_mqd_init(struct amdgpu_device *adev, void *mqd, m->sdmax_rlcx_rb_cntl = order_base_2(prop->queue_size / 4) << SDMA0_QUEUE0_RB_CNTL__RB_SIZE__SHIFT | 1 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | - 4 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + 4 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT | + 1 << SDMA0_QUEUE0_RB_CNTL__F32_WPTR_POLL_ENABLE__SHIFT; m->sdmax_rlcx_rb_base = lower_32_bits(prop->hqd_base_gpu_addr >> 8); m->sdmax_rlcx_rb_base_hi = upper_32_bits(prop->hqd_base_gpu_addr >> 8); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c index 26b53b6d673e5..4f6390f3236ef 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c @@ -333,7 +333,8 @@ static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, << SDMA0_QUEUE0_RB_CNTL__RB_SIZE__SHIFT | q->vmid << SDMA0_QUEUE0_RB_CNTL__RB_VMID__SHIFT | 1 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | - 6 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + 6 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT | + 1 << SDMA0_QUEUE0_RB_CNTL__F32_WPTR_POLL_ENABLE__SHIFT; m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); -- GitLab From f6aa84b83aee629fbbbc4ea16c2c142caf920d5a Mon Sep 17 
00:00:00 2001 From: Roman Li <roman.li@amd.com> Date: Thu, 29 Sep 2022 14:37:00 -0400 Subject: [PATCH 1499/2223] drm/amd/display: Enable dpia support for dcn314 [Why] DCN 3.1.4 supports DPIA. [How] - Set dpia_supported flag for dcn314 in dmub_hw_init() - Remove comment that becomes irrelevant after this change. Signed-off-by: Roman Li <roman.li@amd.com> Reviewed-by: Nicholas Kazlauskas <Nicholas.Kazlauskas@amd.com> Reviewed-by: Mario Limonciello <mario.limonciello@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org # 6.0.x --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 4c73727e0b7d5..d5222d5e3a616 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1110,7 +1110,8 @@ static int dm_dmub_hw_init(struct amdgpu_device *adev) hw_params.fb[i] = &fb_info->fb[i]; switch (adev->ip_versions[DCE_HWIP][0]) { - case IP_VERSION(3, 1, 3): /* Only for this asic hw internal rev B0 */ + case IP_VERSION(3, 1, 3): + case IP_VERSION(3, 1, 4): hw_params.dpia_supported = true; hw_params.disable_dpia = adev->dm.dc->debug.dpia_debug.bits.disable_dpia; break; -- GitLab From 8799c0be89ebb99a16098bdf618f49f817bef76a Mon Sep 17 00:00:00 2001 From: Yunxiang Li <Yunxiang.Li@amd.com> Date: Wed, 21 Sep 2022 17:20:19 -0400 Subject: [PATCH 1500/2223] drm/amd/display: Fix vblank refcount in vrr transition manage_dm_interrupts disable/enable vblank using drm_crtc_vblank_off/on which causes drm_crtc_vblank_get in vrr_transition to fail, and later when drm_crtc_vblank_put is called the refcount on vblank will be messed up. Therefore move the call to after manage_dm_interrupts. 
Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1247 Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1380 Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Reviewed-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 55 +++++++++---------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index d5222d5e3a616..b84aedb707b8f 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -7479,15 +7479,15 @@ static void amdgpu_dm_handle_vrr_transition(struct dm_crtc_state *old_state, * We also need vupdate irq for the actual core vblank handling * at end of vblank. */ - dm_set_vupdate_irq(new_state->base.crtc, true); - drm_crtc_vblank_get(new_state->base.crtc); + WARN_ON(dm_set_vupdate_irq(new_state->base.crtc, true) != 0); + WARN_ON(drm_crtc_vblank_get(new_state->base.crtc) != 0); DRM_DEBUG_DRIVER("%s: crtc=%u VRR off->on: Get vblank ref\n", __func__, new_state->base.crtc->base.id); } else if (old_vrr_active && !new_vrr_active) { /* Transition VRR active -> inactive: * Allow vblank irq disable again for fixed refresh rate. */ - dm_set_vupdate_irq(new_state->base.crtc, false); + WARN_ON(dm_set_vupdate_irq(new_state->base.crtc, false) != 0); drm_crtc_vblank_put(new_state->base.crtc); DRM_DEBUG_DRIVER("%s: crtc=%u VRR on->off: Drop vblank ref\n", __func__, new_state->base.crtc->base.id); @@ -8243,23 +8243,6 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) mutex_unlock(&dm->dc_lock); } - /* Count number of newly disabled CRTCs for dropping PM refs later. 
*/ - for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state, - new_crtc_state, i) { - if (old_crtc_state->active && !new_crtc_state->active) - crtc_disable_count++; - - dm_new_crtc_state = to_dm_crtc_state(new_crtc_state); - dm_old_crtc_state = to_dm_crtc_state(old_crtc_state); - - /* For freesync config update on crtc state and params for irq */ - update_stream_irq_parameters(dm, dm_new_crtc_state); - - /* Handle vrr on->off / off->on transitions */ - amdgpu_dm_handle_vrr_transition(dm_old_crtc_state, - dm_new_crtc_state); - } - /** * Enable interrupts for CRTCs that are newly enabled or went through * a modeset. It was intentionally deferred until after the front end @@ -8269,16 +8252,29 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state, new_crtc_state, i) { struct amdgpu_crtc *acrtc = to_amdgpu_crtc(crtc); #ifdef CONFIG_DEBUG_FS - bool configure_crc = false; enum amdgpu_dm_pipe_crc_source cur_crc_src; #if defined(CONFIG_DRM_AMD_SECURE_DISPLAY) - struct crc_rd_work *crc_rd_wrk = dm->crc_rd_wrk; + struct crc_rd_work *crc_rd_wrk; +#endif +#endif + /* Count number of newly disabled CRTCs for dropping PM refs later. 
*/ + if (old_crtc_state->active && !new_crtc_state->active) + crtc_disable_count++; + + dm_new_crtc_state = to_dm_crtc_state(new_crtc_state); + dm_old_crtc_state = to_dm_crtc_state(old_crtc_state); + + /* For freesync config update on crtc state and params for irq */ + update_stream_irq_parameters(dm, dm_new_crtc_state); + +#ifdef CONFIG_DEBUG_FS +#if defined(CONFIG_DRM_AMD_SECURE_DISPLAY) + crc_rd_wrk = dm->crc_rd_wrk; #endif spin_lock_irqsave(&adev_to_drm(adev)->event_lock, flags); cur_crc_src = acrtc->dm_irq_params.crc_src; spin_unlock_irqrestore(&adev_to_drm(adev)->event_lock, flags); #endif - dm_new_crtc_state = to_dm_crtc_state(new_crtc_state); if (new_crtc_state->active && (!old_crtc_state->active || @@ -8286,16 +8282,19 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) dc_stream_retain(dm_new_crtc_state->stream); acrtc->dm_irq_params.stream = dm_new_crtc_state->stream; manage_dm_interrupts(adev, acrtc, true); + } + /* Handle vrr on->off / off->on transitions */ + amdgpu_dm_handle_vrr_transition(dm_old_crtc_state, dm_new_crtc_state); #ifdef CONFIG_DEBUG_FS + if (new_crtc_state->active && + (!old_crtc_state->active || + drm_atomic_crtc_needs_modeset(new_crtc_state))) { /** * Frontend may have changed so reapply the CRC capture * settings for the stream. 
*/ - dm_new_crtc_state = to_dm_crtc_state(new_crtc_state); - if (amdgpu_dm_is_valid_crc_source(cur_crc_src)) { - configure_crc = true; #if defined(CONFIG_DRM_AMD_SECURE_DISPLAY) if (amdgpu_dm_crc_window_is_activated(crtc)) { spin_lock_irqsave(&adev_to_drm(adev)->event_lock, flags); @@ -8307,12 +8306,10 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) spin_unlock_irqrestore(&adev_to_drm(adev)->event_lock, flags); } #endif - } - - if (configure_crc) if (amdgpu_dm_crtc_configure_crc_source( crtc, dm_new_crtc_state, cur_crc_src)) DRM_DEBUG_DRIVER("Failed to configure crc source"); + } #endif } } -- GitLab From 7d30ccc7761cfcd6756aa0b760c5f5493038d30a Mon Sep 17 00:00:00 2001 From: Randy Dunlap <rdunlap@infradead.org> Date: Fri, 30 Sep 2022 21:33:54 -0700 Subject: [PATCH 1501/2223] drm/amd/display: clean up dcn32_fpu.c kernel-doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rectify multiple kernel-doc warnings in dcn32_fpu.c. E.g.: drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/dcn32_fpu.c:247: warning: This comment starts with '/**', but isn't a kernel-doc comment. 
Refer Documentation/doc-guide/kernel-doc.rst * Finds dummy_latency_index when MCLK switching using firmware based drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/dcn32_fpu.c:484: warning: Function parameter or member 'phantom_stream' not described in 'dcn32_set_phantom_stream_timing' drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/dcn32_fpu.c:601: warning: Function parameter or member 'dc' not described in 'dcn32_assign_subvp_pipe' drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/dcn32_fpu.c:601: warning: Function parameter or member 'context' not described in 'dcn32_assign_subvp_pipe' drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/dcn32_fpu.c:601: warning: Function parameter or member 'index' not described in 'dcn32_assign_subvp_pipe' drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/dcn32_fpu.c:2140: warning: Function parameter or member 'dc' not described in 'dcn32_update_bw_bounding_box_fpu' drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/dcn32_fpu.c:2140: warning: Function parameter or member 'bw_params' not described in 'dcn32_update_bw_bounding_box_fpu' drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/dcn32_fpu.c:2140: warning: expecting prototype for dcn32_update_bw_bounding_box(). 
Prototype was for dcn32_update_bw_bounding_box_fpu() instead Reviewed-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Cc: George Shen <george.shen@amd.com> Cc: Alvin Lee <alvin.lee2@amd.com> Cc: Nevenko Stupar <Nevenko.Stupar@amd.com> Cc: Harry Wentland <harry.wentland@amd.com> Cc: Leo Li <sunpeng.li@amd.com> Cc: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Cc: amd-gfx@lists.freedesktop.org Cc: dri-devel@lists.freedesktop.org Cc: Alex Deucher <alexander.deucher@amd.com> Cc: Christian König <christian.koenig@amd.com> Cc: "Pan, Xinhui" <Xinhui.Pan@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../drm/amd/display/dc/dml/dcn32/dcn32_fpu.c | 116 ++++++++---------- 1 file changed, 49 insertions(+), 67 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c index f3f98e9a0ce65..6bdd509d292a6 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c @@ -243,7 +243,7 @@ void dcn32_build_wm_range_table_fpu(struct clk_mgr_internal *clk_mgr) clk_mgr->base.bw_params->wm_table.nv_entries[WM_D].pmfw_breakdown.max_uclk = 0xFFFF; } -/** +/* * Finds dummy_latency_index when MCLK switching using firmware based * vblank stretch is enabled. This function will iterate through the * table of dummy pstate latencies until the lowest value that allows @@ -290,15 +290,14 @@ int dcn32_find_dummy_latency_index_for_fw_based_mclk_switch(struct dc *dc, /** * dcn32_helper_populate_phantom_dlg_params - Get DLG params for phantom pipes * and populate pipe_ctx with those params. 
- * - * This function must be called AFTER the phantom pipes are added to context - * and run through DML (so that the DLG params for the phantom pipes can be - * populated), and BEFORE we program the timing for the phantom pipes. - * * @dc: [in] current dc state * @context: [in] new dc state * @pipes: [in] DML pipe params array * @pipe_cnt: [in] DML pipe count + * + * This function must be called AFTER the phantom pipes are added to context + * and run through DML (so that the DLG params for the phantom pipes can be + * populated), and BEFORE we program the timing for the phantom pipes. */ void dcn32_helper_populate_phantom_dlg_params(struct dc *dc, struct dc_state *context, @@ -331,8 +330,9 @@ void dcn32_helper_populate_phantom_dlg_params(struct dc *dc, } /** - * ******************************************************************************************* - * dcn32_predict_pipe_split: Predict if pipe split will occur for a given DML pipe + * dcn32_predict_pipe_split - Predict if pipe split will occur for a given DML pipe + * @context: [in] New DC state to be programmed + * @pipe_e2e: [in] DML pipe end to end context * * This function takes in a DML pipe (pipe_e2e) and predicts if pipe split is required (both * ODM and MPC). For pipe split, ODM combine is determined by the ODM mode, and MPC combine is @@ -343,12 +343,7 @@ void dcn32_helper_populate_phantom_dlg_params(struct dc *dc, * - MPC combine is only chosen if there is no ODM combine requirements / policy in place, and * MPC is required * - * @param [in]: context: New DC state to be programmed - * @param [in]: pipe_e2e: DML pipe end to end context - * - * @return: Number of splits expected (1 for 2:1 split, 3 for 4:1 split, 0 for no splits). - * - * ******************************************************************************************* + * Return: Number of splits expected (1 for 2:1 split, 3 for 4:1 split, 0 for no splits). 
*/ uint8_t dcn32_predict_pipe_split(struct dc_state *context, display_e2e_pipe_params_st *pipe_e2e) @@ -504,7 +499,14 @@ void insert_entry_into_table_sorted(struct _vcs_dpi_voltage_scaling_st *table, } /** - * dcn32_set_phantom_stream_timing: Set timing params for the phantom stream + * dcn32_set_phantom_stream_timing - Set timing params for the phantom stream + * @dc: current dc state + * @context: new dc state + * @ref_pipe: Main pipe for the phantom stream + * @phantom_stream: target phantom stream state + * @pipes: DML pipe params + * @pipe_cnt: number of DML pipes + * @dc_pipe_idx: DC pipe index for the main pipe (i.e. ref_pipe) * * Set timing params of the phantom stream based on calculated output from DML. * This function first gets the DML pipe index using the DC pipe index, then @@ -517,13 +519,6 @@ void insert_entry_into_table_sorted(struct _vcs_dpi_voltage_scaling_st *table, * that separately. * * - Set phantom backporch = vstartup of main pipe - * - * @dc: current dc state - * @context: new dc state - * @ref_pipe: Main pipe for the phantom stream - * @pipes: DML pipe params - * @pipe_cnt: number of DML pipes - * @dc_pipe_idx: DC pipe index for the main pipe (i.e. ref_pipe) */ void dcn32_set_phantom_stream_timing(struct dc *dc, struct dc_state *context, @@ -592,16 +587,14 @@ void dcn32_set_phantom_stream_timing(struct dc *dc, } /** - * dcn32_get_num_free_pipes: Calculate number of free pipes + * dcn32_get_num_free_pipes - Calculate number of free pipes + * @dc: current dc state + * @context: new dc state * * This function assumes that a "used" pipe is a pipe that has * both a stream and a plane assigned to it. 
* - * @dc: current dc state - * @context: new dc state - * - * Return: - * Number of free pipes available in the context + * Return: Number of free pipes available in the context */ static unsigned int dcn32_get_num_free_pipes(struct dc *dc, struct dc_state *context) { @@ -625,7 +618,10 @@ static unsigned int dcn32_get_num_free_pipes(struct dc *dc, struct dc_state *con } /** - * dcn32_assign_subvp_pipe: Function to decide which pipe will use Sub-VP. + * dcn32_assign_subvp_pipe - Function to decide which pipe will use Sub-VP. + * @dc: current dc state + * @context: new dc state + * @index: [out] dc pipe index for the pipe chosen to have phantom pipes assigned * * We enter this function if we are Sub-VP capable (i.e. enough pipes available) * and regular P-State switching (i.e. VACTIVE/VBLANK) is not supported, or if @@ -639,12 +635,7 @@ static unsigned int dcn32_get_num_free_pipes(struct dc *dc, struct dc_state *con * for determining which should be the SubVP pipe (need a way to determine if a pipe / plane doesn't * support MCLK switching naturally [i.e. ACTIVE or VBLANK]). * - * @param dc: current dc state - * @param context: new dc state - * @param index: [out] dc pipe index for the pipe chosen to have phantom pipes assigned - * - * Return: - * True if a valid pipe assignment was found for Sub-VP. Otherwise false. + * Return: True if a valid pipe assignment was found for Sub-VP. Otherwise false. */ static bool dcn32_assign_subvp_pipe(struct dc *dc, struct dc_state *context, @@ -711,7 +702,9 @@ static bool dcn32_assign_subvp_pipe(struct dc *dc, } /** - * dcn32_enough_pipes_for_subvp: Function to check if there are "enough" pipes for SubVP. + * dcn32_enough_pipes_for_subvp - Function to check if there are "enough" pipes for SubVP. 
+ * @dc: current dc state + * @context: new dc state * * This function returns true if there are enough free pipes * to create the required phantom pipes for any given stream @@ -723,9 +716,6 @@ static bool dcn32_assign_subvp_pipe(struct dc *dc, * pipe which can be used as the phantom pipe for the non pipe * split pipe. * - * @dc: current dc state - * @context: new dc state - * * Return: * True if there are enough free pipes to assign phantom pipes to at least one * stream that does not already have phantom pipes assigned. Otherwise false. @@ -764,7 +754,9 @@ static bool dcn32_enough_pipes_for_subvp(struct dc *dc, struct dc_state *context } /** - * subvp_subvp_schedulable: Determine if SubVP + SubVP config is schedulable + * subvp_subvp_schedulable - Determine if SubVP + SubVP config is schedulable + * @dc: current dc state + * @context: new dc state * * High level algorithm: * 1. Find longest microschedule length (in us) between the two SubVP pipes @@ -772,11 +764,7 @@ static bool dcn32_enough_pipes_for_subvp(struct dc *dc, struct dc_state *context * pipes still allows for the maximum microschedule to fit in the active * region for both pipes. * - * @dc: current dc state - * @context: new dc state - * - * Return: - * bool - True if the SubVP + SubVP config is schedulable, false otherwise + * Return: True if the SubVP + SubVP config is schedulable, false otherwise */ static bool subvp_subvp_schedulable(struct dc *dc, struct dc_state *context) { @@ -836,7 +824,10 @@ static bool subvp_subvp_schedulable(struct dc *dc, struct dc_state *context) } /** - * subvp_drr_schedulable: Determine if SubVP + DRR config is schedulable + * subvp_drr_schedulable - Determine if SubVP + DRR config is schedulable + * @dc: current dc state + * @context: new dc state + * @drr_pipe: DRR pipe_ctx for the SubVP + DRR config * * High level algorithm: * 1. 
Get timing for SubVP pipe, phantom pipe, and DRR pipe @@ -845,12 +836,7 @@ static bool subvp_subvp_schedulable(struct dc *dc, struct dc_state *context) * 3.If (SubVP Active - Prefetch > Stretched DRR frame + max(MALL region, Stretched DRR frame)) * then report the configuration as supported * - * @dc: current dc state - * @context: new dc state - * @drr_pipe: DRR pipe_ctx for the SubVP + DRR config - * - * Return: - * bool - True if the SubVP + DRR config is schedulable, false otherwise + * Return: True if the SubVP + DRR config is schedulable, false otherwise */ static bool subvp_drr_schedulable(struct dc *dc, struct dc_state *context, struct pipe_ctx *drr_pipe) { @@ -914,7 +900,9 @@ static bool subvp_drr_schedulable(struct dc *dc, struct dc_state *context, struc /** - * subvp_vblank_schedulable: Determine if SubVP + VBLANK config is schedulable + * subvp_vblank_schedulable - Determine if SubVP + VBLANK config is schedulable + * @dc: current dc state + * @context: new dc state * * High level algorithm: * 1. Get timing for SubVP pipe, phantom pipe, and VBLANK pipe @@ -922,11 +910,7 @@ static bool subvp_drr_schedulable(struct dc *dc, struct dc_state *context, struc * then report the configuration as supported * 3. If the VBLANK display is DRR, then take the DRR static schedulability path * - * @dc: current dc state - * @context: new dc state - * - * Return: - * bool - True if the SubVP + VBLANK/DRR config is schedulable, false otherwise + * Return: True if the SubVP + VBLANK/DRR config is schedulable, false otherwise */ static bool subvp_vblank_schedulable(struct dc *dc, struct dc_state *context) { @@ -1003,20 +987,18 @@ static bool subvp_vblank_schedulable(struct dc *dc, struct dc_state *context) } /** - * subvp_validate_static_schedulability: Check which SubVP case is calculated and handle - * static analysis based on the case. + * subvp_validate_static_schedulability - Check which SubVP case is calculated + * and handle static analysis based on the case. 
+ * @dc: current dc state + * @context: new dc state + * @vlevel: Voltage level calculated by DML * * Three cases: * 1. SubVP + SubVP * 2. SubVP + VBLANK (DRR checked internally) * 3. SubVP + VACTIVE (currently unsupported) * - * @dc: current dc state - * @context: new dc state - * @vlevel: Voltage level calculated by DML - * - * Return: - * bool - True if statically schedulable, false otherwise + * Return: True if statically schedulable, false otherwise */ static bool subvp_validate_static_schedulability(struct dc *dc, struct dc_state *context, @@ -2281,7 +2263,7 @@ static int build_synthetic_soc_states(struct clk_bw_params *bw_params, return 0; } -/** +/* * dcn32_update_bw_bounding_box * * This would override some dcn3_2 ip_or_soc initial parameters hardcoded from -- GitLab From 5e69732d4a89928b7daaa651ad869cebee28bfff Mon Sep 17 00:00:00 2001 From: Dong Chenchen <dongchenchen2@huawei.com> Date: Fri, 30 Sep 2022 14:38:27 +0800 Subject: [PATCH 1502/2223] drm/amd/display: Removed unused variable 'sdp_stream_enable' Kernel test robot throws below warning -> drivers/gpu/drm/amd/amdgpu/../display/dc/dcn31/dcn31_hpo_dp_stream_encoder.c: In function 'dcn31_hpo_dp_stream_enc_update_dp_info_packets': drivers/gpu/drm/amd/amdgpu/../display/dc/dcn31/dcn31_hpo_dp_stream_encoder.c:439:14: warning: variable 'sdp_stream_enable' set but not used [-Wunused-but-set-variable] 439 | bool sdp_stream_enable = false; Removed unused variable 'sdp_stream_enable'. 
Reviewed-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Dong Chenchen <dongchenchen2@huawei.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../dc/dcn31/dcn31_hpo_dp_stream_encoder.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hpo_dp_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hpo_dp_stream_encoder.c index d71d89268a07a..814f401db3b34 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hpo_dp_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hpo_dp_stream_encoder.c @@ -436,32 +436,28 @@ static void dcn31_hpo_dp_stream_enc_update_dp_info_packets( { struct dcn31_hpo_dp_stream_encoder *enc3 = DCN3_1_HPO_DP_STREAM_ENC_FROM_HPO_STREAM_ENC(enc); uint32_t dmdata_packet_enabled = 0; - bool sdp_stream_enable = false; - if (info_frame->vsc.valid) { + if (info_frame->vsc.valid) enc->vpg->funcs->update_generic_info_packet( enc->vpg, 0, /* packetIndex */ &info_frame->vsc, true); - sdp_stream_enable = true; - } - if (info_frame->spd.valid) { + + if (info_frame->spd.valid) enc->vpg->funcs->update_generic_info_packet( enc->vpg, 2, /* packetIndex */ &info_frame->spd, true); - sdp_stream_enable = true; - } - if (info_frame->hdrsmd.valid) { + + if (info_frame->hdrsmd.valid) enc->vpg->funcs->update_generic_info_packet( enc->vpg, 3, /* packetIndex */ &info_frame->hdrsmd, true); - sdp_stream_enable = true; - } + /* enable/disable transmission of packet(s). 
* If enabled, packet transmission begins on the next frame */ -- GitLab From 7e4ab9fb2b9449ef01977e79157d06c8900f73fd Mon Sep 17 00:00:00 2001 From: Yuan Can <yuancan@huawei.com> Date: Tue, 27 Sep 2022 13:39:08 +0000 Subject: [PATCH 1503/2223] drm/amd/display: Remove unused struct i2c_id_config_access After commit 5a8132b9f606 ("drm/amd/display: remove dead dc vbios code"), no one use struct i2c_id_config_access, so remove it. Reviewed-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Yuan Can <yuancan@huawei.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c b/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c index 53b077b40d729..ee0456b5e14e4 100644 --- a/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c +++ b/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c @@ -51,13 +51,6 @@ #define LAST_RECORD_TYPE 0xff #define SMU9_SYSPLL0_ID 0 -struct i2c_id_config_access { - uint8_t bfI2C_LineMux:4; - uint8_t bfHW_EngineID:3; - uint8_t bfHW_Capable:1; - uint8_t ucAccess; -}; - static enum bp_result get_gpio_i2c_info(struct bios_parser *bp, struct atom_i2c_record *record, struct graphics_object_i2c_info *info); -- GitLab From 312b4dc11d4f74bfe03ea25ffe04c1f2fdd13cb9 Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam@amd.com> Date: Tue, 4 Oct 2022 07:33:39 -0700 Subject: [PATCH 1504/2223] drm/amdgpu: Fix VRAM BO swap issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DRM buddy manager allocates the contiguous memory requests in a single block or multiple blocks. 
So for the ttm move operation (in case of low VRAM memory) we should consider all the blocks to compute the total memory size which is compared with the struct ttm_resource num_pages in order to verify that the blocks are contiguous for the eviction process. v2: Added a Fixes tag v3: Rewrite the code to save a bit of calculations and variables (Christian) Fixes: c9cad937c0c5 ("drm/amdgpu: add drm buddy support to amdgpu") Signed-off-by: Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index b1c455329023a..dc262d2c2925e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -424,8 +424,9 @@ error: static bool amdgpu_mem_visible(struct amdgpu_device *adev, struct ttm_resource *mem) { - uint64_t mem_size = (u64)mem->num_pages << PAGE_SHIFT; + u64 mem_size = (u64)mem->num_pages << PAGE_SHIFT; struct amdgpu_res_cursor cursor; + u64 end; if (mem->mem_type == TTM_PL_SYSTEM || mem->mem_type == TTM_PL_TT) @@ -434,12 +435,18 @@ static bool amdgpu_mem_visible(struct amdgpu_device *adev, return false; amdgpu_res_first(mem, 0, mem_size, &cursor); + end = cursor.start + cursor.size; + while (cursor.remaining) { + amdgpu_res_next(&cursor, cursor.size); - /* ttm_resource_ioremap only supports contiguous memory */ - if (cursor.size != mem_size) - return false; + /* ttm_resource_ioremap only supports contiguous memory */ + if (end != cursor.start) + return false; + + end = cursor.start + cursor.size; + } - return cursor.start + cursor.size <= adev->gmc.visible_vram_size; + return end <= adev->gmc.visible_vram_size; } /* -- GitLab From 9a3c6067bd2ee2ca2652fbb0679f422f3c9109f9 Mon Sep 17 00:00:00 2001 From: Philip
Yang <Philip.Yang@amd.com> Date: Mon, 3 Oct 2022 13:03:26 -0400 Subject: [PATCH 1505/2223] drm/amdgpu: Set vmbo destroy after pt bo is created MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Under VRAM usage pressure, map to GPU may fail to create pt bo and vmbo->shadow_list is not initialized, then ttm_bo_release calling amdgpu_bo_vm_destroy to access vmbo->shadow_list generates below dmesg and NULL pointer access backtrace: Set vmbo destroy callback to amdgpu_bo_vm_destroy only after creating pt bo successfully, otherwise use default callback amdgpu_bo_destroy. amdgpu: amdgpu_vm_bo_update failed amdgpu: update_gpuvm_pte() failed amdgpu: Failed to map bo to gpuvm amdgpu 0000:43:00.0: amdgpu: Failed to map peer:0000:43:00.0 mem_domain:2 BUG: kernel NULL pointer dereference, address: RIP: 0010:amdgpu_bo_vm_destroy+0x4d/0x80 [amdgpu] Call Trace: <TASK> ttm_bo_release+0x207/0x320 [amdttm] amdttm_bo_init_reserved+0x1d6/0x210 [amdttm] amdgpu_bo_create+0x1ba/0x520 [amdgpu] amdgpu_bo_create_vm+0x3a/0x80 [amdgpu] amdgpu_vm_pt_create+0xde/0x270 [amdgpu] amdgpu_vm_ptes_update+0x63b/0x710 [amdgpu] amdgpu_vm_update_range+0x2e7/0x6e0 [amdgpu] amdgpu_vm_bo_update+0x2bd/0x600 [amdgpu] update_gpuvm_pte+0x160/0x420 [amdgpu] amdgpu_amdkfd_gpuvm_map_memory_to_gpu+0x313/0x1130 [amdgpu] kfd_ioctl_map_memory_to_gpu+0x115/0x390 [amdgpu] kfd_ioctl+0x24a/0x5b0 [amdgpu] Signed-off-by: Philip Yang <Philip.Yang@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index e6a9b9fc9e0bb..2e8f6cd7a7293 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -688,13 +688,16 @@ int amdgpu_bo_create_vm(struct amdgpu_device *adev, * num of
amdgpu_vm_pt entries. */ BUG_ON(bp->bo_ptr_size < sizeof(struct amdgpu_bo_vm)); - bp->destroy = &amdgpu_bo_vm_destroy; r = amdgpu_bo_create(adev, bp, &bo_ptr); if (r) return r; *vmbo_ptr = to_amdgpu_bo_vm(bo_ptr); INIT_LIST_HEAD(&(*vmbo_ptr)->shadow_list); + /* Set destroy callback to amdgpu_bo_vm_destroy after vmbo->shadow_list + * is initialized. + */ + bo_ptr->tbo.destroy = &amdgpu_bo_vm_destroy; return r; } -- GitLab From 2302d507149f0ae7cc697089ab5675a2d4cf9d2a Mon Sep 17 00:00:00 2001 From: Philip Yang <Philip.Yang@amd.com> Date: Mon, 3 Oct 2022 17:53:25 -0400 Subject: [PATCH 1506/2223] drm/amdgpu: Correct amdgpu_amdkfd_total_mem_size calculation amdkfd_total_mem_size is the size of total GPUs vram plus system memory to estimate page tables memory usage and leave enough VRAM room for page tables allocation. Calculating amdkfd_total_mem_size in amdgpu_amdkfd_device_probe is incorrect because adev->gmc.real_vram_size is still 0 when called from amdgpu_device_ip_early_init. Move the calculation to amdgpu_amdkfd_device_init to get the correct VRAM size. Do reverse calculation in amdgpu_amdkfd_device_fini_sw to support hot-unplugging GPUs.
Signed-off-by: Philip Yang <Philip.Yang@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 9e98f3866edca..03bbfaa51cbcb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -75,9 +75,6 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev) return; adev->kfd.dev = kgd2kfd_probe(adev, vf); - - if (adev->kfd.dev) - amdgpu_amdkfd_total_mem_size += adev->gmc.real_vram_size; } /** @@ -201,6 +198,8 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) adev->kfd.init_complete = kgd2kfd_device_init(adev->kfd.dev, adev_to_drm(adev), &gpu_resources); + amdgpu_amdkfd_total_mem_size += adev->gmc.real_vram_size; + INIT_WORK(&adev->kfd.reset_work, amdgpu_amdkfd_reset_work); } } @@ -210,6 +209,7 @@ void amdgpu_amdkfd_device_fini_sw(struct amdgpu_device *adev) if (adev->kfd.dev) { kgd2kfd_device_exit(adev->kfd.dev); adev->kfd.dev = NULL; + amdgpu_amdkfd_total_mem_size -= adev->gmc.real_vram_size; } } -- GitLab From 17d819e2828cacca2e4c909044eb9798ed379cd2 Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz <hamza.mahfooz@amd.com> Date: Wed, 5 Oct 2022 11:30:38 -0400 Subject: [PATCH 1507/2223] Revert "drm/amdgpu: use dirty framebuffer helper" This reverts commit 66f99628eb24409cb8feb5061f78283c8b65f820. Unfortunately, that commit causes performance regressions on non-PSR setups. So, just revert it until FB_DAMAGE_CLIPS support can be added. 
Cc: stable@vger.kernel.org Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2189 Link: https://bugzilla.kernel.org/show_bug.cgi?id=216554 Fixes: 66f99628eb2440 ("drm/amdgpu: use dirty framebuffer helper") Fixes: abbc7a3dafb91b ("drm/amdgpu: don't register a dirty callback for non-atomic") Signed-off-by: Hamza Mahfooz <hamza.mahfooz@amd.com> Acked-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c index 23998f727c7f9..1a06b8d724f39 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c @@ -38,8 +38,6 @@ #include <linux/pci.h> #include <linux/pm_runtime.h> #include <drm/drm_crtc_helper.h> -#include <drm/drm_damage_helper.h> -#include <drm/drm_drv.h> #include <drm/drm_edid.h> #include <drm/drm_gem_framebuffer_helper.h> #include <drm/drm_fb_helper.h> @@ -500,12 +498,6 @@ static const struct drm_framebuffer_funcs amdgpu_fb_funcs = { .create_handle = drm_gem_fb_create_handle, }; -static const struct drm_framebuffer_funcs amdgpu_fb_funcs_atomic = { - .destroy = drm_gem_fb_destroy, - .create_handle = drm_gem_fb_create_handle, - .dirty = drm_atomic_helper_dirtyfb, -}; - uint32_t amdgpu_display_supported_domains(struct amdgpu_device *adev, uint64_t bo_flags) { @@ -1108,10 +1100,8 @@ static int amdgpu_display_gem_fb_verify_and_init(struct drm_device *dev, if (ret) goto err; - if (drm_drv_uses_atomic_modeset(dev)) - ret = drm_framebuffer_init(dev, &rfb->base, &amdgpu_fb_funcs_atomic); - else - ret = drm_framebuffer_init(dev, &rfb->base, &amdgpu_fb_funcs); + ret = drm_framebuffer_init(dev, &rfb->base, &amdgpu_fb_funcs); + if (ret) goto err; -- GitLab From 34bec35cbbb23e5fd18100f2a2b217ebb6cb129c Mon Sep 17 00:00:00 2001 From: Carsten Haitzler 
<carsten.haitzler@arm.com> Date: Fri, 9 Sep 2022 16:27:52 +0100 Subject: [PATCH 1508/2223] perf test: Add build infra for perf test tools for ARM CoreSight tests This adds the initial build infrastructure (makefiles maintainers information) for adding follow-on tests for CoreSight. Committer notes: Remove the installation of tests/shell/coresight/*.sh, as there are no files there yet and thus, at this point, make install fails. Use $(QUIET_CLEAN) to avoid having extraneous output in the 'make clean' output. Also use @$(MAKE) in tools/perf/tests/shell/coresight/Makefile as $(Q) is not turning into @ when V=1 isn't used, i.e. in the default case it is not being quiet. The >/dev/null in the all for tools/perf/tests/shell/coresight/Makefile is to avoid this: make[4]: Nothing to be done for 'all'. make[4]: Nothing to be done for 'all'. make[4]: Nothing to be done for 'all'. DESCEND plugins GEN /tmp/build/perf/python/perf.so make[4]: Nothing to be done for 'all'. INSTALL trace_plugins On !arm64 where nothing is done on the main target for tools/perf/tests/shell/coresight/*/Makefile. 
Signed-off-by: Carsten Haitzler <carsten.haitzler@arm.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: coresight@lists.linaro.org Link: http://lore.kernel.org/lkml/20220909152803.2317006-3-carsten.haitzler@foss.arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- MAINTAINERS | 1 + tools/perf/Makefile.config | 2 ++ tools/perf/Makefile.perf | 17 ++++++++++--- tools/perf/tests/shell/coresight/Makefile | 25 +++++++++++++++++++ .../tests/shell/coresight/Makefile.miniconfig | 14 +++++++++++ 5 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 tools/perf/tests/shell/coresight/Makefile create mode 100644 tools/perf/tests/shell/coresight/Makefile.miniconfig diff --git a/MAINTAINERS b/MAINTAINERS index 66ef84d8b304a..caddb789b31ec 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2027,6 +2027,7 @@ F: drivers/hwtracing/coresight/* F: include/dt-bindings/arm/coresight-cti-dt.h F: include/linux/coresight* F: samples/coresight/* +F: tools/perf/tests/shell/coresight/* F: tools/perf/arch/arm/util/auxtrace.c F: tools/perf/arch/arm/util/cs-etm.c F: tools/perf/arch/arm/util/cs-etm.h diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index c7c188ba1a4bc..6fd4b1384b975 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -1296,6 +1296,8 @@ perf_examples_instdir_SQ = $(subst ','\'',$(perf_examples_instdir)) STRACE_GROUPS_INSTDIR_SQ = $(subst ','\'',$(STRACE_GROUPS_INSTDIR)) tip_instdir_SQ = $(subst ','\'',$(tip_instdir)) +export perfexec_instdir_SQ + # If we install to $(HOME) we keep the traceevent default: # $(HOME)/.traceevent/plugins # Otherwise we install plugins into the global $(libdir). 
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index bd947885a639b..194e582e70c2c 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -629,7 +629,16 @@ sync_file_range_tbls := $(srctree)/tools/perf/trace/beauty/sync_file_range.sh $(sync_file_range_arrays): $(linux_uapi_dir)/fs.h $(sync_file_range_tbls) $(Q)$(SHELL) '$(sync_file_range_tbls)' $(linux_uapi_dir) > $@ -all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS) +TESTS_CORESIGHT_DIR := $(srctree)/tools/perf/tests/shell/coresight + +tests-coresight-targets: FORCE + $(Q)$(MAKE) -C $(TESTS_CORESIGHT_DIR) + +tests-coresight-targets-clean: + $(call QUIET_CLEAN, coresight) + $(Q)$(MAKE) -C $(TESTS_CORESIGHT_DIR) O=$(OUTPUT) clean >/dev/null + +all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS) tests-coresight-targets # Create python binding output directory if not already present _dummy := $(shell [ -d '$(OUTPUT)python' ] || mkdir -p '$(OUTPUT)python') @@ -1006,7 +1015,9 @@ install-tests: all install-gtk $(INSTALL) tests/shell/*.sh '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell'; \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/lib'; \ $(INSTALL) tests/shell/lib/*.sh -m 644 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/lib'; \ - $(INSTALL) tests/shell/lib/*.py -m 644 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/lib' + $(INSTALL) tests/shell/lib/*.py -m 644 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/lib'; \ + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/coresight' + $(Q)$(MAKE) -C tests/shell/coresight install-tests install-bin: install-tools install-tests install-traceevent-plugins @@ -1077,7 +1088,7 @@ endif # BUILD_BPF_SKEL bpf-skel-clean: $(call QUIET_CLEAN, bpf-skel) $(RM) -r $(SKEL_TMP_OUT) $(SKELETONS) -clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBPERF)-clean fixdep-clean python-clean bpf-skel-clean +clean:: 
$(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBPERF)-clean fixdep-clean python-clean bpf-skel-clean tests-coresight-targets-clean $(call QUIET_CLEAN, core-objs) $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-iostat $(LANG_BINDINGS) $(Q)find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)$(RM) $(OUTPUT).config-detected diff --git a/tools/perf/tests/shell/coresight/Makefile b/tools/perf/tests/shell/coresight/Makefile new file mode 100644 index 0000000000000..c24271972c672 --- /dev/null +++ b/tools/perf/tests/shell/coresight/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Carsten Haitzler <carsten.haitzler@arm.com>, 2021 +include ../../../../../tools/scripts/Makefile.include +include ../../../../../tools/scripts/Makefile.arch +include ../../../../../tools/scripts/utilities.mak + +SUBDIRS = + +all: $(SUBDIRS) +$(SUBDIRS): + @$(MAKE) -C $@ >/dev/null + +INSTALLDIRS = $(SUBDIRS:%=install-%) + +install-tests: $(INSTALLDIRS) +$(INSTALLDIRS): + @$(MAKE) -C $(@:install-%=%) install-tests >/dev/null + +CLEANDIRS = $(SUBDIRS:%=clean-%) + +clean: $(CLEANDIRS) +$(CLEANDIRS): + $(call QUIET_CLEAN, test-$(@:clean-%=%)) $(Q)$(MAKE) -C $(@:clean-%=%) clean >/dev/null + +.PHONY: all clean $(SUBDIRS) $(CLEANDIRS) $(INSTALLDIRS) diff --git a/tools/perf/tests/shell/coresight/Makefile.miniconfig b/tools/perf/tests/shell/coresight/Makefile.miniconfig new file mode 100644 index 0000000000000..5f72a9cb43f36 --- /dev/null +++ b/tools/perf/tests/shell/coresight/Makefile.miniconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Carsten Haitzler <carsten.haitzler@arm.com>, 2021 + +ifndef DESTDIR +prefix ?= $(HOME) +endif + +DESTDIR_SQ = $(subst ','\'',$(DESTDIR)) +INSTALL = install +INSTDIR_SUB = tests/shell/coresight + +include ../../../../../scripts/Makefile.include +include ../../../../../scripts/Makefile.arch +include ../../../../../scripts/utilities.mak -- GitLab From 
8b97519711c3a0f9eb8274a227dff3fe4f0f72a2 Mon Sep 17 00:00:00 2001 From: Carsten Haitzler <carsten.haitzler@arm.com> Date: Fri, 9 Sep 2022 16:27:53 +0100 Subject: [PATCH 1509/2223] perf test: Add asm pureloop test tool Add test tool to be driven by further test scripts. This tool is pure arm64 ASM with no libc usage to ensure it is the same exact binary/code every time so it can also be re-used for many uses. It just loops for a given fixed number of loops. Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Carsten Haitzler <carsten.haitzler@arm.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: coresight@lists.linaro.org Link: https://lore.kernel.org/r/20220909152803.2317006-4-carsten.haitzler@foss.arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/coresight/Makefile | 3 +- .../shell/coresight/asm_pure_loop/.gitignore | 1 + .../shell/coresight/asm_pure_loop/Makefile | 34 +++++++++++++++++++ .../coresight/asm_pure_loop/asm_pure_loop.S | 28 +++++++++++++++ 4 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 tools/perf/tests/shell/coresight/asm_pure_loop/.gitignore create mode 100644 tools/perf/tests/shell/coresight/asm_pure_loop/Makefile create mode 100644 tools/perf/tests/shell/coresight/asm_pure_loop/asm_pure_loop.S diff --git a/tools/perf/tests/shell/coresight/Makefile b/tools/perf/tests/shell/coresight/Makefile index c24271972c672..e3cea2fb4df8d 100644 --- a/tools/perf/tests/shell/coresight/Makefile +++ b/tools/perf/tests/shell/coresight/Makefile @@ -4,7 +4,8 @@ include ../../../../../tools/scripts/Makefile.include include ../../../../../tools/scripts/Makefile.arch include ../../../../../tools/scripts/utilities.mak -SUBDIRS = +SUBDIRS = \ + asm_pure_loop all: $(SUBDIRS) $(SUBDIRS): diff --git a/tools/perf/tests/shell/coresight/asm_pure_loop/.gitignore 
b/tools/perf/tests/shell/coresight/asm_pure_loop/.gitignore new file mode 100644 index 0000000000000..468673ac32e87 --- /dev/null +++ b/tools/perf/tests/shell/coresight/asm_pure_loop/.gitignore @@ -0,0 +1 @@ +asm_pure_loop diff --git a/tools/perf/tests/shell/coresight/asm_pure_loop/Makefile b/tools/perf/tests/shell/coresight/asm_pure_loop/Makefile new file mode 100644 index 0000000000000..206849e92bc93 --- /dev/null +++ b/tools/perf/tests/shell/coresight/asm_pure_loop/Makefile @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: GPL-2.0 +# Carsten Haitzler <carsten.haitzler@arm.com>, 2021 + +include ../Makefile.miniconfig + +# Binary to produce +BIN=asm_pure_loop +# Any linking/libraries needed for the binary - empty if none needed +LIB= + +all: $(BIN) + +$(BIN): $(BIN).S +ifdef CORESIGHT +ifeq ($(ARCH),arm64) +# Build line - this is raw asm with no libc to have an always exact binary + $(Q)$(CC) $(BIN).S -nostdlib -static -o $(BIN) $(LIB) +endif +endif + +install-tests: all +ifdef CORESIGHT +ifeq ($(ARCH),arm64) +# Install the test tool in the right place + $(call QUIET_INSTALL, tests) \ + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$(INSTDIR_SUB)/$(BIN)'; \ + $(INSTALL) $(BIN) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$(INSTDIR_SUB)/$(BIN)/$(BIN)' +endif +endif + +clean: + $(Q)$(RM) -f $(BIN) + +.PHONY: all clean install-tests diff --git a/tools/perf/tests/shell/coresight/asm_pure_loop/asm_pure_loop.S b/tools/perf/tests/shell/coresight/asm_pure_loop/asm_pure_loop.S new file mode 100644 index 0000000000000..75cf084a927d3 --- /dev/null +++ b/tools/perf/tests/shell/coresight/asm_pure_loop/asm_pure_loop.S @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Tamas Zsoldos <tamas.zsoldos@arm.com>, 2021 */ + +.globl _start +_start: + mov x0, 0x0000ffff + mov x1, xzr +loop: + nop + nop + cbnz x1, noskip + nop + nop + adrp x2, skip + add x2, x2, :lo12:skip + br x2 + nop + nop +noskip: + nop + nop +skip: + sub x0, x0, 1 + cbnz x0, loop + + mov x0, #0 + mov x8, #93 
// __NR_exit syscall + svc #0 -- GitLab From fdc25cc59c7126952f04beafb0de6143a7fa574d Mon Sep 17 00:00:00 2001 From: Carsten Haitzler <carsten.haitzler@arm.com> Date: Fri, 9 Sep 2022 16:27:54 +0100 Subject: [PATCH 1510/2223] perf test: Add arm64 asm pureloop test shell script Add a script to drive the asm pureloop test for arm64/CoreSight that gathers data so it passes a minimum bar for amount and quality of content that we extract from the kernel's perf support. Committer notes: Add the install of tests/shell/coresight/*.sh to tools/perf/Makefile.perf as we're starting to populate that dir. Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Carsten Haitzler <carsten.haitzler@arm.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: coresight@lists.linaro.org Link: https://lore.kernel.org/r/20220909152803.2317006-5-carsten.haitzler@foss.arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/Makefile.perf | 3 ++- .../tests/shell/coresight/asm_pure_loop.sh | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100755 tools/perf/tests/shell/coresight/asm_pure_loop.sh diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 194e582e70c2c..a432e59afc42a 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -1016,7 +1016,8 @@ install-tests: all install-gtk $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/lib'; \ $(INSTALL) tests/shell/lib/*.sh -m 644 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/lib'; \ $(INSTALL) tests/shell/lib/*.py -m 644 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/lib'; \ - $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/coresight' + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/coresight' ; \ + $(INSTALL) tests/shell/coresight/*.sh 
'$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/coresight' $(Q)$(MAKE) -C tests/shell/coresight install-tests install-bin: install-tools install-tests install-traceevent-plugins diff --git a/tools/perf/tests/shell/coresight/asm_pure_loop.sh b/tools/perf/tests/shell/coresight/asm_pure_loop.sh new file mode 100755 index 0000000000000..569e9d46162bc --- /dev/null +++ b/tools/perf/tests/shell/coresight/asm_pure_loop.sh @@ -0,0 +1,18 @@ +#!/bin/sh -e +# CoreSight / ASM Pure Loop + +# SPDX-License-Identifier: GPL-2.0 +# Carsten Haitzler <carsten.haitzler@arm.com>, 2021 + +TEST="asm_pure_loop" +. $(dirname $0)/../lib/coresight.sh +ARGS="" +DATV="out" +DATA="$DATD/perf-$TEST-$DATV.data" + +perf record $PERFRECOPT -o "$DATA" "$BIN" $ARGS + +perf_dump_aux_verify "$DATA" 10 10 10 + +err=$? +exit $err -- GitLab From 6ea586b1e3dc56e831488f1c80acdf21f301b6b6 Mon Sep 17 00:00:00 2001 From: Carsten Haitzler <carsten.haitzler@arm.com> Date: Fri, 9 Sep 2022 16:27:55 +0100 Subject: [PATCH 1511/2223] perf test: Add git ignore for perf data generated by the ARM CoreSight tests Ignore perf output data files generated by perf tests for cleaner git status. 
Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Carsten Haitzler <carsten.haitzler@arm.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: coresight@lists.linaro.org Link: https://lore.kernel.org/r/20220909152803.2317006-6-carsten.haitzler@foss.arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/.gitignore | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore index 4b9c71faa01ad..faa23b5d32f55 100644 --- a/tools/perf/.gitignore +++ b/tools/perf/.gitignore @@ -15,8 +15,8 @@ perf*.1 perf*.xml perf*.html common-cmds.h -perf.data -perf.data.old +perf*.data +perf*.data.old output.svg perf-archive perf-iostat -- GitLab From f1288bdb6d48324e46406230ebd70190ec815fbb Mon Sep 17 00:00:00 2001 From: Carsten Haitzler <carsten.haitzler@arm.com> Date: Fri, 9 Sep 2022 16:27:56 +0100 Subject: [PATCH 1512/2223] perf test coresight: Add memcpy thread test tool Add test tool to be driven by further test scripts. This is a simple C based memcpy with threads test to drive from scripts. 
Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Carsten Haitzler <carsten.haitzler@arm.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: coresight@lists.linaro.org Link: https://lore.kernel.org/r/20220909152803.2317006-7-carsten.haitzler@foss.arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/coresight/Makefile | 3 +- .../shell/coresight/memcpy_thread/.gitignore | 1 + .../shell/coresight/memcpy_thread/Makefile | 33 ++++++++ .../coresight/memcpy_thread/memcpy_thread.c | 79 +++++++++++++++++++ 4 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 tools/perf/tests/shell/coresight/memcpy_thread/.gitignore create mode 100644 tools/perf/tests/shell/coresight/memcpy_thread/Makefile create mode 100644 tools/perf/tests/shell/coresight/memcpy_thread/memcpy_thread.c diff --git a/tools/perf/tests/shell/coresight/Makefile b/tools/perf/tests/shell/coresight/Makefile index e3cea2fb4df8d..78f5d5c3ebdae 100644 --- a/tools/perf/tests/shell/coresight/Makefile +++ b/tools/perf/tests/shell/coresight/Makefile @@ -5,7 +5,8 @@ include ../../../../../tools/scripts/Makefile.arch include ../../../../../tools/scripts/utilities.mak SUBDIRS = \ - asm_pure_loop + asm_pure_loop \ + memcpy_thread all: $(SUBDIRS) $(SUBDIRS): diff --git a/tools/perf/tests/shell/coresight/memcpy_thread/.gitignore b/tools/perf/tests/shell/coresight/memcpy_thread/.gitignore new file mode 100644 index 0000000000000..f8217e56091ef --- /dev/null +++ b/tools/perf/tests/shell/coresight/memcpy_thread/.gitignore @@ -0,0 +1 @@ +memcpy_thread diff --git a/tools/perf/tests/shell/coresight/memcpy_thread/Makefile b/tools/perf/tests/shell/coresight/memcpy_thread/Makefile new file mode 100644 index 0000000000000..2db637eb2c261 --- /dev/null +++ b/tools/perf/tests/shell/coresight/memcpy_thread/Makefile @@ -0,0 +1,33 @@ +# 
SPDX-License-Identifier: GPL-2.0 +# Carsten Haitzler <carsten.haitzler@arm.com>, 2021 +include ../Makefile.miniconfig + +# Binary to produce +BIN=memcpy_thread +# Any linking/libraries needed for the binary - empty if none needed +LIB=-pthread + +all: $(BIN) + +$(BIN): $(BIN).c +ifdef CORESIGHT +ifeq ($(ARCH),arm64) +# Build line + $(Q)$(CC) $(BIN).c -o $(BIN) $(LIB) +endif +endif + +install-tests: all +ifdef CORESIGHT +ifeq ($(ARCH),arm64) +# Install the test tool in the right place + $(call QUIET_INSTALL, tests) \ + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$(INSTDIR_SUB)/$(BIN)'; \ + $(INSTALL) $(BIN) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$(INSTDIR_SUB)/$(BIN)/$(BIN)' +endif +endif + +clean: + $(Q)$(RM) -f $(BIN) + +.PHONY: all clean install-tests diff --git a/tools/perf/tests/shell/coresight/memcpy_thread/memcpy_thread.c b/tools/perf/tests/shell/coresight/memcpy_thread/memcpy_thread.c new file mode 100644 index 0000000000000..a7e169d1bf645 --- /dev/null +++ b/tools/perf/tests/shell/coresight/memcpy_thread/memcpy_thread.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 +// Carsten Haitzler <carsten.haitzler@arm.com>, 2021 +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <pthread.h> + +struct args { + unsigned long loops; + unsigned long size; + pthread_t th; + void *ret; +}; + +static void *thrfn(void *arg) +{ + struct args *a = arg; + unsigned long i, len = a->loops; + unsigned char *src, *dst; + + src = malloc(a->size * 1024); + dst = malloc(a->size * 1024); + if ((!src) || (!dst)) { + printf("ERR: Can't allocate memory\n"); + exit(1); + } + for (i = 0; i < len; i++) + memcpy(dst, src, a->size * 1024); +} + +static pthread_t new_thr(void *(*fn) (void *arg), void *arg) +{ + pthread_t t; + pthread_attr_t attr; + + pthread_attr_init(&attr); + pthread_create(&t, &attr, fn, arg); + return t; +} + +int main(int argc, char **argv) +{ + unsigned long i, len, size, thr; + pthread_t threads[256]; + struct 
args args[256]; + long long v; + + if (argc < 4) { + printf("ERR: %s [copysize Kb] [numthreads] [numloops (hundreds)]\n", argv[0]); + exit(1); + } + + v = atoll(argv[1]); + if ((v < 1) || (v > (1024 * 1024))) { + printf("ERR: max memory 1GB (1048576 KB)\n"); + exit(1); + } + size = v; + thr = atol(argv[2]); + if ((thr < 1) || (thr > 256)) { + printf("ERR: threads 1-256\n"); + exit(1); + } + v = atoll(argv[3]); + if ((v < 1) || (v > 40000000000ll)) { + printf("ERR: loops 1-40000000000 (hundreds)\n"); + exit(1); + } + len = v * 100; + for (i = 0; i < thr; i++) { + args[i].loops = len; + args[i].size = size; + args[i].th = new_thr(thrfn, &(args[i])); + } + for (i = 0; i < thr; i++) + pthread_join(args[i].th, &(args[i].ret)); + return 0; +} -- GitLab From b76692fea7f29f80b88d52dcfa9f994b76337988 Mon Sep 17 00:00:00 2001 From: Carsten Haitzler <carsten.haitzler@arm.com> Date: Fri, 9 Sep 2022 16:27:57 +0100 Subject: [PATCH 1513/2223] perf test coresight: Add memcpy thread test shell script Add a script to drive the threaded memcpy test that gathers data so it passes a minimum bar for amount and quality of content that we extract from the kernel's perf support. 
Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Carsten Haitzler <carsten.haitzler@arm.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: coresight@lists.linaro.org Link: https://lore.kernel.org/r/20220909152803.2317006-8-carsten.haitzler@foss.arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../shell/coresight/memcpy_thread_16k_10.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100755 tools/perf/tests/shell/coresight/memcpy_thread_16k_10.sh diff --git a/tools/perf/tests/shell/coresight/memcpy_thread_16k_10.sh b/tools/perf/tests/shell/coresight/memcpy_thread_16k_10.sh new file mode 100755 index 0000000000000..d21ba8545938d --- /dev/null +++ b/tools/perf/tests/shell/coresight/memcpy_thread_16k_10.sh @@ -0,0 +1,18 @@ +#!/bin/sh -e +# CoreSight / Memcpy 16k 10 Threads + +# SPDX-License-Identifier: GPL-2.0 +# Carsten Haitzler <carsten.haitzler@arm.com>, 2021 + +TEST="memcpy_thread" +. $(dirname $0)/../lib/coresight.sh +ARGS="16 10 1" +DATV="16k_10" +DATA="$DATD/perf-$TEST-$DATV.data" + +perf record $PERFRECOPT -o "$DATA" "$BIN" $ARGS + +perf_dump_aux_verify "$DATA" 10 10 10 + +err=$? +exit $err -- GitLab From e9664b96c6c0d7f3bac45d1141afacec45725169 Mon Sep 17 00:00:00 2001 From: Carsten Haitzler <carsten.haitzler@arm.com> Date: Fri, 9 Sep 2022 16:27:58 +0100 Subject: [PATCH 1514/2223] perf test coresight: Add thread loop test tool Add test tool to be driven by further test scripts. This is a simple C based loop with threads test to drive from scripts that can output TIDs for tracking/checking. 
Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Carsten Haitzler <carsten.haitzler@arm.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: coresight@lists.linaro.org Link: https://lore.kernel.org/r/20220909152803.2317006-9-carsten.haitzler@foss.arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/coresight/Makefile | 3 +- .../shell/coresight/thread_loop/.gitignore | 1 + .../shell/coresight/thread_loop/Makefile | 33 +++++++ .../shell/coresight/thread_loop/thread_loop.c | 86 +++++++++++++++++++ 4 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 tools/perf/tests/shell/coresight/thread_loop/.gitignore create mode 100644 tools/perf/tests/shell/coresight/thread_loop/Makefile create mode 100644 tools/perf/tests/shell/coresight/thread_loop/thread_loop.c diff --git a/tools/perf/tests/shell/coresight/Makefile b/tools/perf/tests/shell/coresight/Makefile index 78f5d5c3ebdae..db83cad8a02a9 100644 --- a/tools/perf/tests/shell/coresight/Makefile +++ b/tools/perf/tests/shell/coresight/Makefile @@ -6,7 +6,8 @@ include ../../../../../tools/scripts/utilities.mak SUBDIRS = \ asm_pure_loop \ - memcpy_thread + memcpy_thread \ + thread_loop all: $(SUBDIRS) $(SUBDIRS): diff --git a/tools/perf/tests/shell/coresight/thread_loop/.gitignore b/tools/perf/tests/shell/coresight/thread_loop/.gitignore new file mode 100644 index 0000000000000..6d4c33eaa9e89 --- /dev/null +++ b/tools/perf/tests/shell/coresight/thread_loop/.gitignore @@ -0,0 +1 @@ +thread_loop diff --git a/tools/perf/tests/shell/coresight/thread_loop/Makefile b/tools/perf/tests/shell/coresight/thread_loop/Makefile new file mode 100644 index 0000000000000..ea846c038e7ac --- /dev/null +++ b/tools/perf/tests/shell/coresight/thread_loop/Makefile @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: GPL-2.0 +# Carsten Haitzler <carsten.haitzler@arm.com>, 
2021 +include ../Makefile.miniconfig + +# Binary to produce +BIN=thread_loop +# Any linking/libraries needed for the binary - empty if none needed +LIB=-pthread + +all: $(BIN) + +$(BIN): $(BIN).c +ifdef CORESIGHT +ifeq ($(ARCH),arm64) +# Build line + $(Q)$(CC) $(BIN).c -o $(BIN) $(LIB) +endif +endif + +install-tests: all +ifdef CORESIGHT +ifeq ($(ARCH),arm64) +# Install the test tool in the right place + $(call QUIET_INSTALL, tests) \ + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$(INSTDIR_SUB)/$(BIN)'; \ + $(INSTALL) $(BIN) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$(INSTDIR_SUB)/$(BIN)/$(BIN)' +endif +endif + +clean: + $(Q)$(RM) -f $(BIN) + +.PHONY: all clean install-tests diff --git a/tools/perf/tests/shell/coresight/thread_loop/thread_loop.c b/tools/perf/tests/shell/coresight/thread_loop/thread_loop.c new file mode 100644 index 0000000000000..c0158fac7d0b0 --- /dev/null +++ b/tools/perf/tests/shell/coresight/thread_loop/thread_loop.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +// Carsten Haitzler <carsten.haitzler@arm.com>, 2021 + +// define this for gettid() +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <pthread.h> +#include <sys/syscall.h> +#ifndef SYS_gettid +// gettid is 178 on arm64 +# define SYS_gettid 178 +#endif +#define gettid() syscall(SYS_gettid) + +struct args { + unsigned int loops; + pthread_t th; + void *ret; +}; + +static void *thrfn(void *arg) +{ + struct args *a = arg; + int i = 0, len = a->loops; + + if (getenv("SHOW_TID")) { + unsigned long long tid = gettid(); + + printf("%llu\n", tid); + } + asm volatile( + "loop:\n" + "add %[i], %[i], #1\n" + "cmp %[i], %[len]\n" + "blt loop\n" + : /* out */ + : /* in */ [i] "r" (i), [len] "r" (len) + : /* clobber */ + ); + return (void *)(long)i; +} + +static pthread_t new_thr(void *(*fn) (void *arg), void *arg) +{ + pthread_t t; + pthread_attr_t attr; + + pthread_attr_init(&attr); + pthread_create(&t, &attr, fn, 
arg); + return t; +} + +int main(int argc, char **argv) +{ + unsigned int i, len, thr; + pthread_t threads[256]; + struct args args[256]; + + if (argc < 3) { + printf("ERR: %s [numthreads] [numloops (millions)]\n", argv[0]); + exit(1); + } + + thr = atoi(argv[1]); + if ((thr < 1) || (thr > 256)) { + printf("ERR: threads 1-256\n"); + exit(1); + } + len = atoi(argv[2]); + if ((len < 1) || (len > 4000)) { + printf("ERR: max loops 4000 (millions)\n"); + exit(1); + } + len *= 1000000; + for (i = 0; i < thr; i++) { + args[i].loops = len; + args[i].th = new_thr(thrfn, &(args[i])); + } + for (i = 0; i < thr; i++) + pthread_join(args[i].th, &(args[i].ret)); + return 0; +} -- GitLab From 74c62b8d6161678b72854d95e47b41782dfec39a Mon Sep 17 00:00:00 2001 From: Carsten Haitzler <carsten.haitzler@arm.com> Date: Fri, 9 Sep 2022 16:27:59 +0100 Subject: [PATCH 1515/2223] perf test coresight: Add thread loop test shell scripts Add a script to drive the thread loop test that gathers data so it passes a minimum bar (in this case do we get any perf context data for every thread). 
Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Carsten Haitzler <carsten.haitzler@arm.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: coresight@lists.linaro.org Link: https://lore.kernel.org/r/20220909152803.2317006-10-carsten.haitzler@foss.arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../coresight/thread_loop_check_tid_10.sh | 19 +++++++++++++++++++ .../coresight/thread_loop_check_tid_2.sh | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100755 tools/perf/tests/shell/coresight/thread_loop_check_tid_10.sh create mode 100755 tools/perf/tests/shell/coresight/thread_loop_check_tid_2.sh diff --git a/tools/perf/tests/shell/coresight/thread_loop_check_tid_10.sh b/tools/perf/tests/shell/coresight/thread_loop_check_tid_10.sh new file mode 100755 index 0000000000000..7c13636fc7785 --- /dev/null +++ b/tools/perf/tests/shell/coresight/thread_loop_check_tid_10.sh @@ -0,0 +1,19 @@ +#!/bin/sh -e +# CoreSight / Thread Loop 10 Threads - Check TID + +# SPDX-License-Identifier: GPL-2.0 +# Carsten Haitzler <carsten.haitzler@arm.com>, 2021 + +TEST="thread_loop" +. $(dirname $0)/../lib/coresight.sh +ARGS="10 1" +DATV="check-tid-10th" +DATA="$DATD/perf-$TEST-$DATV.data" +STDO="$DATD/perf-$TEST-$DATV.stdout" + +SHOW_TID=1 perf record -s $PERFRECOPT -o "$DATA" "$BIN" $ARGS > $STDO + +perf_dump_aux_tid_verify "$DATA" "$STDO" + +err=$? +exit $err diff --git a/tools/perf/tests/shell/coresight/thread_loop_check_tid_2.sh b/tools/perf/tests/shell/coresight/thread_loop_check_tid_2.sh new file mode 100755 index 0000000000000..a067145af43ce --- /dev/null +++ b/tools/perf/tests/shell/coresight/thread_loop_check_tid_2.sh @@ -0,0 +1,19 @@ +#!/bin/sh -e +# CoreSight / Thread Loop 2 Threads - Check TID + +# SPDX-License-Identifier: GPL-2.0 +# Carsten Haitzler <carsten.haitzler@arm.com>, 2021 + +TEST="thread_loop" +. 
$(dirname $0)/../lib/coresight.sh +ARGS="2 20" +DATV="check-tid-2th" +DATA="$DATD/perf-$TEST-$DATV.data" +STDO="$DATD/perf-$TEST-$DATV.stdout" + +SHOW_TID=1 perf record -s $PERFRECOPT -o "$DATA" "$BIN" $ARGS > $STDO + +perf_dump_aux_tid_verify "$DATA" "$STDO" + +err=$? +exit $err -- GitLab From fc0a0ea039802e1546c19b8514fd2efee43d10b5 Mon Sep 17 00:00:00 2001 From: Carsten Haitzler <carsten.haitzler@arm.com> Date: Fri, 9 Sep 2022 16:28:00 +0100 Subject: [PATCH 1516/2223] perf test coresight: Add unroll thread test tool Add test tool to be driven by further test scripts. This is a simple C based test that is for arm64 with some inline ASM to manually unroll a lot of code to have a very long sequence of commands. Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Carsten Haitzler <carsten.haitzler@arm.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: coresight@lists.linaro.org Link: https://lore.kernel.org/r/20220909152803.2317006-11-carsten.haitzler@foss.arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/coresight/Makefile | 3 +- .../coresight/unroll_loop_thread/.gitignore | 1 + .../coresight/unroll_loop_thread/Makefile | 33 +++++++++ .../unroll_loop_thread/unroll_loop_thread.c | 74 +++++++++++++++++++ 4 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 tools/perf/tests/shell/coresight/unroll_loop_thread/.gitignore create mode 100644 tools/perf/tests/shell/coresight/unroll_loop_thread/Makefile create mode 100644 tools/perf/tests/shell/coresight/unroll_loop_thread/unroll_loop_thread.c diff --git a/tools/perf/tests/shell/coresight/Makefile b/tools/perf/tests/shell/coresight/Makefile index db83cad8a02a9..b070e779703e9 100644 --- a/tools/perf/tests/shell/coresight/Makefile +++ b/tools/perf/tests/shell/coresight/Makefile @@ -7,7 +7,8 @@ include 
../../../../../tools/scripts/utilities.mak SUBDIRS = \ asm_pure_loop \ memcpy_thread \ - thread_loop + thread_loop \ + unroll_loop_thread all: $(SUBDIRS) $(SUBDIRS): diff --git a/tools/perf/tests/shell/coresight/unroll_loop_thread/.gitignore b/tools/perf/tests/shell/coresight/unroll_loop_thread/.gitignore new file mode 100644 index 0000000000000..2cb4e996dbf3a --- /dev/null +++ b/tools/perf/tests/shell/coresight/unroll_loop_thread/.gitignore @@ -0,0 +1 @@ +unroll_loop_thread diff --git a/tools/perf/tests/shell/coresight/unroll_loop_thread/Makefile b/tools/perf/tests/shell/coresight/unroll_loop_thread/Makefile new file mode 100644 index 0000000000000..6264c4e3abd15 --- /dev/null +++ b/tools/perf/tests/shell/coresight/unroll_loop_thread/Makefile @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: GPL-2.0 +# Carsten Haitzler <carsten.haitzler@arm.com>, 2021 +include ../Makefile.miniconfig + +# Binary to produce +BIN=unroll_loop_thread +# Any linking/libraries needed for the binary - empty if none needed +LIB=-pthread + +all: $(BIN) + +$(BIN): $(BIN).c +ifdef CORESIGHT +ifeq ($(ARCH),arm64) +# Build line + $(Q)$(CC) $(BIN).c -o $(BIN) $(LIB) +endif +endif + +install-tests: all +ifdef CORESIGHT +ifeq ($(ARCH),arm64) +# Install the test tool in the right place + $(call QUIET_INSTALL, tests) \ + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$(INSTDIR_SUB)/$(BIN)'; \ + $(INSTALL) $(BIN) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$(INSTDIR_SUB)/$(BIN)/$(BIN)' +endif +endif + +clean: + $(Q)$(RM) -f $(BIN) + +.PHONY: all clean install-tests diff --git a/tools/perf/tests/shell/coresight/unroll_loop_thread/unroll_loop_thread.c b/tools/perf/tests/shell/coresight/unroll_loop_thread/unroll_loop_thread.c new file mode 100644 index 0000000000000..8f6d384208ed9 --- /dev/null +++ b/tools/perf/tests/shell/coresight/unroll_loop_thread/unroll_loop_thread.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +// Carsten Haitzler <carsten.haitzler@arm.com>, 2021 +#include <stdio.h> 
+#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <pthread.h> + +struct args { + pthread_t th; + unsigned int in; + void *ret; +}; + +static void *thrfn(void *arg) +{ + struct args *a = arg; + unsigned int i, in = a->in; + + for (i = 0; i < 10000; i++) { + asm volatile ( +// force an unroll of thia add instruction so we can test long runs of code +#define SNIP1 "add %[in], %[in], #1\n" +// 10 +#define SNIP2 SNIP1 SNIP1 SNIP1 SNIP1 SNIP1 SNIP1 SNIP1 SNIP1 SNIP1 SNIP1 +// 100 +#define SNIP3 SNIP2 SNIP2 SNIP2 SNIP2 SNIP2 SNIP2 SNIP2 SNIP2 SNIP2 SNIP2 +// 1000 +#define SNIP4 SNIP3 SNIP3 SNIP3 SNIP3 SNIP3 SNIP3 SNIP3 SNIP3 SNIP3 SNIP3 +// 10000 +#define SNIP5 SNIP4 SNIP4 SNIP4 SNIP4 SNIP4 SNIP4 SNIP4 SNIP4 SNIP4 SNIP4 +// 100000 + SNIP5 SNIP5 SNIP5 SNIP5 SNIP5 SNIP5 SNIP5 SNIP5 SNIP5 SNIP5 + : /* out */ + : /* in */ [in] "r" (in) + : /* clobber */ + ); + } +} + +static pthread_t new_thr(void *(*fn) (void *arg), void *arg) +{ + pthread_t t; + pthread_attr_t attr; + + pthread_attr_init(&attr); + pthread_create(&t, &attr, fn, arg); + return t; +} + +int main(int argc, char **argv) +{ + unsigned int i, thr; + pthread_t threads[256]; + struct args args[256]; + + if (argc < 2) { + printf("ERR: %s [numthreads]\n", argv[0]); + exit(1); + } + + thr = atoi(argv[1]); + if ((thr > 256) || (thr < 1)) { + printf("ERR: threads 1-256\n"); + exit(1); + } + for (i = 0; i < thr; i++) { + args[i].in = rand(); + args[i].th = new_thr(thrfn, &(args[i])); + } + for (i = 0; i < thr; i++) + pthread_join(args[i].th, &(args[i].ret)); + return 0; +} -- GitLab From b65c6477f6bb1147e34164bb8580138daa12ddab Mon Sep 17 00:00:00 2001 From: Carsten Haitzler <carsten.haitzler@arm.com> Date: Fri, 9 Sep 2022 16:28:01 +0100 Subject: [PATCH 1517/2223] perf test coresight: Add unroll thread test shell script This adds scripts to drive the unroll thread tests to compare perf output against a minimum bar of content/quality. 
Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Carsten Haitzler <carsten.haitzler@arm.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: coresight@lists.linaro.org Link: https://lore.kernel.org/r/20220909152803.2317006-12-carsten.haitzler@foss.arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../shell/coresight/unroll_loop_thread_10.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100755 tools/perf/tests/shell/coresight/unroll_loop_thread_10.sh diff --git a/tools/perf/tests/shell/coresight/unroll_loop_thread_10.sh b/tools/perf/tests/shell/coresight/unroll_loop_thread_10.sh new file mode 100755 index 0000000000000..f48c85230b155 --- /dev/null +++ b/tools/perf/tests/shell/coresight/unroll_loop_thread_10.sh @@ -0,0 +1,18 @@ +#!/bin/sh -e +# CoreSight / Unroll Loop Thread 10 + +# SPDX-License-Identifier: GPL-2.0 +# Carsten Haitzler <carsten.haitzler@arm.com>, 2021 + +TEST="unroll_loop_thread" +. $(dirname $0)/../lib/coresight.sh +ARGS="10" +DATV="10" +DATA="$DATD/perf-$TEST-$DATV.data" + +perf record $PERFRECOPT -o "$DATA" "$BIN" $ARGS + +perf_dump_aux_verify "$DATA" 10 10 10 + +err=$? +exit $err -- GitLab From 43c688cb32412e4466d0be614e9d6003ae4cf451 Mon Sep 17 00:00:00 2001 From: Carsten Haitzler <carsten.haitzler@arm.com> Date: Fri, 9 Sep 2022 16:28:02 +0100 Subject: [PATCH 1518/2223] perf test: Add git ignore for tmp and output files of ARM CoreSight tests Ignore other output files of the new CoreSight tests so they don't fill git status with noise we don't need or want. 
Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Carsten Haitzler <carsten.haitzler@arm.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: coresight@lists.linaro.org Link: https://lore.kernel.org/r/20220909152803.2317006-13-carsten.haitzler@foss.arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/.gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore index faa23b5d32f55..a653311d96938 100644 --- a/tools/perf/.gitignore +++ b/tools/perf/.gitignore @@ -22,6 +22,7 @@ perf-archive perf-iostat tags TAGS +stats-*.csv cscope* config.mak config.mak.autogen @@ -29,6 +30,7 @@ config.mak.autogen *-flex.* *.pyc *.pyo +*.stdout .config-detected util/intel-pt-decoder/inat-tables.c arch/*/include/generated/ -- GitLab From dc2e0fb00bb2b24f0b6c4877c34bb1d288d31fb2 Mon Sep 17 00:00:00 2001 From: Carsten Haitzler <carsten.haitzler@arm.com> Date: Fri, 9 Sep 2022 16:28:03 +0100 Subject: [PATCH 1519/2223] perf test coresight: Add relevant documentation about ARM64 CoreSight testing Add/improve documentation helping people get started with CoreSight and perf as well as describe the testing and how it works. 
Reviewed-by: James Clark <james.clark@arm.com> Signed-off-by: Carsten Haitzler <carsten.haitzler@arm.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: coresight@lists.linaro.org Cc: linux-doc@vger.kernel.org Link: https://lore.kernel.org/r/20220909152803.2317006-14-carsten.haitzler@foss.arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- .../trace/coresight/coresight-perf.rst | 158 ++++++++++++++++++ .../perf/Documentation/perf-arm-coresight.txt | 5 + 2 files changed, 163 insertions(+) create mode 100644 Documentation/trace/coresight/coresight-perf.rst create mode 100644 tools/perf/Documentation/perf-arm-coresight.txt diff --git a/Documentation/trace/coresight/coresight-perf.rst b/Documentation/trace/coresight/coresight-perf.rst new file mode 100644 index 0000000000000..d087aae7d4928 --- /dev/null +++ b/Documentation/trace/coresight/coresight-perf.rst @@ -0,0 +1,158 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================ +CoreSight - Perf +================ + + :Author: Carsten Haitzler <carsten.haitzler@arm.com> + :Date: June 29th, 2022 + +Perf is able to locally access CoreSight trace data and store it to the +output perf data files. This data can then be later decoded to give the +instructions that were traced for debugging or profiling purposes. You +can log such data with a perf record command like:: + + perf record -e cs_etm//u testbinary + +This would run some test binary (testbinary) until it exits and record +a perf.data trace file. That file would have AUX sections if CoreSight +is working correctly. You can dump the content of this file as +readable text with a command like:: + + perf report --stdio --dump -i perf.data + +You should find some sections of this file have AUX data blocks like:: + + 0x1e78 [0x30]: PERF_RECORD_AUXTRACE size: 0x11dd0 offset: 0 ref: 0x1b614fc1061b0ad1 idx: 0 tid: 531230 cpu: -1 + + . 
... CoreSight ETM Trace data: size 73168 bytes + Idx:0; ID:10; I_ASYNC : Alignment Synchronisation. + Idx:12; ID:10; I_TRACE_INFO : Trace Info.; INFO=0x0 { CC.0 } + Idx:17; ID:10; I_ADDR_L_64IS0 : Address, Long, 64 bit, IS0.; Addr=0x0000000000000000; + Idx:26; ID:10; I_TRACE_ON : Trace On. + Idx:27; ID:10; I_ADDR_CTXT_L_64IS0 : Address & Context, Long, 64 bit, IS0.; Addr=0x0000FFFFB6069140; Ctxt: AArch64,EL0, NS; + Idx:38; ID:10; I_ATOM_F6 : Atom format 6.; EEEEEEEEEEEEEEEEEEEEEEEE + Idx:39; ID:10; I_ATOM_F6 : Atom format 6.; EEEEEEEEEEEEEEEEEEEEEEEE + Idx:40; ID:10; I_ATOM_F6 : Atom format 6.; EEEEEEEEEEEEEEEEEEEEEEEE + Idx:41; ID:10; I_ATOM_F6 : Atom format 6.; EEEEEEEEEEEN + ... + +If you see these above, then your system is tracing CoreSight data +correctly. + +To compile perf with CoreSight support in the tools/perf directory do:: + + make CORESIGHT=1 + +This requires OpenCSD to build. You may install distribution packages +for the support such as libopencsd and libopencsd-dev or download it +and build yourself. Upstream OpenCSD is located at: + + https://github.com/Linaro/OpenCSD + +For complete information on building perf with CoreSight support and +more extensive usage look at: + + https://github.com/Linaro/OpenCSD/blob/master/HOWTO.md + + +Kernel CoreSight Support +------------------------ + +You will also want CoreSight support enabled in your kernel config. +Ensure it is enabled with:: + + CONFIG_CORESIGHT=y + +There are various other CoreSight options you probably also want +enabled like:: + + CONFIG_CORESIGHT_LINKS_AND_SINKS=y + CONFIG_CORESIGHT_LINK_AND_SINK_TMC=y + CONFIG_CORESIGHT_CATU=y + CONFIG_CORESIGHT_SINK_TPIU=y + CONFIG_CORESIGHT_SINK_ETBV10=y + CONFIG_CORESIGHT_SOURCE_ETM4X=y + CONFIG_CORESIGHT_CTI=y + CONFIG_CORESIGHT_CTI_INTEGRATION_REGS=y + +Please refer to the kernel configuration help for more information. 
+ +Perf test - Verify kernel and userspace perf CoreSight work +----------------------------------------------------------- + +When you run perf test, it will do a lot of self tests. Some of those +tests will cover CoreSight (only if enabled and on ARM64). You +generally would run perf test from the tools/perf directory in the +kernel tree. Some tests will check some internal perf support like: + + Check Arm CoreSight trace data recording and synthesized samples + Check Arm SPE trace data recording and synthesized samples + +Some others will actually use perf record and some test binaries that +are in tests/shell/coresight and will collect traces to ensure a +minimum level of functionality is met. The scripts that launch these +tests are in the same directory. These will all look like: + + CoreSight / ASM Pure Loop + CoreSight / Memcpy 16k 10 Threads + CoreSight / Thread Loop 10 Threads - Check TID + etc. + +These perf record tests will not run if the tool binaries do not exist +in tests/shell/coresight/\*/ and will be skipped. If you do not have +CoreSight support in hardware then either do not build perf with +CoreSight support or remove these binaries in order to not have these +tests fail and have them skip instead. + +These tests will log historical results in the current working +directory (e.g. tools/perf) and will be named stats-\*.csv like: + + stats-asm_pure_loop-out.csv + stats-memcpy_thread-16k_10.csv + ... + +These statistic files log some aspects of the AUX data sections in +the perf data output counting some numbers of certain encodings (a +good way to know that it's working in a very simple way). One problem +with CoreSight is that given a large enough amount of data needing to +be logged, some of it can be lost due to the processor not waking up +in time to read out all the data from buffers etc. You will notice +that the amount of data collected can vary a lot per run of perf test. 
+If you wish to see how this changes over time, simply run perf test +multiple times and all these csv files will have more and more data +appended to them that you can later examine, graph and otherwise use to +figure out if things have become worse or better. + +This means sometimes these tests fail as they don't capture all the +data needed. This is about tracking quality and amount of data +produced over time and to see when changes to the Linux kernel improve +quality of traces. + +Be aware that some of these tests take quite a while to run, specifically +in processing the perf data file and dumping contents to then examine what +is inside. + +You can change where these csv logs are stored by setting the +PERF_TEST_CORESIGHT_STATDIR environment variable before running perf +test like:: + + export PERF_TEST_CORESIGHT_STATDIR=/var/tmp + perf test + +They will also store resulting perf output data in the current +directory for later inspection like:: + + perf-asm_pure_loop-out.data + perf-memcpy_thread-16k_10.data + ... + +You can alter where the perf data files are stored by setting the +PERF_TEST_CORESIGHT_DATADIR environment variable such as:: + + export PERF_TEST_CORESIGHT_DATADIR=/var/tmp + perf test + +You may wish to set these above environment variables if you wish to +keep the output of tests outside of the current working directory for +longer term storage and examination. diff --git a/tools/perf/Documentation/perf-arm-coresight.txt b/tools/perf/Documentation/perf-arm-coresight.txt new file mode 100644 index 0000000000000..c117fc50a2a95 --- /dev/null +++ b/tools/perf/Documentation/perf-arm-coresight.txt @@ -0,0 +1,5 @@ +Arm CoreSight Support +===================== + +For full documentation, see Documentation/trace/coresight/coresight-perf.rst +in the kernel tree. 
-- GitLab From cad3b68954134f6b871e76d9b39354e8d9a53db5 Mon Sep 17 00:00:00 2001 From: Athira Rajeev <atrajeev@linux.vnet.ibm.com> Date: Thu, 6 Oct 2022 17:12:25 +0530 Subject: [PATCH 1520/2223] perf stat: Fix cpu check to use id.cpu.cpu in aggr_printout() 'perf stat' has options to aggregate the counts in different modes like per socket, per core etc. The function "aggr_printout" in util/stat-display.c which is used to print the aggregates, has a check for cpu in case of AGGR_NONE. This check was originally using condition : "if (id.cpu.cpu > -1)". But this got changed after commit df936cadfb58 ("perf stat: Add JSON output option"), which added option to output json format for different aggregation modes. After this commit, the check in "aggr_printout" is using "if (id.core > -1)". The old code was using "id.cpu.cpu > -1" while the new code is using "id.core > -1". But since the value printed is id.cpu.cpu, fix this check to use cpu and not core. Suggested-by: Ian Rogers <irogers@google.com> Suggested-by: James Clark <james.clark@arm.com> Signed-off-by: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Tested-by: Ian Rogers <irogers@google.com> Cc: Disha Goel <disgoel@linux.vnet.ibm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: Madhavan Srinivasan <maddy@linux.vnet.ibm.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Nageswara R Sastry <rnsastry@linux.ibm.com> Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20221006114225.66303-1-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/stat-display.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index df26fb5eb072b..5c47ee9963a7c 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -168,7 +168,7 @@ static void aggr_printout(struct perf_stat_config *config, id.socket, id.die, id.core); - } else if 
(id.core > -1) { + } else if (id.cpu.cpu > -1) { fprintf(config->output, "\"cpu\" : \"%d\", ", id.cpu.cpu); } @@ -179,7 +179,7 @@ static void aggr_printout(struct perf_stat_config *config, id.die, config->csv_output ? 0 : -3, id.core, config->csv_sep); - } else if (id.core > -1) { + } else if (id.cpu.cpu > -1) { fprintf(config->output, "CPU%*d%s", config->csv_output ? 0 : -7, id.cpu.cpu, config->csv_sep); -- GitLab From b7ddd38ccc723f0dca68151baed1e6c07c2a6005 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria <ravi.bangoria@amd.com> Date: Thu, 6 Oct 2022 21:09:39 +0530 Subject: [PATCH 1521/2223] tools headers UAPI: Sync include/uapi/linux/perf_event.h header with the kernel Two new fields for mem_lvl_num has been introduced: PERF_MEM_LVLNUM_IO and PERF_MEM_LVLNUM_CXL which are required to support perf mem/c2c on AMD platform. Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com> Acked-by: Jiri Olsa <jolsa@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ali Saidi <alisaidi@amazon.com> Cc: Ananth Narayan <ananth.narayan@amd.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: H. 
Peter Anvin <hpa@zytor.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Joe Mario <jmario@redhat.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sandipan Das <sandipan.das@amd.com> Cc: Santosh Shukla <santosh.shukla@amd.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: x86@kernel.org Link: https://lore.kernel.org/r/20221006153946.7816-2-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/include/uapi/linux/perf_event.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 49cb2355efc0c..ea6defacc1a7d 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -1327,7 +1327,9 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -/* 5-0xa available */ +/* 5-0x8 available */ +#define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ +#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ #define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ #define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ -- GitLab From 160ae99365abeac216aeaa3407dce6cf038037e1 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria <ravi.bangoria@amd.com> Date: Thu, 6 Oct 2022 21:09:40 +0530 Subject: [PATCH 1522/2223] perf amd ibs: Sync arch/x86/include/asm/amd-ibs.h header with the kernel Although new details added into this header is currently used by kernel only, tools copy needs to be in sync with kernel file to avoid tools/perf/check-headers.sh warnings. 
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com> Acked-by: Jiri Olsa <jolsa@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ali Saidi <alisaidi@amazon.com> Cc: Ananth Narayan <ananth.narayan@amd.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Joe Mario <jmario@redhat.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sandipan Das <sandipan.das@amd.com> Cc: Santosh Shukla <santosh.shukla@amd.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: x86@kernel.org Link: https://lore.kernel.org/r/20221006153946.7816-3-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/arch/x86/include/asm/amd-ibs.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/arch/x86/include/asm/amd-ibs.h b/tools/arch/x86/include/asm/amd-ibs.h index 9a3312e12e2ed..93807b437e4de 100644 --- a/tools/arch/x86/include/asm/amd-ibs.h +++ b/tools/arch/x86/include/asm/amd-ibs.h @@ -6,6 +6,22 @@ #include "msr-index.h" +/* IBS_OP_DATA2 DataSrc */ +#define IBS_DATA_SRC_LOC_CACHE 2 +#define IBS_DATA_SRC_DRAM 3 +#define IBS_DATA_SRC_REM_CACHE 4 +#define IBS_DATA_SRC_IO 7 + +/* IBS_OP_DATA2 DataSrc Extension */ +#define IBS_DATA_SRC_EXT_LOC_CACHE 1 +#define IBS_DATA_SRC_EXT_NEAR_CCX_CACHE 2 +#define IBS_DATA_SRC_EXT_DRAM 3 +#define IBS_DATA_SRC_EXT_FAR_CCX_CACHE 5 +#define IBS_DATA_SRC_EXT_PMEM 6 +#define IBS_DATA_SRC_EXT_IO 7 +#define IBS_DATA_SRC_EXT_EXT_MEM 8 +#define IBS_DATA_SRC_EXT_PEER_AGENT_MEM 12 + /* * IBS Hardware MSRs */ -- GitLab From 923396f6827d00ef18c1bf589551e5a604191261 Mon Sep 17 00:00:00 2001 
From: Ravi Bangoria <ravi.bangoria@amd.com> Date: Thu, 6 Oct 2022 21:09:41 +0530 Subject: [PATCH 1523/2223] perf mem: Add support for printing PERF_MEM_LVLNUM_{CXL|IO} Add support for printing these new fields in perf mem report. Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com> Acked-by: Jiri Olsa <jolsa@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ali Saidi <alisaidi@amazon.com> Cc: Ananth Narayan <ananth.narayan@amd.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Joe Mario <jmario@redhat.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sandipan Das <sandipan.das@amd.com> Cc: Santosh Shukla <santosh.shukla@amd.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: x86@kernel.org Link: https://lore.kernel.org/r/20221006153946.7816-4-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/mem-events.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index 764883183519e..8909dc7b14a71 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -294,6 +294,8 @@ static const char * const mem_lvl[] = { }; static const char * const mem_lvlnum[] = { + [PERF_MEM_LVLNUM_CXL] = "CXL", + [PERF_MEM_LVLNUM_IO] = "I/O", [PERF_MEM_LVLNUM_ANY_CACHE] = "Any cache", [PERF_MEM_LVLNUM_LFB] = "LFB", [PERF_MEM_LVLNUM_RAM] = "RAM", -- GitLab From 4173cc055dc92f199a43775775e54dc7fafd37b6 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria <ravi.bangoria@amd.com> Date: Thu, 6 Oct 2022 21:09:42 +0530 Subject: [PATCH 1524/2223] 
perf mem/c2c: Set PERF_SAMPLE_WEIGHT for LOAD_STORE events Currently perf sets PERF_SAMPLE_WEIGHT flag only for mem load events. Set it for combined load-store event as well which will enable recording of load latency by default on arch that does not support independent mem load event. Also document missing -W in perf-record man page. Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com> Acked-by: Jiri Olsa <jolsa@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ali Saidi <alisaidi@amazon.com> Cc: Ananth Narayan <ananth.narayan@amd.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Joe Mario <jmario@redhat.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sandipan Das <sandipan.das@amd.com> Cc: Santosh Shukla <santosh.shukla@amd.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: x86@kernel.org Link: https://lore.kernel.org/r/20221006153946.7816-5-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/Documentation/perf-record.txt | 1 + tools/perf/builtin-c2c.c | 1 + tools/perf/builtin-mem.c | 1 + 3 files changed, 3 insertions(+) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 378f497f4be32..e41ae950fdc3b 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -411,6 +411,7 @@ is enabled for all the sampling events. 
The sampled branch type is the same for The various filters must be specified as a comma separated list: --branch-filter any_ret,u,k Note that this feature may not be available on all processors. +-W:: --weight:: Enable weightened sampling. An additional weight is recorded per sample and can be displayed with the weight and local_weight sort keys. This currently works for TSX diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index f35a47b2dbe49..a9190458d2d50 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -3281,6 +3281,7 @@ static int perf_c2c__record(int argc, const char **argv) */ if (e->tag) { e->record = true; + rec_argv[i++] = "-W"; } else { e = perf_mem_events__ptr(PERF_MEM_EVENTS__LOAD); e->record = true; diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 9e435fd235032..f7dd8216de72e 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -122,6 +122,7 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) (mem->operation & MEM_OPERATION_LOAD) && (mem->operation & MEM_OPERATION_STORE)) { e->record = true; + rec_argv[i++] = "-W"; } else { if (mem->operation & MEM_OPERATION_LOAD) { e = perf_mem_events__ptr(PERF_MEM_EVENTS__LOAD); -- GitLab From f7b58cbdb3ff36eba8622e67eee66c10dd1c9995 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria <ravi.bangoria@amd.com> Date: Thu, 6 Oct 2022 21:09:43 +0530 Subject: [PATCH 1525/2223] perf mem/c2c: Add load store event mappings for AMD The 'perf mem' and 'perf c2c' tools are wrappers around 'perf record' with mem load/ store events. IBS tagged load/store sample provides most of the information needed for these tools. Wire in the "ibs_op//" event as mem-ldst event for AMD. There are some limitations though: Only load/store micro-ops provide mem/c2c information. Whereas, IBS does not have a way to choose a particular type of micro-op to tag. This results in many non-LS micro-ops being tagged which appear as N/A in the perf report. 
IBS, being an uncore pmu from kernel point of view[1], does not support per process monitoring. Thus, perf mem/c2c on AMD are currently supported in per-cpu mode only. Example: $ sudo perf mem record -- -c 10000 ^C[ perf record: Woken up 227 times to write data ] [ perf record: Captured and wrote 58.760 MB perf.data (836978 samples) ] $ sudo perf mem report -F mem,sample,snoop Samples: 836K of event 'ibs_op//', Event count (approx.): 8418762 Memory access Samples Snoop N/A 700620 N/A L1 hit 126675 N/A L2 hit 424 N/A L3 hit 664 HitM L3 hit 10 N/A Local RAM hit 2 N/A Remote RAM (1 hop) hit 8558 N/A Remote Cache (1 hop) hit 3 N/A Remote Cache (1 hop) hit 2 HitM Remote Cache (2 hops) hit 10 HitM Remote Cache (2 hops) hit 6 N/A Uncached hit 4 N/A $ [1]: https://lore.kernel.org/lkml/20220829113347.295-1-ravi.bangoria@amd.com Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com> Acked-by: Jiri Olsa <jolsa@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ali Saidi <alisaidi@amazon.com> Cc: Ananth Narayan <ananth.narayan@amd.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: H. 
Peter Anvin <hpa@zytor.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Joe Mario <jmario@redhat.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sandipan Das <sandipan.das@amd.com> Cc: Santosh Shukla <santosh.shukla@amd.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: x86@kernel.org Link: https://lore.kernel.org/r/20221006153946.7816-6-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/Documentation/perf-c2c.txt | 14 ++++++++---- tools/perf/Documentation/perf-mem.txt | 3 ++- tools/perf/arch/x86/util/mem-events.c | 31 +++++++++++++++++++++++++-- 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt index f1f7ae6b08d1e..5c5eb2def83e4 100644 --- a/tools/perf/Documentation/perf-c2c.txt +++ b/tools/perf/Documentation/perf-c2c.txt @@ -19,9 +19,10 @@ C2C stands for Cache To Cache. The perf c2c tool provides means for Shared Data C2C/HITM analysis. It allows you to track down the cacheline contentions. -On x86, the tool is based on load latency and precise store facility events +On Intel, the tool is based on load latency and precise store facility events provided by Intel CPUs. On PowerPC, the tool uses random instruction sampling -with thresholding feature. +with thresholding feature. On AMD, the tool uses IBS op pmu (due to hardware +limitations, perf c2c is not supported on Zen3 cpus). These events provide: - memory address of the access @@ -49,7 +50,8 @@ RECORD OPTIONS -l:: --ldlat:: - Configure mem-loads latency. (x86 only) + Configure mem-loads latency. Supported on Intel and Arm64 processors + only. Ignored on other archs. 
-k:: --all-kernel:: @@ -135,11 +137,15 @@ Following perf record options are configured by default: -W,-d,--phys-data,--sample-cpu Unless specified otherwise with '-e' option, following events are monitored by -default on x86: +default on Intel: cpu/mem-loads,ldlat=30/P cpu/mem-stores/P +following on AMD: + + ibs_op// + and following on PowerPC: cpu/mem-loads/ diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt index 66177511c5c4b..005c95580b1e6 100644 --- a/tools/perf/Documentation/perf-mem.txt +++ b/tools/perf/Documentation/perf-mem.txt @@ -85,7 +85,8 @@ RECORD OPTIONS Be more verbose (show counter open errors, etc) --ldlat <n>:: - Specify desired latency for loads event. (x86 only) + Specify desired latency for loads event. Supported on Intel and Arm64 + processors only. Ignored on other archs. In addition, for report all perf report options are valid, and for record all perf record options. diff --git a/tools/perf/arch/x86/util/mem-events.c b/tools/perf/arch/x86/util/mem-events.c index 5214370ca4e48..f683ac702247c 100644 --- a/tools/perf/arch/x86/util/mem-events.c +++ b/tools/perf/arch/x86/util/mem-events.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include "util/pmu.h" +#include "util/env.h" #include "map_symbol.h" #include "mem-events.h" +#include "linux/string.h" static char mem_loads_name[100]; static bool mem_loads_name__init; @@ -12,18 +14,43 @@ static char mem_stores_name[100]; #define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s } -static struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = { +static struct perf_mem_event perf_mem_events_intel[PERF_MEM_EVENTS__MAX] = { E("ldlat-loads", "%s/mem-loads,ldlat=%u/P", "%s/events/mem-loads"), E("ldlat-stores", "%s/mem-stores/P", "%s/events/mem-stores"), E(NULL, NULL, NULL), }; +static struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX] = { + E(NULL, NULL, NULL), + E(NULL, NULL, NULL), + E("mem-ldst", "ibs_op//", "ibs_op"), +}; + +static 
int perf_mem_is_amd_cpu(void) +{ + struct perf_env env = { .total_mem = 0, }; + + perf_env__cpuid(&env); + if (env.cpuid && strstarts(env.cpuid, "AuthenticAMD")) + return 1; + return -1; +} + struct perf_mem_event *perf_mem_events__ptr(int i) { + /* 0: Uninitialized, 1: Yes, -1: No */ + static int is_amd; + if (i >= PERF_MEM_EVENTS__MAX) return NULL; - return &perf_mem_events[i]; + if (!is_amd) + is_amd = perf_mem_is_amd_cpu(); + + if (is_amd == 1) + return &perf_mem_events_amd[i]; + + return &perf_mem_events_intel[i]; } bool is_mem_loads_aux_event(struct evsel *leader) -- GitLab From 2c5f652c442600cfd86fc2a7a7cfd8152f254971 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria <ravi.bangoria@amd.com> Date: Thu, 6 Oct 2022 21:09:44 +0530 Subject: [PATCH 1526/2223] perf mem/c2c: Avoid printing empty lines for unsupported events The 'perf mem' and 'perf c2c' tools can be used with 3 different events: load, store and combined load-store. Some architectures might support only partial set of events in which case, perf prints an empty line for unsupported events. Avoid that. Ex, AMD Zen cpus supports only combined load-store event and does not support individual load and store event. Before patch: $ perf mem record -e list mem-ldst : available $ After patch: $ perf mem record -e list mem-ldst : available $ Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com> Acked-by: Jiri Olsa <jolsa@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ali Saidi <alisaidi@amazon.com> Cc: Ananth Narayan <ananth.narayan@amd.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: H. 
Peter Anvin <hpa@zytor.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Joe Mario <jmario@redhat.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sandipan Das <sandipan.das@amd.com> Cc: Santosh Shukla <santosh.shukla@amd.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: x86@kernel.org Link: https://lore.kernel.org/r/20221006153946.7816-7-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/mem-events.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index 8909dc7b14a71..6c7feecd2e049 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -156,11 +156,12 @@ void perf_mem_events__list(void) for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) { struct perf_mem_event *e = perf_mem_events__ptr(j); - fprintf(stderr, "%-13s%-*s%s\n", - e->tag ?: "", - verbose > 0 ? 25 : 0, - verbose > 0 ? perf_mem_events__name(j, NULL) : "", - e->supported ? ": available" : ""); + fprintf(stderr, "%-*s%-*s%s", + e->tag ? 13 : 0, + e->tag ? : "", + e->tag && verbose > 0 ? 25 : 0, + e->tag && verbose > 0 ? perf_mem_events__name(j, NULL) : "", + e->supported ? ": available\n" : ""); } } -- GitLab From c72de11605c5e291981cd30225542169fb3da4df Mon Sep 17 00:00:00 2001 From: Ravi Bangoria <ravi.bangoria@amd.com> Date: Thu, 6 Oct 2022 21:09:45 +0530 Subject: [PATCH 1527/2223] perf mem: Print "LFB/MAB" for PERF_MEM_LVLNUM_LFB A hw component to track outstanding L1 Data Cache misses is called LFB (Line Fill Buffer) on Intel and Arm. However similar component exists on other arch with different names, for ex, it's called MAB (Miss Address Buffer) on AMD. 
Use 'LFB/MAB' instead of just 'LFB'. Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ali Saidi <alisaidi@amazon.com> Cc: Ananth Narayan <ananth.narayan@amd.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Joe Mario <jmario@redhat.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sandipan Das <sandipan.das@amd.com> Cc: Santosh Shukla <santosh.shukla@amd.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: x86@kernel.org Link: https://lore.kernel.org/r/20221006153946.7816-8-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/mem-events.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index 6c7feecd2e049..b3a91093069a5 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -282,7 +282,7 @@ static const char * const mem_lvl[] = { "HIT", "MISS", "L1", - "LFB", + "LFB/MAB", "L2", "L3", "Local RAM", @@ -298,7 +298,7 @@ static const char * const mem_lvlnum[] = { [PERF_MEM_LVLNUM_CXL] = "CXL", [PERF_MEM_LVLNUM_IO] = "I/O", [PERF_MEM_LVLNUM_ANY_CACHE] = "Any cache", - [PERF_MEM_LVLNUM_LFB] = "LFB", + [PERF_MEM_LVLNUM_LFB] = "LFB/MAB", [PERF_MEM_LVLNUM_RAM] = "RAM", [PERF_MEM_LVLNUM_PMEM] = "PMEM", [PERF_MEM_LVLNUM_NA] = "N/A", -- GitLab From d79310700590b8b40d8c867012d6c899ea6fd505 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria <ravi.bangoria@amd.com> Date: Thu, 6 Oct 2022 21:09:46 +0530 Subject: [PATCH 1528/2223] 
perf script: Add missing fields in usage hint A few fields are missing in the usage message printed when an unknown field option is passed. Add them to the list. Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com> Acked-by: Jiri Olsa <jolsa@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ali Saidi <alisaidi@amazon.com> Cc: Ananth Narayan <ananth.narayan@amd.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Joe Mario <jmario@redhat.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sandipan Das <sandipan.das@amd.com> Cc: Santosh Shukla <santosh.shukla@amd.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: x86@kernel.org Link: https://lore.kernel.org/r/20221006153946.7816-9-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-script.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 7fa467ed91dc7..7ca238277d835 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -3846,9 +3846,10 @@ int cmd_script(int argc, const char **argv) "Valid types: hw,sw,trace,raw,synth. 
" "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso," "addr,symoff,srcline,period,iregs,uregs,brstack," - "brstacksym,flags,bpf-output,brstackinsn,brstackinsnlen,brstackoff," - "callindent,insn,insnlen,synth,phys_addr,metric,misc,ipc,tod," - "data_page_size,code_page_size,ins_lat", + "brstacksym,flags,data_src,weight,bpf-output,brstackinsn," + "brstackinsnlen,brstackoff,callindent,insn,insnlen,synth," + "phys_addr,metric,misc,srccode,ipc,tod,data_page_size," + "code_page_size,ins_lat", parse_output_fields), OPT_BOOLEAN('a', "all-cpus", &system_wide, "system-wide collection from all CPUs"), -- GitLab From ef575281b21e9a34dfae544a187c6aac2ae424a9 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Date: Sat, 27 Aug 2022 00:27:46 +0900 Subject: [PATCH 1529/2223] 9p/trans_fd: always use O_NONBLOCK read/write syzbot is reporting hung task at p9_fd_close() [1], for p9_mux_poll_stop() from p9_conn_destroy() from p9_fd_close() is failing to interrupt already started kernel_read() from p9_fd_read() from p9_read_work() and/or kernel_write() from p9_fd_write() from p9_write_work() requests. Since p9_socket_open() sets O_NONBLOCK flag, p9_mux_poll_stop() does not need to interrupt kernel_read()/kernel_write(). However, since p9_fd_open() does not set O_NONBLOCK flag, but pipe blocks unless signal is pending, p9_mux_poll_stop() needs to interrupt kernel_read()/kernel_write() when the file descriptor refers to a pipe. In other words, pipe file descriptor needs to be handled as if socket file descriptor. We somehow need to interrupt kernel_read()/kernel_write() on pipes. A minimal change, which this patch is doing, is to set O_NONBLOCK flag from p9_fd_open(), for O_NONBLOCK flag does not affect reading/writing of regular files. But this approach changes O_NONBLOCK flag on userspace- supplied file descriptors (which might break userspace programs), and O_NONBLOCK flag could be changed by userspace. 
It would be possible to set O_NONBLOCK flag every time p9_fd_read()/p9_fd_write() is invoked, but a small race window for clearing the O_NONBLOCK flag would still remain. If we don't want to manipulate O_NONBLOCK flag, we might be able to surround kernel_read()/kernel_write() with set_thread_flag(TIF_SIGPENDING) and recalc_sigpending(). Since p9_read_work()/p9_write_work() works are processed by kernel threads which process global system_wq workqueue, signals could not be delivered from remote threads when p9_mux_poll_stop() from p9_conn_destroy() from p9_fd_close() is called. Therefore, calling set_thread_flag(TIF_SIGPENDING)/recalc_sigpending() every time would be needed if we count on signals for making kernel_read()/kernel_write() non-blocking. Link: https://lkml.kernel.org/r/345de429-a88b-7097-d177-adecf9fed342@I-love.SAKURA.ne.jp Link: https://syzkaller.appspot.com/bug?extid=8b41a1365f1106fd0f33 [1] Reported-by: syzbot <syzbot+8b41a1365f1106fd0f33@syzkaller.appspotmail.com> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Tested-by: syzbot <syzbot+8b41a1365f1106fd0f33@syzkaller.appspotmail.com> Reviewed-by: Christian Schoenebeck <linux_oss@crudebyte.com> [Dominique: add comment at Christian's suggestion] Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> --- net/9p/trans_fd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 25d422c473e8a..98732619d8394 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -822,11 +822,14 @@ static int p9_fd_open(struct p9_client *client, int rfd, int wfd) goto out_free_ts; if (!(ts->rd->f_mode & FMODE_READ)) goto out_put_rd; + /* prevent workers from hanging on IO when fd is a pipe */ + ts->rd->f_flags |= O_NONBLOCK; ts->wr = fget(wfd); if (!ts->wr) goto out_put_rd; if (!(ts->wr->f_mode & FMODE_WRITE)) goto out_put_wr; + ts->wr->f_flags |= O_NONBLOCK; client->trans = ts; client->status = Connected; -- GitLab From 2849752f36848359034616eb70dfc7fb14eb3cd4 Mon Sep 
17 00:00:00 2001 From: Juergen Gross <jgross@suse.com> Date: Thu, 6 Oct 2022 10:50:28 +0200 Subject: [PATCH 1530/2223] xen/pcifront: move xenstore config scanning into sub-function pcifront_try_connect() and pcifront_attach_devices() share a large chunk of duplicated code for reading the config information from Xenstore, which only differs regarding calling pcifront_rescan_root() or pcifront_scan_root(). Put that code into a new sub-function. It is fine to always call pcifront_rescan_root() from that common function, as it will fallback to pcifront_scan_root() if the domain/bus combination isn't known yet (and pcifront_scan_root() should never be called for an already known domain/bus combination anyway). In order to avoid duplicate messages for the fallback case move the check for domain/bus not known to the beginning of pcifront_rescan_root(). While at it fix the error reporting in case the root-xx node had the wrong format. As the return value of pcifront_try_connect() and pcifront_attach_devices() are not used anywhere make those functions return void. As an additional bonus this removes the dubious return of -EFAULT in case of an unexpected driver state. Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Jason Andryuk <jandryuk@gmail.com> Signed-off-by: Juergen Gross <jgross@suse.com> --- drivers/pci/xen-pcifront.c | 143 ++++++++++--------------------------- 1 file changed, 37 insertions(+), 106 deletions(-) diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index 77e61b4701218..7378e2f3e525f 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -521,24 +521,14 @@ static int pcifront_rescan_root(struct pcifront_device *pdev, int err; struct pci_bus *b; -#ifndef CONFIG_PCI_DOMAINS - if (domain != 0) { - dev_err(&pdev->xdev->dev, - "PCI Root in non-zero PCI Domain! 
domain=%d\n", domain); - dev_err(&pdev->xdev->dev, - "Please compile with CONFIG_PCI_DOMAINS\n"); - return -EINVAL; - } -#endif - - dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n", - domain, bus); - b = pci_find_bus(domain, bus); if (!b) /* If the bus is unknown, create it. */ return pcifront_scan_root(pdev, domain, bus); + dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n", + domain, bus); + err = pcifront_scan_bus(pdev, domain, bus, b); /* Claim resources before going "live" with our devices */ @@ -819,76 +809,73 @@ out: return err; } -static int pcifront_try_connect(struct pcifront_device *pdev) +static void pcifront_connect(struct pcifront_device *pdev) { - int err = -EFAULT; + int err; int i, num_roots, len; char str[64]; unsigned int domain, bus; - - /* Only connect once */ - if (xenbus_read_driver_state(pdev->xdev->nodename) != - XenbusStateInitialised) - goto out; - - err = pcifront_connect_and_init_dma(pdev); - if (err && err != -EEXIST) { - xenbus_dev_fatal(pdev->xdev, err, - "Error setting up PCI Frontend"); - goto out; - } - err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "root_num", "%d", &num_roots); if (err == -ENOENT) { xenbus_dev_error(pdev->xdev, err, "No PCI Roots found, trying 0000:00"); - err = pcifront_scan_root(pdev, 0, 0); + err = pcifront_rescan_root(pdev, 0, 0); if (err) { xenbus_dev_fatal(pdev->xdev, err, "Error scanning PCI root 0000:00"); - goto out; + return; } num_roots = 0; } else if (err != 1) { - if (err == 0) - err = -EINVAL; - xenbus_dev_fatal(pdev->xdev, err, + xenbus_dev_fatal(pdev->xdev, err >= 0 ? 
-EINVAL : err, "Error reading number of PCI roots"); - goto out; + return; } for (i = 0; i < num_roots; i++) { len = snprintf(str, sizeof(str), "root-%d", i); - if (unlikely(len >= (sizeof(str) - 1))) { - err = -ENOMEM; - goto out; - } + if (unlikely(len >= (sizeof(str) - 1))) + return; err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, "%x:%x", &domain, &bus); if (err != 2) { - if (err >= 0) - err = -EINVAL; - xenbus_dev_fatal(pdev->xdev, err, + xenbus_dev_fatal(pdev->xdev, err >= 0 ? -EINVAL : err, "Error reading PCI root %d", i); - goto out; + return; } - err = pcifront_scan_root(pdev, domain, bus); + err = pcifront_rescan_root(pdev, domain, bus); if (err) { xenbus_dev_fatal(pdev->xdev, err, "Error scanning PCI root %04x:%02x", domain, bus); - goto out; + return; } } - err = xenbus_switch_state(pdev->xdev, XenbusStateConnected); + xenbus_switch_state(pdev->xdev, XenbusStateConnected); +} -out: - return err; +static void pcifront_try_connect(struct pcifront_device *pdev) +{ + int err; + + /* Only connect once */ + if (xenbus_read_driver_state(pdev->xdev->nodename) != + XenbusStateInitialised) + return; + + err = pcifront_connect_and_init_dma(pdev); + if (err && err != -EEXIST) { + xenbus_dev_fatal(pdev->xdev, err, + "Error setting up PCI Frontend"); + return; + } + + pcifront_connect(pdev); } static int pcifront_try_disconnect(struct pcifront_device *pdev) @@ -914,67 +901,11 @@ out: return err; } -static int pcifront_attach_devices(struct pcifront_device *pdev) +static void pcifront_attach_devices(struct pcifront_device *pdev) { - int err = -EFAULT; - int i, num_roots, len; - unsigned int domain, bus; - char str[64]; - - if (xenbus_read_driver_state(pdev->xdev->nodename) != + if (xenbus_read_driver_state(pdev->xdev->nodename) == XenbusStateReconfiguring) - goto out; - - err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, - "root_num", "%d", &num_roots); - if (err == -ENOENT) { - xenbus_dev_error(pdev->xdev, err, - "No PCI Roots found, trying 0000:00"); - err = 
pcifront_rescan_root(pdev, 0, 0); - if (err) { - xenbus_dev_fatal(pdev->xdev, err, - "Error scanning PCI root 0000:00"); - goto out; - } - num_roots = 0; - } else if (err != 1) { - if (err == 0) - err = -EINVAL; - xenbus_dev_fatal(pdev->xdev, err, - "Error reading number of PCI roots"); - goto out; - } - - for (i = 0; i < num_roots; i++) { - len = snprintf(str, sizeof(str), "root-%d", i); - if (unlikely(len >= (sizeof(str) - 1))) { - err = -ENOMEM; - goto out; - } - - err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, - "%x:%x", &domain, &bus); - if (err != 2) { - if (err >= 0) - err = -EINVAL; - xenbus_dev_fatal(pdev->xdev, err, - "Error reading PCI root %d", i); - goto out; - } - - err = pcifront_rescan_root(pdev, domain, bus); - if (err) { - xenbus_dev_fatal(pdev->xdev, err, - "Error scanning PCI root %04x:%02x", - domain, bus); - goto out; - } - } - - xenbus_switch_state(pdev->xdev, XenbusStateConnected); - -out: - return err; + pcifront_connect(pdev); } static int pcifront_detach_devices(struct pcifront_device *pdev) -- GitLab From 87d1aa8b90d83b0084c7d9baadefaeaf928014c3 Mon Sep 17 00:00:00 2001 From: Wenjia Zhang <wenjia@linux.ibm.com> Date: Fri, 7 Oct 2022 08:54:36 +0200 Subject: [PATCH 1531/2223] MAINTAINERS: add Jan as SMC maintainer Add Jan as maintainer for Shared Memory Communications (SMC) Sockets. Acked-by: Jan Karcher <jaka@linux.ibm.com> Acked-by: Alexandra Winter <wintera@linux.ibm.com> Signed-off-by: Wenjia Zhang <wenjia@linux.ibm.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 9ca84cb5ab4a9..b7105db9fe6c6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18487,6 +18487,7 @@ F: drivers/misc/sgi-xp/ SHARED MEMORY COMMUNICATIONS (SMC) SOCKETS M: Karsten Graul <kgraul@linux.ibm.com> M: Wenjia Zhang <wenjia@linux.ibm.com> +M: Jan Karcher <jaka@linux.ibm.com> L: linux-s390@vger.kernel.org S: Supported W: http://www.ibm.com/developerworks/linux/linux390/ -- GitLab From 30393181fdbc1608cc683b4ee99dcce05ffcc8c7 Mon Sep 17 00:00:00 2001 From: Alexander Aring <aahringo@redhat.com> Date: Wed, 5 Oct 2022 22:02:37 -0400 Subject: [PATCH 1532/2223] net: ieee802154: return -EINVAL for unknown addr type This patch adds handling to return -EINVAL for an unknown addr type. The current behaviour is to return 0 as successful but the size of an unknown addr type is not defined and should return an error like -EINVAL. Fixes: 94160108a70c ("net/ieee802154: fix uninit value bug in dgram_sendmsg") Signed-off-by: Alexander Aring <aahringo@redhat.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- include/net/ieee802154_netdev.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/include/net/ieee802154_netdev.h b/include/net/ieee802154_netdev.h index a8994f307fc38..03b64bf876a46 100644 --- a/include/net/ieee802154_netdev.h +++ b/include/net/ieee802154_netdev.h @@ -185,21 +185,27 @@ static inline int ieee802154_sockaddr_check_size(struct sockaddr_ieee802154 *daddr, int len) { struct ieee802154_addr_sa *sa; + int ret = 0; sa = &daddr->addr; if (len < IEEE802154_MIN_NAMELEN) return -EINVAL; switch (sa->addr_type) { + case IEEE802154_ADDR_NONE: + break; case IEEE802154_ADDR_SHORT: if (len < IEEE802154_NAMELEN_SHORT) - return -EINVAL; + ret = -EINVAL; break; case IEEE802154_ADDR_LONG: if (len < IEEE802154_NAMELEN_LONG) - return -EINVAL; + ret = -EINVAL; + break; + default: + ret = -EINVAL; break; } - return 0; + return ret; } static inline void ieee802154_addr_from_sa(struct ieee802154_addr *a, -- GitLab From 365e1ececb2905f94cc10a5817c5b644a32a3ae2 Mon Sep 17 00:00:00 2001 From: Gaurav Kohli <gauravkohli@linux.microsoft.com> Date: Wed, 5 Oct 2022 22:52:59 -0700 Subject: [PATCH 1533/2223] hv_netvsc: Fix race between VF offering and VF association message from host During VM boot, the VF registration call may arrive before the VF association message from the host reaches the VM. This can break the netvsc VF path. To prevent that, block VF registration until the VF bind message arrives from the host. Cc: stable@vger.kernel.org Fixes: 00d7ddba11436 ("hv_netvsc: pair VF based on serial number") Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com> Signed-off-by: Gaurav Kohli <gauravkohli@linux.microsoft.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- drivers/net/hyperv/hyperv_net.h | 3 ++- drivers/net/hyperv/netvsc.c | 4 ++++ drivers/net/hyperv/netvsc_drv.c | 19 +++++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 25b38a374e3c3..dd5919ec408bf 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -1051,7 +1051,8 @@ struct net_device_context { u32 vf_alloc; /* Serial number of the VF to team with */ u32 vf_serial; - + /* completion variable to confirm vf association */ + struct completion vf_add; /* Is the current data path through the VF NIC? */ bool data_path_is_vf; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index f066de0da4925..9352dad58996d 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1580,6 +1580,10 @@ static void netvsc_send_vf(struct net_device *ndev, net_device_ctx->vf_alloc = nvmsg->msg.v4_msg.vf_assoc.allocated; net_device_ctx->vf_serial = nvmsg->msg.v4_msg.vf_assoc.serial; + + if (net_device_ctx->vf_alloc) + complete(&net_device_ctx->vf_add); + netdev_info(ndev, "VF slot %u %s\n", net_device_ctx->vf_serial, net_device_ctx->vf_alloc ? 
"added" : "removed"); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 5f08482065cab..89eb4f179a3ce 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2313,6 +2313,18 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev) } + /* Fallback path to check synthetic vf with + * help of mac addr + */ + list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) { + ndev = hv_get_drvdata(ndev_ctx->device_ctx); + if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr)) { + netdev_notice(vf_netdev, + "falling back to mac addr based matching\n"); + return ndev; + } + } + netdev_notice(vf_netdev, "no netdev found for vf serial:%u\n", serial); return NULL; @@ -2409,6 +2421,11 @@ static int netvsc_vf_changed(struct net_device *vf_netdev, unsigned long event) if (net_device_ctx->data_path_is_vf == vf_is_up) return NOTIFY_OK; + if (vf_is_up && !net_device_ctx->vf_alloc) { + netdev_info(ndev, "Waiting for the VF association from host\n"); + wait_for_completion(&net_device_ctx->vf_add); + } + ret = netvsc_switch_datapath(ndev, vf_is_up); if (ret) { @@ -2440,6 +2457,7 @@ static int netvsc_unregister_vf(struct net_device *vf_netdev) netvsc_vf_setxdp(vf_netdev, NULL); + reinit_completion(&net_device_ctx->vf_add); netdev_rx_handler_unregister(vf_netdev); netdev_upper_dev_unlink(vf_netdev, ndev); RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL); @@ -2479,6 +2497,7 @@ static int netvsc_probe(struct hv_device *dev, INIT_DELAYED_WORK(&net_device_ctx->dwork, netvsc_link_change); + init_completion(&net_device_ctx->vf_add); spin_lock_init(&net_device_ctx->lock); INIT_LIST_HEAD(&net_device_ctx->reconfig_events); INIT_DELAYED_WORK(&net_device_ctx->vf_takeover, netvsc_vf_setup); -- GitLab From 7305e7804d04eafff615a24e241aa6105101d48b Mon Sep 17 00:00:00 2001 From: Yang Li <yang.lee@linux.alibaba.com> Date: Thu, 6 Oct 2022 19:44:00 +0800 Subject: [PATCH 1534/2223] octeontx2-pf: mcs: remove unneeded 
semicolon Semicolon is not required after curly braces. Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2332 Reported-by: Abaci Robot <abaci@linux.alibaba.com> Signed-off-by: Yang Li <yang.lee@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c index 64f3acd7f67bd..18420d9a145fb 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c @@ -133,7 +133,7 @@ static int cn10k_mcs_alloc_rsrc(struct otx2_nic *pfvf, enum mcs_direction dir, default: ret = -EINVAL; goto fail; - }; + } mutex_unlock(&mbox->lock); -- GitLab From 3030cbff67a7ae12b4b7bf69a605699372424f41 Mon Sep 17 00:00:00 2001 From: Yang Li <yang.lee@linux.alibaba.com> Date: Thu, 6 Oct 2022 20:01:36 +0800 Subject: [PATCH 1535/2223] net: enetc: Remove duplicated include in enetc_qos.c net/pkt_sched.h is included twice in enetc_qos.c, remove one of them. Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2334 Reported-by: Abaci Robot <abaci@linux.alibaba.com> Signed-off-by: Yang Li <yang.lee@linux.alibaba.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- drivers/net/ethernet/freescale/enetc/enetc_qos.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_qos.c b/drivers/net/ethernet/freescale/enetc/enetc_qos.c index e6416332ec796..a842e1999122c 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_qos.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_qos.c @@ -7,7 +7,6 @@ #include <linux/math64.h> #include <linux/refcount.h> #include <net/pkt_cls.h> -#include <net/pkt_sched.h> #include <net/tc_act/tc_gate.h> static u16 enetc_get_max_gcl_len(struct enetc_hw *hw) -- GitLab From 61b91eb33a69c3be11b259c5ea484505cd79f883 Mon Sep 17 00:00:00 2001 From: David Ahern <dsahern@kernel.org> Date: Thu, 6 Oct 2022 10:48:49 -0600 Subject: [PATCH 1536/2223] ipv4: Handle attempt to delete multipath route when fib_info contains an nh reference Gwangun Jung reported a slab-out-of-bounds access in fib_nh_match: fib_nh_match+0xf98/0x1130 linux-6.0-rc7/net/ipv4/fib_semantics.c:961 fib_table_delete+0x5f3/0xa40 linux-6.0-rc7/net/ipv4/fib_trie.c:1753 inet_rtm_delroute+0x2b3/0x380 linux-6.0-rc7/net/ipv4/fib_frontend.c:874 Separate nexthop objects are mutually exclusive with the legacy multipath spec. Fix fib_nh_match to return if the config for the to be deleted route contains a multipath spec while the fib_info is using a nexthop object. Fixes: 493ced1ac47c ("ipv4: Allow routes to use nexthop objects") Fixes: 6bf92d70e690 ("net: ipv4: fix route with nexthop object delete warning") Reported-by: Gwangun Jung <exsociety@gmail.com> Signed-off-by: David Ahern <dsahern@kernel.org> Reviewed-by: Ido Schimmel <idosch@nvidia.com> Tested-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- net/ipv4/fib_semantics.c | 8 ++++---- tools/testing/selftests/net/fib_nexthops.sh | 5 +++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2dc97583d2790..e9a7f70a54df4 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -888,13 +888,13 @@ int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, return 1; } + /* cannot match on nexthop object attributes */ + if (fi->nh) + return 1; + if (cfg->fc_oif || cfg->fc_gw_family) { struct fib_nh *nh; - /* cannot match on nexthop object attributes */ - if (fi->nh) - return 1; - nh = fib_info_nh(fi, 0); if (cfg->fc_encap) { if (fib_encap_match(net, cfg->fc_encap_type, diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index d5a0dd548989b..ee5e98204d3d2 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -1223,6 +1223,11 @@ ipv4_fcnal() log_test $rc 0 "Delete nexthop route warning" run_cmd "$IP route delete 172.16.101.1/32 nhid 12" run_cmd "$IP nexthop del id 12" + + run_cmd "$IP nexthop add id 21 via 172.16.1.6 dev veth1" + run_cmd "$IP ro add 172.16.101.0/24 nhid 21" + run_cmd "$IP ro del 172.16.101.0/24 nexthop via 172.16.1.7 dev veth1 nexthop via 172.16.1.8 dev veth1" + log_test $? 2 "Delete multipath route with only nh id based entry" } ipv4_grp_fcnal() -- GitLab From fb4a5dfca0f0a027e2d89be00e53adb2827943f6 Mon Sep 17 00:00:00 2001 From: Serhiy Boiko <serhiy.boiko@plvision.eu> Date: Thu, 6 Oct 2022 22:04:09 +0300 Subject: [PATCH 1537/2223] prestera: matchall: do not rollback if rule exists If you try to create a 'mirror' ACL rule on a port that already has a mirror rule, prestera_span_rule_add() will fail with EEXIST error. This forces rollback procedure which destroys existing mirror rule on hardware leaving it visible in linux. 
Add an explicit check for EEXIST to prevent the deletion of the existing rule but keep user seeing error message: $ tc filter add dev sw1p1 ... skip_sw action mirred egress mirror dev sw1p2 $ tc filter add dev sw1p1 ... skip_sw action mirred egress mirror dev sw1p3 RTNETLINK answers: File exists We have an error talking to the kernel Fixes: 13defa275eef ("net: marvell: prestera: Add matchall support") Signed-off-by: Serhiy Boiko <serhiy.boiko@plvision.eu> Signed-off-by: Maksym Glubokiy <maksym.glubokiy@plvision.eu> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/marvell/prestera/prestera_matchall.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_matchall.c b/drivers/net/ethernet/marvell/prestera/prestera_matchall.c index 6f2b95a5263ec..1da9c1bc1ee9a 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_matchall.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_matchall.c @@ -96,6 +96,8 @@ int prestera_mall_replace(struct prestera_flow_block *block, list_for_each_entry(binding, &block->binding_list, list) { err = prestera_span_rule_add(binding, port, block->ingress); + if (err == -EEXIST) + return err; if (err) goto rollback; } -- GitLab From 4af609b216e8d9e8d4b03d49ca5b965e87c344b3 Mon Sep 17 00:00:00 2001 From: Kees Cook <keescook@chromium.org> Date: Thu, 6 Oct 2022 12:20:52 -0700 Subject: [PATCH 1538/2223] net: ethernet: mediatek: Remove -Warray-bounds exception GCC-12 emits false positive -Warray-bounds warnings with CONFIG_UBSAN_SHIFT (-fsanitize=shift). This is fixed in GCC 13[1], and there is top-level Makefile logic to remove -Warray-bounds for known-bad GCC versions starting with commit f0be87c42cbd ("gcc-12: disable '-Warray-bounds' universally for now"). Remove the local work-around. [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105679 Signed-off-by: Kees Cook <keescook@chromium.org> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- drivers/net/ethernet/mediatek/Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/ethernet/mediatek/Makefile b/drivers/net/ethernet/mediatek/Makefile index fe66ba8793cf8..45ba0970504a4 100644 --- a/drivers/net/ethernet/mediatek/Makefile +++ b/drivers/net/ethernet/mediatek/Makefile @@ -11,8 +11,3 @@ mtk_eth-$(CONFIG_NET_MEDIATEK_SOC_WED) += mtk_wed_debugfs.o endif obj-$(CONFIG_NET_MEDIATEK_SOC_WED) += mtk_wed_ops.o obj-$(CONFIG_NET_MEDIATEK_STAR_EMAC) += mtk_star_emac.o - -# FIXME: temporarily silence -Warray-bounds on non W=1+ builds -ifndef KBUILD_EXTRA_WARN -CFLAGS_mtk_ppe.o += -Wno-array-bounds -endif -- GitLab From aabf6155dfb83262ef9a10af4bef945e7aba9b8e Mon Sep 17 00:00:00 2001 From: Kees Cook <keescook@chromium.org> Date: Thu, 6 Oct 2022 12:20:53 -0700 Subject: [PATCH 1539/2223] net: ethernet: bgmac: Remove -Warray-bounds exception GCC-12 emits false positive -Warray-bounds warnings with CONFIG_UBSAN_SHIFT (-fsanitize=shift). This is fixed in GCC 13[1], and there is top-level Makefile logic to remove -Warray-bounds for known-bad GCC versions starting with commit f0be87c42cbd ("gcc-12: disable '-Warray-bounds' universally for now"). Remove the local work-around. [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105679 Signed-off-by: Kees Cook <keescook@chromium.org> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- drivers/net/ethernet/broadcom/Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/Makefile b/drivers/net/ethernet/broadcom/Makefile index 2e6c5f258a1ff..0ddfb5b5d53ca 100644 --- a/drivers/net/ethernet/broadcom/Makefile +++ b/drivers/net/ethernet/broadcom/Makefile @@ -17,8 +17,3 @@ obj-$(CONFIG_BGMAC_BCMA) += bgmac-bcma.o bgmac-bcma-mdio.o obj-$(CONFIG_BGMAC_PLATFORM) += bgmac-platform.o obj-$(CONFIG_SYSTEMPORT) += bcmsysport.o obj-$(CONFIG_BNXT) += bnxt/ - -# FIXME: temporarily silence -Warray-bounds on non W=1+ builds -ifndef KBUILD_EXTRA_WARN -CFLAGS_tg3.o += -Wno-array-bounds -endif -- GitLab From f0c00454bf78975925eccc9737faaa4d4951edbf Mon Sep 17 00:00:00 2001 From: Biju Das <biju.das.jz@bp.renesas.com> Date: Wed, 28 Sep 2022 12:07:55 +0100 Subject: [PATCH 1540/2223] mmc: renesas_sdhi: Fix rounding errors Due to clk rounding errors on RZ/G2L platforms, it selects a clock source with a lower clock rate compared to a higher one. For example: The rounding error (533333333 Hz / 4 * 4 = 533333332 Hz < 533333333 Hz) selects a clk source of 400 MHz instead of 533.333333 MHz. This patch fixes this issue by adding a margin of (1/1024) higher to the clock rate. 
Signed-off-by: Biju Das <biju.das.jz@bp.renesas.com> Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be> Tested-by: Geert Uytterhoeven <geert+renesas@glider.be> Reviewed-by: Wolfram Sang <wsa+renesas@sang-engineering.com> Tested-by: Wolfram Sang <wsa+renesas@sang-engineering.com> Fixes: bb6d3fa98a41 ("clk: renesas: rcar-gen3: Switch to new SD clock handling") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220928110755.849275-1-biju.das.jz@bp.renesas.com Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org> --- drivers/mmc/host/renesas_sdhi_core.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index 6edbf5c161ab9..b970699743e0a 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -128,6 +128,7 @@ static unsigned int renesas_sdhi_clk_update(struct tmio_mmc_host *host, struct clk *ref_clk = priv->clk; unsigned int freq, diff, best_freq = 0, diff_min = ~0; unsigned int new_clock, clkh_shift = 0; + unsigned int new_upper_limit; int i; /* @@ -153,13 +154,20 @@ static unsigned int renesas_sdhi_clk_update(struct tmio_mmc_host *host, * greater than, new_clock. As we can divide by 1 << i for * any i in [0, 9] we want the input clock to be as close as * possible, but no greater than, new_clock << i. + * + * Add an upper limit of 1/1024 rate higher to the clock rate to fix + * clk rate jumping to lower rate due to rounding error (eg: RZ/G2L has + * 3 clk sources 533.333333 MHz, 400 MHz and 266.666666 MHz. The request + * for 533.333333 MHz will selects a slower 400 MHz due to rounding + * error (533333333 Hz / 4 * 4 = 533333332 Hz < 533333333 Hz)). 
*/ for (i = min(9, ilog2(UINT_MAX / new_clock)); i >= 0; i--) { freq = clk_round_rate(ref_clk, new_clock << i); - if (freq > (new_clock << i)) { + new_upper_limit = (new_clock << i) + ((new_clock << i) >> 10); + if (freq > new_upper_limit) { /* Too fast; look for a slightly slower option */ freq = clk_round_rate(ref_clk, (new_clock << i) / 4 * 3); - if (freq > (new_clock << i)) + if (freq > new_upper_limit) continue; } @@ -181,6 +189,7 @@ static unsigned int renesas_sdhi_clk_update(struct tmio_mmc_host *host, static void renesas_sdhi_set_clock(struct tmio_mmc_host *host, unsigned int new_clock) { + unsigned int clk_margin; u32 clk = 0, clock; sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, ~CLK_CTL_SCLKEN & @@ -194,7 +203,13 @@ static void renesas_sdhi_set_clock(struct tmio_mmc_host *host, host->mmc->actual_clock = renesas_sdhi_clk_update(host, new_clock); clock = host->mmc->actual_clock / 512; - for (clk = 0x80000080; new_clock >= (clock << 1); clk >>= 1) + /* + * Add a margin of 1/1024 rate higher to the clock rate in order + * to avoid clk variable setting a value of 0 due to the margin + * provided for actual_clock in renesas_sdhi_clk_update(). + */ + clk_margin = new_clock >> 10; + for (clk = 0x80000080; new_clock + clk_margin >= (clock << 1); clk >>= 1) clock <<= 1; /* 1/1 clock is option */ -- GitLab From 099d387ebbcd70c6adc906ab5b66ef639c09dede Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov <dmitry.torokhov@gmail.com> Date: Tue, 27 Sep 2022 08:46:09 -0700 Subject: [PATCH 1541/2223] watchdog: twl4030_wdt: add missing mod_devicetable.h include The driver is using of_device_id and therefore needs to include mod_devicetable.h header. We used to get this definition indirectly via inclusion of matrix_keypad.h from twl.h, but we are cleaning up matrix_keypad.h from unnecessary includes. 
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> Reviewed-by: Guenter Roeck <linux@roeck-us.net> Link: https://lore.kernel.org/r/20220927154611.3330871-1-dmitry.torokhov@gmail.com Signed-off-by: Guenter Roeck <linux@roeck-us.net> Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org> --- drivers/watchdog/twl4030_wdt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/twl4030_wdt.c b/drivers/watchdog/twl4030_wdt.c index 355e428c0b99f..36b4a660928d3 100644 --- a/drivers/watchdog/twl4030_wdt.c +++ b/drivers/watchdog/twl4030_wdt.c @@ -9,6 +9,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/kernel.h> +#include <linux/mod_devicetable.h> #include <linux/watchdog.h> #include <linux/platform_device.h> #include <linux/mfd/twl.h> -- GitLab From b78870e7f41534cc719c295d1f8809aca93aeeab Mon Sep 17 00:00:00 2001 From: Prathamesh Shete <pshete@nvidia.com> Date: Thu, 6 Oct 2022 18:36:22 +0530 Subject: [PATCH 1542/2223] mmc: sdhci-tegra: Use actual clock rate for SW tuning correction Ensure tegra_host member "curr_clk_rate" holds the actual clock rate instead of requested clock rate for proper use during tuning correction algorithm. Actual clk rate may not be the same as the requested clk frequency depending on the parent clock source set. Tuning correction algorithm depends on certain parameters which are sensitive to current clk rate. 
If the host clk is selected instead of the actual clock rate, tuning correction algorithm may end up applying invalid correction, which could result in errors Fixes: ea8fc5953e8b ("mmc: tegra: update hw tuning process") Signed-off-by: Aniruddha TVS Rao <anrao@nvidia.com> Signed-off-by: Prathamesh Shete <pshete@nvidia.com> Acked-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Thierry Reding <treding@nvidia.com> Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20221006130622.22900-4-pshete@nvidia.com Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org> --- drivers/mmc/host/sdhci-tegra.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 2d2d8260c6814..413925bce0ca8 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -773,7 +773,7 @@ static void tegra_sdhci_set_clock(struct sdhci_host *host, unsigned int clock) dev_err(dev, "failed to set clk rate to %luHz: %d\n", host_clk, err); - tegra_host->curr_clk_rate = host_clk; + tegra_host->curr_clk_rate = clk_get_rate(pltfm_host->clk); if (tegra_host->ddr_signaling) host->max_clk = host_clk; else -- GitLab From 296ab4a813841ba1d5f40b03190fd1bd8f25aab0 Mon Sep 17 00:00:00 2001 From: Dominique Martinet <asmadeus@codewreck.org> Date: Sun, 4 Sep 2022 20:17:49 +0900 Subject: [PATCH 1543/2223] net/9p: use a dedicated spinlock for trans_fd Shamelessly copying the explanation from Tetsuo Handa's suggested patch[1] (slightly reworded): syzbot is reporting inconsistent lock state in p9_req_put()[2], for p9_tag_remove() from p9_req_put() from IRQ context is using spin_lock_irqsave() on "struct p9_client"->lock but trans_fd (not from IRQ context) is using spin_lock(). 
Since the locks actually protect different things in client.c and in trans_fd.c, just replace trans_fd.c's lock by a new one specific to the transport (client.c's protect the idr for fid/tag allocations, while trans_fd.c's protects its own req list and request status field that acts as the transport's state machine) Link: https://lore.kernel.org/r/20220904112928.1308799-1-asmadeus@codewreck.org Link: https://lkml.kernel.org/r/2470e028-9b05-2013-7198-1fdad071d999@I-love.SAKURA.ne.jp [1] Link: https://syzkaller.appspot.com/bug?extid=2f20b523930c32c160cc [2] Reported-by: syzbot <syzbot+2f20b523930c32c160cc@syzkaller.appspotmail.com> Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Reviewed-by: Christian Schoenebeck <linux_oss@crudebyte.com> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> --- net/9p/trans_fd.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 98732619d8394..97db11e4cf584 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -91,6 +91,7 @@ struct p9_poll_wait { * @mux_list: list link for mux to manage multiple connections (?) 
* @client: reference to client instance for this connection * @err: error state + * @req_lock: lock protecting req_list and requests statuses * @req_list: accounting for requests which have been sent * @unsent_req_list: accounting for requests that haven't been sent * @rreq: read request @@ -114,6 +115,7 @@ struct p9_conn { struct list_head mux_list; struct p9_client *client; int err; + spinlock_t req_lock; struct list_head req_list; struct list_head unsent_req_list; struct p9_req_t *rreq; @@ -189,10 +191,10 @@ static void p9_conn_cancel(struct p9_conn *m, int err) p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err); - spin_lock(&m->client->lock); + spin_lock(&m->req_lock); if (m->err) { - spin_unlock(&m->client->lock); + spin_unlock(&m->req_lock); return; } @@ -205,7 +207,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err) list_move(&req->req_list, &cancel_list); } - spin_unlock(&m->client->lock); + spin_unlock(&m->req_lock); list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req); @@ -360,7 +362,7 @@ static void p9_read_work(struct work_struct *work) if ((m->rreq) && (m->rc.offset == m->rc.capacity)) { p9_debug(P9_DEBUG_TRANS, "got new packet\n"); m->rreq->rc.size = m->rc.offset; - spin_lock(&m->client->lock); + spin_lock(&m->req_lock); if (m->rreq->status == REQ_STATUS_SENT) { list_del(&m->rreq->req_list); p9_client_cb(m->client, m->rreq, REQ_STATUS_RCVD); @@ -369,14 +371,14 @@ static void p9_read_work(struct work_struct *work) p9_debug(P9_DEBUG_TRANS, "Ignore replies associated with a cancelled request\n"); } else { - spin_unlock(&m->client->lock); + spin_unlock(&m->req_lock); p9_debug(P9_DEBUG_ERROR, "Request tag %d errored out while we were reading the reply\n", m->rc.tag); err = -EIO; goto error; } - spin_unlock(&m->client->lock); + spin_unlock(&m->req_lock); m->rc.sdata = NULL; m->rc.offset = 0; m->rc.capacity = 0; @@ -454,10 +456,10 @@ static void p9_write_work(struct work_struct *work) } if 
(!m->wsize) { - spin_lock(&m->client->lock); + spin_lock(&m->req_lock); if (list_empty(&m->unsent_req_list)) { clear_bit(Wworksched, &m->wsched); - spin_unlock(&m->client->lock); + spin_unlock(&m->req_lock); return; } @@ -472,7 +474,7 @@ static void p9_write_work(struct work_struct *work) m->wpos = 0; p9_req_get(req); m->wreq = req; - spin_unlock(&m->client->lock); + spin_unlock(&m->req_lock); } p9_debug(P9_DEBUG_TRANS, "mux %p pos %d size %d\n", @@ -589,6 +591,7 @@ static void p9_conn_create(struct p9_client *client) INIT_LIST_HEAD(&m->mux_list); m->client = client; + spin_lock_init(&m->req_lock); INIT_LIST_HEAD(&m->req_list); INIT_LIST_HEAD(&m->unsent_req_list); INIT_WORK(&m->rq, p9_read_work); @@ -670,10 +673,10 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) if (m->err < 0) return m->err; - spin_lock(&client->lock); + spin_lock(&m->req_lock); req->status = REQ_STATUS_UNSENT; list_add_tail(&req->req_list, &m->unsent_req_list); - spin_unlock(&client->lock); + spin_unlock(&m->req_lock); if (test_and_clear_bit(Wpending, &m->wsched)) n = EPOLLOUT; @@ -688,11 +691,13 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) { + struct p9_trans_fd *ts = client->trans; + struct p9_conn *m = &ts->conn; int ret = 1; p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req); - spin_lock(&client->lock); + spin_lock(&m->req_lock); if (req->status == REQ_STATUS_UNSENT) { list_del(&req->req_list); @@ -700,21 +705,24 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) p9_req_put(client, req); ret = 0; } - spin_unlock(&client->lock); + spin_unlock(&m->req_lock); return ret; } static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) { + struct p9_trans_fd *ts = client->trans; + struct p9_conn *m = &ts->conn; + p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req); - spin_lock(&client->lock); + 
spin_lock(&m->req_lock); /* Ignore cancelled request if message has been received * before lock. */ if (req->status == REQ_STATUS_RCVD) { - spin_unlock(&client->lock); + spin_unlock(&m->req_lock); return 0; } @@ -723,7 +731,8 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) */ list_del(&req->req_list); req->status = REQ_STATUS_FLSHD; - spin_unlock(&client->lock); + spin_unlock(&m->req_lock); + p9_req_put(client, req); return 0; -- GitLab From 0664c63af16dceb4b40a9825e738136a2dac0260 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng <xiujianfeng@huawei.com> Date: Fri, 9 Sep 2022 18:35:46 +0800 Subject: [PATCH 1544/2223] net/9p: add __init/__exit annotations to module init/exit funcs xen transport was missing annotations Link: https://lkml.kernel.org/r/20220909103546.73015-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> --- net/9p/trans_xen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 41c57d40efb69..b15c64128c3e5 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -511,7 +511,7 @@ static struct xenbus_driver xen_9pfs_front_driver = { .otherend_changed = xen_9pfs_front_changed, }; -static int p9_trans_xen_init(void) +static int __init p9_trans_xen_init(void) { int rc; @@ -530,7 +530,7 @@ static int p9_trans_xen_init(void) module_init(p9_trans_xen_init); MODULE_ALIAS_9P("xen"); -static void p9_trans_xen_exit(void) +static void __exit p9_trans_xen_exit(void) { v9fs_unregister_trans(&p9_xen_trans); return xenbus_unregister_driver(&xen_9pfs_front_driver); -- GitLab From a8e633c604476e24d26a636582c0f5bdb421e70d Mon Sep 17 00:00:00 2001 From: Li Zhong <floridsleeves@gmail.com> Date: Wed, 21 Sep 2022 14:09:21 -0700 Subject: [PATCH 1545/2223] net/9p: clarify trans_fd parse_opt failure handling This parse_opts will set invalid opts.rfd/wfd in case of failure which we already check, but 
it is not clear for readers that parse_opts errors are handled in p9_fd_create: clarify this by explicitly checking the return value. Link: https://lkml.kernel.org/r/20220921210921.1654735-1-floridsleeves@gmail.com Signed-off-by: Li Zhong <floridsleeves@gmail.com> [Dominique: reworded commit message to clarify this is NOOP] Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> --- net/9p/trans_fd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 97db11e4cf584..56a1867687501 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -1074,7 +1074,9 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args) int err; struct p9_fd_opts opts; - parse_opts(args, &opts); + err = parse_opts(args, &opts); + if (err < 0) + return err; client->trans_opts.fd.rfd = opts.rfd; client->trans_opts.fd.wfd = opts.wfd; -- GitLab From f5369dcf5c0a76260cd301bd5c25d59c451d62c1 Mon Sep 17 00:00:00 2001 From: Felix Fietkau <nbd@nbd.name> Date: Fri, 7 Oct 2022 11:05:09 +0200 Subject: [PATCH 1546/2223] wifi: mac80211: do not drop packets smaller than the LLC-SNAP header on fast-rx Since STP TCN frames are only 7 bytes, the pskb_may_pull call returns an error. Instead of dropping those packets, bump them back to the slow path for proper processing. 
Fixes: 49ddf8e6e234 ("mac80211: add fast-rx path") Reported-by: Chad Monroe <chad.monroe@smartrg.com> Signed-off-by: Felix Fietkau <nbd@nbd.name> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index bd215fe3c7969..333adad47482c 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4708,7 +4708,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, if (!(status->rx_flags & IEEE80211_RX_AMSDU)) { if (!pskb_may_pull(skb, snap_offs + sizeof(*payload))) - goto drop; + return false; payload = (void *)(skb->data + snap_offs); -- GitLab From b650009fcb701ea99aa133bbe18dbfc5305ddf1a Mon Sep 17 00:00:00 2001 From: James Prestwood <prestwoj@gmail.com> Date: Wed, 28 Sep 2022 15:49:10 -0700 Subject: [PATCH 1547/2223] wifi: mac80211: fix probe req HE capabilities access When building the probe request IEs HE support is checked for the 6GHz band (wiphy->bands[NL80211_BAND_6GHZ]). If supported the HE capability IE should be included according to the spec. The problem is the 16-bit capability is obtained from the band object (sband) that was passed in, not the 6GHz band object (sband6). If the sband object doesn't support HE it will result in a warning. 
Fixes: 7d29bc50b30e ("mac80211: always include HE 6GHz capability in probe request") Signed-off-by: James Prestwood <prestwoj@gmail.com> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index bf7461c41beff..1e929b82deef4 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2046,7 +2046,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, if (he_cap) { enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); - __le16 cap = ieee80211_get_he_6ghz_capa(sband, iftype); + __le16 cap = ieee80211_get_he_6ghz_capa(sband6, iftype); pos = ieee80211_write_he_6ghz_cap(pos, cap, end); } -- GitLab From 092197f1f47f8359b46ea62445d87561949b577d Mon Sep 17 00:00:00 2001 From: James Prestwood <prestwoj@gmail.com> Date: Thu, 15 Sep 2022 12:55:53 -0700 Subject: [PATCH 1548/2223] wifi: mac80211: remove/avoid misleading prints At some point a few kernel debug prints started appearing which indicated something was sending invalid IEs: "bad VHT capabilities, disabling VHT" "Invalid HE elem, Disable HE" Turns out these were being printed because the local hardware supported HE/VHT but the peer/AP did not. Bad/invalid indicates, to me at least, that the IE is in some way malformed, not missing. For the HE print (ieee80211_verify_peer_he_mcs_support) it will now silently fail if the HE capability element is missing (still prints if the element size is wrong). For the VHT print, it has been removed completely and will silently set the DISABLE_VHT flag which is consistent with how DISABLE_HT is set. 
Signed-off-by: James Prestwood <prestwoj@gmail.com> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/mlme.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 54b8d5065bbde..d8484cd870de5 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4409,8 +4409,11 @@ ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata, he_cap_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY, ies->data, ies->len); + if (!he_cap_elem) + return false; + /* invalid HE IE */ - if (!he_cap_elem || he_cap_elem->datalen < 1 + sizeof(*he_cap)) { + if (he_cap_elem->datalen < 1 + sizeof(*he_cap)) { sdata_info(sdata, "Invalid HE elem, Disable HE\n"); return false; @@ -4676,8 +4679,6 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } if (!elems->vht_cap_elem) { - sdata_info(sdata, - "bad VHT capabilities, disabling VHT\n"); *conn_flags |= IEEE80211_CONN_DISABLE_VHT; vht_oper = NULL; } -- GitLab From ceb3d688f92231e9d9e663c56a1c8bee90140bad Mon Sep 17 00:00:00 2001 From: Dan Carpenter <dan.carpenter@oracle.com> Date: Mon, 12 Sep 2022 18:07:16 +0300 Subject: [PATCH 1549/2223] wifi: mac80211: unlock on error in ieee80211_can_powered_addr_change() Unlock before returning -EOPNOTSUPP. 
Fixes: 3c06e91b40db ("wifi: mac80211: Support POWERED_ADDR_CHANGE feature") Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/iface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 572254366a0f8..b15afa77b87c0 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -243,7 +243,7 @@ static int ieee80211_can_powered_addr_change(struct ieee80211_sub_if_data *sdata */ break; default: - return -EOPNOTSUPP; + ret = -EOPNOTSUPP; } unlock: -- GitLab From 3bf9e30e493356912f9cb600f59b51133680639e Mon Sep 17 00:00:00 2001 From: Felix Fietkau <nbd@nbd.name> Date: Sat, 1 Oct 2022 12:01:13 +0200 Subject: [PATCH 1550/2223] wifi: mac80211: fix decap offload for stations on AP_VLAN interfaces Since AP_VLAN interfaces are not passed to the driver, check offload_flags on the bss vif instead. Reported-by: Howard Hsu <howard-yh.hsu@mediatek.com> Fixes: 80a915ec4427 ("mac80211: add rx decapsulation offload support") Signed-off-by: Felix Fietkau <nbd@nbd.name> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/rx.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 333adad47482c..589521717c358 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4352,6 +4352,7 @@ void ieee80211_check_fast_rx(struct sta_info *sta) .vif_type = sdata->vif.type, .control_port_protocol = sdata->control_port_protocol, }, *old, *new = NULL; + u32 offload_flags; bool set_offload = false; bool assign = false; bool offload; @@ -4467,10 +4468,10 @@ void ieee80211_check_fast_rx(struct sta_info *sta) if (assign) new = kmemdup(&fastrx, sizeof(fastrx), GFP_KERNEL); - offload = assign && - (sdata->vif.offload_flags & IEEE80211_OFFLOAD_DECAP_ENABLED); + offload_flags = get_bss_sdata(sdata)->vif.offload_flags; + offload = offload_flags & IEEE80211_OFFLOAD_DECAP_ENABLED; - if 
(offload) + if (assign && offload) set_offload = !test_and_set_sta_flag(sta, WLAN_STA_DECAP_OFFLOAD); else set_offload = test_and_clear_sta_flag(sta, WLAN_STA_DECAP_OFFLOAD); -- GitLab From c210b91818e81068ca2573c20684644b8e110a07 Mon Sep 17 00:00:00 2001 From: Conor Dooley <conor.dooley@microchip.com> Date: Tue, 20 Sep 2022 10:37:35 +0100 Subject: [PATCH 1551/2223] riscv: dts: microchip: fix fabric i2c reg size The size of the reg should've been changed when the address was changed, but obviously I forgot to do so. Fixes: ab291621a8b8 ("riscv: dts: microchip: icicle: re-jig fabric peripheral addresses") Signed-off-by: Conor Dooley <conor.dooley@microchip.com> --- arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi index b6bfe177ccb28..24b1cfb9a73e4 100644 --- a/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi +++ b/arch/riscv/boot/dts/microchip/mpfs-icicle-kit-fabric.dtsi @@ -16,7 +16,7 @@ i2c2: i2c@40000200 { compatible = "microchip,corei2c-rtl-v7"; - reg = <0x0 0x40000200 0x0 0x1000>; + reg = <0x0 0x40000200 0x0 0x100>; #address-cells = <1>; #size-cells = <0>; clocks = <&fabric_clk3>; -- GitLab From c95014e1d05b5acfd9e6fbe5d1f048b07c6902ff Mon Sep 17 00:00:00 2001 From: Alexander Wetzel <alexander@wetzel-home.de> Date: Tue, 20 Sep 2022 17:55:41 +0200 Subject: [PATCH 1552/2223] wifi: mac80211: netdev compatible TX stop for iTXQ drivers Properly handle TX stop for internal queues (iTXQs) within mac80211. mac80211 must not stop netdev queues when using mac80211 iTXQs. For these drivers the netdev interface is created with IFF_NO_QUEUE. 
While netdev still drops frames for IFF_NO_QUEUE interfaces when we stop the netdev queues, it also prints a warning when this happens: Assuming the mac80211 interface is called wlan0 we would get "Virtual device wlan0 asks to queue packet!" when netdev has to drop a frame. This patch is keeping the harmless netdev queue starts for iTXQ drivers. Signed-off-by: Alexander Wetzel <alexander@wetzel-home.de> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/iface.c | 6 +++--- net/mac80211/tx.c | 10 ++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index b15afa77b87c0..dd9ac1f7d2ea6 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -461,7 +461,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do /* * Stop TX on this interface first. */ - if (sdata->dev) + if (!local->ops->wake_tx_queue && sdata->dev) netif_tx_stop_all_queues(sdata->dev); ieee80211_roc_purge(local, sdata); @@ -1412,8 +1412,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) sdata->vif.type != NL80211_IFTYPE_STATION); } - set_bit(SDATA_STATE_RUNNING, &sdata->state); - switch (sdata->vif.type) { case NL80211_IFTYPE_P2P_DEVICE: rcu_assign_pointer(local->p2p_sdata, sdata); @@ -1472,6 +1470,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } + set_bit(SDATA_STATE_RUNNING, &sdata->state); + return 0; err_del_interface: drv_remove_interface(local, sdata); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 27c964be102e1..a364148149f94 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2319,6 +2319,10 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, u16 len_rthdr; int hdrlen; + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + if (unlikely(!ieee80211_sdata_running(sdata))) + goto fail; + memset(info, 0, sizeof(*info)); info->flags = 
IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_CTL_INJECTED; @@ -2378,8 +2382,6 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, * This is necessary, for example, for old hostapd versions that * don't use nl80211-based management TX/RX. */ - sdata = IEEE80211_DEV_TO_SUB_IF(dev); - list_for_each_entry_rcu(tmp_sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(tmp_sdata)) continue; @@ -4169,7 +4171,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct sk_buff *next; int len = skb->len; - if (unlikely(skb->len < ETH_HLEN)) { + if (unlikely(!ieee80211_sdata_running(sdata) || skb->len < ETH_HLEN)) { kfree_skb(skb); return; } @@ -4566,7 +4568,7 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, struct ieee80211_key *key; struct sta_info *sta; - if (unlikely(skb->len < ETH_HLEN)) { + if (unlikely(!ieee80211_sdata_running(sdata) || skb->len < ETH_HLEN)) { kfree_skb(skb); return NETDEV_TX_OK; } -- GitLab From d9e249704084982ac7581a560ffa284e11621d43 Mon Sep 17 00:00:00 2001 From: Felix Fietkau <nbd@nbd.name> Date: Fri, 7 Oct 2022 14:56:11 +0200 Subject: [PATCH 1553/2223] wifi: cfg80211: fix ieee80211_data_to_8023_exthdr handling of small packets STP topology change notification packets only have a payload of 7 bytes, so they get dropped due to the skb->len < hdrlen + 8 check. Fix this by removing the extra 8 from the skb->len check and checking the return code on the skb_copy_bits calls. 
Fixes: 2d1c304cb2d5 ("cfg80211: add function for 802.3 conversion with separate output buffer") Reported-by: Chad Monroe <chad.monroe@smartrg.com> Signed-off-by: Felix Fietkau <nbd@nbd.name> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/wireless/util.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/net/wireless/util.c b/net/wireless/util.c index 01493568a21df..1f285b5150286 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -559,7 +559,7 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, return -1; hdrlen = ieee80211_hdrlen(hdr->frame_control) + data_offset; - if (skb->len < hdrlen + 8) + if (skb->len < hdrlen) return -1; /* convert IEEE 802.11 header + possible LLC headers into Ethernet @@ -574,8 +574,9 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, memcpy(tmp.h_dest, ieee80211_get_DA(hdr), ETH_ALEN); memcpy(tmp.h_source, ieee80211_get_SA(hdr), ETH_ALEN); - if (iftype == NL80211_IFTYPE_MESH_POINT) - skb_copy_bits(skb, hdrlen, &mesh_flags, 1); + if (iftype == NL80211_IFTYPE_MESH_POINT && + skb_copy_bits(skb, hdrlen, &mesh_flags, 1) < 0) + return -1; mesh_flags &= MESH_FLAGS_AE; @@ -595,11 +596,12 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, if (iftype == NL80211_IFTYPE_MESH_POINT) { if (mesh_flags == MESH_FLAGS_AE_A4) return -1; - if (mesh_flags == MESH_FLAGS_AE_A5_A6) { - skb_copy_bits(skb, hdrlen + - offsetof(struct ieee80211s_hdr, eaddr1), - tmp.h_dest, 2 * ETH_ALEN); - } + if (mesh_flags == MESH_FLAGS_AE_A5_A6 && + skb_copy_bits(skb, hdrlen + + offsetof(struct ieee80211s_hdr, eaddr1), + tmp.h_dest, 2 * ETH_ALEN) < 0) + return -1; + hdrlen += __ieee80211_get_mesh_hdrlen(mesh_flags); } break; @@ -613,10 +615,11 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, if (iftype == NL80211_IFTYPE_MESH_POINT) { if (mesh_flags == MESH_FLAGS_AE_A5_A6) return -1; - 
if (mesh_flags == MESH_FLAGS_AE_A4) - skb_copy_bits(skb, hdrlen + - offsetof(struct ieee80211s_hdr, eaddr1), - tmp.h_source, ETH_ALEN); + if (mesh_flags == MESH_FLAGS_AE_A4 && + skb_copy_bits(skb, hdrlen + + offsetof(struct ieee80211s_hdr, eaddr1), + tmp.h_source, ETH_ALEN) < 0) + return -1; hdrlen += __ieee80211_get_mesh_hdrlen(mesh_flags); } break; @@ -628,16 +631,15 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, break; } - skb_copy_bits(skb, hdrlen, &payload, sizeof(payload)); - tmp.h_proto = payload.proto; - - if (likely((!is_amsdu && ether_addr_equal(payload.hdr, rfc1042_header) && - tmp.h_proto != htons(ETH_P_AARP) && - tmp.h_proto != htons(ETH_P_IPX)) || - ether_addr_equal(payload.hdr, bridge_tunnel_header))) { + if (likely(skb_copy_bits(skb, hdrlen, &payload, sizeof(payload)) == 0 && + ((!is_amsdu && ether_addr_equal(payload.hdr, rfc1042_header) && + payload.proto != htons(ETH_P_AARP) && + payload.proto != htons(ETH_P_IPX)) || + ether_addr_equal(payload.hdr, bridge_tunnel_header)))) { /* remove RFC1042 or Bridge-Tunnel encapsulation and * replace EtherType */ hdrlen += ETH_ALEN + 2; + tmp.h_proto = payload.proto; skb_postpull_rcsum(skb, &payload, ETH_ALEN + 2); } else { tmp.h_proto = htons(skb->len - hdrlen); -- GitLab From e3e6e1d16a4cf7b63159ec71774e822194071954 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei <yin31149@gmail.com> Date: Tue, 27 Sep 2022 07:34:59 +0800 Subject: [PATCH 1554/2223] wifi: wext: use flex array destination for memcpy() Syzkaller reports buffer overflow false positive as follows: ------------[ cut here ]------------ memcpy: detected field-spanning write (size 8) of single field "&compat_event->pointer" at net/wireless/wext-core.c:623 (size 4) WARNING: CPU: 0 PID: 3607 at net/wireless/wext-core.c:623 wireless_send_event+0xab5/0xca0 net/wireless/wext-core.c:623 Modules linked in: CPU: 1 PID: 3607 Comm: syz-executor659 Not tainted 6.0.0-rc6-next-20220921-syzkaller #0 [...] 
Call Trace: <TASK> ioctl_standard_call+0x155/0x1f0 net/wireless/wext-core.c:1022 wireless_process_ioctl+0xc8/0x4c0 net/wireless/wext-core.c:955 wext_ioctl_dispatch net/wireless/wext-core.c:988 [inline] wext_ioctl_dispatch net/wireless/wext-core.c:976 [inline] wext_handle_ioctl+0x26b/0x280 net/wireless/wext-core.c:1049 sock_ioctl+0x285/0x640 net/socket.c:1220 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl fs/ioctl.c:856 [inline] __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:856 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] </TASK> Wireless events will be sent on the appropriate channels in wireless_send_event(). Different wireless events may have different payload structure and size, so kernel uses **len** and **cmd** field in struct __compat_iw_event as wireless event common LCP part, uses **pointer** as a label to mark the position of remaining different part. Yet the problem is that, **pointer** is a compat_caddr_t type, which may be smaller than the relative structure at the same position. So during wireless_send_event() tries to parse the wireless events payload, it may trigger the memcpy() run-time destination buffer bounds checking when the relative structure's data is copied to the position marked by **pointer**. This patch solves it by introducing flexible-array field **ptr_bytes**, to mark the position of the wireless events remaining part next to LCP part. What's more, this patch also adds **ptr_len** variable in wireless_send_event() to improve its maintainability. 
Reported-and-tested-by: syzbot+473754e5af963cf014cf@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/00000000000070db2005e95a5984@google.com/ Suggested-by: Kees Cook <keescook@chromium.org> Reviewed-by: Kees Cook <keescook@chromium.org> Signed-off-by: Hawkins Jiawei <yin31149@gmail.com> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- include/linux/wireless.h | 10 +++++++++- net/wireless/wext-core.c | 17 ++++++++++------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/include/linux/wireless.h b/include/linux/wireless.h index 2d1b54556eff4..e6e34d74dda04 100644 --- a/include/linux/wireless.h +++ b/include/linux/wireless.h @@ -26,7 +26,15 @@ struct compat_iw_point { struct __compat_iw_event { __u16 len; /* Real length of this stuff */ __u16 cmd; /* Wireless IOCTL */ - compat_caddr_t pointer; + + union { + compat_caddr_t pointer; + + /* we need ptr_bytes to make memcpy() run-time destination + * buffer bounds checking happy, nothing special + */ + DECLARE_FLEX_ARRAY(__u8, ptr_bytes); + }; }; #define IW_EV_COMPAT_LCP_LEN offsetof(struct __compat_iw_event, pointer) #define IW_EV_COMPAT_POINT_OFF offsetof(struct compat_iw_point, length) diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 76a80a41615be..fe8765c4075d3 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -468,6 +468,7 @@ void wireless_send_event(struct net_device * dev, struct __compat_iw_event *compat_event; struct compat_iw_point compat_wrqu; struct sk_buff *compskb; + int ptr_len; #endif /* @@ -582,6 +583,9 @@ void wireless_send_event(struct net_device * dev, nlmsg_end(skb, nlh); #ifdef CONFIG_COMPAT hdr_len = compat_event_type_size[descr->header_type]; + + /* ptr_len is remaining size in event header apart from LCP */ + ptr_len = hdr_len - IW_EV_COMPAT_LCP_LEN; event_len = hdr_len + extra_len; compskb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); @@ -612,16 +616,15 @@ void wireless_send_event(struct net_device * dev, if 
(descr->header_type == IW_HEADER_TYPE_POINT) { compat_wrqu.length = wrqu->data.length; compat_wrqu.flags = wrqu->data.flags; - memcpy(&compat_event->pointer, - ((char *) &compat_wrqu) + IW_EV_COMPAT_POINT_OFF, - hdr_len - IW_EV_COMPAT_LCP_LEN); + memcpy(compat_event->ptr_bytes, + ((char *)&compat_wrqu) + IW_EV_COMPAT_POINT_OFF, + ptr_len); if (extra_len) - memcpy(((char *) compat_event) + hdr_len, - extra, extra_len); + memcpy(&compat_event->ptr_bytes[ptr_len], + extra, extra_len); } else { /* extra_len must be zero, so no if (extra) needed */ - memcpy(&compat_event->pointer, wrqu, - hdr_len - IW_EV_COMPAT_LCP_LEN); + memcpy(compat_event->ptr_bytes, wrqu, ptr_len); } nlmsg_end(compskb, nlh); -- GitLab From 10d5ea5a436da8d60cdb5845f454d595accdbce0 Mon Sep 17 00:00:00 2001 From: Kees Cook <keescook@chromium.org> Date: Mon, 26 Sep 2022 19:29:23 -0700 Subject: [PATCH 1555/2223] wifi: nl80211: Split memcpy() of struct nl80211_wowlan_tcp_data_token flexible array To work around a misbehavior of the compiler's ability to see into composite flexible array structs (as detailed in the coming memcpy() hardening series[1]), split the memcpy() of the header and the payload so no false positive run-time overflow warning will be generated. [1] https://lore.kernel.org/linux-hardening/20220901065914.1417829-2-keescook@chromium.org/ Signed-off-by: Kees Cook <keescook@chromium.org> Reviewed-by: Gustavo A. R. 
Silva <gustavoars@kernel.org> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/wireless/nl80211.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8ff8b1c040f0b..597c522365146 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13265,7 +13265,9 @@ static int nl80211_parse_wowlan_tcp(struct cfg80211_registered_device *rdev, wake_mask_size); if (tok) { cfg->tokens_size = tokens_size; - memcpy(&cfg->payload_tok, tok, sizeof(*tok) + tokens_size); + cfg->payload_tok = *tok; + memcpy(cfg->payload_tok.token_stream, tok->token_stream, + tokens_size); } trig->tcp = cfg; -- GitLab From e1567b4f0eec779af99b372773ddeb5be9b6208b Mon Sep 17 00:00:00 2001 From: Mark Brown <broonie@kernel.org> Date: Wed, 5 Oct 2022 19:16:42 +0100 Subject: [PATCH 1556/2223] arm64/sysreg: Fix typo in SCTR_EL1.SPINTMASK SPINTMASK was typoed as SPINMASK, fix it. Signed-off-by: Mark Brown <broonie@kernel.org> Link: https://lore.kernel.org/r/20221005181642.711734-1-broonie@kernel.org Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> --- arch/arm64/tools/sysreg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 7f1fb36f208ca..384757a7eda9e 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -732,7 +732,7 @@ EndSysreg Sysreg SCTLR_EL1 3 0 1 0 0 Field 63 TIDCP -Field 62 SPINMASK +Field 62 SPINTMASK Field 61 NMI Field 60 EnTP2 Res0 59:58 -- GitLab From 171df58028bf4649460fb146a56a58dcb0c8f75a Mon Sep 17 00:00:00 2001 From: James Morse <james.morse@arm.com> Date: Fri, 30 Sep 2022 14:19:59 +0100 Subject: [PATCH 1557/2223] arm64: errata: Add Cortex-A55 to the repeat tlbi list Cortex-A55 is affected by an erratum where in rare circumstances the CPUs may not handle a race between a break-before-make sequence on one CPU, and another CPU accessing the same page. 
This could allow a store to a page that has been unmapped. Work around this by adding the affected CPUs to the list that needs TLB sequences to be done twice. Signed-off-by: James Morse <james.morse@arm.com> Cc: <stable@vger.kernel.org> Link: https://lore.kernel.org/r/20220930131959.3082594-1-james.morse@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> --- Documentation/arm64/silicon-errata.rst | 2 ++ arch/arm64/Kconfig | 17 +++++++++++++++++ arch/arm64/kernel/cpu_errata.c | 5 +++++ 3 files changed, 24 insertions(+) diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst index 17d9fc5d14fbb..808ade4cc008a 100644 --- a/Documentation/arm64/silicon-errata.rst +++ b/Documentation/arm64/silicon-errata.rst @@ -76,6 +76,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A55 | #1530923 | ARM64_ERRATUM_1530923 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A55 | #2441007 | ARM64_ERRATUM_2441007 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A57 | #832075 | ARM64_ERRATUM_832075 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A57 | #852523 | N/A | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1675310f17912..20d082d54bd8d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -634,6 +634,23 @@ config ARM64_ERRATUM_1530923 config ARM64_WORKAROUND_REPEAT_TLBI bool +config ARM64_ERRATUM_2441007 + bool "Cortex-A55: Completion of affected memory accesses might not be guaranteed by completion of a TLBI" + default y + select ARM64_WORKAROUND_REPEAT_TLBI + help + This option adds a workaround for ARM Cortex-A55 erratum #2441007. 
+ + Under very rare circumstances, affected Cortex-A55 CPUs + may not handle a race between a break-before-make sequence on one + CPU, and another CPU accessing the same page. This could allow a + store to a page that has been unmapped. + + Work around this by adding the affected CPUs to the list that needs + TLB sequences to be done twice. + + If unsure, say Y. + config ARM64_ERRATUM_1286807 bool "Cortex-A76: Modification of the translation table for a virtual address might lead to read-after-read ordering violation" default y diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 58ca4f6b25d6a..89ac00084f38a 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -230,6 +230,11 @@ static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = { ERRATA_MIDR_RANGE(MIDR_QCOM_KRYO_4XX_GOLD, 0xc, 0xe, 0xf, 0xe), }, #endif +#ifdef CONFIG_ARM64_ERRATUM_2441007 + { + ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + }, +#endif #ifdef CONFIG_ARM64_ERRATUM_2441009 { /* Cortex-A510 r0p0 -> r1p1. Fixed in r1p2 */ -- GitLab From ad0112f2d54cafee839e2ee99ec0b5fb9ce5c4b8 Mon Sep 17 00:00:00 2001 From: Sun Ke <sunke32@huawei.com> Date: Sat, 24 Sep 2022 11:21:27 +0800 Subject: [PATCH 1558/2223] drivers/perf: fix return value check in ali_drw_pmu_probe() In case of error, devm_ioremap_resource() returns ERR_PTR(), and never returns NULL. The NULL test in the return value check should be replaced with IS_ERR(). 
Fixes: cf7b61073e45 ("drivers/perf: add DDR Sub-System Driveway PMU driver for Yitian 710 SoC") Signed-off-by: Sun Ke <sunke32@huawei.com> Acked-by: Will Deacon <will@kernel.org> Reviewed-by: Shuai Xue <xueshuai@linux.alibaba.com> Link: https://lore.kernel.org/r/20220924032127.313156-1-sunke32@huawei.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> --- drivers/perf/alibaba_uncore_drw_pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/perf/alibaba_uncore_drw_pmu.c b/drivers/perf/alibaba_uncore_drw_pmu.c index 82729b874f093..a7689fecb49d9 100644 --- a/drivers/perf/alibaba_uncore_drw_pmu.c +++ b/drivers/perf/alibaba_uncore_drw_pmu.c @@ -658,8 +658,8 @@ static int ali_drw_pmu_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); drw_pmu->cfg_base = devm_ioremap_resource(&pdev->dev, res); - if (!drw_pmu->cfg_base) - return -ENOMEM; + if (IS_ERR(drw_pmu->cfg_base)) + return PTR_ERR(drw_pmu->cfg_base); name = devm_kasprintf(drw_pmu->dev, GFP_KERNEL, "ali_drw_%llx", (u64) (res->start >> ALI_DRW_PMU_PA_SHIFT)); -- GitLab From e08d07dd9f80e997ad36a088eb276509ca484e97 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven <geert+renesas@glider.be> Date: Tue, 27 Sep 2022 15:37:16 +0200 Subject: [PATCH 1559/2223] drivers/perf: ALIBABA_UNCORE_DRW_PMU should depend on ACPI The Alibaba T-Head Yitian 710 DDR Sub-system Driveway PMU driver relies solely on ACPI for matching. Hence add a dependency on ACPI, to prevent asking the user about this driver when configuring a kernel without ACPI support. 
Fixes: cf7b61073e45 ("drivers/perf: add DDR Sub-System Driveway PMU driver for Yitian 710 SoC") Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be> Acked-by: Will Deacon <will@kernel.org> Link: https://lore.kernel.org/r/2a4407bb598285660fa5e604e56823ddb12bb0aa.1664285774.git.geert+renesas@glider.be Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> --- drivers/perf/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 44c07ea487f44..341010f20b777 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -185,7 +185,7 @@ config APPLE_M1_CPU_PMU config ALIBABA_UNCORE_DRW_PMU tristate "Alibaba T-Head Yitian 710 DDR Sub-system Driveway PMU driver" - depends on ARM64 || COMPILE_TEST + depends on (ARM64 && ACPI) || COMPILE_TEST help Support for Driveway PMU events monitoring on Yitian 710 DDR Sub-system. -- GitLab From 5f4853e810943af5e45fcc040cbbcbba07d8fc25 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn <lukas.bulwahn@gmail.com> Date: Thu, 29 Sep 2022 14:29:37 +0200 Subject: [PATCH 1560/2223] MAINTAINERS: rectify file entry in ALIBABA PMU DRIVER Commit cf7b61073e45 ("drivers/perf: add DDR Sub-System Driveway PMU driver for Yitian 710 SoC") adds the DDR Sub-System Driveway PMU driver here: drivers/perf/alibaba_uncore_drw_pmu.c The file entry in MAINTAINERS for the ALIBABA PMU DRIVER, introduced with commit d813a19e7d2e ("MAINTAINERS: add maintainers for Alibaba' T-Head PMU driver"), however refers to: drivers/perf/alibaba_uncore_dwr_pmu.c Note the swapping of characters. Hence, ./scripts/get_maintainer.pl --self-test=patterns complains about a broken file pattern. Repair this file entry in ALIBABA PMU DRIVER. 
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com> Acked-by: Will Deacon <will@kernel.org> Link: https://lore.kernel.org/r/20220929122937.20132-1-lukas.bulwahn@gmail.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 58d0aefead540..a8e0f73747cf0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -753,7 +753,7 @@ ALIBABA PMU DRIVER M: Shuai Xue <xueshuai@linux.alibaba.com> S: Supported F: Documentation/admin-guide/perf/alibaba_pmu.rst -F: drivers/perf/alibaba_uncore_dwr_pmu.c +F: drivers/perf/alibaba_uncore_drw_pmu.c ALIENWARE WMI DRIVER L: Dell.Client.Kernel@dell.com -- GitLab From 4b22ef042d6f54a6e5899555f2db71749133eca8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe <jgg@nvidia.com> Date: Fri, 7 Oct 2022 11:04:39 -0300 Subject: [PATCH 1561/2223] vfio: Add vfio_file_is_group() This replaces uses of vfio_file_iommu_group() which were only detecting if the file is a VFIO file with no interest in the actual group. The only remaining user of vfio_file_iommu_group() is in KVM for the SPAPR stuff. It passes the iommu_group into the arch code through kvm for some reason.
Tested-by: Matthew Rosato <mjrosato@linux.ibm.com> Tested-by: Christian Borntraeger <borntraeger@de.ibm.com> Tested-by: Eric Farman <farman@linux.ibm.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/1-v2-15417f29324e+1c-vfio_group_disassociate_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- drivers/vfio/pci/vfio_pci_core.c | 2 +- drivers/vfio/vfio_main.c | 16 +++++++++++++++- include/linux/vfio.h | 1 + virt/kvm/vfio.c | 20 ++++++++++++++++++-- 4 files changed, 35 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 59a28251bb0b9..badc9d828cac2 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1313,7 +1313,7 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, } /* Ensure the FD is a vfio group FD.*/ - if (!vfio_file_iommu_group(file)) { + if (!vfio_file_is_group(file)) { fput(file); ret = -EINVAL; break; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 9207e6c0e3cb2..9f830d0a25b7a 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1553,17 +1553,31 @@ static const struct file_operations vfio_device_fops = { * @file: VFIO group file * * The returned iommu_group is valid as long as a ref is held on the file. + * This function is deprecated, only the SPAPR path in kvm should call it. 
*/ struct iommu_group *vfio_file_iommu_group(struct file *file) { struct vfio_group *group = file->private_data; - if (file->f_op != &vfio_group_fops) + if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU)) + return NULL; + + if (!vfio_file_is_group(file)) return NULL; return group->iommu_group; } EXPORT_SYMBOL_GPL(vfio_file_iommu_group); +/** + * vfio_file_is_group - True if the file is usable with VFIO aPIS + * @file: VFIO group file + */ +bool vfio_file_is_group(struct file *file) +{ + return file->f_op == &vfio_group_fops; +} +EXPORT_SYMBOL_GPL(vfio_file_is_group); + /** * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file * is always CPU cache coherent diff --git a/include/linux/vfio.h b/include/linux/vfio.h index ee399a768070d..e7cebeb875dd1 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -199,6 +199,7 @@ int vfio_mig_get_next_state(struct vfio_device *device, * External user API */ struct iommu_group *vfio_file_iommu_group(struct file *file); +bool vfio_file_is_group(struct file *file); bool vfio_file_enforced_coherent(struct file *file); void vfio_file_set_kvm(struct file *file, struct kvm *kvm); bool vfio_file_has_dev(struct file *file, struct vfio_device *device); diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index ce1b01d02c519..54aec3b0559c7 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -61,6 +61,23 @@ static bool kvm_vfio_file_enforced_coherent(struct file *file) return ret; } +static bool kvm_vfio_file_is_group(struct file *file) +{ + bool (*fn)(struct file *file); + bool ret; + + fn = symbol_get(vfio_file_is_group); + if (!fn) + return false; + + ret = fn(file); + + symbol_put(vfio_file_is_group); + + return ret; +} + +#ifdef CONFIG_SPAPR_TCE_IOMMU static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file) { struct iommu_group *(*fn)(struct file *file); @@ -77,7 +94,6 @@ static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file) return ret; } -#ifdef CONFIG_SPAPR_TCE_IOMMU static void 
kvm_spapr_tce_release_vfio_group(struct kvm *kvm, struct kvm_vfio_group *kvg) { @@ -136,7 +152,7 @@ static int kvm_vfio_group_add(struct kvm_device *dev, unsigned int fd) return -EBADF; /* Ensure the FD is a vfio group FD.*/ - if (!kvm_vfio_file_iommu_group(filp)) { + if (!kvm_vfio_file_is_group(filp)) { ret = -EINVAL; goto err_fput; } -- GitLab From 819da99a7360f7e197038d12f0eba626bde11856 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe <jgg@nvidia.com> Date: Fri, 7 Oct 2022 11:04:40 -0300 Subject: [PATCH 1562/2223] vfio: Hold a reference to the iommu_group in kvm for SPAPR SPAPR exists completely outside the normal iommu driver framework, the groups it creates are fake and are only created to enable VFIO's uAPI. Thus, it does not need to follow the iommu core rule that the iommu_group will only be touched while a driver is attached. Carry a group reference into KVM and have KVM directly manage the lifetime of this object independently of VFIO. This means KVM no longer relies on the vfio group file being valid to maintain the group reference. Tested-by: Matthew Rosato <mjrosato@linux.ibm.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/2-v2-15417f29324e+1c-vfio_group_disassociate_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- drivers/vfio/vfio_main.c | 6 ++++-- virt/kvm/vfio.c | 25 ++++++++++++++----------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 9f830d0a25b7a..911ee1abdff07 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1552,8 +1552,9 @@ static const struct file_operations vfio_device_fops = { * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file * @file: VFIO group file * - * The returned iommu_group is valid as long as a ref is held on the file. - * This function is deprecated, only the SPAPR path in kvm should call it. 
+ * The returned iommu_group is valid as long as a ref is held on the file. This + * returns a reference on the group. This function is deprecated, only the SPAPR + * path in kvm should call it. */ struct iommu_group *vfio_file_iommu_group(struct file *file) { @@ -1564,6 +1565,7 @@ struct iommu_group *vfio_file_iommu_group(struct file *file) if (!vfio_file_is_group(file)) return NULL; + iommu_group_ref_get(group->iommu_group); return group->iommu_group; } EXPORT_SYMBOL_GPL(vfio_file_iommu_group); diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 54aec3b0559c7..495ceabffe88b 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -24,6 +24,9 @@ struct kvm_vfio_group { struct list_head node; struct file *file; +#ifdef CONFIG_SPAPR_TCE_IOMMU + struct iommu_group *iommu_group; +#endif }; struct kvm_vfio { @@ -97,12 +100,12 @@ static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file) static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm, struct kvm_vfio_group *kvg) { - struct iommu_group *grp = kvm_vfio_file_iommu_group(kvg->file); - - if (WARN_ON_ONCE(!grp)) + if (WARN_ON_ONCE(!kvg->iommu_group)) return; - kvm_spapr_tce_release_iommu_group(kvm, grp); + kvm_spapr_tce_release_iommu_group(kvm, kvg->iommu_group); + iommu_group_put(kvg->iommu_group); + kvg->iommu_group = NULL; } #endif @@ -252,19 +255,19 @@ static int kvm_vfio_group_set_spapr_tce(struct kvm_device *dev, mutex_lock(&kv->lock); list_for_each_entry(kvg, &kv->group_list, node) { - struct iommu_group *grp; - if (kvg->file != f.file) continue; - grp = kvm_vfio_file_iommu_group(kvg->file); - if (WARN_ON_ONCE(!grp)) { - ret = -EIO; - goto err_fdput; + if (!kvg->iommu_group) { + kvg->iommu_group = kvm_vfio_file_iommu_group(kvg->file); + if (WARN_ON_ONCE(!kvg->iommu_group)) { + ret = -EIO; + goto err_fdput; + } } ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, param.tablefd, - grp); + kvg->iommu_group); break; } -- GitLab From 3dd59a7dcb97e6e40d6385a1a3faa9392b6d184a Mon Sep 17 00:00:00 2001 
From: Jason Gunthorpe <jgg@nvidia.com> Date: Fri, 7 Oct 2022 11:04:41 -0300 Subject: [PATCH 1563/2223] vfio: Make the group FD disassociate from the iommu_group Allow the vfio_group struct to exist with a NULL iommu_group pointer. When the pointer is NULL the vfio_group users promise not to touch the iommu_group. This allows a driver to be hot unplugged while userspace is keeping the group FD open. Remove all the code waiting for the group FD to close. This fixes a userspace regression where we learned that virtnodedevd leaves a group FD open even though the /dev/ node for it has been deleted and all the drivers for it unplugged. Fixes: ca5f21b25749 ("vfio: Follow a strict lifetime for struct iommu_group") Reported-by: Christian Borntraeger <borntraeger@linux.ibm.com> Tested-by: Matthew Rosato <mjrosato@linux.ibm.com> Tested-by: Christian Borntraeger <borntraeger@de.ibm.com> Tested-by: Eric Farman <farman@linux.ibm.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/3-v2-15417f29324e+1c-vfio_group_disassociate_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- drivers/vfio/vfio.h | 1 - drivers/vfio/vfio_main.c | 67 ++++++++++++++++++++++++++-------------- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 4a1bac1359a95..bcad54bbab08c 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -59,7 +59,6 @@ struct vfio_group { struct mutex group_lock; struct kvm *kvm; struct file *opened_file; - struct swait_queue_head opened_file_wait; struct blocking_notifier_head notifier; }; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 911ee1abdff07..04099a839a52a 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -133,6 +133,10 @@ __vfio_group_get_from_iommu(struct iommu_group *iommu_group) { struct vfio_group *group; + /* + * group->iommu_group from the vfio.group_list cannot be NULL + * under the 
vfio.group_lock. + */ list_for_each_entry(group, &vfio.group_list, vfio_next) { if (group->iommu_group == iommu_group) { refcount_inc(&group->drivers); @@ -159,7 +163,7 @@ static void vfio_group_release(struct device *dev) mutex_destroy(&group->device_lock); mutex_destroy(&group->group_lock); - iommu_group_put(group->iommu_group); + WARN_ON(group->iommu_group); ida_free(&vfio.group_ida, MINOR(group->dev.devt)); kfree(group); } @@ -189,7 +193,6 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, refcount_set(&group->drivers, 1); mutex_init(&group->group_lock); - init_swait_queue_head(&group->opened_file_wait); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); group->iommu_group = iommu_group; @@ -248,6 +251,7 @@ err_put: static void vfio_device_remove_group(struct vfio_device *device) { struct vfio_group *group = device->group; + struct iommu_group *iommu_group; if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) iommu_group_remove_device(device->dev); @@ -265,31 +269,29 @@ static void vfio_device_remove_group(struct vfio_device *device) */ cdev_device_del(&group->cdev, &group->dev); - /* - * Before we allow the last driver in the group to be unplugged the - * group must be sanitized so nothing else is or can reference it. This - * is because the group->iommu_group pointer should only be used so long - * as a device driver is attached to a device in the group. - */ - while (group->opened_file) { - mutex_unlock(&vfio.group_lock); - swait_event_idle_exclusive(group->opened_file_wait, - !group->opened_file); - mutex_lock(&vfio.group_lock); - } - mutex_unlock(&vfio.group_lock); - + mutex_lock(&group->group_lock); /* * These data structures all have paired operations that can only be - * undone when the caller holds a live reference on the group. Since all - * pairs must be undone these WARN_ON's indicate some caller did not + * undone when the caller holds a live reference on the device. 
Since + * all pairs must be undone these WARN_ON's indicate some caller did not * properly hold the group reference. */ WARN_ON(!list_empty(&group->device_list)); - WARN_ON(group->container || group->container_users); WARN_ON(group->notifier.head); + + /* + * Revoke all users of group->iommu_group. At this point we know there + * are no devices active because we are unplugging the last one. Setting + * iommu_group to NULL blocks all new users. + */ + if (group->container) + vfio_group_detach_container(group); + iommu_group = group->iommu_group; group->iommu_group = NULL; + mutex_unlock(&group->group_lock); + mutex_unlock(&vfio.group_lock); + iommu_group_put(iommu_group); put_device(&group->dev); } @@ -531,6 +533,10 @@ static int __vfio_register_dev(struct vfio_device *device, existing_device = vfio_group_get_device(group, device->dev); if (existing_device) { + /* + * group->iommu_group is non-NULL because we hold the drivers + * refcount. + */ dev_WARN(device->dev, "Device already exists on group %d\n", iommu_group_id(group->iommu_group)); vfio_device_put_registration(existing_device); @@ -702,6 +708,11 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, ret = -EINVAL; goto out_unlock; } + if (!group->iommu_group) { + ret = -ENODEV; + goto out_unlock; + } + container = vfio_container_from_file(f.file); ret = -EINVAL; if (container) { @@ -862,6 +873,11 @@ static int vfio_group_ioctl_get_status(struct vfio_group *group, status.flags = 0; mutex_lock(&group->group_lock); + if (!group->iommu_group) { + mutex_unlock(&group->group_lock); + return -ENODEV; + } + if (group->container) status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | VFIO_GROUP_FLAGS_VIABLE; @@ -947,8 +963,6 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) vfio_group_detach_container(group); group->opened_file = NULL; mutex_unlock(&group->group_lock); - swake_up_one(&group->opened_file_wait); - return 0; } @@ -1559,14 +1573,21 @@ static const struct 
file_operations vfio_device_fops = { struct iommu_group *vfio_file_iommu_group(struct file *file) { struct vfio_group *group = file->private_data; + struct iommu_group *iommu_group = NULL; if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU)) return NULL; if (!vfio_file_is_group(file)) return NULL; - iommu_group_ref_get(group->iommu_group); - return group->iommu_group; + + mutex_lock(&group->group_lock); + if (group->iommu_group) { + iommu_group = group->iommu_group; + iommu_group_ref_get(iommu_group); + } + mutex_unlock(&group->group_lock); + return iommu_group; } EXPORT_SYMBOL_GPL(vfio_file_iommu_group); -- GitLab From c9133112f347907774055bbf73179a7ff8504689 Mon Sep 17 00:00:00 2001 From: Juergen Gross <jgross@suse.com> Date: Mon, 29 Aug 2022 13:26:07 +0200 Subject: [PATCH 1564/2223] xen/virtio: restructure xen grant dma setup In order to prepare supporting other means than device tree for setting up virtio devices under Xen, restructure the functions xen_is_grant_dma_device() and xen_grant_setup_dma_ops() a little bit. 
Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Tested-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> # Arm64 only Acked-by: Stefano Stabellini <sstabellini@kernel.org> Signed-off-by: Juergen Gross <jgross@suse.com> --- drivers/xen/grant-dma-ops.c | 68 +++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c index c66f56d24013b..7133272918f0f 100644 --- a/drivers/xen/grant-dma-ops.c +++ b/drivers/xen/grant-dma-ops.c @@ -289,22 +289,28 @@ static const struct dma_map_ops xen_grant_dma_ops = { .dma_supported = xen_grant_dma_supported, }; -bool xen_is_grant_dma_device(struct device *dev) +static bool xen_is_dt_grant_dma_device(struct device *dev) { struct device_node *iommu_np; bool has_iommu; - /* XXX Handle only DT devices for now */ - if (!dev->of_node) - return false; - iommu_np = of_parse_phandle(dev->of_node, "iommus", 0); - has_iommu = iommu_np && of_device_is_compatible(iommu_np, "xen,grant-dma"); + has_iommu = iommu_np && + of_device_is_compatible(iommu_np, "xen,grant-dma"); of_node_put(iommu_np); return has_iommu; } +bool xen_is_grant_dma_device(struct device *dev) +{ + /* XXX Handle only DT devices for now */ + if (dev->of_node) + return xen_is_dt_grant_dma_device(dev); + + return false; +} + bool xen_virtio_mem_acc(struct virtio_device *dev) { if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) @@ -313,45 +319,56 @@ bool xen_virtio_mem_acc(struct virtio_device *dev) return xen_is_grant_dma_device(dev->dev.parent); } -void xen_grant_setup_dma_ops(struct device *dev) +static int xen_dt_grant_init_backend_domid(struct device *dev, + struct xen_grant_dma_data *data) { - struct xen_grant_dma_data *data; struct of_phandle_args iommu_spec; - data = find_xen_grant_dma_data(dev); - if (data) { - dev_err(dev, "Xen grant DMA data is already created\n"); - return; - } - - /* XXX ACPI device 
unsupported for now */ - if (!dev->of_node) - goto err; - if (of_parse_phandle_with_args(dev->of_node, "iommus", "#iommu-cells", 0, &iommu_spec)) { dev_err(dev, "Cannot parse iommus property\n"); - goto err; + return -ESRCH; } if (!of_device_is_compatible(iommu_spec.np, "xen,grant-dma") || iommu_spec.args_count != 1) { dev_err(dev, "Incompatible IOMMU node\n"); of_node_put(iommu_spec.np); - goto err; + return -ESRCH; } of_node_put(iommu_spec.np); + /* + * The endpoint ID here means the ID of the domain where the + * corresponding backend is running + */ + data->backend_domid = iommu_spec.args[0]; + + return 0; +} + +void xen_grant_setup_dma_ops(struct device *dev) +{ + struct xen_grant_dma_data *data; + + data = find_xen_grant_dma_data(dev); + if (data) { + dev_err(dev, "Xen grant DMA data is already created\n"); + return; + } + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); if (!data) goto err; - /* - * The endpoint ID here means the ID of the domain where the corresponding - * backend is running - */ - data->backend_domid = iommu_spec.args[0]; + if (dev->of_node) { + if (xen_dt_grant_init_backend_domid(dev, data)) + goto err; + } else { + /* XXX ACPI device unsupported for now */ + goto err; + } if (store_xen_grant_dma_data(dev, data)) { dev_err(dev, "Cannot store Xen grant DMA data\n"); @@ -363,6 +380,7 @@ void xen_grant_setup_dma_ops(struct device *dev) return; err: + devm_kfree(dev, data); dev_err(dev, "Cannot set up Xen grant DMA ops, retain platform DMA ops\n"); } -- GitLab From 7228113d1fa0107a377aef71094d610eb8824aa2 Mon Sep 17 00:00:00 2001 From: Juergen Gross <jgross@suse.com> Date: Mon, 29 Aug 2022 13:26:08 +0200 Subject: [PATCH 1565/2223] xen/virtio: use dom0 as default backend for CONFIG_XEN_VIRTIO_FORCE_GRANT With CONFIG_XEN_VIRTIO_FORCE_GRANT set the default backend domid to 0, enabling to use xen_grant_dma_ops for those devices. 
Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Acked-by: Stefano Stabellini <sstabellini@kernel.org> Signed-off-by: Juergen Gross <jgross@suse.com> --- drivers/xen/grant-dma-ops.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c index 7133272918f0f..3e4c590896d08 100644 --- a/drivers/xen/grant-dma-ops.c +++ b/drivers/xen/grant-dma-ops.c @@ -365,6 +365,9 @@ void xen_grant_setup_dma_ops(struct device *dev) if (dev->of_node) { if (xen_dt_grant_init_backend_domid(dev, data)) goto err; + } else if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) { + dev_info(dev, "Using dom0 as backend\n"); + data->backend_domid = 0; } else { /* XXX ACPI device unsupported for now */ goto err; -- GitLab From 96dbcc0072acf4f9565a16e8da96e57e5cee1068 Mon Sep 17 00:00:00 2001 From: Filipe Manana <fdmanana@suse.com> Date: Mon, 3 Oct 2022 15:57:30 +0100 Subject: [PATCH 1566/2223] btrfs: add missing path cache update during fiemap When looking up the stored result for a cached path node, if the stored result is valid and has a value of true, we must update all the nodes for all levels below it with a result of true as well. This is necessary when moving from one leaf in the fs tree to the next one, as well as when moving from a node at any level to the next node at the same level. Currently this logic is missing as it was somehow forgotten by a recent patch with the subject: "btrfs: speedup checking for extent sharedness during fiemap". This adds the missing logic, which is the counterpart to what we do when adding a shared node to the cache at store_backref_shared_cache(). 
Fixes: 12a824dc67a6 ("btrfs: speedup checking for extent sharedness during fiemap") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> --- fs/btrfs/backref.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index dce3a16996b95..3c0c1f626c75d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1557,6 +1557,19 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache return false; *is_shared = entry->is_shared; + /* + * If the node at this level is shared, than all nodes below are also + * shared. Currently some of the nodes below may be marked as not shared + * because we have just switched from one leaf to another, and switched + * also other nodes above the leaf and below the current level, so mark + * them as shared. + */ + if (*is_shared) { + for (int i = 0; i < level; i++) { + cache->entries[i].is_shared = true; + cache->entries[i].gen = entry->gen; + } + } return true; } -- GitLab From 78b1c6584fcedcf2d9687a4455c461859094cf04 Mon Sep 17 00:00:00 2001 From: David Gow <davidgow@google.com> Date: Fri, 22 Jul 2022 17:15:30 +0000 Subject: [PATCH 1567/2223] kunit: string-stream: Simplify resource use Currently, KUnit's string streams are themselves "KUnit resources". This is redundant since the stream itself is already allocated with kunit_kzalloc() and will thus be freed automatically at the end of the test. string-stream is only used internally within KUnit, and isn't using the extra features that resources provide like reference counting, being able to locate them dynamically as "test-local variables", etc. Indeed, the resource's refcount is never incremented when the pointer is returned. The fact that it's always manually destroyed is more evidence that the reference counting is unused. 
Signed-off-by: David Gow <davidgow@google.com> Signed-off-by: Daniel Latypov <dlatypov@google.com> Reviewed-by: Brendan Higgins <brendanhiggins@google.com> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org> --- lib/kunit/string-stream.c | 90 +++++++-------------------------------- lib/kunit/string-stream.h | 2 +- lib/kunit/test.c | 2 +- 3 files changed, 18 insertions(+), 76 deletions(-) diff --git a/lib/kunit/string-stream.c b/lib/kunit/string-stream.c index 141789ca8949b..a2496abef152c 100644 --- a/lib/kunit/string-stream.c +++ b/lib/kunit/string-stream.c @@ -12,64 +12,31 @@ #include "string-stream.h" -struct string_stream_fragment_alloc_context { - struct kunit *test; - int len; - gfp_t gfp; -}; -static int string_stream_fragment_init(struct kunit_resource *res, - void *context) +static struct string_stream_fragment *alloc_string_stream_fragment( + struct kunit *test, int len, gfp_t gfp) { - struct string_stream_fragment_alloc_context *ctx = context; struct string_stream_fragment *frag; - frag = kunit_kzalloc(ctx->test, sizeof(*frag), ctx->gfp); + frag = kunit_kzalloc(test, sizeof(*frag), gfp); if (!frag) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - frag->test = ctx->test; - frag->fragment = kunit_kmalloc(ctx->test, ctx->len, ctx->gfp); + frag->test = test; + frag->fragment = kunit_kmalloc(test, len, gfp); if (!frag->fragment) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - res->data = frag; - - return 0; + return frag; } -static void string_stream_fragment_free(struct kunit_resource *res) +static void string_stream_fragment_destroy(struct string_stream_fragment *frag) { - struct string_stream_fragment *frag = res->data; - list_del(&frag->node); kunit_kfree(frag->test, frag->fragment); kunit_kfree(frag->test, frag); } -static struct string_stream_fragment *alloc_string_stream_fragment( - struct kunit *test, int len, gfp_t gfp) -{ - struct string_stream_fragment_alloc_context context = { - .test = test, - .len = len, - .gfp = gfp - }; - - return 
kunit_alloc_resource(test, - string_stream_fragment_init, - string_stream_fragment_free, - gfp, - &context); -} - -static int string_stream_fragment_destroy(struct string_stream_fragment *frag) -{ - return kunit_destroy_resource(frag->test, - kunit_resource_instance_match, - frag); -} - int string_stream_vadd(struct string_stream *stream, const char *fmt, va_list args) @@ -169,48 +136,23 @@ struct string_stream_alloc_context { gfp_t gfp; }; -static int string_stream_init(struct kunit_resource *res, void *context) +struct string_stream *alloc_string_stream(struct kunit *test, gfp_t gfp) { struct string_stream *stream; - struct string_stream_alloc_context *ctx = context; - stream = kunit_kzalloc(ctx->test, sizeof(*stream), ctx->gfp); + stream = kunit_kzalloc(test, sizeof(*stream), gfp); if (!stream) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - res->data = stream; - stream->gfp = ctx->gfp; - stream->test = ctx->test; + stream->gfp = gfp; + stream->test = test; INIT_LIST_HEAD(&stream->fragments); spin_lock_init(&stream->lock); - return 0; + return stream; } -static void string_stream_free(struct kunit_resource *res) +void string_stream_destroy(struct string_stream *stream) { - struct string_stream *stream = res->data; - string_stream_clear(stream); } - -struct string_stream *alloc_string_stream(struct kunit *test, gfp_t gfp) -{ - struct string_stream_alloc_context context = { - .test = test, - .gfp = gfp - }; - - return kunit_alloc_resource(test, - string_stream_init, - string_stream_free, - gfp, - &context); -} - -int string_stream_destroy(struct string_stream *stream) -{ - return kunit_destroy_resource(stream->test, - kunit_resource_instance_match, - stream); -} diff --git a/lib/kunit/string-stream.h b/lib/kunit/string-stream.h index 43f9508a55b40..494dee0f24bde 100644 --- a/lib/kunit/string-stream.h +++ b/lib/kunit/string-stream.h @@ -46,6 +46,6 @@ int string_stream_append(struct string_stream *stream, bool string_stream_is_empty(struct string_stream *stream); 
-int string_stream_destroy(struct string_stream *stream); +void string_stream_destroy(struct string_stream *stream); #endif /* _KUNIT_STRING_STREAM_H */ diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 1e54373309a41..638df598bcf91 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -278,7 +278,7 @@ static void kunit_fail(struct kunit *test, const struct kunit_loc *loc, kunit_print_string_stream(test, stream); - WARN_ON(string_stream_destroy(stream)); + string_stream_destroy(stream); } static void __noreturn kunit_abort(struct kunit *test) -- GitLab From 4db4598b5ed8fc26f5fd9312623a9ec5cebbe74a Mon Sep 17 00:00:00 2001 From: Daniel Latypov <dlatypov@google.com> Date: Fri, 22 Jul 2022 17:15:31 +0000 Subject: [PATCH 1568/2223] kunit: drop test pointer in string_stream_fragment We already store the `struct kunit *test` in the string_stream object itself, so we don't need to store a copy of this pointer in every fragment in the stream. Drop it, getting string_stream_fragment down to the bare minimum: a list_head and the `char *` with the actual fragment. 
Signed-off-by: Daniel Latypov <dlatypov@google.com> Reviewed-by: David Gow <davidgow@google.com> Reviewed-by: Brendan Higgins <brendanhiggins@google.com> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org> --- lib/kunit/string-stream.c | 10 +++++----- lib/kunit/string-stream.h | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/kunit/string-stream.c b/lib/kunit/string-stream.c index a2496abef152c..f5ae79c374003 100644 --- a/lib/kunit/string-stream.c +++ b/lib/kunit/string-stream.c @@ -22,7 +22,6 @@ static struct string_stream_fragment *alloc_string_stream_fragment( if (!frag) return ERR_PTR(-ENOMEM); - frag->test = test; frag->fragment = kunit_kmalloc(test, len, gfp); if (!frag->fragment) return ERR_PTR(-ENOMEM); @@ -30,11 +29,12 @@ static struct string_stream_fragment *alloc_string_stream_fragment( return frag; } -static void string_stream_fragment_destroy(struct string_stream_fragment *frag) +static void string_stream_fragment_destroy(struct kunit *test, + struct string_stream_fragment *frag) { list_del(&frag->node); - kunit_kfree(frag->test, frag->fragment); - kunit_kfree(frag->test, frag); + kunit_kfree(test, frag->fragment); + kunit_kfree(test, frag); } int string_stream_vadd(struct string_stream *stream, @@ -89,7 +89,7 @@ static void string_stream_clear(struct string_stream *stream) frag_container_safe, &stream->fragments, node) { - string_stream_fragment_destroy(frag_container); + string_stream_fragment_destroy(stream->test, frag_container); } stream->length = 0; spin_unlock(&stream->lock); diff --git a/lib/kunit/string-stream.h b/lib/kunit/string-stream.h index 494dee0f24bde..b669f9a75a948 100644 --- a/lib/kunit/string-stream.h +++ b/lib/kunit/string-stream.h @@ -14,7 +14,6 @@ #include <linux/stdarg.h> struct string_stream_fragment { - struct kunit *test; struct list_head node; char *fragment; }; -- GitLab From 047a8a0a2da716fecfd325d21ccf509c431992d9 Mon Sep 17 00:00:00 2001 From: Daniel Latypov <dlatypov@google.com> Date: Fri, 22 Jul 
2022 17:15:32 +0000 Subject: [PATCH 1569/2223] kunit: make kunit_kfree() only work on pointers from kunit_malloc() and friends kunit_kfree() exists to clean up allocations from kunit_kmalloc() and friends early instead of waiting for this to happen automatically at the end of the test. But it can be used on *anything* registered with the kunit resource API. E.g. the last 2 statements are equivalent: struct kunit_resource *res = something(); kfree(res->data); kunit_put_resource(res); The problem is that there could be multiple resources that point to the same `data`. E.g. you can have a named resource acting as a pseudo-global variable in a test. If you point it to data allocated with kunit_kmalloc(), then calling `kunit_kfree(ptr)` has the chance to delete either the named resource or to kfree `ptr`. Which one it does depends on the order the resources are registered as kunit_kfree() will delete resources in LIFO order. So this patch restricts kunit_kfree() to only working on resources created by kunit_kmalloc(). Calling it is therefore guaranteed to free the memory, not do anything else. Note: kunit_resource_instance_match() wasn't used outside of KUnit, so it should be safe to remove from the public interface. It's also generally dangerous, as shown above, and shouldn't be used. 
Signed-off-by: Daniel Latypov <dlatypov@google.com> Reviewed-by: David Gow <davidgow@google.com> Reviewed-by: Brendan Higgins <brendanhiggins@google.com> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org> --- include/kunit/resource.h | 16 ---------------- lib/kunit/kunit-test.c | 7 +++++++ lib/kunit/test.c | 10 ++++++++-- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/include/kunit/resource.h b/include/kunit/resource.h index 09c2b34d1c613..cf6fb8f2ac1bd 100644 --- a/include/kunit/resource.h +++ b/include/kunit/resource.h @@ -300,22 +300,6 @@ typedef bool (*kunit_resource_match_t)(struct kunit *test, struct kunit_resource *res, void *match_data); -/** - * kunit_resource_instance_match() - Match a resource with the same instance. - * @test: Test case to which the resource belongs. - * @res: The resource. - * @match_data: The resource pointer to match against. - * - * An instance of kunit_resource_match_t that matches a resource whose - * allocation matches @match_data. - */ -static inline bool kunit_resource_instance_match(struct kunit *test, - struct kunit_resource *res, - void *match_data) -{ - return res->data == match_data; -} - /** * kunit_resource_name_match() - Match a resource with the same name. * @test: Test case to which the resource belongs. 
diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c index 13d0bd8b07a98..4df0335d0d06e 100644 --- a/lib/kunit/kunit-test.c +++ b/lib/kunit/kunit-test.c @@ -161,6 +161,13 @@ static void kunit_resource_test_alloc_resource(struct kunit *test) kunit_put_resource(res); } +static inline bool kunit_resource_instance_match(struct kunit *test, + struct kunit_resource *res, + void *match_data) +{ + return res->data == match_data; +} + /* * Note: tests below use kunit_alloc_and_get_resource(), so as a consequence * they have a reference to the associated resource that they must release diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 638df598bcf91..0f9c1fb32da7e 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -713,12 +713,18 @@ void *kunit_kmalloc_array(struct kunit *test, size_t n, size_t size, gfp_t gfp) } EXPORT_SYMBOL_GPL(kunit_kmalloc_array); +static inline bool kunit_kfree_match(struct kunit *test, + struct kunit_resource *res, void *match_data) +{ + /* Only match resources allocated with kunit_kmalloc() and friends. */ + return res->free == kunit_kmalloc_array_free && res->data == match_data; +} + void kunit_kfree(struct kunit *test, const void *ptr) { struct kunit_resource *res; - res = kunit_find_resource(test, kunit_resource_instance_match, - (void *)ptr); + res = kunit_find_resource(test, kunit_kfree_match, (void *)ptr); /* * Removing the resource from the list of resources drops the -- GitLab From e562e309d1d4ac05457c1454b6007071f13b5684 Mon Sep 17 00:00:00 2001 From: Daniel Latypov <dlatypov@google.com> Date: Fri, 22 Jul 2022 17:15:33 +0000 Subject: [PATCH 1570/2223] kunit: make kunit_kfree() not segfault on invalid inputs kunit_kfree() can only work on data ("resources") allocated by KUnit. Currently for code like this, > void *ptr = kmalloc(4, GFP_KERNEL); > kunit_kfree(test, ptr); kunit_kfree() will segfault. It'll try and look up the kunit_resource associated with `ptr` and get a NULL back, but it won't check for this. 
This means we also segfault if you double-free. Change kunit_kfree() so it'll notice these invalid pointers and respond by failing the test. Implementation: kunit_destroy_resource() does what kunit_kfree() does, but is more generic and returns -ENOENT when it can't find the resource. Sadly, unlike just letting it crash, this means we don't get a stack trace. But kunit_kfree() is so infrequently used it shouldn't be hard to track down the bad callsite anyways. After this change, the above code gives: > # example_simple_test: EXPECTATION FAILED at lib/kunit/test.c:702 > kunit_kfree: 00000000626ec200 already freed or not allocated by kunit Signed-off-by: Daniel Latypov <dlatypov@google.com> Reviewed-by: David Gow <davidgow@google.com> Reviewed-by: Brendan Higgins <brendanhiggins@google.com> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org> --- lib/kunit/test.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 0f9c1fb32da7e..0e9ff5d8fe845 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -722,18 +722,8 @@ static inline bool kunit_kfree_match(struct kunit *test, void kunit_kfree(struct kunit *test, const void *ptr) { - struct kunit_resource *res; - - res = kunit_find_resource(test, kunit_kfree_match, (void *)ptr); - - /* - * Removing the resource from the list of resources drops the - * reference count to 1; the final put will trigger the free. 
- */ - kunit_remove_resource(test, res); - - kunit_put_resource(res); - + if (kunit_destroy_resource(test, kunit_kfree_match, (void *)ptr)) + KUNIT_FAIL(test, "kunit_kfree: %px already freed or not allocated by kunit", ptr); } EXPORT_SYMBOL_GPL(kunit_kfree); -- GitLab From 185d57797c5ea82e941befc2489dba0cf162b9c4 Mon Sep 17 00:00:00 2001 From: Daniel Latypov <dlatypov@google.com> Date: Fri, 22 Jul 2022 17:15:34 +0000 Subject: [PATCH 1571/2223] kunit: make kunit_kfree(NULL) a no-op to match kfree() The real kfree() function will silently return when given a NULL. So a user might reasonably think they can write the following code: char *buffer = NULL; if (param->use_buffer) buffer = kunit_kzalloc(test, 10, GFP_KERNEL); ... kunit_kfree(test, buffer); As-is, kunit_kfree() will mark the test as FAILED when buffer is NULL. (And in earlier times, it would segfault). Let's match the semantics of kfree(). Suggested-by: David Gow <davidgow@google.com> Signed-off-by: Daniel Latypov <dlatypov@google.com> Reviewed-by: David Gow <davidgow@google.com> Reviewed-by: Brendan Higgins <brendanhiggins@google.com> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org> --- lib/kunit/test.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 0e9ff5d8fe845..46471bda351e5 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -722,6 +722,9 @@ static inline bool kunit_kfree_match(struct kunit *test, void kunit_kfree(struct kunit *test, const void *ptr) { + if (!ptr) + return; + if (kunit_destroy_resource(test, kunit_kfree_match, (void *)ptr)) KUNIT_FAIL(test, "kunit_kfree: %px already freed or not allocated by kunit", ptr); } -- GitLab From 3c4fc7bf4c9e66fe71abcbf93f62f4ddb89b7f15 Mon Sep 17 00:00:00 2001 From: David Gow <davidgow@google.com> Date: Fri, 23 Sep 2022 13:00:39 +0800 Subject: [PATCH 1572/2223] kunit: tool: Don't download risc-v opensbi firmware with wget When running a RISC-V test kernel under QEMU, we need an OpenSBI BIOS file. 
In the original QEMU support patchset, kunit_tool would optionally download this file from GitHub if it didn't exist, using wget. These days, it can usually be found in the distro's qemu-system-riscv package, and is located in /usr/share/qemu on all the distros I tried (Debian, Arch, OpenSUSE). Use this file, and thereby don't do any downloading in kunit_tool. In addition, we used to shell out to whatever 'wget' was in the path, which could have potentially been used to trick the developer into running another binary. By not using wget at all, we nicely sidestep this issue. Cc: Xu Panda <xu.panda@zte.com.cn> Fixes: 87c9c1631788 ("kunit: tool: add support for QEMU") Reported-by: Zeal Robot <zealci@zte.com.cn> Signed-off-by: David Gow <davidgow@google.com> Tested-by: Daniel Latypov <dlatypov@google.com> Reviewed-by: Brendan Higgins <brendanhiggins@google.com> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org> --- tools/testing/kunit/qemu_configs/riscv.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/tools/testing/kunit/qemu_configs/riscv.py b/tools/testing/kunit/qemu_configs/riscv.py index 6207be146d26e..12a1d525978a2 100644 --- a/tools/testing/kunit/qemu_configs/riscv.py +++ b/tools/testing/kunit/qemu_configs/riscv.py @@ -3,17 +3,13 @@ import os import os.path import sys -GITHUB_OPENSBI_URL = 'https://github.com/qemu/qemu/raw/master/pc-bios/opensbi-riscv64-generic-fw_dynamic.bin' -OPENSBI_FILE = os.path.basename(GITHUB_OPENSBI_URL) +OPENSBI_FILE = 'opensbi-riscv64-generic-fw_dynamic.bin' +OPENSBI_PATH = '/usr/share/qemu/' + OPENSBI_FILE -if not os.path.isfile(OPENSBI_FILE): - print('\n\nOpenSBI file is not in the current working directory.\n' - 'Would you like me to download it for you from:\n' + GITHUB_OPENSBI_URL + ' ?\n') - response = input('yes/[no]: ') - if response.strip() == 'yes': - os.system('wget ' + GITHUB_OPENSBI_URL) - else: - sys.exit() +if not os.path.isfile(OPENSBI_PATH): + print('\n\nOpenSBI bios was not 
found in "' + OPENSBI_PATH + '".\n' + 'Please ensure that qemu-system-riscv is installed, or edit the path in "qemu_configs/riscv.py"\n') + sys.exit() QEMU_ARCH = QemuArchParams(linux_arch='riscv', kconfig=''' @@ -29,4 +25,4 @@ CONFIG_SERIAL_EARLYCON_RISCV_SBI=y''', extra_qemu_params=[ '-machine', 'virt', '-cpu', 'rv64', - '-bios', 'opensbi-riscv64-generic-fw_dynamic.bin']) + '-bios', OPENSBI_PATH]) -- GitLab From a8495ad8e973cb6aabbe855d3dfb66ec4c9b281a Mon Sep 17 00:00:00 2001 From: Daniel Latypov <dlatypov@google.com> Date: Fri, 30 Sep 2022 17:26:35 -0700 Subject: [PATCH 1573/2223] kunit: remove format func from struct kunit_assert, get it to 0 bytes Each call to a KUNIT_EXPECT_*() macro creates a local variable which contains a struct kunit_assert. Normally, we'd hope the compiler would be able to optimize this away, but we've seen cases where it hasn't, see https://groups.google.com/g/kunit-dev/c/i3fZXgvBrfA/m/GbrMNej2BAAJ. In changes like commit 21957f90b28f ("kunit: split out part of kunit_assert into a static const"), we've moved more and more parts out of struct kunit_assert and its children types (kunit_binary_assert). This patch removes the final field and gets us to: sizeof(struct kunit_assert) == 0 sizeof(struct kunit_binary_assert) == 24 (on UML x86_64). This also reduces the amount of macro plumbing going on at the cost of passing in one more arg to the base KUNIT_ASSERTION macro and kunit_do_failed_assertion(). 
Signed-off-by: Daniel Latypov <dlatypov@google.com> Reviewed-by: David Gow <davidgow@google.com> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org> --- include/kunit/assert.h | 28 ++++++---------------------- include/kunit/test.h | 17 +++++++++++------ lib/kunit/test.c | 7 ++++--- 3 files changed, 21 insertions(+), 31 deletions(-) diff --git a/include/kunit/assert.h b/include/kunit/assert.h index 4b52e12c2ae83..ace3de8d1ee79 100644 --- a/include/kunit/assert.h +++ b/include/kunit/assert.h @@ -42,16 +42,15 @@ struct kunit_loc { /** * struct kunit_assert - Data for printing a failed assertion or expectation. - * @format: a function which formats the data in this kunit_assert to a string. * * Represents a failed expectation/assertion. Contains all the data necessary to * format a string to a user reporting the failure. */ -struct kunit_assert { - void (*format)(const struct kunit_assert *assert, - const struct va_format *message, - struct string_stream *stream); -}; +struct kunit_assert {}; + +typedef void (*assert_format_t)(const struct kunit_assert *assert, + const struct va_format *message, + struct string_stream *stream); void kunit_assert_prologue(const struct kunit_loc *loc, enum kunit_assert_type type, @@ -71,16 +70,6 @@ void kunit_fail_assert_format(const struct kunit_assert *assert, const struct va_format *message, struct string_stream *stream); -/** - * KUNIT_INIT_FAIL_ASSERT_STRUCT - Initializer for &struct kunit_fail_assert. - * - * Initializes a &struct kunit_fail_assert. Intended to be used in - * KUNIT_EXPECT_* and KUNIT_ASSERT_* macros. - */ -#define KUNIT_INIT_FAIL_ASSERT_STRUCT { \ - .assert = { .format = kunit_fail_assert_format }, \ -} - /** * struct kunit_unary_assert - Represents a KUNIT_{EXPECT|ASSERT}_{TRUE|FALSE} * @assert: The parent of this type. @@ -110,7 +99,6 @@ void kunit_unary_assert_format(const struct kunit_assert *assert, * KUNIT_EXPECT_* and KUNIT_ASSERT_* macros. 
*/ #define KUNIT_INIT_UNARY_ASSERT_STRUCT(cond, expect_true) { \ - .assert = { .format = kunit_unary_assert_format }, \ .condition = cond, \ .expected_true = expect_true \ } @@ -145,7 +133,6 @@ void kunit_ptr_not_err_assert_format(const struct kunit_assert *assert, * KUNIT_EXPECT_* and KUNIT_ASSERT_* macros. */ #define KUNIT_INIT_PTR_NOT_ERR_STRUCT(txt, val) { \ - .assert = { .format = kunit_ptr_not_err_assert_format }, \ .text = txt, \ .value = val \ } @@ -190,7 +177,6 @@ void kunit_binary_assert_format(const struct kunit_assert *assert, * KUNIT_INIT_BINARY_ASSERT_STRUCT() - Initializes a binary assert like * kunit_binary_assert, kunit_binary_ptr_assert, etc. * - * @format_func: a function which formats the assert to a string. * @text_: Pointer to a kunit_binary_assert_text. * @left_val: The actual evaluated value of the expression in the left slot. * @right_val: The actual evaluated value of the expression in the right slot. @@ -200,11 +186,9 @@ void kunit_binary_assert_format(const struct kunit_assert *assert, * fields but with different types for left_val/right_val. * This is ultimately used by binary assertion macros like KUNIT_EXPECT_EQ, etc. */ -#define KUNIT_INIT_BINARY_ASSERT_STRUCT(format_func, \ - text_, \ +#define KUNIT_INIT_BINARY_ASSERT_STRUCT(text_, \ left_val, \ right_val) { \ - .assert = { .format = format_func }, \ .text = text_, \ .left_value = left_val, \ .right_value = right_val \ diff --git a/include/kunit/test.h b/include/kunit/test.h index 20cc4770cb3f4..57a653f6a0087 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -473,9 +473,10 @@ void kunit_do_failed_assertion(struct kunit *test, const struct kunit_loc *loc, enum kunit_assert_type type, const struct kunit_assert *assert, + assert_format_t assert_format, const char *fmt, ...); -#define KUNIT_ASSERTION(test, assert_type, pass, assert_class, INITIALIZER, fmt, ...) do { \ +#define KUNIT_ASSERTION(test, assert_type, pass, assert_class, assert_format, INITIALIZER, fmt, ...) 
do { \ if (unlikely(!(pass))) { \ static const struct kunit_loc __loc = KUNIT_CURRENT_LOC; \ struct assert_class __assertion = INITIALIZER; \ @@ -483,6 +484,7 @@ void kunit_do_failed_assertion(struct kunit *test, &__loc, \ assert_type, \ &__assertion.assert, \ + assert_format, \ fmt, \ ##__VA_ARGS__); \ } \ @@ -494,7 +496,8 @@ void kunit_do_failed_assertion(struct kunit *test, assert_type, \ false, \ kunit_fail_assert, \ - KUNIT_INIT_FAIL_ASSERT_STRUCT, \ + kunit_fail_assert_format, \ + {}, \ fmt, \ ##__VA_ARGS__) @@ -525,6 +528,7 @@ void kunit_do_failed_assertion(struct kunit *test, assert_type, \ !!(condition) == !!expected_true, \ kunit_unary_assert, \ + kunit_unary_assert_format, \ KUNIT_INIT_UNARY_ASSERT_STRUCT(#condition, \ expected_true), \ fmt, \ @@ -582,8 +586,8 @@ do { \ assert_type, \ __left op __right, \ assert_class, \ - KUNIT_INIT_BINARY_ASSERT_STRUCT(format_func, \ - &__text, \ + format_func, \ + KUNIT_INIT_BINARY_ASSERT_STRUCT(&__text, \ __left, \ __right), \ fmt, \ @@ -640,8 +644,8 @@ do { \ assert_type, \ strcmp(__left, __right) op 0, \ kunit_binary_str_assert, \ - KUNIT_INIT_BINARY_ASSERT_STRUCT(kunit_binary_str_assert_format,\ - &__text, \ + kunit_binary_str_assert_format, \ + KUNIT_INIT_BINARY_ASSERT_STRUCT(&__text, \ __left, \ __right), \ fmt, \ @@ -660,6 +664,7 @@ do { \ assert_type, \ !IS_ERR_OR_NULL(__ptr), \ kunit_ptr_not_err_assert, \ + kunit_ptr_not_err_assert_format, \ KUNIT_INIT_PTR_NOT_ERR_STRUCT(#ptr, \ __ptr), \ fmt, \ diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 46471bda351e5..90640a43cf623 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -258,7 +258,7 @@ static void kunit_print_string_stream(struct kunit *test, static void kunit_fail(struct kunit *test, const struct kunit_loc *loc, enum kunit_assert_type type, const struct kunit_assert *assert, - const struct va_format *message) + assert_format_t assert_format, const struct va_format *message) { struct string_stream *stream; @@ -274,7 +274,7 @@ static void 
kunit_fail(struct kunit *test, const struct kunit_loc *loc, } kunit_assert_prologue(loc, type, stream); - assert->format(assert, message, stream); + assert_format(assert, message, stream); kunit_print_string_stream(test, stream); @@ -298,6 +298,7 @@ void kunit_do_failed_assertion(struct kunit *test, const struct kunit_loc *loc, enum kunit_assert_type type, const struct kunit_assert *assert, + assert_format_t assert_format, const char *fmt, ...) { va_list args; @@ -307,7 +308,7 @@ void kunit_do_failed_assertion(struct kunit *test, message.fmt = fmt; message.va = &args; - kunit_fail(test, loc, type, assert, &message); + kunit_fail(test, loc, type, assert, assert_format, &message); va_end(args); -- GitLab From 97d453bc4007d4ac148c2ba89904026612b91ec9 Mon Sep 17 00:00:00 2001 From: Daniel Latypov <dlatypov@google.com> Date: Fri, 30 Sep 2022 17:26:36 -0700 Subject: [PATCH 1574/2223] kunit: rename base KUNIT_ASSERTION macro to _KUNIT_FAILED Context: Currently this macro's name, KUNIT_ASSERTION conflicts with the name of an enum whose values are {KUNIT_EXPECTATION, KUNIT_ASSERTION}. It's hard to think of a better name for the enum, so rename this macro. It's also a bit strange that the macro might do nothing depending on the boolean argument `pass`. Why not have callers check themselves? This patch: Moves the pass/fail checking into the callers of KUNIT_ASSERTION, so now we only call it when the check has failed. Then we rename the macro to _KUNIT_FAILED() to reflect the new semantics. 
Signed-off-by: Daniel Latypov <dlatypov@google.com> Reviewed-by: David Gow <davidgow@google.com> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org> --- include/kunit/test.h | 123 +++++++++++++++++++++++-------------------- 1 file changed, 65 insertions(+), 58 deletions(-) diff --git a/include/kunit/test.h b/include/kunit/test.h index 57a653f6a0087..38a1aac72fb28 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -476,30 +476,27 @@ void kunit_do_failed_assertion(struct kunit *test, assert_format_t assert_format, const char *fmt, ...); -#define KUNIT_ASSERTION(test, assert_type, pass, assert_class, assert_format, INITIALIZER, fmt, ...) do { \ - if (unlikely(!(pass))) { \ - static const struct kunit_loc __loc = KUNIT_CURRENT_LOC; \ - struct assert_class __assertion = INITIALIZER; \ - kunit_do_failed_assertion(test, \ - &__loc, \ - assert_type, \ - &__assertion.assert, \ - assert_format, \ - fmt, \ - ##__VA_ARGS__); \ - } \ +#define _KUNIT_FAILED(test, assert_type, assert_class, assert_format, INITIALIZER, fmt, ...) do { \ + static const struct kunit_loc __loc = KUNIT_CURRENT_LOC; \ + struct assert_class __assertion = INITIALIZER; \ + kunit_do_failed_assertion(test, \ + &__loc, \ + assert_type, \ + &__assertion.assert, \ + assert_format, \ + fmt, \ + ##__VA_ARGS__); \ } while (0) #define KUNIT_FAIL_ASSERTION(test, assert_type, fmt, ...) \ - KUNIT_ASSERTION(test, \ - assert_type, \ - false, \ - kunit_fail_assert, \ - kunit_fail_assert_format, \ - {}, \ - fmt, \ - ##__VA_ARGS__) + _KUNIT_FAILED(test, \ + assert_type, \ + kunit_fail_assert, \ + kunit_fail_assert_format, \ + {}, \ + fmt, \ + ##__VA_ARGS__) /** * KUNIT_FAIL() - Always causes a test to fail when evaluated. @@ -524,15 +521,19 @@ void kunit_do_failed_assertion(struct kunit *test, expected_true, \ fmt, \ ...) 
\ - KUNIT_ASSERTION(test, \ - assert_type, \ - !!(condition) == !!expected_true, \ - kunit_unary_assert, \ - kunit_unary_assert_format, \ - KUNIT_INIT_UNARY_ASSERT_STRUCT(#condition, \ - expected_true), \ - fmt, \ - ##__VA_ARGS__) +do { \ + if (likely(!!(condition) == !!expected_true)) \ + break; \ + \ + _KUNIT_FAILED(test, \ + assert_type, \ + kunit_unary_assert, \ + kunit_unary_assert_format, \ + KUNIT_INIT_UNARY_ASSERT_STRUCT(#condition, \ + expected_true), \ + fmt, \ + ##__VA_ARGS__); \ +} while (0) #define KUNIT_TRUE_MSG_ASSERTION(test, assert_type, condition, fmt, ...) \ KUNIT_UNARY_ASSERTION(test, \ @@ -582,16 +583,18 @@ do { \ .right_text = #right, \ }; \ \ - KUNIT_ASSERTION(test, \ - assert_type, \ - __left op __right, \ - assert_class, \ - format_func, \ - KUNIT_INIT_BINARY_ASSERT_STRUCT(&__text, \ - __left, \ - __right), \ - fmt, \ - ##__VA_ARGS__); \ + if (likely(__left op __right)) \ + break; \ + \ + _KUNIT_FAILED(test, \ + assert_type, \ + assert_class, \ + format_func, \ + KUNIT_INIT_BINARY_ASSERT_STRUCT(&__text, \ + __left, \ + __right), \ + fmt, \ + ##__VA_ARGS__); \ } while (0) #define KUNIT_BINARY_INT_ASSERTION(test, \ @@ -640,16 +643,19 @@ do { \ .right_text = #right, \ }; \ \ - KUNIT_ASSERTION(test, \ - assert_type, \ - strcmp(__left, __right) op 0, \ - kunit_binary_str_assert, \ - kunit_binary_str_assert_format, \ - KUNIT_INIT_BINARY_ASSERT_STRUCT(&__text, \ - __left, \ - __right), \ - fmt, \ - ##__VA_ARGS__); \ + if (likely(strcmp(__left, __right) op 0)) \ + break; \ + \ + \ + _KUNIT_FAILED(test, \ + assert_type, \ + kunit_binary_str_assert, \ + kunit_binary_str_assert_format, \ + KUNIT_INIT_BINARY_ASSERT_STRUCT(&__text, \ + __left, \ + __right), \ + fmt, \ + ##__VA_ARGS__); \ } while (0) #define KUNIT_PTR_NOT_ERR_OR_NULL_MSG_ASSERTION(test, \ @@ -660,15 +666,16 @@ do { \ do { \ const typeof(ptr) __ptr = (ptr); \ \ - KUNIT_ASSERTION(test, \ - assert_type, \ - !IS_ERR_OR_NULL(__ptr), \ - kunit_ptr_not_err_assert, \ - 
kunit_ptr_not_err_assert_format, \ - KUNIT_INIT_PTR_NOT_ERR_STRUCT(#ptr, \ - __ptr), \ - fmt, \ - ##__VA_ARGS__); \ + if (!IS_ERR_OR_NULL(__ptr)) \ + break; \ + \ + _KUNIT_FAILED(test, \ + assert_type, \ + kunit_ptr_not_err_assert, \ + kunit_ptr_not_err_assert_format, \ + KUNIT_INIT_PTR_NOT_ERR_STRUCT(#ptr, __ptr), \ + fmt, \ + ##__VA_ARGS__); \ } while (0) /** -- GitLab From c1144e01063e67f807517a393b91fae054929dc8 Mon Sep 17 00:00:00 2001 From: Daniel Latypov <dlatypov@google.com> Date: Fri, 30 Sep 2022 17:26:38 -0700 Subject: [PATCH 1575/2223] kunit: declare kunit_assert structs as const Everywhere we use the assert structs now takes them via const*, as of commit 7466886b400b ("kunit: take `kunit_assert` as `const`"). So now let's properly declare the structs as const as well. Signed-off-by: Daniel Latypov <dlatypov@google.com> Reviewed-by: David Gow <davidgow@google.com> Reviewed-by: Miguel Ojeda <ojeda@kernel.org> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org> --- include/kunit/test.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/kunit/test.h b/include/kunit/test.h index 38a1aac72fb28..b1ab6b32216d7 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -478,7 +478,7 @@ void kunit_do_failed_assertion(struct kunit *test, #define _KUNIT_FAILED(test, assert_type, assert_class, assert_format, INITIALIZER, fmt, ...) 
do { \ static const struct kunit_loc __loc = KUNIT_CURRENT_LOC; \ - struct assert_class __assertion = INITIALIZER; \ + const struct assert_class __assertion = INITIALIZER; \ kunit_do_failed_assertion(test, \ &__loc, \ assert_type, \ -- GitLab From e98c4f6afc5e21507737066433699f225a180db7 Mon Sep 17 00:00:00 2001 From: David Gow <davidgow@google.com> Date: Sat, 1 Oct 2022 14:46:43 +0800 Subject: [PATCH 1576/2223] Documentation: kunit: Update description of --alltests option kunit_tool's --alltests option was changed in commit 980ac3ad0512 ("kunit: tool: rename all_test_uml.config, use it for --alltests") to use a manually curated list of architecture-independent Kconfig options, rather than attempting to use make allyesconfig on UML, which was broken. Update the kunit_tool documentation to reflect the new behaviour of --alltests. Signed-off-by: David Gow <davidgow@google.com> Reviewed-by: Daniel Latypov <dlatypov@google.com> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org> --- Documentation/dev-tools/kunit/run_wrapper.rst | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/Documentation/dev-tools/kunit/run_wrapper.rst b/Documentation/dev-tools/kunit/run_wrapper.rst index 6b33caf6c8ab8..dafe8eb28d301 100644 --- a/Documentation/dev-tools/kunit/run_wrapper.rst +++ b/Documentation/dev-tools/kunit/run_wrapper.rst @@ -251,14 +251,15 @@ command line arguments: compiling a kernel (using ``build`` or ``run`` commands). For example: to enable compiler warnings, we can pass ``--make_options W=1``. -- ``--alltests``: Builds a UML kernel with all config options enabled - using ``make allyesconfig``. This allows us to run as many tests as - possible. - - .. note:: It is slow and prone to breakage as new options are - added or modified. Instead, enable all tests - which have satisfied dependencies by adding - ``CONFIG_KUNIT_ALL_TESTS=y`` to your ``.kunitconfig``. 
+- ``--alltests``: Enable a predefined set of options in order to build + as many tests as possible. + + .. note:: The list of enabled options can be found in + ``tools/testing/kunit/configs/all_tests.config``. + + If you only want to enable all tests with otherwise satisfied + dependencies, instead add ``CONFIG_KUNIT_ALL_TESTS=y`` to your + ``.kunitconfig``. - ``--kunitconfig``: Specifies the path or the directory of the ``.kunitconfig`` file. For example: -- GitLab From cadf306460c8f1281fc8bdd270514944ed75d3d0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap <rdunlap@infradead.org> Date: Mon, 3 Oct 2022 10:58:23 -0700 Subject: [PATCH 1577/2223] selftests/ftrace: func_event_triggers: fix typo in user message Correct typo of "it's" to "it". Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Shuah Khan <shuah@kernel.org> Cc: linux-kselftest@vger.kernel.org Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org> --- .../selftests/ftrace/test.d/ftrace/func_event_triggers.tc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc index 3145b0f1835c3..8d26d5505808b 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc @@ -85,7 +85,7 @@ run_enable_disable() { echo $check_disable > $EVENT_ENABLE done sleep $SLEEP_TIME - echo " make sure it's still works" + echo " make sure it still works" test_event_enabled $check_enable_star reset_ftrace_filter -- GitLab From 13023c33c962730a38d6b43995910c8805637a9a Mon Sep 17 00:00:00 2001 From: Zhao Gongyi <zhaogongyi@huawei.com> Date: Fri, 30 Sep 2022 14:35:24 +0800 Subject: [PATCH 1578/2223] selftests/memory-hotplug: Add checking after online or offline Add 
checking for online_memory_expect_success()/ offline_memory_expect_success()/offline_memory_expect_fail(), or the test would exit 0 although the functions return 1. Signed-off-by: Zhao Gongyi <zhaogongyi@huawei.com> Reviewed-by: David Hildenbrand <david@redhat.com> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org> --- .../selftests/memory-hotplug/mem-on-off-test.sh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh b/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh index 46a97f318f58e..1d87611a7d52b 100755 --- a/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh +++ b/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh @@ -266,7 +266,9 @@ done # echo $error > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_ONLINE/error for memory in `hotpluggable_offline_memory`; do - online_memory_expect_fail $memory + if ! online_memory_expect_fail $memory; then + retval=1 + fi done # @@ -274,7 +276,9 @@ done # echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_ONLINE/error for memory in `hotpluggable_offline_memory`; do - online_memory_expect_success $memory + if ! online_memory_expect_success $memory; then + retval=1 + fi done # @@ -283,7 +287,9 @@ done echo $error > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error for memory in `hotpluggable_online_memory`; do if [ $((RANDOM % 100)) -lt $ratio ]; then - offline_memory_expect_fail $memory + if ! offline_memory_expect_fail $memory; then + retval=1 + fi fi done -- GitLab From 3e77a49aa78a65c7cfc4a2662366442ea1498fbb Mon Sep 17 00:00:00 2001 From: Zhao Gongyi <zhaogongyi@huawei.com> Date: Fri, 30 Sep 2022 14:35:25 +0800 Subject: [PATCH 1579/2223] selftests/memory-hotplug: Restore memory before exit Some memory will be left in offline state when calling offline_memory_expect_fail() failed. Restore it before exit. 
Signed-off-by: Zhao Gongyi <zhaogongyi@huawei.com> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org> --- .../memory-hotplug/mem-on-off-test.sh | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh b/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh index 1d87611a7d52b..91a7457616bb5 100755 --- a/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh +++ b/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh @@ -134,6 +134,16 @@ offline_memory_expect_fail() return 0 } +online_all_offline_memory() +{ + for memory in `hotpluggable_offline_memory`; do + if ! online_memory_expect_success $memory; then + echo "$FUNCNAME $memory: unexpected fail" >&2 + retval=1 + fi + done +} + error=-12 priority=0 # Run with default of ratio=2 for Kselftest run @@ -275,11 +285,7 @@ done # Online all hot-pluggable memory # echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_ONLINE/error -for memory in `hotpluggable_offline_memory`; do - if ! online_memory_expect_success $memory; then - retval=1 - fi -done +online_all_offline_memory # # Test memory hot-remove error handling (online => offline) @@ -296,4 +302,9 @@ done echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error /sbin/modprobe -q -r memory-notifier-error-inject +# +# Restore memory before exit +# +online_all_offline_memory + exit $retval -- GitLab From 95e5a911f9746710daf4753ba494426f802c2299 Mon Sep 17 00:00:00 2001 From: Zhao Gongyi <zhaogongyi@huawei.com> Date: Fri, 30 Sep 2022 14:35:26 +0800 Subject: [PATCH 1580/2223] selftests/memory-hotplug: Adjust log info for maintainability Redirect misleading error message to /dev/null for offline_memory_expect_success(), And, add an output for online->offline test. 
Signed-off-by: Zhao Gongyi <zhaogongyi@huawei.com> Acked-by: David Hildenbrand <david@redhat.com> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org> --- tools/testing/selftests/memory-hotplug/mem-on-off-test.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh b/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh index 91a7457616bb5..74ee5067a8ce2 100755 --- a/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh +++ b/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh @@ -207,8 +207,11 @@ echo -e "\t trying to offline $target out of $hotpluggable_num memory block(s):" for memory in `hotpluggable_online_memory`; do if [ "$target" -gt 0 ]; then echo "online->offline memory$memory" - if offline_memory_expect_success $memory; then + if offline_memory_expect_success $memory &>/dev/null; then target=$(($target - 1)) + echo "-> Success" + else + echo "-> Failure" fi fi done @@ -267,7 +270,7 @@ prerequisite_extra echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error for memory in `hotpluggable_online_memory`; do if [ $((RANDOM % 100)) -lt $ratio ]; then - offline_memory_expect_success $memory + offline_memory_expect_success $memory &>/dev/null fi done -- GitLab From 6a24247132db8122600dc5523e3a62fa8fd28367 Mon Sep 17 00:00:00 2001 From: Zhao Gongyi <zhaogongyi@huawei.com> Date: Fri, 30 Sep 2022 14:35:27 +0800 Subject: [PATCH 1581/2223] docs: notifier-error-inject: Correct test's name Correct test's name for mem-on-off-test.sh/cpu-on-off-test.sh. 
Signed-off-by: Zhao Gongyi <zhaogongyi@huawei.com> Reviewed-by: David Hildenbrand <david@redhat.com> Signed-off-by: Shuah Khan <skhan@linuxfoundation.org> --- Documentation/fault-injection/notifier-error-inject.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/fault-injection/notifier-error-inject.rst b/Documentation/fault-injection/notifier-error-inject.rst index 1668b6e48d3a2..fdf2dc433eadf 100644 --- a/Documentation/fault-injection/notifier-error-inject.rst +++ b/Documentation/fault-injection/notifier-error-inject.rst @@ -91,8 +91,8 @@ For more usage examples There are tools/testing/selftests using the notifier error injection features for CPU and memory notifiers. - * tools/testing/selftests/cpu-hotplug/on-off-test.sh - * tools/testing/selftests/memory-hotplug/on-off-test.sh + * tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh + * tools/testing/selftests/memory-hotplug/mem-on-off-test.sh These scripts first do simple online and offline tests and then do fault injection tests if notifier error injection module is available. -- GitLab From 97c96e9fa36616d7890a6f3438172fc501927f01 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken <dylany@fb.com> Date: Mon, 26 Sep 2022 10:09:26 -0700 Subject: [PATCH 1582/2223] io_uring: simplify __io_uring_add_tctx_node Remove submitter parameter from __io_uring_add_tctx_node. It was only called from one place, and we can do that logic in that one place. 
Signed-off-by: Dylan Yudaken <dylany@fb.com> Fixes: 97bbdc06a444 ("io_uring: add IORING_SETUP_SINGLE_ISSUER") Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/io_uring.c | 2 +- io_uring/tctx.c | 30 ++++++++++++++++++++---------- io_uring/tctx.h | 6 ++++-- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 99a52f34b7d30..fe6ef64c873e0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3355,7 +3355,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) if (fd < 0) return fd; - ret = __io_uring_add_tctx_node(ctx, false); + ret = __io_uring_add_tctx_node(ctx); if (ret) { put_unused_fd(fd); return ret; diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 7f97d97fef0a9..dd0205fcdb13b 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -105,18 +105,12 @@ static int io_register_submitter(struct io_ring_ctx *ctx) return ret; } -int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter) +int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; int ret; - if ((ctx->flags & IORING_SETUP_SINGLE_ISSUER) && submitter) { - ret = io_register_submitter(ctx); - if (ret) - return ret; - } - if (unlikely(!tctx)) { ret = io_uring_alloc_task_context(current, ctx); if (unlikely(ret)) @@ -150,8 +144,24 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter) list_add(&node->ctx_node, &ctx->tctx_list); mutex_unlock(&ctx->uring_lock); } - if (submitter) - tctx->last = ctx; + return 0; +} + +int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx) +{ + int ret; + + if (ctx->flags & IORING_SETUP_SINGLE_ISSUER) { + ret = io_register_submitter(ctx); + if (ret) + return ret; + } + + ret = __io_uring_add_tctx_node(ctx); + if (ret) + return ret; + + current->io_uring->last = ctx; return 0; } @@ -259,7 +269,7 @@ int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, return 
-EINVAL; mutex_unlock(&ctx->uring_lock); - ret = __io_uring_add_tctx_node(ctx, false); + ret = __io_uring_add_tctx_node(ctx); mutex_lock(&ctx->uring_lock); if (ret) return ret; diff --git a/io_uring/tctx.h b/io_uring/tctx.h index 25974beed4d6b..608e96de70a2c 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -9,7 +9,8 @@ struct io_tctx_node { int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); void io_uring_del_tctx_node(unsigned long index); -int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter); +int __io_uring_add_tctx_node(struct io_ring_ctx *ctx); +int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx); void io_uring_clean_tctx(struct io_uring_task *tctx); void io_uring_unreg_ringfd(void); @@ -27,5 +28,6 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) if (likely(tctx && tctx->last == ctx)) return 0; - return __io_uring_add_tctx_node(ctx, true); + + return __io_uring_add_tctx_node_from_submit(ctx); } -- GitLab From 4add705e4eebbdd919741de0548d7029c8c92b68 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken <dylany@fb.com> Date: Mon, 26 Sep 2022 10:09:27 -0700 Subject: [PATCH 1583/2223] io_uring: remove io_register_submitter this is no longer needed, as submitter_task is set at creation time. 
Signed-off-by: Dylan Yudaken <dylany@fb.com> Fixes: 97bbdc06a444 ("io_uring: add IORING_SETUP_SINGLE_ISSUER") Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/tctx.c | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/io_uring/tctx.c b/io_uring/tctx.c index dd0205fcdb13b..4324b1cf1f6af 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -91,20 +91,6 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, return 0; } -static int io_register_submitter(struct io_ring_ctx *ctx) -{ - int ret = 0; - - mutex_lock(&ctx->uring_lock); - if (!ctx->submitter_task) - ctx->submitter_task = get_task_struct(current); - else if (ctx->submitter_task != current) - ret = -EEXIST; - mutex_unlock(&ctx->uring_lock); - - return ret; -} - int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; @@ -151,11 +137,9 @@ int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx) { int ret; - if (ctx->flags & IORING_SETUP_SINGLE_ISSUER) { - ret = io_register_submitter(ctx); - if (ret) - return ret; - } + if (ctx->flags & IORING_SETUP_SINGLE_ISSUER + && ctx->submitter_task != current) + return -EEXIST; ret = __io_uring_add_tctx_node(ctx); if (ret) -- GitLab From d7cce96c449e35bbfd41e830b341b95973891eed Mon Sep 17 00:00:00 2001 From: Pavel Begunkov <asml.silence@gmail.com> Date: Tue, 27 Sep 2022 01:13:30 +0100 Subject: [PATCH 1584/2223] io_uring: limit registration w/ SINGLE_ISSUER IORING_SETUP_SINGLE_ISSUER restricts what tasks can submit requests. Extend it to registration as well, so non-owning task can't do registrations. It's not necessary at the moment but might be useful in the future. 
Cc: <stable@vger.kernel.org> # 6.0 Fixes: 97bbdc06a444 ("io_uring: add IORING_SETUP_SINGLE_ISSUER") Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/f52a6a9c8a8990d4a831f73c0571e7406aac2bba.1664237592.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/io_uring.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fe6ef64c873e0..63f6ce5e53551 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3890,6 +3890,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs))) return -ENXIO; + if (ctx->submitter_task && ctx->submitter_task != current) + return -EEXIST; + if (ctx->restricted) { if (opcode >= IORING_REGISTER_LAST) return -EINVAL; -- GitLab From b1b8132a651cf6a5b18a01d8f1bd304f5d210315 Mon Sep 17 00:00:00 2001 From: Alex Williamson <alex.williamson@redhat.com> Date: Fri, 7 Oct 2022 12:03:00 -0600 Subject: [PATCH 1585/2223] vfio: More vfio_file_is_group() use cases Replace further open coded tests with helper. 
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/166516896843.1215571.5378890510536477434.stgit@omen Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- drivers/vfio/vfio_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 04099a839a52a..2d168793d4e1c 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1615,7 +1615,7 @@ bool vfio_file_enforced_coherent(struct file *file) struct vfio_group *group = file->private_data; bool ret; - if (file->f_op != &vfio_group_fops) + if (!vfio_file_is_group(file)) return true; mutex_lock(&group->group_lock); @@ -1647,7 +1647,7 @@ void vfio_file_set_kvm(struct file *file, struct kvm *kvm) { struct vfio_group *group = file->private_data; - if (file->f_op != &vfio_group_fops) + if (!vfio_file_is_group(file)) return; mutex_lock(&group->group_lock); @@ -1667,7 +1667,7 @@ bool vfio_file_has_dev(struct file *file, struct vfio_device *device) { struct vfio_group *group = file->private_data; - if (file->f_op != &vfio_group_fops) + if (!vfio_file_is_group(file)) return false; return group == device->group; -- GitLab From 8ec071c363da3f45585b338b2037de289379939c Mon Sep 17 00:00:00 2001 From: Chao Yu <chao.yu@oppo.com> Date: Tue, 4 Oct 2022 09:11:33 +0800 Subject: [PATCH 1586/2223] f2fs: account swapfile inodes In order to check count of opened swapfile inodes. 
Signed-off-by: Chao Yu <chao.yu@oppo.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/data.c | 2 ++ fs/f2fs/debug.c | 4 ++++ fs/f2fs/f2fs.h | 9 ++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1c82a4a4e8616..5f895ddcd64ab 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3989,6 +3989,7 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret < 0) return ret; + stat_inc_swapfile_inode(inode); set_inode_flag(inode, FI_PIN_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; @@ -3998,6 +3999,7 @@ static void f2fs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); + stat_dec_swapfile_inode(inode); clear_inode_flag(inode, FI_PIN_FILE); } #else diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 29cf5b6b23414..7a9dd23191551 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -135,6 +135,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); si->compr_inode = atomic_read(&sbi->compr_inode); + si->swapfile_inode = atomic_read(&sbi->swapfile_inode); si->compr_blocks = atomic64_read(&sbi->compr_blocks); si->append = sbi->im[APPEND_INO].ino_num; si->update = sbi->im[UPDATE_INO].ino_num; @@ -385,6 +386,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_dir); seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n", si->compr_inode, si->compr_blocks); + seq_printf(s, " - Swapfile Inode: %u\n", + si->swapfile_inode); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", @@ -607,6 +610,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->compr_inode, 0); atomic64_set(&sbi->compr_blocks, 0); + atomic_set(&sbi->swapfile_inode, 0); 
atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1ebc08be958eb..134581defe4a8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1770,6 +1770,7 @@ struct f2fs_sb_info { atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ atomic64_t compr_blocks; /* # of compressed blocks */ + atomic_t swapfile_inode; /* # of swapfile inodes */ atomic_t max_aw_cnt; /* max # of atomic writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ @@ -3876,7 +3877,7 @@ struct f2fs_stat_info { int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; unsigned int cur_ckpt_time, peak_ckpt_time; int inline_xattr, inline_inode, inline_dir, append, update, orphans; - int compr_inode; + int compr_inode, swapfile_inode; unsigned long long compr_blocks; int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; @@ -3965,6 +3966,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_sub_compr_blocks(inode, blocks) \ (atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) +#define stat_inc_swapfile_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_dec_swapfile_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->swapfile_inode)) #define stat_inc_meta_count(sbi, blkaddr) \ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ @@ -4049,6 +4054,8 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_dec_compr_inode(inode) do { } while (0) #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) +#define stat_inc_swapfile_inode(inode) do { } while (0) +#define stat_dec_swapfile_inode(inode) do { } while (0) #define 
stat_update_max_atomic_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) -- GitLab From b4dac1203f39821c6119033cdeebcea83cf45786 Mon Sep 17 00:00:00 2001 From: Chao Yu <chao@kernel.org> Date: Tue, 4 Oct 2022 09:41:02 +0800 Subject: [PATCH 1587/2223] f2fs: change to use atomic_t type form sbi.atomic_files inode_lock[ATOMIC_FILE] was used for protecting sbi->atomic_files, update atomic_files variable's type to atomic_t instead of unsigned int, then inode_lock[ATOMIC_FILE] can be obsoleted. Signed-off-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/debug.c | 3 ++- fs/f2fs/f2fs.h | 11 ++++++++--- fs/f2fs/file.c | 4 +--- fs/f2fs/segment.c | 6 +----- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 7a9dd23191551..a216dcdf69418 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -91,7 +91,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; - si->aw_cnt = sbi->atomic_files; + si->aw_cnt = atomic_read(&sbi->atomic_files); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE); @@ -611,6 +611,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->compr_inode, 0); atomic64_set(&sbi->compr_blocks, 0); atomic_set(&sbi->swapfile_inode, 0); + atomic_set(&sbi->atomic_files, 0); atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 134581defe4a8..e7e750e6b3321 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1257,7 +1257,6 @@ enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ 
DIRTY_META, /* for all dirtied inode metadata */ - ATOMIC_FILE, /* for all atomic files */ NR_INODE_TYPE, }; @@ -1739,7 +1738,6 @@ struct f2fs_sb_info { unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ /* for skip statistic */ - unsigned int atomic_files; /* # of opened atomic file */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ @@ -1771,6 +1769,7 @@ struct f2fs_sb_info { atomic_t compr_inode; /* # of compressed inodes */ atomic64_t compr_blocks; /* # of compressed blocks */ atomic_t swapfile_inode; /* # of swapfile inodes */ + atomic_t atomic_files; /* # of opened atomic file */ atomic_t max_aw_cnt; /* max # of atomic writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ @@ -3970,6 +3969,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic_inc(&F2FS_I_SB(inode)->swapfile_inode)) #define stat_dec_swapfile_inode(inode) \ (atomic_dec(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_inc_atomic_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->atomic_files)) +#define stat_dec_atomic_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->atomic_files)) #define stat_inc_meta_count(sbi, blkaddr) \ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ @@ -3989,7 +3992,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic_inc(&(sbi)->inplace_count)) #define stat_update_max_atomic_write(inode) \ do { \ - int cur = F2FS_I_SB(inode)->atomic_files; \ + int cur = atomic_read(&F2FS_I_SB(inode)->atomic_files); \ int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ @@ -4056,6 +4059,8 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_sub_compr_blocks(inode, blocks) do { } while (0) #define stat_inc_swapfile_inode(inode) do { } while (0) #define 
stat_dec_swapfile_inode(inode) do { } while (0) +#define stat_inc_atomic_inode(inode) do { } while (0) +#define stat_dec_atomic_inode(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7b3ed4a9bb46e..ec9ee0f6d502d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2051,9 +2051,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) } f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - sbi->atomic_files++; - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + stat_inc_atomic_inode(inode); set_inode_flag(inode, FI_ATOMIC_FILE); set_inode_flag(fi->cow_inode, FI_COW_FILE); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d7b13127b0b8a..289bcb7ca3009 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -187,7 +187,6 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) void f2fs_abort_atomic_write(struct inode *inode, bool clean) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); if (!f2fs_is_atomic_file(inode)) @@ -200,10 +199,7 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) fi->cow_inode = NULL; release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_FILE); - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - sbi->atomic_files--; - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + stat_dec_atomic_inode(inode); } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, -- GitLab From 14aa8b2d5c2ebead01b542f62d68029023054774 Mon Sep 17 00:00:00 2001 From: Yu Zhao <yuzhao@google.com> Date: Wed, 28 Sep 2022 13:36:58 -0600 Subject: [PATCH 1588/2223] mm/mglru: don't sync disk for each aging cycle wakeup_flusher_threads() was added under the assumption that if a system runs out of clean cold pages, it might want to write back dirty pages more 
aggressively so that they can become clean and be dropped. However, doing so can breach the rate limit a system wants to impose on writeback, resulting in early SSD wearout. Link: https://lkml.kernel.org/r/YzSiWq9UEER5LKup@google.com Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks") Signed-off-by: Yu Zhao <yuzhao@google.com> Reported-by: Axel Rasmussen <axelrasmussen@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/vmscan.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c5a4bff11da69..3240d5dd7784a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4413,8 +4413,6 @@ done: if (wq_has_sleeper(&lruvec->mm_state.wait)) wake_up_all(&lruvec->mm_state.wait); - wakeup_flusher_threads(WB_REASON_VMSCAN); - return true; } -- GitLab From e4fea72b143848d8bbbeae6d39a890212bcf848e Mon Sep 17 00:00:00 2001 From: Yu Zhao <yuzhao@google.com> Date: Wed, 28 Sep 2022 12:46:20 -0600 Subject: [PATCH 1589/2223] mglru: mm/vmscan.c: fix imprecise comments Link: https://lkml.kernel.org/r/YzSWfFI+MOeb1ils@google.com Signed-off-by: Yu Zhao <yuzhao@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/vmscan.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3240d5dd7784a..04d8b88e52164 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5076,7 +5076,7 @@ static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, DEFINE_MAX_SEQ(lruvec); if (!current_is_kswapd()) { - /* age each memcg once to ensure fairness */ + /* age each memcg at most once to ensure fairness */ if (max_seq - seq > 1) return true; @@ -5101,10 +5101,9 @@ static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, /* * A minimum amount of work was done under global memory pressure. For - * kswapd, it may be overshooting. For direct reclaim, the target isn't - * met, and yet the allocation may still succeed, since kswapd may have - * caught up. 
In either case, it's better to stop now, and restart if - * necessary. + * kswapd, it may be overshooting. For direct reclaim, the allocation + * may succeed if all suitable zones are somewhat safe. In either case, + * it's better to stop now, and restart later if necessary. */ for (i = 0; i <= sc->reclaim_idx; i++) { unsigned long wmark; -- GitLab From 131a79b474e973f023c5c75e2323a940332103be Mon Sep 17 00:00:00 2001 From: Mike Kravetz <mike.kravetz@oracle.com> Date: Tue, 4 Oct 2022 18:17:05 -0700 Subject: [PATCH 1590/2223] hugetlb: fix vma lock handling during split vma and range unmapping Patch series "hugetlb: fixes for new vma lock series". In review of the series "hugetlb: Use new vma lock for huge pmd sharing synchronization", Miaohe Lin pointed out two key issues: 1) There is a race in the routine hugetlb_unmap_file_folio when locks are dropped and reacquired in the correct order [1]. 2) With the switch to using vma lock for fault/truncate synchronization, we need to make sure lock exists for all VM_MAYSHARE vmas, not just vmas capable of pmd sharing. These two issues are addressed here. In addition, having a vma lock present in all VM_MAYSHARE vmas, uncovered some issues around vma splitting. Those are also addressed. [1] https://lore.kernel.org/linux-mm/01f10195-7088-4462-6def-909549c75ef4@huawei.com/ This patch (of 3): The hugetlb vma lock hangs off the vm_private_data field and is specific to the vma. When vm_area_dup() is called as part of vma splitting, the vma lock pointer is copied to the new vma. This will result in issues such as double freeing of the structure. Update the hugetlb open vm_ops to allocate a new vma lock for the new vma. The routine __unmap_hugepage_range_final unconditionally unset VM_MAYSHARE to prevent subsequent pmd sharing. hugetlb_vma_lock_free attempted to anticipate this by checking both VM_MAYSHARE and VM_SHARED. However, if only VM_MAYSHARE was set we would miss the free. 
With the introduction of the vma lock, a vma can not participate in pmd sharing if vm_private_data is NULL. Instead of clearing VM_MAYSHARE in __unmap_hugepage_range_final, free the vma lock to prevent sharing. Also, update the sharing code to make sure vma lock is indeed a condition for pmd sharing. hugetlb_vma_lock_free can then key off VM_MAYSHARE and not miss any vmas. Link: https://lkml.kernel.org/r/20221005011707.514612-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20221005011707.514612-2-mike.kravetz@oracle.com Fixes: "hugetlb: add vma based lock for pmd sharing" Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: David Hildenbrand <david@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: James Houghton <jthoughton@google.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Mina Almasry <almasrymina@google.com> Cc: Muchun Song <songmuchun@bytedance.com> Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev> Cc: Pasha Tatashin <pasha.tatashin@soleen.com> Cc: Peter Xu <peterx@redhat.com> Cc: Prakash Sangappa <prakash.sangappa@oracle.com> Cc: Sven Schnelle <svens@linux.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/hugetlb.c | 43 +++++++++++++++++++++++++++---------------- mm/memory.c | 4 ---- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8de5a6b5a172a..45e305d182f6b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4612,7 +4612,14 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) kref_get(&resv->refs); } - hugetlb_vma_lock_alloc(vma); + /* + * vma_lock structure for sharable mappings is vma specific. + * Clear old pointer (if copied via vm_area_dup) and create new. 
+ */ + if (vma->vm_flags & VM_MAYSHARE) { + vma->vm_private_data = NULL; + hugetlb_vma_lock_alloc(vma); + } } static void hugetlb_vm_op_close(struct vm_area_struct *vma) @@ -5168,19 +5175,23 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { + hugetlb_vma_lock_write(vma); + i_mmap_lock_write(vma->vm_file->f_mapping); + __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags); /* - * Clear this flag so that x86's huge_pmd_share page_table_shareable - * test will fail on a vma being torn down, and not grab a page table - * on its way out. We're lucky that the flag has such an appropriate - * name, and can in fact be safely cleared here. We could clear it - * before the __unmap_hugepage_range above, but all that's necessary - * is to clear it before releasing the i_mmap_rwsem. This works - * because in the context this is called, the VMA is about to be - * destroyed and the i_mmap_rwsem is held. + * Unlock and free the vma lock before releasing i_mmap_rwsem. When + * the vma_lock is freed, this makes the vma ineligible for pmd + * sharing. And, i_mmap_rwsem is required to set up pmd sharing. + * This is important as page tables for this unmapped range will + * be asynchrously deleted. If the page tables are shared, there + * will be issues when accessed by someone else. */ - vma->vm_flags &= ~VM_MAYSHARE; + hugetlb_vma_unlock_write(vma); + hugetlb_vma_lock_free(vma); + + i_mmap_unlock_write(vma->vm_file->f_mapping); } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, @@ -6664,10 +6675,13 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, /* * match the virtual addresses, permission and the alignment of the * page table page. + * + * Also, vma_lock (vm_private_data) is required for sharing. 
*/ if (pmd_index(addr) != pmd_index(saddr) || vm_flags != svm_flags || - !range_in_vma(svma, sbase, s_end)) + !range_in_vma(svma, sbase, s_end) || + !svma->vm_private_data) return 0; return saddr; @@ -6817,12 +6831,9 @@ void hugetlb_vma_lock_release(struct kref *kref) static void hugetlb_vma_lock_free(struct vm_area_struct *vma) { /* - * Only present in sharable vmas. See comment in - * __unmap_hugepage_range_final about how VM_SHARED could - * be set without VM_MAYSHARE. As a result, we need to - * check if either is set in the free path. + * Only present in sharable vmas. */ - if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED))) + if (!vma || !__vma_shareable_flags_pmd(vma)) return; if (vma->vm_private_data) { diff --git a/mm/memory.c b/mm/memory.c index 118e5f023597c..df678fa30cdb9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1685,12 +1685,8 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) { zap_flags_t zap_flags = details ? details->zap_flags : 0; - hugetlb_vma_lock_write(vma); - i_mmap_lock_write(vma->vm_file->f_mapping); __unmap_hugepage_range_final(tlb, vma, start, end, NULL, zap_flags); - i_mmap_unlock_write(vma->vm_file->f_mapping); - hugetlb_vma_unlock_write(vma); } } else unmap_page_range(tlb, vma, start, end, details); -- GitLab From ecfbd733878da48ed03a5b8a9c301366a03e3cca Mon Sep 17 00:00:00 2001 From: Mike Kravetz <mike.kravetz@oracle.com> Date: Tue, 4 Oct 2022 18:17:06 -0700 Subject: [PATCH 1591/2223] hugetlb: take hugetlb vma_lock when clearing vma_lock->vma pointer hugetlb file truncation/hole punch code may need to back out and take locks in order in the routine hugetlb_unmap_file_folio(). This code could race with vma freeing as pointed out in [1] and result in accessing a stale vma pointer. To address this, take the vma_lock when clearing the vma_lock->vma pointer. 
[1] https://lore.kernel.org/linux-mm/01f10195-7088-4462-6def-909549c75ef4@huawei.com/ [mike.kravetz@oracle.com: address build issues] Link: https://lkml.kernel.org/r/Yz5L1uxQYR1VqFtJ@monkey Link: https://lkml.kernel.org/r/20221005011707.514612-3-mike.kravetz@oracle.com Fixes: "hugetlb: use new vma_lock for pmd sharing synchronization" Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: David Hildenbrand <david@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: James Houghton <jthoughton@google.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Mina Almasry <almasrymina@google.com> Cc: Muchun Song <songmuchun@bytedance.com> Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev> Cc: Pasha Tatashin <pasha.tatashin@soleen.com> Cc: Peter Xu <peterx@redhat.com> Cc: Prakash Sangappa <prakash.sangappa@oracle.com> Cc: Sven Schnelle <svens@linux.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/hugetlb.c | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 45e305d182f6b..01f3e36caa6c7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -93,6 +93,7 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); +static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static inline bool subpool_is_free(struct hugepage_subpool *spool) { @@ -5188,8 +5189,7 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, * be asynchrously deleted. 
If the page tables are shared, there * will be issues when accessed by someone else. */ - hugetlb_vma_unlock_write(vma); - hugetlb_vma_lock_free(vma); + __hugetlb_vma_unlock_write_free(vma); i_mmap_unlock_write(vma->vm_file->f_mapping); } @@ -6828,6 +6828,30 @@ void hugetlb_vma_lock_release(struct kref *kref) kfree(vma_lock); } +void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) +{ + struct vm_area_struct *vma = vma_lock->vma; + + /* + * vma_lock structure may or not be released as a result of put, + * it certainly will no longer be attached to vma so clear pointer. + * Semaphore synchronizes access to vma_lock->vma field. + */ + vma_lock->vma = NULL; + vma->vm_private_data = NULL; + up_write(&vma_lock->rw_sema); + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); +} + +static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + __hugetlb_vma_unlock_write_put(vma_lock); + } +} + static void hugetlb_vma_lock_free(struct vm_area_struct *vma) { /* @@ -6839,14 +6863,8 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma) if (vma->vm_private_data) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; - /* - * vma_lock structure may or not be released, but it - * certainly will no longer be attached to vma so clear - * pointer. 
- */ - vma_lock->vma = NULL; - kref_put(&vma_lock->refs, hugetlb_vma_lock_release); - vma->vm_private_data = NULL; + down_write(&vma_lock->rw_sema); + __hugetlb_vma_unlock_write_put(vma_lock); } } @@ -6997,6 +7015,10 @@ void hugetlb_vma_lock_release(struct kref *kref) { } +static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) +{ +} + static void hugetlb_vma_lock_free(struct vm_area_struct *vma) { } -- GitLab From bbff39cc6cbcb86ccfacb2dcafc79912a9f9df69 Mon Sep 17 00:00:00 2001 From: Mike Kravetz <mike.kravetz@oracle.com> Date: Tue, 4 Oct 2022 18:17:07 -0700 Subject: [PATCH 1592/2223] hugetlb: allocate vma lock for all sharable vmas The hugetlb vma lock was originally designed to synchronize pmd sharing. As such, it was only necessary to allocate the lock for vmas that were capable of pmd sharing. Later in the development cycle, it was discovered that it could also be used to simplify fault/truncation races as described in [1]. However, a subsequent change to allocate the lock for all vmas that use the page cache was never made. A fault/truncation race could leave pages in a file past i_size until the file is removed. Remove the previous restriction and allocate lock for all VM_MAYSHARE vmas. Warn in the unlikely event of allocation failure. [1] https://lore.kernel.org/lkml/Yxiv0SkMkZ0JWGGp@monkey/#t Link: https://lkml.kernel.org/r/20221005011707.514612-4-mike.kravetz@oracle.com Fixes: "hugetlb: clean up code checking for fault/truncation races" Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: David Hildenbrand <david@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: James Houghton <jthoughton@google.com> Cc: "Kirill A. 
Shutemov" <kirill.shutemov@linux.intel.com> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Mina Almasry <almasrymina@google.com> Cc: Muchun Song <songmuchun@bytedance.com> Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev> Cc: Pasha Tatashin <pasha.tatashin@soleen.com> Cc: Peter Xu <peterx@redhat.com> Cc: Prakash Sangappa <prakash.sangappa@oracle.com> Cc: Sven Schnelle <svens@linux.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/hugetlb.c | 50 +++++++++++++++----------------------------------- 1 file changed, 15 insertions(+), 35 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 01f3e36caa6c7..0ad53ad98e742 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6687,10 +6687,11 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, return saddr; } -static bool __vma_aligned_range_pmd_shareable(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - bool check_vma_lock) +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { + unsigned long start = addr & PUD_MASK; + unsigned long end = start + PUD_SIZE; + #ifdef CONFIG_USERFAULTFD if (uffd_disable_huge_pmd_share(vma)) return false; @@ -6700,38 +6701,13 @@ static bool __vma_aligned_range_pmd_shareable(struct vm_area_struct *vma, */ if (!(vma->vm_flags & VM_MAYSHARE)) return false; - if (check_vma_lock && !vma->vm_private_data) + if (!vma->vm_private_data) /* vma lock required for sharing */ return false; if (!range_in_vma(vma, start, end)) return false; return true; } -static bool vma_pmd_shareable(struct vm_area_struct *vma) -{ - unsigned long start = ALIGN(vma->vm_start, PUD_SIZE), - end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); - - if (start >= end) - return false; - - return __vma_aligned_range_pmd_shareable(vma, start, end, false); -} - -static bool vma_addr_pmd_shareable(struct vm_area_struct *vma, - unsigned long addr) -{ - unsigned long start = addr & PUD_MASK; - unsigned long end = start + PUD_SIZE; - - 
return __vma_aligned_range_pmd_shareable(vma, start, end, true); -} - -bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) -{ - return vma_addr_pmd_shareable(vma, addr); -} - /* * Determine if start,end range within vma could be mapped by shared pmd. * If yes, adjust start and end to cover range associated with possible @@ -6880,17 +6856,21 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) if (vma->vm_private_data) return; - /* Check size/alignment for pmd sharing possible */ - if (!vma_pmd_shareable(vma)) - return; - vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); - if (!vma_lock) + if (!vma_lock) { /* * If we can not allocate structure, then vma can not - * participate in pmd sharing. + * participate in pmd sharing. This is only a possible + * performance enhancement and memory saving issue. + * However, the lock is also used to synchronize page + * faults with truncation. If the lock is not present, + * unlikely races could leave pages in a file past i_size + * until the file is removed. Warn in the unlikely case of + * allocation failure. */ + pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); return; + } kref_init(&vma_lock->refs); init_rwsem(&vma_lock->rw_sema); -- GitLab From a4e430c8c8ba96be8c6ec4f2eb108bb8bcbee069 Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya <ematsumiya@suse.de> Date: Tue, 20 Sep 2022 15:10:35 -0300 Subject: [PATCH 1593/2223] cifs: replace kfree() with kfree_sensitive() for sensitive data Replace kfree with kfree_sensitive, or prepend memzero_explicit() in other cases, when freeing sensitive material that could still be left in memory. 
Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de> Reported-by: kernel test robot <oliver.sang@intel.com> Link: https://lore.kernel.org/r/202209201529.ec633796-oliver.sang@intel.com Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cifsencrypt.c | 12 ++++++------ fs/cifs/connect.c | 6 +++--- fs/cifs/fs_context.c | 12 ++++++++++-- fs/cifs/misc.c | 2 +- fs/cifs/sess.c | 24 +++++++++++++++--------- fs/cifs/smb2ops.c | 6 +++--- fs/cifs/smb2pdu.c | 19 ++++++++++++++----- 7 files changed, 52 insertions(+), 29 deletions(-) diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 46f5718754f94..d848bc0aac274 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -679,7 +679,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) unlock: cifs_server_unlock(ses->server); setup_ntlmv2_rsp_ret: - kfree(tiblob); + kfree_sensitive(tiblob); return rc; } @@ -753,14 +753,14 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server) server->secmech.ccmaesdecrypt = NULL; } - kfree(server->secmech.sdesccmacaes); + kfree_sensitive(server->secmech.sdesccmacaes); server->secmech.sdesccmacaes = NULL; - kfree(server->secmech.sdeschmacsha256); + kfree_sensitive(server->secmech.sdeschmacsha256); server->secmech.sdeschmacsha256 = NULL; - kfree(server->secmech.sdeschmacmd5); + kfree_sensitive(server->secmech.sdeschmacmd5); server->secmech.sdeschmacmd5 = NULL; - kfree(server->secmech.sdescmd5); + kfree_sensitive(server->secmech.sdescmd5); server->secmech.sdescmd5 = NULL; - kfree(server->secmech.sdescsha512); + kfree_sensitive(server->secmech.sdescsha512); server->secmech.sdescsha512 = NULL; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 93e59b3b36c73..40900aace416e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -311,7 +311,7 @@ cifs_abort_connection(struct TCP_Server_Info *server) } server->sequence_number = 0; 
server->session_estab = false; - kfree(server->session_key.response); + kfree_sensitive(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; server->lstrp = jiffies; @@ -1580,7 +1580,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) cifs_crypto_secmech_release(server); - kfree(server->session_key.response); + kfree_sensitive(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; kfree(server->hostname); @@ -4135,7 +4135,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (ses->auth_key.response) { cifs_dbg(FYI, "Free previous auth_key.response = %p\n", ses->auth_key.response); - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; ses->auth_key.len = 0; } diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 0e13dec86b252..45119597c7655 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -791,6 +791,13 @@ do { \ cifs_sb->ctx->field = NULL; \ } while (0) +#define STEAL_STRING_SENSITIVE(cifs_sb, ctx, field) \ +do { \ + kfree_sensitive(ctx->field); \ + ctx->field = cifs_sb->ctx->field; \ + cifs_sb->ctx->field = NULL; \ +} while (0) + static int smb3_reconfigure(struct fs_context *fc) { struct smb3_fs_context *ctx = smb3_fc2context(fc); @@ -811,7 +818,7 @@ static int smb3_reconfigure(struct fs_context *fc) STEAL_STRING(cifs_sb, ctx, UNC); STEAL_STRING(cifs_sb, ctx, source); STEAL_STRING(cifs_sb, ctx, username); - STEAL_STRING(cifs_sb, ctx, password); + STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); STEAL_STRING(cifs_sb, ctx, domainname); STEAL_STRING(cifs_sb, ctx, nodename); STEAL_STRING(cifs_sb, ctx, iocharset); @@ -1162,7 +1169,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, } break; case Opt_pass: - kfree(ctx->password); + kfree_sensitive(ctx->password); ctx->password = NULL; if (strlen(param->string) == 0) break; @@ -1470,6 +1477,7 @@ static 
int smb3_fs_context_parse_param(struct fs_context *fc, return 0; cifs_parse_mount_err: + kfree_sensitive(ctx->password); return -EINVAL; } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 20a112c96bae5..72bd1b2b323f6 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1119,7 +1119,7 @@ cifs_alloc_hash(const char *name, void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc) { - kfree(*sdesc); + kfree_sensitive(*sdesc); *sdesc = NULL; if (*shash) crypto_free_shash(*shash); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 3af3b05b6c740..f1c3c6d9146c3 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -1213,6 +1213,12 @@ out_free_smb_buf: static void sess_free_buffer(struct sess_data *sess_data) { + int i; + + /* zero the session data before freeing, as it might contain sensitive info (keys, etc) */ + for (i = 0; i < 3; i++) + if (sess_data->iov[i].iov_base) + memzero_explicit(sess_data->iov[i].iov_base, sess_data->iov[i].iov_len); free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); sess_data->buf0_type = CIFS_NO_BUFFER; @@ -1374,7 +1380,7 @@ out: sess_data->result = rc; sess_data->func = NULL; sess_free_buffer(sess_data); - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; } @@ -1513,7 +1519,7 @@ out: sess_data->result = rc; sess_data->func = NULL; sess_free_buffer(sess_data); - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; } @@ -1648,7 +1654,7 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); out_free_ntlmsspblob: - kfree(ntlmsspblob); + kfree_sensitive(ntlmsspblob); out: sess_free_buffer(sess_data); @@ -1658,9 +1664,9 @@ out: } /* Else error. 
Cleanup */ - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; - kfree(ses->ntlmssp); + kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; sess_data->func = NULL; @@ -1759,7 +1765,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) } out_free_ntlmsspblob: - kfree(ntlmsspblob); + kfree_sensitive(ntlmsspblob); out: sess_free_buffer(sess_data); @@ -1767,9 +1773,9 @@ out: rc = sess_establish_session(sess_data); /* Cleanup */ - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; - kfree(ses->ntlmssp); + kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; sess_data->func = NULL; @@ -1845,7 +1851,7 @@ int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, rc = sess_data->result; out: - kfree(sess_data); + kfree_sensitive(sess_data); return rc; } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 10f9ef68e510c..9a686870e8b79 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4423,11 +4423,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); - kfree(iv); + kfree_sensitive(iv); free_sg: - kfree(sg); + kfree_sensitive(sg); free_req: - kfree(req); + kfree_sensitive(req); return rc; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 40fce33763072..b3c4d2e54eaa3 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1345,6 +1345,13 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) static void SMB2_sess_free_buffer(struct SMB2_sess_data *sess_data) { + int i; + + /* zero the session data before freeing, as it might contain sensitive info (keys, etc) */ + for (i = 0; i < 2; i++) + if (sess_data->iov[i].iov_base) + memzero_explicit(sess_data->iov[i].iov_base, sess_data->iov[i].iov_len); + free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); sess_data->buf0_type = 
CIFS_NO_BUFFER; } @@ -1477,6 +1484,8 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) out_put_spnego_key: key_invalidate(spnego_key); key_put(spnego_key); + if (rc) + kfree_sensitive(ses->auth_key.response); out: sess_data->result = rc; sess_data->func = NULL; @@ -1573,7 +1582,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) } out: - kfree(ntlmssp_blob); + memzero_explicit(ntlmssp_blob, blob_length); SMB2_sess_free_buffer(sess_data); if (!rc) { sess_data->result = 0; @@ -1581,7 +1590,7 @@ out: return; } out_err: - kfree(ses->ntlmssp); + kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; sess_data->result = rc; sess_data->func = NULL; @@ -1657,9 +1666,9 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) } #endif out: - kfree(ntlmssp_blob); + memzero_explicit(ntlmssp_blob, blob_length); SMB2_sess_free_buffer(sess_data); - kfree(ses->ntlmssp); + kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; sess_data->result = rc; sess_data->func = NULL; @@ -1737,7 +1746,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, cifs_server_dbg(VFS, "signing requested but authenticated as guest\n"); rc = sess_data->result; out: - kfree(sess_data); + kfree_sensitive(sess_data); return rc; } -- GitLab From 8698baa1b768fc5cd4bf73e846680a812678d029 Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya <ematsumiya@suse.de> Date: Wed, 5 Oct 2022 02:42:07 -0500 Subject: [PATCH 1594/2223] smb3: rename encryption/decryption TFMs Detach the TFM name from a specific algorithm (AES-CCM) as AES-GCM is also supported, making the name misleading. 
s/ccmaesencrypt/enc/ s/ccmaesdecrypt/dec/ Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de> Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cifsencrypt.c | 12 ++++++------ fs/cifs/cifsglob.h | 4 ++-- fs/cifs/smb2ops.c | 3 +-- fs/cifs/smb2transport.c | 12 ++++++------ 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index d848bc0aac274..1f766f3e185e6 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -743,14 +743,14 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server) server->secmech.hmacmd5 = NULL; } - if (server->secmech.ccmaesencrypt) { - crypto_free_aead(server->secmech.ccmaesencrypt); - server->secmech.ccmaesencrypt = NULL; + if (server->secmech.enc) { + crypto_free_aead(server->secmech.enc); + server->secmech.enc = NULL; } - if (server->secmech.ccmaesdecrypt) { - crypto_free_aead(server->secmech.ccmaesdecrypt); - server->secmech.ccmaesdecrypt = NULL; + if (server->secmech.dec) { + crypto_free_aead(server->secmech.dec); + server->secmech.dec = NULL; } kfree_sensitive(server->secmech.sdesccmacaes); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 338bc11f682ee..95e90d662f065 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -171,8 +171,8 @@ struct cifs_secmech { struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */ struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */ struct sdesc *sdescsha512; /* ctxt to generate smb3.11 signing key */ - struct crypto_aead *ccmaesencrypt; /* smb3 encryption aead */ - struct crypto_aead *ccmaesdecrypt; /* smb3 decryption aead */ + struct crypto_aead *enc; /* smb3 AEAD encryption TFM (AES-CCM and AES-GCM) */ + struct crypto_aead *dec; /* smb3 AEAD decryption TFM (AES-CCM and AES-GCM) */ }; /* per smb session structure/fields */ diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 9a686870e8b79..5187250c5f662 100644 --- 
a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4357,8 +4357,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - tfm = enc ? server->secmech.ccmaesencrypt : - server->secmech.ccmaesdecrypt; + tfm = enc ? server->secmech.enc : server->secmech.dec; if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 4640fc4a8b133..d4e1a5d74dcde 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -904,7 +904,7 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) { struct crypto_aead *tfm; - if (!server->secmech.ccmaesencrypt) { + if (!server->secmech.enc) { if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) tfm = crypto_alloc_aead("gcm(aes)", 0, 0); @@ -915,23 +915,23 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) __func__); return PTR_ERR(tfm); } - server->secmech.ccmaesencrypt = tfm; + server->secmech.enc = tfm; } - if (!server->secmech.ccmaesdecrypt) { + if (!server->secmech.dec) { if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) tfm = crypto_alloc_aead("gcm(aes)", 0, 0); else tfm = crypto_alloc_aead("ccm(aes)", 0, 0); if (IS_ERR(tfm)) { - crypto_free_aead(server->secmech.ccmaesencrypt); - server->secmech.ccmaesencrypt = NULL; + crypto_free_aead(server->secmech.enc); + server->secmech.enc = NULL; cifs_server_dbg(VFS, "%s: Failed to alloc decrypt aead\n", __func__); return PTR_ERR(tfm); } - server->secmech.ccmaesdecrypt = tfm; + server->secmech.dec = tfm; } return 0; -- GitLab From 1f3d5477b944c8db8d73d7070ea98d8f1a8224c0 Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya <ematsumiya@suse.de> Date: Thu, 29 Sep 2022 17:36:50 -0300 Subject: [PATCH 1595/2223] cifs: secmech: use shash_desc directly, remove sdesc The struct sdesc is just a wrapper around shash_desc, with exact 
same memory layout. Replace the hashing TFMs with shash_desc as it's what's passed to the crypto API anyway. Also remove the crypto_shash pointers as they can be accessed via shash_desc->tfm (and are actually only used in the setkey calls). Adapt cifs_{alloc,free}_hash functions to this change. Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de> Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cifsencrypt.c | 86 +++++++++++++---------------------------- fs/cifs/cifsglob.h | 26 ++++--------- fs/cifs/cifsproto.h | 5 +-- fs/cifs/link.c | 13 +++---- fs/cifs/misc.c | 49 ++++++++++++----------- fs/cifs/smb2misc.c | 13 +++---- fs/cifs/smb2transport.c | 72 +++++++++++++--------------------- 7 files changed, 98 insertions(+), 166 deletions(-) diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 1f766f3e185e6..5db73c0f792a5 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -103,26 +103,24 @@ static int cifs_calc_signature(struct smb_rqst *rqst, if (!rqst->rq_iov || !signature || !server) return -EINVAL; - rc = cifs_alloc_hash("md5", &server->secmech.md5, - &server->secmech.sdescmd5); + rc = cifs_alloc_hash("md5", &server->secmech.md5); if (rc) return -1; - rc = crypto_shash_init(&server->secmech.sdescmd5->shash); + rc = crypto_shash_init(server->secmech.md5); if (rc) { cifs_dbg(VFS, "%s: Could not init md5\n", __func__); return rc; } - rc = crypto_shash_update(&server->secmech.sdescmd5->shash, + rc = crypto_shash_update(server->secmech.md5, server->session_key.response, server->session_key.len); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); return rc; } - return __cifs_calc_signature(rqst, server, signature, - &server->secmech.sdescmd5->shash); + return __cifs_calc_signature(rqst, server, signature, server->secmech.md5); } /* must be called with server->srv_mutex held */ @@ -412,7 +410,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char 
*ntlmv2_hash, wchar_t *domain; wchar_t *server; - if (!ses->server->secmech.sdeschmacmd5) { + if (!ses->server->secmech.hmacmd5) { cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); return -1; } @@ -420,14 +418,14 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, /* calculate md4 hash of password */ E_md4hash(ses->password, nt_hash, nls_cp); - rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, nt_hash, CIFS_NTHASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set NT Hash as a key\n", __func__); return rc; } - rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + rc = crypto_shash_init(ses->server->secmech.hmacmd5); if (rc) { cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); return rc; @@ -448,7 +446,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, memset(user, '\0', 2); } - rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(ses->server->secmech.hmacmd5, (char *)user, 2 * len); kfree(user); if (rc) { @@ -468,7 +466,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len, nls_cp); rc = - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + crypto_shash_update(ses->server->secmech.hmacmd5, (char *)domain, 2 * len); kfree(domain); if (rc) { @@ -488,7 +486,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len, nls_cp); rc = - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + crypto_shash_update(ses->server->secmech.hmacmd5, (char *)server, 2 * len); kfree(server); if (rc) { @@ -498,7 +496,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, } } - rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_final(ses->server->secmech.hmacmd5, 
ntlmv2_hash); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); @@ -518,12 +516,12 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE + offsetof(struct ntlmv2_resp, challenge.key[0])); - if (!ses->server->secmech.sdeschmacmd5) { + if (!ses->server->secmech.hmacmd5) { cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); return -1; } - rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", @@ -531,7 +529,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) return rc; } - rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + rc = crypto_shash_init(ses->server->secmech.hmacmd5); if (rc) { cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); return rc; @@ -543,7 +541,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) else memcpy(ntlmv2->challenge.key, ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); - rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(ses->server->secmech.hmacmd5, ntlmv2->challenge.key, hash_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); @@ -551,7 +549,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) } /* Note that the MD5 digest over writes anon.challenge_key.key */ - rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_final(ses->server->secmech.hmacmd5, ntlmv2->ntlmv2_hash); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); @@ -627,9 +625,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) cifs_server_lock(ses->server); - rc = cifs_alloc_hash("hmac(md5)", - &ses->server->secmech.hmacmd5, - &ses->server->secmech.sdeschmacmd5); + rc = 
cifs_alloc_hash("hmac(md5)", &ses->server->secmech.hmacmd5); if (rc) { goto unlock; } @@ -649,7 +645,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) } /* now calculate the session key for NTLMv2 */ - rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", @@ -657,13 +653,13 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) goto unlock; } - rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + rc = crypto_shash_init(ses->server->secmech.hmacmd5); if (rc) { cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); goto unlock; } - rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(ses->server->secmech.hmacmd5, ntlmv2->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { @@ -671,7 +667,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) goto unlock; } - rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_final(ses->server->secmech.hmacmd5, ses->auth_key.response); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); @@ -718,30 +714,11 @@ calc_seckey(struct cifs_ses *ses) void cifs_crypto_secmech_release(struct TCP_Server_Info *server) { - if (server->secmech.cmacaes) { - crypto_free_shash(server->secmech.cmacaes); - server->secmech.cmacaes = NULL; - } - - if (server->secmech.hmacsha256) { - crypto_free_shash(server->secmech.hmacsha256); - server->secmech.hmacsha256 = NULL; - } - - if (server->secmech.md5) { - crypto_free_shash(server->secmech.md5); - server->secmech.md5 = NULL; - } - - if (server->secmech.sha512) { - crypto_free_shash(server->secmech.sha512); - server->secmech.sha512 = NULL; - } - - if (server->secmech.hmacmd5) { - crypto_free_shash(server->secmech.hmacmd5); - server->secmech.hmacmd5 = NULL; - } + 
cifs_free_hash(&server->secmech.aes_cmac); + cifs_free_hash(&server->secmech.hmacsha256); + cifs_free_hash(&server->secmech.md5); + cifs_free_hash(&server->secmech.sha512); + cifs_free_hash(&server->secmech.hmacmd5); if (server->secmech.enc) { crypto_free_aead(server->secmech.enc); @@ -752,15 +729,4 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server) crypto_free_aead(server->secmech.dec); server->secmech.dec = NULL; } - - kfree_sensitive(server->secmech.sdesccmacaes); - server->secmech.sdesccmacaes = NULL; - kfree_sensitive(server->secmech.sdeschmacsha256); - server->secmech.sdeschmacsha256 = NULL; - kfree_sensitive(server->secmech.sdeschmacmd5); - server->secmech.sdeschmacmd5 = NULL; - kfree_sensitive(server->secmech.sdescmd5); - server->secmech.sdescmd5 = NULL; - kfree_sensitive(server->secmech.sdescsha512); - server->secmech.sdescsha512 = NULL; } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 95e90d662f065..52ddf4163b981 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -153,26 +153,16 @@ struct session_key { char *response; }; -/* crypto security descriptor definition */ -struct sdesc { - struct shash_desc shash; - char ctx[]; -}; - /* crypto hashing related structure/fields, not specific to a sec mech */ struct cifs_secmech { - struct crypto_shash *hmacmd5; /* hmac-md5 hash function */ - struct crypto_shash *md5; /* md5 hash function */ - struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */ - struct crypto_shash *cmacaes; /* block-cipher based MAC function */ - struct crypto_shash *sha512; /* sha512 hash function */ - struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */ - struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */ - struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */ - struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */ - struct sdesc *sdescsha512; /* ctxt to generate smb3.11 signing key */ - struct crypto_aead *enc; /* smb3 AEAD encryption TFM 
(AES-CCM and AES-GCM) */ - struct crypto_aead *dec; /* smb3 AEAD decryption TFM (AES-CCM and AES-GCM) */ + struct shash_desc *hmacmd5; /* hmacmd5 hash function, for NTLMv2/CR1 hashes */ + struct shash_desc *md5; /* md5 hash function, for CIFS/SMB1 signatures */ + struct shash_desc *hmacsha256; /* hmac-sha256 hash function, for SMB2 signatures */ + struct shash_desc *sha512; /* sha512 hash function, for SMB3.1.1 preauth hash */ + struct shash_desc *aes_cmac; /* block-cipher based MAC function, for SMB3 signatures */ + + struct crypto_aead *enc; /* smb3 encryption AEAD TFM (AES-CCM and AES-GCM) */ + struct crypto_aead *dec; /* smb3 decryption AEAD TFM (AES-CCM and AES-GCM) */ }; /* per smb session structure/fields */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 71386978858eb..84ec71bdfacdf 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -598,9 +598,8 @@ struct cifs_aio_ctx *cifs_aio_ctx_alloc(void); void cifs_aio_ctx_release(struct kref *refcount); int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); -int cifs_alloc_hash(const char *name, struct crypto_shash **shash, - struct sdesc **sdesc); -void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc); +int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); +void cifs_free_hash(struct shash_desc **sdesc); extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, unsigned int *len, unsigned int *offset); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 6803cb27eecc3..cd29c296cec60 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -38,29 +38,28 @@ static int symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) { int rc; - struct crypto_shash *md5 = NULL; - struct sdesc *sdescmd5 = NULL; + struct shash_desc *md5 = NULL; - rc = cifs_alloc_hash("md5", &md5, &sdescmd5); + rc = cifs_alloc_hash("md5", &md5); if (rc) goto symlink_hash_err; - rc = crypto_shash_init(&sdescmd5->shash); + rc = 
crypto_shash_init(md5); if (rc) { cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__); goto symlink_hash_err; } - rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len); + rc = crypto_shash_update(md5, link_str, link_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__); goto symlink_hash_err; } - rc = crypto_shash_final(&sdescmd5->shash, md5_hash); + rc = crypto_shash_final(md5, md5_hash); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); symlink_hash_err: - cifs_free_hash(&md5, &sdescmd5); + cifs_free_hash(&md5); return rc; } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 72bd1b2b323f6..da51ffd029280 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1071,59 +1071,58 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) /** * cifs_alloc_hash - allocate hash and hash context together * @name: The name of the crypto hash algo - * @shash: Where to put the pointer to the hash algo - * @sdesc: Where to put the pointer to the hash descriptor + * @sdesc: SHASH descriptor where to put the pointer to the hash TFM * * The caller has to make sure @sdesc is initialized to either NULL or - * a valid context. Both can be freed via cifs_free_hash(). + * a valid context. It can be freed via cifs_free_hash(). 
*/ int -cifs_alloc_hash(const char *name, - struct crypto_shash **shash, struct sdesc **sdesc) +cifs_alloc_hash(const char *name, struct shash_desc **sdesc) { int rc = 0; - size_t size; + struct crypto_shash *alg = NULL; - if (*sdesc != NULL) + if (*sdesc) return 0; - *shash = crypto_alloc_shash(name, 0, 0); - if (IS_ERR(*shash)) { - cifs_dbg(VFS, "Could not allocate crypto %s\n", name); - rc = PTR_ERR(*shash); - *shash = NULL; + alg = crypto_alloc_shash(name, 0, 0); + if (IS_ERR(alg)) { + cifs_dbg(VFS, "Could not allocate shash TFM '%s'\n", name); + rc = PTR_ERR(alg); *sdesc = NULL; return rc; } - size = sizeof(struct shash_desc) + crypto_shash_descsize(*shash); - *sdesc = kmalloc(size, GFP_KERNEL); + *sdesc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(alg), GFP_KERNEL); if (*sdesc == NULL) { - cifs_dbg(VFS, "no memory left to allocate crypto %s\n", name); - crypto_free_shash(*shash); - *shash = NULL; + cifs_dbg(VFS, "no memory left to allocate shash TFM '%s'\n", name); + crypto_free_shash(alg); return -ENOMEM; } - (*sdesc)->shash.tfm = *shash; + (*sdesc)->tfm = alg; return 0; } /** * cifs_free_hash - free hash and hash context together - * @shash: Where to find the pointer to the hash algo - * @sdesc: Where to find the pointer to the hash descriptor + * @sdesc: Where to find the pointer to the hash TFM * - * Freeing a NULL hash or context is safe. + * Freeing a NULL descriptor is safe. 
*/ void -cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc) +cifs_free_hash(struct shash_desc **sdesc) { + if (unlikely(!sdesc) || !*sdesc) + return; + + if ((*sdesc)->tfm) { + crypto_free_shash((*sdesc)->tfm); + (*sdesc)->tfm = NULL; + } + kfree_sensitive(*sdesc); *sdesc = NULL; - if (*shash) - crypto_free_shash(*shash); - *shash = NULL; } /** diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index d73e5672aac49..7db5c09ecceba 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -870,8 +870,8 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server, struct kvec *iov, int nvec) { int i, rc; - struct sdesc *d; struct smb2_hdr *hdr; + struct shash_desc *sha512 = NULL; hdr = (struct smb2_hdr *)iov[0].iov_base; /* neg prot are always taken */ @@ -901,14 +901,14 @@ ok: if (rc) return rc; - d = server->secmech.sdescsha512; - rc = crypto_shash_init(&d->shash); + sha512 = server->secmech.sha512; + rc = crypto_shash_init(sha512); if (rc) { cifs_dbg(VFS, "%s: Could not init sha512 shash\n", __func__); return rc; } - rc = crypto_shash_update(&d->shash, ses->preauth_sha_hash, + rc = crypto_shash_update(sha512, ses->preauth_sha_hash, SMB2_PREAUTH_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__); @@ -916,8 +916,7 @@ ok: } for (i = 0; i < nvec; i++) { - rc = crypto_shash_update(&d->shash, - iov[i].iov_base, iov[i].iov_len); + rc = crypto_shash_update(sha512, iov[i].iov_base, iov[i].iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__); @@ -925,7 +924,7 @@ ok: } } - rc = crypto_shash_final(&d->shash, ses->preauth_sha_hash); + rc = crypto_shash_final(sha512, ses->preauth_sha_hash); if (rc) { cifs_dbg(VFS, "%s: Could not finalize sha512 shash\n", __func__); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index d4e1a5d74dcde..dfcbcc0b86e4a 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -32,19 +32,17 @@ 
smb3_crypto_shash_allocate(struct TCP_Server_Info *server) struct cifs_secmech *p = &server->secmech; int rc; - rc = cifs_alloc_hash("hmac(sha256)", - &p->hmacsha256, - &p->sdeschmacsha256); + rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256); if (rc) goto err; - rc = cifs_alloc_hash("cmac(aes)", &p->cmacaes, &p->sdesccmacaes); + rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac); if (rc) goto err; return 0; err: - cifs_free_hash(&p->hmacsha256, &p->sdeschmacsha256); + cifs_free_hash(&p->hmacsha256); return rc; } @@ -54,25 +52,23 @@ smb311_crypto_shash_allocate(struct TCP_Server_Info *server) struct cifs_secmech *p = &server->secmech; int rc = 0; - rc = cifs_alloc_hash("hmac(sha256)", - &p->hmacsha256, - &p->sdeschmacsha256); + rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256); if (rc) return rc; - rc = cifs_alloc_hash("cmac(aes)", &p->cmacaes, &p->sdesccmacaes); + rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac); if (rc) goto err; - rc = cifs_alloc_hash("sha512", &p->sha512, &p->sdescsha512); + rc = cifs_alloc_hash("sha512", &p->sha512); if (rc) goto err; return 0; err: - cifs_free_hash(&p->cmacaes, &p->sdesccmacaes); - cifs_free_hash(&p->hmacsha256, &p->sdeschmacsha256); + cifs_free_hash(&p->aes_cmac); + cifs_free_hash(&p->hmacsha256); return rc; } @@ -220,8 +216,6 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; struct cifs_ses *ses; struct shash_desc *shash; - struct crypto_shash *hash; - struct sdesc *sdesc = NULL; struct smb_rqst drqst; ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId)); @@ -234,19 +228,17 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); if (allocate_crypto) { - rc = cifs_alloc_hash("hmac(sha256)", &hash, &sdesc); + rc = cifs_alloc_hash("hmac(sha256)", &shash); if (rc) { cifs_server_dbg(VFS, "%s: sha256 alloc failed\n", __func__); goto out; } - shash = 
&sdesc->shash; } else { - hash = server->secmech.hmacsha256; - shash = &server->secmech.sdeschmacsha256->shash; + shash = server->secmech.hmacsha256; } - rc = crypto_shash_setkey(hash, ses->auth_key.response, + rc = crypto_shash_setkey(shash->tfm, ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { cifs_server_dbg(VFS, @@ -288,7 +280,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, out: if (allocate_crypto) - cifs_free_hash(&hash, &sdesc); + cifs_free_hash(&shash); if (ses) cifs_put_smb_ses(ses); return rc; @@ -315,42 +307,38 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, goto smb3signkey_ret; } - rc = crypto_shash_setkey(server->secmech.hmacsha256, + rc = crypto_shash_setkey(server->secmech.hmacsha256->tfm, ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { cifs_server_dbg(VFS, "%s: Could not set with session key\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); + rc = crypto_shash_init(server->secmech.hmacsha256); if (rc) { cifs_server_dbg(VFS, "%s: Could not init sign hmac\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - i, 4); + rc = crypto_shash_update(server->secmech.hmacsha256, i, 4); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with n\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - label.iov_base, label.iov_len); + rc = crypto_shash_update(server->secmech.hmacsha256, label.iov_base, label.iov_len); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with label\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - &zero, 1); + rc = crypto_shash_update(server->secmech.hmacsha256, &zero, 1); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with zero\n", __func__); goto smb3signkey_ret; } - rc = 
crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - context.iov_base, context.iov_len); + rc = crypto_shash_update(server->secmech.hmacsha256, context.iov_base, context.iov_len); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with context\n", __func__); goto smb3signkey_ret; @@ -358,19 +346,16 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) { - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - L256, 4); + rc = crypto_shash_update(server->secmech.hmacsha256, L256, 4); } else { - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - L128, 4); + rc = crypto_shash_update(server->secmech.hmacsha256, L128, 4); } if (rc) { cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash, - hashptr); + rc = crypto_shash_final(server->secmech.hmacsha256, hashptr); if (rc) { cifs_server_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); goto smb3signkey_ret; @@ -551,8 +536,6 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, struct kvec *iov = rqst->rq_iov; struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; struct shash_desc *shash; - struct crypto_shash *hash; - struct sdesc *sdesc = NULL; struct smb_rqst drqst; u8 key[SMB3_SIGN_KEY_SIZE]; @@ -563,27 +546,24 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, } if (allocate_crypto) { - rc = cifs_alloc_hash("cmac(aes)", &hash, &sdesc); + rc = cifs_alloc_hash("cmac(aes)", &shash); if (rc) return rc; - - shash = &sdesc->shash; } else { - hash = server->secmech.cmacaes; - shash = &server->secmech.sdesccmacaes->shash; + shash = server->secmech.aes_cmac; } memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE); memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); - rc = crypto_shash_setkey(hash, key, 
SMB2_CMACAES_SIZE); + rc = crypto_shash_setkey(shash->tfm, key, SMB2_CMACAES_SIZE); if (rc) { cifs_server_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__); goto out; } /* - * we already allocate sdesccmacaes when we init smb3 signing key, + * we already allocate aes_cmac when we init smb3 signing key, * so unlike smb2 case we do not have to check here if secmech are * initialized */ @@ -619,7 +599,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, out: if (allocate_crypto) - cifs_free_hash(&hash, &sdesc); + cifs_free_hash(&shash); return rc; } -- GitLab From 958553d13478ad0e35fa09fecad3ce73277ccaf5 Mon Sep 17 00:00:00 2001 From: Steve French <stfrench@microsoft.com> Date: Sun, 2 Oct 2022 22:09:45 -0500 Subject: [PATCH 1596/2223] smb3: fix oops in calculating shash_setkey shash was not being initialized in one place in smb3_calc_signature and smb2_calc_signature Reviewed-by: Enzo Matsumiya <ematsumiya@suse.de> Acked-by: Tom Talpey <tom@talpey.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/smb2transport.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index dfcbcc0b86e4a..8e3f26e6f6b9b 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -215,7 +215,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, struct kvec *iov = rqst->rq_iov; struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; struct cifs_ses *ses; - struct shash_desc *shash; + struct shash_desc *shash = NULL; struct smb_rqst drqst; ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId)); @@ -535,7 +535,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; - struct shash_desc *shash; + struct shash_desc *shash = NULL; struct smb_rqst drqst; u8 
key[SMB3_SIGN_KEY_SIZE]; -- GitLab From 28148a17c988b614534f457da86893f83664ad43 Mon Sep 17 00:00:00 2001 From: Jann Horn <jannh@google.com> Date: Thu, 6 Oct 2022 20:33:01 +0200 Subject: [PATCH 1597/2223] openrisc: Fix pagewalk usage in arch_dma_{clear, set}_uncached Since commit 8782fb61cc848 ("mm: pagewalk: Fix race between unmap and page walker"), walk_page_range() on kernel ranges won't work anymore, walk_page_range_novma() must be used instead. Note: I don't have an openrisc development setup, so this is completely untested. Fixes: 8782fb61cc848 ("mm: pagewalk: Fix race between unmap and page walker") Signed-off-by: Jann Horn <jannh@google.com> Signed-off-by: Stafford Horne <shorne@gmail.com> --- arch/openrisc/kernel/dma.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index a82b2caaa560d..b3edbb33b621d 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -74,10 +74,10 @@ void *arch_dma_set_uncached(void *cpu_addr, size_t size) * We need to iterate through the pages, clearing the dcache for * them and setting the cache-inhibit bit. 
*/ - mmap_read_lock(&init_mm); - error = walk_page_range(&init_mm, va, va + size, &set_nocache_walk_ops, - NULL); - mmap_read_unlock(&init_mm); + mmap_write_lock(&init_mm); + error = walk_page_range_novma(&init_mm, va, va + size, + &set_nocache_walk_ops, NULL, NULL); + mmap_write_unlock(&init_mm); if (error) return ERR_PTR(error); @@ -88,11 +88,11 @@ void arch_dma_clear_uncached(void *cpu_addr, size_t size) { unsigned long va = (unsigned long)cpu_addr; - mmap_read_lock(&init_mm); + mmap_write_lock(&init_mm); /* walk_page_range shouldn't be able to fail here */ - WARN_ON(walk_page_range(&init_mm, va, va + size, - &clear_nocache_walk_ops, NULL)); - mmap_read_unlock(&init_mm); + WARN_ON(walk_page_range_novma(&init_mm, va, va + size, + &clear_nocache_walk_ops, NULL, NULL)); + mmap_write_unlock(&init_mm); } void arch_sync_dma_for_device(phys_addr_t addr, size_t size, -- GitLab From 59945216889518982d262d4cab099c6554f58867 Mon Sep 17 00:00:00 2001 From: Colin Ian King <colin.i.king@gmail.com> Date: Wed, 7 Sep 2022 14:50:56 +0100 Subject: [PATCH 1598/2223] fbdev: udlfb: Remove redundant initialization to variable identical The variable identical is being initialized with a value that is never read. The variable is being re-assigned later on. The initialization is redundant and can be removed. 
Cleans up clang scan-build warning: drivers/video/fbdev/udlfb.c:373:6: warning: Value stored to 'identical' during its initialization is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King <colin.i.king@gmail.com> Signed-off-by: Helge Deller <deller@gmx.de> --- drivers/video/fbdev/udlfb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/udlfb.c b/drivers/video/fbdev/udlfb.c index c863244ef12cb..216d49c9d47e5 100644 --- a/drivers/video/fbdev/udlfb.c +++ b/drivers/video/fbdev/udlfb.c @@ -370,7 +370,7 @@ static int dlfb_trim_hline(const u8 *bback, const u8 **bfront, int *width_bytes) const unsigned long *back = (const unsigned long *) bback; const unsigned long *front = (const unsigned long *) *bfront; const int width = *width_bytes / sizeof(unsigned long); - int identical = width; + int identical; int start = width; int end = width; -- GitLab From 83434cc0ae8c344b085d0ed6104e3b91e5f02a09 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com> Date: Wed, 14 Sep 2022 18:22:59 +0800 Subject: [PATCH 1599/2223] fbdev: controlfb: Remove the unused function VAR_MATCH() The function VAR_MATCH is defined in the controlfb.c file, but not called elsewhere, so delete this unused function. drivers/video/fbdev/controlfb.c:111:19: warning: unused function 'VAR_MATCH'. 
Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2153 Reported-by: Abaci Robot <abaci@linux.alibaba.com> Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com> Signed-off-by: Helge Deller <deller@gmx.de> --- drivers/video/fbdev/controlfb.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/video/fbdev/controlfb.c b/drivers/video/fbdev/controlfb.c index aba46118b208b..6bbcd9fc864e9 100644 --- a/drivers/video/fbdev/controlfb.c +++ b/drivers/video/fbdev/controlfb.c @@ -108,13 +108,6 @@ static inline int PAR_EQUAL(struct fb_par_control *x, struct fb_par_control *y) return (!DIRTY(cmode) && !DIRTY(xres) && !DIRTY(yres) && !DIRTY(vxres) && !DIRTY(vyres)); } -static inline int VAR_MATCH(struct fb_var_screeninfo *x, struct fb_var_screeninfo *y) -{ - return (!DIRTY(bits_per_pixel) && !DIRTY(xres) - && !DIRTY(yres) && !DIRTY(xres_virtual) - && !DIRTY(yres_virtual) - && !DIRTY_CMAP(red) && !DIRTY_CMAP(green) && !DIRTY_CMAP(blue)); -} struct fb_info_control { struct fb_info info; -- GitLab From eec5190fc0b14130ed2f67b0de43b6a302b7837f Mon Sep 17 00:00:00 2001 From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com> Date: Wed, 14 Sep 2022 18:23:00 +0800 Subject: [PATCH 1600/2223] fbdev: tridentfb: Remove the unused function shadowmode_off() The function shadowmode_off() is defined in the tridentfb.c file, but not called elsewhere, so delete this unused function. drivers/video/fbdev/tridentfb.c:1131:20: warning: unused function 'shadowmode_off'. 
Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2154 Reported-by: Abaci Robot <abaci@linux.alibaba.com> Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com> Signed-off-by: Helge Deller <deller@gmx.de> --- drivers/video/fbdev/tridentfb.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/video/fbdev/tridentfb.c b/drivers/video/fbdev/tridentfb.c index f9c3b1d38fc26..2154dd5e37bda 100644 --- a/drivers/video/fbdev/tridentfb.c +++ b/drivers/video/fbdev/tridentfb.c @@ -1128,11 +1128,6 @@ static inline void shadowmode_on(struct tridentfb_par *par) write3CE(par, CyberControl, read3CE(par, CyberControl) | 0x81); } -static inline void shadowmode_off(struct tridentfb_par *par) -{ - write3CE(par, CyberControl, read3CE(par, CyberControl) & 0x7E); -} - /* Set the hardware to the requested video mode */ static int tridentfb_set_par(struct fb_info *info) { -- GitLab From 851e0986d964bb75deea24011b0845d550215076 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com> Date: Wed, 14 Sep 2022 18:23:01 +0800 Subject: [PATCH 1601/2223] fbdev: arkfb: Remove the unused function dac_read_reg() The function dac_read_reg() is defined in the arkfb.c file, but not called elsewhere, so delete this unused function. drivers/video/fbdev/arkfb.c:322:18: warning: unused function 'dac_read_reg'. 
Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2155 Reported-by: Abaci Robot <abaci@linux.alibaba.com> Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com> Signed-off-by: Helge Deller <deller@gmx.de> --- drivers/video/fbdev/arkfb.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/video/fbdev/arkfb.c b/drivers/video/fbdev/arkfb.c index a317d9fe1d67d..5f8fec9e5fd4d 100644 --- a/drivers/video/fbdev/arkfb.c +++ b/drivers/video/fbdev/arkfb.c @@ -318,14 +318,6 @@ struct dac_info void *data; }; - -static inline u8 dac_read_reg(struct dac_info *info, u8 reg) -{ - u8 code[2] = {reg, 0}; - info->dac_read_regs(info->data, code, 1); - return code[1]; -} - static inline void dac_read_regs(struct dac_info *info, u8 *code, int count) { info->dac_read_regs(info->data, code, count); -- GitLab From 2559f17ec878adf5c54815e55cf0b72c02bb5303 Mon Sep 17 00:00:00 2001 From: Jules Irenge <jbi.octave@gmail.com> Date: Sun, 18 Sep 2022 00:44:20 +0100 Subject: [PATCH 1602/2223] fbdev: uvesafb: Convert snprintf to scnprintf Coccinelle reports: WARNING: use scnprintf or sprintf Adding to that, there has also been some slow migration from snprintf to scnprintf. 
This article explains the rationale for this change: https://lwn.net/Articles/69419/ Signed-off-by: Jules Irenge <jbi.octave@gmail.com> Signed-off-by: Helge Deller <deller@gmx.de> --- drivers/video/fbdev/uvesafb.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/video/fbdev/uvesafb.c b/drivers/video/fbdev/uvesafb.c index 4df6772802d78..fd5d701106e1d 100644 --- a/drivers/video/fbdev/uvesafb.c +++ b/drivers/video/fbdev/uvesafb.c @@ -1580,7 +1580,7 @@ static ssize_t uvesafb_show_vendor(struct device *dev, struct uvesafb_par *par = info->par; if (par->vbe_ib.oem_vendor_name_ptr) - return snprintf(buf, PAGE_SIZE, "%s\n", (char *) + return scnprintf(buf, PAGE_SIZE, "%s\n", (char *) (&par->vbe_ib) + par->vbe_ib.oem_vendor_name_ptr); else return 0; @@ -1595,7 +1595,7 @@ static ssize_t uvesafb_show_product_name(struct device *dev, struct uvesafb_par *par = info->par; if (par->vbe_ib.oem_product_name_ptr) - return snprintf(buf, PAGE_SIZE, "%s\n", (char *) + return scnprintf(buf, PAGE_SIZE, "%s\n", (char *) (&par->vbe_ib) + par->vbe_ib.oem_product_name_ptr); else return 0; @@ -1610,7 +1610,7 @@ static ssize_t uvesafb_show_product_rev(struct device *dev, struct uvesafb_par *par = info->par; if (par->vbe_ib.oem_product_rev_ptr) - return snprintf(buf, PAGE_SIZE, "%s\n", (char *) + return scnprintf(buf, PAGE_SIZE, "%s\n", (char *) (&par->vbe_ib) + par->vbe_ib.oem_product_rev_ptr); else return 0; @@ -1625,7 +1625,7 @@ static ssize_t uvesafb_show_oem_string(struct device *dev, struct uvesafb_par *par = info->par; if (par->vbe_ib.oem_string_ptr) - return snprintf(buf, PAGE_SIZE, "%s\n", + return scnprintf(buf, PAGE_SIZE, "%s\n", (char *)(&par->vbe_ib) + par->vbe_ib.oem_string_ptr); else return 0; @@ -1639,7 +1639,7 @@ static ssize_t uvesafb_show_nocrtc(struct device *dev, struct fb_info *info = dev_get_drvdata(dev); struct uvesafb_par *par = info->par; - return snprintf(buf, PAGE_SIZE, "%d\n", par->nocrtc); + return scnprintf(buf, PAGE_SIZE, 
"%d\n", par->nocrtc); } static ssize_t uvesafb_store_nocrtc(struct device *dev, -- GitLab From b0e0706007030d1eb05d25de0359725357fe5be6 Mon Sep 17 00:00:00 2001 From: Zhang Qilong <zhangqilong3@huawei.com> Date: Fri, 23 Sep 2022 21:38:44 +0800 Subject: [PATCH 1603/2223] fbdev: omapfb/dss: Use pm_runtime_resume_and_get() instead of pm_runtime_get_sync() Using the newest pm_runtime_resume_and_get is more appropriate for simplifing code here. Signed-off-by: Zhang Qilong <zhangqilong3@huawei.com> Signed-off-by: Helge Deller <deller@gmx.de> --- drivers/video/fbdev/omap2/omapfb/dss/dispc.c | 6 ++---- drivers/video/fbdev/omap2/omapfb/dss/dsi.c | 6 ++---- drivers/video/fbdev/omap2/omapfb/dss/dss.c | 6 ++---- drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c | 6 ++---- drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c | 6 ++---- drivers/video/fbdev/omap2/omapfb/dss/venc.c | 6 ++---- 6 files changed, 12 insertions(+), 24 deletions(-) diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dispc.c b/drivers/video/fbdev/omap2/omapfb/dss/dispc.c index b2d6e6df21615..92fb6b7e1f681 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/dispc.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/dispc.c @@ -519,11 +519,9 @@ int dispc_runtime_get(void) DSSDBG("dispc_runtime_get\n"); - r = pm_runtime_get_sync(&dispc.pdev->dev); - if (WARN_ON(r < 0)) { - pm_runtime_put_sync(&dispc.pdev->dev); + r = pm_runtime_resume_and_get(&dispc.pdev->dev); + if (WARN_ON(r < 0)) return r; - } return 0; } EXPORT_SYMBOL(dispc_runtime_get); diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dsi.c b/drivers/video/fbdev/omap2/omapfb/dss/dsi.c index d43b081d592f0..54b0f034c2edf 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/dsi.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/dsi.c @@ -1136,11 +1136,9 @@ static int dsi_runtime_get(struct platform_device *dsidev) DSSDBG("dsi_runtime_get\n"); - r = pm_runtime_get_sync(&dsi->pdev->dev); - if (WARN_ON(r < 0)) { - pm_runtime_put_sync(&dsi->pdev->dev); + r = 
pm_runtime_resume_and_get(&dsi->pdev->dev); + if (WARN_ON(r < 0)) return r; - } return 0; } diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dss.c b/drivers/video/fbdev/omap2/omapfb/dss/dss.c index 45b9d3cf38602..335e0af4eec1a 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/dss.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/dss.c @@ -767,11 +767,9 @@ int dss_runtime_get(void) DSSDBG("dss_runtime_get\n"); - r = pm_runtime_get_sync(&dss.pdev->dev); - if (WARN_ON(r < 0)) { - pm_runtime_put_sync(&dss.pdev->dev); + r = pm_runtime_resume_and_get(&dss.pdev->dev); + if (WARN_ON(r < 0)) return r; - } return 0; } diff --git a/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c b/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c index 800bd108e834d..0f39612e002e8 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c @@ -38,11 +38,9 @@ static int hdmi_runtime_get(void) DSSDBG("hdmi_runtime_get\n"); - r = pm_runtime_get_sync(&hdmi.pdev->dev); - if (WARN_ON(r < 0)) { - pm_runtime_put_sync(&hdmi.pdev->dev); + r = pm_runtime_resume_and_get(&hdmi.pdev->dev); + if (WARN_ON(r < 0)) return r; - } return 0; } diff --git a/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c b/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c index 2c03608addcd7..bfccc2cb917af 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c @@ -42,11 +42,9 @@ static int hdmi_runtime_get(void) DSSDBG("hdmi_runtime_get\n"); - r = pm_runtime_get_sync(&hdmi.pdev->dev); - if (WARN_ON(r < 0)) { - pm_runtime_put_sync(&hdmi.pdev->dev); + r = pm_runtime_resume_and_get(&hdmi.pdev->dev); + if (WARN_ON(r < 0)) return r; - } return 0; } diff --git a/drivers/video/fbdev/omap2/omapfb/dss/venc.c b/drivers/video/fbdev/omap2/omapfb/dss/venc.c index 905d642ff9ed7..78a7309d25dd3 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/venc.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/venc.c @@ -347,11 +347,9 @@ static int venc_runtime_get(void) 
DSSDBG("venc_runtime_get\n"); - r = pm_runtime_get_sync(&venc.pdev->dev); - if (WARN_ON(r < 0)) { - pm_runtime_put_sync(&venc.pdev->dev); + r = pm_runtime_resume_and_get(&venc.pdev->dev); + if (WARN_ON(r < 0)) return r; - } return 0; } -- GitLab From d13189badcb2d850244034eb73faeb61edce914e Mon Sep 17 00:00:00 2001 From: Shang XiaoJing <shangxiaojing@huawei.com> Date: Fri, 23 Sep 2022 18:20:07 +0800 Subject: [PATCH 1604/2223] fbdev: imxfb: Remove redundant dev_err() call devm_ioremap_resource() prints error message in itself. Remove the dev_err call to avoid redundant error message. Signed-off-by: Shang XiaoJing <shangxiaojing@huawei.com> Signed-off-by: Helge Deller <deller@gmx.de> --- drivers/video/fbdev/imxfb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/video/fbdev/imxfb.c b/drivers/video/fbdev/imxfb.c index 94f3bc637fc88..51fde1b2a7938 100644 --- a/drivers/video/fbdev/imxfb.c +++ b/drivers/video/fbdev/imxfb.c @@ -972,7 +972,6 @@ static int imxfb_probe(struct platform_device *pdev) fbi->regs = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(fbi->regs)) { - dev_err(&pdev->dev, "Cannot map frame buffer registers\n"); ret = PTR_ERR(fbi->regs); goto failed_ioremap; } -- GitLab From e69dade8a4cfe49f3f3af90d966dd34b67721d26 Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang <jiasheng@iscas.ac.cn> Date: Fri, 2 Sep 2022 10:55:55 +0800 Subject: [PATCH 1605/2223] fbdev: gbefb: Convert to use dev_groups The driver core supports the ability to handle the creation and removal of device-specific sysfs files in a race-free manner. Moreover, it can guarantee the success of creation. Therefore, it should be better to convert to use dev_groups. 
Signed-off-by: Jiasheng Jiang <jiasheng@iscas.ac.cn> Signed-off-by: Helge Deller <deller@gmx.de> --- drivers/video/fbdev/gbefb.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/drivers/video/fbdev/gbefb.c b/drivers/video/fbdev/gbefb.c index 6b4d5a7f3e152..1582c718329c7 100644 --- a/drivers/video/fbdev/gbefb.c +++ b/drivers/video/fbdev/gbefb.c @@ -1072,17 +1072,12 @@ static ssize_t gbefb_show_rev(struct device *device, struct device_attribute *at static DEVICE_ATTR(revision, S_IRUGO, gbefb_show_rev, NULL); -static void gbefb_remove_sysfs(struct device *dev) -{ - device_remove_file(dev, &dev_attr_size); - device_remove_file(dev, &dev_attr_revision); -} - -static void gbefb_create_sysfs(struct device *dev) -{ - device_create_file(dev, &dev_attr_size); - device_create_file(dev, &dev_attr_revision); -} +static struct attribute *gbefb_attrs[] = { + &dev_attr_size.attr, + &dev_attr_revision.attr, + NULL, +}; +ATTRIBUTE_GROUPS(gbefb); /* * Initialization @@ -1221,7 +1216,6 @@ static int gbefb_probe(struct platform_device *p_dev) } platform_set_drvdata(p_dev, info); - gbefb_create_sysfs(&p_dev->dev); fb_info(info, "%s rev %d @ 0x%08x using %dkB memory\n", info->fix.id, gbe_revision, (unsigned)GBE_BASE, @@ -1248,7 +1242,6 @@ static int gbefb_remove(struct platform_device* p_dev) gbe_turn_off(); arch_phys_wc_del(par->wc_cookie); release_mem_region(GBE_BASE, sizeof(struct sgi_gbe)); - gbefb_remove_sysfs(&p_dev->dev); framebuffer_release(info); return 0; @@ -1259,6 +1252,7 @@ static struct platform_driver gbefb_driver = { .remove = gbefb_remove, .driver = { .name = "gbefb", + .dev_groups = gbefb_groups, }, }; -- GitLab From 5610bcfe8693c02e2e4c8b31427f1bdbdecc839c Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim <imv4bel@gmail.com> Date: Sun, 25 Sep 2022 06:32:43 -0700 Subject: [PATCH 1606/2223] fbdev: smscufx: Fix use-after-free in ufx_ops_open() A race condition may occur if the user physically removes the USB device while calling open() for 
this device node. This is a race condition between the ufx_ops_open() function and the ufx_usb_disconnect() function, which may eventually result in UAF. So, add a mutex to the ufx_ops_open() and ufx_usb_disconnect() functions to avoid race condition of krefs. Signed-off-by: Hyunwoo Kim <imv4bel@gmail.com> Cc: stable@vger.kernel.org Signed-off-by: Helge Deller <deller@gmx.de> --- drivers/video/fbdev/smscufx.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/video/fbdev/smscufx.c b/drivers/video/fbdev/smscufx.c index d7aa5511c3617..e65bdc499c236 100644 --- a/drivers/video/fbdev/smscufx.c +++ b/drivers/video/fbdev/smscufx.c @@ -137,6 +137,8 @@ static int ufx_submit_urb(struct ufx_data *dev, struct urb * urb, size_t len); static int ufx_alloc_urb_list(struct ufx_data *dev, int count, size_t size); static void ufx_free_urb_list(struct ufx_data *dev); +static DEFINE_MUTEX(disconnect_mutex); + /* reads a control register */ static int ufx_reg_read(struct ufx_data *dev, u32 index, u32 *data) { @@ -1071,9 +1073,13 @@ static int ufx_ops_open(struct fb_info *info, int user) if (user == 0 && !console) return -EBUSY; + mutex_lock(&disconnect_mutex); + /* If the USB device is gone, we don't accept new opens */ - if (dev->virtualized) + if (dev->virtualized) { + mutex_unlock(&disconnect_mutex); return -ENODEV; + } dev->fb_count++; @@ -1097,6 +1103,8 @@ static int ufx_ops_open(struct fb_info *info, int user) pr_debug("open /dev/fb%d user=%d fb_info=%p count=%d", info->node, user, info, dev->fb_count); + mutex_unlock(&disconnect_mutex); + return 0; } @@ -1741,6 +1749,8 @@ static void ufx_usb_disconnect(struct usb_interface *interface) { struct ufx_data *dev; + mutex_lock(&disconnect_mutex); + dev = usb_get_intfdata(interface); pr_debug("USB disconnect starting\n"); @@ -1761,6 +1771,8 @@ static void ufx_usb_disconnect(struct usb_interface *interface) kref_put(&dev->kref, ufx_free); /* consider ufx_data freed */ + + 
mutex_unlock(&disconnect_mutex); } static struct usb_driver ufx_driver = { -- GitLab From e82b0c3ea520609b953ace13ae8d44ba7b3cee54 Mon Sep 17 00:00:00 2001 From: Ruan Jinjie <ruanjinjie@huawei.com> Date: Thu, 22 Sep 2022 09:37:09 +0800 Subject: [PATCH 1607/2223] fbdev: tridentfb: Fix missing pci_disable_device() in probe and remove Replace pci_enable_device() with pcim_enable_device(), pci_disable_device() and pci_release_regions() will be called in release automatically. Signed-off-by: ruanjinjie <ruanjinjie@huawei.com> Signed-off-by: Helge Deller <deller@gmx.de> --- drivers/video/fbdev/tridentfb.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/video/fbdev/tridentfb.c b/drivers/video/fbdev/tridentfb.c index 2154dd5e37bda..219ce72923370 100644 --- a/drivers/video/fbdev/tridentfb.c +++ b/drivers/video/fbdev/tridentfb.c @@ -1470,7 +1470,7 @@ static int trident_pci_probe(struct pci_dev *dev, if (err) return err; - err = pci_enable_device(dev); + err = pcim_enable_device(dev); if (err) return err; @@ -1710,12 +1710,10 @@ out_unmap2: kfree(info->pixmap.addr); if (info->screen_base) iounmap(info->screen_base); - release_mem_region(tridentfb_fix.smem_start, tridentfb_fix.smem_len); disable_mmio(info->par); out_unmap1: if (default_par->io_virt) iounmap(default_par->io_virt); - release_mem_region(tridentfb_fix.mmio_start, tridentfb_fix.mmio_len); framebuffer_release(info); return err; } @@ -1730,8 +1728,6 @@ static void trident_pci_remove(struct pci_dev *dev) i2c_del_adapter(&par->ddc_adapter); iounmap(par->io_virt); iounmap(info->screen_base); - release_mem_region(tridentfb_fix.smem_start, tridentfb_fix.smem_len); - release_mem_region(tridentfb_fix.mmio_start, tridentfb_fix.mmio_len); kfree(info->pixmap.addr); fb_dealloc_cmap(&info->cmap); framebuffer_release(info); -- GitLab From 3b29f36efc2fc0d09a178d6d4f9e772f3ffe8591 Mon Sep 17 00:00:00 2001 From: Zeng Heng <zengheng4@huawei.com> Date: Wed, 28 Sep 2022 23:17:10 +0800 Subject: [PATCH 
1608/2223] fbdev: vga16fb: Add missing MODULE_DEVICE_TABLE() entry This patch adds missing MODULE_DEVICE_TABLE definition which generates correct modalias for automatic loading of this driver when it is built as an external module. Signed-off-by: Zeng Heng <zengheng4@huawei.com> Signed-off-by: Helge Deller <deller@gmx.de> --- drivers/video/fbdev/vga16fb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/fbdev/vga16fb.c b/drivers/video/fbdev/vga16fb.c index 35cf51ae32929..af47f82170956 100644 --- a/drivers/video/fbdev/vga16fb.c +++ b/drivers/video/fbdev/vga16fb.c @@ -1421,6 +1421,7 @@ static const struct platform_device_id vga16fb_driver_id_table[] = { {"vga-framebuffer", 0}, { } }; +MODULE_DEVICE_TABLE(platform, vga16fb_driver_id_table); static struct platform_driver vga16fb_driver = { .probe = vga16fb_probe, -- GitLab From 29926f1cd3535f565f200430d5b6a794543fe130 Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@csgroup.eu> Date: Thu, 6 Oct 2022 07:33:17 +0200 Subject: [PATCH 1609/2223] fbdev: mb862xx: Fix check of return value from irq_of_parse_and_map() NO_IRQ is used to check the return of irq_of_parse_and_map(). On some architecture NO_IRQ is 0, on other architectures it is -1. irq_of_parse_and_map() returns 0 on error, independent of NO_IRQ. So use 0 instead of using NO_IRQ. 
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Helge Deller <deller@gmx.de> --- drivers/video/fbdev/mb862xx/mb862xxfbdrv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/mb862xx/mb862xxfbdrv.c b/drivers/video/fbdev/mb862xx/mb862xxfbdrv.c index 96800c9c9cd9e..90c79e8c11570 100644 --- a/drivers/video/fbdev/mb862xx/mb862xxfbdrv.c +++ b/drivers/video/fbdev/mb862xx/mb862xxfbdrv.c @@ -693,7 +693,7 @@ static int of_platform_mb862xx_probe(struct platform_device *ofdev) par->dev = dev; par->irq = irq_of_parse_and_map(np, 0); - if (par->irq == NO_IRQ) { + if (!par->irq) { dev_err(dev, "failed to map irq\n"); ret = -ENODEV; goto fbrel; -- GitLab From 3b5c082bbfa20d9a57924edd655bbe63fe98ab06 Mon Sep 17 00:00:00 2001 From: Oliver Upton <oliver.upton@linux.dev> Date: Fri, 7 Oct 2022 23:41:50 +0000 Subject: [PATCH 1610/2223] KVM: arm64: Work out supported block level at compile time Work out the minimum page table level where KVM supports block mappings at compile time. While at it, rewrite the comment around supported block mappings to directly describe what KVM supports instead of phrasing in terms of what it does not. 
Signed-off-by: Oliver Upton <oliver.upton@linux.dev> Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20221007234151.461779-2-oliver.upton@linux.dev --- arch/arm64/include/asm/kvm_pgtable.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 1b098bd4cd378..3252eb50ecfe5 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -13,6 +13,18 @@ #define KVM_PGTABLE_MAX_LEVELS 4U +/* + * The largest supported block sizes for KVM (no 52-bit PA support): + * - 4K (level 1): 1GB + * - 16K (level 2): 32MB + * - 64K (level 2): 512MB + */ +#ifdef CONFIG_ARM64_4K_PAGES +#define KVM_PGTABLE_MIN_BLOCK_LEVEL 1U +#else +#define KVM_PGTABLE_MIN_BLOCK_LEVEL 2U +#endif + static inline u64 kvm_get_parange(u64 mmfr0) { u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, @@ -58,11 +70,7 @@ static inline u64 kvm_granule_size(u32 level) static inline bool kvm_level_supports_block_mapping(u32 level) { - /* - * Reject invalid block mappings and don't bother with 4TB mappings for - * 52-bit PAs. - */ - return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1)); + return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL; } /** -- GitLab From 5994bc9e05c2f8811f233aa434e391cd2783f0f5 Mon Sep 17 00:00:00 2001 From: Oliver Upton <oliver.upton@linux.dev> Date: Fri, 7 Oct 2022 23:41:51 +0000 Subject: [PATCH 1611/2223] KVM: arm64: Limit stage2_apply_range() batch size to largest block Presently stage2_apply_range() works on a batch of memory addressed by a stage 2 root table entry for the VM. Depending on the IPA limit of the VM and PAGE_SIZE of the host, this could address a massive range of memory. Some examples: 4 level, 4K paging -> 512 GB batch size 3 level, 64K paging -> 4TB batch size Unsurprisingly, working on such a large range of memory can lead to soft lockups. 
When running dirty_log_perf_test: ./dirty_log_perf_test -m -2 -s anonymous_thp -b 4G -v 48 watchdog: BUG: soft lockup - CPU#0 stuck for 45s! [dirty_log_perf_:16703] Modules linked in: vfat fat cdc_ether usbnet mii xhci_pci xhci_hcd sha3_generic gq(O) CPU: 0 PID: 16703 Comm: dirty_log_perf_ Tainted: G O 6.0.0-smp-DEV #1 pstate: 80400009 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : dcache_clean_inval_poc+0x24/0x38 lr : clean_dcache_guest_page+0x28/0x4c sp : ffff800021763990 pmr_save: 000000e0 x29: ffff800021763990 x28: 0000000000000005 x27: 0000000000000de0 x26: 0000000000000001 x25: 00400830b13bc77f x24: ffffad4f91ead9c0 x23: 0000000000000000 x22: ffff8000082ad9c8 x21: 0000fffafa7bc000 x20: ffffad4f9066ce50 x19: 0000000000000003 x18: ffffad4f92402000 x17: 000000000000011b x16: 000000000000011b x15: 0000000000000124 x14: ffff07ff8301d280 x13: 0000000000000000 x12: 00000000ffffffff x11: 0000000000010001 x10: fffffc0000000000 x9 : ffffad4f9069e580 x8 : 000000000000000c x7 : 0000000000000000 x6 : 000000000000003f x5 : ffff07ffa2076980 x4 : 0000000000000001 x3 : 000000000000003f x2 : 0000000000000040 x1 : ffff0830313bd000 x0 : ffff0830313bcc40 Call trace: dcache_clean_inval_poc+0x24/0x38 stage2_unmap_walker+0x138/0x1ec __kvm_pgtable_walk+0x130/0x1d4 __kvm_pgtable_walk+0x170/0x1d4 __kvm_pgtable_walk+0x170/0x1d4 __kvm_pgtable_walk+0x170/0x1d4 kvm_pgtable_stage2_unmap+0xc4/0xf8 kvm_arch_flush_shadow_memslot+0xa4/0x10c kvm_set_memslot+0xb8/0x454 __kvm_set_memory_region+0x194/0x244 kvm_vm_ioctl_set_memory_region+0x58/0x7c kvm_vm_ioctl+0x49c/0x560 __arm64_sys_ioctl+0x9c/0xd4 invoke_syscall+0x4c/0x124 el0_svc_common+0xc8/0x194 do_el0_svc+0x38/0xc0 el0_svc+0x2c/0xa4 el0t_64_sync_handler+0x84/0xf0 el0t_64_sync+0x1a0/0x1a4 Use the largest supported block mapping for the configured page size as the batch granularity. In so doing the walker is guaranteed to visit a leaf only once. 
Signed-off-by: Oliver Upton <oliver.upton@linux.dev> Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20221007234151.461779-3-oliver.upton@linux.dev --- arch/arm64/include/asm/stage2_pgtable.h | 20 -------------------- arch/arm64/kvm/mmu.c | 9 ++++++++- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index fe341a6578c3c..c8dca8ae359cd 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -10,13 +10,6 @@ #include <linux/pgtable.h> -/* - * PGDIR_SHIFT determines the size a top-level page table entry can map - * and depends on the number of levels in the page table. Compute the - * PGDIR_SHIFT for a given number of levels. - */ -#define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls)) - /* * The hardware supports concatenation of up to 16 tables at stage2 entry * level and we use the feature whenever possible, which means we resolve 4 @@ -30,11 +23,6 @@ #define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) #define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr) -/* stage2_pgdir_shift() is the size mapped by top-level stage2 entry for the VM */ -#define stage2_pgdir_shift(kvm) pt_levels_pgdir_shift(kvm_stage2_levels(kvm)) -#define stage2_pgdir_size(kvm) (1ULL << stage2_pgdir_shift(kvm)) -#define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1) - /* * kvm_mmmu_cache_min_pages() is the number of pages required to install * a stage-2 translation. We pre-allocate the entry level page table at @@ -42,12 +30,4 @@ */ #define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) -static inline phys_addr_t -stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t boundary = (addr + stage2_pgdir_size(kvm)) & stage2_pgdir_mask(kvm); - - return (boundary - 1 < end - 1) ? 
boundary : end; -} - #endif /* __ARM64_S2_PGTABLE_H_ */ diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index c9a13e487187c..caf6cfeff35be 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -31,6 +31,13 @@ static phys_addr_t hyp_idmap_vector; static unsigned long io_map_base; +static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); + phys_addr_t boundary = ALIGN_DOWN(addr + size, size); + + return (boundary - 1 < end - 1) ? boundary : end; +} /* * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, @@ -52,7 +59,7 @@ static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr, if (!pgt) return -EINVAL; - next = stage2_pgd_addr_end(kvm, addr, end); + next = stage2_range_addr_end(addr, end); ret = fn(pgt, addr, next - addr); if (ret) break; -- GitLab From 837d632a383f13df7a67207a196d6eb4aeb4adca Mon Sep 17 00:00:00 2001 From: Vincent Donnefort <vdonnefort@google.com> Date: Tue, 4 Oct 2022 16:42:16 +0100 Subject: [PATCH 1612/2223] KVM: arm64: Enable stack protection and branch profiling for VHE For historical reasons, the VHE code inherited the build configuration from nVHE. Now those two parts have their own folder and makefile, we can enable stack protection and branch profiling for VHE. 
Signed-off-by: Vincent Donnefort <vdonnefort@google.com> Reviewed-by: Quentin Perret <qperret@google.com> Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20221004154216.2833636-1-vdonnefort@google.com --- arch/arm64/kvm/hyp/Makefile | 5 +---- arch/arm64/kvm/hyp/nvhe/Makefile | 3 +++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index 687598e41b21f..a38dea6186c90 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -5,9 +5,6 @@ incdir := $(srctree)/$(src)/include subdir-asflags-y := -I$(incdir) -subdir-ccflags-y := -I$(incdir) \ - -fno-stack-protector \ - -DDISABLE_BRANCH_PROFILING \ - $(DISABLE_STACKLEAK_PLUGIN) +subdir-ccflags-y := -I$(incdir) obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index b5c5119c7396b..48f6ae7cc6e64 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -10,6 +10,9 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS # will explode instantly (Words of Marc Zyngier). So introduce a generic flag # __DISABLE_TRACE_MMIO__ to disable MMIO tracing for nVHE KVM. 
ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__ +ccflags-y += -fno-stack-protector \ + -DDISABLE_BRANCH_PROFILING \ + $(DISABLE_STACKLEAK_PLUGIN) hostprogs := gen-hyprel HOST_EXTRACFLAGS += -I$(objtree)/include -- GitLab From 556a11a082ee208455ed42e7c460849cc3dbd68c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= <amadeuszx.slawinski@linux.intel.com> Date: Fri, 7 Oct 2022 10:48:56 +0200 Subject: [PATCH 1613/2223] ALSA: hda: Update register polling macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent commit d91857059def ("ALSA: hda: Rework snd_hdac_stream_reset() to use macros") missed that on some devices register access needs to be done with unaligned access helper. Change polling macros to use read_poll_timeout_atomic() in order to specify register read function. Fixes: d91857059def ("ALSA: hda: Rework snd_hdac_stream_reset() to use macros") Reported-by: Jon Hunter <jonathanh@nvidia.com> Link: https://lore.kernel.org/alsa-devel/20220818141517.109280-1-amadeuszx.slawinski@linux.intel.com/T/#m1270737db52b5ef163eff73cb5f862d16a07a428 Reviewed-by: Cezary Rojewski <cezary.rojewski@intel.com> Signed-off-by: Amadeusz Sławiński <amadeuszx.slawinski@linux.intel.com> Link: https://lore.kernel.org/r/20221007084856.1638302-1-amadeuszx.slawinski@linux.intel.com Signed-off-by: Takashi Iwai <tiwai@suse.de> --- include/sound/hdaudio.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/sound/hdaudio.h b/include/sound/hdaudio.h index ddff03e546e9f..35778f953a3f9 100644 --- a/include/sound/hdaudio.h +++ b/include/sound/hdaudio.h @@ -592,11 +592,11 @@ int snd_hdac_get_stream_stripe_ctl(struct hdac_bus *bus, #define snd_hdac_stream_readb(dev, reg) \ snd_hdac_reg_readb((dev)->bus, (dev)->sd_addr + AZX_REG_ ## reg) #define snd_hdac_stream_readb_poll(dev, reg, val, cond, delay_us, timeout_us) \ - readb_poll_timeout((dev)->sd_addr + AZX_REG_ ## 
reg, val, cond, \ - delay_us, timeout_us) + read_poll_timeout_atomic(snd_hdac_reg_readb, val, cond, delay_us, timeout_us, \ + false, (dev)->bus, (dev)->sd_addr + AZX_REG_ ## reg) #define snd_hdac_stream_readl_poll(dev, reg, val, cond, delay_us, timeout_us) \ - readl_poll_timeout((dev)->sd_addr + AZX_REG_ ## reg, val, cond, \ - delay_us, timeout_us) + read_poll_timeout_atomic(snd_hdac_reg_readl, val, cond, delay_us, timeout_us, \ + false, (dev)->bus, (dev)->sd_addr + AZX_REG_ ## reg) /* update a register, pass without AZX_REG_ prefix */ #define snd_hdac_stream_updatel(dev, reg, mask, val) \ -- GitLab From 9902b303b5ade208b58f0dd38a09831813582211 Mon Sep 17 00:00:00 2001 From: Takashi Iwai <tiwai@suse.de> Date: Sun, 9 Oct 2022 12:42:09 +0200 Subject: [PATCH 1614/2223] ALSA: usb-audio: Avoid unnecessary interface change at EP close We toggle USB interface at PCM prepare and reset at close. When the PCM isn't prepared, resetting again makes little sense. Check the current altset and avoid unnecessary interface reset at EP close. Link: https://lore.kernel.org/r/20221009104212.18877-2-tiwai@suse.de Signed-off-by: Takashi Iwai <tiwai@suse.de> --- sound/usb/endpoint.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c index 48a3843a08f11..f21acbc9f4f40 100644 --- a/sound/usb/endpoint.c +++ b/sound/usb/endpoint.c @@ -32,6 +32,7 @@ struct snd_usb_iface_ref { unsigned char iface; bool need_setup; int opened; + int altset; struct list_head list; }; @@ -899,6 +900,9 @@ static int endpoint_set_interface(struct snd_usb_audio *chip, int altset = set ? 
ep->altsetting : 0; int err; + if (ep->iface_ref->altset == altset) + return 0; + usb_audio_dbg(chip, "Setting usb interface %d:%d for EP 0x%x\n", ep->iface, altset, ep->ep_num); err = usb_set_interface(chip->dev, ep->iface, altset); @@ -910,6 +914,7 @@ static int endpoint_set_interface(struct snd_usb_audio *chip, if (chip->quirk_flags & QUIRK_FLAG_IFACE_DELAY) msleep(50); + ep->iface_ref->altset = altset; return 0; } -- GitLab From a74f8d0aa902ca494676b79226e0b5a1747b81d4 Mon Sep 17 00:00:00 2001 From: Takashi Iwai <tiwai@suse.de> Date: Sun, 9 Oct 2022 12:42:10 +0200 Subject: [PATCH 1615/2223] ALSA: usb-audio: Apply mutex around snd_usb_endpoint_set_params() The protection with chip->mutex was lost after splitting snd_usb_endpoint_set_params() and snd_usb_endpoint_prepare(). Apply the same mutex again to the former function. Fixes: 2be79d586454 ("ALSA: usb-audio: Split endpoint setups for hw_params and prepare (take#2)") Link: https://lore.kernel.org/r/20221009104212.18877-3-tiwai@suse.de Signed-off-by: Takashi Iwai <tiwai@suse.de> --- sound/usb/endpoint.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c index f21acbc9f4f40..da378e565ef84 100644 --- a/sound/usb/endpoint.c +++ b/sound/usb/endpoint.c @@ -1337,10 +1337,11 @@ int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, const struct audioformat *fmt = ep->cur_audiofmt; int err; + mutex_lock(&chip->mutex); /* release old buffers, if any */ err = release_urbs(ep, false); if (err < 0) - return err; + goto unlock; ep->datainterval = fmt->datainterval; ep->maxpacksize = fmt->maxpacksize; @@ -1378,13 +1379,16 @@ int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, usb_audio_dbg(chip, "Set up %d URBS, ret=%d\n", ep->nurbs, err); if (err < 0) - return err; + goto unlock; /* some unit conversions in runtime */ ep->maxframesize = ep->maxpacksize / ep->cur_frame_bytes; ep->curframesize = ep->curpacksize / ep->cur_frame_bytes; - return 
update_clock_ref_rate(chip, ep); + err = update_clock_ref_rate(chip, ep); + unlock: + mutex_unlock(&chip->mutex); + return err; } static int init_sample_rate(struct snd_usb_audio *chip, -- GitLab From 9355b60e401d825590d37f04ea873c58efe9b7bf Mon Sep 17 00:00:00 2001 From: Takashi Iwai <tiwai@suse.de> Date: Sun, 9 Oct 2022 12:42:11 +0200 Subject: [PATCH 1616/2223] ALSA: usb-audio: Correct the return code from snd_usb_endpoint_set_params() snd_usb_endpoint_set_params() should return zero for a success, but currently it returns the sample rate. Correct it. Fixes: 2be79d586454 ("ALSA: usb-audio: Split endpoint setups for hw_params and prepare (take#2)") Link: https://lore.kernel.org/r/20221009104212.18877-4-tiwai@suse.de Signed-off-by: Takashi Iwai <tiwai@suse.de> --- sound/usb/endpoint.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c index da378e565ef84..44cce6cec9dac 100644 --- a/sound/usb/endpoint.c +++ b/sound/usb/endpoint.c @@ -1386,6 +1386,8 @@ int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, ep->curframesize = ep->curpacksize / ep->cur_frame_bytes; err = update_clock_ref_rate(chip, ep); + if (err >= 0) + err = 0; unlock: mutex_unlock(&chip->mutex); return err; -- GitLab From 1045f5f1ff0751423aeb65648e5e1abd7a7a8672 Mon Sep 17 00:00:00 2001 From: Takashi Iwai <tiwai@suse.de> Date: Sun, 9 Oct 2022 12:42:12 +0200 Subject: [PATCH 1617/2223] ALSA: usb-audio: Avoid superfluous endpoint setup After splitting to snd_usb_endpoint_set_params() and *_prepare(), the skip of each function should be checked with different flags, while we still use ep->need_setup as the single one. Introduce ep->need_prepare for indicating the need of prepare, and also add the missing check of ep->need_setup at the set_params. 
Fixes: 2be79d586454 ("ALSA: usb-audio: Split endpoint setups for hw_params and prepare (take#2)") Link: https://lore.kernel.org/r/20221009104212.18877-5-tiwai@suse.de Signed-off-by: Takashi Iwai <tiwai@suse.de> --- sound/usb/card.h | 3 ++- sound/usb/endpoint.c | 17 ++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/sound/usb/card.h b/sound/usb/card.h index ca75f2206170f..40061550105ac 100644 --- a/sound/usb/card.h +++ b/sound/usb/card.h @@ -129,7 +129,8 @@ struct snd_usb_endpoint { in a stream */ bool implicit_fb_sync; /* syncs with implicit feedback */ bool lowlatency_playback; /* low-latency playback mode */ - bool need_setup; /* (re-)need for configure? */ + bool need_setup; /* (re-)need for hw_params? */ + bool need_prepare; /* (re-)need for prepare? */ /* for hw constraints */ const struct audioformat *cur_audiofmt; diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c index 44cce6cec9dac..d0b8d61d1d22b 100644 --- a/sound/usb/endpoint.c +++ b/sound/usb/endpoint.c @@ -824,6 +824,7 @@ snd_usb_endpoint_open(struct snd_usb_audio *chip, ep->implicit_fb_sync = fp->implicit_fb; ep->need_setup = true; + ep->need_prepare = true; usb_audio_dbg(chip, " channels=%d, rate=%d, format=%s, period_bytes=%d, periods=%d, implicit_fb=%d\n", ep->cur_channels, ep->cur_rate, @@ -952,7 +953,7 @@ void snd_usb_endpoint_close(struct snd_usb_audio *chip, /* Prepare for suspening EP, called from the main suspend handler */ void snd_usb_endpoint_suspend(struct snd_usb_endpoint *ep) { - ep->need_setup = true; + ep->need_prepare = true; if (ep->iface_ref) ep->iface_ref->need_setup = true; if (ep->clock_ref) @@ -1335,9 +1336,12 @@ int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep) { const struct audioformat *fmt = ep->cur_audiofmt; - int err; + int err = 0; mutex_lock(&chip->mutex); + if (!ep->need_setup) + goto unlock; + /* release old buffers, if any */ err = release_urbs(ep, false); if (err < 0) @@ -1386,8 +1390,11 @@ 
int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, ep->curframesize = ep->curpacksize / ep->cur_frame_bytes; err = update_clock_ref_rate(chip, ep); - if (err >= 0) + if (err >= 0) { + ep->need_setup = false; err = 0; + } + unlock: mutex_unlock(&chip->mutex); return err; @@ -1437,7 +1444,7 @@ int snd_usb_endpoint_prepare(struct snd_usb_audio *chip, mutex_lock(&chip->mutex); if (WARN_ON(!ep->iface_ref)) goto unlock; - if (!ep->need_setup) + if (!ep->need_prepare) goto unlock; /* If the interface has been already set up, just set EP parameters */ @@ -1491,7 +1498,7 @@ int snd_usb_endpoint_prepare(struct snd_usb_audio *chip, ep->iface_ref->need_setup = false; done: - ep->need_setup = false; + ep->need_prepare = false; err = 1; unlock: -- GitLab From 285febabac4a16655372d23ff43e89ff6f216691 Mon Sep 17 00:00:00 2001 From: Yu Kuai <yukuai3@huawei.com> Date: Sun, 9 Oct 2022 18:10:38 +0800 Subject: [PATCH 1618/2223] blk-wbt: fix that 'rwb->wc' is always set to 1 in wbt_init() commit 8c5035dfbb94 ("blk-wbt: call rq_qos_add() after wb_normal is initialized") moves wbt_set_write_cache() before rq_qos_add(), which is wrong because wbt_rq_qos() is still NULL. Fix the problem by removing wbt_set_write_cache() and setting 'rwb->wc' directly. Note that this patch also removes the redundant setting of 'rwb->wc'.
Fixes: 8c5035dfbb94 ("blk-wbt: call rq_qos_add() after wb_normal is initialized") Reported-by: kernel test robot <yujie.liu@intel.com> Link: https://lore.kernel.org/r/202210081045.77ddf59b-yujie.liu@intel.com Signed-off-by: Yu Kuai <yukuai3@huawei.com> Reviewed-by: Ming Lei <ming.lei@redhat.com> Link: https://lore.kernel.org/r/20221009101038.1692875-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-wbt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 2464679262531..c293e08b301ff 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -841,12 +841,11 @@ int wbt_init(struct request_queue *q) rwb->last_comp = rwb->last_issue = jiffies; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->enable_state = WBT_STATE_ON_DEFAULT; - rwb->wc = 1; + rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags); rwb->rq_depth.default_depth = RWB_DEF_DEPTH; rwb->min_lat_nsec = wbt_default_latency_nsec(q); wbt_queue_depth_changed(&rwb->rqos); - wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); /* * Assign rwb and add the stats callback. -- GitLab From 175302f6b79ebbb207c2d58d6d3e679465de23b0 Mon Sep 17 00:00:00 2001 From: Duoming Zhou <duoming@zju.edu.cn> Date: Sun, 9 Oct 2022 14:37:31 +0800 Subject: [PATCH 1619/2223] mISDN: hfcpci: Fix use-after-free bug in hfcpci_softirq The function hfcpci_softirq() is a timer handler. If it is running, the timer_pending() will return 0 and the del_timer_sync() in HFC_cleanup() will not be executed. As a result, the use-after-free bug will happen. The process is shown below: (cleanup routine) | (timer handler) HFC_cleanup() | hfcpci_softirq() if (timer_pending(&hfc_tl)) | del_timer_sync() | ... | ... pci_unregister_driver(hc) | driver_unregister | driver_for_each_device bus_remove_driver | _hfcpci_softirq driver_detach | ... put_device(dev) //[1]FREE | | dev_get_drvdata(dev) //[2]USE The device is deallocated in position [1] and used in position [2].
Fix by removing the "timer_pending" check in HFC_cleanup(), which makes sure that the hfcpci_softirq() has finished before the resource is deallocated. Fixes: 009fc857c5f6 ("mISDN: fix possible use-after-free in HFC_cleanup()") Signed-off-by: Duoming Zhou <duoming@zju.edu.cn> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/isdn/hardware/mISDN/hfcpci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/isdn/hardware/mISDN/hfcpci.c b/drivers/isdn/hardware/mISDN/hfcpci.c index af17459c1a5c0..e964a8dd8512a 100644 --- a/drivers/isdn/hardware/mISDN/hfcpci.c +++ b/drivers/isdn/hardware/mISDN/hfcpci.c @@ -2345,8 +2345,7 @@ HFC_init(void) static void __exit HFC_cleanup(void) { - if (timer_pending(&hfc_tl)) - del_timer_sync(&hfc_tl); + del_timer_sync(&hfc_tl); pci_unregister_driver(&hfc_driver); } -- GitLab From b64085b00044bdf3cd1c9825e9ef5b2e0feae91a Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Fri, 7 Oct 2022 15:57:43 -0700 Subject: [PATCH 1620/2223] macvlan: enforce a consistent minimal mtu macvlan should enforce a minimal mtu of 68, even at link creation. This patch avoids the current behavior (which could lead to crashes in ipv6 stack if the link is brought up) $ ip link add macvlan1 link eno1 mtu 8 type macvlan # This should fail ! $ ip link sh dev macvlan1 5: macvlan1@eno1: <BROADCAST,MULTICAST> mtu 8 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 02:47:6c:24:74:82 brd ff:ff:ff:ff:ff:ff $ ip link set macvlan1 mtu 67 Error: mtu less than device minimum. $ ip link set macvlan1 mtu 68 $ ip link set macvlan1 mtu 8 Error: mtu less than device minimum. Fixes: 91572088e3fd ("net: use core MTU range checking in core net infra") Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S.
Miller <davem@davemloft.net> --- drivers/net/macvlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 713e3354cb2eb..8f8f73099de8d 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1192,7 +1192,7 @@ void macvlan_common_setup(struct net_device *dev) { ether_setup(dev); - dev->min_mtu = 0; + /* ether_setup() has set dev->min_mtu to ETH_MIN_MTU. */ dev->max_mtu = ETH_MAX_MTU; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); -- GitLab From 897fab7a726aa461ad0dcda7c345d5261bb6a0ca Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Sat, 8 Oct 2022 16:26:50 +0800 Subject: [PATCH 1621/2223] octeontx2-pf: mcs: fix missing unlock in some error paths Add the missing unlock in some error paths. Fixes: c54ffc73601c ("octeontx2-pf: mcs: Introduce MACSEC hardware offloading") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c index 18420d9a145fb..9809f551fc2e3 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c @@ -284,7 +284,7 @@ static int cn10k_mcs_write_sc_cam(struct otx2_nic *pfvf, sc_req = otx2_mbox_alloc_msg_mcs_rx_sc_cam_write(mbox); if (!sc_req) { - return -ENOMEM; + ret = -ENOMEM; goto fail; } @@ -594,7 +594,7 @@ static int cn10k_mcs_ena_dis_flowid(struct otx2_nic *pfvf, u16 hw_flow_id, req = otx2_mbox_alloc_msg_mcs_flowid_ena_entry(mbox); if (!req) { - return -ENOMEM; + ret = -ENOMEM; goto fail; } @@ -1653,6 +1653,7 @@ int cn10k_mcs_init(struct otx2_nic *pfvf) return 0; fail: dev_err(pfvf->dev, "Cannot notify PN wrapped event\n"); + 
mutex_unlock(&mbox->lock); return 0; } -- GitLab From 557f050166e523ce86018d7a43e7d543d9598b3d Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Sat, 8 Oct 2022 16:39:42 +0800 Subject: [PATCH 1622/2223] net: dsa: fix wrong pointer passed to PTR_ERR() in dsa_port_phylink_create() Fix wrong pointer passed to PTR_ERR() in dsa_port_phylink_create() to print error message. Fixes: cf5ca4ddc37a ("net: dsa: don't leave dangling pointers in dp->pl when failing") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/dsa/port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dsa/port.c b/net/dsa/port.c index e4a0513816bb6..208168276995a 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1681,7 +1681,7 @@ int dsa_port_phylink_create(struct dsa_port *dp) pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn), mode, &dsa_port_phylink_mac_ops); if (IS_ERR(pl)) { - pr_err("error creating PHYLINK: %ld\n", PTR_ERR(dp->pl)); + pr_err("error creating PHYLINK: %ld\n", PTR_ERR(pl)); return PTR_ERR(pl); } -- GitLab From b2cf5d902ec1a7560f20945ddd2b0de82eff7cf3 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Sun, 9 Oct 2022 09:51:26 +0800 Subject: [PATCH 1623/2223] octeontx2-af: cn10k: mcs: Fix error return code in mcs_register_interrupts() If alloc_mem() fails in mcs_register_interrupts(), it should return error code. Fixes: 6c635f78c474 ("octeontx2-af: cn10k: mcs: Handle MCS block interrupts") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- drivers/net/ethernet/marvell/octeontx2/af/mcs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs.c index 5ba618aed6adc..4a343f853b28b 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mcs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs.c @@ -1182,8 +1182,10 @@ static int mcs_register_interrupts(struct mcs *mcs) mcs_reg_write(mcs, MCSX_PAB_TX_SLAVE_PAB_INT_ENB, 0xff); mcs->tx_sa_active = alloc_mem(mcs, mcs->hw->sc_entries); - if (!mcs->tx_sa_active) + if (!mcs->tx_sa_active) { + ret = -ENOMEM; goto exit; + } return ret; exit: -- GitLab From 17406967ec0ff8e14737ee7a073c7a45fc8210f1 Mon Sep 17 00:00:00 2001 From: Samuel Holland <samuel@sholland.org> Date: Fri, 30 Sep 2022 22:53:06 -0700 Subject: [PATCH 1624/2223] Input: pinephone-keyboard - add PinePhone keyboard driver The official Pine64 PinePhone keyboard case contains a matrix keypad and a MCU which runs a libre firmware. Add support for its I2C interface. 
Signed-off-by: Samuel Holland <samuel@sholland.org> Link: https://lore.kernel.org/r/20220618165747.55709-3-samuel@sholland.org Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> --- MAINTAINERS | 6 + drivers/input/keyboard/Kconfig | 13 + drivers/input/keyboard/Makefile | 1 + drivers/input/keyboard/pinephone-keyboard.c | 392 ++++++++++++++++++++ 4 files changed, 412 insertions(+) create mode 100644 drivers/input/keyboard/pinephone-keyboard.c diff --git a/MAINTAINERS b/MAINTAINERS index aa71df8b699ba..9ddcc242081cc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16186,6 +16186,12 @@ F: Documentation/devicetree/bindings/pinctrl/sunplus,* F: drivers/pinctrl/sunplus/ F: include/dt-bindings/pinctrl/sppctl*.h +PINE64 PINEPHONE KEYBOARD DRIVER +M: Samuel Holland <samuel@sholland.org> +S: Supported +F: Documentation/devicetree/bindings/input/pine64,pinephone-keyboard.yaml +F: drivers/input/keyboard/pinephone-keyboard.c + PKTCDVD DRIVER M: linux-block@vger.kernel.org S: Orphan diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index 8b0281c4f3c52..00292118b79bc 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -527,6 +527,19 @@ config KEYBOARD_OPENCORES To compile this driver as a module, choose M here; the module will be called opencores-kbd. +config KEYBOARD_PINEPHONE + tristate "Pine64 PinePhone Keyboard" + depends on I2C && REGULATOR + select CRC8 + select INPUT_MATRIXKMAP + help + Say Y here to enable support for the keyboard in the Pine64 PinePhone + keyboard case. This driver supports the FLOSS firmware available at + https://megous.com/git/pinephone-keyboard/ + + To compile this driver as a module, choose M here; the + module will be called pinephone-keyboard. 
+ config KEYBOARD_PXA27x tristate "PXA27x/PXA3xx keypad support" depends on PXA27x || PXA3xx || ARCH_MMP diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile index 721936e902900..5f67196bb2c1e 100644 --- a/drivers/input/keyboard/Makefile +++ b/drivers/input/keyboard/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_KEYBOARD_NSPIRE) += nspire-keypad.o obj-$(CONFIG_KEYBOARD_OMAP) += omap-keypad.o obj-$(CONFIG_KEYBOARD_OMAP4) += omap4-keypad.o obj-$(CONFIG_KEYBOARD_OPENCORES) += opencores-kbd.o +obj-$(CONFIG_KEYBOARD_PINEPHONE) += pinephone-keyboard.o obj-$(CONFIG_KEYBOARD_PMIC8XXX) += pmic8xxx-keypad.o obj-$(CONFIG_KEYBOARD_PXA27x) += pxa27x_keypad.o obj-$(CONFIG_KEYBOARD_PXA930_ROTARY) += pxa930_rotary.o diff --git a/drivers/input/keyboard/pinephone-keyboard.c b/drivers/input/keyboard/pinephone-keyboard.c new file mode 100644 index 0000000000000..b10113b4b29b1 --- /dev/null +++ b/drivers/input/keyboard/pinephone-keyboard.c @@ -0,0 +1,392 @@ +// SPDX-License-Identifier: GPL-2.0-only +// +// Copyright (C) 2021-2022 Samuel Holland <samuel@sholland.org> + +#include <linux/crc8.h> +#include <linux/err.h> +#include <linux/i2c.h> +#include <linux/input.h> +#include <linux/input/matrix_keypad.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/mod_devicetable.h> +#include <linux/regulator/consumer.h> +#include <linux/types.h> + +#define DRV_NAME "pinephone-keyboard" + +#define PPKB_CRC8_POLYNOMIAL 0x07 + +#define PPKB_DEVICE_ID_HI 0x00 +#define PPKB_DEVICE_ID_HI_VALUE 'K' +#define PPKB_DEVICE_ID_LO 0x01 +#define PPKB_DEVICE_ID_LO_VALUE 'B' +#define PPKB_FW_REVISION 0x02 +#define PPKB_FW_FEATURES 0x03 +#define PPKB_MATRIX_SIZE 0x06 +#define PPKB_SCAN_CRC 0x07 +#define PPKB_SCAN_DATA 0x08 +#define PPKB_SYS_CONFIG 0x20 +#define PPKB_SYS_CONFIG_DISABLE_SCAN BIT(0) + +#define PPKB_ROWS 6 +#define PPKB_COLS 12 + +/* Size of the scan buffer, including the CRC byte at the beginning. 
*/ +#define PPKB_BUF_LEN (1 + PPKB_COLS) + +static const uint32_t ppkb_keymap[] = { + KEY(0, 0, KEY_ESC), + KEY(0, 1, KEY_1), + KEY(0, 2, KEY_2), + KEY(0, 3, KEY_3), + KEY(0, 4, KEY_4), + KEY(0, 5, KEY_5), + KEY(0, 6, KEY_6), + KEY(0, 7, KEY_7), + KEY(0, 8, KEY_8), + KEY(0, 9, KEY_9), + KEY(0, 10, KEY_0), + KEY(0, 11, KEY_BACKSPACE), + + KEY(1, 0, KEY_TAB), + KEY(1, 1, KEY_Q), + KEY(1, 2, KEY_W), + KEY(1, 3, KEY_E), + KEY(1, 4, KEY_R), + KEY(1, 5, KEY_T), + KEY(1, 6, KEY_Y), + KEY(1, 7, KEY_U), + KEY(1, 8, KEY_I), + KEY(1, 9, KEY_O), + KEY(1, 10, KEY_P), + KEY(1, 11, KEY_ENTER), + + KEY(2, 0, KEY_LEFTMETA), + KEY(2, 1, KEY_A), + KEY(2, 2, KEY_S), + KEY(2, 3, KEY_D), + KEY(2, 4, KEY_F), + KEY(2, 5, KEY_G), + KEY(2, 6, KEY_H), + KEY(2, 7, KEY_J), + KEY(2, 8, KEY_K), + KEY(2, 9, KEY_L), + KEY(2, 10, KEY_SEMICOLON), + + KEY(3, 0, KEY_LEFTSHIFT), + KEY(3, 1, KEY_Z), + KEY(3, 2, KEY_X), + KEY(3, 3, KEY_C), + KEY(3, 4, KEY_V), + KEY(3, 5, KEY_B), + KEY(3, 6, KEY_N), + KEY(3, 7, KEY_M), + KEY(3, 8, KEY_COMMA), + KEY(3, 9, KEY_DOT), + KEY(3, 10, KEY_SLASH), + + KEY(4, 1, KEY_LEFTCTRL), + KEY(4, 4, KEY_SPACE), + KEY(4, 6, KEY_APOSTROPHE), + KEY(4, 8, KEY_RIGHTBRACE), + KEY(4, 9, KEY_LEFTBRACE), + + KEY(5, 2, KEY_FN), + KEY(5, 3, KEY_LEFTALT), + KEY(5, 5, KEY_RIGHTALT), + + /* FN layer */ + KEY(PPKB_ROWS + 0, 0, KEY_FN_ESC), + KEY(PPKB_ROWS + 0, 1, KEY_F1), + KEY(PPKB_ROWS + 0, 2, KEY_F2), + KEY(PPKB_ROWS + 0, 3, KEY_F3), + KEY(PPKB_ROWS + 0, 4, KEY_F4), + KEY(PPKB_ROWS + 0, 5, KEY_F5), + KEY(PPKB_ROWS + 0, 6, KEY_F6), + KEY(PPKB_ROWS + 0, 7, KEY_F7), + KEY(PPKB_ROWS + 0, 8, KEY_F8), + KEY(PPKB_ROWS + 0, 9, KEY_F9), + KEY(PPKB_ROWS + 0, 10, KEY_F10), + KEY(PPKB_ROWS + 0, 11, KEY_DELETE), + + KEY(PPKB_ROWS + 1, 10, KEY_PAGEUP), + + KEY(PPKB_ROWS + 2, 0, KEY_SYSRQ), + KEY(PPKB_ROWS + 2, 9, KEY_PAGEDOWN), + KEY(PPKB_ROWS + 2, 10, KEY_INSERT), + + KEY(PPKB_ROWS + 3, 0, KEY_LEFTSHIFT), + KEY(PPKB_ROWS + 3, 8, KEY_HOME), + KEY(PPKB_ROWS + 3, 9, KEY_UP), + KEY(PPKB_ROWS + 3, 10, 
KEY_END), + + KEY(PPKB_ROWS + 4, 1, KEY_LEFTCTRL), + KEY(PPKB_ROWS + 4, 6, KEY_LEFT), + KEY(PPKB_ROWS + 4, 8, KEY_RIGHT), + KEY(PPKB_ROWS + 4, 9, KEY_DOWN), + + KEY(PPKB_ROWS + 5, 3, KEY_LEFTALT), + KEY(PPKB_ROWS + 5, 5, KEY_RIGHTALT), +}; + +static const struct matrix_keymap_data ppkb_keymap_data = { + .keymap = ppkb_keymap, + .keymap_size = ARRAY_SIZE(ppkb_keymap), +}; + +struct pinephone_keyboard { + struct input_dev *input; + u8 buf[2][PPKB_BUF_LEN]; + u8 crc_table[CRC8_TABLE_SIZE]; + u8 fn_state[PPKB_COLS]; + bool buf_swap; + bool fn_pressed; +}; + +static void ppkb_update(struct i2c_client *client) +{ + struct pinephone_keyboard *ppkb = i2c_get_clientdata(client); + unsigned short *keymap = ppkb->input->keycode; + int row_shift = get_count_order(PPKB_COLS); + u8 *old_buf = ppkb->buf[!ppkb->buf_swap]; + u8 *new_buf = ppkb->buf[ppkb->buf_swap]; + int col, crc, ret, row; + struct device *dev = &client->dev; + + ret = i2c_smbus_read_i2c_block_data(client, PPKB_SCAN_CRC, + PPKB_BUF_LEN, new_buf); + if (ret != PPKB_BUF_LEN) { + dev_err(dev, "Failed to read scan data: %d\n", ret); + return; + } + + crc = crc8(ppkb->crc_table, &new_buf[1], PPKB_COLS, CRC8_INIT_VALUE); + if (crc != new_buf[0]) { + dev_err(dev, "Bad scan data (%02x != %02x)\n", crc, new_buf[0]); + return; + } + + ppkb->buf_swap = !ppkb->buf_swap; + + for (col = 0; col < PPKB_COLS; ++col) { + u8 old = old_buf[1 + col]; + u8 new = new_buf[1 + col]; + u8 changed = old ^ new; + + if (!changed) + continue; + + for (row = 0; row < PPKB_ROWS; ++row) { + u8 mask = BIT(row); + u8 value = new & mask; + unsigned short code; + bool fn_state; + + if (!(changed & mask)) + continue; + + /* + * Save off the FN key state when the key was pressed, + * and use that to determine the code during a release. + */ + fn_state = value ? ppkb->fn_pressed : ppkb->fn_state[col] & mask; + if (fn_state) + ppkb->fn_state[col] ^= mask; + + /* The FN layer is a second set of rows. */ + code = MATRIX_SCAN_CODE(fn_state ? 
PPKB_ROWS + row : row, + col, row_shift); + input_event(ppkb->input, EV_MSC, MSC_SCAN, code); + input_report_key(ppkb->input, keymap[code], value); + if (keymap[code] == KEY_FN) + ppkb->fn_pressed = value; + } + } + input_sync(ppkb->input); +} + +static irqreturn_t ppkb_irq_thread(int irq, void *data) +{ + struct i2c_client *client = data; + + ppkb_update(client); + + return IRQ_HANDLED; +} + +static int ppkb_set_scan(struct i2c_client *client, bool enable) +{ + struct device *dev = &client->dev; + int ret, val; + + ret = i2c_smbus_read_byte_data(client, PPKB_SYS_CONFIG); + if (ret < 0) { + dev_err(dev, "Failed to read config: %d\n", ret); + return ret; + } + + if (enable) + val = ret & ~PPKB_SYS_CONFIG_DISABLE_SCAN; + else + val = ret | PPKB_SYS_CONFIG_DISABLE_SCAN; + + ret = i2c_smbus_write_byte_data(client, PPKB_SYS_CONFIG, val); + if (ret) { + dev_err(dev, "Failed to write config: %d\n", ret); + return ret; + } + + return 0; +} + +static int ppkb_open(struct input_dev *input) +{ + struct i2c_client *client = input_get_drvdata(input); + int error; + + error = ppkb_set_scan(client, true); + if (error) + return error; + + return 0; +} + +static void ppkb_close(struct input_dev *input) +{ + struct i2c_client *client = input_get_drvdata(input); + + ppkb_set_scan(client, false); +} + +static void ppkb_regulator_disable(void *regulator) +{ + regulator_disable(regulator); +} + +static int ppkb_probe(struct i2c_client *client) +{ + struct device *dev = &client->dev; + unsigned int phys_rows, phys_cols; + struct pinephone_keyboard *ppkb; + struct regulator *vbat_supply; + u8 info[PPKB_MATRIX_SIZE + 1]; + int ret; + int error; + + vbat_supply = devm_regulator_get(dev, "vbat"); + error = PTR_ERR_OR_ZERO(vbat_supply); + if (error) { + dev_err(dev, "Failed to get VBAT supply: %d\n", error); + return error; + } + + error = regulator_enable(vbat_supply); + if (error) { + dev_err(dev, "Failed to enable VBAT: %d\n", error); + return error; + } + + error = 
devm_add_action_or_reset(dev, ppkb_regulator_disable, + vbat_supply); + if (error) + return error; + + ret = i2c_smbus_read_i2c_block_data(client, 0, sizeof(info), info); + if (ret != sizeof(info)) { + error = ret < 0 ? ret : -EIO; + dev_err(dev, "Failed to read device ID: %d\n", error); + return error; + } + + if (info[PPKB_DEVICE_ID_HI] != PPKB_DEVICE_ID_HI_VALUE || + info[PPKB_DEVICE_ID_LO] != PPKB_DEVICE_ID_LO_VALUE) { + dev_warn(dev, "Unexpected device ID: %#02x %#02x\n", + info[PPKB_DEVICE_ID_HI], info[PPKB_DEVICE_ID_LO]); + return -ENODEV; + } + + dev_info(dev, "Found firmware version %d.%d features %#x\n", + info[PPKB_FW_REVISION] >> 4, + info[PPKB_FW_REVISION] & 0xf, + info[PPKB_FW_FEATURES]); + + phys_rows = info[PPKB_MATRIX_SIZE] & 0xf; + phys_cols = info[PPKB_MATRIX_SIZE] >> 4; + if (phys_rows != PPKB_ROWS || phys_cols != PPKB_COLS) { + dev_err(dev, "Unexpected keyboard size %ux%u\n", + phys_rows, phys_cols); + return -EINVAL; + } + + /* Disable scan by default to save power. 
*/ + error = ppkb_set_scan(client, false); + if (error) + return error; + + ppkb = devm_kzalloc(dev, sizeof(*ppkb), GFP_KERNEL); + if (!ppkb) + return -ENOMEM; + + i2c_set_clientdata(client, ppkb); + + crc8_populate_msb(ppkb->crc_table, PPKB_CRC8_POLYNOMIAL); + + ppkb->input = devm_input_allocate_device(dev); + if (!ppkb->input) + return -ENOMEM; + + input_set_drvdata(ppkb->input, client); + + ppkb->input->name = "PinePhone Keyboard"; + ppkb->input->phys = DRV_NAME "/input0"; + ppkb->input->id.bustype = BUS_I2C; + ppkb->input->open = ppkb_open; + ppkb->input->close = ppkb_close; + + input_set_capability(ppkb->input, EV_MSC, MSC_SCAN); + __set_bit(EV_REP, ppkb->input->evbit); + + error = matrix_keypad_build_keymap(&ppkb_keymap_data, NULL, + 2 * PPKB_ROWS, PPKB_COLS, NULL, + ppkb->input); + if (error) { + dev_err(dev, "Failed to build keymap: %d\n", error); + return error; + } + + error = input_register_device(ppkb->input); + if (error) { + dev_err(dev, "Failed to register input: %d\n", error); + return error; + } + + error = devm_request_threaded_irq(dev, client->irq, + NULL, ppkb_irq_thread, + IRQF_ONESHOT, client->name, client); + if (error) { + dev_err(dev, "Failed to request IRQ: %d\n", error); + return error; + } + + return 0; +} + +static const struct of_device_id ppkb_of_match[] = { + { .compatible = "pine64,pinephone-keyboard" }, + { } +}; +MODULE_DEVICE_TABLE(of, ppkb_of_match); + +static struct i2c_driver ppkb_driver = { + .probe_new = ppkb_probe, + .driver = { + .name = DRV_NAME, + .of_match_table = ppkb_of_match, + }, +}; +module_i2c_driver(ppkb_driver); + +MODULE_AUTHOR("Samuel Holland <samuel@sholland.org>"); +MODULE_DESCRIPTION("Pine64 PinePhone keyboard driver"); +MODULE_LICENSE("GPL"); -- GitLab From 63c5eb157cfdb6f20387c4492d27d7248e239e85 Mon Sep 17 00:00:00 2001 From: Samuel Holland <samuel@sholland.org> Date: Fri, 30 Sep 2022 22:53:23 -0700 Subject: [PATCH 1625/2223] Input: pinephone-keyboard - support the proxied I2C bus The PinePhone keyboard 
case contains a battery managed by an integrated power bank IC. The power bank IC communicates over I2C, and the keyboard MCU firmware provides an interface to read and write its registers. Let's use this interface to implement a SMBus adapter, so we can reuse the driver for the power bank IC. Signed-off-by: Samuel Holland <samuel@sholland.org> Link: https://lore.kernel.org/r/20220618165747.55709-4-samuel@sholland.org Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> --- drivers/input/keyboard/pinephone-keyboard.c | 76 +++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/drivers/input/keyboard/pinephone-keyboard.c b/drivers/input/keyboard/pinephone-keyboard.c index b10113b4b29b1..5548699b8b389 100644 --- a/drivers/input/keyboard/pinephone-keyboard.c +++ b/drivers/input/keyboard/pinephone-keyboard.c @@ -3,6 +3,7 @@ // Copyright (C) 2021-2022 Samuel Holland <samuel@sholland.org> #include <linux/crc8.h> +#include <linux/delay.h> #include <linux/err.h> #include <linux/i2c.h> #include <linux/input.h> @@ -10,6 +11,7 @@ #include <linux/interrupt.h> #include <linux/module.h> #include <linux/mod_devicetable.h> +#include <linux/of.h> #include <linux/regulator/consumer.h> #include <linux/types.h> @@ -28,6 +30,11 @@ #define PPKB_SCAN_DATA 0x08 #define PPKB_SYS_CONFIG 0x20 #define PPKB_SYS_CONFIG_DISABLE_SCAN BIT(0) +#define PPKB_SYS_SMBUS_COMMAND 0x21 +#define PPKB_SYS_SMBUS_DATA 0x22 +#define PPKB_SYS_COMMAND 0x23 +#define PPKB_SYS_COMMAND_SMBUS_READ 0x91 +#define PPKB_SYS_COMMAND_SMBUS_WRITE 0xa1 #define PPKB_ROWS 6 #define PPKB_COLS 12 @@ -136,6 +143,7 @@ static const struct matrix_keymap_data ppkb_keymap_data = { }; struct pinephone_keyboard { + struct i2c_adapter adapter; struct input_dev *input; u8 buf[2][PPKB_BUF_LEN]; u8 crc_table[CRC8_TABLE_SIZE]; @@ -144,6 +152,57 @@ struct pinephone_keyboard { bool fn_pressed; }; +static int ppkb_adap_smbus_xfer(struct i2c_adapter *adap, u16 addr, + unsigned short flags, char read_write, + u8 command, int 
size, + union i2c_smbus_data *data) +{ + struct i2c_client *client = adap->algo_data; + u8 buf[3]; + int ret; + + buf[0] = command; + buf[1] = data->byte; + buf[2] = read_write == I2C_SMBUS_READ ? PPKB_SYS_COMMAND_SMBUS_READ + : PPKB_SYS_COMMAND_SMBUS_WRITE; + + ret = i2c_smbus_write_i2c_block_data(client, PPKB_SYS_SMBUS_COMMAND, + sizeof(buf), buf); + if (ret) + return ret; + + /* Read back the command status until it passes or fails. */ + do { + usleep_range(300, 500); + ret = i2c_smbus_read_byte_data(client, PPKB_SYS_COMMAND); + } while (ret == buf[2]); + if (ret < 0) + return ret; + /* Commands return 0x00 on success and 0xff on failure. */ + if (ret) + return -EIO; + + if (read_write == I2C_SMBUS_READ) { + ret = i2c_smbus_read_byte_data(client, PPKB_SYS_SMBUS_DATA); + if (ret < 0) + return ret; + + data->byte = ret; + } + + return 0; +} + +static u32 ppkg_adap_functionality(struct i2c_adapter *adap) +{ + return I2C_FUNC_SMBUS_BYTE_DATA; +} + +static const struct i2c_algorithm ppkb_adap_algo = { + .smbus_xfer = ppkb_adap_smbus_xfer, + .functionality = ppkg_adap_functionality, +}; + static void ppkb_update(struct i2c_client *client) { struct pinephone_keyboard *ppkb = i2c_get_clientdata(client); @@ -271,6 +330,7 @@ static int ppkb_probe(struct i2c_client *client) struct pinephone_keyboard *ppkb; struct regulator *vbat_supply; u8 info[PPKB_MATRIX_SIZE + 1]; + struct device_node *i2c_bus; int ret; int error; @@ -330,6 +390,22 @@ static int ppkb_probe(struct i2c_client *client) i2c_set_clientdata(client, ppkb); + i2c_bus = of_get_child_by_name(dev->of_node, "i2c"); + if (i2c_bus) { + ppkb->adapter.owner = THIS_MODULE; + ppkb->adapter.algo = &ppkb_adap_algo; + ppkb->adapter.algo_data = client; + ppkb->adapter.dev.parent = dev; + ppkb->adapter.dev.of_node = i2c_bus; + strscpy(ppkb->adapter.name, DRV_NAME, sizeof(ppkb->adapter.name)); + + error = devm_i2c_add_adapter(dev, &ppkb->adapter); + if (error) { + dev_err(dev, "Failed to add I2C adapter: %d\n", error); + 
return error; + } + } + crc8_populate_msb(ppkb->crc_table, PPKB_CRC8_POLYNOMIAL); ppkb->input = devm_input_allocate_device(dev); -- GitLab From 8761b9b580d53162cca7868385069c0d4354c9e0 Mon Sep 17 00:00:00 2001 From: Huacai Chen <chenhuacai@loongson.cn> Date: Sat, 1 Oct 2022 14:28:34 -0700 Subject: [PATCH 1626/2223] Input: i8042 - rename i8042-x86ia64io.h to i8042-acpipnpio.h Now i8042-x86ia64io.h is shared by X86 and IA64, but it can be shared by more platforms (such as LoongArch) with ACPI firmware on which PNP typed keyboard and mouse is configured in DSDT. So rename it to i8042- acpipnpio.h. Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> Reviewed-by: Mattijs Korpershoek <mkorpershoek@baylibre.com> Link: https://lore.kernel.org/r/20220917064020.1639709-1-chenhuacai@loongson.cn Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> --- .../input/serio/{i8042-x86ia64io.h => i8042-acpipnpio.h} | 6 +++--- drivers/input/serio/i8042.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) rename drivers/input/serio/{i8042-x86ia64io.h => i8042-acpipnpio.h} (99%) diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-acpipnpio.h similarity index 99% rename from drivers/input/serio/i8042-x86ia64io.h rename to drivers/input/serio/i8042-acpipnpio.h index 732b7a6b315d6..e91c6a1814a88 100644 --- a/drivers/input/serio/i8042-x86ia64io.h +++ b/drivers/input/serio/i8042-acpipnpio.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _I8042_X86IA64IO_H -#define _I8042_X86IA64IO_H +#ifndef _I8042_ACPIPNPIO_H +#define _I8042_ACPIPNPIO_H #ifdef CONFIG_X86 @@ -1665,4 +1665,4 @@ static inline void i8042_platform_exit(void) i8042_pnp_exit(); } -#endif /* _I8042_X86IA64IO_H */ +#endif /* _I8042_ACPIPNPIO_H */ diff --git a/drivers/input/serio/i8042.h b/drivers/input/serio/i8042.h index 55381783dc82d..bf2592fa9a783 100644 --- a/drivers/input/serio/i8042.h +++ b/drivers/input/serio/i8042.h @@ -20,7 +20,7 @@ #elif defined(CONFIG_SPARC) 
#include "i8042-sparcio.h" #elif defined(CONFIG_X86) || defined(CONFIG_IA64) -#include "i8042-x86ia64io.h" +#include "i8042-acpipnpio.h" #else #include "i8042-io.h" #endif -- GitLab From fdd7c96176de823229d88e787b9e554b0f05b6c1 Mon Sep 17 00:00:00 2001 From: Huacai Chen <chenhuacai@loongson.cn> Date: Sat, 1 Oct 2022 14:29:01 -0700 Subject: [PATCH 1627/2223] Input: i8042 - add LoongArch support in i8042-acpipnpio.h LoongArch uses ACPI and nearly the same as X86/IA64 for 8042. So modify i8042-acpipnpio.h slightly and enable it for LoongArch in i8042.h. Then i8042 driver can work well under the ACPI firmware with PNP typed key- board and mouse configured in DSDT. Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> Reviewed-by: Mattijs Korpershoek <mkorpershoek@baylibre.com> Link: https://lore.kernel.org/r/20220917064020.1639709-2-chenhuacai@loongson.cn Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> --- drivers/input/serio/i8042-acpipnpio.h | 6 ++++++ drivers/input/serio/i8042.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/input/serio/i8042-acpipnpio.h b/drivers/input/serio/i8042-acpipnpio.h index e91c6a1814a88..0778dc03cd9e0 100644 --- a/drivers/input/serio/i8042-acpipnpio.h +++ b/drivers/input/serio/i8042-acpipnpio.h @@ -2,6 +2,7 @@ #ifndef _I8042_ACPIPNPIO_H #define _I8042_ACPIPNPIO_H +#include <linux/acpi.h> #ifdef CONFIG_X86 #include <asm/x86_init.h> @@ -1453,9 +1454,14 @@ static int __init i8042_pnp_init(void) return -ENODEV; #else pr_info("PNP: No PS/2 controller found.\n"); +#if defined(__loongarch__) + if (acpi_disabled == 0) + return -ENODEV; +#else if (x86_platform.legacy.i8042 != X86_LEGACY_I8042_EXPECTED_PRESENT) return -ENODEV; +#endif pr_info("Probing ports directly.\n"); return 0; #endif diff --git a/drivers/input/serio/i8042.h b/drivers/input/serio/i8042.h index bf2592fa9a783..adb5173372d3e 100644 --- a/drivers/input/serio/i8042.h +++ b/drivers/input/serio/i8042.h @@ -19,7 +19,7 @@ #include "i8042-snirm.h" 
#elif defined(CONFIG_SPARC) #include "i8042-sparcio.h" -#elif defined(CONFIG_X86) || defined(CONFIG_IA64) +#elif defined(CONFIG_X86) || defined(CONFIG_IA64) || defined(CONFIG_LOONGARCH) #include "i8042-acpipnpio.h" #else #include "i8042-io.h" -- GitLab From fe5b6aaef72a0f7daa06e7960e0bee45c2984e41 Mon Sep 17 00:00:00 2001 From: Liang He <windhl@126.com> Date: Sat, 1 Oct 2022 14:42:24 -0700 Subject: [PATCH 1628/2223] Input: i8042 - fix refount leak on sparc In i8042_platform_init() and i8042_platform_exit(), we should call of_node_put() for the reference 'root' returned by of_find_node_by_path() which has increased the refcount. Fixes: f57caaefacc2 ("[SERIO] i8042-sparcio.h: Convert to of_driver framework.") Signed-off-by: Liang He <windhl@126.com> Link: https://lore.kernel.org/r/20220711064300.358757-1-windhl@126.com [dtor: rearranged i8042_is_mr_coffee() a bit] Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> --- drivers/input/serio/i8042-sparcio.h | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/input/serio/i8042-sparcio.h b/drivers/input/serio/i8042-sparcio.h index fce76812843bb..c712c1fe06053 100644 --- a/drivers/input/serio/i8042-sparcio.h +++ b/drivers/input/serio/i8042-sparcio.h @@ -3,6 +3,7 @@ #define _I8042_SPARCIO_H #include <linux/of_device.h> +#include <linux/types.h> #include <asm/io.h> #include <asm/oplib.h> @@ -103,12 +104,25 @@ static struct platform_driver sparc_i8042_driver = { .remove = sparc_i8042_remove, }; -static int __init i8042_platform_init(void) +static bool i8042_is_mr_coffee(void) { - struct device_node *root = of_find_node_by_path("/"); - const char *name = of_get_property(root, "name", NULL); + struct device_node *root; + const char *name; + bool is_mr_coffee; + + root = of_find_node_by_path("/"); + + name = of_get_property(root, "name", NULL); + is_mr_coffee = name && !strcmp(name, "SUNW,JavaStation-1"); - if (name && !strcmp(name, "SUNW,JavaStation-1")) { + 
of_node_put(root); + + return is_mr_coffee; +} + +static int __init i8042_platform_init(void) +{ + if (i8042_is_mr_coffee()) { /* Hardcoded values for MrCoffee. */ i8042_kbd_irq = i8042_aux_irq = 13 | 0x20; kbd_iobase = ioremap(0x71300060, 8); @@ -136,10 +150,7 @@ static int __init i8042_platform_init(void) static inline void i8042_platform_exit(void) { - struct device_node *root = of_find_node_by_path("/"); - const char *name = of_get_property(root, "name", NULL); - - if (!name || strcmp(name, "SUNW,JavaStation-1")) + if (!i8042_is_mr_coffee()) platform_driver_unregister(&sparc_i8042_driver); } -- GitLab From 84cdf5bcbdce1622eeb6c857f8a7e383de1074a9 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko <vfedorenko@novek.ru> Date: Mon, 10 Oct 2022 04:29:34 +0300 Subject: [PATCH 1629/2223] ptp: ocp: remove symlink for second GNSS The destroy code doesn't remove the symlink for the ttyGNSS2 device introduced earlier. Add the cleanup code. Fixes: 71d7e0850476 ("ptp: ocp: Add second GNSS device") Signed-off-by: Vadim Fedorenko <vadfed@fb.com> Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com> Signed-off-by: David S.
Miller <davem@davemloft.net> --- drivers/ptp/ptp_ocp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c index d36c3f597f777..a48d9b7d29217 100644 --- a/drivers/ptp/ptp_ocp.c +++ b/drivers/ptp/ptp_ocp.c @@ -3657,6 +3657,7 @@ ptp_ocp_detach_sysfs(struct ptp_ocp *bp) struct device *dev = &bp->dev; sysfs_remove_link(&dev->kobj, "ttyGNSS"); + sysfs_remove_link(&dev->kobj, "ttyGNSS2"); sysfs_remove_link(&dev->kobj, "ttyMAC"); sysfs_remove_link(&dev->kobj, "ptp"); sysfs_remove_link(&dev->kobj, "pps"); -- GitLab From af7d23f9d96a3e9647cff8619a6860d73b109b5f Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Mon, 10 Oct 2022 11:39:45 +0800 Subject: [PATCH 1630/2223] octeontx2-pf: mcs: fix possible memory leak in otx2_probe() In the error path after calling cn10k_mcs_init(), cn10k_mcs_free() needs to be called to avoid a memory leak. Fixes: c54ffc73601c ("octeontx2-pf: mcs: Introduce MACSEC hardware offloading") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: David S.
Miller <davem@davemloft.net> --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 5803d7f9137ca..892ca88e0cf43 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -2810,7 +2810,7 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id) err = register_netdev(netdev); if (err) { dev_err(dev, "Failed to register netdevice\n"); - goto err_del_mcam_entries; + goto err_mcs_free; } err = otx2_wq_init(pf); @@ -2849,6 +2849,8 @@ err_mcam_flow_del: otx2_mcam_flow_del(pf); err_unreg_netdev: unregister_netdev(netdev); +err_mcs_free: + cn10k_mcs_free(pf); err_del_mcam_entries: otx2_mcam_flow_del(pf); err_ptp_destroy: -- GitLab From 7023472834a39341460dae5c9b506c76c5940cad Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Date: Mon, 3 Oct 2022 13:16:30 +0100 Subject: [PATCH 1631/2223] drm/i915/guc: Fix revocation of non-persistent contexts Patch which added graceful exit for non-persistent contexts missed the fact it is not enough to set the exiting flag on a context and let the backend handle it from there. GuC backend cannot handle it because it runs independently in the firmware and driver might not see the requests ever again. Patch also missed the fact some usages of intel_context_is_banned in the GuC backend needed replacing with newly introduced intel_context_is_schedulable. Fix the first issue by calling into backend revoke when we know this is the last chance to do it. Fix the second issue by replacing intel_context_is_banned with intel_context_is_schedulable, which should always be safe since latter is a superset of the former. v2: * Just call ce->ops->revoke unconditionally. 
(Andrzej) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Fixes: 45c64ecf97ee ("drm/i915: Improve user experience and driver robustness under SIGINT or similar") Cc: Andrzej Hajda <andrzej.hajda@intel.com> Cc: John Harrison <John.C.Harrison@Intel.com> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> Cc: <stable@vger.kernel.org> # v6.0+ Reviewed-by: Andrzej Hajda <andrzej.hajda@intel.com> Acked-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20221003121630.694249-1-tvrtko.ursulin@linux.intel.com (cherry picked from commit 0add082cebac8555ee3972ba768ae5c01db7a498) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 8 +----- drivers/gpu/drm/i915/gt/intel_context.c | 5 ++-- drivers/gpu/drm/i915/gt/intel_context.h | 3 +-- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 26 +++++++++---------- 4 files changed, 17 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index dabdfe09f5e51..e7148a994b3ad 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -1383,14 +1383,8 @@ kill_engines(struct i915_gem_engines *engines, bool exit, bool persistent) */ for_each_gem_engine(ce, engines, it) { struct intel_engine_cs *engine; - bool skip = false; - if (exit) - skip = intel_context_set_exiting(ce); - else if (!persistent) - skip = intel_context_exit_nonpersistent(ce, NULL); - - if (skip) + if ((exit || !persistent) && intel_context_revoke(ce)) continue; /* Already marked. 
*/ /* diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index 654a092ed3d69..e94365b08f1ef 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -614,13 +614,12 @@ bool intel_context_ban(struct intel_context *ce, struct i915_request *rq) return ret; } -bool intel_context_exit_nonpersistent(struct intel_context *ce, - struct i915_request *rq) +bool intel_context_revoke(struct intel_context *ce) { bool ret = intel_context_set_exiting(ce); if (ce->ops->revoke) - ce->ops->revoke(ce, rq, ce->engine->props.preempt_timeout_ms); + ce->ops->revoke(ce, NULL, ce->engine->props.preempt_timeout_ms); return ret; } diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h index 8e2d70630c49e..be09fb2e883a5 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.h +++ b/drivers/gpu/drm/i915/gt/intel_context.h @@ -329,8 +329,7 @@ static inline bool intel_context_set_exiting(struct intel_context *ce) return test_and_set_bit(CONTEXT_EXITING, &ce->flags); } -bool intel_context_exit_nonpersistent(struct intel_context *ce, - struct i915_request *rq); +bool intel_context_revoke(struct intel_context *ce); static inline bool intel_context_force_single_submission(const struct intel_context *ce) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 22ba66e48a9b0..1db59eeb34db9 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -684,7 +684,7 @@ static int __guc_add_request(struct intel_guc *guc, struct i915_request *rq) * Corner case where requests were sitting in the priority list or a * request resubmitted after the context was banned. 
*/ - if (unlikely(intel_context_is_banned(ce))) { + if (unlikely(!intel_context_is_schedulable(ce))) { i915_request_put(i915_request_mark_eio(rq)); intel_engine_signal_breadcrumbs(ce->engine); return 0; @@ -870,15 +870,15 @@ static int guc_wq_item_append(struct intel_guc *guc, struct i915_request *rq) { struct intel_context *ce = request_to_scheduling_context(rq); - int ret = 0; + int ret; - if (likely(!intel_context_is_banned(ce))) { - ret = __guc_wq_item_append(rq); + if (unlikely(!intel_context_is_schedulable(ce))) + return 0; - if (unlikely(ret == -EBUSY)) { - guc->stalled_request = rq; - guc->submission_stall_reason = STALL_MOVE_LRC_TAIL; - } + ret = __guc_wq_item_append(rq); + if (unlikely(ret == -EBUSY)) { + guc->stalled_request = rq; + guc->submission_stall_reason = STALL_MOVE_LRC_TAIL; } return ret; @@ -897,7 +897,7 @@ static bool multi_lrc_submit(struct i915_request *rq) * submitting all the requests generated in parallel. */ return test_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL, &rq->fence.flags) || - intel_context_is_banned(ce); + !intel_context_is_schedulable(ce); } static int guc_dequeue_one_context(struct intel_guc *guc) @@ -966,7 +966,7 @@ register_context: struct intel_context *ce = request_to_scheduling_context(last); if (unlikely(!ctx_id_mapped(guc, ce->guc_id.id) && - !intel_context_is_banned(ce))) { + intel_context_is_schedulable(ce))) { ret = try_context_registration(ce, false); if (unlikely(ret == -EPIPE)) { goto deadlk; @@ -1576,7 +1576,7 @@ static void guc_reset_state(struct intel_context *ce, u32 head, bool scrub) { struct intel_engine_cs *engine = __context_to_physical_engine(ce); - if (intel_context_is_banned(ce)) + if (!intel_context_is_schedulable(ce)) return; GEM_BUG_ON(!intel_context_is_pinned(ce)); @@ -4424,12 +4424,12 @@ static void guc_handle_context_reset(struct intel_guc *guc, { trace_intel_context_reset(ce); - if (likely(!intel_context_is_banned(ce))) { + if (likely(intel_context_is_schedulable(ce))) { capture_error_state(guc, ce); 
guc_context_replay(ce); } else { drm_info(&guc_to_gt(guc)->i915->drm, - "Ignoring context reset notification of banned context 0x%04X on %s", + "Ignoring context reset notification of exiting context 0x%04X on %s", ce->guc_id.id, ce->engine->name); } } -- GitLab From c5e595e752b3a1c68cca57c3559521237332fbec Mon Sep 17 00:00:00 2001 From: Matthew Auld <matthew.auld@intel.com> Date: Tue, 4 Oct 2022 14:19:13 +0100 Subject: [PATCH 1632/2223] drm/i915/display: handle migration for dpt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On platforms like DG2, it looks like the dpt path here is missing the migrate-to-lmem step on discrete platforms. v2: - Move the vma_pin() under the for_i915_gem_ww(), otherwise the object can be moved after dropping the lock and then doing the pin. Fixes: 33e7a975103c ("drm/i915/xelpd: First stab at DPT support") Signed-off-by: Matthew Auld <matthew.auld@intel.com> Cc: Jianshui Yu <jianshui.yu@intel.com> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> Cc: Nirmoy Das <nirmoy.das@intel.com> Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20221004131916.233474-2-matthew.auld@intel.com (cherry picked from commit 5769f64ff09aab23a9045fa13b464fb5070d3fb2) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> --- drivers/gpu/drm/i915/display/intel_fb_pin.c | 51 +++++++++++++-------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_fb_pin.c b/drivers/gpu/drm/i915/display/intel_fb_pin.c index c86e5d4ee016f..733972fab07f5 100644 --- a/drivers/gpu/drm/i915/display/intel_fb_pin.c +++ b/drivers/gpu/drm/i915/display/intel_fb_pin.c @@ -26,10 +26,17 @@ intel_pin_fb_obj_dpt(struct drm_framebuffer *fb, struct drm_device *dev = fb->dev; struct drm_i915_private *dev_priv = to_i915(dev); struct drm_i915_gem_object *obj = intel_fb_obj(fb); + struct i915_gem_ww_ctx ww; struct i915_vma *vma; u32 alignment; 
int ret; + /* + * We are not syncing against the binding (and potential migrations) + * below, so this vm must never be async. + */ + GEM_WARN_ON(vm->bind_async_flags); + if (WARN_ON(!i915_gem_object_is_framebuffer(obj))) return ERR_PTR(-EINVAL); @@ -37,29 +44,37 @@ intel_pin_fb_obj_dpt(struct drm_framebuffer *fb, atomic_inc(&dev_priv->gpu_error.pending_fb_pin); - ret = i915_gem_object_lock_interruptible(obj, NULL); - if (!ret) { + for_i915_gem_ww(&ww, ret, true) { + ret = i915_gem_object_lock(obj, &ww); + if (ret) + continue; + + if (HAS_LMEM(dev_priv)) { + ret = i915_gem_object_migrate(obj, &ww, INTEL_REGION_LMEM_0); + if (ret) + continue; + } + ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE); - i915_gem_object_unlock(obj); - } - if (ret) { - vma = ERR_PTR(ret); - goto err; - } + if (ret) + continue; - vma = i915_vma_instance(obj, vm, view); - if (IS_ERR(vma)) - goto err; + vma = i915_vma_instance(obj, vm, view); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + continue; + } - if (i915_vma_misplaced(vma, 0, alignment, 0)) { - ret = i915_vma_unbind_unlocked(vma); - if (ret) { - vma = ERR_PTR(ret); - goto err; + if (i915_vma_misplaced(vma, 0, alignment, 0)) { + ret = i915_vma_unbind(vma); + if (ret) + continue; } - } - ret = i915_vma_pin(vma, 0, alignment, PIN_GLOBAL); + ret = i915_vma_pin_ww(vma, &ww, 0, alignment, PIN_GLOBAL); + if (ret) + continue; + } if (ret) { vma = ERR_PTR(ret); goto err; -- GitLab From aebe9f4639b13a1f4e9a6b42cdd2e38c617b442d Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Wed, 28 Sep 2022 21:56:15 +0200 Subject: [PATCH 1633/2223] wifi: cfg80211: fix u8 overflow in cfg80211_update_notlisted_nontrans() In the copy code of the elements, we do the following calculation to reach the end of the MBSSID element: /* copy the IEs after MBSSID */ cpy_len = mbssid[1] + 2; This looks fine, however, cpy_len is a u8, the same as mbssid[1], so the addition of two can overflow. 
In this case the subsequent memcpy() will overflow the allocated buffer, since it copies 256 bytes too much due to the way the allocation and memcpy() sizes are calculated. Fix this by using size_t for the cpy_len variable. This fixes CVE-2022-41674. Reported-by: Soenke Huster <shuster@seemoo.tu-darmstadt.de> Tested-by: Soenke Huster <shuster@seemoo.tu-darmstadt.de> Fixes: 0b8fb8235be8 ("cfg80211: Parsing of Multiple BSSID information in scanning") Reviewed-by: Kees Cook <keescook@chromium.org> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/wireless/scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 5382fc2003db4..62f8c10412ad3 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -2279,7 +2279,7 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, size_t new_ie_len; struct cfg80211_bss_ies *new_ies; const struct cfg80211_bss_ies *old; - u8 cpy_len; + size_t cpy_len; lockdep_assert_held(&wiphy_to_rdev(wiphy)->bss_lock); -- GitLab From 8f033d2becc24aa6bfd2a5c104407963560caabc Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Wed, 28 Sep 2022 22:01:37 +0200 Subject: [PATCH 1634/2223] wifi: cfg80211/mac80211: reject bad MBSSID elements Per spec, the maximum value for the MaxBSSID ('n') indicator is 8, and the minimum is 1 since a multiple BSSID set with just one BSSID doesn't make sense (the # of BSSIDs is limited by 2^n). Limit this in the parsing in both cfg80211 and mac80211, rejecting any elements with an invalid value. This fixes potentially bad shifts in the processing of these inside the cfg80211_gen_new_bssid() function later. I found this during the investigation of CVE-2022-41674 fixed by the previous patch. 
Fixes: 0b8fb8235be8 ("cfg80211: Parsing of Multiple BSSID information in scanning") Fixes: 78ac51f81532 ("mac80211: support multi-bssid") Reviewed-by: Kees Cook <keescook@chromium.org> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/util.c | 2 ++ net/wireless/scan.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index bf7461c41beff..f61289c5fed24 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1445,6 +1445,8 @@ static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len, for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID, start, len) { if (elem->datalen < 2) continue; + if (elem->data[0] < 1 || elem->data[0] > 8) + continue; for_each_element(sub, elem->data + 1, elem->datalen - 1) { u8 new_bssid[ETH_ALEN]; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 62f8c10412ad3..5dab33e1f3a81 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -2143,6 +2143,8 @@ static void cfg80211_parse_mbssid_data(struct wiphy *wiphy, for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID, ie, ielen) { if (elem->datalen < 4) continue; + if (elem->data[0] < 1 || (int)elem->data[0] > 8) + continue; for_each_element(sub, elem->data + 1, elem->datalen - 1) { u8 profile_len; -- GitLab From ff05d4b45dd89b922578dac497dcabf57cf771c6 Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Wed, 28 Sep 2022 22:07:15 +0200 Subject: [PATCH 1635/2223] wifi: mac80211: fix MBSSID parsing use-after-free When we parse a multi-BSSID element, we might point some element pointers into the allocated nontransmitted_profile. However, we free this before returning, causing UAF when the relevant pointers in the parsed elements are accessed. Fix this by not allocating the scratch buffer separately but as part of the returned structure instead, that way, there are no lifetime issues with it. 
The scratch buffer introduction as part of the returned data here is taken from MLO feature work done by Ilan. This fixes CVE-2022-42719. Fixes: 5023b14cf4df ("mac80211: support profile split between elements") Co-developed-by: Ilan Peer <ilan.peer@intel.com> Signed-off-by: Ilan Peer <ilan.peer@intel.com> Reviewed-by: Kees Cook <keescook@chromium.org> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/ieee80211_i.h | 8 ++++++++ net/mac80211/util.c | 30 +++++++++++++++--------------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 4e1d4c339f2de..a842f2e1c2309 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1709,6 +1709,14 @@ struct ieee802_11_elems { /* whether a parse error occurred while retrieving these elements */ bool parse_error; + + /* + * scratch buffer that can be used for various element parsing related + * tasks, e.g., element de-fragmentation etc. + */ + size_t scratch_len; + u8 *scratch_pos; + u8 scratch[]; }; static inline struct ieee80211_local *hw_to_local( diff --git a/net/mac80211/util.c b/net/mac80211/util.c index f61289c5fed24..99e903299143e 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1506,24 +1506,26 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) const struct element *non_inherit = NULL; u8 *nontransmitted_profile; int nontransmitted_profile_len = 0; + size_t scratch_len = params->len; - elems = kzalloc(sizeof(*elems), GFP_ATOMIC); + elems = kzalloc(sizeof(*elems) + scratch_len, GFP_ATOMIC); if (!elems) return NULL; elems->ie_start = params->start; elems->total_len = params->len; - - nontransmitted_profile = kmalloc(params->len, GFP_ATOMIC); - if (nontransmitted_profile) { - nontransmitted_profile_len = - ieee802_11_find_bssid_profile(params->start, params->len, - elems, params->bss, - nontransmitted_profile); - non_inherit = - 
cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, - nontransmitted_profile, - nontransmitted_profile_len); - } + elems->scratch_len = scratch_len; + elems->scratch_pos = elems->scratch; + + nontransmitted_profile = elems->scratch_pos; + nontransmitted_profile_len = + ieee802_11_find_bssid_profile(params->start, params->len, + elems, params->bss, + nontransmitted_profile); + elems->scratch_pos += nontransmitted_profile_len; + elems->scratch_len -= nontransmitted_profile_len; + non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + nontransmitted_profile, + nontransmitted_profile_len); elems->crc = _ieee802_11_parse_elems_full(params, elems, non_inherit); @@ -1557,8 +1559,6 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) offsetofend(struct ieee80211_bssid_index, dtim_count)) elems->dtim_count = elems->bssid_index->dtim_count; - kfree(nontransmitted_profile); - return elems; } -- GitLab From 567e14e39e8f8c6997a1378bc3be615afca86063 Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Thu, 29 Sep 2022 21:50:44 +0200 Subject: [PATCH 1636/2223] wifi: cfg80211: ensure length byte is present before access When iterating the elements here, ensure the length byte is present before checking it to see if the entire element will fit into the buffer. Longer term, we should rewrite this code using the type-safe element iteration macros that check all of this. 
Fixes: 0b8fb8235be8 ("cfg80211: Parsing of Multiple BSSID information in scanning") Reported-by: Soenke Huster <shuster@seemoo.tu-darmstadt.de> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/wireless/scan.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 5dab33e1f3a81..a183f2b758742 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -304,7 +304,8 @@ static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, tmp_old = cfg80211_find_ie(WLAN_EID_SSID, ie, ielen); tmp_old = (tmp_old) ? tmp_old + tmp_old[1] + 2 : ie; - while (tmp_old + tmp_old[1] + 2 - ie <= ielen) { + while (tmp_old + 2 - ie <= ielen && + tmp_old + tmp_old[1] + 2 - ie <= ielen) { if (tmp_old[0] == 0) { tmp_old++; continue; @@ -364,7 +365,8 @@ static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, * copied to new ie, skip ssid, capability, bssid-index ie */ tmp_new = sub_copy; - while (tmp_new + tmp_new[1] + 2 - sub_copy <= subie_len) { + while (tmp_new + 2 - sub_copy <= subie_len && + tmp_new + tmp_new[1] + 2 - sub_copy <= subie_len) { if (!(tmp_new[0] == WLAN_EID_NON_TX_BSSID_CAP || tmp_new[0] == WLAN_EID_SSID)) { memcpy(pos, tmp_new, tmp_new[1] + 2); -- GitLab From 0b7808818cb9df6680f98996b8e9a439fa7bcc2f Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Fri, 30 Sep 2022 23:44:23 +0200 Subject: [PATCH 1637/2223] wifi: cfg80211: fix BSS refcounting bugs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are multiple refcounting bugs related to multi-BSSID: - In bss_ref_get(), if the BSS has a hidden_beacon_bss, then the bss pointer is overwritten before checking for the transmitted BSS, which is clearly wrong. Fix this by using the bss_from_pub() macro. - In cfg80211_bss_update() we copy the transmitted_bss pointer from tmp into new, but then if we release new, we'll unref it erroneously. 
We already set the pointer and ref it, but need to NULL it since it was copied from the tmp data. - In cfg80211_inform_single_bss_data(), if adding to the non- transmitted list fails, we unlink the BSS and yet still we return it, but this results in returning an entry without a reference. We shouldn't return it anyway if it was broken enough to not get added there. This fixes CVE-2022-42720. Reported-by: Sönke Huster <shuster@seemoo.tu-darmstadt.de> Tested-by: Sönke Huster <shuster@seemoo.tu-darmstadt.de> Fixes: a3584f56de1c ("cfg80211: Properly track transmitting and non-transmitting BSS") Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/wireless/scan.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index a183f2b758742..249107212c099 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -143,18 +143,12 @@ static inline void bss_ref_get(struct cfg80211_registered_device *rdev, lockdep_assert_held(&rdev->bss_lock); bss->refcount++; - if (bss->pub.hidden_beacon_bss) { - bss = container_of(bss->pub.hidden_beacon_bss, - struct cfg80211_internal_bss, - pub); - bss->refcount++; - } - if (bss->pub.transmitted_bss) { - bss = container_of(bss->pub.transmitted_bss, - struct cfg80211_internal_bss, - pub); - bss->refcount++; - } + + if (bss->pub.hidden_beacon_bss) + bss_from_pub(bss->pub.hidden_beacon_bss)->refcount++; + + if (bss->pub.transmitted_bss) + bss_from_pub(bss->pub.transmitted_bss)->refcount++; } static inline void bss_ref_put(struct cfg80211_registered_device *rdev, @@ -1741,6 +1735,8 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, new->refcount = 1; INIT_LIST_HEAD(&new->hidden_list); INIT_LIST_HEAD(&new->pub.nontrans_list); + /* we'll set this later if it was non-NULL */ + new->pub.transmitted_bss = NULL; if (rcu_access_pointer(tmp->pub.proberesp_ies)) { hidden = rb_find_bss(rdev, tmp, BSS_CMP_HIDE_ZLEN); @@ -2023,10 +2019,15 @@ 
cfg80211_inform_single_bss_data(struct wiphy *wiphy, spin_lock_bh(&rdev->bss_lock); if (cfg80211_add_nontrans_list(non_tx_data->tx_bss, &res->pub)) { - if (__cfg80211_unlink_bss(rdev, res)) + if (__cfg80211_unlink_bss(rdev, res)) { rdev->bss_generation++; + res = NULL; + } } spin_unlock_bh(&rdev->bss_lock); + + if (!res) + return NULL; } trace_cfg80211_return_bss(&res->pub); -- GitLab From bcca852027e5878aec911a347407ecc88d6fff7f Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Sat, 1 Oct 2022 00:01:44 +0200 Subject: [PATCH 1638/2223] wifi: cfg80211: avoid nontransmitted BSS list corruption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a non-transmitted BSS shares enough information (both SSID and BSSID!) with another non-transmitted BSS of a different AP, then we can find and update it, and then try to add it to the non-transmitted BSS list. We do a search for it on the transmitted BSS, but if it's not there (but belongs to another transmitted BSS), the list gets corrupted. Since this is an erroneous situation, simply fail the list insertion in this case and free the non-transmitted BSS. This fixes CVE-2022-42721. Reported-by: Sönke Huster <shuster@seemoo.tu-darmstadt.de> Tested-by: Sönke Huster <shuster@seemoo.tu-darmstadt.de> Fixes: 0b8fb8235be8 ("cfg80211: Parsing of Multiple BSSID information in scanning") Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/wireless/scan.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 249107212c099..703b05c6c43e7 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -423,6 +423,15 @@ cfg80211_add_nontrans_list(struct cfg80211_bss *trans_bss, rcu_read_unlock(); + /* + * This is a bit weird - it's not on the list, but already on another + * one! 
The only way that could happen is if there's some BSSID/SSID + * shared by multiple APs in their multi-BSSID profiles, potentially + * with hidden SSID mixed in ... ignore it. + */ + if (!list_empty(&nontrans_bss->nontrans_list)) + return -EINVAL; + /* add to the list */ list_add_tail(&nontrans_bss->nontrans_list, &trans_bss->nontrans_list); return 0; -- GitLab From 1833b6f46d7e2830251a063935ab464256defe22 Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Wed, 5 Oct 2022 15:10:09 +0200 Subject: [PATCH 1639/2223] wifi: mac80211_hwsim: avoid mac80211 warning on bad rate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the tool on the other side (e.g. wmediumd) gets confused about the rate, we hit a warning in mac80211. Silence that by effectively duplicating the check here and dropping the frame silently (in mac80211 it's dropped with the warning). Reported-by: Sönke Huster <shuster@seemoo.tu-darmstadt.de> Tested-by: Sönke Huster <shuster@seemoo.tu-darmstadt.de> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- drivers/net/wireless/mac80211_hwsim.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index df51b5b1f1710..a40636c90ec36 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -4973,6 +4973,8 @@ static int hwsim_cloned_frame_received_nl(struct sk_buff *skb_2, } rx_status.rate_idx = nla_get_u32(info->attrs[HWSIM_ATTR_RX_RATE]); + if (rx_status.rate_idx >= data2->hw->wiphy->bands[rx_status.band]->n_bitrates) + goto out; rx_status.signal = nla_get_u32(info->attrs[HWSIM_ATTR_SIGNAL]); hdr = (void *)skb->data; -- GitLab From b2d03cabe2b2e150ff5a381731ea0355459be09f Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Wed, 5 Oct 2022 21:24:10 +0200 Subject: [PATCH 1640/2223] wifi: mac80211: fix crash in beacon protection for P2P-device 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If beacon protection is active but the beacon cannot be decrypted or is otherwise malformed, we call the cfg80211 API to report this to userspace, but that uses a netdev pointer, which isn't present for P2P-Device. Fix this to call it only conditionally to ensure cfg80211 won't crash in the case of P2P-Device. This fixes CVE-2022-42722. Reported-by: Sönke Huster <shuster@seemoo.tu-darmstadt.de> Fixes: 9eaf183af741 ("mac80211: Report beacon protection failures to user space") Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/rx.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index bd215fe3c7969..6001adc0a00e3 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1978,10 +1978,11 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (mmie_keyidx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS || mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + - NUM_DEFAULT_BEACON_KEYS) { - cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, - skb->data, - skb->len); + NUM_DEFAULT_BEACON_KEYS) { + if (rx->sdata->dev) + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, + skb->data, + skb->len); return RX_DROP_MONITOR; /* unexpected BIP keyidx */ } @@ -2131,7 +2132,8 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) /* either the frame has been decrypted or will be dropped */ status->flag |= RX_FLAG_DECRYPTED; - if (unlikely(ieee80211_is_beacon(fc) && result == RX_DROP_UNUSABLE)) + if (unlikely(ieee80211_is_beacon(fc) && result == RX_DROP_UNUSABLE && + rx->sdata->dev)) cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, skb->data, skb->len); -- GitLab From c90b93b5b782891ebfda49d4e5da36632fefd5d1 Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Wed, 5 Oct 2022 23:11:43 +0200 Subject: [PATCH 1641/2223] wifi: cfg80211: update hidden BSSes to avoid WARN_ON MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When updating beacon elements in a non-transmitted BSS, also update the hidden sub-entries to the same beacon elements, so that a future update through other paths won't trigger a WARN_ON(). The warning is triggered because the beacon elements in the hidden BSSes that are children of the BSS should always be the same as in the parent. Reported-by: Sönke Huster <shuster@seemoo.tu-darmstadt.de> Tested-by: Sönke Huster <shuster@seemoo.tu-darmstadt.de> Fixes: 0b8fb8235be8 ("cfg80211: Parsing of Multiple BSSID information in scanning") Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/wireless/scan.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 703b05c6c43e7..806a5f1330ff5 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1607,6 +1607,23 @@ struct cfg80211_non_tx_bss { u8 bssid_index; }; +static void cfg80211_update_hidden_bsses(struct cfg80211_internal_bss *known, + const struct cfg80211_bss_ies *new_ies, + const struct cfg80211_bss_ies *old_ies) +{ + struct cfg80211_internal_bss *bss; + + /* Assign beacon IEs to all sub entries */ + list_for_each_entry(bss, &known->hidden_list, hidden_list) { + const struct cfg80211_bss_ies *ies; + + ies = rcu_access_pointer(bss->pub.beacon_ies); + WARN_ON(ies != old_ies); + + rcu_assign_pointer(bss->pub.beacon_ies, new_ies); + } +} + static bool cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *known, @@ -1630,7 +1647,6 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); } else if (rcu_access_pointer(new->pub.beacon_ies)) { const struct cfg80211_bss_ies *old; - struct cfg80211_internal_bss *bss; if (known->pub.hidden_beacon_bss && !list_empty(&known->hidden_list)) { @@ -1658,16 +1674,7 @@ cfg80211_update_known_bss(struct 
cfg80211_registered_device *rdev, if (old == rcu_access_pointer(known->pub.ies)) rcu_assign_pointer(known->pub.ies, new->pub.beacon_ies); - /* Assign beacon IEs to all sub entries */ - list_for_each_entry(bss, &known->hidden_list, hidden_list) { - const struct cfg80211_bss_ies *ies; - - ies = rcu_access_pointer(bss->pub.beacon_ies); - WARN_ON(ies != old); - - rcu_assign_pointer(bss->pub.beacon_ies, - new->pub.beacon_ies); - } + cfg80211_update_hidden_bsses(known, new->pub.beacon_ies, old); if (old) kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); @@ -2360,6 +2367,8 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, } else { old = rcu_access_pointer(nontrans_bss->beacon_ies); rcu_assign_pointer(nontrans_bss->beacon_ies, new_ies); + cfg80211_update_hidden_bsses(bss_from_pub(nontrans_bss), + new_ies, old); rcu_assign_pointer(nontrans_bss->ies, new_ies); if (old) kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); -- GitLab From f3e59ff348c077a6afd4edb23d7e69e9cba62fdc Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter <oberpar@linux.ibm.com> Date: Wed, 28 Sep 2022 15:34:33 +0200 Subject: [PATCH 1642/2223] s390/vmur: remove unnecessary BUG statement An existing BUG statement in vmur's interrupt handler triggers if: 1. An online vmur device is removed (e.g. due to driver unload, manual unbind or channel-report words indicating hypervisor-side device removal) 2. Device deactivation fails due to firmware/hypervisor error, leaving subchannel enabled for interrupts + drvdata=NULL 3. Interrupt occurs This situation is highly unlikely and not a clear indication of a general system error that would warrant stopping the full Linux system. Also it can be prevented completely by clearing the interrupt handler when unsetting a vmur device's drvdata. Replace the BUG statement in vmur's interrupt handler by clearing the interrupt handler callback during device removal. Also move the initial setting of the interrupt handler callback under lock for consistency reasons. 
Reviewed-by: Sven Schnelle <svens@linux.ibm.com> Reviewed-by: Heiko Carstens <hca@linux.ibm.com> Signed-off-by: Peter Oberparleiter <oberpar@linux.ibm.com> Signed-off-by: Vasily Gorbik <gor@linux.ibm.com> --- drivers/s390/char/vmur.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/char/vmur.c b/drivers/s390/char/vmur.c index 68f49e2e964c0..471f07ca5066c 100644 --- a/drivers/s390/char/vmur.c +++ b/drivers/s390/char/vmur.c @@ -293,7 +293,6 @@ static void ur_int_handler(struct ccw_device *cdev, unsigned long intparm, return; } urd = dev_get_drvdata(&cdev->dev); - BUG_ON(!urd); /* On special conditions irb is an error pointer */ if (IS_ERR(irb)) urd->io_request_rc = PTR_ERR(irb); @@ -809,7 +808,6 @@ static int ur_probe(struct ccw_device *cdev) rc = -ENOMEM; goto fail_urdev_put; } - cdev->handler = ur_int_handler; /* validate virtual unit record device */ urd->class = get_urd_class(urd); @@ -823,6 +821,7 @@ static int ur_probe(struct ccw_device *cdev) } spin_lock_irq(get_ccwdev_lock(cdev)); dev_set_drvdata(&cdev->dev, urd); + cdev->handler = ur_int_handler; spin_unlock_irq(get_ccwdev_lock(cdev)); mutex_unlock(&vmur_mutex); @@ -963,6 +962,7 @@ static void ur_remove(struct ccw_device *cdev) spin_lock_irqsave(get_ccwdev_lock(cdev), flags); urdev_put(dev_get_drvdata(&cdev->dev)); dev_set_drvdata(&cdev->dev, NULL); + cdev->handler = NULL; spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); mutex_unlock(&vmur_mutex); -- GitLab From bf18140d30541c2c1e5c0f57879634f3d0d04912 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter <oberpar@linux.ibm.com> Date: Mon, 26 Sep 2022 16:56:04 +0200 Subject: [PATCH 1643/2223] s390/vmur: generate uevent on unsolicited device end When a traditional channel-attached device transitions from not-ready to ready state, an unsolicited DEVICE END I/O interrupt is raised. This happens for example when a new file arrives in the z/VM virtual reader device. 
Change the Linux kernel to generate a change uevent when such an interrupt occurs for any online unit record devices supported by the vmur driver. This can be useful to automatically trigger processing of files as they arrive in the reader device. A sample udev rule for running a program when this event occurs looks as follows: ENV{DRIVER}=="vmur", ACTION=="change", ENV{EVENT}=="unsol_de", \ RUN{program}="/path/to/program" The rule can be tested using the following steps: 1. Set reader device online (assuming default reader device number 000c) $ chzdev -ea 0.0.000c 2. Force a ready-state transition using z/VM's READY CP command $ vmcp ready 000c Suggested-by: Alan Altmark <Alan_Altmark@us.ibm.com> Reviewed-by: Heiko Carstens <hca@linux.ibm.com> Reviewed-by: Sven Schnelle <svens@linux.ibm.com> Signed-off-by: Peter Oberparleiter <oberpar@linux.ibm.com> Signed-off-by: Vasily Gorbik <gor@linux.ibm.com> --- drivers/s390/char/vmur.c | 33 ++++++++++++++++++++++++++++++++- drivers/s390/char/vmur.h | 2 ++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/drivers/s390/char/vmur.c b/drivers/s390/char/vmur.c index 471f07ca5066c..131293f7f1521 100644 --- a/drivers/s390/char/vmur.c +++ b/drivers/s390/char/vmur.c @@ -15,12 +15,14 @@ #include <linux/cdev.h> #include <linux/slab.h> #include <linux/module.h> +#include <linux/kobject.h> #include <linux/uaccess.h> #include <asm/cio.h> #include <asm/ccwdev.h> #include <asm/debug.h> #include <asm/diag.h> +#include <asm/scsw.h> #include "vmur.h" @@ -78,6 +80,8 @@ static struct ccw_driver ur_driver = { static DEFINE_MUTEX(vmur_mutex); +static void ur_uevent(struct work_struct *ws); + /* * Allocation, freeing, getting and putting of urdev structures * @@ -108,6 +112,7 @@ static struct urdev *urdev_alloc(struct ccw_device *cdev) ccw_device_get_id(cdev, &urd->dev_id); mutex_init(&urd->io_mutex); init_waitqueue_head(&urd->wait); + INIT_WORK(&urd->uevent_work, ur_uevent); spin_lock_init(&urd->open_lock); 
refcount_set(&urd->ref_count, 1); urd->cdev = cdev; @@ -275,6 +280,18 @@ out: return rc; } +static void ur_uevent(struct work_struct *ws) +{ + struct urdev *urd = container_of(ws, struct urdev, uevent_work); + char *envp[] = { + "EVENT=unsol_de", /* Unsolicited device-end interrupt */ + NULL + }; + + kobject_uevent_env(&urd->cdev->dev.kobj, KOBJ_CHANGE, envp); + urdev_put(urd); +} + /* * ur interrupt handler, called from the ccw_device layer */ @@ -288,11 +305,21 @@ static void ur_int_handler(struct ccw_device *cdev, unsigned long intparm, intparm, irb->scsw.cmd.cstat, irb->scsw.cmd.dstat, irb->scsw.cmd.count); } + urd = dev_get_drvdata(&cdev->dev); if (!intparm) { TRACE("ur_int_handler: unsolicited interrupt\n"); + + if (scsw_dstat(&irb->scsw) & DEV_STAT_DEV_END) { + /* + * Userspace might be interested in a transition to + * device-ready state. + */ + urdev_get(urd); + schedule_work(&urd->uevent_work); + } + return; } - urd = dev_get_drvdata(&cdev->dev); /* On special conditions irb is an error pointer */ if (IS_ERR(irb)) urd->io_request_rc = PTR_ERR(irb); @@ -927,6 +954,10 @@ static int ur_set_offline_force(struct ccw_device *cdev, int force) rc = -EBUSY; goto fail_urdev_put; } + if (cancel_work_sync(&urd->uevent_work)) { + /* Work not run yet - need to release reference here */ + urdev_put(urd); + } device_destroy(vmur_class, urd->char_device->dev); cdev_del(urd->char_device); urd->char_device = NULL; diff --git a/drivers/s390/char/vmur.h b/drivers/s390/char/vmur.h index 608b0719ce17b..92d17d7cb47bd 100644 --- a/drivers/s390/char/vmur.h +++ b/drivers/s390/char/vmur.h @@ -13,6 +13,7 @@ #define _VMUR_H_ #include <linux/refcount.h> +#include <linux/workqueue.h> #define DEV_CLASS_UR_I 0x20 /* diag210 unit record input device class */ #define DEV_CLASS_UR_O 0x10 /* diag210 unit record output device class */ @@ -76,6 +77,7 @@ struct urdev { wait_queue_head_t wait; /* wait queue to serialize open */ int open_flag; /* "urdev is open" flag */ spinlock_t open_lock; /* 
serialize critical sections */ + struct work_struct uevent_work; /* work to send uevent */ }; /* -- GitLab From 2e21c1575208786f667cb66d8cf87a52160b81db Mon Sep 17 00:00:00 2001 From: Arnd Bergmann <arnd@arndb.de> Date: Mon, 10 Oct 2022 10:33:38 +0200 Subject: [PATCH 1644/2223] alpha: fix marvel_ioread8 build regression The previous build fix contained a small typo that led to another regression: arch/alpha/kernel/core_marvel.c:807:1: error: expected '=', ',', ';', 'asm' or '__attribute__' before 'marvel_ioread8' Reported-by: kernel test robot <lkp@intel.com> Fixes: e19d4ebc536d ("alpha: add full ioread64/iowrite64 implementation") Signed-off-by: Arnd Bergmann <arnd@arndb.de> --- arch/alpha/kernel/core_marvel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c index 6d0b3baf97ff4..e9348aec46499 100644 --- a/arch/alpha/kernel/core_marvel.c +++ b/arch/alpha/kernel/core_marvel.c @@ -803,7 +803,7 @@ void __iomem *marvel_ioportmap (unsigned long addr) return (void __iomem *)addr; } -unsigned u8 +u8 marvel_ioread8(const void __iomem *xaddr) { unsigned long addr = (unsigned long) xaddr; -- GitLab From 9afd20a5a1b3558950b5e12f3ed057ef93c64a05 Mon Sep 17 00:00:00 2001 From: Viresh Kumar <viresh.kumar@linaro.org> Date: Mon, 10 Oct 2022 09:52:37 +0530 Subject: [PATCH 1645/2223] clk: spear: Move prototype to accessible header Fixes the following W=1 kernel build warning(s): drivers/clk/spear/spear6xx_clock.c:116:13: warning: no previous prototype for function 'spear6xx_clk_init' [-Wmissing-prototypes] Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Arnd Bergmann <arnd@arndb.de> --- arch/arm/mach-spear/generic.h | 3 --- arch/arm/mach-spear/spear3xx.c | 1 + arch/arm/mach-spear/spear6xx.c | 1 + drivers/clk/spear/spear3xx_clock.c | 1 + drivers/clk/spear/spear6xx_clock.c | 1 + include/linux/clk/spear.h | 14 ++++++++++++++ 6 files 
changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/arm/mach-spear/generic.h b/arch/arm/mach-spear/generic.h index 43b7996ab7545..9e36920d4cfd1 100644 --- a/arch/arm/mach-spear/generic.h +++ b/arch/arm/mach-spear/generic.h @@ -25,11 +25,8 @@ extern struct pl022_ssp_controller pl022_plat_data; extern struct pl08x_platform_data pl080_plat_data; void __init spear_setup_of_timer(void); -void __init spear3xx_clk_init(void __iomem *misc_base, - void __iomem *soc_config_base); void __init spear3xx_map_io(void); void __init spear3xx_dt_init_irq(void); -void __init spear6xx_clk_init(void __iomem *misc_base); void __init spear13xx_map_io(void); void __init spear13xx_l2x0_init(void); diff --git a/arch/arm/mach-spear/spear3xx.c b/arch/arm/mach-spear/spear3xx.c index 2ba406e92c41b..7ef9670d30292 100644 --- a/arch/arm/mach-spear/spear3xx.c +++ b/arch/arm/mach-spear/spear3xx.c @@ -13,6 +13,7 @@ #include <linux/amba/pl022.h> #include <linux/amba/pl080.h> #include <linux/clk.h> +#include <linux/clk/spear.h> #include <linux/io.h> #include <asm/mach/map.h> #include "pl080.h" diff --git a/arch/arm/mach-spear/spear6xx.c b/arch/arm/mach-spear/spear6xx.c index 58183493e06d4..98d9f2ad6b743 100644 --- a/arch/arm/mach-spear/spear6xx.c +++ b/arch/arm/mach-spear/spear6xx.c @@ -12,6 +12,7 @@ #include <linux/amba/pl08x.h> #include <linux/clk.h> +#include <linux/clk/spear.h> #include <linux/err.h> #include <linux/of.h> #include <linux/of_address.h> diff --git a/drivers/clk/spear/spear3xx_clock.c b/drivers/clk/spear/spear3xx_clock.c index 41717ff707f69..ba87913031567 100644 --- a/drivers/clk/spear/spear3xx_clock.c +++ b/drivers/clk/spear/spear3xx_clock.c @@ -8,6 +8,7 @@ #include <linux/clk.h> #include <linux/clkdev.h> +#include <linux/clk/spear.h> #include <linux/err.h> #include <linux/io.h> #include <linux/of_platform.h> diff --git a/drivers/clk/spear/spear6xx_clock.c b/drivers/clk/spear/spear6xx_clock.c index 490701ac9e938..c192a9141b866 100644 --- a/drivers/clk/spear/spear6xx_clock.c 
+++ b/drivers/clk/spear/spear6xx_clock.c @@ -7,6 +7,7 @@ */ #include <linux/clkdev.h> +#include <linux/clk/spear.h> #include <linux/io.h> #include <linux/spinlock_types.h> #include "clk.h" diff --git a/include/linux/clk/spear.h b/include/linux/clk/spear.h index a64d034ceddd2..eaf95ca656f83 100644 --- a/include/linux/clk/spear.h +++ b/include/linux/clk/spear.h @@ -8,6 +8,20 @@ #ifndef __LINUX_CLK_SPEAR_H #define __LINUX_CLK_SPEAR_H +#ifdef CONFIG_ARCH_SPEAR3XX +void __init spear3xx_clk_init(void __iomem *misc_base, + void __iomem *soc_config_base); +#else +static inline void __init spear3xx_clk_init(void __iomem *misc_base, + void __iomem *soc_config_base) {} +#endif + +#ifdef CONFIG_ARCH_SPEAR6XX +void __init spear6xx_clk_init(void __iomem *misc_base); +#else +static inline void __init spear6xx_clk_init(void __iomem *misc_base) {} +#endif + #ifdef CONFIG_MACH_SPEAR1310 void __init spear1310_clk_init(void __iomem *misc_base, void __iomem *ras_base); #else -- GitLab From 390ca5bca7cdb95042dcb445375db0d9eba7aa4a Mon Sep 17 00:00:00 2001 From: Viresh Kumar <viresh.kumar@linaro.org> Date: Mon, 10 Oct 2022 09:41:07 +0530 Subject: [PATCH 1646/2223] ARM: spear6xx: Staticize few definitions Fix warnings with clang like: arch/arm/mach-spear/spear6xx.c:365:13: warning: no previous prototype for function 'spear6xx_map_io' [-Wmissing-prototypes] by making few definitions static. 
Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Arnd Bergmann <arnd@arndb.de> --- arch/arm/mach-spear/spear6xx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/mach-spear/spear6xx.c b/arch/arm/mach-spear/spear6xx.c index 98d9f2ad6b743..f0a1e704ccebc 100644 --- a/arch/arm/mach-spear/spear6xx.c +++ b/arch/arm/mach-spear/spear6xx.c @@ -340,7 +340,7 @@ static struct pl08x_platform_data spear6xx_pl080_plat_data = { * 0xD0000000 0xFD000000 * 0xFC000000 0xFC000000 */ -struct map_desc spear6xx_io_desc[] __initdata = { +static struct map_desc spear6xx_io_desc[] __initdata = { { .virtual = (unsigned long)VA_SPEAR6XX_ML_CPU_BASE, .pfn = __phys_to_pfn(SPEAR_ICM3_ML1_2_BASE), @@ -360,12 +360,12 @@ struct map_desc spear6xx_io_desc[] __initdata = { }; /* This will create static memory mapping for selected devices */ -void __init spear6xx_map_io(void) +static void __init spear6xx_map_io(void) { iotable_init(spear6xx_io_desc, ARRAY_SIZE(spear6xx_io_desc)); } -void __init spear6xx_timer_init(void) +static void __init spear6xx_timer_init(void) { char pclk_name[] = "pll3_clk"; struct clk *gpt_clk, *pclk; @@ -395,7 +395,7 @@ void __init spear6xx_timer_init(void) } /* Add auxdata to pass platform data */ -struct of_dev_auxdata spear6xx_auxdata_lookup[] __initdata = { +static struct of_dev_auxdata spear6xx_auxdata_lookup[] __initdata = { OF_DEV_AUXDATA("arm,pl080", SPEAR_ICM3_DMA_BASE, NULL, &spear6xx_pl080_plat_data), {} -- GitLab From bd60aafce5e1943fd395b8bf726e9824fa621eca Mon Sep 17 00:00:00 2001 From: Chen Lifu <chenlifu@huawei.com> Date: Wed, 17 Aug 2022 16:14:20 +0800 Subject: [PATCH 1647/2223] ARM: mmp: Make some symbols static These symbols pxa168_usb_phy_resources, pxa168_u2o_resources, pxa168_u2oehci_resources and pxa168_u2ootg_resources are not used outside of arch/arm/mach-mmp/devices.c, so mark them static. 
Fixes the following sparse warning: arch/arm/mach-mmp/devices.c:241:17: warning: symbol 'pxa168_usb_phy_resources' was not declared. Should it be static? arch/arm/mach-mmp/devices.c:262:17: warning: symbol 'pxa168_u2o_resources' was not declared. Should it be static? arch/arm/mach-mmp/devices.c:297:17: warning: symbol 'pxa168_u2oehci_resources' was not declared. Should it be static? arch/arm/mach-mmp/devices.c:324:17: warning: symbol 'pxa168_u2ootg_resources' was not declared. Should it be static? Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Chen Lifu <chenlifu@huawei.com> Signed-off-by: Arnd Bergmann <arnd@arndb.de> --- arch/arm/mach-mmp/devices.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/mach-mmp/devices.c b/arch/arm/mach-mmp/devices.c index 79f4a2aa5475d..9968239d80415 100644 --- a/arch/arm/mach-mmp/devices.c +++ b/arch/arm/mach-mmp/devices.c @@ -238,7 +238,7 @@ void pxa_usb_phy_deinit(void __iomem *phy_reg) static u64 __maybe_unused usb_dma_mask = ~(u32)0; #if IS_ENABLED(CONFIG_PHY_PXA_USB) -struct resource pxa168_usb_phy_resources[] = { +static struct resource pxa168_usb_phy_resources[] = { [0] = { .start = PXA168_U2O_PHYBASE, .end = PXA168_U2O_PHYBASE + USB_PHY_RANGE, @@ -259,7 +259,7 @@ struct platform_device pxa168_device_usb_phy = { #endif /* CONFIG_PHY_PXA_USB */ #if IS_ENABLED(CONFIG_USB_MV_UDC) -struct resource pxa168_u2o_resources[] = { +static struct resource pxa168_u2o_resources[] = { /* regbase */ [0] = { .start = PXA168_U2O_REGBASE + U2x_CAPREGS_OFFSET, @@ -294,7 +294,7 @@ struct platform_device pxa168_device_u2o = { #endif /* CONFIG_USB_MV_UDC */ #if IS_ENABLED(CONFIG_USB_EHCI_MV_U2O) -struct resource pxa168_u2oehci_resources[] = { +static struct resource pxa168_u2oehci_resources[] = { [0] = { .start = PXA168_U2O_REGBASE, .end = PXA168_U2O_REGBASE + USB_REG_RANGE, @@ -321,7 +321,7 @@ struct platform_device pxa168_device_u2oehci = { #endif #if IS_ENABLED(CONFIG_USB_MV_OTG) -struct resource 
pxa168_u2ootg_resources[] = { +static struct resource pxa168_u2ootg_resources[] = { /* regbase */ [0] = { .start = PXA168_U2O_REGBASE + U2x_CAPREGS_OFFSET, -- GitLab From 8a6ffcbe26fd14d58075dcf3cbdf1b5b69b20402 Mon Sep 17 00:00:00 2001 From: Zenghui Yu <yuzenghui@huawei.com> Date: Sun, 9 Oct 2022 11:31:31 +0800 Subject: [PATCH 1648/2223] KVM: arm64: selftests: Fix multiple versions of GIC creation Commit 98f94ce42ac6 ("KVM: selftests: Move KVM_CREATE_DEVICE_TEST code to separate helper") wrongly converted a "real" GIC device creation to __kvm_test_create_device() and caused the test failure on my D05 (which supports v2 emulation). Fix it. Fixes: 98f94ce42ac6 ("KVM: selftests: Move KVM_CREATE_DEVICE_TEST code to separate helper") Signed-off-by: Zenghui Yu <yuzenghui@huawei.com> Reviewed-by: Oliver Upton <oliver.upton@linux.dev> Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20221009033131.365-1-yuzenghui@huawei.com --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index e05ecb31823fb..9c131d977a1b5 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -662,8 +662,8 @@ int test_kvm_device(uint32_t gic_dev_type) : KVM_DEV_TYPE_ARM_VGIC_V2; if (!__kvm_test_create_device(v.vm, other)) { - ret = __kvm_test_create_device(v.vm, other); - TEST_ASSERT(ret && (errno == EINVAL || errno == EEXIST), + ret = __kvm_create_device(v.vm, other); + TEST_ASSERT(ret < 0 && (errno == EINVAL || errno == EEXIST), "create GIC device while other version exists"); } -- GitLab From 61367688f1fb07678b1d865a0ce9364f5267a896 Mon Sep 17 00:00:00 2001 From: Juergen Gross <jgross@suse.com> Date: Mon, 29 Aug 2022 13:26:08 +0200 Subject: [PATCH 1649/2223] xen/virtio: enable grant based virtio on x86 Use an x86-specific 
virtio_check_mem_acc_cb() for Xen in order to setup the correct DMA ops. Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> # common code Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> Signed-off-by: Juergen Gross <jgross@suse.com> --- arch/x86/xen/enlighten_hvm.c | 2 +- arch/x86/xen/enlighten_pv.c | 2 +- drivers/xen/grant-dma-ops.c | 12 +++++++++++- include/xen/xen-ops.h | 6 ++++++ 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 1c1ac418484b5..c1cd28e915a3a 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -212,7 +212,7 @@ static void __init xen_hvm_guest_init(void) return; if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) - virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); + virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc); init_hvm_pv_info(); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 0ed2e487a693f..0a5dcadf23b93 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -112,7 +112,7 @@ static void __init xen_pv_init_platform(void) { /* PV guests can't operate virtio devices without grants. 
*/ if (IS_ENABLED(CONFIG_XEN_VIRTIO)) - virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); + virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc); populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP)); diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c index 3e4c590896d08..860f37c93af41 100644 --- a/drivers/xen/grant-dma-ops.c +++ b/drivers/xen/grant-dma-ops.c @@ -313,7 +313,7 @@ bool xen_is_grant_dma_device(struct device *dev) bool xen_virtio_mem_acc(struct virtio_device *dev) { - if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) + if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT) || xen_pv_domain()) return true; return xen_is_grant_dma_device(dev->dev.parent); @@ -387,6 +387,16 @@ err: dev_err(dev, "Cannot set up Xen grant DMA ops, retain platform DMA ops\n"); } +bool xen_virtio_restricted_mem_acc(struct virtio_device *dev) +{ + bool ret = xen_virtio_mem_acc(dev); + + if (ret) + xen_grant_setup_dma_ops(dev->dev.parent); + + return ret; +} + MODULE_DESCRIPTION("Xen grant DMA-mapping layer"); MODULE_AUTHOR("Juergen Gross <jgross@suse.com>"); MODULE_LICENSE("GPL"); diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index dae0f350c6780..a34f4271a2e9f 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -219,6 +219,7 @@ static inline void xen_preemptible_hcall_end(void) { } void xen_grant_setup_dma_ops(struct device *dev); bool xen_is_grant_dma_device(struct device *dev); bool xen_virtio_mem_acc(struct virtio_device *dev); +bool xen_virtio_restricted_mem_acc(struct virtio_device *dev); #else static inline void xen_grant_setup_dma_ops(struct device *dev) { @@ -234,6 +235,11 @@ static inline bool xen_virtio_mem_acc(struct virtio_device *dev) { return false; } + +static inline bool xen_virtio_restricted_mem_acc(struct virtio_device *dev) +{ + return false; +} #endif /* CONFIG_XEN_GRANT_DMA_OPS */ #endif /* INCLUDE_XEN_OPS_H */ -- GitLab From 66ba7c88507344dee68ad1acbdb630473ab36114 Mon Sep 17 00:00:00 2001 From: "Luke D. 
Jones" <luke@ljones.dev> Date: Mon, 10 Oct 2022 19:57:02 +1300 Subject: [PATCH 1650/2223] ALSA: hda/realtek: Correct pin configs for ASUS G533Z The initial fix for ASUS G533Z was based on faulty information. This fixes the pincfg to values that have been verified with no existing module options or other hacks enabled. Enables headphone jack, and 5.1 surround. [ corrected the indent level by tiwai ] Fixes: bc2c23549ccd ("ALSA: hda/realtek: Add pincfg for ASUS G533Z HP jack") Signed-off-by: Luke D. Jones <luke@ljones.dev> Cc: <stable@vger.kernel.org> Link: https://lore.kernel.org/r/20221010065702.35190-1-luke@ljones.dev Signed-off-by: Takashi Iwai <tiwai@suse.de> --- sound/pci/hda/patch_realtek.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index d89f95ae0efc7..77a308a71cd42 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -8449,11 +8449,13 @@ static const struct hda_fixup alc269_fixups[] = { [ALC285_FIXUP_ASUS_G533Z_PINS] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { - { 0x14, 0x90170120 }, + { 0x14, 0x90170152 }, /* Speaker Surround Playback Switch */ + { 0x19, 0x03a19020 }, /* Mic Boost Volume */ + { 0x1a, 0x03a11c30 }, /* Mic Boost Volume */ + { 0x1e, 0x90170151 }, /* Rear jack, IN OUT EAPD Detect */ + { 0x21, 0x03211420 }, { } }, - .chained = true, - .chain_id = ALC294_FIXUP_ASUS_G513_PINS, }, [ALC294_FIXUP_ASUS_COEF_1B] = { .type = HDA_FIXUP_VERBS, -- GitLab From 2ea8e1297801f7b0220ebf6ae61a5b74ca83981e Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" <luke@ljones.dev> Date: Mon, 10 Oct 2022 20:03:47 +1300 Subject: [PATCH 1651/2223] ALSA: hda/realtek: Add quirk for ASUS GV601R laptop The ASUS ROG X16 (GV601R) series laptop has the same node-to-DAC pairs as early models and the G14, this includes bass speakers which are by default mapped incorrectly to the 0x06 node. Add a quirk to use the same DAC pairs as the G14. 
Signed-off-by: Luke D. Jones <luke@ljones.dev> Cc: <stable@vger.kernel.org> Link: https://lore.kernel.org/r/20221010070347.36883-1-luke@ljones.dev Signed-off-by: Takashi Iwai <tiwai@suse.de> --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 77a308a71cd42..54a0f6b4ffc77 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9423,6 +9423,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1e8e, "ASUS Zephyrus G15", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x1c52, "ASUS Zephyrus G15 2022", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x1f11, "ASUS Zephyrus G14", ALC289_FIXUP_ASUS_GA401), + SND_PCI_QUIRK(0x1043, 0x1f92, "ASUS ROG Flow X16", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x3030, "ASUS ZN270IE", ALC256_FIXUP_ASUS_AIO_GPIO2), SND_PCI_QUIRK(0x1043, 0x831a, "ASUS P901", ALC269_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x1043, 0x834a, "ASUS S101", ALC269_FIXUP_STEREO_DMIC), -- GitLab From ca5eebda3e1c1a58a1c5a337da393ed6734593e3 Mon Sep 17 00:00:00 2001 From: Brian Foster <bfoster@redhat.com> Date: Mon, 3 Oct 2022 09:35:34 -0400 Subject: [PATCH 1652/2223] block: avoid sign extend problem with default queue flags mask request_queue->queue_flags is unsigned long, which is 8-bytes on 64-bit architectures. Most queue flag modifications occur through bit field helpers, but default flags can be logically OR'd via the QUEUE_FLAG_MQ_DEFAULT mask. If this mask happens to include bit 31, the assignment can sign extend the field and set all upper 32 bits. This exact problem has been observed on a downstream kernel that happens to use bit 31 for QUEUE_FLAG_NOWAIT. This is not an immediate problem for current upstream because bit 31 is not included in the default flag assignment (and is not used at all, actually). 
Regardless, fix up the QUEUE_FLAG_MQ_DEFAULT mask definition to avoid the landmine in the future. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20221003133534.1075582-1-bfoster@redhat.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- include/linux/blkdev.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 49373d0026317..e4fb477531775 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -580,9 +580,9 @@ struct request_queue { #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ -#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_SAME_COMP) | \ - (1 << QUEUE_FLAG_NOWAIT)) +#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_IO_STAT) | \ + (1UL << QUEUE_FLAG_SAME_COMP) | \ + (1UL << QUEUE_FLAG_NOWAIT)) void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); -- GitLab From a0a6314ae774f8a5e52a599946aa2ad0db867b83 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Mon, 10 Oct 2022 15:18:57 +0200 Subject: [PATCH 1653/2223] block: fix leaking minors of hidden disks The major/minor of a hidden gendisk is not propagated to the block device because it is never registered using bdev_add. But the lack of bd_dev also causes the dynamic major minor number not to be freed. Assign bd_dev manually to ensure the dynamic major minor gets freed. Based on a patch by Keith Busch. 
Fixes: 8ddcd653257c ("block: introduce GENHD_FL_HIDDEN") Reported-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de> Tested-by: Daniel Wagner <dwagner@suse.de> Reviewed-by: Keith Busch <kbusch@kernel.org> Link: https://lore.kernel.org/r/20221010131857.748129-1-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/genhd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/genhd.c b/block/genhd.c index d6a21803a57e2..dc9b61dfb6920 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -507,6 +507,13 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, */ dev_set_uevent_suppress(ddev, 0); disk_uevent(disk, KOBJ_ADD); + } else { + /* + * Even if the block_device for a hidden gendisk is not + * registered, it needs to have a valid bd_dev so that the + * freeing of the dynamic major works. + */ + disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor); } disk_update_readahead(disk); -- GitLab From 9c1ab6d54a2e9e59b8922d12145d895e7f88b62c Mon Sep 17 00:00:00 2001 From: Leo Yan <leo.yan@linaro.org> Date: Sat, 8 Oct 2022 08:32:50 +0000 Subject: [PATCH 1654/2223] docs: ftrace: Correct access mode The documentation gives an example for opening trace marker with write-only mode, but the flag WR_ONLY is not defined by glibc. Use O_WRONLY to replace it. Signed-off-by: Leo Yan <leo.yan@linaro.org> Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org> Link: https://lore.kernel.org/r/20221008083250.3160-1-leo.yan@linaro.org Signed-off-by: Jonathan Corbet <corbet@lwn.net> --- Documentation/trace/ftrace.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index b37dc19e4d409..60bceb018d6a9 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -564,7 +564,7 @@ of ftrace. 
Here is a list of some of the key files: start:: - trace_fd = open("trace_marker", WR_ONLY); + trace_fd = open("trace_marker", O_WRONLY); Note: Writing into the trace_marker file can also initiate triggers that are written into /sys/kernel/tracing/events/ftrace/print/trigger -- GitLab From a0a6859f8330e007e014ae6f7187766786745e74 Mon Sep 17 00:00:00 2001 From: Yanteng Si <siyanteng@loongson.cn> Date: Sat, 8 Oct 2022 17:41:39 +0800 Subject: [PATCH 1655/2223] docs/zh_CN: Fix build warning Since a patch set in my translation devicetree introduce some build warnings: Warning: Documentation/translations/zh_CN/devicetree/changesets.rst references a file that doesn't exist: Documentation/Devicetree/changesets.rst ... Change the first letter of Devicetree to lowercase. Fixes: 9485acfded20 ("docs/zh_CN: add dt kernel-api translation") Fixes: f773455ce59d ("docs/zh_CN: add dt overlay-notes translation") Fixes: 5e38432db8f3 ("docs/zh_CN: add dt dynamic-resolution-notes translation") Fixes: 330f5a300548 ("docs/zh_CN: add dt changesets translation") Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Yanteng Si <siyanteng@loongson.cn> Reviewed-by: Wu XiangCheng <bobwxc@email.cn> Link: https://lore.kernel.org/r/20221008094139.314151-1-siyanteng@loongson.cn Signed-off-by: Jonathan Corbet <corbet@lwn.net> --- Documentation/translations/zh_CN/devicetree/changesets.rst | 2 +- .../translations/zh_CN/devicetree/dynamic-resolution-notes.rst | 2 +- Documentation/translations/zh_CN/devicetree/kernel-api.rst | 2 +- Documentation/translations/zh_CN/devicetree/overlay-notes.rst | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/translations/zh_CN/devicetree/changesets.rst b/Documentation/translations/zh_CN/devicetree/changesets.rst index 2ace05f3c3773..3df1b03c5695c 100644 --- a/Documentation/translations/zh_CN/devicetree/changesets.rst +++ b/Documentation/translations/zh_CN/devicetree/changesets.rst @@ -1,7 +1,7 @@ .. 
SPDX-License-Identifier: GPL-2.0 .. include:: ../disclaimer-zh_CN.rst -:Original: Documentation/Devicetree/changesets.rst +:Original: Documentation/devicetree/changesets.rst :翻译: diff --git a/Documentation/translations/zh_CN/devicetree/dynamic-resolution-notes.rst b/Documentation/translations/zh_CN/devicetree/dynamic-resolution-notes.rst index 115190341305f..6dfd946d70932 100644 --- a/Documentation/translations/zh_CN/devicetree/dynamic-resolution-notes.rst +++ b/Documentation/translations/zh_CN/devicetree/dynamic-resolution-notes.rst @@ -1,7 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0 .. include:: ../disclaimer-zh_CN.rst -:Original: Documentation/Devicetree/dynamic-resolution-notes.rst +:Original: Documentation/devicetree/dynamic-resolution-notes.rst :翻译: diff --git a/Documentation/translations/zh_CN/devicetree/kernel-api.rst b/Documentation/translations/zh_CN/devicetree/kernel-api.rst index 6aa3b685494ed..2fb729368b406 100644 --- a/Documentation/translations/zh_CN/devicetree/kernel-api.rst +++ b/Documentation/translations/zh_CN/devicetree/kernel-api.rst @@ -1,7 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0 .. include:: ../disclaimer-zh_CN.rst -:Original: Documentation/Devicetree/kernel-api.rst +:Original: Documentation/devicetree/kernel-api.rst :翻译: diff --git a/Documentation/translations/zh_CN/devicetree/overlay-notes.rst b/Documentation/translations/zh_CN/devicetree/overlay-notes.rst index 1bd482cb0a1bb..43e3c0bc5a9f8 100644 --- a/Documentation/translations/zh_CN/devicetree/overlay-notes.rst +++ b/Documentation/translations/zh_CN/devicetree/overlay-notes.rst @@ -1,7 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0 .. 
include:: ../disclaimer-zh_CN.rst -:Original: Documentation/Devicetree/overlay-notes.rst +:Original: Documentation/devicetree/overlay-notes.rst :翻译: -- GitLab From 0719fdba54836b6d7acbe7d74f81df2153a40810 Mon Sep 17 00:00:00 2001 From: Yixuan Cao <caoyixuan2019@email.szu.edu.cn> Date: Wed, 5 Oct 2022 22:55:25 +0800 Subject: [PATCH 1656/2223] Documentation/mm/page_owner.rst: delete frequently changing experimental data The kernel size changes due to many factors, such as compiler version, configuration, and the build environment. This makes size comparison figures irrelevant to reader's setup. Remove these figures and describe the effects of page owner to the kernel size in general instead. Thanks for Jonathan Corbet, Bagas Sanjaya and Mike Rapoport's constructive suggestions. Signed-off-by: Yixuan Cao <caoyixuan2019@email.szu.edu.cn> Link: https://lore.kernel.org/r/20221005145525.10359-1-caoyixuan2019@email.szu.edu.cn Signed-off-by: Jonathan Corbet <corbet@lwn.net> --- Documentation/mm/page_owner.rst | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst index f5c954afe97c7..f1efbb414ea68 100644 --- a/Documentation/mm/page_owner.rst +++ b/Documentation/mm/page_owner.rst @@ -38,22 +38,10 @@ not affect to allocation performance, especially if the static keys jump label patching functionality is available. Following is the kernel's code size change due to this facility. -- Without page owner:: - - text data bss dec hex filename - 48392 2333 644 51369 c8a9 mm/page_alloc.o - -- With page owner:: - - text data bss dec hex filename - 48800 2445 644 51889 cab1 mm/page_alloc.o - 6662 108 29 6799 1a8f mm/page_owner.o - 1025 8 8 1041 411 mm/page_ext.o - -Although, roughly, 8 KB code is added in total, page_alloc.o increase by -520 bytes and less than half of it is in hotpath. 
Building the kernel with -page owner and turning it on if needed would be great option to debug -kernel memory problem. +Although enabling page owner increases kernel size by several kilobytes, +most of this code is outside page allocator and its hot path. Building +the kernel with page owner and turning it on if needed would be great +option to debug kernel memory problem. There is one notice that is caused by implementation detail. page owner stores information into the memory from struct page extension. This memory -- GitLab From 5f5cae9b0e815c27b614e761b065129b8481821a Mon Sep 17 00:00:00 2001 From: Joel Stanley <joel@jms.id.au> Date: Wed, 5 Oct 2022 13:39:04 +1030 Subject: [PATCH 1657/2223] Documentation: ubifs: Fix compression idiom Clearly the author meant 'on the fly'. Signed-off-by: Joel Stanley <joel@jms.id.au> Link: https://lore.kernel.org/r/20221005030904.65604-1-joel@jms.id.au Signed-off-by: Jonathan Corbet <corbet@lwn.net> --- Documentation/filesystems/ubifs.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/ubifs.rst b/Documentation/filesystems/ubifs.rst index e6ee997625345..ced2f7679ddb5 100644 --- a/Documentation/filesystems/ubifs.rst +++ b/Documentation/filesystems/ubifs.rst @@ -59,7 +59,7 @@ differences. * JFFS2 is a write-through file-system, while UBIFS supports write-back, which makes UBIFS much faster on writes. -Similarly to JFFS2, UBIFS supports on-the-flight compression which makes +Similarly to JFFS2, UBIFS supports on-the-fly compression which makes it possible to fit quite a lot of data to the flash. Similarly to JFFS2, UBIFS is tolerant of unclean reboots and power-cuts. -- GitLab From 7cc395312a364777ed428257c014cb7569fe3f48 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa <akiyks@gmail.com> Date: Fri, 30 Sep 2022 11:19:36 +0900 Subject: [PATCH 1658/2223] docs/howto: Replace abundoned URL of gmane.org Somehow, there remains a link to gmane.org, which stopped working in 2016, in howto.rst. 
Replace it with the one at lore.kernel.org. Do the same changes under translations/ as well. Signed-off-by: Akira Yokosawa <akiyks@gmail.com> Cc: Federico Vaga <federico.vaga@vaga.pv.it> Cc: Alex Shi <alexs@kernel.org> Cc: Yanteng Si <siyanteng@loongson.cn> Cc: Hu Haowen <src.res@email.cn> Reviewed-by: Alex Shi <alexs@kernel.org> Link: https://lore.kernel.org/r/20220930021936.26238-1-akiyks@gmail.com Signed-off-by: Jonathan Corbet <corbet@lwn.net> --- Documentation/process/howto.rst | 2 +- Documentation/translations/it_IT/process/howto.rst | 2 +- Documentation/translations/ja_JP/howto.rst | 2 +- Documentation/translations/ko_KR/howto.rst | 2 +- Documentation/translations/zh_CN/process/howto.rst | 2 +- Documentation/translations/zh_TW/process/howto.rst | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/process/howto.rst b/Documentation/process/howto.rst index cd6997a9d2032..bd15c393ba3cd 100644 --- a/Documentation/process/howto.rst +++ b/Documentation/process/howto.rst @@ -379,7 +379,7 @@ to subscribe and unsubscribe from the list can be found at: There are archives of the mailing list on the web in many different places. Use a search engine to find these archives. For example: - http://dir.gmane.org/gmane.linux.kernel + https://lore.kernel.org/lkml/ It is highly recommended that you search the archives about the topic you want to bring up, before you post it to the list. A lot of things diff --git a/Documentation/translations/it_IT/process/howto.rst b/Documentation/translations/it_IT/process/howto.rst index 16ad5622d5495..15c08aea1dfea 100644 --- a/Documentation/translations/it_IT/process/howto.rst +++ b/Documentation/translations/it_IT/process/howto.rst @@ -394,7 +394,7 @@ trovati al sito: Ci sono diversi archivi della lista di discussione. Usate un qualsiasi motore di ricerca per trovarli. 
Per esempio: - http://dir.gmane.org/gmane.linux.kernel + https://lore.kernel.org/lkml/ É caldamente consigliata una ricerca in questi archivi sul tema che volete sollevare, prima di pubblicarlo sulla lista. Molte cose sono già state diff --git a/Documentation/translations/ja_JP/howto.rst b/Documentation/translations/ja_JP/howto.rst index 649e2ff2a407e..b47a682d8dedc 100644 --- a/Documentation/translations/ja_JP/howto.rst +++ b/Documentation/translations/ja_JP/howto.rst @@ -410,7 +410,7 @@ https://bugzilla.kernel.org に行ってください。もし今後のバグレ このメーリングリストのアーカイブは web 上の多数の場所に存在します。こ れらのアーカイブを探すにはサーチエンジンを使いましょう。例えば- - http://dir.gmane.org/gmane.linux.kernel + https://lore.kernel.org/lkml/ リストに投稿する前にすでにその話題がアーカイブに存在するかどうかを検索 することを是非やってください。多数の事がすでに詳細に渡って議論されてお diff --git a/Documentation/translations/ko_KR/howto.rst b/Documentation/translations/ko_KR/howto.rst index e43970584ca4d..df53fafd1b10a 100644 --- a/Documentation/translations/ko_KR/howto.rst +++ b/Documentation/translations/ko_KR/howto.rst @@ -386,7 +386,7 @@ https://bugzilla.kernel.org 를 체크하고자 할 수도 있다; 소수의 커 웹상의 많은 다른 곳에도 메일링 리스트의 아카이브들이 있다. 이러한 아카이브들을 찾으려면 검색 엔진을 사용하라. 예를 들어: - http://dir.gmane.org/gmane.linux.kernel + https://lore.kernel.org/lkml/ 여러분이 새로운 문제에 관해 리스트에 올리기 전에 말하고 싶은 주제에 관한 것을 아카이브에서 먼저 찾아보기를 강력히 권장한다. 
이미 상세하게 토론된 많은 diff --git a/Documentation/translations/zh_CN/process/howto.rst b/Documentation/translations/zh_CN/process/howto.rst index 1455190dc087a..5bf953146929f 100644 --- a/Documentation/translations/zh_CN/process/howto.rst +++ b/Documentation/translations/zh_CN/process/howto.rst @@ -306,7 +306,7 @@ bugzilla.kernel.org是Linux内核开发者们用来跟踪内核Bug的网站。 网上很多地方都有这个邮件列表的存档(archive)。可以使用搜索引擎来找到这些 存档。比如: - http://dir.gmane.org/gmane.linux.kernel + https://lore.kernel.org/lkml/ 在发信之前,我们强烈建议你先在存档中搜索你想要讨论的问题。很多已经被详细 讨论过的问题只在邮件列表的存档中可以找到。 diff --git a/Documentation/translations/zh_TW/process/howto.rst b/Documentation/translations/zh_TW/process/howto.rst index 68ae4411285b8..86b0d4c6d6f97 100644 --- a/Documentation/translations/zh_TW/process/howto.rst +++ b/Documentation/translations/zh_TW/process/howto.rst @@ -309,7 +309,7 @@ bugzilla.kernel.org是Linux內核開發者們用來跟蹤內核Bug的網站。 網上很多地方都有這個郵件列表的存檔(archive)。可以使用搜尋引擎來找到這些 存檔。比如: - http://dir.gmane.org/gmane.linux.kernel + https://lore.kernel.org/lkml/ 在發信之前,我們強烈建議你先在存檔中搜索你想要討論的問題。很多已經被詳細 討論過的問題只在郵件列表的存檔中可以找到。 -- GitLab From 84bed8a8bf3133ace08d6fb62a4d14d129f6ff8e Mon Sep 17 00:00:00 2001 From: Yanteng Si <siyanteng@loongson.cn> Date: Wed, 28 Sep 2022 18:31:29 +0800 Subject: [PATCH 1659/2223] docs/zh_CN: Update the translation of ksm to 6.0-rc7 Update to commit bc6a2828a963 ("ksm: add the ksm prefix to the names of the ksm private structures") Signed-off-by: Yanteng Si <siyanteng@loongson.cn> Reviewed-by: Alex Shi <alexs@kernel.org> Link: https://lore.kernel.org/r/60017007349357dc1fd8fa849a5ddb5672f8ab5b.1664360331.git.siyanteng@loongson.cn Signed-off-by: Jonathan Corbet <corbet@lwn.net> --- Documentation/translations/zh_CN/mm/ksm.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/translations/zh_CN/mm/ksm.rst b/Documentation/translations/zh_CN/mm/ksm.rst index d1f82e857ad72..f0f458753d0cd 100644 --- a/Documentation/translations/zh_CN/mm/ksm.rst +++ b/Documentation/translations/zh_CN/mm/ksm.rst @@ -30,7 
+30,7 @@ KSM的用户空间的接口在Documentation/translations/zh_CN/admin-guide/mm/ks KSM维护着稳定树中的KSM页的逆映射信息。 当KSM页面的共享数小于 ``max_page_sharing`` 的虚拟内存区域(VMAs)时,则代表了 -KSM页的稳定树其中的节点指向了一个rmap_item结构体类型的列表。同时,这个KSM页 +KSM页的稳定树其中的节点指向了一个ksm_rmap_item结构体类型的列表。同时,这个KSM页 的 ``page->mapping`` 指向了该稳定树节点。 如果共享数超过了阈值,KSM将给稳定树添加第二个维度。稳定树就变成链接一个或多 -- GitLab From 4e3ce6d04da3d1058ad887000440e81ce34c0149 Mon Sep 17 00:00:00 2001 From: Yanteng Si <siyanteng@loongson.cn> Date: Wed, 28 Sep 2022 18:31:30 +0800 Subject: [PATCH 1660/2223] docs/zh_CN: Update the translation of page_owner to 6.0-rc7 1)Update to commit 8f0efa81dfbc ("mm/page_owner.c: add llseek for page_owner") 2)Translate some words into Chinese. Signed-off-by: Yanteng Si <siyanteng@loongson.cn> Reviewed-by: Alex Shi <alexs@kernel.org> Link: https://lore.kernel.org/r/52bc8df87618af951b34759487f05775416cb4d4.1664360331.git.siyanteng@loongson.cn Signed-off-by: Jonathan Corbet <corbet@lwn.net> --- Documentation/translations/zh_CN/mm/page_owner.rst | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Documentation/translations/zh_CN/mm/page_owner.rst b/Documentation/translations/zh_CN/mm/page_owner.rst index b7f81d7a6589c..21a6a0837d42a 100644 --- a/Documentation/translations/zh_CN/mm/page_owner.rst +++ b/Documentation/translations/zh_CN/mm/page_owner.rst @@ -74,15 +74,19 @@ page owner在默认情况下是禁用的。所以,如果你想使用它,你 cat /sys/kernel/debug/page_owner > page_owner_full.txt ./page_owner_sort page_owner_full.txt sorted_page_owner.txt - ``page_owner_full.txt`` 的一般输出情况如下(输出信息无翻译价值):: + ``page_owner_full.txt`` 的一般输出情况如下:: Page allocated via order XXX, ... PFN XXX ... - // Detailed stack + // 栈详情 Page allocated via order XXX, ... PFN XXX ... 
- // Detailed stack + // 栈详情 + 默认情况下,它将以一个给定的pfn开始,做完整的pfn转储,且page_owner支持fseek。 + + FILE *fp = fopen("/sys/kernel/debug/page_owner", "r"); + fseek(fp, pfn_start, SEEK_SET); ``page_owner_sort`` 工具忽略了 ``PFN`` 行,将剩余的行放在buf中,使用regexp提 取页序值,计算buf的次数和页数,最后根据参数进行排序。 -- GitLab From 46307fd6e27a3f678a1678b02e667678c22aa8cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= <mkoutny@suse.com> Date: Mon, 10 Oct 2022 10:29:18 +0200 Subject: [PATCH 1661/2223] cgroup: Reorganize css_set_lock and kernfs path processing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 74e4b956eb1c incorrectly wrapped kernfs_walk_and_get (might_sleep) under css_set_lock (spinlock). css_set_lock is needed by __cset_cgroup_from_root to ensure stable cset->cgrp_links but not for kernfs_walk_and_get. We only need to make sure that the returned root_cgrp won't be freed under us. This is given in the case of global root because it is static (cgrp_dfl_root.cgrp). When the root_cgrp is lower in the hierarchy, it is pinned by cgroup_ns->root_cset (and `current` task cannot switch namespace asynchronously so ns_proxy pins cgroup_ns). Note this reasoning won't hold for root cgroups in v1 hierarchies, therefore create a special-cased helper function just for the default hierarchy. 
Fixes: 74e4b956eb1c ("cgroup: Honor caller's cgroup NS when resolving path") Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: Michal Koutný <mkoutny@suse.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cgroup/cgroup.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 764bdd5fd8d14..ecf409e3c3a72 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1392,6 +1392,9 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_free_root(root); } +/* + * Returned cgroup is without refcount but it's valid as long as cset pins it. + */ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { @@ -1403,6 +1406,7 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, res_cgroup = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; + lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -1414,6 +1418,7 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, } } + BUG_ON(!res_cgroup); return res_cgroup; } @@ -1436,23 +1441,36 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) rcu_read_unlock(); - BUG_ON(!res); return res; } +/* + * Look up cgroup associated with current task's cgroup namespace on the default + * hierarchy. + * + * Unlike current_cgns_cgroup_from_root(), this doesn't need locks: + * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu + * pointers. + * - css_set_lock is not needed because we just read cset->dfl_cgrp. + * - As a bonus returned cgrp is pinned with the current because it cannot + * switch cgroup_ns asynchronously. 
+ */ +static struct cgroup *current_cgns_cgroup_dfl(void) +{ + struct css_set *cset; + + cset = current->nsproxy->cgroup_ns->root_cset; + return __cset_cgroup_from_root(cset, &cgrp_dfl_root); +} + /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { - struct cgroup *res = NULL; - lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); - res = __cset_cgroup_from_root(cset, root); - - BUG_ON(!res); - return res; + return __cset_cgroup_from_root(cset, root); } /* @@ -6105,9 +6123,7 @@ struct cgroup *cgroup_get_from_id(u64 id) if (!cgrp) return ERR_PTR(-ENOENT); - spin_lock_irq(&css_set_lock); - root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); + root_cgrp = current_cgns_cgroup_dfl(); if (!cgroup_is_descendant(cgrp, root_cgrp)) { cgroup_put(cgrp); return ERR_PTR(-ENOENT); @@ -6686,10 +6702,8 @@ struct cgroup *cgroup_get_from_path(const char *path) struct cgroup *cgrp = ERR_PTR(-ENOENT); struct cgroup *root_cgrp; - spin_lock_irq(&css_set_lock); - root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root); + root_cgrp = current_cgns_cgroup_dfl(); kn = kernfs_walk_and_get(root_cgrp->kn, path); - spin_unlock_irq(&css_set_lock); if (!kn) goto out; -- GitLab From 03db7716159477b595e9af01be8003b7e994cc79 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 10 Oct 2022 11:08:17 -1000 Subject: [PATCH 1662/2223] Revert "cgroup: enable cgroup_get_from_file() on cgroup1" This reverts commit f3a2aebdd6fb90e444d595e46de64e822af419da. The commit enabled looking up v1 cgroups via cgroup_get_from_file(). However, there are multiple users, including CLONE_INTO_CGROUP, which have been assuming that it would only look up v2 cgroups. Returning v1 cgroups breaks them. Let's revert the commit and retry later with a separate lookup interface which allows both v1 and v2. 
Signed-off-by: Tejun Heo <tj@kernel.org> Link: http://lkml.kernel.org/r/000000000000385cbf05ea3f1862@google.com Cc: Yosry Ahmed <yosryahmed@google.com> --- kernel/cgroup/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index ecf409e3c3a72..6d8a5a40c24d8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6234,6 +6234,11 @@ static struct cgroup *cgroup_get_from_file(struct file *f) return ERR_CAST(css); cgrp = css->cgroup; + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + return cgrp; } -- GitLab From 6094b9136ca9038b61e9c4b5d25cd5512ce50b34 Mon Sep 17 00:00:00 2001 From: Shirish S <shirish.s@amd.com> Date: Fri, 7 Oct 2022 20:31:49 +0530 Subject: [PATCH 1663/2223] drm/amd/display: explicitly disable psr_feature_enable appropriately [Why] If psr_feature_enable is set to true by default, it continues to be enabled for non capable links. [How] explicitly disable the feature on links that are not capable of the same. 
Fixes: 8c322309e48e9 ("drm/amd/display: Enable PSR") Signed-off-by: Shirish S <shirish.s@amd.com> Reviewed-by: Leo Li <sunpeng.li@amd.com> Reviewed-by: Mario Limonciello <mario.limonciello@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org # 5.15+ --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c index 8ca10ab3dfc12..26291db0a3cf6 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c @@ -60,11 +60,15 @@ static bool link_supports_psrsu(struct dc_link *link) */ void amdgpu_dm_set_psr_caps(struct dc_link *link) { - if (!(link->connector_signal & SIGNAL_TYPE_EDP)) + if (!(link->connector_signal & SIGNAL_TYPE_EDP)) { + link->psr_settings.psr_feature_enabled = false; return; + } - if (link->type == dc_connection_none) + if (link->type == dc_connection_none) { + link->psr_settings.psr_feature_enabled = false; return; + } if (link->dpcd_caps.psr_info.psr_version == 0) { link->psr_settings.psr_version = DC_PSR_VERSION_UNSUPPORTED; -- GitLab From 4f5bdde386d3b8e9317df5562950e1b4fa177599 Mon Sep 17 00:00:00 2001 From: Nicholas Kazlauskas <nicholas.kazlauskas@amd.com> Date: Fri, 9 Sep 2022 15:24:55 -0400 Subject: [PATCH 1664/2223] drm/amd/display: Update PMFW z-state interface for DCN314 [Why] Request from PMFW to change the messaging format to specify whether we support z-state via individual bits. [How] Update the args we pass in the support message. 
Fixes: d5c6909e7460 ("drm/amd/display: Add DCN314 clock manager") Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Reviewed-by: Charlene Liu <Charlene.Liu@amd.com> Reviewed-by: Mario Limonciello <mario.limonciello@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Nicholas Kazlauskas <nicholas.kazlauskas@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org # 6.0 --- .../drm/amd/display/dc/clk_mgr/dcn314/dcn314_smu.c | 11 +++-------- .../gpu/drm/amd/display/dc/dcn314/dcn314_resource.c | 3 ++- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_smu.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_smu.c index 897105d1c111e..ef0795b14a1fd 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_smu.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_smu.c @@ -339,29 +339,24 @@ void dcn314_smu_set_zstate_support(struct clk_mgr_internal *clk_mgr, enum dcn_zs if (!clk_mgr->smu_present) return; - if (!clk_mgr->base.ctx->dc->debug.enable_z9_disable_interface && - (support == DCN_ZSTATE_SUPPORT_ALLOW_Z10_ONLY)) - support = DCN_ZSTATE_SUPPORT_DISALLOW; - - // Arg[15:0] = 8/9/0 for Z8/Z9/disallow -> existing bits // Arg[16] = Disallow Z9 -> new bit switch (support) { case DCN_ZSTATE_SUPPORT_ALLOW: msg_id = VBIOSSMC_MSG_AllowZstatesEntry; - param = 9; + param = (1 << 10) | (1 << 9) | (1 << 8); break; case DCN_ZSTATE_SUPPORT_DISALLOW: msg_id = VBIOSSMC_MSG_AllowZstatesEntry; - param = 8; + param = 0; break; case DCN_ZSTATE_SUPPORT_ALLOW_Z10_ONLY: msg_id = VBIOSSMC_MSG_AllowZstatesEntry; - param = 0x00010008; + param = (1 << 10); break; default: //DCN_ZSTATE_SUPPORT_UNKNOWN diff --git a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c index 70b647b9b4d37..d0ad72caead28 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c +++ 
b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c @@ -881,7 +881,8 @@ static const struct dc_plane_cap plane_cap = { }; static const struct dc_debug_options debug_defaults_drv = { - .disable_z10 = true, /*hw not support it*/ + .disable_z10 = false, + .enable_z9_disable_interface = true, .disable_dmcu = true, .force_abm_enable = false, .timing_trace = false, -- GitLab From 99243fd1f3ca40d487209ac76241de0478962a9d Mon Sep 17 00:00:00 2001 From: Dillon Varone <Dillon.Varone@amd.com> Date: Tue, 27 Sep 2022 12:35:10 -0400 Subject: [PATCH 1665/2223] Revert "drm/amd/display: skip commit minimal transition state" This reverts commit e4e481e4d838f30985dd46d43ed195110ed265f5. [Why & How] The reverted commit creates memory leak and causes issue upon driver install. Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Reviewed-by: Martin Leung <Martin.Leung@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Dillon Varone <Dillon.Varone@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/core/dc.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 40a34b600c8ee..b5058a2ce7e88 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -3650,30 +3650,10 @@ static bool commit_minimal_transition_state(struct dc *dc, bool temp_subvp_policy; enum dc_status ret = DC_ERROR_UNEXPECTED; unsigned int i, j; - unsigned int pipe_in_use = 0; if (!transition_context) return false; - /* check current pipes in use*/ - for (i = 0; i < dc->res_pool->pipe_count; i++) { - struct pipe_ctx *pipe = &transition_base_context->res_ctx.pipe_ctx[i]; - - if (pipe->plane_state) - pipe_in_use++; - } - - /* When the OS add a new surface if we have been used all of pipes with odm combine - * and mpc split feature, it need use commit_minimal_transition_state to transition safely. 
- * After OS exit MPO, it will back to use odm and mpc split with all of pipes, we need - * call it again. Otherwise return true to skip. - * - * Reduce the scenarios to use dc_commit_state_no_check in the stage of flip. Especially - * enter/exit MPO when DCN still have enough resources. - */ - if (pipe_in_use != dc->res_pool->pipe_count) - return true; - if (!dc->config.is_vmin_only_asic) { tmp_mpc_policy = dc->debug.pipe_split_policy; dc->debug.pipe_split_policy = MPC_SPLIT_AVOID; -- GitLab From eae2331899f9dcc923d37d1d753f2de847c92359 Mon Sep 17 00:00:00 2001 From: Vladimir Stempen <vladimir.stempen@amd.com> Date: Thu, 22 Sep 2022 15:03:05 -0400 Subject: [PATCH 1666/2223] drm/amd/display: properly configure DCFCLK when enable/disable Freesync [Why] Bandwidth validation is using Freesync parameters from previous Freesync state. Bandwidth validation ignores DCFCLK calculated after Freesync parameters are configured [How] Set Freesync bandwidth parameters to its default state before running bandwidth validation. Take DCFCLK calculated after Freesync bandwidth parameters are assigned and bandwidth is recalculated. 
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Reviewed-by: Martin Leung <Martin.Leung@amd.com> Reviewed-by: Nevenko Stupar <Nevenko.Stupar@amd.com> Reviewed-by: Jun Lei <Jun.Lei@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Vladimir Stempen <vladimir.stempen@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c | 7 +++++++ drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c | 7 ++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c index 9585b25f10e52..a88dd7b3d1c10 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c @@ -1805,6 +1805,13 @@ bool dcn32_validate_bandwidth(struct dc *dc, int pipe_cnt = 0; display_e2e_pipe_params_st *pipes = kzalloc(dc->res_pool->pipe_count * sizeof(display_e2e_pipe_params_st), GFP_KERNEL); struct mall_temp_config mall_temp_config; + + /* To handle Freesync properly, setting FreeSync DML parameters + * to its default state for the first stage of validation + */ + context->bw_ctx.bw.dcn.clk.fw_based_mclk_switching = false; + context->bw_ctx.dml.soc.dram_clock_change_requirement_final = true; + DC_LOGGER_INIT(dc->ctx->logger); /* For fast validation, there are situations where a shallow copy of diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c index 6bdd509d292a6..819de0f110126 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c @@ -1769,6 +1769,7 @@ void dcn32_calculate_wm_and_dlg_fpu(struct dc *dc, struct dc_state *context, int i, pipe_idx, vlevel_temp = 0; double dcfclk = dcn3_2_soc.clock_limits[0].dcfclk_mhz; double dcfclk_from_validation = 
context->bw_ctx.dml.vba.DCFCLKState[vlevel][context->bw_ctx.dml.vba.maxMpcComb]; + double dcfclk_from_fw_based_mclk_switching = dcfclk_from_validation; bool pstate_en = context->bw_ctx.dml.vba.DRAMClockChangeSupport[vlevel][context->bw_ctx.dml.vba.maxMpcComb] != dm_dram_clock_change_unsupported; unsigned int dummy_latency_index = 0; @@ -1804,7 +1805,7 @@ void dcn32_calculate_wm_and_dlg_fpu(struct dc *dc, struct dc_state *context, dc->clk_mgr->bw_params->wm_table.nv_entries[WM_A].dml_input.pstate_latency_us; dcn32_internal_validate_bw(dc, context, pipes, &pipe_cnt, &vlevel, false); maxMpcComb = context->bw_ctx.dml.vba.maxMpcComb; - dcfclk = context->bw_ctx.dml.vba.DCFCLKState[vlevel][context->bw_ctx.dml.vba.maxMpcComb]; + dcfclk_from_fw_based_mclk_switching = context->bw_ctx.dml.vba.DCFCLKState[vlevel][context->bw_ctx.dml.vba.maxMpcComb]; pstate_en = context->bw_ctx.dml.vba.DRAMClockChangeSupport[vlevel][maxMpcComb] != dm_dram_clock_change_unsupported; } @@ -1890,6 +1891,10 @@ void dcn32_calculate_wm_and_dlg_fpu(struct dc *dc, struct dc_state *context, pipes[0].clks_cfg.dcfclk_mhz = dcfclk_from_validation; pipes[0].clks_cfg.socclk_mhz = context->bw_ctx.dml.soc.clock_limits[vlevel].socclk_mhz; + if (context->bw_ctx.bw.dcn.clk.fw_based_mclk_switching) { + pipes[0].clks_cfg.dcfclk_mhz = dcfclk_from_fw_based_mclk_switching; + } + if (dc->clk_mgr->bw_params->wm_table.nv_entries[WM_C].valid) { min_dram_speed_mts = context->bw_ctx.dml.vba.DRAMSpeed; min_dram_speed_mts_margin = 160; -- GitLab From 5ff32b52995155f91de582124485d0f0f8881363 Mon Sep 17 00:00:00 2001 From: Martin Leung <Martin.Leung@amd.com> Date: Tue, 27 Sep 2022 18:13:38 -0400 Subject: [PATCH 1667/2223] drm/amd/display: zeromem mypipe heap struct before using it [Why & How] bug was caused when moving variable from stack to heap because it was reusable and garbage was left over, so we need to zero mem Fixes: 7acc487ab57e ("drm/amd/display: reduce stack size in dcn32 dml (v2)") Tested-by: Daniel Wheeler 
<daniel.wheeler@amd.com> Reviewed-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Martin Leung <Martin.Leung@amd.com> Cc: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c index 11d5750e15afe..5b91660a6496b 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c @@ -733,6 +733,8 @@ static void DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerforman mode_lib->vba.FCLKChangeLatency, v->UrgentLatency, mode_lib->vba.SREnterPlusExitTime); + memset(&v->dummy_vars.DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerformanceCalculation.myPipe, 0, sizeof(DmlPipe)); + v->dummy_vars.DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerformanceCalculation.myPipe.Dppclk = mode_lib->vba.DPPCLK[k]; v->dummy_vars.DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerformanceCalculation.myPipe.Dispclk = mode_lib->vba.DISPCLK; v->dummy_vars.DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerformanceCalculation.myPipe.PixelClock = mode_lib->vba.PixelClock[k]; -- GitLab From 2fd23d467d4fb4e9bb3c3758ee49799f690f5f72 Mon Sep 17 00:00:00 2001 From: Josip Pavic <Josip.Pavic@amd.com> Date: Fri, 23 Sep 2022 15:29:07 -0400 Subject: [PATCH 1668/2223] drm/amd/display: do not compare integers of different widths [Why & How] Increase width of some variables to avoid comparing integers of different widths Reviewed-by: Alvin Lee <Alvin.Lee2@amd.com> Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Josip Pavic 
<Josip.Pavic@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c index 955ca273cfe1e..426b07edb4267 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c @@ -206,8 +206,7 @@ static bool dcn32_check_no_memory_request_for_cab(struct dc *dc) */ static uint32_t dcn32_calculate_cab_allocation(struct dc *dc, struct dc_state *ctx) { - uint8_t i; - int j; + int i, j; struct dc_stream_state *stream = NULL; struct dc_plane_state *plane = NULL; uint32_t cursor_size = 0; -- GitLab From c19d3eace484ca5627817a1de85af1de06d538b6 Mon Sep 17 00:00:00 2001 From: Dillon Varone <Dillon.Varone@amd.com> Date: Wed, 28 Sep 2022 16:33:47 -0400 Subject: [PATCH 1669/2223] drm/amd/display: Use correct pixel clock to program DTBCLK DTO's [Why?] Currently phy_pix_clk is used to program DTO's which is incorrect. [How?] Use the timing pixel clock to program DTO's correctly. 
Reviewed-by: Martin Leung <Martin.Leung@amd.com> Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Dillon Varone <Dillon.Varone@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/link/link_hwss_hpo_dp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/link/link_hwss_hpo_dp.c b/drivers/gpu/drm/amd/display/dc/link/link_hwss_hpo_dp.c index 7d3147175ca21..153a88381f2c7 100644 --- a/drivers/gpu/drm/amd/display/dc/link/link_hwss_hpo_dp.c +++ b/drivers/gpu/drm/amd/display/dc/link/link_hwss_hpo_dp.c @@ -111,7 +111,7 @@ static void setup_hpo_dp_stream_encoder(struct pipe_ctx *pipe_ctx) enum phyd32clk_clock_source phyd32clk = get_phyd32clk_src(pipe_ctx->stream->link); dto_params.otg_inst = tg->inst; - dto_params.pixclk_khz = pipe_ctx->stream->phy_pix_clk; + dto_params.pixclk_khz = pipe_ctx->stream->timing.pix_clk_100hz / 10; dto_params.num_odm_segments = get_odm_segment_count(pipe_ctx); dto_params.timing = &pipe_ctx->stream->timing; dto_params.ref_dtbclk_khz = dc->clk_mgr->funcs->get_dtb_ref_clk_frequency(dc->clk_mgr); -- GitLab From e50f67cf5e168d92e24cfb61fb11f2f0a35708cd Mon Sep 17 00:00:00 2001 From: Aurabindo Pillai <aurabindo.pillai@amd.com> Date: Thu, 29 Sep 2022 11:15:12 -0400 Subject: [PATCH 1670/2223] drm/amd/display: Do not trigger timing sync for phantom pipes [Why&How] Doing timing sync seqence for phantom pipes will not go through since they are not fully programmed like normal pipes. 
Skip the sequence on such pipes Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Reviewed-by: Alvin Lee <Alvin.Lee2@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Aurabindo Pillai <aurabindo.pillai@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../amd/display/dc/dcn10/dcn10_hw_sequencer.c | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c index f4b3ec32a331b..305e0c5453745 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c @@ -2244,6 +2244,9 @@ void dcn10_enable_timing_synchronization( DC_SYNC_INFO("Setting up OTG reset trigger\n"); for (i = 1; i < group_size; i++) { + if (grouped_pipes[i]->stream && grouped_pipes[i]->stream->mall_stream_config.type == SUBVP_PHANTOM) + continue; + opp = grouped_pipes[i]->stream_res.opp; tg = grouped_pipes[i]->stream_res.tg; tg->funcs->get_otg_active_size(tg, &width, &height); @@ -2254,13 +2257,21 @@ void dcn10_enable_timing_synchronization( for (i = 0; i < group_size; i++) { if (grouped_pipes[i]->stream == NULL) continue; + + if (grouped_pipes[i]->stream && grouped_pipes[i]->stream->mall_stream_config.type == SUBVP_PHANTOM) + continue; + grouped_pipes[i]->stream->vblank_synchronized = false; } - for (i = 1; i < group_size; i++) + for (i = 1; i < group_size; i++) { + if (grouped_pipes[i]->stream && grouped_pipes[i]->stream->mall_stream_config.type == SUBVP_PHANTOM) + continue; + grouped_pipes[i]->stream_res.tg->funcs->enable_reset_trigger( grouped_pipes[i]->stream_res.tg, grouped_pipes[0]->stream_res.tg->inst); + } DC_SYNC_INFO("Waiting for trigger\n"); @@ -2268,12 +2279,21 @@ void dcn10_enable_timing_synchronization( * synchronized. Look at last pipe programmed to reset. 
*/ - wait_for_reset_trigger_to_occur(dc_ctx, grouped_pipes[1]->stream_res.tg); - for (i = 1; i < group_size; i++) + if (grouped_pipes[1]->stream && grouped_pipes[1]->stream->mall_stream_config.type != SUBVP_PHANTOM) + wait_for_reset_trigger_to_occur(dc_ctx, grouped_pipes[1]->stream_res.tg); + + for (i = 1; i < group_size; i++) { + if (grouped_pipes[i]->stream && grouped_pipes[i]->stream->mall_stream_config.type == SUBVP_PHANTOM) + continue; + grouped_pipes[i]->stream_res.tg->funcs->disable_reset_trigger( grouped_pipes[i]->stream_res.tg); + } for (i = 1; i < group_size; i++) { + if (grouped_pipes[i]->stream && grouped_pipes[i]->stream->mall_stream_config.type == SUBVP_PHANTOM) + continue; + opp = grouped_pipes[i]->stream_res.opp; tg = grouped_pipes[i]->stream_res.tg; tg->funcs->get_otg_active_size(tg, &width, &height); -- GitLab From fa28030a83a6302f8724cdbf0c477536b2101033 Mon Sep 17 00:00:00 2001 From: Vladimir Stempen <vladimir.stempen@amd.com> Date: Thu, 29 Sep 2022 13:32:50 -0400 Subject: [PATCH 1671/2223] drm/amd/display: increase hardware status wait time [Why] Diagnostics reports exceptions generated when timeout waiting for DISPCLK frequency divider change expires when testing ODM4to1. Diagnostics reports exceptions generated when timeout waiting for OTG busy status expires when disabling OTG during ODM4to1 test. [How] Increase HW status waiting time for DISPCLK frequency divider change and OTG busy status when disable OTG. 
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Reviewed-by: Ariel Bernstein <Eric.Bernstein@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Vladimir Stempen <vladimir.stempen@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/clk_mgr/dcn20/dcn20_clk_mgr.c | 4 ++-- drivers/gpu/drm/amd/display/dc/dcn32/dcn32_optc.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn20/dcn20_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn20/dcn20_clk_mgr.c index 0d30d1d9d67e9..650f3b4b562e9 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn20/dcn20_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn20/dcn20_clk_mgr.c @@ -179,7 +179,7 @@ void dcn20_update_clocks_update_dentist(struct clk_mgr_internal *clk_mgr, struct } else if (dispclk_wdivider == 127 && current_dispclk_wdivider != 127) { REG_UPDATE(DENTIST_DISPCLK_CNTL, DENTIST_DISPCLK_WDIVIDER, 126); - REG_WAIT(DENTIST_DISPCLK_CNTL, DENTIST_DISPCLK_CHG_DONE, 1, 50, 100); + REG_WAIT(DENTIST_DISPCLK_CNTL, DENTIST_DISPCLK_CHG_DONE, 1, 50, 2000); for (i = 0; i < clk_mgr->base.ctx->dc->res_pool->pipe_count; i++) { struct pipe_ctx *pipe_ctx = &context->res_ctx.pipe_ctx[i]; struct dccg *dccg = clk_mgr->base.ctx->dc->res_pool->dccg; @@ -206,7 +206,7 @@ void dcn20_update_clocks_update_dentist(struct clk_mgr_internal *clk_mgr, struct REG_UPDATE(DENTIST_DISPCLK_CNTL, DENTIST_DISPCLK_WDIVIDER, dispclk_wdivider); - REG_WAIT(DENTIST_DISPCLK_CNTL, DENTIST_DISPCLK_CHG_DONE, 1, 50, 1000); + REG_WAIT(DENTIST_DISPCLK_CNTL, DENTIST_DISPCLK_CHG_DONE, 1, 50, 2000); REG_UPDATE(DENTIST_DISPCLK_CNTL, DENTIST_DPPCLK_WDIVIDER, dppclk_wdivider); REG_WAIT(DENTIST_DISPCLK_CNTL, DENTIST_DPPCLK_CHG_DONE, 1, 5, 100); diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_optc.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_optc.c index ec3989d370861..2b33eeb213e2a 100644 --- 
a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_optc.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_optc.c @@ -151,7 +151,7 @@ static bool optc32_disable_crtc(struct timing_generator *optc) /* CRTC disabled, so disable clock. */ REG_WAIT(OTG_CLOCK_CONTROL, OTG_BUSY, 0, - 1, 100000); + 1, 150000); return true; } -- GitLab From 20dad3813b3c15d118bda0496711eb7dff98e74a Mon Sep 17 00:00:00 2001 From: Jun Lei <jun.lei@amd.com> Date: Thu, 29 Sep 2022 15:47:31 -0400 Subject: [PATCH 1672/2223] drm/amd/display: Add a helper to map ODM/MPC/Multi-Plane resources [Why & How] Add a helper to map ODM/MPC/Multi-Plane resources from DC Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Reviewed-by: Nevenko Stupar <Nevenko.Stupar@amd.com> Reviewed-by: Chaitanya Dhere <chaitanya.dhere@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Jun Lei <jun.lei@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../gpu/drm/amd/display/dc/core/dc_resource.c | 49 ++++++++++++++++++- drivers/gpu/drm/amd/display/dc/dc.h | 2 + .../drm/amd/display/dc/dml/dcn32/dcn32_fpu.c | 2 +- .../gpu/drm/amd/display/dc/inc/core_types.h | 4 ++ drivers/gpu/drm/amd/display/dc/inc/resource.h | 6 +++ 5 files changed, 61 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c index 8ee0d946bb2f0..4a6e867369b84 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c @@ -1747,7 +1747,6 @@ bool dc_remove_plane_from_context( for (i = 0; i < stream_status->plane_count; i++) { if (stream_status->plane_states[i] == plane_state) { - dc_plane_state_release(stream_status->plane_states[i]); break; } @@ -3683,4 +3682,52 @@ bool is_h_timing_divisible_by_2(struct dc_stream_state *stream) (stream->timing.h_sync_width % 2 == 0); } return divisible; +} + +bool dc_resource_acquire_secondary_pipe_for_mpc_odm( + const struct dc *dc, + struct 
dc_state *state, + struct pipe_ctx *pri_pipe, + struct pipe_ctx *sec_pipe, + bool odm) +{ + int pipe_idx = sec_pipe->pipe_idx; + struct pipe_ctx *sec_top, *sec_bottom, *sec_next, *sec_prev; + const struct resource_pool *pool = dc->res_pool; + + sec_top = sec_pipe->top_pipe; + sec_bottom = sec_pipe->bottom_pipe; + sec_next = sec_pipe->next_odm_pipe; + sec_prev = sec_pipe->prev_odm_pipe; + + *sec_pipe = *pri_pipe; + + sec_pipe->top_pipe = sec_top; + sec_pipe->bottom_pipe = sec_bottom; + sec_pipe->next_odm_pipe = sec_next; + sec_pipe->prev_odm_pipe = sec_prev; + + sec_pipe->pipe_idx = pipe_idx; + sec_pipe->plane_res.mi = pool->mis[pipe_idx]; + sec_pipe->plane_res.hubp = pool->hubps[pipe_idx]; + sec_pipe->plane_res.ipp = pool->ipps[pipe_idx]; + sec_pipe->plane_res.xfm = pool->transforms[pipe_idx]; + sec_pipe->plane_res.dpp = pool->dpps[pipe_idx]; + sec_pipe->plane_res.mpcc_inst = pool->dpps[pipe_idx]->inst; + sec_pipe->stream_res.dsc = NULL; + if (odm) { + if (!sec_pipe->top_pipe) + sec_pipe->stream_res.opp = pool->opps[pipe_idx]; + else + sec_pipe->stream_res.opp = sec_pipe->top_pipe->stream_res.opp; + if (sec_pipe->stream->timing.flags.DSC == 1) { + dcn20_acquire_dsc(dc, &state->res_ctx, &sec_pipe->stream_res.dsc, pipe_idx); + ASSERT(sec_pipe->stream_res.dsc); + if (sec_pipe->stream_res.dsc == NULL) + return false; + } + dcn20_build_mapped_resource(dc, state, sec_pipe->stream); + } + + return true; } \ No newline at end of file diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 66b7482d2e729..b0afcff94591a 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -1191,6 +1191,8 @@ struct dc_plane_state { enum dc_irq_source irq_source; struct kref refcount; struct tg_color visual_confirm_color; + + bool is_statically_allocated; }; struct dc_plane_info { diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c index 
819de0f110126..2a3f5a485b2be 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c @@ -1372,7 +1372,7 @@ static struct pipe_ctx *dcn32_find_split_pipe( return pipe; } -static bool dcn32_split_stream_for_mpc_or_odm( +bool dcn32_split_stream_for_mpc_or_odm( const struct dc *dc, struct resource_context *res_ctx, struct pipe_ctx *pri_pipe, diff --git a/drivers/gpu/drm/amd/display/dc/inc/core_types.h b/drivers/gpu/drm/amd/display/dc/inc/core_types.h index 1fd7ad8532107..9498105c98ab3 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/core_types.h +++ b/drivers/gpu/drm/amd/display/dc/inc/core_types.h @@ -39,6 +39,8 @@ #include "panel_cntl.h" #define MAX_CLOCK_SOURCES 7 +#define MAX_SVP_PHANTOM_STREAMS 2 +#define MAX_SVP_PHANTOM_PLANES 2 void enable_surface_flip_reporting(struct dc_plane_state *plane_state, uint32_t controller_id); @@ -492,6 +494,8 @@ struct dcn_bw_output { struct dcn_watermark_set watermarks; struct dcn_bw_writeback bw_writeback; int compbuf_size_kb; + unsigned int legacy_svp_drr_stream_index; + bool legacy_svp_drr_stream_index_valid; }; union bw_output { diff --git a/drivers/gpu/drm/amd/display/dc/inc/resource.h b/drivers/gpu/drm/amd/display/dc/inc/resource.h index c37d1141febe1..5040836f404d0 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/resource.h +++ b/drivers/gpu/drm/amd/display/dc/inc/resource.h @@ -230,4 +230,10 @@ const struct link_hwss *get_link_hwss(const struct dc_link *link, bool is_h_timing_divisible_by_2(struct dc_stream_state *stream); +bool dc_resource_acquire_secondary_pipe_for_mpc_odm( + const struct dc *dc, + struct dc_state *state, + struct pipe_ctx *pri_pipe, + struct pipe_ctx *sec_pipe, + bool odm); #endif /* DRIVERS_GPU_DRM_AMD_DC_DEV_DC_INC_RESOURCE_H_ */ -- GitLab From 876fcc4222e1d0e5b73343f4010a8b66be058f48 Mon Sep 17 00:00:00 2001 From: Fangzhi Zuo <Jerry.Zuo@amd.com> Date: Tue, 30 Aug 2022 12:12:53 -0400 Subject: [PATCH 1673/2223] drm/amd/display: Validate 
DSC After Enable All New CRTCs Before enabling a new crtc, stream_count in dc_state is not in sync with that in drm_atomic_state. Validating dsc in such a case would leave the newly added stream not jointly participating in dsc optimization with existing streams, but simply using the default-initialized vcpi all the time, which gives a wrong dsc determination decision. Consider the scenario where one 4k60 is connected to the dock under dp-alt mode. Since dp-alt mode is a 2-lane setup, stream 1 consumes 63 slots with dsc needed. Then hook up a second 4k60 to the dock. Stream 2 is connected with 65 slots initialized by default without dsc. dsc pre validate will not jointly optimize stream 2 with stream 1 before crtc 2 is added into the dc_state. That leads to stream 2 not getting dsc optimization and triggers atomic_check failure all the time, as 65 > 63 limit. After getting all new crtcs added into the state, stream_count in dc_state correctly reflects that in drm_atomic_state, which comes up with the correct dsc decision.
Fixes: 71be4b16d39a ("drm/amd/display: dsc validate fail not pass to atomic check") Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Reviewed-by: Roman Li <Roman.Li@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Fangzhi Zuo <Jerry.Zuo@amd.com> Tested-by: Mark Broadworth <mark.broadworth@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index b84aedb707b8f..f6a9e8fdd87d6 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -9390,10 +9390,6 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev, } } } - if (!pre_validate_dsc(state, &dm_state, vars)) { - ret = -EINVAL; - goto fail; - } } #endif for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state, new_crtc_state, i) { @@ -9527,6 +9523,15 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev, } } +#if defined(CONFIG_DRM_AMD_DC_DCN) + if (dc_resource_is_dsc_encoding_supported(dc)) { + if (!pre_validate_dsc(state, &dm_state, vars)) { + ret = -EINVAL; + goto fail; + } + } +#endif + /* Run this here since we want to validate the streams we created */ ret = drm_atomic_helper_check_planes(dev, state); if (ret) { -- GitLab From d6170e418d1d3ae7e98cb6d96d1444e880131bbf Mon Sep 17 00:00:00 2001 From: Dillon Varone <Dillon.Varone@amd.com> Date: Wed, 28 Sep 2022 15:44:38 -0400 Subject: [PATCH 1674/2223] drm/amd/display: Acquire FCLK DPM levels on DCN32 [Why & How] Acquire FCLK DPM levels to properly construct DML clock limits. Further add new logic to keep number of indices for each clock in clk_mgr. 
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Reviewed-by: Jun Lei <Jun.Lei@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Dillon Varone <Dillon.Varone@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c | 41 ++++++++++++------- .../gpu/drm/amd/display/dc/inc/hw/clk_mgr.h | 15 ++++++- 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c index 96d5e0d5b3ce0..99ae3255dcb91 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c @@ -156,7 +156,7 @@ void dcn32_init_clocks(struct clk_mgr *clk_mgr_base) { struct clk_mgr_internal *clk_mgr = TO_CLK_MGR_INTERNAL(clk_mgr_base); unsigned int num_levels; - unsigned int num_dcfclk_levels, num_dtbclk_levels, num_dispclk_levels; + struct clk_limit_num_entries *num_entries_per_clk = &clk_mgr_base->bw_params->clk_table.num_entries_per_clk; memset(&(clk_mgr_base->clks), 0, sizeof(struct dc_clocks)); clk_mgr_base->clks.p_state_change_support = true; @@ -180,27 +180,28 @@ void dcn32_init_clocks(struct clk_mgr *clk_mgr_base) /* DCFCLK */ dcn32_init_single_clock(clk_mgr, PPCLK_DCFCLK, &clk_mgr_base->bw_params->clk_table.entries[0].dcfclk_mhz, - &num_levels); - num_dcfclk_levels = num_levels; + &num_entries_per_clk->num_dcfclk_levels); /* SOCCLK */ dcn32_init_single_clock(clk_mgr, PPCLK_SOCCLK, &clk_mgr_base->bw_params->clk_table.entries[0].socclk_mhz, - &num_levels); + &num_entries_per_clk->num_socclk_levels); + /* DTBCLK */ if (!clk_mgr->base.ctx->dc->debug.disable_dtb_ref_clk_switch) dcn32_init_single_clock(clk_mgr, PPCLK_DTBCLK, &clk_mgr_base->bw_params->clk_table.entries[0].dtbclk_mhz, - &num_levels); - num_dtbclk_levels = num_levels; + &num_entries_per_clk->num_dtbclk_levels); /* DISPCLK */ dcn32_init_single_clock(clk_mgr, 
PPCLK_DISPCLK, &clk_mgr_base->bw_params->clk_table.entries[0].dispclk_mhz, - &num_levels); - num_dispclk_levels = num_levels; + &num_entries_per_clk->num_dispclk_levels); + num_levels = num_entries_per_clk->num_dispclk_levels; - if (num_dcfclk_levels && num_dtbclk_levels && num_dispclk_levels) + if (num_entries_per_clk->num_dcfclk_levels && + num_entries_per_clk->num_dtbclk_levels && + num_entries_per_clk->num_dispclk_levels) clk_mgr->dpm_present = true; if (clk_mgr_base->ctx->dc->debug.min_disp_clk_khz) { @@ -383,7 +384,7 @@ static void dcn32_update_clocks(struct clk_mgr *clk_mgr_base, /* to disable P-State switching, set UCLK min = max */ if (!clk_mgr_base->clks.p_state_change_support) dcn32_smu_set_hard_min_by_freq(clk_mgr, PPCLK_UCLK, - clk_mgr_base->bw_params->clk_table.entries[clk_mgr_base->bw_params->clk_table.num_entries - 1].memclk_mhz); + clk_mgr_base->bw_params->clk_table.entries[clk_mgr_base->bw_params->clk_table.num_entries_per_clk.num_memclk_levels - 1].memclk_mhz); } /* Always update saved value, even if new value not set due to P-State switching unsupported. 
Also check safe_to_lower for FCLK */ @@ -634,7 +635,7 @@ static void dcn32_set_hard_min_memclk(struct clk_mgr *clk_mgr_base, bool current khz_to_mhz_ceil(clk_mgr_base->clks.dramclk_khz)); else dcn32_smu_set_hard_min_by_freq(clk_mgr, PPCLK_UCLK, - clk_mgr_base->bw_params->clk_table.entries[clk_mgr_base->bw_params->clk_table.num_entries - 1].memclk_mhz); + clk_mgr_base->bw_params->clk_table.entries[clk_mgr_base->bw_params->clk_table.num_entries_per_clk.num_memclk_levels - 1].memclk_mhz); } else { dcn32_smu_set_hard_min_by_freq(clk_mgr, PPCLK_UCLK, clk_mgr_base->bw_params->clk_table.entries[0].memclk_mhz); @@ -650,22 +651,34 @@ static void dcn32_set_hard_max_memclk(struct clk_mgr *clk_mgr_base) return; dcn30_smu_set_hard_max_by_freq(clk_mgr, PPCLK_UCLK, - clk_mgr_base->bw_params->clk_table.entries[clk_mgr_base->bw_params->clk_table.num_entries - 1].memclk_mhz); + clk_mgr_base->bw_params->clk_table.entries[clk_mgr_base->bw_params->clk_table.num_entries_per_clk.num_memclk_levels - 1].memclk_mhz); } /* Get current memclk states, update bounding box */ static void dcn32_get_memclk_states_from_smu(struct clk_mgr *clk_mgr_base) { struct clk_mgr_internal *clk_mgr = TO_CLK_MGR_INTERNAL(clk_mgr_base); + struct clk_limit_num_entries *num_entries_per_clk = &clk_mgr_base->bw_params->clk_table.num_entries_per_clk; unsigned int num_levels; if (!clk_mgr->smu_present) return; - /* Refresh memclk states */ + /* Refresh memclk and fclk states */ dcn32_init_single_clock(clk_mgr, PPCLK_UCLK, &clk_mgr_base->bw_params->clk_table.entries[0].memclk_mhz, - &num_levels); + &num_entries_per_clk->num_memclk_levels); + + dcn32_init_single_clock(clk_mgr, PPCLK_FCLK, + &clk_mgr_base->bw_params->clk_table.entries[0].fclk_mhz, + &num_entries_per_clk->num_fclk_levels); + + if (num_entries_per_clk->num_memclk_levels >= num_entries_per_clk->num_fclk_levels) { + num_levels = num_entries_per_clk->num_memclk_levels; + } else { + num_levels = num_entries_per_clk->num_fclk_levels; + } + 
clk_mgr_base->bw_params->clk_table.num_entries = num_levels ? num_levels : 1; if (clk_mgr->dpm_present && !num_levels) diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/clk_mgr.h b/drivers/gpu/drm/amd/display/dc/inc/hw/clk_mgr.h index d9f1b0a4fbd4a..591ab1389e3b3 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/clk_mgr.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/clk_mgr.h @@ -95,10 +95,23 @@ struct clk_limit_table_entry { unsigned int wck_ratio; }; +struct clk_limit_num_entries { + unsigned int num_dcfclk_levels; + unsigned int num_fclk_levels; + unsigned int num_memclk_levels; + unsigned int num_socclk_levels; + unsigned int num_dtbclk_levels; + unsigned int num_dispclk_levels; + unsigned int num_dppclk_levels; + unsigned int num_phyclk_levels; + unsigned int num_phyclk_d18_levels; +}; + /* This table is contiguous */ struct clk_limit_table { struct clk_limit_table_entry entries[MAX_NUM_DPM_LVL]; - unsigned int num_entries; + struct clk_limit_num_entries num_entries_per_clk; + unsigned int num_entries; /* highest populated dpm level for back compatibility */ }; struct wm_range_table_entry { -- GitLab From 3867bbd44f2894a4e2b01286b3b378c058992cd7 Mon Sep 17 00:00:00 2001 From: Dillon Varone <Dillon.Varone@amd.com> Date: Sat, 1 Oct 2022 11:51:48 -0400 Subject: [PATCH 1675/2223] drm/amd/display: Fix bug preventing FCLK Pstate allow message being sent [Why & How] FCLK pstate allow message should not be dependent on local "update_fclk". 
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Reviewed-by: Martin Leung <Martin.Leung@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Dillon Varone <Dillon.Varone@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c index 99ae3255dcb91..1c612ccf1944a 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c @@ -342,8 +342,8 @@ static void dcn32_update_clocks(struct clk_mgr *clk_mgr_base, if (should_update_pstate_support(safe_to_lower, fclk_p_state_change_support, clk_mgr_base->clks.fclk_p_state_change_support)) { clk_mgr_base->clks.fclk_p_state_change_support = fclk_p_state_change_support; - /* To enable FCLK P-state switching, send FCLK_PSTATE_NOTSUPPORTED message to PMFW */ - if (clk_mgr_base->ctx->dce_version != DCN_VERSION_3_21 && clk_mgr_base->clks.fclk_p_state_change_support && update_fclk) { + /* To enable FCLK P-state switching, send FCLK_PSTATE_SUPPORTED message to PMFW */ + if (clk_mgr_base->ctx->dce_version != DCN_VERSION_3_21 && clk_mgr_base->clks.fclk_p_state_change_support) { /* Handle the code for sending a message to PMFW that FCLK P-state change is supported */ dcn32_smu_send_fclk_pstate_message(clk_mgr, FCLK_PSTATE_SUPPORTED); } -- GitLab From b73353f7f3d434e90da9f0e127bba1fe26cb1287 Mon Sep 17 00:00:00 2001 From: Max Tseng <Max.Tseng@amd.com> Date: Sun, 2 Oct 2022 20:45:37 +0800 Subject: [PATCH 1676/2223] drm/amd/display: Use the same cursor info across features Since different features would need to update cursor registers, However, they would use different approaches. 
To unify varied methods, this refactor is implemented the same update cursor info method for current varied features. Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Reviewed-by: Anthony Koo <Anthony.Koo@amd.com> Reviewed-by: Jun Lei <Jun.Lei@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Max Tseng <Max.Tseng@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../gpu/drm/amd/display/dc/core/dc_stream.c | 4 + drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c | 145 ++++++++++++++++++ drivers/gpu/drm/amd/display/dc/dc_dmub_srv.h | 1 + .../gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c | 1 + .../amd/display/dc/dcn10/dcn10_hw_sequencer.c | 141 ----------------- .../gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c | 30 ++++ .../gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c | 4 + .../amd/display/dc/inc/hw/cursor_reg_cache.h | 98 ++++++++++++ drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h | 4 + drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h | 5 + .../gpu/drm/amd/display/dmub/inc/dmub_cmd.h | 115 +++++++++++++- 11 files changed, 400 insertions(+), 148 deletions(-) create mode 100644 drivers/gpu/drm/amd/display/dc/inc/hw/cursor_reg_cache.h diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_stream.c b/drivers/gpu/drm/amd/display/dc/core/dc_stream.c index ae13887756bf5..9998f58c14b99 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_stream.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_stream.c @@ -276,6 +276,8 @@ static void program_cursor_attributes( } dc->hwss.set_cursor_attribute(pipe_ctx); + + dc_send_update_cursor_info_to_dmu(pipe_ctx, i); if (dc->hwss.set_cursor_sdr_white_level) dc->hwss.set_cursor_sdr_white_level(pipe_ctx); } @@ -382,6 +384,8 @@ static void program_cursor_position( } dc->hwss.set_cursor_position(pipe_ctx); + + dc_send_update_cursor_info_to_dmu(pipe_ctx, i); } if (pipe_to_program) diff --git a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c index 89d7d3fd33212..bbde635c56fc9 100644 --- 
a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c @@ -30,6 +30,7 @@ #include "dc_hw_types.h" #include "core_types.h" #include "../basics/conversion.h" +#include "cursor_reg_cache.h" #define CTX dc_dmub_srv->ctx #define DC_LOGGER CTX->logger @@ -880,3 +881,147 @@ void dc_dmub_srv_log_diagnostic_data(struct dc_dmub_srv *dc_dmub_srv) diag_data.is_cw0_enabled, diag_data.is_cw6_enabled); } + +static bool dc_dmub_should_update_cursor_data(struct pipe_ctx *pipe_ctx) +{ + if (pipe_ctx->plane_state != NULL) { + if (pipe_ctx->plane_state->address.type == PLN_ADDR_TYPE_VIDEO_PROGRESSIVE) + return false; + } + + if ((pipe_ctx->stream->link->psr_settings.psr_version == DC_PSR_VERSION_SU_1 || + pipe_ctx->stream->link->psr_settings.psr_version == DC_PSR_VERSION_1) && + pipe_ctx->stream->ctx->dce_version >= DCN_VERSION_3_1) + return true; + + return false; +} + +static void dc_build_cursor_update_payload0( + struct pipe_ctx *pipe_ctx, uint8_t p_idx, + struct dmub_cmd_update_cursor_payload0 *payload) +{ + struct hubp *hubp = pipe_ctx->plane_res.hubp; + unsigned int panel_inst = 0; + + if (!dc_get_edp_link_panel_inst(hubp->ctx->dc, + pipe_ctx->stream->link, &panel_inst)) + return; + + /* Payload: Cursor Rect is built from position & attribute + * x & y are obtained from postion + */ + payload->cursor_rect.x = hubp->cur_rect.x; + payload->cursor_rect.y = hubp->cur_rect.y; + /* w & h are obtained from attribute */ + payload->cursor_rect.width = hubp->cur_rect.w; + payload->cursor_rect.height = hubp->cur_rect.h; + + payload->enable = hubp->pos.cur_ctl.bits.cur_enable; + payload->pipe_idx = p_idx; + payload->cmd_version = DMUB_CMD_PSR_CONTROL_VERSION_1; + payload->panel_inst = panel_inst; +} + +static void dc_send_cmd_to_dmu(struct dc_dmub_srv *dmub_srv, + union dmub_rb_cmd *cmd) +{ + dc_dmub_srv_cmd_queue(dmub_srv, cmd); + dc_dmub_srv_cmd_execute(dmub_srv); + dc_dmub_srv_wait_idle(dmub_srv); +} + +static void 
dc_build_cursor_position_update_payload0( + struct dmub_cmd_update_cursor_payload0 *pl, const uint8_t p_idx, + const struct hubp *hubp, const struct dpp *dpp) +{ + /* Hubp */ + pl->position_cfg.pHubp.cur_ctl.raw = hubp->pos.cur_ctl.raw; + pl->position_cfg.pHubp.position.raw = hubp->pos.position.raw; + pl->position_cfg.pHubp.hot_spot.raw = hubp->pos.hot_spot.raw; + pl->position_cfg.pHubp.dst_offset.raw = hubp->pos.dst_offset.raw; + + /* dpp */ + pl->position_cfg.pDpp.cur0_ctl.raw = dpp->pos.cur0_ctl.raw; + pl->position_cfg.pipe_idx = p_idx; +} + +static void dc_build_cursor_attribute_update_payload1( + struct dmub_cursor_attributes_cfg *pl_A, const uint8_t p_idx, + const struct hubp *hubp, const struct dpp *dpp) +{ + /* Hubp */ + pl_A->aHubp.SURFACE_ADDR_HIGH = hubp->att.SURFACE_ADDR_HIGH; + pl_A->aHubp.SURFACE_ADDR = hubp->att.SURFACE_ADDR; + pl_A->aHubp.cur_ctl.raw = hubp->att.cur_ctl.raw; + pl_A->aHubp.size.raw = hubp->att.size.raw; + pl_A->aHubp.settings.raw = hubp->att.settings.raw; + + /* dpp */ + pl_A->aDpp.cur0_ctl.raw = dpp->att.cur0_ctl.raw; +} + +/** + * *************************************************************************************** + * dc_send_update_cursor_info_to_dmu: Populate the DMCUB Cursor update info command + * + * This function would store the cursor related information and pass it into dmub + * + * @param [in] pCtx: pipe context + * @param [in] pipe_idx: pipe index + * + * @return: void + * + * *************************************************************************************** + */ + +void dc_send_update_cursor_info_to_dmu( + struct pipe_ctx *pCtx, uint8_t pipe_idx) +{ + union dmub_rb_cmd cmd = { 0 }; + union dmub_cmd_update_cursor_info_data *update_cursor_info = + &cmd.update_cursor_info.update_cursor_info_data; + + if (!dc_dmub_should_update_cursor_data(pCtx)) + return; + /* + * Since we use multi_cmd_pending for dmub command, the 2nd command is + * only assigned to store cursor attributes info. 
+ * 1st command can view as 2 parts, 1st is for PSR/Replay data, the other + * is to store cursor position info. + * + * Command heaer type must be the same type if using multi_cmd_pending. + * Besides, while process 2nd command in DMU, the sub type is useless. + * So it's meanless to pass the sub type header with different type. + */ + + { + /* Build Payload#0 Header */ + cmd.update_cursor_info.header.type = DMUB_CMD__UPDATE_CURSOR_INFO; + cmd.update_cursor_info.header.payload_bytes = + sizeof(cmd.update_cursor_info.update_cursor_info_data); + cmd.update_cursor_info.header.multi_cmd_pending = 1; /* To combine multi dmu cmd, 1st cmd */ + + /* Prepare Payload */ + dc_build_cursor_update_payload0(pCtx, pipe_idx, &update_cursor_info->payload0); + + dc_build_cursor_position_update_payload0(&update_cursor_info->payload0, pipe_idx, + pCtx->plane_res.hubp, pCtx->plane_res.dpp); + /* Send update_curosr_info to queue */ + dc_dmub_srv_cmd_queue(pCtx->stream->ctx->dmub_srv, &cmd); + } + { + /* Build Payload#1 Header */ + memset(update_cursor_info, 0, sizeof(union dmub_cmd_update_cursor_info_data)); + cmd.update_cursor_info.header.type = DMUB_CMD__UPDATE_CURSOR_INFO; + cmd.update_cursor_info.header.payload_bytes = sizeof(struct cursor_attributes_cfg); + cmd.update_cursor_info.header.multi_cmd_pending = 0; /* Indicate it's the last command. 
*/ + + dc_build_cursor_attribute_update_payload1( + &cmd.update_cursor_info.update_cursor_info_data.payload1.attribute_cfg, + pipe_idx, pCtx->plane_res.hubp, pCtx->plane_res.dpp); + + /* Combine 2nd cmds update_curosr_info to DMU */ + dc_send_cmd_to_dmu(pCtx->stream->ctx->dmub_srv, &cmd); + } +} diff --git a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.h b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.h index 7e438345b1a80..d34f5563df2ec 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.h +++ b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.h @@ -88,4 +88,5 @@ bool dc_dmub_srv_get_diagnostic_data(struct dc_dmub_srv *dc_dmub_srv, struct dmu void dc_dmub_setup_subvp_dmub_command(struct dc *dc, struct dc_state *context, bool enable); void dc_dmub_srv_log_diagnostic_data(struct dc_dmub_srv *dc_dmub_srv); +void dc_send_update_cursor_info_to_dmu(struct pipe_ctx *pCtx, uint8_t pipe_idx); #endif /* _DMUB_DC_SRV_H_ */ diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c index 897f412f539e6..b9765b3899e19 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c @@ -469,6 +469,7 @@ void dpp1_set_cursor_position( REG_UPDATE(CURSOR0_CONTROL, CUR0_ENABLE, cur_en); + dpp_base->pos.cur0_ctl.bits.cur0_enable = cur_en; } void dpp1_cnv_set_optional_cursor_attributes( diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c index 305e0c5453745..11e4c4e469473 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c @@ -3372,127 +3372,6 @@ static bool dcn10_can_pipe_disable_cursor(struct pipe_ctx *pipe_ctx) return false; } -static bool dcn10_dmub_should_update_cursor_data( - struct pipe_ctx *pipe_ctx, - struct dc_debug_options *debug) -{ - if (pipe_ctx->plane_state->address.type == PLN_ADDR_TYPE_VIDEO_PROGRESSIVE) - return 
false; - - if (dcn10_can_pipe_disable_cursor(pipe_ctx)) - return false; - - if ((pipe_ctx->stream->link->psr_settings.psr_version == DC_PSR_VERSION_SU_1 || pipe_ctx->stream->link->psr_settings.psr_version == DC_PSR_VERSION_1) - && pipe_ctx->stream->ctx->dce_version >= DCN_VERSION_3_1) - return true; - - return false; -} - -static void dcn10_dmub_update_cursor_data( - struct pipe_ctx *pipe_ctx, - struct hubp *hubp, - const struct dc_cursor_mi_param *param, - const struct dc_cursor_position *cur_pos, - const struct dc_cursor_attributes *cur_attr) -{ - union dmub_rb_cmd cmd; - struct dmub_cmd_update_cursor_info_data *update_cursor_info; - const struct dc_cursor_position *pos; - const struct dc_cursor_attributes *attr; - int src_x_offset = 0; - int src_y_offset = 0; - int x_hotspot = 0; - int cursor_height = 0; - int cursor_width = 0; - uint32_t cur_en = 0; - unsigned int panel_inst = 0; - - struct dc_debug_options *debug = &hubp->ctx->dc->debug; - - if (!dcn10_dmub_should_update_cursor_data(pipe_ctx, debug)) - return; - /** - * if cur_pos == NULL means the caller is from cursor_set_attribute - * then driver use previous cursor position data - * if cur_attr == NULL means the caller is from cursor_set_position - * then driver use previous cursor attribute - * if cur_pos or cur_attr is not NULL then update it - */ - if (cur_pos != NULL) - pos = cur_pos; - else - pos = &hubp->curs_pos; - - if (cur_attr != NULL) - attr = cur_attr; - else - attr = &hubp->curs_attr; - - if (!dc_get_edp_link_panel_inst(hubp->ctx->dc, pipe_ctx->stream->link, &panel_inst)) - return; - - src_x_offset = pos->x - pos->x_hotspot - param->viewport.x; - src_y_offset = pos->y - pos->y_hotspot - param->viewport.y; - x_hotspot = pos->x_hotspot; - cursor_height = (int)attr->height; - cursor_width = (int)attr->width; - cur_en = pos->enable ? 
1:0; - - // Rotated cursor width/height and hotspots tweaks for offset calculation - if (param->rotation == ROTATION_ANGLE_90 || param->rotation == ROTATION_ANGLE_270) { - swap(cursor_height, cursor_width); - if (param->rotation == ROTATION_ANGLE_90) { - src_x_offset = pos->x - pos->y_hotspot - param->viewport.x; - src_y_offset = pos->y - pos->x_hotspot - param->viewport.y; - } - } else if (param->rotation == ROTATION_ANGLE_180) { - src_x_offset = pos->x - param->viewport.x; - src_y_offset = pos->y - param->viewport.y; - } - - if (param->mirror) { - x_hotspot = param->viewport.width - x_hotspot; - src_x_offset = param->viewport.x + param->viewport.width - src_x_offset; - } - - if (src_x_offset >= (int)param->viewport.width) - cur_en = 0; /* not visible beyond right edge*/ - - if (src_x_offset + cursor_width <= 0) - cur_en = 0; /* not visible beyond left edge*/ - - if (src_y_offset >= (int)param->viewport.height) - cur_en = 0; /* not visible beyond bottom edge*/ - - if (src_y_offset + cursor_height <= 0) - cur_en = 0; /* not visible beyond top edge*/ - - // Cursor bitmaps have different hotspot values - // There's a possibility that the above logic returns a negative value, so we clamp them to 0 - if (src_x_offset < 0) - src_x_offset = 0; - if (src_y_offset < 0) - src_y_offset = 0; - - memset(&cmd, 0x0, sizeof(cmd)); - cmd.update_cursor_info.header.type = DMUB_CMD__UPDATE_CURSOR_INFO; - cmd.update_cursor_info.header.payload_bytes = - sizeof(cmd.update_cursor_info.update_cursor_info_data); - update_cursor_info = &cmd.update_cursor_info.update_cursor_info_data; - update_cursor_info->cursor_rect.x = src_x_offset + param->viewport.x; - update_cursor_info->cursor_rect.y = src_y_offset + param->viewport.y; - update_cursor_info->cursor_rect.width = attr->width; - update_cursor_info->cursor_rect.height = attr->height; - update_cursor_info->enable = cur_en; - update_cursor_info->pipe_idx = pipe_ctx->pipe_idx; - update_cursor_info->cmd_version = 
DMUB_CMD_PSR_CONTROL_VERSION_1; - update_cursor_info->panel_inst = panel_inst; - dc_dmub_srv_cmd_queue(pipe_ctx->stream->ctx->dmub_srv, &cmd); - dc_dmub_srv_cmd_execute(pipe_ctx->stream->ctx->dmub_srv); - dc_dmub_srv_wait_idle(pipe_ctx->stream->ctx->dmub_srv); -} - void dcn10_set_cursor_position(struct pipe_ctx *pipe_ctx) { struct dc_cursor_position pos_cpy = pipe_ctx->stream->cursor_position; @@ -3727,7 +3606,6 @@ void dcn10_set_cursor_position(struct pipe_ctx *pipe_ctx) pipe_ctx->plane_res.scl_data.viewport.height - pos_cpy.y; } - dcn10_dmub_update_cursor_data(pipe_ctx, hubp, &param, &pos_cpy, NULL); hubp->funcs->set_cursor_position(hubp, &pos_cpy, &param); dpp->funcs->set_cursor_position(dpp, &pos_cpy, &param, hubp->curs_attr.width, hubp->curs_attr.height); } @@ -3735,25 +3613,6 @@ void dcn10_set_cursor_position(struct pipe_ctx *pipe_ctx) void dcn10_set_cursor_attribute(struct pipe_ctx *pipe_ctx) { struct dc_cursor_attributes *attributes = &pipe_ctx->stream->cursor_attributes; - struct dc_cursor_mi_param param = { 0 }; - - /** - * If enter PSR without cursor attribute update - * the cursor attribute of dmub_restore_plane - * are initial value. 
call dmub to exit PSR and - * restore plane then update cursor attribute to - * avoid override with initial value - */ - if (pipe_ctx->plane_state != NULL) { - param.pixel_clk_khz = pipe_ctx->stream->timing.pix_clk_100hz / 10; - param.ref_clk_khz = pipe_ctx->stream->ctx->dc->res_pool->ref_clocks.dchub_ref_clock_inKhz; - param.viewport = pipe_ctx->plane_res.scl_data.viewport; - param.h_scale_ratio = pipe_ctx->plane_res.scl_data.ratios.horz; - param.v_scale_ratio = pipe_ctx->plane_res.scl_data.ratios.vert; - param.rotation = pipe_ctx->plane_state->rotation; - param.mirror = pipe_ctx->plane_state->horizontal_mirror; - dcn10_dmub_update_cursor_data(pipe_ctx, pipe_ctx->plane_res.hubp, &param, NULL, attributes); - } pipe_ctx->plane_res.hubp->funcs->set_cursor_attributes( pipe_ctx->plane_res.hubp, attributes); diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c index b1ec0e6f7f587..4996d2810edb8 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c @@ -617,6 +617,17 @@ void hubp2_cursor_set_attributes( CURSOR0_DST_Y_OFFSET, 0, /* used to shift the cursor chunk request deadline */ CURSOR0_CHUNK_HDL_ADJUST, 3); + + hubp->att.SURFACE_ADDR_HIGH = attr->address.high_part; + hubp->att.SURFACE_ADDR = attr->address.low_part; + hubp->att.size.bits.width = attr->width; + hubp->att.size.bits.height = attr->height; + hubp->att.cur_ctl.bits.mode = attr->color_format; + hubp->att.cur_ctl.bits.pitch = hw_pitch; + hubp->att.cur_ctl.bits.line_per_chunk = lpc; + hubp->att.cur_ctl.bits.cur_2x_magnify = attr->attribute_flags.bits.ENABLE_MAGNIFICATION; + hubp->att.settings.bits.dst_y_offset = 0; + hubp->att.settings.bits.chunk_hdl_adjust = 3; } void hubp2_dmdata_set_attributes( @@ -1033,6 +1044,25 @@ void hubp2_cursor_set_position( REG_SET(CURSOR_DST_OFFSET, 0, CURSOR_DST_X_OFFSET, dst_x_offset); /* TODO Handle surface pixel formats other than 4:4:4 */ + /* Cursor 
Position Register Config */ + hubp->pos.cur_ctl.bits.cur_enable = cur_en; + hubp->pos.position.bits.x_pos = pos->x; + hubp->pos.position.bits.y_pos = pos->y; + hubp->pos.hot_spot.bits.x_hot = x_hotspot; + hubp->pos.hot_spot.bits.y_hot = y_hotspot; + hubp->pos.dst_offset.bits.dst_x_offset = dst_x_offset; + /* Cursor Rectangle Cache + * Cursor bitmaps have different hotspot values + * There's a possibility that the above logic returns a negative value, + * so we clamp them to 0 + */ + if (src_x_offset < 0) + src_x_offset = 0; + if (src_y_offset < 0) + src_y_offset = 0; + /* Save necessary cursor info x, y position. w, h is saved in attribute func. */ + hubp->cur_rect.x = src_x_offset + param->viewport.x; + hubp->cur_rect.y = src_y_offset + param->viewport.y; } void hubp2_clk_cntl(struct hubp *hubp, bool enable) diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c index 4a668d6563dfd..e5b7ef7422b83 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c @@ -372,6 +372,10 @@ void dpp3_set_cursor_attributes( REG_UPDATE(CURSOR0_COLOR1, CUR0_COLOR1, 0xFFFFFFFF); } + + dpp_base->att.cur0_ctl.bits.expansion_mode = 0; + dpp_base->att.cur0_ctl.bits.cur0_rom_en = cur_rom_en; + dpp_base->att.cur0_ctl.bits.mode = color_format; } diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/cursor_reg_cache.h b/drivers/gpu/drm/amd/display/dc/inc/hw/cursor_reg_cache.h new file mode 100644 index 0000000000000..0e7c5880e867a --- /dev/null +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/cursor_reg_cache.h @@ -0,0 +1,98 @@ +/* Copyright © 2022 Advanced Micro Devices, Inc. All rights reserved. 
*/ + +#ifndef __DAL_CURSOR_CACHE_H__ +#define __DAL_CURSOR_CACHE_H__ + +union reg_cursor_control_cfg { + struct { + uint32_t cur_enable: 1; + uint32_t reser0: 3; + uint32_t cur_2x_magnify: 1; + uint32_t reser1: 3; + uint32_t mode: 3; + uint32_t reser2: 5; + uint32_t pitch: 2; + uint32_t reser3: 6; + uint32_t line_per_chunk: 5; + uint32_t reser4: 3; + } bits; + uint32_t raw; +}; +struct cursor_position_cache_hubp { + union reg_cursor_control_cfg cur_ctl; + union reg_position_cfg { + struct { + uint32_t x_pos: 16; + uint32_t y_pos: 16; + } bits; + uint32_t raw; + } position; + union reg_hot_spot_cfg { + struct { + uint32_t x_hot: 16; + uint32_t y_hot: 16; + } bits; + uint32_t raw; + } hot_spot; + union reg_dst_offset_cfg { + struct { + uint32_t dst_x_offset: 13; + uint32_t reserved: 19; + } bits; + uint32_t raw; + } dst_offset; +}; + +struct cursor_attribute_cache_hubp { + uint32_t SURFACE_ADDR_HIGH; + uint32_t SURFACE_ADDR; + union reg_cursor_control_cfg cur_ctl; + union reg_cursor_size_cfg { + struct { + uint32_t width: 16; + uint32_t height: 16; + } bits; + uint32_t raw; + } size; + union reg_cursor_settings_cfg { + struct { + uint32_t dst_y_offset: 8; + uint32_t chunk_hdl_adjust: 2; + uint32_t reserved: 22; + } bits; + uint32_t raw; + } settings; +}; + +struct cursor_rect { + uint32_t x; + uint32_t y; + uint32_t w; + uint32_t h; +}; + +union reg_cur0_control_cfg { + struct { + uint32_t cur0_enable: 1; + uint32_t expansion_mode: 1; + uint32_t reser0: 1; + uint32_t cur0_rom_en: 1; + uint32_t mode: 3; + uint32_t reserved: 25; + } bits; + uint32_t raw; +}; +struct cursor_position_cache_dpp { + union reg_cur0_control_cfg cur0_ctl; +}; + +struct cursor_attribute_cache_dpp { + union reg_cur0_control_cfg cur0_ctl; +}; + +struct cursor_attributes_cfg { + struct cursor_attribute_cache_hubp aHubp; + struct cursor_attribute_cache_dpp aDpp; +}; + +#endif diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h b/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h index 
3ef7faa920528..dcb80c4747b04 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h @@ -28,6 +28,7 @@ #define __DAL_DPP_H__ #include "transform.h" +#include "cursor_reg_cache.h" union defer_reg_writes { struct { @@ -58,6 +59,9 @@ struct dpp { struct pwl_params shaper_params; bool cm_bypass_mode; + + struct cursor_position_cache_dpp pos; + struct cursor_attribute_cache_dpp att; }; struct dpp_input_csc_matrix { diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h b/drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h index 44c4578193a34..d5ea7545583e8 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h @@ -27,6 +27,7 @@ #define __DAL_HUBP_H__ #include "mem_input.h" +#include "cursor_reg_cache.h" #define OPP_ID_INVALID 0xf #define MAX_TTU 0xffffff @@ -65,6 +66,10 @@ struct hubp { struct dc_cursor_attributes curs_attr; struct dc_cursor_position curs_pos; bool power_gated; + + struct cursor_position_cache_hubp pos; + struct cursor_attribute_cache_hubp att; + struct cursor_rect cur_rect; }; struct surface_flip_registers { diff --git a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h index 5d1aadade8a5b..834707dfc1895 100644 --- a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h +++ b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h @@ -760,11 +760,6 @@ enum dmub_cmd_dpia_type { DMUB_CMD__DPIA_MST_ALLOC_SLOTS = 2, }; -enum dmub_cmd_header_sub_type { - DMUB_CMD__SUB_TYPE_GENERAL = 0, - DMUB_CMD__SUB_TYPE_CURSOR_POSITION = 1 -}; - #pragma pack(push, 1) /** @@ -2089,7 +2084,99 @@ struct dmub_rb_cmd_update_dirty_rect { /** * Data passed from driver to FW in a DMUB_CMD__UPDATE_CURSOR_INFO command. 
*/ -struct dmub_cmd_update_cursor_info_data { +union dmub_reg_cursor_control_cfg { + struct { + uint32_t cur_enable: 1; + uint32_t reser0: 3; + uint32_t cur_2x_magnify: 1; + uint32_t reser1: 3; + uint32_t mode: 3; + uint32_t reser2: 5; + uint32_t pitch: 2; + uint32_t reser3: 6; + uint32_t line_per_chunk: 5; + uint32_t reser4: 3; + } bits; + uint32_t raw; +}; +struct dmub_cursor_position_cache_hubp { + union dmub_reg_cursor_control_cfg cur_ctl; + union dmub_reg_position_cfg { + struct { + uint32_t cur_x_pos: 16; + uint32_t cur_y_pos: 16; + } bits; + uint32_t raw; + } position; + union dmub_reg_hot_spot_cfg { + struct { + uint32_t hot_x: 16; + uint32_t hot_y: 16; + } bits; + uint32_t raw; + } hot_spot; + union dmub_reg_dst_offset_cfg { + struct { + uint32_t dst_x_offset: 13; + uint32_t reserved: 19; + } bits; + uint32_t raw; + } dst_offset; +}; + +union dmub_reg_cur0_control_cfg { + struct { + uint32_t cur0_enable: 1; + uint32_t expansion_mode: 1; + uint32_t reser0: 1; + uint32_t cur0_rom_en: 1; + uint32_t mode: 3; + uint32_t reserved: 25; + } bits; + uint32_t raw; +}; +struct dmub_cursor_position_cache_dpp { + union dmub_reg_cur0_control_cfg cur0_ctl; +}; +struct dmub_cursor_position_cfg { + struct dmub_cursor_position_cache_hubp pHubp; + struct dmub_cursor_position_cache_dpp pDpp; + uint8_t pipe_idx; + /* + * Padding is required. To be 4 Bytes Aligned. 
+ */ + uint8_t padding[3]; +}; + +struct dmub_cursor_attribute_cache_hubp { + uint32_t SURFACE_ADDR_HIGH; + uint32_t SURFACE_ADDR; + union dmub_reg_cursor_control_cfg cur_ctl; + union dmub_reg_cursor_size_cfg { + struct { + uint32_t width: 16; + uint32_t height: 16; + } bits; + uint32_t raw; + } size; + union dmub_reg_cursor_settings_cfg { + struct { + uint32_t dst_y_offset: 8; + uint32_t chunk_hdl_adjust: 2; + uint32_t reserved: 22; + } bits; + uint32_t raw; + } settings; +}; +struct dmub_cursor_attribute_cache_dpp { + union dmub_reg_cur0_control_cfg cur0_ctl; +}; +struct dmub_cursor_attributes_cfg { + struct dmub_cursor_attribute_cache_hubp aHubp; + struct dmub_cursor_attribute_cache_dpp aDpp; +}; + +struct dmub_cmd_update_cursor_payload0 { /** * Cursor dirty rects. */ @@ -2116,6 +2203,20 @@ struct dmub_cmd_update_cursor_info_data { * Currently the support is only for 0 or 1 */ uint8_t panel_inst; + /** + * Cursor Position Register. + * Registers contains Hubp & Dpp modules + */ + struct dmub_cursor_position_cfg position_cfg; +}; + +struct dmub_cmd_update_cursor_payload1 { + struct dmub_cursor_attributes_cfg attribute_cfg; +}; + +union dmub_cmd_update_cursor_info_data { + struct dmub_cmd_update_cursor_payload0 payload0; + struct dmub_cmd_update_cursor_payload1 payload1; }; /** * Definition of a DMUB_CMD__UPDATE_CURSOR_INFO command. @@ -2128,7 +2229,7 @@ struct dmub_rb_cmd_update_cursor_info { /** * Data passed from driver to FW in a DMUB_CMD__UPDATE_CURSOR_INFO command. 
*/ - struct dmub_cmd_update_cursor_info_data update_cursor_info_data; + union dmub_cmd_update_cursor_info_data update_cursor_info_data; }; /** -- GitLab From 6f4f8ff567c48823f8279206e236643e8e8f377e Mon Sep 17 00:00:00 2001 From: Meenakshikumar Somasundaram <meenakshikumar.somasundaram@amd.com> Date: Thu, 29 Sep 2022 23:55:41 -0400 Subject: [PATCH 1677/2223] drm/amd/display: Display does not light up after S4 resume [Why] Dpia hpd interrupt processing is disabled when entering S4/S0i3 and would be reenabled after detection completes during resuming. Because, keeping hpd interrupts enabled during detection leads to multiple detections for the same hpd transition. There is a S4 case where dpia hpd interrupt is missed when driver is in transitioning from hpd interrupt processing disable to enable and the display does not light up. [How] - Added dmub inbox command DMUB_CMD__DPIA_HPD_INT_ENABLE to explicitly control dmub to issue dpia hpd interrupt or not. If dpia hpd interrupt is disabled, dmub will keep the hpd pending and post it once driver reenables dpia hpd interrupt or when querying with DMUB_CMD__QUERY_HPD_STATE. - Added dmub boot option dpia_hpd_int_enable_supported to notify dmub about whether DMUB_CMD__DPIA_HPD_INT_ENABLE command would be used. 
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Reviewed-by: Mustapha Ghaddar <Mustapha.Ghaddar@amd.com> Reviewed-by: Jun Lei <Jun.Lei@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Meenakshikumar Somasundaram <meenakshikumar.somasundaram@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/core/dc.c | 31 +++++++++++++++++++ drivers/gpu/drm/amd/display/dc/dc.h | 3 ++ drivers/gpu/drm/amd/display/dmub/dmub_srv.h | 1 + .../gpu/drm/amd/display/dmub/inc/dmub_cmd.h | 21 ++++++++++++- .../gpu/drm/amd/display/dmub/src/dmub_dcn31.c | 1 + 5 files changed, 56 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index b5058a2ce7e88..660316a536f72 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -4656,6 +4656,37 @@ enum dc_status dc_process_dmub_set_mst_slots(const struct dc *dc, return DC_OK; } +/** + ***************************************************************************** + * Function: dc_process_dmub_dpia_hpd_int_enable + * + * @brief + * Submits dpia hpd int enable command to dmub via inbox message + * + * @param + * [in] dc: dc structure + * [in] hpd_int_enable: 1 for hpd int enable, 0 to disable + * + * @return + * None + ***************************************************************************** + */ +void dc_process_dmub_dpia_hpd_int_enable(const struct dc *dc, + uint32_t hpd_int_enable) +{ + union dmub_rb_cmd cmd = {0}; + struct dc_dmub_srv *dmub_srv = dc->ctx->dmub_srv; + + cmd.dpia_hpd_int_enable.header.type = DMUB_CMD__DPIA_HPD_INT_ENABLE; + cmd.dpia_hpd_int_enable.enable = hpd_int_enable; + + dc_dmub_srv_cmd_queue(dmub_srv, &cmd); + dc_dmub_srv_cmd_execute(dmub_srv); + dc_dmub_srv_wait_idle(dmub_srv); + + DC_LOG_DEBUG("%s: hpd_int_enable(%d)\n", __func__, hpd_int_enable); +} + /** * dc_disable_accelerated_mode - disable accelerated mode * @dc: dc structure 
diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index b0afcff94591a..5d0103e20412c 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -1612,6 +1612,9 @@ enum dc_status dc_process_dmub_set_mst_slots(const struct dc *dc, uint8_t mst_alloc_slots, uint8_t *mst_slots_in_use); +void dc_process_dmub_dpia_hpd_int_enable(const struct dc *dc, + uint32_t hpd_int_enable); + /******************************************************************************* * DSC Interfaces ******************************************************************************/ diff --git a/drivers/gpu/drm/amd/display/dmub/dmub_srv.h b/drivers/gpu/drm/amd/display/dmub/dmub_srv.h index f34c45b19fcb2..eb5b7eb292ef3 100644 --- a/drivers/gpu/drm/amd/display/dmub/dmub_srv.h +++ b/drivers/gpu/drm/amd/display/dmub/dmub_srv.h @@ -248,6 +248,7 @@ struct dmub_srv_hw_params { bool disable_dpia; bool usb4_cm_version; bool fw_in_system_memory; + bool dpia_hpd_int_enable_supported; }; /** diff --git a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h index 834707dfc1895..1d36f0fceb3e7 100644 --- a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h +++ b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h @@ -400,8 +400,9 @@ union dmub_fw_boot_options { uint32_t diag_env: 1; /* 1 if diagnostic environment */ uint32_t gpint_scratch8: 1; /* 1 if GPINT is in scratch8*/ uint32_t usb4_cm_version: 1; /**< 1 CM support */ + uint32_t dpia_hpd_int_enable_supported: 1; /* 1 if dpia hpd int enable supported */ - uint32_t reserved : 17; /**< reserved */ + uint32_t reserved : 16; /**< reserved */ } bits; /**< boot bits */ uint32_t all; /**< 32-bit access to bits */ }; @@ -728,6 +729,12 @@ enum dmub_cmd_type { /** * Command type used for all VBIOS interface commands. 
*/ + + /** + * Command type used to set DPIA HPD interrupt state + */ + DMUB_CMD__DPIA_HPD_INT_ENABLE = 86, + DMUB_CMD__VBIOS = 128, }; @@ -1255,6 +1262,14 @@ struct dmub_rb_cmd_set_mst_alloc_slots { struct dmub_cmd_mst_alloc_slots_control_data mst_slots_control; /* mst slots control */ }; +/** + * DMUB command structure for DPIA HPD int enable control. + */ +struct dmub_rb_cmd_dpia_hpd_int_enable { + struct dmub_cmd_header header; /* header */ + uint32_t enable; /* dpia hpd interrupt enable */ +}; + /** * struct dmub_rb_cmd_dpphy_init - DPPHY init. */ @@ -3336,6 +3351,10 @@ union dmub_rb_cmd { * Definition of a DMUB_CMD__QUERY_HPD_STATE command. */ struct dmub_rb_cmd_query_hpd_state query_hpd; + /** + * Definition of a DMUB_CMD__DPIA_HPD_INT_ENABLE command. + */ + struct dmub_rb_cmd_dpia_hpd_int_enable dpia_hpd_int_enable; }; /** diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn31.c b/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn31.c index c7bd7e2167109..c90b9ee42e126 100644 --- a/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn31.c +++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn31.c @@ -350,6 +350,7 @@ void dmub_dcn31_enable_dmub_boot_options(struct dmub_srv *dmub, const struct dmu boot_options.bits.dpia_supported = params->dpia_supported; boot_options.bits.enable_dpia = params->disable_dpia ? 0 : 1; boot_options.bits.usb4_cm_version = params->usb4_cm_version; + boot_options.bits.dpia_hpd_int_enable_supported = params->dpia_hpd_int_enable_supported; boot_options.bits.power_optimization = params->power_optimization; boot_options.bits.sel_mux_phy_c_d_phy_f_g = (dmub->asic == DMUB_ASIC_DCN31B) ? 
1 : 0; -- GitLab From ba30b223c93ec5af63993b6397cd7316e5acb6c1 Mon Sep 17 00:00:00 2001 From: Dmytro Laktyushkin <Dmytro.Laktyushkin@amd.com> Date: Tue, 4 Oct 2022 13:30:51 -0400 Subject: [PATCH 1678/2223] drm/amd/display: always allow pstate change when no dpps are active on dcn315 Prevents certain configs blocking s0i3 when streams aren't completely removed Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Reviewed-by: Charlene Liu <Charlene.Liu@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Dmytro Laktyushkin <Dmytro.Laktyushkin@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c index 87bfc42bdaaf1..7dd0845d1bd9f 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn31/dcn31_fpu.c @@ -483,7 +483,7 @@ void dcn31_calculate_wm_and_dlg_fp( int pipe_cnt, int vlevel) { - int i, pipe_idx; + int i, pipe_idx, active_dpp_count = 0; double dcfclk = context->bw_ctx.dml.vba.DCFCLKState[vlevel][context->bw_ctx.dml.vba.maxMpcComb]; dc_assert_fp_enabled(); @@ -528,6 +528,9 @@ void dcn31_calculate_wm_and_dlg_fp( if (!context->res_ctx.pipe_ctx[i].stream) continue; + if (context->res_ctx.pipe_ctx[i].plane_state) + active_dpp_count++; + pipes[pipe_idx].clks_cfg.dispclk_mhz = get_dispclk_calculated(&context->bw_ctx.dml, pipes, pipe_cnt); pipes[pipe_idx].clks_cfg.dppclk_mhz = get_dppclk_calculated(&context->bw_ctx.dml, pipes, pipe_cnt, pipe_idx); @@ -544,9 +547,9 @@ void dcn31_calculate_wm_and_dlg_fp( } dcn20_calculate_dlg_params(dc, context, pipes, pipe_cnt, vlevel); - /* For 31x apu pstate change is only supported if possible in vactive */ + /* For 31x apu pstate change is only supported if possible in vactive or if there are no active dpps */ 
context->bw_ctx.bw.dcn.clk.p_state_change_support = - context->bw_ctx.dml.vba.DRAMClockChangeSupport[vlevel][context->bw_ctx.dml.vba.maxMpcComb] == dm_dram_clock_change_vactive; + context->bw_ctx.dml.vba.DRAMClockChangeSupport[vlevel][context->bw_ctx.dml.vba.maxMpcComb] == dm_dram_clock_change_vactive || !active_dpp_count; } void dcn31_update_bw_bounding_box(struct dc *dc, struct clk_bw_params *bw_params) -- GitLab From 1298d9ab848653fc35431581d6e36662c7b6935a Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Date: Tue, 27 Sep 2022 17:59:05 -0400 Subject: [PATCH 1679/2223] drm/amd/display: Add a missing hook to DCN20 The struct timing_generator_funcs provides a hook for setting up the maximum possible vertical dimension of display for OTG, as the panel supports. DCN10 has a standard function named optc1_set_vtotal_min_max which all ASICs can use to set the aforementioned hook. Since we did not set it for DCN20, this commit initializes the set_vtotal_min_max with the DCN10 function. 
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn20/dcn20_optc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_optc.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_optc.c index 0340fdd3f5fbb..a08c335b73838 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_optc.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_optc.c @@ -529,6 +529,7 @@ static struct timing_generator_funcs dcn20_tg_funcs = { .enable_optc_clock = optc1_enable_optc_clock, .set_drr = optc1_set_drr, .get_last_used_drr_vtotal = optc2_get_last_used_drr_vtotal, + .set_vtotal_min_max = optc1_set_vtotal_min_max, .set_static_screen_control = optc1_set_static_screen_control, .program_stereo = optc1_program_stereo, .is_stereo_left_eye = optc1_is_stereo_left_eye, -- GitLab From 15e8b368981e1e8420f08b35bb12b794b200f4a0 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Date: Fri, 12 Mar 2021 11:58:57 -0500 Subject: [PATCH 1680/2223] drm/amd/display: Use set_vtotal_min_max to configure OTG VTOTAL In multiple parts of the DCN code, we write directly to the OTG_V_TOTAL_* registers in some OPTC functions. Let's avoid it by using the set_vtotal_min_max. 
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../gpu/drm/amd/display/dc/dcn10/dcn10_optc.c | 18 ++++-------------- .../gpu/drm/amd/display/dc/dcn30/dcn30_optc.c | 2 +- .../gpu/drm/amd/display/dc/dcn31/dcn31_optc.c | 1 - .../gpu/drm/amd/display/dmub/inc/dmub_cmd.h | 4 ---- 4 files changed, 5 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c index 143a900d4d3d3..dca8a1446120b 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c @@ -207,10 +207,7 @@ void optc1_program_timing( /* In case of V_TOTAL_CONTROL is on, make sure OTG_V_TOTAL_MAX and * OTG_V_TOTAL_MIN are equal to V_TOTAL. */ - REG_SET(OTG_V_TOTAL_MAX, 0, - OTG_V_TOTAL_MAX, v_total); - REG_SET(OTG_V_TOTAL_MIN, 0, - OTG_V_TOTAL_MIN, v_total); + optc->funcs->set_vtotal_min_max(optc, v_total, v_total); /* v_sync_start = 0, v_sync_end = v_sync_width */ v_sync_end = patched_crtc_timing.v_sync_width; @@ -931,11 +928,7 @@ void optc1_set_drr( } - REG_SET(OTG_V_TOTAL_MAX, 0, - OTG_V_TOTAL_MAX, params->vertical_total_max - 1); - - REG_SET(OTG_V_TOTAL_MIN, 0, - OTG_V_TOTAL_MIN, params->vertical_total_min - 1); + optc->funcs->set_vtotal_min_max(optc, params->vertical_total_min - 1, params->vertical_total_max - 1); REG_UPDATE_5(OTG_V_TOTAL_CONTROL, OTG_V_TOTAL_MIN_SEL, 1, @@ -954,11 +947,7 @@ void optc1_set_drr( OTG_V_TOTAL_MAX_SEL, 0, OTG_FORCE_LOCK_ON_EVENT, 0); - REG_SET(OTG_V_TOTAL_MIN, 0, - OTG_V_TOTAL_MIN, 0); - - REG_SET(OTG_V_TOTAL_MAX, 0, - OTG_V_TOTAL_MAX, 0); + optc->funcs->set_vtotal_min_max(optc, 0, 0); } } @@ -1577,6 +1566,7 @@ static const struct timing_generator_funcs dcn10_tg_funcs = { .enable_optc_clock = optc1_enable_optc_clock, .set_drr = optc1_set_drr, 
.get_last_used_drr_vtotal = NULL, + .set_vtotal_min_max = optc1_set_vtotal_min_max, .set_static_screen_control = optc1_set_static_screen_control, .set_test_pattern = optc1_set_test_pattern, .program_stereo = optc1_program_stereo, diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_optc.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_optc.c index 02459a64ee211..892d3c4d01a1e 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_optc.c +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_optc.c @@ -325,6 +325,7 @@ static struct timing_generator_funcs dcn30_tg_funcs = { .enable_optc_clock = optc1_enable_optc_clock, .set_drr = optc1_set_drr, .get_last_used_drr_vtotal = optc2_get_last_used_drr_vtotal, + .set_vtotal_min_max = optc3_set_vtotal_min_max, .set_static_screen_control = optc1_set_static_screen_control, .program_stereo = optc1_program_stereo, .is_stereo_left_eye = optc1_is_stereo_left_eye, @@ -365,4 +366,3 @@ void dcn30_timing_generator_init(struct optc *optc1) optc1->min_h_sync_width = 4; optc1->min_v_sync_width = 1; } - diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_optc.c b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_optc.c index d873def1a8f93..63a677c8ee272 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_optc.c +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_optc.c @@ -201,7 +201,6 @@ void optc31_set_drr( // Setup manual flow control for EOF via TRIG_A optc->funcs->setup_manual_trigger(optc); - } else { REG_UPDATE_4(OTG_V_TOTAL_CONTROL, OTG_SET_V_TOTAL_MIN_MASK, 0, diff --git a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h index 1d36f0fceb3e7..7a8f61517424c 100644 --- a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h +++ b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h @@ -2941,11 +2941,7 @@ struct dmub_rb_cmd_get_visual_confirm_color { struct dmub_optc_state { uint32_t v_total_max; uint32_t v_total_min; - uint32_t v_total_mid; - uint32_t v_total_mid_frame_num; uint32_t tg_inst; - 
uint32_t enable_manual_trigger; - uint32_t clear_force_vsync; }; struct dmub_rb_cmd_drr_update { -- GitLab From c8588697aa4ec1f3b7fc09277cf2a5a662d40834 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Date: Tue, 27 Sep 2022 18:30:08 -0400 Subject: [PATCH 1681/2223] drm/amd/display: Drop uncessary OTG lock check The OTG_MASTER_UPDATE_LOCK_SEL is used for GSL and OTGs in the same group for selecting the OTG_MASTER_UPDATE_LOCK from the same OTG. At some point, a check was added to see if OTG is running or not, which is not necessary, and for this reason, this commit dropped that check. Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c index dca8a1446120b..33d7802187900 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c @@ -646,13 +646,6 @@ uint32_t optc1_get_vblank_counter(struct timing_generator *optc) void optc1_lock(struct timing_generator *optc) { struct optc *optc1 = DCN10TG_FROM_TG(optc); - uint32_t regval = 0; - - regval = REG_READ(OTG_CONTROL); - - /* otg is not running, do not need to be locked */ - if ((regval & 0x1) == 0x0) - return; REG_SET(OTG_GLOBAL_CONTROL0, 0, OTG_MASTER_UPDATE_LOCK_SEL, optc->inst); @@ -660,12 +653,10 @@ void optc1_lock(struct timing_generator *optc) OTG_MASTER_UPDATE_LOCK, 1); /* Should be fast, status does not update on maximus */ - if (optc->ctx->dce_environment != DCE_ENV_FPGA_MAXIMUS) { - + if (optc->ctx->dce_environment != DCE_ENV_FPGA_MAXIMUS) REG_WAIT(OTG_MASTER_UPDATE_LOCK, UPDATE_LOCK_STATUS, 1, 1, 10); - } } void optc1_unlock(struct 
timing_generator *optc) -- GitLab From 9799702360d51a714e888fef4ab5fb9123dfb41f Mon Sep 17 00:00:00 2001 From: Alvin Lee <Alvin.Lee2@amd.com> Date: Wed, 29 Jun 2022 12:35:12 -0400 Subject: [PATCH 1682/2223] drm/amd/display: Fix watermark calculation Watermark calculation was incorrect due to missing brackets. Fixes: 85f4bc0c333c ("drm/amd/display: Add SubVP required code") Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Reviewed-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Alvin Lee <Alvin.Lee2@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org # 6.0 --- drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c index bbde635c56fc9..0541e87e4f389 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c @@ -781,7 +781,7 @@ void dc_dmub_setup_subvp_dmub_command(struct dc *dc, // Store the original watermark value for this SubVP config so we can lower it when the // MCLK switch starts wm_val_refclk = context->bw_ctx.bw.dcn.watermarks.a.cstate_pstate.pstate_change_ns * - dc->res_pool->ref_clocks.dchub_ref_clock_inKhz / 1000 / 1000; + (dc->res_pool->ref_clocks.dchub_ref_clock_inKhz / 1000) / 1000; cmd.fw_assisted_mclk_switch_v2.config_data.watermark_a_cache = wm_val_refclk < 0xFFFF ? wm_val_refclk : 0xFFFF; } -- GitLab From e5da651985be20616a9e0662032e0ea2ee4dd468 Mon Sep 17 00:00:00 2001 From: Bokun Zhang <Bokun.Zhang@amd.com> Date: Fri, 7 Oct 2022 02:08:38 +0800 Subject: [PATCH 1683/2223] drm/amdgpu: Fix SDMA engine resume issue under SRIOV - Under SRIOV, SDMA engine is shared between VFs. Therefore, we will not stop SDMA during hw_fini. This is not an issue with normal driver loading and unloading. 
- However, when we put the SDMA engine to suspend state and resume it, the issue starts to show up. Something could attempt to use that SDMA engine to clear or move memory before the engine is initialized since the DRM entity is still there. - Therefore, we will call sdma_v5_2_enable(false) during hw_fini, and if we are under SRIOV, we will call sdma_v5_2_enable(true) afterwards to allow other VFs to use SDMA. This way, the DRM entity of SDMA engine is emptied and it will follow the flow of resume code path. Tested-by: Bokun Zhang <Bokun.Zhang@amd.com> Signed-off-by: Bokun Zhang <Bokun.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c index f136fec7b4f4a..3eaf1a573e737 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c @@ -1357,12 +1357,19 @@ static int sdma_v5_2_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (amdgpu_sriov_vf(adev)) - return 0; - + /* + * Under SRIOV, the VF cannot single-mindedly stop SDMA engine + * However, we still need to clean up the DRM entity + * Therefore, we will re-enable SDMA afterwards. + */ sdma_v5_2_ctx_switch_enable(adev, false); sdma_v5_2_enable(adev, false); + if (amdgpu_sriov_vf(adev)) { + sdma_v5_2_enable(adev, true); + sdma_v5_2_ctx_switch_enable(adev, true); + } + return 0; } -- GitLab From 571c053658926df3321633b7133f574d3e656c81 Mon Sep 17 00:00:00 2001 From: Alex Deucher <alexander.deucher@amd.com> Date: Thu, 6 Oct 2022 15:31:40 -0400 Subject: [PATCH 1684/2223] drm/amdgpu: switch sdma buffer function tear down to a helper Switch all of the SDMA implementations to use the helper to tear down the ttm buffer manager. 
Tested-by: Bokun Zhang <Bokun.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 21 +++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 2 ++ drivers/gpu/drm/amd/amdgpu/cik_sdma.c | 6 +----- drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c | 6 +----- drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c | 6 +----- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 24 +++++------------------- drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 6 +----- drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 10 +--------- drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 9 +-------- drivers/gpu/drm/amd/amdgpu/si_dma.c | 5 ++--- 10 files changed, 36 insertions(+), 59 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index 43cf8632cc1ac..ea5278f094c08 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -285,3 +285,24 @@ out: } return err; } + +void amdgpu_sdma_unset_buffer_funcs_helper(struct amdgpu_device *adev) +{ + struct amdgpu_ring *sdma; + int i; + + for (i = 0; i < adev->sdma.num_instances; i++) { + if (adev->sdma.has_page_queue) { + sdma = &adev->sdma.instance[i].page; + if (adev->mman.buffer_funcs_ring == sdma) { + amdgpu_ttm_set_buffer_funcs_status(adev, false); + break; + } + } + sdma = &adev->sdma.instance[i].ring; + if (adev->mman.buffer_funcs_ring == sdma) { + amdgpu_ttm_set_buffer_funcs_status(adev, false); + break; + } + } +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h index d2d88279fefb0..7d99205c2e018 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h @@ -128,4 +128,6 @@ int amdgpu_sdma_init_microcode(struct amdgpu_device *adev, char *fw_name, u32 instance, bool duplicate); void amdgpu_sdma_destroy_inst_ctx(struct amdgpu_device *adev, bool duplicate); +void amdgpu_sdma_unset_buffer_funcs_helper(struct amdgpu_device *adev); + #endif 
diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c index 5647f13b98d49..cbca9866645c5 100644 --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c @@ -309,14 +309,10 @@ static void cik_sdma_ring_emit_fence(struct amdgpu_ring *ring, u64 addr, u64 seq */ static void cik_sdma_gfx_stop(struct amdgpu_device *adev) { - struct amdgpu_ring *sdma0 = &adev->sdma.instance[0].ring; - struct amdgpu_ring *sdma1 = &adev->sdma.instance[1].ring; u32 rb_cntl; int i; - if ((adev->mman.buffer_funcs_ring == sdma0) || - (adev->mman.buffer_funcs_ring == sdma1)) - amdgpu_ttm_set_buffer_funcs_status(adev, false); + amdgpu_sdma_unset_buffer_funcs_helper(adev); for (i = 0; i < adev->sdma.num_instances; i++) { rb_cntl = RREG32(mmSDMA0_GFX_RB_CNTL + sdma_offsets[i]); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c index 6bdffdc1c0b92..c52d246a1d965 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c @@ -342,14 +342,10 @@ static void sdma_v2_4_ring_emit_fence(struct amdgpu_ring *ring, u64 addr, u64 se */ static void sdma_v2_4_gfx_stop(struct amdgpu_device *adev) { - struct amdgpu_ring *sdma0 = &adev->sdma.instance[0].ring; - struct amdgpu_ring *sdma1 = &adev->sdma.instance[1].ring; u32 rb_cntl, ib_cntl; int i; - if ((adev->mman.buffer_funcs_ring == sdma0) || - (adev->mman.buffer_funcs_ring == sdma1)) - amdgpu_ttm_set_buffer_funcs_status(adev, false); + amdgpu_sdma_unset_buffer_funcs_helper(adev); for (i = 0; i < adev->sdma.num_instances; i++) { rb_cntl = RREG32(mmSDMA0_GFX_RB_CNTL + sdma_offsets[i]); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c index 2584fa3cb13e7..486d9b5c1b9e7 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c @@ -516,14 +516,10 @@ static void sdma_v3_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr, u64 se */ static 
void sdma_v3_0_gfx_stop(struct amdgpu_device *adev) { - struct amdgpu_ring *sdma0 = &adev->sdma.instance[0].ring; - struct amdgpu_ring *sdma1 = &adev->sdma.instance[1].ring; u32 rb_cntl, ib_cntl; int i; - if ((adev->mman.buffer_funcs_ring == sdma0) || - (adev->mman.buffer_funcs_ring == sdma1)) - amdgpu_ttm_set_buffer_funcs_status(adev, false); + amdgpu_sdma_unset_buffer_funcs_helper(adev); for (i = 0; i < adev->sdma.num_instances; i++) { rb_cntl = RREG32(mmSDMA0_GFX_RB_CNTL + sdma_offsets[i]); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 7241a9fb0121f..7b4195f18a7ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -915,18 +915,12 @@ static void sdma_v4_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr, u64 se */ static void sdma_v4_0_gfx_stop(struct amdgpu_device *adev) { - struct amdgpu_ring *sdma[AMDGPU_MAX_SDMA_INSTANCES]; u32 rb_cntl, ib_cntl; - int i, unset = 0; + int i; - for (i = 0; i < adev->sdma.num_instances; i++) { - sdma[i] = &adev->sdma.instance[i].ring; - - if ((adev->mman.buffer_funcs_ring == sdma[i]) && unset != 1) { - amdgpu_ttm_set_buffer_funcs_status(adev, false); - unset = 1; - } + amdgpu_sdma_unset_buffer_funcs_helper(adev); + for (i = 0; i < adev->sdma.num_instances; i++) { rb_cntl = RREG32_SDMA(i, mmSDMA0_GFX_RB_CNTL); rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_GFX_RB_CNTL, RB_ENABLE, 0); WREG32_SDMA(i, mmSDMA0_GFX_RB_CNTL, rb_cntl); @@ -957,20 +951,12 @@ static void sdma_v4_0_rlc_stop(struct amdgpu_device *adev) */ static void sdma_v4_0_page_stop(struct amdgpu_device *adev) { - struct amdgpu_ring *sdma[AMDGPU_MAX_SDMA_INSTANCES]; u32 rb_cntl, ib_cntl; int i; - bool unset = false; - for (i = 0; i < adev->sdma.num_instances; i++) { - sdma[i] = &adev->sdma.instance[i].page; - - if ((adev->mman.buffer_funcs_ring == sdma[i]) && - (!unset)) { - amdgpu_ttm_set_buffer_funcs_status(adev, false); - unset = true; - } + 
amdgpu_sdma_unset_buffer_funcs_helper(adev); + for (i = 0; i < adev->sdma.num_instances; i++) { rb_cntl = RREG32_SDMA(i, mmSDMA0_PAGE_RB_CNTL); rb_cntl = REG_SET_FIELD(rb_cntl, SDMA0_PAGE_RB_CNTL, RB_ENABLE, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c index c05c3eebde4c7..783048e1b0cea 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c @@ -584,14 +584,10 @@ static void sdma_v5_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr, u64 se */ static void sdma_v5_0_gfx_stop(struct amdgpu_device *adev) { - struct amdgpu_ring *sdma0 = &adev->sdma.instance[0].ring; - struct amdgpu_ring *sdma1 = &adev->sdma.instance[1].ring; u32 rb_cntl, ib_cntl; int i; - if ((adev->mman.buffer_funcs_ring == sdma0) || - (adev->mman.buffer_funcs_ring == sdma1)) - amdgpu_ttm_set_buffer_funcs_status(adev, false); + amdgpu_sdma_unset_buffer_funcs_helper(adev); for (i = 0; i < adev->sdma.num_instances; i++) { rb_cntl = RREG32_SOC15_IP(GC, sdma_v5_0_get_reg_offset(adev, i, mmSDMA0_GFX_RB_CNTL)); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c index 3eaf1a573e737..c2ee53c2dd1b0 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c @@ -414,18 +414,10 @@ static void sdma_v5_2_ring_emit_fence(struct amdgpu_ring *ring, u64 addr, u64 se */ static void sdma_v5_2_gfx_stop(struct amdgpu_device *adev) { - struct amdgpu_ring *sdma0 = &adev->sdma.instance[0].ring; - struct amdgpu_ring *sdma1 = &adev->sdma.instance[1].ring; - struct amdgpu_ring *sdma2 = &adev->sdma.instance[2].ring; - struct amdgpu_ring *sdma3 = &adev->sdma.instance[3].ring; u32 rb_cntl, ib_cntl; int i; - if ((adev->mman.buffer_funcs_ring == sdma0) || - (adev->mman.buffer_funcs_ring == sdma1) || - (adev->mman.buffer_funcs_ring == sdma2) || - (adev->mman.buffer_funcs_ring == sdma3)) - amdgpu_ttm_set_buffer_funcs_status(adev, false); + 
amdgpu_sdma_unset_buffer_funcs_helper(adev); for (i = 0; i < adev->sdma.num_instances; i++) { rb_cntl = RREG32_SOC15_IP(GC, sdma_v5_2_get_reg_offset(adev, i, mmSDMA0_GFX_RB_CNTL)); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c index 0150f66a5ae6d..a6483483404e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c @@ -398,14 +398,10 @@ static void sdma_v6_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr, u64 se */ static void sdma_v6_0_gfx_stop(struct amdgpu_device *adev) { - struct amdgpu_ring *sdma0 = &adev->sdma.instance[0].ring; - struct amdgpu_ring *sdma1 = &adev->sdma.instance[1].ring; u32 rb_cntl, ib_cntl; int i; - if ((adev->mman.buffer_funcs_ring == sdma0) || - (adev->mman.buffer_funcs_ring == sdma1)) - amdgpu_ttm_set_buffer_funcs_status(adev, false); + amdgpu_sdma_unset_buffer_funcs_helper(adev); for (i = 0; i < adev->sdma.num_instances; i++) { rb_cntl = RREG32_SOC15_IP(GC, sdma_v6_0_get_reg_offset(adev, i, regSDMA0_QUEUE0_RB_CNTL)); @@ -415,9 +411,6 @@ static void sdma_v6_0_gfx_stop(struct amdgpu_device *adev) ib_cntl = REG_SET_FIELD(ib_cntl, SDMA0_QUEUE0_IB_CNTL, IB_ENABLE, 0); WREG32_SOC15_IP(GC, sdma_v6_0_get_reg_offset(adev, i, regSDMA0_QUEUE0_IB_CNTL), ib_cntl); } - - sdma0->sched.ready = false; - sdma1->sched.ready = false; } /** diff --git a/drivers/gpu/drm/amd/amdgpu/si_dma.c b/drivers/gpu/drm/amd/amdgpu/si_dma.c index f675111ace20c..4d5e718540aa9 100644 --- a/drivers/gpu/drm/amd/amdgpu/si_dma.c +++ b/drivers/gpu/drm/amd/amdgpu/si_dma.c @@ -116,15 +116,14 @@ static void si_dma_stop(struct amdgpu_device *adev) u32 rb_cntl; unsigned i; + amdgpu_sdma_unset_buffer_funcs_helper(adev); + for (i = 0; i < adev->sdma.num_instances; i++) { ring = &adev->sdma.instance[i].ring; /* dma0 */ rb_cntl = RREG32(DMA_RB_CNTL + sdma_offsets[i]); rb_cntl &= ~DMA_RB_ENABLE; WREG32(DMA_RB_CNTL + sdma_offsets[i], rb_cntl); - - if (adev->mman.buffer_funcs_ring == ring) - 
amdgpu_ttm_set_buffer_funcs_status(adev, false); } } -- GitLab From a98cec220aa4b2502704aa0196da1bdc9eb455b4 Mon Sep 17 00:00:00 2001 From: Alex Deucher <alexander.deucher@amd.com> Date: Thu, 6 Oct 2022 15:53:10 -0400 Subject: [PATCH 1685/2223] drm/amdgpu: fix SDMA suspend/resume on SR-IOV Update all SDMA versions that support SR-IOV to properly tear down the ttm buffer functions on suspend. Tested-by: Bokun Zhang <Bokun.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 5 ++++- drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 5 ++++- drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 16 ++++++---------- drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 5 ++++- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 7b4195f18a7ca..298fa11702e75 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -1940,8 +1940,11 @@ static int sdma_v4_0_hw_fini(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; int i; - if (amdgpu_sriov_vf(adev)) + if (amdgpu_sriov_vf(adev)) { + /* disable the scheduler for SDMA */ + amdgpu_sdma_unset_buffer_funcs_helper(adev); return 0; + } for (i = 0; i < adev->sdma.num_instances; i++) { amdgpu_irq_put(adev, &adev->sdma.ecc_irq, diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c index 783048e1b0cea..d4d9f196db834 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c @@ -1456,8 +1456,11 @@ static int sdma_v5_0_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (amdgpu_sriov_vf(adev)) + if (amdgpu_sriov_vf(adev)) { + /* disable the scheduler for SDMA */ + amdgpu_sdma_unset_buffer_funcs_helper(adev); return 0; + } sdma_v5_0_ctx_switch_enable(adev, false); sdma_v5_0_enable(adev, false); diff --git 
a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c index c2ee53c2dd1b0..809eca54fc617 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c @@ -1349,19 +1349,15 @@ static int sdma_v5_2_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - /* - * Under SRIOV, the VF cannot single-mindedly stop SDMA engine - * However, we still need to clean up the DRM entity - * Therefore, we will re-enable SDMA afterwards. - */ - sdma_v5_2_ctx_switch_enable(adev, false); - sdma_v5_2_enable(adev, false); - if (amdgpu_sriov_vf(adev)) { - sdma_v5_2_enable(adev, true); - sdma_v5_2_ctx_switch_enable(adev, true); + /* disable the scheduler for SDMA */ + amdgpu_sdma_unset_buffer_funcs_helper(adev); + return 0; } + sdma_v5_2_ctx_switch_enable(adev, false); + sdma_v5_2_enable(adev, false); + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c index a6483483404e5..da3beb0bf2fa2 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c @@ -1311,8 +1311,11 @@ static int sdma_v6_0_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (amdgpu_sriov_vf(adev)) + if (amdgpu_sriov_vf(adev)) { + /* disable the scheduler for SDMA */ + amdgpu_sdma_unset_buffer_funcs_helper(adev); return 0; + } sdma_v6_0_ctx_switch_enable(adev, false); sdma_v6_0_enable(adev, false); -- GitLab From 2cc4a5914ce952d6fc83b0f8089a23095ad4f677 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev <aleksei.kodanev@bell-sw.com> Date: Tue, 4 Oct 2022 11:14:01 +0300 Subject: [PATCH 1686/2223] drm/amd/pm: vega10_hwmgr: fix potential off-by-one overflow in 'performance_levels' Since 'hardwareActivityPerformanceLevels' is set to the size of the 'performance_levels' array in vega10_hwmgr_backend_init(), using the '<=' assertion to check for the next index value is incorrect. Replace it with '<'. 
Detected using the static analysis tool - Svace. Fixes: f83a9991648b ("drm/amd/powerplay: add Vega10 powerplay support (v5)") Reviewed-by: Evan Quan <evan.quan@amd.com> Signed-off-by: Alexey Kodanev <aleksei.kodanev@bell-sw.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c index 99bfe5efe1710..c8c9fb827bda1 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c @@ -3155,7 +3155,7 @@ static int vega10_get_pp_table_entry_callback_func(struct pp_hwmgr *hwmgr, return -1); PP_ASSERT_WITH_CODE( - (vega10_ps->performance_level_count <= + (vega10_ps->performance_level_count < hwmgr->platform_descriptor. hardwareActivityPerformanceLevels), "Performance levels exceeds Driver limit!", -- GitLab From d2bd0831b51d1123fc86c019db3452d6a1ce5029 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev <aleksei.kodanev@bell-sw.com> Date: Tue, 4 Oct 2022 11:14:02 +0300 Subject: [PATCH 1687/2223] drm/amd/pm: smu7_hwmgr: fix potential off-by-one overflow in 'performance_levels' Since 'hardwareActivityPerformanceLevels' is set to the size of the 'performance_levels' array in smu7_hwmgr_backend_init(), using the '<=' assertion to check for the next index value is incorrect. Replace it with '<'. Detected using the static analysis tool - Svace. 
Fixes: 599a7e9fe1b6 ("drm/amd/powerplay: implement smu7 hwmgr to manager asics with smu ip version 7.") Reviewed-by: Evan Quan <evan.quan@amd.com> Signed-off-by: Alexey Kodanev <aleksei.kodanev@bell-sw.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c index e4fcbf8a7eb5c..7ef7e81525a30 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c @@ -3603,7 +3603,7 @@ static int smu7_get_pp_table_entry_callback_func_v1(struct pp_hwmgr *hwmgr, return -EINVAL); PP_ASSERT_WITH_CODE( - (smu7_power_state->performance_level_count <= + (smu7_power_state->performance_level_count < hwmgr->platform_descriptor.hardwareActivityPerformanceLevels), "Performance levels exceeds Driver limit!", return -EINVAL); -- GitLab From faf4d8e07f5b67bece91723ad3e8b3f88a3dbf23 Mon Sep 17 00:00:00 2001 From: Guenter Roeck <linux@roeck-us.net> Date: Sun, 9 Oct 2022 23:05:12 -0700 Subject: [PATCH 1688/2223] drm/amd/display: fix array-bounds error in dc_stream_remove_writeback() [take 2] Commit 5d8c3e836fc2 ("drm/amd/display: fix array-bounds error in dc_stream_remove_writeback()") tried to fix an array bounds error seen with gcc 12.0. Unfortunately, that results in another array bounds error, seen with older versions of gcc. Building csky:allmodconfig ... 
failed -------------- Error log: drivers/gpu/drm/amd/amdgpu/../display/dc/core/dc_stream.c: In function 'dc_stream_remove_writeback': drivers/gpu/drm/amd/amdgpu/../display/dc/core/dc_stream.c:527:83: error: array subscript 1 is above array bounds of 'struct dc_writeback_info[1]' [-Werror=array-bounds] 527 | stream->writeback_info[j] = stream->writeback_info[i]; | ~~~~~~~~~~~~~~~~~~~~~~^~~ In file included from drivers/gpu/drm/amd/amdgpu/../display/dc/dc.h:1269, from drivers/gpu/drm/amd/amdgpu/../display/dc/inc/core_types.h:29, from drivers/gpu/drm/amd/amdgpu/../display/dc/basics/dc_common.h:29, from drivers/gpu/drm/amd/amdgpu/../display/dc/core/dc_stream.c:27: drivers/gpu/drm/amd/amdgpu/../display/dc/dc_stream.h:241:34: note: while referencing 'writeback_info' 241 | struct dc_writeback_info writeback_info[MAX_DWB_PIPES]; We could check both i and j for overflow to fix the problem. That would, however, not make much sense since it is known and provable that j <= i. Also, the check introduced with commit 5d8c3e836fc2 does not really add value since it checks if j < MAX_DWB_PIPES. Since it is known that j <= i, it would make more sense to check if i < MAX_DWB_PIPES. Unfortunately, that does not help to solve the problem observed here: gcc still complains. To solve the problem, replace the subsequent check for 'i != j' with 'j < i'. This is identical to the original check since we know that j <= i, and it makes all versions of gcc happy. Drop the check introduced with commit 5d8c3e836fc2 since it is not really useful and does not solve the problem. 
Cc: Aurabindo Pillai <aurabindo.pillai@amd.com> Cc: Hamza Mahfooz <hamza.mahfooz@amd.com> Fixes: 5d8c3e836fc2 ("drm/amd/display: fix array-bounds error in dc_stream_remove_writeback()") Signed-off-by: Guenter Roeck <linux@roeck-us.net> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/core/dc_stream.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_stream.c b/drivers/gpu/drm/amd/display/dc/core/dc_stream.c index 9998f58c14b99..38d71b5c1f2d5 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_stream.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_stream.c @@ -524,9 +524,9 @@ bool dc_stream_remove_writeback(struct dc *dc, } /* remove writeback info for disabled writeback pipes from stream */ - for (i = 0, j = 0; i < stream->num_wb_info && j < MAX_DWB_PIPES; i++) { + for (i = 0, j = 0; i < stream->num_wb_info; i++) { if (stream->writeback_info[i].wb_enabled) { - if (i != j) + if (j < i) /* trim the array */ stream->writeback_info[j] = stream->writeback_info[i]; j++; -- GitLab From 32391e646a71fc4cca4a74740bf401423d7a926d Mon Sep 17 00:00:00 2001 From: Maksym Glubokiy <maksym.glubokiy@plvision.eu> Date: Thu, 6 Oct 2022 22:06:00 +0300 Subject: [PATCH 1689/2223] net: prestera: span: do not unbind things things that were never bound Fixes: 13defa275eef ("net: marvell: prestera: Add matchall support") Signed-off-by: Maksym Glubokiy <maksym.glubokiy@plvision.eu> Link: https://lore.kernel.org/r/20221006190600.881740-1-maksym.glubokiy@plvision.eu Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/ethernet/marvell/prestera/prestera_span.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_span.c b/drivers/net/ethernet/marvell/prestera/prestera_span.c index f0e9d6ea88c5f..1005182ce3bc1 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_span.c +++ 
b/drivers/net/ethernet/marvell/prestera/prestera_span.c @@ -107,7 +107,7 @@ static int prestera_span_put(struct prestera_switch *sw, u8 span_id) entry = prestera_span_entry_find_by_id(sw->span, span_id); if (!entry) - return false; + return -ENOENT; if (!refcount_dec_and_test(&entry->ref_count)) return 0; @@ -151,6 +151,9 @@ int prestera_span_rule_del(struct prestera_flow_block_binding *binding, { int err; + if (binding->span_id == PRESTERA_SPAN_INVALID_ID) + return -ENOENT; + err = prestera_hw_span_unbind(binding->port, ingress); if (err) return err; -- GitLab From a390e03401e908ebfecdab0c53b70ff512f11d71 Mon Sep 17 00:00:00 2001 From: Florian Fainelli <f.fainelli@gmail.com> Date: Thu, 6 Oct 2022 20:42:01 -0700 Subject: [PATCH 1690/2223] net: systemport: Enable all RX descriptors for SYSTEMPORT Lite The original commit that added support for the SYSTEMPORT Lite variant halved the number of RX descriptors due to a confusion between the number of descriptors and the number of descriptor words. There are 512 descriptor *words* which means 256 descriptors total. 
Fixes: 44a4524c54af ("net: systemport: Add support for SYSTEMPORT Lite") Signed-off-by: Florian Fainelli <f.fainelli@gmail.com> Link: https://lore.kernel.org/r/20221007034201.4126054-1-f.fainelli@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/ethernet/broadcom/bcmsysport.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h b/drivers/net/ethernet/broadcom/bcmsysport.h index 16b73bb9acc78..5af16e5f9ad08 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.h +++ b/drivers/net/ethernet/broadcom/bcmsysport.h @@ -484,7 +484,7 @@ struct bcm_rsb { /* Number of Receive hardware descriptor words */ #define SP_NUM_HW_RX_DESC_WORDS 1024 -#define SP_LT_NUM_HW_RX_DESC_WORDS 256 +#define SP_LT_NUM_HW_RX_DESC_WORDS 512 /* Internal linked-list RAM size */ #define SP_NUM_TX_DESC 1536 -- GitLab From 5b4c189d660a9b8a852f0863360eb40a100226fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= <kabel@kernel.org> Date: Fri, 7 Oct 2022 10:48:44 +0200 Subject: [PATCH 1691/2223] net: sfp: fill also 5gbase-r and 25gbase-r modes in sfp_parse_support() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fill in also 5gbase-r and 25gbase-r PHY interface modes into the phy_interface_t bitmap in sfp_parse_support(). 
Fixes: fd580c983031 ("net: sfp: augment SFP parsing with phy_interface_t bitmap") Signed-off-by: Marek Behún <kabel@kernel.org> Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk> Link: https://lore.kernel.org/r/20221007084844.20352-1-kabel@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/phy/sfp-bus.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index 29e3fa86bac36..daac293e8edec 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -257,6 +257,7 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, case SFF8024_ECC_100GBASE_SR4_25GBASE_SR: phylink_set(modes, 100000baseSR4_Full); phylink_set(modes, 25000baseSR_Full); + __set_bit(PHY_INTERFACE_MODE_25GBASER, interfaces); break; case SFF8024_ECC_100GBASE_LR4_25GBASE_LR: case SFF8024_ECC_100GBASE_ER4_25GBASE_ER: @@ -268,6 +269,7 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, case SFF8024_ECC_25GBASE_CR_S: case SFF8024_ECC_25GBASE_CR_N: phylink_set(modes, 25000baseCR_Full); + __set_bit(PHY_INTERFACE_MODE_25GBASER, interfaces); break; case SFF8024_ECC_10GBASE_T_SFI: case SFF8024_ECC_10GBASE_T_SR: @@ -276,6 +278,7 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, break; case SFF8024_ECC_5GBASE_T: phylink_set(modes, 5000baseT_Full); + __set_bit(PHY_INTERFACE_MODE_5GBASER, interfaces); break; case SFF8024_ECC_2_5GBASE_T: phylink_set(modes, 2500baseT_Full); -- GitLab From b15e2e49bfc4965d86b9bc4a8426d53ec90a7192 Mon Sep 17 00:00:00 2001 From: Louis Peens <louis.peens@corigine.com> Date: Fri, 7 Oct 2022 11:21:32 +0200 Subject: [PATCH 1692/2223] nfp: flower: fix incorrect struct type in GRE key_size Looks like a copy-paste error sneaked in here at some point, causing the key_size for these tunnels to be calculated incorrectly. This size ends up being sent to the firmware, causing unexpected behaviour in some cases. 
Fixes: 78a722af4ad9 ("nfp: flower: compile match for IPv6 tunnels") Reported-by: Chaoyong He <chaoyong.he@corigine.com> Signed-off-by: Louis Peens <louis.peens@corigine.com> Signed-off-by: Simon Horman <simon.horman@corigine.com> Link: https://lore.kernel.org/r/20221007092132.218386-1-simon.horman@corigine.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/ethernet/netronome/nfp/flower/offload.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 3ab3e4536b998..8593cafa63683 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -373,10 +373,10 @@ nfp_flower_calculate_key_layers(struct nfp_app *app, if (ipv6_tun) { key_layer_two |= NFP_FLOWER_LAYER2_TUN_IPV6; key_size += - sizeof(struct nfp_flower_ipv6_udp_tun); + sizeof(struct nfp_flower_ipv6_gre_tun); } else { key_size += - sizeof(struct nfp_flower_ipv4_udp_tun); + sizeof(struct nfp_flower_ipv4_gre_tun); } if (enc_op.key) { -- GitLab From 096f2a0c6469c8a8e70cfbb83345b7ada2929f13 Mon Sep 17 00:00:00 2001 From: Maxime Ripard <maxime@cerno.tech> Date: Mon, 10 Oct 2022 16:47:38 +0200 Subject: [PATCH 1693/2223] clk: Update req_rate on __clk_recalc_rates() Commit cb1b1dd96241 ("clk: Set req_rate on reparenting") introduced a new function, clk_core_update_orphan_child_rates(), that updates the req_rate field on reparenting. It turns out that that function will interfere with the clock notifying done by __clk_recalc_rates(). This ends up reporting the new rate in both the old_rate and new_rate fields of struct clk_notifier_data. Since clk_core_update_orphan_child_rates() is basically __clk_recalc_rates() without the notifiers, and with the req_rate field update, we can drop clk_core_update_orphan_child_rates() entirely, and make __clk_recalc_rates() update req_rate. 
However, __clk_recalc_rates() is being called in several code paths: when retrieving a rate (most likely through clk_get_rate()), when changing parents (through clk_set_rate() or clk_hw_reparent()), or when updating the orphan status (through clk_core_reparent_orphans_nolock(), called at registration). Updating req_rate on reparenting or initialisation makes sense, but we shouldn't do it on clk_get_rate(). Thus an extra flag has been added to update or not req_rate depending on the context. Fixes: cb1b1dd96241 ("clk: Set req_rate on reparenting") Link: https://lore.kernel.org/linux-clk/0acc7217-762c-7c0d-45a0-55c384824ce4@samsung.com/ Link: https://lore.kernel.org/linux-clk/Y0QNSx+ZgqKSvPOC@sirena.org.uk/ Reported-by: Marek Szyprowski <m.szyprowski@samsung.com> Reported-by: Mark Brown <broonie@kernel.org> Suggested-by: Stephen Boyd <sboyd@kernel.org> Signed-off-by: Maxime Ripard <maxime@cerno.tech> Link: https://lore.kernel.org/r/20221010-rpi-clk-fixes-again-v1-1-d87ba82ac404@cerno.tech Tested-by: Marek Szyprowski <m.szyprowski@samsung.com> Signed-off-by: Stephen Boyd <sboyd@kernel.org> --- drivers/clk/clk.c | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index ec518dc5d4629..47b33c5b28e16 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -1760,6 +1760,7 @@ static unsigned long clk_recalc(struct clk_core *core, /** * __clk_recalc_rates * @core: first clk in the subtree + * @update_req: Whether req_rate should be updated with the new rate * @msg: notification type (see include/linux/clk.h) * * Walks the subtree of clks starting with clk and recalculates rates as it @@ -1769,7 +1770,8 @@ static unsigned long clk_recalc(struct clk_core *core, * clk_recalc_rates also propagates the POST_RATE_CHANGE notification, * if necessary. 
*/ -static void __clk_recalc_rates(struct clk_core *core, unsigned long msg) +static void __clk_recalc_rates(struct clk_core *core, bool update_req, + unsigned long msg) { unsigned long old_rate; unsigned long parent_rate = 0; @@ -1783,6 +1785,8 @@ static void __clk_recalc_rates(struct clk_core *core, unsigned long msg) parent_rate = core->parent->rate; core->rate = clk_recalc(core, parent_rate); + if (update_req) + core->req_rate = core->rate; /* * ignore NOTIFY_STOP and NOTIFY_BAD return values for POST_RATE_CHANGE @@ -1792,13 +1796,13 @@ static void __clk_recalc_rates(struct clk_core *core, unsigned long msg) __clk_notify(core, msg, old_rate, core->rate); hlist_for_each_entry(child, &core->children, child_node) - __clk_recalc_rates(child, msg); + __clk_recalc_rates(child, update_req, msg); } static unsigned long clk_core_get_rate_recalc(struct clk_core *core) { if (core && (core->flags & CLK_GET_RATE_NOCACHE)) - __clk_recalc_rates(core, 0); + __clk_recalc_rates(core, false, 0); return clk_core_get_rate_nolock(core); } @@ -1901,23 +1905,6 @@ static void clk_core_update_orphan_status(struct clk_core *core, bool is_orphan) clk_core_update_orphan_status(child, is_orphan); } -/* - * Update the orphan rate and req_rate of @core and all its children. 
- */ -static void clk_core_update_orphan_child_rates(struct clk_core *core) -{ - struct clk_core *child; - unsigned long parent_rate = 0; - - if (core->parent) - parent_rate = core->parent->rate; - - core->rate = core->req_rate = clk_recalc(core, parent_rate); - - hlist_for_each_entry(child, &core->children, child_node) - clk_core_update_orphan_child_rates(child); -} - static void clk_reparent(struct clk_core *core, struct clk_core *new_parent) { bool was_orphan = core->orphan; @@ -1987,8 +1974,6 @@ static struct clk_core *__clk_set_parent_before(struct clk_core *core, clk_reparent(core, parent); clk_enable_unlock(flags); - clk_core_update_orphan_child_rates(core); - return old_parent; } @@ -2034,7 +2019,6 @@ static int __clk_set_parent(struct clk_core *core, struct clk_core *parent, clk_reparent(core, old_parent); clk_enable_unlock(flags); - clk_core_update_orphan_child_rates(core); __clk_set_parent_after(core, old_parent, parent); return ret; @@ -2658,9 +2642,8 @@ static void clk_core_reparent(struct clk_core *core, struct clk_core *new_parent) { clk_reparent(core, new_parent); - clk_core_update_orphan_child_rates(core); __clk_recalc_accuracies(core); - __clk_recalc_rates(core, POST_RATE_CHANGE); + __clk_recalc_rates(core, true, POST_RATE_CHANGE); } void clk_hw_reparent(struct clk_hw *hw, struct clk_hw *new_parent) @@ -2744,9 +2727,9 @@ static int clk_core_set_parent_nolock(struct clk_core *core, /* propagate rate an accuracy recalculation accordingly */ if (ret) { - __clk_recalc_rates(core, ABORT_RATE_CHANGE); + __clk_recalc_rates(core, true, ABORT_RATE_CHANGE); } else { - __clk_recalc_rates(core, POST_RATE_CHANGE); + __clk_recalc_rates(core, true, POST_RATE_CHANGE); __clk_recalc_accuracies(core); } @@ -3643,7 +3626,7 @@ static void clk_core_reparent_orphans_nolock(void) __clk_set_parent_before(orphan, parent); __clk_set_parent_after(orphan, parent, NULL); __clk_recalc_accuracies(orphan); - __clk_recalc_rates(orphan, 0); + __clk_recalc_rates(orphan, true, 0); /* 
* __clk_init_parent() will set the initial req_rate to -- GitLab From 589a2004881f0941ca46146a5de68b3666d1d54a Mon Sep 17 00:00:00 2001 From: Maxime Ripard <maxime@cerno.tech> Date: Mon, 10 Oct 2022 16:47:39 +0200 Subject: [PATCH 1694/2223] clk: tests: Add tests for notifiers We've recently encountered a regression due to the rates reported through the clk_notifier_data being off when changing parents. Let's add a test suite and a test to make sure that we do get notified and with the proper rates. Suggested-by: Stephen Boyd <sboyd@kernel.org> Signed-off-by: Maxime Ripard <maxime@cerno.tech> Link: https://lore.kernel.org/r/20221010-rpi-clk-fixes-again-v1-2-d87ba82ac404@cerno.tech Tested-by: Marek Szyprowski <m.szyprowski@samsung.com> Signed-off-by: Stephen Boyd <sboyd@kernel.org> --- drivers/clk/clk_test.c | 156 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) diff --git a/drivers/clk/clk_test.c b/drivers/clk/clk_test.c index 509256c5567aa..f9a5c2964c65d 100644 --- a/drivers/clk/clk_test.c +++ b/drivers/clk/clk_test.c @@ -2239,10 +2239,166 @@ static struct kunit_suite clk_leaf_mux_set_rate_parent_test_suite = { .test_cases = clk_leaf_mux_set_rate_parent_test_cases, }; +struct clk_mux_notifier_rate_change { + bool done; + unsigned long old_rate; + unsigned long new_rate; + wait_queue_head_t wq; +}; + +struct clk_mux_notifier_ctx { + struct clk_multiple_parent_ctx mux_ctx; + struct clk *clk; + struct notifier_block clk_nb; + struct clk_mux_notifier_rate_change pre_rate_change; + struct clk_mux_notifier_rate_change post_rate_change; +}; + +#define NOTIFIER_TIMEOUT_MS 100 + +static int clk_mux_notifier_callback(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct clk_notifier_data *clk_data = data; + struct clk_mux_notifier_ctx *ctx = container_of(nb, + struct clk_mux_notifier_ctx, + clk_nb); + + if (action & PRE_RATE_CHANGE) { + ctx->pre_rate_change.old_rate = clk_data->old_rate; + ctx->pre_rate_change.new_rate =
clk_data->new_rate; + ctx->pre_rate_change.done = true; + wake_up_interruptible(&ctx->pre_rate_change.wq); + } + + if (action & POST_RATE_CHANGE) { + ctx->post_rate_change.old_rate = clk_data->old_rate; + ctx->post_rate_change.new_rate = clk_data->new_rate; + ctx->post_rate_change.done = true; + wake_up_interruptible(&ctx->post_rate_change.wq); + } + + return 0; +} + +static int clk_mux_notifier_test_init(struct kunit *test) +{ + struct clk_mux_notifier_ctx *ctx; + const char *top_parents[2] = { "parent-0", "parent-1" }; + int ret; + + ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + test->priv = ctx; + ctx->clk_nb.notifier_call = clk_mux_notifier_callback; + init_waitqueue_head(&ctx->pre_rate_change.wq); + init_waitqueue_head(&ctx->post_rate_change.wq); + + ctx->mux_ctx.parents_ctx[0].hw.init = CLK_HW_INIT_NO_PARENT("parent-0", + &clk_dummy_rate_ops, + 0); + ctx->mux_ctx.parents_ctx[0].rate = DUMMY_CLOCK_RATE_1; + ret = clk_hw_register(NULL, &ctx->mux_ctx.parents_ctx[0].hw); + if (ret) + return ret; + + ctx->mux_ctx.parents_ctx[1].hw.init = CLK_HW_INIT_NO_PARENT("parent-1", + &clk_dummy_rate_ops, + 0); + ctx->mux_ctx.parents_ctx[1].rate = DUMMY_CLOCK_RATE_2; + ret = clk_hw_register(NULL, &ctx->mux_ctx.parents_ctx[1].hw); + if (ret) + return ret; + + ctx->mux_ctx.current_parent = 0; + ctx->mux_ctx.hw.init = CLK_HW_INIT_PARENTS("test-mux", top_parents, + &clk_multiple_parents_mux_ops, + 0); + ret = clk_hw_register(NULL, &ctx->mux_ctx.hw); + if (ret) + return ret; + + ctx->clk = clk_hw_get_clk(&ctx->mux_ctx.hw, NULL); + ret = clk_notifier_register(ctx->clk, &ctx->clk_nb); + if (ret) + return ret; + + return 0; +} + +static void clk_mux_notifier_test_exit(struct kunit *test) +{ + struct clk_mux_notifier_ctx *ctx = test->priv; + struct clk *clk = ctx->clk; + + clk_notifier_unregister(clk, &ctx->clk_nb); + clk_put(clk); + + clk_hw_unregister(&ctx->mux_ctx.hw); + clk_hw_unregister(&ctx->mux_ctx.parents_ctx[0].hw); + 
clk_hw_unregister(&ctx->mux_ctx.parents_ctx[1].hw); +} + +/* + * Test that if the we have a notifier registered on a mux, the core + * will notify us when we switch to another parent, and with the proper + * old and new rates. + */ +static void clk_mux_notifier_set_parent_test(struct kunit *test) +{ + struct clk_mux_notifier_ctx *ctx = test->priv; + struct clk_hw *hw = &ctx->mux_ctx.hw; + struct clk *clk = clk_hw_get_clk(hw, NULL); + struct clk *new_parent = clk_hw_get_clk(&ctx->mux_ctx.parents_ctx[1].hw, NULL); + int ret; + + ret = clk_set_parent(clk, new_parent); + KUNIT_ASSERT_EQ(test, ret, 0); + + ret = wait_event_interruptible_timeout(ctx->pre_rate_change.wq, + ctx->pre_rate_change.done, + msecs_to_jiffies(NOTIFIER_TIMEOUT_MS)); + KUNIT_ASSERT_GT(test, ret, 0); + + KUNIT_EXPECT_EQ(test, ctx->pre_rate_change.old_rate, DUMMY_CLOCK_RATE_1); + KUNIT_EXPECT_EQ(test, ctx->pre_rate_change.new_rate, DUMMY_CLOCK_RATE_2); + + ret = wait_event_interruptible_timeout(ctx->post_rate_change.wq, + ctx->post_rate_change.done, + msecs_to_jiffies(NOTIFIER_TIMEOUT_MS)); + KUNIT_ASSERT_GT(test, ret, 0); + + KUNIT_EXPECT_EQ(test, ctx->post_rate_change.old_rate, DUMMY_CLOCK_RATE_1); + KUNIT_EXPECT_EQ(test, ctx->post_rate_change.new_rate, DUMMY_CLOCK_RATE_2); + + clk_put(new_parent); + clk_put(clk); +} + +static struct kunit_case clk_mux_notifier_test_cases[] = { + KUNIT_CASE(clk_mux_notifier_set_parent_test), + {} +}; + +/* + * Test suite for a mux with multiple parents, and a notifier registered + * on the mux. + * + * These tests exercise the behaviour of notifiers. 
+ */ +static struct kunit_suite clk_mux_notifier_test_suite = { + .name = "clk-mux-notifier", + .init = clk_mux_notifier_test_init, + .exit = clk_mux_notifier_test_exit, + .test_cases = clk_mux_notifier_test_cases, +}; + kunit_test_suites( &clk_leaf_mux_set_rate_parent_test_suite, &clk_test_suite, &clk_multiple_parents_mux_test_suite, + &clk_mux_notifier_test_suite, &clk_orphan_transparent_multiple_parent_mux_test_suite, &clk_orphan_transparent_single_parent_test_suite, &clk_orphan_two_level_root_last_test_suite, -- GitLab From 4f2e56a59b9947b3e698d3cabcb858765c12b1e8 Mon Sep 17 00:00:00 2001 From: Saranya Gopal <saranya.gopal@intel.com> Date: Tue, 11 Oct 2022 10:19:16 +0530 Subject: [PATCH 1695/2223] ALSA: hda/realtek: Add Intel Reference SSID to support headset keys This patch fixes the issue with 3.5mm headset keys on RPL-P platform. [ Rearranged the entry in SSID order by tiwai ] Signed-off-by: Saranya Gopal <saranya.gopal@intel.com> Signed-off-by: Ninad Naik <ninad.naik@intel.com> Cc: <stable@vger.kernel.org> Link: https://lore.kernel.org/r/20221011044916.2278867-1-saranya.gopal@intel.com Signed-off-by: Takashi Iwai <tiwai@suse.de> --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 54a0f6b4ffc77..4b076912bbf4b 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9445,6 +9445,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x10ec, 0x10f2, "Intel Reference board", ALC700_FIXUP_INTEL_REFERENCE), SND_PCI_QUIRK(0x10ec, 0x118c, "Medion EE4254 MD62100", ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE), SND_PCI_QUIRK(0x10ec, 0x1230, "Intel Reference board", ALC295_FIXUP_CHROME_BOOK), + SND_PCI_QUIRK(0x10ec, 0x124c, "Intel Reference board", ALC295_FIXUP_CHROME_BOOK), SND_PCI_QUIRK(0x10ec, 0x1252, "Intel Reference board", ALC295_FIXUP_CHROME_BOOK), SND_PCI_QUIRK(0x10ec, 0x1254, "Intel Reference board", 
ALC295_FIXUP_CHROME_BOOK), SND_PCI_QUIRK(0x10f7, 0x8338, "Panasonic CF-SZ6", ALC269_FIXUP_HEADSET_MODE), -- GitLab From 1499ecaea9d2ba68d5e18d80573b4561a8dc4ee7 Mon Sep 17 00:00:00 2001 From: Anssi Hannula <anssi.hannula@bitwise.fi> Date: Mon, 10 Oct 2022 17:08:26 +0200 Subject: [PATCH 1696/2223] can: kvaser_usb_leaf: Fix overread with an invalid command For command events read from the device, kvaser_usb_leaf_read_bulk_callback() verifies that cmd->len does not exceed the size of the received data, but the actual kvaser_cmd handlers will happily read any kvaser_cmd fields without checking for cmd->len. This can cause an overread if the last cmd in the buffer is shorter than expected for the command type (with cmd->len showing the actual short size). Maximum overread seems to be 22 bytes (CMD_LEAF_LOG_MESSAGE), some of which are delivered to userspace as-is. Fix that by verifying the length of command before handling it. This issue can only occur after RX URBs have been set up, i.e. the interface has been opened at least once. 
Cc: stable@vger.kernel.org Fixes: 080f40a6fa28 ("can: kvaser_usb: Add support for Kvaser CAN/USB devices") Tested-by: Jimmy Assarsson <extja@kvaser.com> Signed-off-by: Anssi Hannula <anssi.hannula@bitwise.fi> Signed-off-by: Jimmy Assarsson <extja@kvaser.com> Link: https://lore.kernel.org/all/20221010150829.199676-2-extja@kvaser.com Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de> --- .../net/can/usb/kvaser_usb/kvaser_usb_leaf.c | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c index 07f687f29b341..8e11cda85624c 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c @@ -310,6 +310,38 @@ struct kvaser_cmd { } u; } __packed; +#define CMD_SIZE_ANY 0xff +#define kvaser_fsize(field) sizeof_field(struct kvaser_cmd, field) + +static const u8 kvaser_usb_leaf_cmd_sizes_leaf[] = { + [CMD_START_CHIP_REPLY] = kvaser_fsize(u.simple), + [CMD_STOP_CHIP_REPLY] = kvaser_fsize(u.simple), + [CMD_GET_CARD_INFO_REPLY] = kvaser_fsize(u.cardinfo), + [CMD_TX_ACKNOWLEDGE] = kvaser_fsize(u.tx_acknowledge_header), + [CMD_GET_SOFTWARE_INFO_REPLY] = kvaser_fsize(u.leaf.softinfo), + [CMD_RX_STD_MESSAGE] = kvaser_fsize(u.leaf.rx_can), + [CMD_RX_EXT_MESSAGE] = kvaser_fsize(u.leaf.rx_can), + [CMD_LEAF_LOG_MESSAGE] = kvaser_fsize(u.leaf.log_message), + [CMD_CHIP_STATE_EVENT] = kvaser_fsize(u.leaf.chip_state_event), + [CMD_CAN_ERROR_EVENT] = kvaser_fsize(u.leaf.error_event), + /* ignored events: */ + [CMD_FLUSH_QUEUE_REPLY] = CMD_SIZE_ANY, +}; + +static const u8 kvaser_usb_leaf_cmd_sizes_usbcan[] = { + [CMD_START_CHIP_REPLY] = kvaser_fsize(u.simple), + [CMD_STOP_CHIP_REPLY] = kvaser_fsize(u.simple), + [CMD_GET_CARD_INFO_REPLY] = kvaser_fsize(u.cardinfo), + [CMD_TX_ACKNOWLEDGE] = kvaser_fsize(u.tx_acknowledge_header), + [CMD_GET_SOFTWARE_INFO_REPLY] = kvaser_fsize(u.usbcan.softinfo), + [CMD_RX_STD_MESSAGE] = 
kvaser_fsize(u.usbcan.rx_can), + [CMD_RX_EXT_MESSAGE] = kvaser_fsize(u.usbcan.rx_can), + [CMD_CHIP_STATE_EVENT] = kvaser_fsize(u.usbcan.chip_state_event), + [CMD_CAN_ERROR_EVENT] = kvaser_fsize(u.usbcan.error_event), + /* ignored events: */ + [CMD_USBCAN_CLOCK_OVERFLOW_EVENT] = CMD_SIZE_ANY, +}; + /* Summary of a kvaser error event, for a unified Leaf/Usbcan error * handling. Some discrepancies between the two families exist: * @@ -397,6 +429,43 @@ static const struct kvaser_usb_dev_cfg kvaser_usb_leaf_imx_dev_cfg_32mhz = { .bittiming_const = &kvaser_usb_flexc_bittiming_const, }; +static int kvaser_usb_leaf_verify_size(const struct kvaser_usb *dev, + const struct kvaser_cmd *cmd) +{ + /* buffer size >= cmd->len ensured by caller */ + u8 min_size = 0; + + switch (dev->driver_info->family) { + case KVASER_LEAF: + if (cmd->id < ARRAY_SIZE(kvaser_usb_leaf_cmd_sizes_leaf)) + min_size = kvaser_usb_leaf_cmd_sizes_leaf[cmd->id]; + break; + case KVASER_USBCAN: + if (cmd->id < ARRAY_SIZE(kvaser_usb_leaf_cmd_sizes_usbcan)) + min_size = kvaser_usb_leaf_cmd_sizes_usbcan[cmd->id]; + break; + } + + if (min_size == CMD_SIZE_ANY) + return 0; + + if (min_size) { + min_size += CMD_HEADER_LEN; + if (cmd->len >= min_size) + return 0; + + dev_err_ratelimited(&dev->intf->dev, + "Received command %u too short (size %u, needed %u)", + cmd->id, cmd->len, min_size); + return -EIO; + } + + dev_warn_ratelimited(&dev->intf->dev, + "Unhandled command (%d, size %d)\n", + cmd->id, cmd->len); + return -EINVAL; +} + static void * kvaser_usb_leaf_frame_to_cmd(const struct kvaser_usb_net_priv *priv, const struct sk_buff *skb, int *cmd_len, @@ -502,6 +571,9 @@ static int kvaser_usb_leaf_wait_cmd(const struct kvaser_usb *dev, u8 id, end: kfree(buf); + if (err == 0) + err = kvaser_usb_leaf_verify_size(dev, cmd); + return err; } @@ -1133,6 +1205,9 @@ static void kvaser_usb_leaf_stop_chip_reply(const struct kvaser_usb *dev, static void kvaser_usb_leaf_handle_command(const struct kvaser_usb *dev, const 
struct kvaser_cmd *cmd) { + if (kvaser_usb_leaf_verify_size(dev, cmd) < 0) + return; + switch (cmd->id) { case CMD_START_CHIP_REPLY: kvaser_usb_leaf_start_chip_reply(dev, cmd); -- GitLab From cd7f30e174d09a02ca2afa5ef093fb0f0352e0d8 Mon Sep 17 00:00:00 2001 From: Anssi Hannula <anssi.hannula@bitwise.fi> Date: Mon, 10 Oct 2022 17:08:27 +0200 Subject: [PATCH 1697/2223] can: kvaser_usb: Fix use of uninitialized completion flush_comp is initialized when CMD_FLUSH_QUEUE is sent to the device and completed when the device sends CMD_FLUSH_QUEUE_RESP. This causes completion of uninitialized completion if the device sends CMD_FLUSH_QUEUE_RESP before CMD_FLUSH_QUEUE is ever sent (e.g. as a response to a flush by a previously bound driver, or a misbehaving device). Fix that by initializing flush_comp in kvaser_usb_init_one() like the other completions. This issue is only triggerable after RX URBs have been set up, i.e. the interface has been opened at least once. Cc: stable@vger.kernel.org Fixes: aec5fb2268b7 ("can: kvaser_usb: Add support for Kvaser USB hydra family") Tested-by: Jimmy Assarsson <extja@kvaser.com> Signed-off-by: Anssi Hannula <anssi.hannula@bitwise.fi> Signed-off-by: Jimmy Assarsson <extja@kvaser.com> Link: https://lore.kernel.org/all/20221010150829.199676-3-extja@kvaser.com Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de> --- drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c | 1 + drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c index 824cab80aa02f..c2bce6773adc8 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c @@ -729,6 +729,7 @@ static int kvaser_usb_init_one(struct kvaser_usb *dev, int channel) init_usb_anchor(&priv->tx_submitted); init_completion(&priv->start_comp); init_completion(&priv->stop_comp); + 
init_completion(&priv->flush_comp); priv->can.ctrlmode_supported = 0; priv->dev = dev; diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c index 6871d474dabf2..7b52fda73d827 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c @@ -1916,7 +1916,7 @@ static int kvaser_usb_hydra_flush_queue(struct kvaser_usb_net_priv *priv) { int err; - init_completion(&priv->flush_comp); + reinit_completion(&priv->flush_comp); err = kvaser_usb_hydra_send_simple_cmd(priv->dev, CMD_FLUSH_QUEUE, priv->channel); -- GitLab From 455561fb618fde40558776b5b8435f9420f335db Mon Sep 17 00:00:00 2001 From: Anssi Hannula <anssi.hannula@bitwise.fi> Date: Mon, 10 Oct 2022 17:08:28 +0200 Subject: [PATCH 1698/2223] can: kvaser_usb_leaf: Fix TX queue out of sync after restart The TX queue seems to be implicitly flushed by the hardware during bus-off or bus-off recovery, but the driver does not reset the TX bookkeeping. Despite not resetting TX bookkeeping the driver still re-enables TX queue unconditionally, leading to "cannot find free context" / NETDEV_TX_BUSY errors if the TX queue was full at bus-off time. Fix that by resetting TX bookkeeping on CAN restart. Tested with 0bfd:0124 Kvaser Mini PCI Express 2xHS FW 4.18.778. 
Cc: stable@vger.kernel.org Fixes: 080f40a6fa28 ("can: kvaser_usb: Add support for Kvaser CAN/USB devices") Tested-by: Jimmy Assarsson <extja@kvaser.com> Signed-off-by: Anssi Hannula <anssi.hannula@bitwise.fi> Signed-off-by: Jimmy Assarsson <extja@kvaser.com> Link: https://lore.kernel.org/all/20221010150829.199676-4-extja@kvaser.com Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de> --- drivers/net/can/usb/kvaser_usb/kvaser_usb.h | 2 ++ drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c | 2 +- drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb.h b/drivers/net/can/usb/kvaser_usb/kvaser_usb.h index 841da29cef939..f6c0938027ece 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb.h +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb.h @@ -178,6 +178,8 @@ struct kvaser_usb_dev_cfg { extern const struct kvaser_usb_dev_ops kvaser_usb_hydra_dev_ops; extern const struct kvaser_usb_dev_ops kvaser_usb_leaf_dev_ops; +void kvaser_usb_unlink_tx_urbs(struct kvaser_usb_net_priv *priv); + int kvaser_usb_recv_cmd(const struct kvaser_usb *dev, void *cmd, int len, int *actual_len); diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c index c2bce6773adc8..e91648ed73862 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c @@ -477,7 +477,7 @@ static void kvaser_usb_reset_tx_urb_contexts(struct kvaser_usb_net_priv *priv) /* This method might sleep. Do not call it in the atomic context * of URB completions. 
*/ -static void kvaser_usb_unlink_tx_urbs(struct kvaser_usb_net_priv *priv) +void kvaser_usb_unlink_tx_urbs(struct kvaser_usb_net_priv *priv) { usb_kill_anchored_urbs(&priv->tx_submitted); kvaser_usb_reset_tx_urb_contexts(priv); diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c index 8e11cda85624c..59c220ef3049b 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c @@ -1426,6 +1426,8 @@ static int kvaser_usb_leaf_set_mode(struct net_device *netdev, switch (mode) { case CAN_MODE_START: + kvaser_usb_unlink_tx_urbs(priv); + err = kvaser_usb_leaf_simple_cmd_async(priv, CMD_START_CHIP); if (err) return err; -- GitLab From 0be1a655fe68c8e6dcadbcbddb69cf2fb29881f5 Mon Sep 17 00:00:00 2001 From: Anssi Hannula <anssi.hannula@bitwise.fi> Date: Mon, 10 Oct 2022 17:08:29 +0200 Subject: [PATCH 1699/2223] can: kvaser_usb_leaf: Fix CAN state after restart can_restart() expects CMD_START_CHIP to set the error state to ERROR_ACTIVE as it calls netif_carrier_on() immediately afterwards. Otherwise the user may immediately trigger restart again and hit a BUG_ON() in can_restart(). Fix kvaser_usb_leaf set_mode(CMD_START_CHIP) to set the expected state. 
Cc: stable@vger.kernel.org Fixes: 080f40a6fa28 ("can: kvaser_usb: Add support for Kvaser CAN/USB devices") Tested-by: Jimmy Assarsson <extja@kvaser.com> Signed-off-by: Anssi Hannula <anssi.hannula@bitwise.fi> Signed-off-by: Jimmy Assarsson <extja@kvaser.com> Link: https://lore.kernel.org/all/20221010150829.199676-5-extja@kvaser.com Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de> --- drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c index 59c220ef3049b..50f2ac8319ff8 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c @@ -1431,6 +1431,8 @@ static int kvaser_usb_leaf_set_mode(struct net_device *netdev, err = kvaser_usb_leaf_simple_cmd_async(priv, CMD_START_CHIP); if (err) return err; + + priv->can.state = CAN_STATE_ERROR_ACTIVE; break; default: return -EOPNOTSUPP; -- GitLab From a70aef7982b012e86dfd39fbb235e76a21ae778a Mon Sep 17 00:00:00 2001 From: Takashi Iwai <tiwai@suse.de> Date: Tue, 11 Oct 2022 09:01:46 +0200 Subject: [PATCH 1700/2223] ALSA: rawmidi: Drop register_mutex in snd_rawmidi_free() The register_mutex taken around the dev_unregister callback call in snd_rawmidi_free() may potentially lead to a mutex deadlock, when OSS emulation and a hot unplug are involved. Since the mutex doesn't protect the actual race (as the registration itself is already protected by another means), let's drop it. 
Link: https://lore.kernel.org/r/CAB7eexJP7w1B0mVgDF0dQ+gWor7UdkiwPczmL7pn91xx8xpzOA@mail.gmail.com Cc: <stable@vger.kernel.org> Link: https://lore.kernel.org/r/20221011070147.7611-1-tiwai@suse.de Signed-off-by: Takashi Iwai <tiwai@suse.de> --- sound/core/rawmidi.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c index 6963d5a487b32..d8edb60550724 100644 --- a/sound/core/rawmidi.c +++ b/sound/core/rawmidi.c @@ -1899,10 +1899,8 @@ static int snd_rawmidi_free(struct snd_rawmidi *rmidi) snd_info_free_entry(rmidi->proc_entry); rmidi->proc_entry = NULL; - mutex_lock(&register_mutex); if (rmidi->ops && rmidi->ops->dev_unregister) rmidi->ops->dev_unregister(rmidi); - mutex_unlock(&register_mutex); snd_rawmidi_free_substreams(&rmidi->streams[SNDRV_RAWMIDI_STREAM_INPUT]); snd_rawmidi_free_substreams(&rmidi->streams[SNDRV_RAWMIDI_STREAM_OUTPUT]); -- GitLab From 97d917879d7f92df09c3f21fd54609a8bcd654b2 Mon Sep 17 00:00:00 2001 From: Takashi Iwai <tiwai@suse.de> Date: Tue, 11 Oct 2022 09:01:47 +0200 Subject: [PATCH 1701/2223] ALSA: oss: Fix potential deadlock at unregistration We took sound_oss_mutex around the calls of unregister_sound_special() at unregistering OSS devices. This may, however, lead to a deadlock, because we manage the card release via the card's device object, and the release may happen at unregister_sound_special() call -- which will take sound_oss_mutex again in turn. Although the deadlock might be fixed by relaxing the rawmidi mutex in the previous commit, it's safer to move unregister_sound_special() calls themselves out of the sound_oss_mutex, too. The call is race-safe as the function has a spinlock protection by itself.
Link: https://lore.kernel.org/r/CAB7eexJP7w1B0mVgDF0dQ+gWor7UdkiwPczmL7pn91xx8xpzOA@mail.gmail.com Cc: <stable@vger.kernel.org> Link: https://lore.kernel.org/r/20221011070147.7611-2-tiwai@suse.de Signed-off-by: Takashi Iwai <tiwai@suse.de> --- sound/core/sound_oss.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sound/core/sound_oss.c b/sound/core/sound_oss.c index 7ed0a2a910352..2751bf2ff61bc 100644 --- a/sound/core/sound_oss.c +++ b/sound/core/sound_oss.c @@ -162,7 +162,6 @@ int snd_unregister_oss_device(int type, struct snd_card *card, int dev) mutex_unlock(&sound_oss_mutex); return -ENOENT; } - unregister_sound_special(minor); switch (SNDRV_MINOR_OSS_DEVICE(minor)) { case SNDRV_MINOR_OSS_PCM: track2 = SNDRV_MINOR_OSS(cidx, SNDRV_MINOR_OSS_AUDIO); @@ -174,12 +173,18 @@ int snd_unregister_oss_device(int type, struct snd_card *card, int dev) track2 = SNDRV_MINOR_OSS(cidx, SNDRV_MINOR_OSS_DMMIDI1); break; } - if (track2 >= 0) { - unregister_sound_special(track2); + if (track2 >= 0) snd_oss_minors[track2] = NULL; - } snd_oss_minors[minor] = NULL; mutex_unlock(&sound_oss_mutex); + + /* call unregister_sound_special() outside sound_oss_mutex; + * otherwise may deadlock, as it can trigger the release of a card + */ + unregister_sound_special(minor); + if (track2 >= 0) + unregister_sound_special(track2); + kfree(mptr); return 0; } -- GitLab From 47c44088ac089adfa2f852770ac11e3b7ce8d7c5 Mon Sep 17 00:00:00 2001 From: Felix Fietkau <nbd@nbd.name> Date: Wed, 5 Oct 2022 15:08:23 +0200 Subject: [PATCH 1702/2223] wifi: mt76: fix receiving LLC packets on mt7615/mt7915 When 802.3 decap offload is enabled, the hardware indicates header translation failure, whenever either the LLC-SNAP header was not found, or a VLAN header with an unrecognized tag is present. In that case, the hardware inserts a 2-byte length field after the MAC addresses. For VLAN packets, this tag needs to be removed.
However, for 802.3 LLC packets, the length bytes should be preserved, since there is no separate ethertype field in the data. This fixes an issue where the length field was omitted for LLC frames, causing them to be malformed after hardware decap. Fixes: 1eeff0b4c1a6 ("mt76: mt7915: fix decap offload corner case with 4-addr VLAN frames") Reported-by: Chad Monroe <chad.monroe@smartrg.com> Signed-off-by: Felix Fietkau <nbd@nbd.name> Signed-off-by: Kalle Valo <kvalo@kernel.org> Link: https://lore.kernel.org/r/20221005130824.23371-1-nbd@nbd.name --- drivers/net/wireless/mediatek/mt76/mt7615/mac.c | 8 ++++---- drivers/net/wireless/mediatek/mt76/mt7915/mac.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c index d6aae60c440de..cbc6859e38ace 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c @@ -610,14 +610,14 @@ static int mt7615_mac_fill_rx(struct mt7615_dev *dev, struct sk_buff *skb) * When header translation failure is indicated, * the hardware will insert an extra 2-byte field * containing the data length after the protocol - * type field. + * type field. This happens either when the LLC-SNAP + * pattern did not match, or if a VLAN header was + * detected. 
*/ pad_start = 12; if (get_unaligned_be16(skb->data + pad_start) == ETH_P_8021Q) pad_start += 4; - - if (get_unaligned_be16(skb->data + pad_start) != - skb->len - pad_start - 2) + else pad_start = 0; } diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c index be97dede2634d..e32092f67ea1b 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c @@ -446,14 +446,14 @@ mt7915_mac_fill_rx(struct mt7915_dev *dev, struct sk_buff *skb) * When header translation failure is indicated, * the hardware will insert an extra 2-byte field * containing the data length after the protocol - * type field. + * type field. This happens either when the LLC-SNAP + * pattern did not match, or if a VLAN header was + * detected. */ pad_start = 12; if (get_unaligned_be16(skb->data + pad_start) == ETH_P_8021Q) pad_start += 4; - - if (get_unaligned_be16(skb->data + pad_start) != - skb->len - pad_start - 2) + else pad_start = 0; } -- GitLab From 443dc85ad13eeb0340fa3a555c04a6c04c9b61ed Mon Sep 17 00:00:00 2001 From: Felix Fietkau <nbd@nbd.name> Date: Wed, 5 Oct 2022 15:08:24 +0200 Subject: [PATCH 1703/2223] wifi: mt76: fix rx checksum offload on mt7615/mt7915/mt7921 Checking the relevant rxd bits for the checksum information only indicates if the checksum verification was performed by the hardware and doesn't show actual checksum errors. Checksum errors are indicated in the info field of the DMA descriptor. Fix packets erroneously marked as CHECKSUM_UNNECESSARY by checking the extra bits as well. Those bits are only passed to the driver for MMIO devices at the moment, so limit checksum offload to those. 
Fixes: 2122dfbfd0bd ("mt76: mt7615: add rx checksum offload support") Fixes: 94244d2ea503 ("mt76: mt7915: add rx checksum offload support") Fixes: 0e75732764e8 ("mt76: mt7921: enable rx csum offload") Signed-off-by: Felix Fietkau <nbd@nbd.name> Signed-off-by: Kalle Valo <kvalo@kernel.org> Link: https://lore.kernel.org/r/20221005130824.23371-2-nbd@nbd.name --- drivers/net/wireless/mediatek/mt76/dma.c | 5 +---- drivers/net/wireless/mediatek/mt76/mt7615/mac.c | 4 +++- drivers/net/wireless/mediatek/mt76/mt7915/mac.c | 4 +++- drivers/net/wireless/mediatek/mt76/mt7921/mac.c | 4 +++- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c index 4901aa02b4fb1..7378c4d1e1567 100644 --- a/drivers/net/wireless/mediatek/mt76/dma.c +++ b/drivers/net/wireless/mediatek/mt76/dma.c @@ -696,10 +696,7 @@ mt76_dma_rx_process(struct mt76_dev *dev, struct mt76_queue *q, int budget) skb_reserve(skb, q->buf_offset); - if (q == &dev->q_rx[MT_RXQ_MCU]) { - u32 *rxfce = (u32 *)skb->cb; - *rxfce = info; - } + *(u32 *)skb->cb = info; __skb_put(skb, len); done++; diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c index cbc6859e38ace..2ce1705c0f433 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c @@ -345,6 +345,7 @@ static int mt7615_mac_fill_rx(struct mt7615_dev *dev, struct sk_buff *skb) u32 rxd1 = le32_to_cpu(rxd[1]); u32 rxd2 = le32_to_cpu(rxd[2]); u32 csum_mask = MT_RXD0_NORMAL_IP_SUM | MT_RXD0_NORMAL_UDP_TCP_SUM; + u32 csum_status = *(u32 *)skb->cb; bool unicast, hdr_trans, remove_pad, insert_ccmp_hdr = false; u16 hdr_gap; int phy_idx; @@ -394,7 +395,8 @@ static int mt7615_mac_fill_rx(struct mt7615_dev *dev, struct sk_buff *skb) spin_unlock_bh(&dev->sta_poll_lock); } - if ((rxd0 & csum_mask) == csum_mask) + if (mt76_is_mmio(&dev->mt76) && (rxd0 & csum_mask) == csum_mask && + 
!(csum_status & (BIT(0) | BIT(2) | BIT(3)))) skb->ip_summed = CHECKSUM_UNNECESSARY; if (rxd2 & MT_RXD2_NORMAL_FCS_ERR) diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c index e32092f67ea1b..a4bcc617c1a34 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c @@ -233,6 +233,7 @@ mt7915_mac_fill_rx(struct mt7915_dev *dev, struct sk_buff *skb) u8 remove_pad, amsdu_info; u8 mode = 0, qos_ctl = 0; struct mt7915_sta *msta = NULL; + u32 csum_status = *(u32 *)skb->cb; bool hdr_trans; u16 hdr_gap; u16 seq_ctrl = 0; @@ -288,7 +289,8 @@ mt7915_mac_fill_rx(struct mt7915_dev *dev, struct sk_buff *skb) if (!sband->channels) return -EINVAL; - if ((rxd0 & csum_mask) == csum_mask) + if ((rxd0 & csum_mask) == csum_mask && + !(csum_status & (BIT(0) | BIT(2) | BIT(3)))) skb->ip_summed = CHECKSUM_UNNECESSARY; if (rxd1 & MT_RXD1_NORMAL_FCS_ERR) diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/mac.c b/drivers/net/wireless/mediatek/mt76/mt7921/mac.c index e4868c492bc04..650ab97ae0524 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/mac.c @@ -230,6 +230,7 @@ mt7921_mac_fill_rx(struct mt7921_dev *dev, struct sk_buff *skb) struct mt76_phy *mphy = &dev->mt76.phy; struct mt7921_phy *phy = &dev->phy; struct ieee80211_supported_band *sband; + u32 csum_status = *(u32 *)skb->cb; u32 rxd0 = le32_to_cpu(rxd[0]); u32 rxd1 = le32_to_cpu(rxd[1]); u32 rxd2 = le32_to_cpu(rxd[2]); @@ -290,7 +291,8 @@ mt7921_mac_fill_rx(struct mt7921_dev *dev, struct sk_buff *skb) if (!sband->channels) return -EINVAL; - if ((rxd0 & csum_mask) == csum_mask) + if (mt76_is_mmio(&dev->mt76) && (rxd0 & csum_mask) == csum_mask && + !(csum_status & (BIT(0) | BIT(2) | BIT(3)))) skb->ip_summed = CHECKSUM_UNNECESSARY; if (rxd1 & MT_RXD1_NORMAL_FCS_ERR) -- GitLab From 95b0f66649bb04c6c9c15e461ecf9522efe9555c Mon Sep 17 00:00:00 2001 From: Jose 
Ignacio Tornos Martinez <jtornosm@redhat.com> Date: Mon, 10 Oct 2022 10:16:11 +0200 Subject: [PATCH 1704/2223] wifi: iwlwifi: mvm: fix double list_add at iwl_mvm_mac_wake_tx_queue (other cases) BUGs like this are still reproducible: [ 31.509616] list_add corruption. prev->next should be next (ffff8f8644242300), but was ffff8f86493fd300. (prev=ffff8f86493fd300). [ 31.521544] ------------[ cut here ]------------ [ 31.526248] kernel BUG at lib/list_debug.c:30! [ 31.530781] invalid opcode: 0000 [#1] PREEMPT SMP PTI [ 31.535831] CPU: 1 PID: 626 Comm: wpa_supplicant Not tainted 6.0.0+ #7 [ 31.542450] Hardware name: Dell Inc. Inspiron 660s/0478VN , BIOS A07 08/24/2012 [ 31.550484] RIP: 0010:__list_add_valid.cold+0x3a/0x5b [ 31.555537] Code: f2 4c 89 c1 48 89 fe 48 c7 c7 28 20 69 89 e8 4c e3 fd ff 0f 0b 48 89 d1 4c 89 c6 4c 89 ca 48 c7 c7 d0 1f 69 89 e8 35 e3 fd ff <0f> 0b 4c 89 c1 48 c7 c7 78 1f 69 89 e8 24 e3 fd ff 0f 0b 48 c7 c7 [ 31.574605] RSP: 0018:ffff9f6f00dc3748 EFLAGS: 00010286 [ 31.579990] RAX: 0000000000000075 RBX: ffff8f8644242080 RCX: 0000000000000000 [ 31.587155] RDX: 0000000000000201 RSI: ffffffff8967862d RDI: 00000000ffffffff [ 31.594482] RBP: ffff8f86493fd2e8 R08: 0000000000000000 R09: 00000000ffffdfff [ 31.601735] R10: ffff9f6f00dc3608 R11: ffffffff89f46128 R12: ffff8f86493fd300 [ 31.608986] R13: ffff8f86493fd300 R14: ffff8f8644242300 R15: ffff8f8643dd3f2c [ 31.616151] FS: 00007f3bb9a707c0(0000) GS:ffff8f865a300000(0000) knlGS:0000000000000000 [ 31.624447] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 31.630286] CR2: 00007fe3647d5600 CR3: 00000001125a6002 CR4: 00000000000606e0 [ 31.637539] Call Trace: [ 31.639936] <TASK> [ 31.642143] iwl_mvm_mac_wake_tx_queue+0x71/0x90 [iwlmvm] [ 31.647569] ieee80211_queue_skb+0x4b6/0x720 [mac80211] ... 
So, it is necessary to extend the applied solution with commit 14a3aacf517a9 ("iwlwifi: mvm: fix double list_add at iwl_mvm_mac_wake_tx_queue") to all other cases where the station queues are invalidated and the related lists are not emptied. Because, otherwise as before, if some new element is added later to the list in iwl_mvm_mac_wake_tx_queue, it can match with the old one and produce the same commented BUG. That is, in order to avoid this problem completely, we must also remove the related lists for the other cases when station queues are invalidated. Fixes: cfbc6c4c5b91c ("iwlwifi: mvm: support mac80211 TXQs model") Reported-by: Petr Stourac <pstourac@redhat.com> Tested-by: Petr Stourac <pstourac@redhat.com> Signed-off-by: Jose Ignacio Tornos Martinez <jtornosm@redhat.com> Signed-off-by: Kalle Valo <kvalo@kernel.org> Link: https://lore.kernel.org/r/20221010081611.145027-1-jtornosm@redhat.com --- drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c index cc92706b3d169..cbd8053a9e35a 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c @@ -384,6 +384,7 @@ static int iwl_mvm_disable_txq(struct iwl_mvm *mvm, struct ieee80211_sta *sta, iwl_mvm_txq_from_tid(sta, tid); mvmtxq->txq_id = IWL_MVM_INVALID_QUEUE; + list_del_init(&mvmtxq->list); } /* Regardless if this is a reserved TXQ for a STA - mark it as false */ @@ -478,6 +479,7 @@ static int iwl_mvm_remove_sta_queue_marking(struct iwl_mvm *mvm, int queue) mvmsta->tid_data[tid].txq_id = IWL_MVM_INVALID_QUEUE; mvmtxq->txq_id = IWL_MVM_INVALID_QUEUE; + list_del_init(&mvmtxq->list); } mvmsta->tfd_queue_msk &= ~BIT(queue); /* Don't use this queue anymore */ -- GitLab From abf93f369419249ca482a8911039fe1c75a94227 Mon Sep 17 00:00:00 2001 From: Kalle Valo <quic_kvalo@quicinc.com> Date: Mon, 10 Oct 2022 19:06:38 +0300 Subject: [PATCH 
1705/2223] wifi: ath11k: mac: fix reading 16 bytes from a region of size 0 warning Linaro reported stringop-overread warnings in ath11k (this is one of many): drivers/net/wireless/ath/ath11k/mac.c:2238:29: error: 'ath11k_peer_assoc_h_he_limit' reading 16 bytes from a region of size 0 [-Werror=stringop-overread] My further investigation showed that these warnings happen on GCC 11.3 but not with GCC 12.2, and only with the kernel config Linaro provided: https://builds.tuxbuild.com/2F4W7nZHNx3T88RB0gaCZ9hBX6c/config I saw the same warnings both with arm64 and x86_64 builds and KASAN seems to be the reason triggering these warnings with GCC 11. Nobody else has reported this so this seems to be quite a rare corner case. I don't know what specific commit started emitting this warning so I can't provide a Fixes tag. The function hasn't been touched for a year. I decided to work around this by converting the pointer to a new array on the stack, and then copying the data to the new array. It's only 16 bytes anyway and this is executed during association, so not in a hot path.
Tested-on: WCN6855 hw2.0 PCI WLAN.HSP.1.1-03125-QCAHSPSWPL_V1_V2_SILICONZ_LITE-3.6510.9 Reported-by: Linux Kernel Functional Testing <lkft@linaro.org> Link: https://lore.kernel.org/all/CA+G9fYsZ_qypa=jHY_dJ=tqX4515+qrV9n2SWXVDHve826nF7Q@mail.gmail.com/ Signed-off-by: Kalle Valo <quic_kvalo@quicinc.com> Signed-off-by: Kalle Valo <kvalo@kernel.org> Link: https://lore.kernel.org/r/20221010160638.20152-1-kvalo@kernel.org --- drivers/net/wireless/ath/ath11k/mac.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 84d956ad4093f..2d1e3fd9b526c 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -2081,7 +2081,7 @@ static void ath11k_peer_assoc_h_he(struct ath11k *ar, struct cfg80211_chan_def def; const struct ieee80211_sta_he_cap *he_cap = &sta->deflink.he_cap; enum nl80211_band band; - u16 *he_mcs_mask; + u16 he_mcs_mask[NL80211_HE_NSS_MAX]; u8 max_nss, he_mcs; u16 he_tx_mcs = 0, v = 0; int i, he_nss, nss_idx; @@ -2098,7 +2098,8 @@ static void ath11k_peer_assoc_h_he(struct ath11k *ar, return; band = def.chan->band; - he_mcs_mask = arvif->bitrate_mask.control[band].he_mcs; + memcpy(he_mcs_mask, arvif->bitrate_mask.control[band].he_mcs, + sizeof(he_mcs_mask)); if (ath11k_peer_assoc_h_he_masked(he_mcs_mask)) return; -- GitLab From 8714f7bcd3c20d36890f43cc6a8e0c3c17b843aa Mon Sep 17 00:00:00 2001 From: Juergen Gross <jgross@suse.com> Date: Mon, 26 Sep 2022 08:23:51 +0200 Subject: [PATCH 1706/2223] xen/pv: add fault recovery control to pmu msr accesses Today pmu_msr_read() and pmu_msr_write() fall back to the safe variants of read/write MSR in case the MSR access isn't emulated via Xen. Allow the caller to select that faults should not be recovered from by passing NULL for the error pointer. Restructure the code to make it more readable. 
Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> Signed-off-by: Juergen Gross <jgross@suse.com> --- arch/x86/xen/pmu.c | 66 ++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 21ecbe754cb2f..0f98cb1077e3c 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -131,6 +131,9 @@ static inline uint32_t get_fam15h_addr(u32 addr) static inline bool is_amd_pmu_msr(unsigned int msr) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return false; + if ((msr >= MSR_F15H_PERF_CTL && msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) || (msr >= MSR_K7_EVNTSEL0 && @@ -144,6 +147,9 @@ static int is_intel_pmu_msr(u32 msr_index, int *type, int *index) { u32 msr_index_pmc; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + switch (msr_index) { case MSR_CORE_PERF_FIXED_CTR_CTRL: case MSR_IA32_DS_AREA: @@ -290,48 +296,52 @@ static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read) return false; } +static bool pmu_msr_chk_emulated(unsigned int msr, uint64_t *val, bool is_read, + bool *emul) +{ + int type, index; + + if (is_amd_pmu_msr(msr)) + *emul = xen_amd_pmu_emulate(msr, val, is_read); + else if (is_intel_pmu_msr(msr, &type, &index)) + *emul = xen_intel_pmu_emulate(msr, val, type, index, is_read); + else + return false; + + return true; +} + bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err) { - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { - if (is_amd_pmu_msr(msr)) { - if (!xen_amd_pmu_emulate(msr, val, 1)) - *val = native_read_msr_safe(msr, err); - return true; - } - } else { - int type, index; + bool emulated; - if (is_intel_pmu_msr(msr, &type, &index)) { - if (!xen_intel_pmu_emulate(msr, val, type, index, 1)) - *val = native_read_msr_safe(msr, err); - return true; - } + if (!pmu_msr_chk_emulated(msr, val, true, &emulated)) + return false; + + if (!emulated) { + *val = err 
? native_read_msr_safe(msr, err) + : native_read_msr(msr); } - return false; + return true; } bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err) { uint64_t val = ((uint64_t)high << 32) | low; + bool emulated; - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { - if (is_amd_pmu_msr(msr)) { - if (!xen_amd_pmu_emulate(msr, &val, 0)) - *err = native_write_msr_safe(msr, low, high); - return true; - } - } else { - int type, index; + if (!pmu_msr_chk_emulated(msr, &val, false, &emulated)) + return false; - if (is_intel_pmu_msr(msr, &type, &index)) { - if (!xen_intel_pmu_emulate(msr, &val, type, index, 0)) - *err = native_write_msr_safe(msr, low, high); - return true; - } + if (!emulated) { + if (err) + *err = native_write_msr_safe(msr, low, high); + else + native_write_msr(msr, low, high); } - return false; + return true; } static unsigned long long xen_amd_read_pmc(int counter) -- GitLab From f90d98bdd06c0f3d1a60462c85324bd61f2a7142 Mon Sep 17 00:00:00 2001 From: Juergen Gross <jgross@suse.com> Date: Wed, 5 Oct 2022 09:42:33 +0200 Subject: [PATCH 1707/2223] xen/pv: fix vendor checks for pmu emulation The CPU vendor checks for pmu emulation are rather limited today, as the assumption seems to be that only Intel and AMD are existing and/or supported vendors. Fix that by handling Centaur and Zhaoxin CPUs the same way as Intel, and Hygon the same way as AMD. While at it fix the return type of is_intel_pmu_msr(). 
Suggested-by: Jan Beulich <jbeulich@suse.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> Signed-off-by: Juergen Gross <jgross@suse.com> --- arch/x86/xen/pmu.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 0f98cb1077e3c..68aff13828728 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -131,7 +131,8 @@ static inline uint32_t get_fam15h_addr(u32 addr) static inline bool is_amd_pmu_msr(unsigned int msr) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) return false; if ((msr >= MSR_F15H_PERF_CTL && @@ -143,11 +144,13 @@ static inline bool is_amd_pmu_msr(unsigned int msr) return false; } -static int is_intel_pmu_msr(u32 msr_index, int *type, int *index) +static bool is_intel_pmu_msr(u32 msr_index, int *type, int *index) { u32 msr_index_pmc; - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) return false; switch (msr_index) { -- GitLab From a1886b915e81439ba045b1431f3319d37ac1b906 Mon Sep 17 00:00:00 2001 From: Juergen Gross <jgross@suse.com> Date: Mon, 26 Sep 2022 12:33:03 +0200 Subject: [PATCH 1708/2223] xen/pv: refactor msr access functions to support safe and unsafe accesses Refactor and rename xen_read_msr_safe() and xen_write_msr_safe() to support both cases of MSR accesses, safe ones and potentially GP-fault generating ones. This will prepare to no longer swallow GPs silently in xen_read_msr() and xen_write_msr(). 
Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> Signed-off-by: Juergen Gross <jgross@suse.com> --- arch/x86/xen/enlighten_pv.c | 75 +++++++++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 19 deletions(-) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 0a5dcadf23b93..8c2acccebfe14 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -916,14 +916,18 @@ static void xen_write_cr4(unsigned long cr4) native_write_cr4(cr4); } -static u64 xen_read_msr_safe(unsigned int msr, int *err) +static u64 xen_do_read_msr(unsigned int msr, int *err) { - u64 val; + u64 val = 0; /* Avoid uninitialized value for safe variant. */ if (pmu_msr_read(msr, &val, err)) return val; - val = native_read_msr_safe(msr, err); + if (err) + val = native_read_msr_safe(msr, err); + else + val = native_read_msr(msr); + switch (msr) { case MSR_IA32_APICBASE: val &= ~X2APIC_ENABLE; @@ -932,23 +936,39 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err) return val; } -static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) +static void set_seg(unsigned int which, unsigned int low, unsigned int high, + int *err) { - int ret; - unsigned int which; - u64 base; + u64 base = ((u64)high << 32) | low; + + if (HYPERVISOR_set_segment_base(which, base) == 0) + return; - ret = 0; + if (err) + *err = -EIO; + else + WARN(1, "Xen set_segment_base(%u, %llx) failed\n", which, base); +} +/* + * Support write_msr_safe() and write_msr() semantics. + * With err == NULL write_msr() semantics are selected. + * Supplying an err pointer requires err to be pre-initialized with 0. 
+ */ +static void xen_do_write_msr(unsigned int msr, unsigned int low, + unsigned int high, int *err) +{ switch (msr) { - case MSR_FS_BASE: which = SEGBASE_FS; goto set; - case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set; - case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set; - - set: - base = ((u64)high << 32) | low; - if (HYPERVISOR_set_segment_base(which, base) != 0) - ret = -EIO; + case MSR_FS_BASE: + set_seg(SEGBASE_FS, low, high, err); + break; + + case MSR_KERNEL_GS_BASE: + set_seg(SEGBASE_GS_USER, low, high, err); + break; + + case MSR_GS_BASE: + set_seg(SEGBASE_GS_KERNEL, low, high, err); break; case MSR_STAR: @@ -964,11 +984,28 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) break; default: - if (!pmu_msr_write(msr, low, high, &ret)) - ret = native_write_msr_safe(msr, low, high); + if (!pmu_msr_write(msr, low, high, err)) { + if (err) + *err = native_write_msr_safe(msr, low, high); + else + native_write_msr(msr, low, high); + } } +} + +static u64 xen_read_msr_safe(unsigned int msr, int *err) +{ + return xen_do_read_msr(msr, err); +} + +static int xen_write_msr_safe(unsigned int msr, unsigned int low, + unsigned int high) +{ + int err = 0; + + xen_do_write_msr(msr, low, high, &err); - return ret; + return err; } static u64 xen_read_msr(unsigned int msr) -- GitLab From 3fac3734c43a2e21fefeb72124d8bd31dff3956f Mon Sep 17 00:00:00 2001 From: Juergen Gross <jgross@suse.com> Date: Mon, 26 Sep 2022 13:16:56 +0200 Subject: [PATCH 1709/2223] xen/pv: support selecting safe/unsafe msr accesses Instead of always doing the safe variants for reading and writing MSRs in Xen PV guests, make the behavior controllable via Kconfig option and a boot parameter. The default will be the current behavior, which is to always use the safe variant. 
Signed-off-by: Juergen Gross <jgross@suse.com> --- .../admin-guide/kernel-parameters.txt | 6 +++++ arch/x86/xen/Kconfig | 9 +++++++ arch/x86/xen/enlighten_pv.c | 24 +++++++++++-------- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 426fa892d311a..1bda9cf18faea 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6836,6 +6836,12 @@ Crash from Xen panic notifier, without executing late panic() code such as dumping handler. + xen_msr_safe= [X86,XEN] + Format: <bool> + Select whether to always use non-faulting (safe) MSR + access functions when running as Xen PV guest. The + default value is controlled by CONFIG_XEN_PV_MSR_SAFE. + xen_nopvspin [X86,XEN] Disables the qspinlock slowpath using Xen PV optimizations. This parameter is obsoleted by "nopvspin" parameter, which diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 85246dd9faa14..9b1ec5d8c99c8 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -92,3 +92,12 @@ config XEN_DOM0 select X86_X2APIC if XEN_PVH && X86_64 help Support running as a Xen Dom0 guest. + +config XEN_PV_MSR_SAFE + bool "Always use safe MSR accesses in PV guests" + default y + depends on XEN_PV + help + Use safe (not faulting) MSR access functions even if the MSR access + should not fault anyway. + The default can be changed by using the "xen_msr_safe" boot parameter. 
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 8c2acccebfe14..0ad3d4bf52b33 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -108,6 +108,16 @@ struct tls_descs { */ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); +static __read_mostly bool xen_msr_safe = IS_ENABLED(CONFIG_XEN_PV_MSR_SAFE); + +static int __init parse_xen_msr_safe(char *str) +{ + if (str) + return strtobool(str, &xen_msr_safe); + return -EINVAL; +} +early_param("xen_msr_safe", parse_xen_msr_safe); + static void __init xen_pv_init_platform(void) { /* PV guests can't operate virtio devices without grants. */ @@ -1010,22 +1020,16 @@ static int xen_write_msr_safe(unsigned int msr, unsigned int low, static u64 xen_read_msr(unsigned int msr) { - /* - * This will silently swallow a #GP from RDMSR. It may be worth - * changing that. - */ int err; - return xen_read_msr_safe(msr, &err); + return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL); } static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) { - /* - * This will silently swallow a #GP from WRMSR. It may be worth - * changing that. - */ - xen_write_msr_safe(msr, low, high); + int err; + + xen_do_write_msr(msr, low, high, xen_msr_safe ? &err : NULL); } /* This is called once we have the cpu_possible_mask */ -- GitLab From b148766e2b8b7b61c9aef53aefedae33f637a1e7 Mon Sep 17 00:00:00 2001 From: Helge Deller <deller@gmx.de> Date: Wed, 28 Sep 2022 23:31:20 +0200 Subject: [PATCH 1710/2223] parisc: Reduce kernel size by packing alternative tables The values stored in the length and condition fields of the alternative tables fit into 16 bits, so we can save 4 bytes per alternative table entry. Since a typical 32-bit kernel has more than 3000 entries this saves > 12k of storage on disc. 
bloat-o-meter shows a reduction of -0.01% by this change: Total: Before=10196505, After=10195529, chg -0.01% $ ls -la vmlinux vmlinux.before -rwxr-xr-x 14437324 vmlinux -rwxr-xr-x 14449512 vmlinux.before Signed-off-by: Helge Deller <deller@gmx.de> --- arch/parisc/include/asm/alternative.h | 21 ++++++++++++--------- arch/parisc/kernel/alternative.c | 7 ++++--- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/arch/parisc/include/asm/alternative.h b/arch/parisc/include/asm/alternative.h index 0ec54f43d6d25..1ed45fd085d3b 100644 --- a/arch/parisc/include/asm/alternative.h +++ b/arch/parisc/include/asm/alternative.h @@ -22,10 +22,10 @@ struct alt_instr { s32 orig_offset; /* offset to original instructions */ - s32 len; /* end of original instructions */ - u32 cond; /* see ALT_COND_XXX */ + s16 len; /* end of original instructions */ + u16 cond; /* see ALT_COND_XXX */ u32 replacement; /* replacement instruction or code */ -}; +} __packed; void set_kernel_text_rw(int enable_read_write); void apply_alternatives_all(void); @@ -35,8 +35,9 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end, /* Alternative SMP implementation. */ #define ALTERNATIVE(cond, replacement) "!0:" \ ".section .altinstructions, \"aw\" !" \ - ".word (0b-4-.), 1, " __stringify(cond) "," \ - __stringify(replacement) " !" \ + ".word (0b-4-.) !" \ + ".hword 1, " __stringify(cond) " !" \ + ".word " __stringify(replacement) " !" \ ".previous" #else @@ -44,15 +45,17 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end, /* to replace one single instructions by a new instruction */ #define ALTERNATIVE(from, to, cond, replacement)\ .section .altinstructions, "aw" ! \ - .word (from - .), (to - from)/4 ! \ - .word cond, replacement ! \ + .word (from - .) ! \ + .hword (to - from)/4, cond ! \ + .word replacement ! 
\ .previous /* to replace multiple instructions by new code */ #define ALTERNATIVE_CODE(from, num_instructions, cond, new_instr_ptr)\ .section .altinstructions, "aw" ! \ - .word (from - .), -num_instructions ! \ - .word cond, (new_instr_ptr - .) ! \ + .word (from - .) ! \ + .hword -num_instructions, cond ! \ + .word (new_instr_ptr - .) ! \ .previous #endif /* __ASSEMBLY__ */ diff --git a/arch/parisc/kernel/alternative.c b/arch/parisc/kernel/alternative.c index daa1e9047275b..66f5672c70bd4 100644 --- a/arch/parisc/kernel/alternative.c +++ b/arch/parisc/kernel/alternative.c @@ -26,7 +26,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, struct alt_instr *entry; int index = 0, applied = 0; int num_cpus = num_online_cpus(); - u32 cond_check; + u16 cond_check; cond_check = ALT_COND_ALWAYS | ((num_cpus == 1) ? ALT_COND_NO_SMP : 0) | @@ -45,8 +45,9 @@ void __init_or_module apply_alternatives(struct alt_instr *start, for (entry = start; entry < end; entry++, index++) { - u32 *from, cond, replacement; - s32 len; + u32 *from, replacement; + u16 cond; + s16 len; from = (u32 *)((ulong)&entry->orig_offset + entry->orig_offset); len = entry->len; -- GitLab From 027c3d345e2a1ea61d6e4506a250eb392e6e7b18 Mon Sep 17 00:00:00 2001 From: Helge Deller <deller@gmx.de> Date: Sat, 1 Oct 2022 00:32:07 +0200 Subject: [PATCH 1711/2223] parisc: Convert PDC console to an early console Rewrite the PDC console to become an early console. Beside the fact that now boot information is visible until another (text- or graphics) console takes over, this benefits as well machines with a yet-unsupported STI console and kgdb. 
Signed-off-by: Helge Deller <deller@gmx.de> --- arch/parisc/include/asm/pdc.h | 3 - arch/parisc/kernel/pdc_cons.c | 240 ++++------------------------------ arch/parisc/kernel/setup.c | 6 +- arch/parisc/kernel/traps.c | 15 +-- drivers/tty/serial/Kconfig | 15 --- lib/Kconfig.kgdb | 2 +- 6 files changed, 32 insertions(+), 249 deletions(-) diff --git a/arch/parisc/include/asm/pdc.h b/arch/parisc/include/asm/pdc.h index b643092d4b985..fcbcf9a96c111 100644 --- a/arch/parisc/include/asm/pdc.h +++ b/arch/parisc/include/asm/pdc.h @@ -19,9 +19,6 @@ extern unsigned long parisc_pat_pdc_cap; /* PDC capabilities (PAT) */ #define PDC_TYPE_SYSTEM_MAP 1 /* 32-bit, but supports PDC_SYSTEM_MAP */ #define PDC_TYPE_SNAKE 2 /* Doesn't support SYSTEM_MAP */ -void pdc_console_init(void); /* in pdc_console.c */ -void pdc_console_restart(void); - void setup_pdc(void); /* in inventory.c */ /* wrapper-functions from pdc.c */ diff --git a/arch/parisc/kernel/pdc_cons.c b/arch/parisc/kernel/pdc_cons.c index 2661cdd256ae7..7d0989f523d03 100644 --- a/arch/parisc/kernel/pdc_cons.c +++ b/arch/parisc/kernel/pdc_cons.c @@ -1,46 +1,18 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * PDC Console support - ie use firmware to dump text via boot console + * PDC early console support - use PDC firmware to dump text via boot console * - * Copyright (C) 1999-2003 Matthew Wilcox <willy at parisc-linux.org> - * Copyright (C) 2000 Martin K Petersen <mkp at mkp.net> - * Copyright (C) 2000 John Marvin <jsm at parisc-linux.org> - * Copyright (C) 2000-2003 Paul Bame <bame at parisc-linux.org> - * Copyright (C) 2000 Philipp Rumpf <prumpf with tux.org> - * Copyright (C) 2000 Michael Ang <mang with subcarrier.org> - * Copyright (C) 2000 Grant Grundler <grundler with parisc-linux.org> - * Copyright (C) 2001-2002 Ryan Bradetich <rbrad at parisc-linux.org> - * Copyright (C) 2001 Helge Deller <deller at parisc-linux.org> - * Copyright (C) 2001 Thomas Bogendoerfer <tsbogend at parisc-linux.org> - * Copyright (C) 2002 
Randolph Chung <tausq with parisc-linux.org> - * Copyright (C) 2010 Guy Martin <gmsoft at tuxicoman.be> + * Copyright (C) 2001-2022 Helge Deller <deller@gmx.de> */ -/* - * The PDC console is a simple console, which can be used for debugging - * boot related problems on HP PA-RISC machines. It is also useful when no - * other console works. - * - * This code uses the ROM (=PDC) based functions to read and write characters - * from and to PDC's boot path. - */ - -/* Define EARLY_BOOTUP_DEBUG to debug kernel related boot problems. - * On production kernels EARLY_BOOTUP_DEBUG should be undefined. */ -#define EARLY_BOOTUP_DEBUG - - -#include <linux/kernel.h> #include <linux/console.h> -#include <linux/string.h> #include <linux/init.h> -#include <linux/major.h> -#include <linux/tty.h> +#include <linux/serial_core.h> +#include <linux/kgdb.h> #include <asm/page.h> /* for PAGE0 */ #include <asm/pdc.h> /* for iodc_call() proto and friends */ static DEFINE_SPINLOCK(pdc_console_lock); -static struct console pdc_cons; static void pdc_console_write(struct console *co, const char *s, unsigned count) { @@ -54,7 +26,8 @@ static void pdc_console_write(struct console *co, const char *s, unsigned count) spin_unlock_irqrestore(&pdc_console_lock, flags); } -int pdc_console_poll_key(struct console *co) +#ifdef CONFIG_KGDB +static int kgdb_pdc_read_char(void) { int c; unsigned long flags; @@ -63,201 +36,40 @@ int pdc_console_poll_key(struct console *co) c = pdc_iodc_getc(); spin_unlock_irqrestore(&pdc_console_lock, flags); - return c; -} - -static int pdc_console_setup(struct console *co, char *options) -{ - return 0; -} - -#if defined(CONFIG_PDC_CONSOLE) -#include <linux/vt_kern.h> -#include <linux/tty_flip.h> - -#define PDC_CONS_POLL_DELAY (30 * HZ / 1000) - -static void pdc_console_poll(struct timer_list *unused); -static DEFINE_TIMER(pdc_console_timer, pdc_console_poll); -static struct tty_port tty_port; - -static int pdc_console_tty_open(struct tty_struct *tty, struct file *filp) -{ 
- tty_port_tty_set(&tty_port, tty); - mod_timer(&pdc_console_timer, jiffies + PDC_CONS_POLL_DELAY); - - return 0; + return (c <= 0) ? NO_POLL_CHAR : c; } -static void pdc_console_tty_close(struct tty_struct *tty, struct file *filp) +static void kgdb_pdc_write_char(u8 chr) { - if (tty->count == 1) { - del_timer_sync(&pdc_console_timer); - tty_port_tty_set(&tty_port, NULL); - } + if (PAGE0->mem_cons.cl_class != CL_DUPLEX) + pdc_console_write(NULL, &chr, 1); } -static int pdc_console_tty_write(struct tty_struct *tty, const unsigned char *buf, int count) -{ - pdc_console_write(NULL, buf, count); - return count; -} - -static unsigned int pdc_console_tty_write_room(struct tty_struct *tty) -{ - return 32768; /* no limit, no buffer used */ -} - -static const struct tty_operations pdc_console_tty_ops = { - .open = pdc_console_tty_open, - .close = pdc_console_tty_close, - .write = pdc_console_tty_write, - .write_room = pdc_console_tty_write_room, +static struct kgdb_io kgdb_pdc_io_ops = { + .name = "kgdb_pdc", + .read_char = kgdb_pdc_read_char, + .write_char = kgdb_pdc_write_char, }; - -static void pdc_console_poll(struct timer_list *unused) -{ - int data, count = 0; - - while (1) { - data = pdc_console_poll_key(NULL); - if (data == -1) - break; - tty_insert_flip_char(&tty_port, data & 0xFF, TTY_NORMAL); - count ++; - } - - if (count) - tty_flip_buffer_push(&tty_port); - - if (pdc_cons.flags & CON_ENABLED) - mod_timer(&pdc_console_timer, jiffies + PDC_CONS_POLL_DELAY); -} - -static struct tty_driver *pdc_console_tty_driver; - -static int __init pdc_console_tty_driver_init(void) -{ - struct tty_driver *driver; - int err; - - /* Check if the console driver is still registered. - * It is unregistered if the pdc console was not selected as the - * primary console. 
*/ - - struct console *tmp; - - console_lock(); - for_each_console(tmp) - if (tmp == &pdc_cons) - break; - console_unlock(); - - if (!tmp) { - printk(KERN_INFO "PDC console driver not registered anymore, not creating %s\n", pdc_cons.name); - return -ENODEV; - } - - printk(KERN_INFO "The PDC console driver is still registered, removing CON_BOOT flag\n"); - pdc_cons.flags &= ~CON_BOOT; - - driver = tty_alloc_driver(1, TTY_DRIVER_REAL_RAW | - TTY_DRIVER_RESET_TERMIOS); - if (IS_ERR(driver)) - return PTR_ERR(driver); - - tty_port_init(&tty_port); - - driver->driver_name = "pdc_cons"; - driver->name = "ttyB"; - driver->major = MUX_MAJOR; - driver->minor_start = 0; - driver->type = TTY_DRIVER_TYPE_SYSTEM; - driver->init_termios = tty_std_termios; - tty_set_operations(driver, &pdc_console_tty_ops); - tty_port_link_device(&tty_port, driver, 0); - - err = tty_register_driver(driver); - if (err) { - printk(KERN_ERR "Unable to register the PDC console TTY driver\n"); - tty_port_destroy(&tty_port); - tty_driver_kref_put(driver); - return err; - } - - pdc_console_tty_driver = driver; - - return 0; -} -device_initcall(pdc_console_tty_driver_init); - -static struct tty_driver * pdc_console_device (struct console *c, int *index) -{ - *index = c->index; - return pdc_console_tty_driver; -} -#else -#define pdc_console_device NULL #endif -static struct console pdc_cons = { - .name = "ttyB", - .write = pdc_console_write, - .device = pdc_console_device, - .setup = pdc_console_setup, - .flags = CON_BOOT | CON_PRINTBUFFER, - .index = -1, -}; - -static int pdc_console_initialized; - -static void pdc_console_init_force(void) +static int __init pdc_earlycon_setup(struct earlycon_device *device, + const char *opt) { - if (pdc_console_initialized) - return; - ++pdc_console_initialized; - + struct console *earlycon_console; + /* If the console is duplex then copy the COUT parameters to CIN. 
*/ if (PAGE0->mem_cons.cl_class == CL_DUPLEX) memcpy(&PAGE0->mem_kbd, &PAGE0->mem_cons, sizeof(PAGE0->mem_cons)); - /* register the pdc console */ - register_console(&pdc_cons); -} + earlycon_console = device->con; + earlycon_console->write = pdc_console_write; + device->port.iotype = UPIO_MEM32BE; -void __init pdc_console_init(void) -{ -#if defined(EARLY_BOOTUP_DEBUG) || defined(CONFIG_PDC_CONSOLE) - pdc_console_init_force(); +#ifdef CONFIG_KGDB + kgdb_register_io_module(&kgdb_pdc_io_ops); #endif -#ifdef EARLY_BOOTUP_DEBUG - printk(KERN_INFO "Initialized PDC Console for debugging.\n"); -#endif -} - - -/* - * Used for emergencies. Currently only used if an HPMC occurs. If an - * HPMC occurs, it is possible that the current console may not be - * properly initialised after the PDC IO reset. This routine unregisters - * all of the current consoles, reinitializes the pdc console and - * registers it. - */ - -void pdc_console_restart(void) -{ - struct console *console; - - if (pdc_console_initialized) - return; - /* If we've already seen the output, don't bother to print it again */ - if (console_drivers != NULL) - pdc_cons.flags &= ~CON_PRINTBUFFER; - - while ((console = console_drivers) != NULL) - unregister_console(console_drivers); - - /* force registering the pdc console */ - pdc_console_init_force(); + return 0; } + +EARLYCON_DECLARE(pdc, pdc_earlycon_setup); diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index f005ddedb50e4..375f38d6e1a4d 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -70,6 +70,10 @@ void __init setup_cmdline(char **cmdline_p) strlcat(p, "tty0", COMMAND_LINE_SIZE); } + /* default to use early console */ + if (!strstr(p, "earlycon")) + strlcat(p, " earlycon=pdc", COMMAND_LINE_SIZE); + #ifdef CONFIG_BLK_DEV_INITRD if (boot_args[2] != 0) /* did palo pass us a ramdisk? 
*/ { @@ -139,8 +143,6 @@ void __init setup_arch(char **cmdline_p) if (__pa((unsigned long) &_end) >= KERNEL_INITIAL_SIZE) panic("KERNEL_INITIAL_ORDER too small!"); - pdc_console_init(); - #ifdef CONFIG_64BIT if(parisc_narrow_firmware) { printk(KERN_INFO "Kernel is using PDC in 32-bit mode.\n"); diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index b78f1b9d45c18..f9696fbf646c4 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -239,13 +239,6 @@ void die_if_kernel(char *str, struct pt_regs *regs, long err) /* unlock the pdc lock if necessary */ pdc_emergency_unlock(); - /* maybe the kernel hasn't booted very far yet and hasn't been able - * to initialize the serial or STI console. In that case we should - * re-enable the pdc console, so that the user will be able to - * identify the problem. */ - if (!console_drivers) - pdc_console_restart(); - if (err) printk(KERN_CRIT "%s (pid %d): %s (code %ld)\n", current->comm, task_pid_nr(current), str, err); @@ -429,10 +422,6 @@ void parisc_terminate(char *msg, struct pt_regs *regs, int code, unsigned long o /* unlock the pdc lock if necessary */ pdc_emergency_unlock(); - /* restart pdc console if necessary */ - if (!console_drivers) - pdc_console_restart(); - /* Not all paths will gutter the processor... 
*/ switch(code){ @@ -482,9 +471,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs) unsigned long fault_space = 0; int si_code; - if (code == 1) - pdc_console_restart(); /* switch back to pdc if HPMC */ - else if (!irqs_disabled_flags(regs->gr[0])) + if (!irqs_disabled_flags(regs->gr[0])) local_irq_enable(); /* Security check: diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 877173907c536..898728ab2c18e 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -602,21 +602,6 @@ config SERIAL_MUX_CONSOLE select SERIAL_CORE_CONSOLE default y -config PDC_CONSOLE - bool "PDC software console support" - depends on PARISC && !SERIAL_MUX && VT - help - Saying Y here will enable the software based PDC console to be - used as the system console. This is useful for machines in - which the hardware based console has not been written yet. The - following steps must be completed to use the PDC console: - - 1. create the device entry (mknod /dev/ttyB0 c 11 0) - 2. Edit the /etc/inittab to start a getty listening on /dev/ttyB0 - 3. Add device ttyB0 to /etc/securetty (if you want to log on as - root on this console.) - 4. 
Change the kernel command console parameter to: console=ttyB0 - config SERIAL_SUNSAB tristate "Sun Siemens SAB82532 serial support" depends on SPARC && PCI diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb index 05dae05b6cc9e..3b9a440084332 100644 --- a/lib/Kconfig.kgdb +++ b/lib/Kconfig.kgdb @@ -121,7 +121,7 @@ config KDB_DEFAULT_ENABLE config KDB_KEYBOARD bool "KGDB_KDB: keyboard as input device" - depends on VT && KGDB_KDB + depends on VT && KGDB_KDB && !PARISC default n help KDB can use a PS/2 type keyboard for an input device -- GitLab From 9971a741c5f44fd72e664c35be9bc6fedb8a3498 Mon Sep 17 00:00:00 2001 From: Boris Burkov <boris@bur.io> Date: Tue, 27 Sep 2022 09:30:39 -0700 Subject: [PATCH 1712/2223] btrfs: send: allow protocol version 3 with CONFIG_BTRFS_DEBUG We haven't finalized send stream v3 yet, so gate the send stream version behind CONFIG_BTRFS_DEBUG as we want some way to test it. The original verity send did not check the protocol version, so add that actual protection as well. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Boris Burkov <boris@bur.io> Signed-off-by: David Sterba <dsterba@suse.com> --- fs/btrfs/send.c | 2 +- fs/btrfs/send.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4ef4167072b89..1783476662358 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6469,7 +6469,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) if (ret < 0) goto out; } - if (sctx->cur_inode_needs_verity) { + if (sctx->proto >= 3 && sctx->cur_inode_needs_verity) { ret = process_verity(sctx); if (ret < 0) goto out; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 0a4537775e0c3..f7585cfa7e52b 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -10,7 +10,12 @@ #include <linux/types.h> #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" +/* Conditional support for the upcoming protocol version. 
*/ +#ifdef CONFIG_BTRFS_DEBUG +#define BTRFS_SEND_STREAM_VERSION 3 +#else #define BTRFS_SEND_STREAM_VERSION 2 +#endif /* * In send stream v1, no command is larger than 64K. In send stream v2, no limit -- GitLab From c86eab81a23f368d08efd3df96a95f3d0b471f85 Mon Sep 17 00:00:00 2001 From: David Sterba <dsterba@suse.com> Date: Fri, 7 Oct 2022 17:10:02 +0200 Subject: [PATCH 1713/2223] btrfs: send: update command for protocol version check For a protocol and command compatibility we have a helper that hasn't been updated for v3 yet. We use it for verity so update where necessary. Fixes: 38622010a6de ("btrfs: send: add support for fs-verity") Signed-off-by: David Sterba <dsterba@suse.com> --- fs/btrfs/send.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1783476662358..ec6e1752af2ca 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -348,6 +348,7 @@ static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) switch (sctx->proto) { case 1: return cmd <= BTRFS_SEND_C_MAX_V1; case 2: return cmd <= BTRFS_SEND_C_MAX_V2; + case 3: return cmd <= BTRFS_SEND_C_MAX_V3; default: return false; } } @@ -6469,7 +6470,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) if (ret < 0) goto out; } - if (sctx->proto >= 3 && sctx->cur_inode_needs_verity) { + + if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY) + && sctx->cur_inode_needs_verity) { ret = process_verity(sctx); if (ret < 0) goto out; -- GitLab From 9e769bd7e5db5e3bd76e7c67004c261f7fcaa8f1 Mon Sep 17 00:00:00 2001 From: Josef Bacik <josef@toxicpanda.com> Date: Fri, 30 Sep 2022 16:45:08 -0400 Subject: [PATCH 1714/2223] btrfs: unlock locked extent area if we have contention In production we hit the following deadlock task 1 task 2 task 3 ------ ------ ------ fiemap(file) falloc(file) fsync(file) write(0, 1MiB) btrfs_commit_transaction() wait_on(!pending_ordered) lock(512MiB, 1GiB) start_transaction wait_on_transaction lock(0, 1GiB) 
wait_extent_bit(512MiB) task 4 ------ finish_ordered_extent(0, 1MiB) lock(0, 1MiB) **DEADLOCK** This occurs because when task 1 does its lock, it locks everything from 0-512MiB, and then waits for the 512MiB chunk to unlock. task 2 will never unlock because it's waiting on the transaction commit to happen, the transaction commit is waiting for the outstanding ordered extents, and then the ordered extent thread is blocked waiting on the 0-1MiB range to unlock. To fix this we have to clear anything we've locked so far, wait for the extent_state that we contended on, and then try to re-lock the entire range again. CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: David Sterba <dsterba@suse.com> --- fs/btrfs/extent-io-tree.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 618275af19c49..83cb0378096f2 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1641,16 +1641,17 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, int err; u64 failed_start; - while (1) { + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + cached_state, NULL, GFP_NOFS); + while (err == -EEXIST) { + if (failed_start != start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, cached_state); + + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, cached_state, NULL, GFP_NOFS); - if (err == -EEXIST) { - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); - start = failed_start; - } else - break; - WARN_ON(start > end); } return err; } -- GitLab From 295a53ccc4ca8383f6d107534b466b91aa013f79 Mon Sep 17 00:00:00 2001 From: David Sterba <dsterba@suse.com> Date: Tue, 11 Oct 2022 11:25:33 +0200 Subject: [PATCH 1715/2223] btrfs: delete stale comments after merge
conflict resolution There are two comments in btrfs_cache_block_group that I left when resolving conflict between commits ced8ecf026fd8 "btrfs: fix space cache corruption and potential double allocations" and 527c490f44f6f "btrfs: delete btrfs_wait_space_cache_v1_finished". The former reworked the caching logic to wait until the caching ends in btrfs_cache_block_group while the latter only open coded the waiting. Both removed btrfs_wait_space_cache_v1_finished, the correct code is with the waiting and returning error. Thus the conflict resolution was OK. Signed-off-by: David Sterba <dsterba@suse.com> --- fs/btrfs/block-group.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 32c415cfbdfe7..deebc8ddbd932 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -774,10 +774,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); out: - /* REVIEW */ if (wait && caching_ctl) ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); - /* wait_event(caching_ctl->wait, space_cache_v1_done(cache)); */ if (caching_ctl) btrfs_put_caching_control(caching_ctl); -- GitLab From 4fc7b57228243d09c0d878873bf24fa64a90fa01 Mon Sep 17 00:00:00 2001 From: Filipe Manana <fdmanana@suse.com> Date: Tue, 11 Oct 2022 13:16:51 +0100 Subject: [PATCH 1716/2223] btrfs: fix processing of delayed data refs during backref walking When processing delayed data references during backref walking and we are using a share context (we are being called through fiemap), whenever we find a delayed data reference for an inode different from the one we are interested in, then we immediately exit and consider the data extent as shared. This is wrong, because: 1) This might be a DROP reference that will cancel out a reference in the extent tree; 2) Even if it's an ADD reference, it may be followed by a DROP reference that cancels it out. 
In either case we should not exit immediately. Fix this by never exiting when we find a delayed data reference for another inode - instead add the reference and if it does not cancel out other delayed reference, we will exit early when we call extent_is_shared() after processing all delayed references. If we find a drop reference, then signal the code that processes references from the extent tree (add_inline_refs() and add_keyed_refs()) to not exit immediately if it finds there a reference for another inode, since we have delayed drop references that may cancel it out. In this later case we exit once we don't have references in the rb trees that cancel out each other and have two references for different inodes. Example reproducer for case 1): $ cat test-1.sh #!/bin/bash DEV=/dev/sdj MNT=/mnt/sdj mkfs.btrfs -f $DEV mount $DEV $MNT xfs_io -f -c "pwrite 0 64K" $MNT/foo cp --reflink=always $MNT/foo $MNT/bar echo echo "fiemap after cloning:" xfs_io -c "fiemap -v" $MNT/foo rm -f $MNT/bar echo echo "fiemap after removing file bar:" xfs_io -c "fiemap -v" $MNT/foo umount $MNT Running it before this patch, the extent is still listed as shared, it has the flag 0x2000 (FIEMAP_EXTENT_SHARED) set: $ ./test-1.sh fiemap after cloning: /mnt/sdj/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..127]: 26624..26751 128 0x2001 fiemap after removing file bar: /mnt/sdj/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..127]: 26624..26751 128 0x2001 Example reproducer for case 2): $ cat test-2.sh #!/bin/bash DEV=/dev/sdj MNT=/mnt/sdj mkfs.btrfs -f $DEV mount $DEV $MNT xfs_io -f -c "pwrite 0 64K" $MNT/foo cp --reflink=always $MNT/foo $MNT/bar # Flush delayed references to the extent tree and commit current # transaction. 
sync echo echo "fiemap after cloning:" xfs_io -c "fiemap -v" $MNT/foo rm -f $MNT/bar echo echo "fiemap after removing file bar:" xfs_io -c "fiemap -v" $MNT/foo umount $MNT Running it before this patch, the extent is still listed as shared, it has the flag 0x2000 (FIEMAP_EXTENT_SHARED) set: $ ./test-2.sh fiemap after cloning: /mnt/sdj/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..127]: 26624..26751 128 0x2001 fiemap after removing file bar: /mnt/sdj/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..127]: 26624..26751 128 0x2001 After this patch, after deleting bar in both tests, the extent is not reported with the 0x2000 flag anymore, it gets only the flag 0x1 (which is FIEMAP_EXTENT_LAST): $ ./test-1.sh fiemap after cloning: /mnt/sdj/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..127]: 26624..26751 128 0x2001 fiemap after removing file bar: /mnt/sdj/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..127]: 26624..26751 128 0x1 $ ./test-2.sh fiemap after cloning: /mnt/sdj/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..127]: 26624..26751 128 0x2001 fiemap after removing file bar: /mnt/sdj/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..127]: 26624..26751 128 0x1 These tests will later be converted to a test case for fstests. 
Fixes: dc046b10c8b7d4 ("Btrfs: make fiemap not blow when you have lots of snapshots") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> --- fs/btrfs/backref.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 3c0c1f626c75d..cf47dabb786f0 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -138,6 +138,7 @@ struct share_check { u64 root_objectid; u64 inum; int share_count; + bool have_delayed_delete_refs; }; static inline int extent_is_shared(struct share_check *sc) @@ -884,13 +885,22 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, key.offset = ref->offset; /* - * Found a inum that doesn't match our known inum, we - * know it's shared. + * If we have a share check context and a reference for + * another inode, we can't exit immediately. This is + * because even if this is a BTRFS_ADD_DELAYED_REF + * reference we may find next a BTRFS_DROP_DELAYED_REF + * which cancels out this ADD reference. + * + * If this is a DROP reference and there was no previous + * ADD reference, then we need to signal that when we + * process references from the extent tree (through + * add_inline_refs() and add_keyed_refs()), we should + * not exit early if we find a reference for another + * inode, because one of the delayed DROP references + * may cancel that reference in the extent tree. 
*/ - if (sc && sc->inum && ref->objectid != sc->inum) { - ret = BACKREF_FOUND_SHARED; - goto out; - } + if (sc && count < 0) + sc->have_delayed_delete_refs = true; ret = add_indirect_ref(fs_info, preftrees, ref->root, &key, 0, node->bytenr, count, sc, @@ -920,7 +930,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, } if (!ret) ret = extent_is_shared(sc); -out: + spin_unlock(&head->lock); return ret; } @@ -1023,7 +1033,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum) { + if (sc && sc->inum && key.objectid != sc->inum && + !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } @@ -1033,6 +1044,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, ret = add_indirect_ref(fs_info, preftrees, root, &key, 0, bytenr, count, sc, GFP_NOFS); + break; } default: @@ -1122,7 +1134,8 @@ static int add_keyed_refs(struct btrfs_root *extent_root, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum) { + if (sc && sc->inum && key.objectid != sc->inum && + !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } @@ -1661,6 +1674,7 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, .root_objectid = root->root_key.objectid, .inum = inum, .share_count = 0, + .have_delayed_delete_refs = false, }; int level; @@ -1726,6 +1740,7 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, break; } shared.share_count = 0; + shared.have_delayed_delete_refs = false; cond_resched(); } -- GitLab From 943553ef9b51db303ab2b955c1025261abfdf6fb Mon Sep 17 00:00:00 2001 From: Filipe Manana <fdmanana@suse.com> Date: Tue, 11 Oct 2022 13:16:52 +0100 Subject: [PATCH 1717/2223] btrfs: fix processing of delayed tree block refs during backref walking 
During backref walking, when processing a delayed reference with a type of BTRFS_TREE_BLOCK_REF_KEY, we have two bugs there: 1) We are accessing the delayed references extent_op, and its key, without the protection of the delayed ref head's lock; 2) If there's no extent op for the delayed ref head, we end up with an uninitialized key in the stack, variable 'tmp_op_key', and then pass it to add_indirect_ref(), which adds the reference to the indirect refs rb tree. This is wrong, because indirect references should have a NULL key when we don't have access to the key, and in that case they should be added to the indirect_missing_keys rb tree and not to the indirect rb tree. This means that if we have a BTRFS_TREE_BLOCK_REF_KEY delayed ref resulting from freeing an extent buffer, therefore with a count of -1, it will not cancel out the corresponding reference we have in the extent tree (with a count of 1), since both references end up in different rb trees. When using fiemap, where we often need to check if extents are shared through shared subtrees resulting from snapshots, it means we can incorrectly report an extent as shared when it's no longer shared. However this is temporary because after the transaction is committed the extent is no longer reported as shared, as running the delayed reference results in deleting the tree block reference from the extent tree. Outside the fiemap context, the result is unpredictable, as the key was not initialized but it's used when navigating the rb trees to insert and search for references (prelim_ref_compare()), and we expect all references in the indirect rb tree to have valid keys. The following reproducer triggers the second bug: $ cat test.sh #!/bin/bash DEV=/dev/sdj MNT=/mnt/sdj mkfs.btrfs -f $DEV mount -o compress $DEV $MNT # With a compressed 128M file we get a tree height of 2 (level 1 root). xfs_io -f -c "pwrite -b 1M 0 128M" $MNT/foo btrfs subvolume snapshot $MNT $MNT/snap # Fiemap should output 0x2008 in the flags column.
# 0x2000 means shared extent # 0x8 means encoded extent (because it's compressed) echo echo "fiemap after snapshot, range [120M, 120M + 128K):" xfs_io -c "fiemap -v 120M 128K" $MNT/foo echo # Overwrite one extent and fsync to flush delalloc and COW a new path # in the snapshot's tree. # # After this we have a BTRFS_DROP_DELAYED_REF delayed ref of type # BTRFS_TREE_BLOCK_REF_KEY with a count of -1 for every COWed extent # buffer in the path. # # In the extent tree we have inline references of type # BTRFS_TREE_BLOCK_REF_KEY, with a count of 1, for the same extent # buffers, so they should cancel each other, and the extent buffers in # the fs tree should no longer be considered as shared. # echo "Overwriting file range [120M, 120M + 128K)..." xfs_io -c "pwrite -b 128K 120M 128K" $MNT/snap/foo xfs_io -c "fsync" $MNT/snap/foo # Fiemap should output 0x8 in the flags column. The extent in the range # [120M, 120M + 128K) is no longer shared, it's now exclusive to the fs # tree. echo echo "fiemap after overwrite range [120M, 120M + 128K):" xfs_io -c "fiemap -v 120M 128K" $MNT/foo echo umount $MNT Running it before this patch: $ ./test.sh (...) wrote 134217728/134217728 bytes at offset 0 128 MiB, 128 ops; 0.1152 sec (1.085 GiB/sec and 1110.5809 ops/sec) Create a snapshot of '/mnt/sdj' in '/mnt/sdj/snap' fiemap after snapshot, range [120M, 120M + 128K): /mnt/sdj/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [245760..246015]: 34304..34559 256 0x2008 Overwriting file range [120M, 120M + 128K)... 
wrote 131072/131072 bytes at offset 125829120 128 KiB, 1 ops; 0.0001 sec (683.060 MiB/sec and 5464.4809 ops/sec) fiemap after overwrite range [120M, 120M + 128K): /mnt/sdj/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [245760..246015]: 34304..34559 256 0x2008 The extent in the range [120M, 120M + 128K) is still reported as shared (0x2000 bit set) after overwriting that range and flushing delalloc, which is not correct - an entire path was COWed in the snapshot's tree and the extent is now only referenced by the original fs tree. Running it after this patch: $ ./test.sh (...) wrote 134217728/134217728 bytes at offset 0 128 MiB, 128 ops; 0.1198 sec (1.043 GiB/sec and 1068.2067 ops/sec) Create a snapshot of '/mnt/sdj' in '/mnt/sdj/snap' fiemap after snapshot, range [120M, 120M + 128K): /mnt/sdj/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [245760..246015]: 34304..34559 256 0x2008 Overwriting file range [120M, 120M + 128K)... wrote 131072/131072 bytes at offset 125829120 128 KiB, 1 ops; 0.0001 sec (694.444 MiB/sec and 5555.5556 ops/sec) fiemap after overwrite range [120M, 120M + 128K): /mnt/sdj/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [245760..246015]: 34304..34559 256 0x8 Now the extent is not reported as shared anymore. So fix this by passing a NULL key pointer to add_indirect_ref() when processing a delayed reference for a tree block if there's no extent op for our delayed ref head with a defined key. Also access the extent op only after locking the delayed ref head's lock. The reproducer will be converted later to a test case for fstests. 
Fixes: 86d5f994425252 ("btrfs: convert prelimary reference tracking to use rbtrees") Fixes: a6dbceafb915e8 ("btrfs: Remove unused op_key var from add_delayed_refs") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> --- fs/btrfs/backref.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index cf47dabb786f0..4e29ccb234c0d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -821,16 +821,11 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct preftrees *preftrees, struct share_check *sc) { struct btrfs_delayed_ref_node *node; - struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct btrfs_key key; - struct btrfs_key tmp_op_key; struct rb_node *n; int count; int ret = 0; - if (extent_op && extent_op->update_key) - btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key); - spin_lock(&head->lock); for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) { node = rb_entry(n, struct btrfs_delayed_ref_node, @@ -856,10 +851,16 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, case BTRFS_TREE_BLOCK_REF_KEY: { /* NORMAL INDIRECT METADATA backref */ struct btrfs_delayed_tree_ref *ref; + struct btrfs_key *key_ptr = NULL; + + if (head->extent_op && head->extent_op->update_key) { + btrfs_disk_key_to_cpu(&key, &head->extent_op->key); + key_ptr = &key; + } ref = btrfs_delayed_node_to_tree_ref(node); ret = add_indirect_ref(fs_info, preftrees, ref->root, - &tmp_op_key, ref->level + 1, + key_ptr, ref->level + 1, node->bytenr, count, sc, GFP_ATOMIC); break; -- GitLab From 63c84b46b3b75798f1ad63527b6250de00331907 Mon Sep 17 00:00:00 2001 From: Filipe Manana <fdmanana@suse.com> Date: Tue, 11 Oct 2022 13:16:53 +0100 Subject: [PATCH 1718/2223] btrfs: ignore fiemap path cache if we have multiple leaves for a data extent The path cache used during fiemap used to determine the sharedness of extent buffers in a path from 
a leaf containing a file extent item pointing to our data extent up to the root node of the tree, is meant to be used for a single path. Having a single path is by far the most common case, and therefore worth to optimize for, but it's possible to actually have multiple paths because we have 2 or more leaves. If we have multiple leaves, the 'level' variable keeps getting incremented in each iteration of the while loop at btrfs_is_data_extent_shared(), which means we will treat the second leaf in the 'tmp' ulist as a level 1 node, and so forth. In the worst case this can lead to getting a level greater than or equals to BTRFS_MAX_LEVEL (8), which will trigger a WARN_ON_ONCE() in the functions to lookup from or store in the path cache (lookup_backref_shared_cache() and store_backref_shared_cache()). If the current level never goes beyond 8, due to shared nodes in the paths and a fs tree height smaller than 8, it can still result in incorrectly marking one leaf as shared because some other leaf is shared and is stored one level below that other leaf, as when storing a true sharedness value in the cache results in updating the sharedness to true of all entries in the cache below the current level. Having multiple leaves happens in a case like the following: - We have a file extent item point to data extent at bytenr X, for a file range [0, 1M[ for example; - At this moment we have an extent data ref for the extent, with an offset of 0 and a count of 1; - A write into the middle of the extent happens, file range [64K, 128K) so the file extent item is split into two (at btrfs_drop_extents()): 1) One for file range [0, 64K), with a length (num_bytes field) of 64K and an extent offset of 0; 2) Another one for file range [128K, 1M), with a length of 896K (1M - 128K) and an extent offset of 128K. - At this moment the two file extent items are located in the same leaf; - A new file extent item for the range [64K, 128K), pointing to a new data extent, is inserted in the leaf. 
This results in a leaf split and now those two file extent items pointing to data extent X end up located in different leaves; - Once delayed refs are run, we still have a single extent data ref item for our data extent at bytenr X, for offset 0, but now with a count of 2 instead of 1; - So during fiemap, at btrfs_is_data_extent_shared(), after we call find_parent_nodes() for the data extent, we get two leaves, since we have two file extent items point to data extent at bytenr X that are located in two different leaves. So skip the use of the path cache when we get more than one leaf. Fixes: 12a824dc67a61e ("btrfs: speedup checking for extent sharedness during fiemap") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> --- fs/btrfs/backref.c | 25 +++++++++++++++++++++++++ fs/btrfs/backref.h | 1 + 2 files changed, 26 insertions(+) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 4e29ccb234c0d..4ec18ceb2f21d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1536,6 +1536,9 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache { struct btrfs_backref_shared_cache_entry *entry; + if (!cache->use_cache) + return false; + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) return false; @@ -1600,6 +1603,9 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, struct btrfs_backref_shared_cache_entry *entry; u64 gen; + if (!cache->use_cache) + return; + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) return; @@ -1697,6 +1703,7 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, /* -1 means we are in the bytenr of the data extent. 
*/ level = -1; ULIST_ITER_INIT(&uiter); + cache->use_cache = true; while (1) { bool is_shared; bool cached; @@ -1726,6 +1733,24 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, extent_gen > btrfs_root_last_snapshot(&root->root_item)) break; + /* + * If our data extent was not directly shared (without multiple + * reference items), than it might have a single reference item + * with a count > 1 for the same offset, which means there are 2 + * (or more) file extent items that point to the data extent - + * this happens when a file extent item needs to be split and + * then one item gets moved to another leaf due to a b+tree leaf + * split when inserting some item. In this case the file extent + * items may be located in different leaves and therefore some + * of the leaves may be referenced through shared subtrees while + * others are not. Since our extent buffer cache only works for + * a single path (by far the most common case and simpler to + * deal with), we can not use it if we have multiple leaves + * (which implies multiple paths). + */ + if (level == -1 && tmp->nnodes > 1) + cache->use_cache = false; + if (level >= 0) store_backref_shared_cache(cache, root, bytenr, level, false); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 52ae6957b4142..8e69584d538d2 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -29,6 +29,7 @@ struct btrfs_backref_shared_cache { * a given data extent should never exceed the maximum b+tree height. 
*/ struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL]; + bool use_cache; }; typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, -- GitLab From 6e141772e6465f937458b35ddcfd0a981b6f5280 Mon Sep 17 00:00:00 2001 From: Wenchao Chen <wenchao.chen@unisoc.com> Date: Tue, 11 Oct 2022 18:49:35 +0800 Subject: [PATCH 1719/2223] mmc: sdhci-sprd: Fix minimum clock limit The Spreadtrum controller supports 100KHz minimal clock rate, which means that the current value 400KHz is wrong. Unfortunately this has also lead to fail to initialize some cards, which are allowed to require 100KHz to work. So, let's fix the problem by changing the minimal supported clock rate to 100KHz. Signed-off-by: Wenchao Chen <wenchao.chen@unisoc.com> Acked-by: Adrian Hunter <adrian.hunter@intel.com> Fixes: fb8bd90f83c4 ("mmc: sdhci-sprd: Add Spreadtrum's initial host controller") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20221011104935.10980-1-wenchao.chen666@gmail.com [Ulf: Clarified to commit-message] Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org> --- drivers/mmc/host/sdhci-sprd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-sprd.c b/drivers/mmc/host/sdhci-sprd.c index 46c55ab4884c2..b92a408f138dd 100644 --- a/drivers/mmc/host/sdhci-sprd.c +++ b/drivers/mmc/host/sdhci-sprd.c @@ -309,7 +309,7 @@ static unsigned int sdhci_sprd_get_max_clock(struct sdhci_host *host) static unsigned int sdhci_sprd_get_min_clock(struct sdhci_host *host) { - return 400000; + return 100000; } static void sdhci_sprd_set_uhs_signaling(struct sdhci_host *host, -- GitLab From 6c482c62a635aa4f534d2439fbf8afa37452b986 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= <thomas.hellstrom@linux.intel.com> Date: Wed, 5 Oct 2022 14:11:59 +0200 Subject: [PATCH 1720/2223] drm/i915: Fix display problems after resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 39a2bd34c933 
("drm/i915: Use the vma resource as argument for gtt binding / unbinding") introduced a regression that due to the vma resource tracking of the binding state, dpt ptes were not correctly repopulated. Fix this by clearing the vma resource state before repopulating. The state will subsequently be restored by the bind_vma operation. Fixes: 39a2bd34c933 ("drm/i915: Use the vma resource as argument for gtt binding / unbinding") Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20220912121957.31310-1-thomas.hellstrom@linux.intel.com Cc: Matthew Auld <matthew.auld@intel.com> Cc: intel-gfx@lists.freedesktop.org Cc: <stable@vger.kernel.org> # v5.18+ Reported-and-tested-by: Kevin Boulain <kevinboulain@gmail.com> Tested-by: David de Sousa <davidesousa@gmail.com> Reviewed-by: Matthew Auld <matthew.auld@intel.com> Reviewed-by: Andrzej Hajda <andrzej.hajda@intel.com> Signed-off-by: Matthew Auld <matthew.auld@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20221005121159.340245-1-thomas.hellstrom@linux.intel.com (cherry picked from commit bc2472538c0d1cce334ffc9e97df0614cd2b1469) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> --- drivers/gpu/drm/i915/gt/intel_ggtt.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c index 30cf5c3369d9f..2049a00417afa 100644 --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c @@ -1275,10 +1275,16 @@ bool i915_ggtt_resume_vm(struct i915_address_space *vm) atomic_read(&vma->flags) & I915_VMA_BIND_MASK; GEM_BUG_ON(!was_bound); - if (!retained_ptes) + if (!retained_ptes) { + /* + * Clear the bound flags of the vma resource to allow + * ptes to be repopulated. + */ + vma->resource->bound_flags = 0; vma->ops->bind_vma(vm, NULL, vma->resource, obj ? 
obj->cache_level : 0, was_bound); + } if (obj) { /* only used during resume => exclusive access */ write_domain_objs |= fetch_and_zero(&obj->write_domain); obj->read_domains |= I915_GEM_DOMAIN_GTT; -- GitLab From bd86c69dae65de30f6d47249418ba7889809e31a Mon Sep 17 00:00:00 2001 From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Date: Mon, 10 Oct 2022 14:59:02 +0900 Subject: [PATCH 1721/2223] NFSD: unregister shrinker when nfsd_init_net() fails syzbot is reporting UAF read at register_shrinker_prepared() [1], for commit 7746b32f467b3813 ("NFSD: add shrinker to reap courtesy clients on low memory condition") missed that nfsd4_leases_net_shutdown() from nfsd_exit_net() is called only when nfsd_init_net() succeeded. If nfsd_init_net() fails due to nfsd_reply_cache_init() failure, register_shrinker() from nfsd4_init_leases_net() has to be undone before nfsd_init_net() returns. Link: https://syzkaller.appspot.com/bug?extid=ff796f04613b4c84ad89 [1] Reported-by: syzbot <syzbot+ff796f04613b4c84ad89@syzkaller.appspotmail.com> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Fixes: 7746b32f467b3813 ("NFSD: add shrinker to reap courtesy clients on low memory condition") Reviewed-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: Chuck Lever <chuck.lever@oracle.com> --- fs/nfsd/nfsctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 6a29bcfc93909..dc74a947a440c 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1458,12 +1458,14 @@ static __net_init int nfsd_init_net(struct net *net) goto out_drc_error; retval = nfsd_reply_cache_init(nn); if (retval) - goto out_drc_error; + goto out_cache_error; get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); return 0; +out_cache_error: + nfsd4_leases_net_shutdown(nn); out_drc_error: nfsd_idmap_shutdown(net); out_idmap_error: -- GitLab From cdbb816b5bfeb69ad925805d99b2ec312b241f1c Mon Sep 17 00:00:00 2001 
From: Tao Zhou <tao.zhou1@amd.com> Date: Mon, 26 Sep 2022 17:01:33 +0800 Subject: [PATCH 1722/2223] drm/amdgpu: remove check for CE in RAS error address query Only RAS UE error address is queried currently, no need to check CE status. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/umc_v6_1.c | 10 ++-- drivers/gpu/drm/amd/amdgpu/umc_v6_7.c | 67 +++++++++++--------------- drivers/gpu/drm/amd/amdgpu/umc_v8_10.c | 52 +++++++++----------- drivers/gpu/drm/amd/amdgpu/umc_v8_7.c | 20 +++----- 4 files changed, 59 insertions(+), 90 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c index 939cb203f7ad5..f17d297b594bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c @@ -327,10 +327,9 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, return; } - /* calculate error address if ue/ce error is detected */ + /* calculate error address if ue error is detected */ if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && - (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || - REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) { + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) { err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4); /* the lowest lsb bits should be ignored */ @@ -343,10 +342,7 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, ADDR_OF_256B_BLOCK(channel_index) | OFFSET_IN_256B_BLOCK(err_addr); - /* we only save ue error information currently, ce is skipped */ - if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) - == 1) - amdgpu_umc_fill_error_record(err_data, err_addr, + amdgpu_umc_fill_error_record(err_data, err_addr, retired_page, channel_index, umc_inst); } diff --git 
a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c index a0d19b7683463..64d760eb92a39 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c @@ -209,10 +209,9 @@ static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev, if (!err_data->err_addr) return; - /* calculate error address if ue/ce error is detected */ + /* calculate error address if ue error is detected */ if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && - (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || - REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) { + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) { err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr; err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); @@ -228,22 +227,18 @@ static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev, /* clear [C4 C3 C2] in soc physical address */ soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT); - /* we only save ue error information currently, ce is skipped */ - if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) - == 1) { - /* loop for all possibilities of [C4 C3 C2] */ - for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) { - retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT); - dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page); - amdgpu_umc_fill_error_record(err_data, err_addr, - retired_page, channel_index, umc_inst); - - /* shift R14 bit */ - retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT); - dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page); - amdgpu_umc_fill_error_record(err_data, err_addr, - retired_page, channel_index, umc_inst); - } + /* loop for all possibilities of [C4 C3 C2] */ + for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) { + retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT); + dev_info(adev->dev, "Error 
Address(PA): 0x%llx\n", retired_page); + amdgpu_umc_fill_error_record(err_data, err_addr, + retired_page, channel_index, umc_inst); + + /* shift R14 bit */ + retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT); + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page); + amdgpu_umc_fill_error_record(err_data, err_addr, + retired_page, channel_index, umc_inst); } } } @@ -481,10 +476,9 @@ static void umc_v6_7_query_error_address(struct amdgpu_device *adev, channel_index = adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst]; - /* calculate error address if ue/ce error is detected */ + /* calculate error address if ue error is detected */ if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && - (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || - REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) || + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) || mca_addr != UMC_INVALID_ADDR) { if (mca_addr == UMC_INVALID_ADDR) { err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4); @@ -505,23 +499,18 @@ static void umc_v6_7_query_error_address(struct amdgpu_device *adev, /* clear [C4 C3 C2] in soc physical address */ soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT); - /* we only save ue error information currently, ce is skipped */ - if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) - == 1 || - mca_addr != UMC_INVALID_ADDR) { - /* loop for all possibilities of [C4 C3 C2] */ - for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) { - retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT); - dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page); - amdgpu_umc_fill_error_record(err_data, err_addr, - retired_page, channel_index, umc_inst); - - /* shift R14 bit */ - retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT); - dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page); - amdgpu_umc_fill_error_record(err_data, err_addr, - 
retired_page, channel_index, umc_inst); - } + /* loop for all possibilities of [C4 C3 C2] */ + for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) { + retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT); + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page); + amdgpu_umc_fill_error_record(err_data, err_addr, + retired_page, channel_index, umc_inst); + + /* shift R14 bit */ + retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT); + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page); + amdgpu_umc_fill_error_record(err_data, err_addr, + retired_page, channel_index, umc_inst); } } diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c index a8cbda81828da..38f9e29990cc4 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c @@ -208,7 +208,10 @@ static void umc_v8_10_query_error_address(struct amdgpu_device *adev, { uint64_t mc_umc_status_addr; uint64_t mc_umc_status, err_addr; - uint32_t channel_index; + uint64_t mc_umc_addrt0, na_err_addr_base; + uint64_t na_err_addr, retired_page_addr; + uint32_t channel_index, addr_lsb, col = 0; + int ret = 0; mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); @@ -229,13 +232,10 @@ static void umc_v8_10_query_error_address(struct amdgpu_device *adev, umc_inst * adev->umc.channel_inst_num + ch_inst]; - /* calculate error address if ue/ce error is detected */ + /* calculate error address if ue error is detected */ if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 1 && - (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || - REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) { - uint32_t addr_lsb; - uint64_t mc_umc_addrt0; + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) { mc_umc_addrt0 = SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0); 
err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4); @@ -243,32 +243,24 @@ static void umc_v8_10_query_error_address(struct amdgpu_device *adev, /* the lowest lsb bits should be ignored */ addr_lsb = REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrLsb); - err_addr &= ~((0x1ULL << addr_lsb) - 1); - - /* we only save ue error information currently, ce is skipped */ - if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) { - uint64_t na_err_addr_base = err_addr & ~(0x3ULL << UMC_V8_10_NA_C5_BIT); - uint64_t na_err_addr, retired_page_addr; - uint32_t col = 0; - int ret = 0; - - /* loop for all possibilities of [C6 C5] in normal address. */ - for (col = 0; col < UMC_V8_10_NA_COL_2BITS_POWER_OF_2_NUM; col++) { - na_err_addr = na_err_addr_base | (col << UMC_V8_10_NA_C5_BIT); - - /* Mapping normal error address to retired soc physical address. */ - ret = umc_v8_10_swizzle_mode_na_to_pa(adev, channel_index, - na_err_addr, &retired_page_addr); - if (ret) { - dev_err(adev->dev, "Failed to map pa from umc na.\n"); - break; - } - dev_info(adev->dev, "Error Address(PA): 0x%llx\n", - retired_page_addr); - amdgpu_umc_fill_error_record(err_data, na_err_addr, - retired_page_addr, channel_index, umc_inst); + na_err_addr_base = err_addr & ~(0x3ULL << UMC_V8_10_NA_C5_BIT); + + /* loop for all possibilities of [C6 C5] in normal address. */ + for (col = 0; col < UMC_V8_10_NA_COL_2BITS_POWER_OF_2_NUM; col++) { + na_err_addr = na_err_addr_base | (col << UMC_V8_10_NA_C5_BIT); + + /* Mapping normal error address to retired soc physical address. 
*/ + ret = umc_v8_10_swizzle_mode_na_to_pa(adev, channel_index, + na_err_addr, &retired_page_addr); + if (ret) { + dev_err(adev->dev, "Failed to map pa from umc na.\n"); + break; } + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", + retired_page_addr); + amdgpu_umc_fill_error_record(err_data, na_err_addr, + retired_page_addr, channel_index, umc_inst); } } diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c index f35253e0eaa6d..e2623685cb44f 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c @@ -130,10 +130,9 @@ static void umc_v8_7_ecc_info_query_error_address(struct amdgpu_device *adev, if (!err_data->err_addr) return; - /* calculate error address if ue/ce error is detected */ + /* calculate error address if ue error is detected */ if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && - (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || - REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) { + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) { err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr; err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); @@ -143,10 +142,7 @@ static void umc_v8_7_ecc_info_query_error_address(struct amdgpu_device *adev, ADDR_OF_256B_BLOCK(channel_index) | OFFSET_IN_256B_BLOCK(err_addr); - /* we only save ue error information currently, ce is skipped */ - if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) - == 1) - amdgpu_umc_fill_error_record(err_data, err_addr, + amdgpu_umc_fill_error_record(err_data, err_addr, retired_page, channel_index, umc_inst); } } @@ -343,10 +339,9 @@ static void umc_v8_7_query_error_address(struct amdgpu_device *adev, return; } - /* calculate error address if ue/ce error is detected */ + /* calculate error address if ue error is detected */ if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Val) == 1 && - (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || - REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) { + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) { err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4); /* the lowest lsb bits should be ignored */ @@ -359,10 +354,7 @@ static void umc_v8_7_query_error_address(struct amdgpu_device *adev, ADDR_OF_256B_BLOCK(channel_index) | OFFSET_IN_256B_BLOCK(err_addr); - /* we only save ue error information currently, ce is skipped */ - if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) - == 1) - amdgpu_umc_fill_error_record(err_data, err_addr, + amdgpu_umc_fill_error_record(err_data, err_addr, retired_page, channel_index, umc_inst); } -- GitLab From 44420ac5f855f5704d8f939926ed145f99e49e55 Mon Sep 17 00:00:00 2001 From: Tao Zhou <tao.zhou1@amd.com> Date: Tue, 27 Sep 2022 11:36:46 +0800 Subject: [PATCH 1723/2223] drm/amdgpu: define RAS convert_error_address API Make the code reusable and remove redundant code. 
Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 7 +- drivers/gpu/drm/amd/amdgpu/umc_v6_7.c | 148 ++++++++++-------------- 3 files changed, 64 insertions(+), 93 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ccebd8e2a2d8d..c2f9970e851c6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2889,7 +2889,7 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb, if (adev->umc.ras && adev->umc.ras->convert_ras_error_address) adev->umc.ras->convert_ras_error_address(adev, - &err_data, 0, ch_inst, umc_inst, m->addr); + &err_data, m->addr, ch_inst, umc_inst); if (amdgpu_bad_page_threshold != 0) { amdgpu_ras_add_bad_pages(adev, err_data.err_addr, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index 2fb4951a64338..e46439274f3a0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -22,8 +22,6 @@ #define __AMDGPU_UMC_H__ #include "amdgpu_ras.h" -#define UMC_INVALID_ADDR 0x1ULL - /* * (addr / 256) * 4096, the higher 26 bits in ErrorAddr * is the index of 4KB block @@ -54,9 +52,8 @@ struct amdgpu_umc_ras { void (*err_cnt_init)(struct amdgpu_device *adev); bool (*query_ras_poison_mode)(struct amdgpu_device *adev); void (*convert_ras_error_address)(struct amdgpu_device *adev, - struct ras_err_data *err_data, - uint32_t umc_reg_offset, uint32_t ch_inst, - uint32_t umc_inst, uint64_t mca_addr); + struct ras_err_data *err_data, uint64_t err_addr, + uint32_t ch_inst, uint32_t umc_inst); void (*ecc_info_query_ras_error_count)(struct amdgpu_device *adev, void *ras_error_status); void (*ecc_info_query_ras_error_address)(struct amdgpu_device *adev, diff --git 
a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c index 64d760eb92a39..5d5d031c9e7d0 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c @@ -187,20 +187,51 @@ static void umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev, } } +static void umc_v6_7_convert_error_address(struct amdgpu_device *adev, + struct ras_err_data *err_data, uint64_t err_addr, + uint32_t ch_inst, uint32_t umc_inst) +{ + uint32_t channel_index; + uint64_t soc_pa, retired_page, column; + + channel_index = + adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst]; + /* translate umc channel address to soc pa, 3 parts are included */ + soc_pa = ADDR_OF_8KB_BLOCK(err_addr) | + ADDR_OF_256B_BLOCK(channel_index) | + OFFSET_IN_256B_BLOCK(err_addr); + + /* The umc channel bits are not original values, they are hashed */ + SET_CHANNEL_HASH(channel_index, soc_pa); + + /* clear [C4 C3 C2] in soc physical address */ + soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT); + + /* loop for all possibilities of [C4 C3 C2] */ + for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) { + retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT); + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page); + amdgpu_umc_fill_error_record(err_data, err_addr, + retired_page, channel_index, umc_inst); + + /* shift R14 bit */ + retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT); + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page); + amdgpu_umc_fill_error_record(err_data, err_addr, + retired_page, channel_index, umc_inst); + } +} + static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev, struct ras_err_data *err_data, uint32_t ch_inst, uint32_t umc_inst) { - uint64_t mc_umc_status, err_addr, soc_pa, retired_page, column; - uint32_t channel_index; + uint64_t mc_umc_status, err_addr; uint32_t eccinfo_table_idx; struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); eccinfo_table_idx = 
umc_inst * adev->umc.channel_inst_num + ch_inst; - channel_index = - adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst]; - mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status; if (mc_umc_status == 0) @@ -216,30 +247,8 @@ static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev, err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr; err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); - /* translate umc channel address to soc pa, 3 parts are included */ - soc_pa = ADDR_OF_8KB_BLOCK(err_addr) | - ADDR_OF_256B_BLOCK(channel_index) | - OFFSET_IN_256B_BLOCK(err_addr); - - /* The umc channel bits are not original values, they are hashed */ - SET_CHANNEL_HASH(channel_index, soc_pa); - - /* clear [C4 C3 C2] in soc physical address */ - soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT); - - /* loop for all possibilities of [C4 C3 C2] */ - for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) { - retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT); - dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page); - amdgpu_umc_fill_error_record(err_data, err_addr, - retired_page, channel_index, umc_inst); - - /* shift R14 bit */ - retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT); - dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page); - amdgpu_umc_fill_error_record(err_data, err_addr, - retired_page, channel_index, umc_inst); - } + umc_v6_7_convert_error_address(adev, err_data, err_addr, + ch_inst, umc_inst); } } @@ -448,75 +457,40 @@ static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev, static void umc_v6_7_query_error_address(struct amdgpu_device *adev, struct ras_err_data *err_data, uint32_t umc_reg_offset, uint32_t ch_inst, - uint32_t umc_inst, uint64_t mca_addr) + uint32_t umc_inst) { uint32_t mc_umc_status_addr; - uint32_t channel_index; - uint64_t mc_umc_status = 0, mc_umc_addrt0; - uint64_t err_addr, soc_pa, retired_page, column; + uint64_t 
mc_umc_status = 0, mc_umc_addrt0, err_addr; - if (mca_addr == UMC_INVALID_ADDR) { - mc_umc_status_addr = - SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); - mc_umc_addrt0 = - SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0); + mc_umc_status_addr = + SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); + mc_umc_addrt0 = + SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0); - mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4); + mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4); - if (mc_umc_status == 0) - return; + if (mc_umc_status == 0) + return; - if (!err_data->err_addr) { - /* clear umc status */ - WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); - return; - } + if (!err_data->err_addr) { + /* clear umc status */ + WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); + return; } - channel_index = - adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst]; - /* calculate error address if ue error is detected */ - if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && - REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) || - mca_addr != UMC_INVALID_ADDR) { - if (mca_addr == UMC_INVALID_ADDR) { - err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4); - err_addr = - REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); - } else { - err_addr = mca_addr; - } + if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) { + err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4); + err_addr = + REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); - /* translate umc channel address to soc pa, 3 parts are included */ - soc_pa = ADDR_OF_8KB_BLOCK(err_addr) | - ADDR_OF_256B_BLOCK(channel_index) | - OFFSET_IN_256B_BLOCK(err_addr); - - /* The umc channel bits are not original values, they are hashed */ - 
SET_CHANNEL_HASH(channel_index, soc_pa); - - /* clear [C4 C3 C2] in soc physical address */ - soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT); - - /* loop for all possibilities of [C4 C3 C2] */ - for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) { - retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT); - dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page); - amdgpu_umc_fill_error_record(err_data, err_addr, - retired_page, channel_index, umc_inst); - - /* shift R14 bit */ - retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT); - dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page); - amdgpu_umc_fill_error_record(err_data, err_addr, - retired_page, channel_index, umc_inst); - } + umc_v6_7_convert_error_address(adev, err_data, err_addr, + ch_inst, umc_inst); } /* clear umc status */ - if (mca_addr == UMC_INVALID_ADDR) - WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); + WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); } static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev, @@ -538,7 +512,7 @@ static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev, umc_v6_7_query_error_address(adev, err_data, umc_reg_offset, ch_inst, - umc_inst, UMC_INVALID_ADDR); + umc_inst); } } @@ -579,5 +553,5 @@ struct amdgpu_umc_ras umc_v6_7_ras = { .query_ras_poison_mode = umc_v6_7_query_ras_poison_mode, .ecc_info_query_ras_error_count = umc_v6_7_ecc_info_query_ras_error_count, .ecc_info_query_ras_error_address = umc_v6_7_ecc_info_query_ras_error_address, - .convert_ras_error_address = umc_v6_7_query_error_address, + .convert_ras_error_address = umc_v6_7_convert_error_address, }; -- GitLab From fb4d5891cee6d1c14b8d8f1b65c9d061ed3a495c Mon Sep 17 00:00:00 2001 From: Tao Zhou <tao.zhou1@amd.com> Date: Tue, 27 Sep 2022 11:42:45 +0800 Subject: [PATCH 1724/2223] drm/amdgpu: define convert_error_address for umc v8.7 So the code can be simplified. 
Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/umc_v8_7.c | 47 ++++++++++++++------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c index e2623685cb44f..b717fdaa46e45 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c @@ -108,20 +108,35 @@ static void umc_v8_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev, } } +static void umc_v8_7_convert_error_address(struct amdgpu_device *adev, + struct ras_err_data *err_data, uint64_t err_addr, + uint32_t ch_inst, uint32_t umc_inst) +{ + uint64_t retired_page; + uint32_t channel_index; + + channel_index = + adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst]; + + /* translate umc channel address to soc pa, 3 parts are included */ + retired_page = ADDR_OF_4KB_BLOCK(err_addr) | + ADDR_OF_256B_BLOCK(channel_index) | + OFFSET_IN_256B_BLOCK(err_addr); + + amdgpu_umc_fill_error_record(err_data, err_addr, + retired_page, channel_index, umc_inst); +} + static void umc_v8_7_ecc_info_query_error_address(struct amdgpu_device *adev, struct ras_err_data *err_data, uint32_t ch_inst, uint32_t umc_inst) { - uint64_t mc_umc_status, err_addr, retired_page; - uint32_t channel_index; + uint64_t mc_umc_status, err_addr; uint32_t eccinfo_table_idx; struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst; - channel_index = - adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst]; - mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status; if (mc_umc_status == 0) @@ -137,13 +152,8 @@ static void umc_v8_7_ecc_info_query_error_address(struct amdgpu_device *adev, err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr; err_addr = 
REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); - /* translate umc channel address to soc pa, 3 parts are included */ - retired_page = ADDR_OF_4KB_BLOCK(err_addr) | - ADDR_OF_256B_BLOCK(channel_index) | - OFFSET_IN_256B_BLOCK(err_addr); - - amdgpu_umc_fill_error_record(err_data, err_addr, - retired_page, channel_index, umc_inst); + umc_v8_7_convert_error_address(adev, err_data, err_addr, + ch_inst, umc_inst); } } @@ -320,14 +330,12 @@ static void umc_v8_7_query_error_address(struct amdgpu_device *adev, uint32_t umc_inst) { uint32_t lsb, mc_umc_status_addr; - uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0; - uint32_t channel_index = adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst]; + uint64_t mc_umc_status, err_addr, mc_umc_addrt0; mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); mc_umc_addrt0 = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0); - mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4); if (mc_umc_status == 0) @@ -349,13 +357,8 @@ static void umc_v8_7_query_error_address(struct amdgpu_device *adev, err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); err_addr &= ~((0x1ULL << lsb) - 1); - /* translate umc channel address to soc pa, 3 parts are included */ - retired_page = ADDR_OF_4KB_BLOCK(err_addr) | - ADDR_OF_256B_BLOCK(channel_index) | - OFFSET_IN_256B_BLOCK(err_addr); - - amdgpu_umc_fill_error_record(err_data, err_addr, - retired_page, channel_index, umc_inst); + umc_v8_7_convert_error_address(adev, err_data, err_addr, + ch_inst, umc_inst); } /* clear umc status */ -- GitLab From 38dbbfa57c08b29ef8cf1d3fb3ad639ae819754e Mon Sep 17 00:00:00 2001 From: Tao Zhou <tao.zhou1@amd.com> Date: Thu, 29 Sep 2022 14:59:11 +0800 Subject: [PATCH 1725/2223] drm/amdgpu: fix coding style issue for mca notifier Fix some issues found by checkpatch script. 
Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c2f9970e851c6..2dad7aa9a03b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2877,9 +2877,9 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb, err_data.err_addr = kcalloc(adev->umc.max_ras_err_cnt_per_query, sizeof(struct eeprom_table_record), GFP_KERNEL); - if(!err_data.err_addr) { - dev_warn(adev->dev, "Failed to alloc memory for " - "umc error address record in mca notifier!\n"); + if (!err_data.err_addr) { + dev_warn(adev->dev, + "Failed to alloc memory for umc error record in mca notifier!\n"); return NOTIFY_DONE; } -- GitLab From 6dddc1eb9632b0eb6098d1dc849e8acb2408c1b6 Mon Sep 17 00:00:00 2001 From: Candice Li <candice.li@amd.com> Date: Mon, 26 Sep 2022 16:18:56 +0800 Subject: [PATCH 1726/2223] drm/amdgpu: Update umc v8_10_0 headers Add GeccCtrl offset and mask to umc v8_10_0 headers. 
Signed-off-by: Candice Li <candice.li@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_offset.h | 2 ++ drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_sh_mask.h | 3 +++ 2 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_offset.h index b798cf5a2c39c..38adde3cae5ac 100644 --- a/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_offset.h +++ b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_offset.h @@ -29,5 +29,7 @@ #define regMCA_UMC_UMC0_MCUMC_STATUST0_BASE_IDX 2 #define regMCA_UMC_UMC0_MCUMC_ADDRT0 0x03c4 #define regMCA_UMC_UMC0_MCUMC_ADDRT0_BASE_IDX 2 +#define regUMCCH0_0_GeccCtrl 0x0053 +#define regUMCCH0_0_GeccCtrl_BASE_IDX 2 #endif diff --git a/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_sh_mask.h index bd99b431247f3..4dbec524f9434 100644 --- a/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_sh_mask.h +++ b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_sh_mask.h @@ -90,5 +90,8 @@ #define MCA_UMC_UMC0_MCUMC_ADDRT0__ErrorAddr__SHIFT 0x0 #define MCA_UMC_UMC0_MCUMC_ADDRT0__Reserved__SHIFT 0x38 #define MCA_UMC_UMC0_MCUMC_ADDRT0__ErrorAddr_MASK 0x00FFFFFFFFFFFFFFL +//UMCCH0_0_GeccCtrl +#define UMCCH0_0_GeccCtrl__UCFatalEn__SHIFT 0xd +#define UMCCH0_0_GeccCtrl__UCFatalEn_MASK 0x00002000L #endif -- GitLab From 832e72dd0d705bfcb4236bb2d561d82afe253e63 Mon Sep 17 00:00:00 2001 From: Candice Li <candice.li@amd.com> Date: Mon, 26 Sep 2022 16:21:05 +0800 Subject: [PATCH 1727/2223] drm/amdgpu: Add poison mode query for umc v8_10_0 Add poison mode query support on umc v8_10_0. 
Signed-off-by: Candice Li <candice.li@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/umc_v8_10.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c index 38f9e29990cc4..91235df54e22b 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c @@ -330,6 +330,31 @@ static void umc_v8_10_err_cnt_init(struct amdgpu_device *adev) } } +static uint32_t umc_v8_10_query_ras_poison_mode_per_channel( + struct amdgpu_device *adev, + uint32_t umc_reg_offset) +{ + uint32_t ecc_ctrl_addr, ecc_ctrl; + + ecc_ctrl_addr = + SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_GeccCtrl); + ecc_ctrl = RREG32_PCIE((ecc_ctrl_addr + + umc_reg_offset) * 4); + + return REG_GET_FIELD(ecc_ctrl, UMCCH0_0_GeccCtrl, UCFatalEn); +} + +static bool umc_v8_10_query_ras_poison_mode(struct amdgpu_device *adev) +{ + uint32_t umc_reg_offset = 0; + + /* Enabling fatal error in umc node0 instance0 channel0 will be + * considered as fatal error mode + */ + umc_reg_offset = get_umc_v8_10_reg_offset(adev, 0, 0, 0); + return !umc_v8_10_query_ras_poison_mode_per_channel(adev, umc_reg_offset); +} + const struct amdgpu_ras_block_hw_ops umc_v8_10_ras_hw_ops = { .query_ras_error_count = umc_v8_10_query_ras_error_count, .query_ras_error_address = umc_v8_10_query_ras_error_address, @@ -340,4 +365,5 @@ struct amdgpu_umc_ras umc_v8_10_ras = { .hw_ops = &umc_v8_10_ras_hw_ops, }, .err_cnt_init = umc_v8_10_err_cnt_init, + .query_ras_poison_mode = umc_v8_10_query_ras_poison_mode, }; -- GitLab From 09f1ef99ce900dbc3659d478f006081c96cc977f Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Date: Tue, 27 Sep 2022 17:59:21 -0400 Subject: [PATCH 1728/2223] drm/amd/display: Clean some DCN32 macros Some unused macros might mislead developers during the debug, which can be removed without 
any issue. This commit drops some unused references to SE_COMMON_MASK_SH_LIST_DCN32. Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../amd/display/dc/dcn32/dcn32_dio_stream_encoder.h | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.h b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.h index 20e5f016a45a3..ecd041a446d2c 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.h +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_stream_encoder.h @@ -95,7 +95,7 @@ SRI(DIG_FIFO_CTRL0, DIG, id) -#define SE_COMMON_MASK_SH_LIST_DCN32_BASE(mask_sh)\ +#define SE_COMMON_MASK_SH_LIST_DCN32(mask_sh)\ SE_SF(DP0_DP_PIXEL_FORMAT, DP_PIXEL_ENCODING, mask_sh),\ SE_SF(DP0_DP_PIXEL_FORMAT, DP_COMPONENT_DEPTH, mask_sh),\ SE_SF(DP0_DP_PIXEL_FORMAT, DP_PIXEL_PER_CYCLE_PROCESSING_MODE, mask_sh),\ @@ -247,15 +247,6 @@ SE_SF(DIG0_DIG_FIFO_CTRL0, DIG_FIFO_RESET_DONE, mask_sh),\ SE_SF(DIG0_DIG_FIFO_CTRL0, DIG_FIFO_OUTPUT_PIXEL_MODE, mask_sh) -#if defined(CONFIG_DRM_AMD_DC_HDCP) -#define SE_COMMON_MASK_SH_LIST_DCN32(mask_sh)\ - SE_COMMON_MASK_SH_LIST_DCN32_BASE(mask_sh),\ - SE_SF(DIG0_HDMI_VBI_PACKET_CONTROL, HDMI_ACP_SEND, mask_sh) -#else -#define SE_COMMON_MASK_SH_LIST_DCN32(mask_sh)\ - SE_COMMON_MASK_SH_LIST_DCN32_BASE(mask_sh) -#endif - void dcn32_dio_stream_encoder_construct( struct dcn10_stream_encoder *enc1, struct dc_context *ctx, -- GitLab From 1ba25b6ff24303fac890d657ffdebf3e8db3bc25 Mon Sep 17 00:00:00 2001 From: Aric Cyr <aric.cyr@amd.com> Date: Sun, 2 Oct 2022 11:59:13 -0400 Subject: [PATCH 1729/2223] drm/amd/display: 3.2.207 DC version 3.2.207 brings along the following: - PMFW z-state interface update - Cursor update refactor - Fixes to DSC validation, DCFCLK during Freesync, etc. 
- Code cleanup Tested-by: Daniel Wheeler <daniel.wheeler@amd.com> Acked-by: Qingqing Zhuo <qingqing.zhuo@amd.com> Signed-off-by: Aric Cyr <aric.cyr@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 5d0103e20412c..bfc5474c0f4c9 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -47,7 +47,7 @@ struct aux_payload; struct set_config_cmd_payload; struct dmub_notification; -#define DC_VER "3.2.206" +#define DC_VER "3.2.207" #define MAX_SURFACES 3 #define MAX_PLANES 6 -- GitLab From eff4ccd11313ecc8ec94c0f39961ffbf227a406d Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Tue, 11 Oct 2022 20:41:03 +0800 Subject: [PATCH 1730/2223] drm/amd/display: fix build error on arm64 dcn20_build_mapped_resource() and dcn20_acquire_dsc() is not defined, if CONFIG_DRM_AMD_DC_DCN is disabled. Fix the following build error on arm64: ERROR: modpost: "dcn20_build_mapped_resource" [drivers/gpu/drm/amd/amdgpu/amdgpu.ko] undefined! ERROR: modpost: "dcn20_acquire_dsc" [drivers/gpu/drm/amd/amdgpu/amdgpu.ko] undefined! 
Fixes: 20dad3813b3c ("drm/amd/display: Add a helper to map ODM/MPC/Multi-Plane resources") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/core/dc_resource.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c index 4a6e867369b84..fd8db482e56f9 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c @@ -3721,12 +3721,16 @@ bool dc_resource_acquire_secondary_pipe_for_mpc_odm( else sec_pipe->stream_res.opp = sec_pipe->top_pipe->stream_res.opp; if (sec_pipe->stream->timing.flags.DSC == 1) { +#if defined(CONFIG_DRM_AMD_DC_DCN) dcn20_acquire_dsc(dc, &state->res_ctx, &sec_pipe->stream_res.dsc, pipe_idx); +#endif ASSERT(sec_pipe->stream_res.dsc); if (sec_pipe->stream_res.dsc == NULL) return false; } +#if defined(CONFIG_DRM_AMD_DC_DCN) dcn20_build_mapped_resource(dc, state, sec_pipe->stream); +#endif } return true; -- GitLab From 9f30bf9917612b3a85cc28dc8ef98667ad5c07f8 Mon Sep 17 00:00:00 2001 From: Alex Deucher <alexander.deucher@amd.com> Date: Tue, 11 Oct 2022 09:27:29 -0400 Subject: [PATCH 1731/2223] drm/amd/display: make dcn32_split_stream_for_mpc_or_odm static It's not used outside of dcn32_fpu.c. 
Fixes: 20dad3813b3c15 ("drm/amd/display: Add a helper to map ODM/MPC/Multi-Plane resources") Reviewed-by: Harry Wentland <harry.wentland@amd.com> Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c index 2a3f5a485b2be..819de0f110126 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c @@ -1372,7 +1372,7 @@ static struct pipe_ctx *dcn32_find_split_pipe( return pipe; } -bool dcn32_split_stream_for_mpc_or_odm( +static bool dcn32_split_stream_for_mpc_or_odm( const struct dc *dc, struct resource_context *res_ctx, struct pipe_ctx *pri_pipe, -- GitLab From e1e6889fc7b3e5152218db7d9f03c2f81569d54c Mon Sep 17 00:00:00 2001 From: Alex Deucher <alexander.deucher@amd.com> Date: Thu, 6 Oct 2022 11:31:25 -0400 Subject: [PATCH 1732/2223] drm/amd/display: fix indentation in dc.c Fixes a warning in dc.c. 
Reviewed-by: Harry Wentland <harry.wentland@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/core/dc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 660316a536f72..997ab031f816d 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -3526,9 +3526,9 @@ static void commit_planes_for_stream(struct dc *dc, if (update_type != UPDATE_TYPE_FAST) dc->hwss.post_unlock_program_front_end(dc, context); - if (update_type != UPDATE_TYPE_FAST) - if (dc->hwss.commit_subvp_config) - dc->hwss.commit_subvp_config(dc, context); + if (update_type != UPDATE_TYPE_FAST) + if (dc->hwss.commit_subvp_config) + dc->hwss.commit_subvp_config(dc, context); if (update_type != UPDATE_TYPE_FAST) if (dc->hwss.commit_subvp_config) -- GitLab From b1d1666276cce28743e2cf90be07182ceac14f1e Mon Sep 17 00:00:00 2001 From: Alex Deucher <alexander.deucher@amd.com> Date: Mon, 10 Oct 2022 10:37:43 -0400 Subject: [PATCH 1733/2223] drm/amd/display: make virtual_disable_link_output static It's not used outside of virtual_link_hwss.c. Fixes a -Wmissing-prototypes warning. 
Reviewed-by: Harry Wentland <harry.wentland@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/virtual/virtual_link_hwss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/virtual/virtual_link_hwss.c b/drivers/gpu/drm/amd/display/dc/virtual/virtual_link_hwss.c index 9522fe0b36c98..4f7f99156897b 100644 --- a/drivers/gpu/drm/amd/display/dc/virtual/virtual_link_hwss.c +++ b/drivers/gpu/drm/amd/display/dc/virtual/virtual_link_hwss.c @@ -37,7 +37,7 @@ void virtual_reset_stream_encoder(struct pipe_ctx *pipe_ctx) { } -void virtual_disable_link_output(struct dc_link *link, +static void virtual_disable_link_output(struct dc_link *link, const struct link_resource *link_res, enum signal_type signal) { -- GitLab From f00844daa5212aac609d9cb97ce5e0a74c67890a Mon Sep 17 00:00:00 2001 From: Alex Deucher <alexander.deucher@amd.com> Date: Mon, 10 Oct 2022 17:30:08 -0400 Subject: [PATCH 1734/2223] drm/amd/display: add a license to cursor_reg_cache.h It's MIT. Fixes: b73353f7f3d434 ("drm/amd/display: Use the same cursor info across features") Reviewed-by: Harry Wentland <harry.wentland@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/inc/hw/cursor_reg_cache.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/cursor_reg_cache.h b/drivers/gpu/drm/amd/display/dc/inc/hw/cursor_reg_cache.h index 0e7c5880e867a..45645f9fd86c4 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/cursor_reg_cache.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/cursor_reg_cache.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: MIT */ /* Copyright © 2022 Advanced Micro Devices, Inc. All rights reserved. 
*/ #ifndef __DAL_CURSOR_CACHE_H__ -- GitLab From a895014853ea6df2778533e2a0bb7a0d53f02ec2 Mon Sep 17 00:00:00 2001 From: Alex Deucher <alexander.deucher@amd.com> Date: Mon, 19 Sep 2022 12:36:29 -0400 Subject: [PATCH 1735/2223] drm/amd/display: fix transfer function passed to build_coefficients() The default argument should be enum TRANSFER_FUNCTION_SRGB rather than the current boolean value which improperly maps to TRANSFER_FUNCTION_BT709. Commit 9b3d76527f6e ("drm/amd/display: Revert adding degamma coefficients") looks to have improperly reverted commit d02097095916 ("drm/amd/display: Add regamma/degamma coefficients and set sRGB when TF is BT709") replacing the enum value with a boolean value. Cc: Krunoslav Kovac <Krunoslav.Kovac@amd.com> Cc: Jaehyun Chung <jaehyun.chung@amd.com> Cc: Zeng Heng <zengheng4@huawei.com> Fixes: 9b3d76527f6e ("drm/amd/display: Revert adding degamma coefficients") Reviewed-by: Harry Wentland <harry.wentland@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/modules/color/color_gamma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/modules/color/color_gamma.c b/drivers/gpu/drm/amd/display/modules/color/color_gamma.c index 04f7656906ca0..447a0ec9cbe21 100644 --- a/drivers/gpu/drm/amd/display/modules/color/color_gamma.c +++ b/drivers/gpu/drm/amd/display/modules/color/color_gamma.c @@ -1692,7 +1692,7 @@ static void apply_degamma_for_user_regamma(struct pwl_float_data_ex *rgb_regamma struct pwl_float_data_ex *rgb = rgb_regamma; const struct hw_x_point *coord_x = coordinates_x; - build_coefficients(&coeff, true); + build_coefficients(&coeff, TRANSFER_FUNCTION_SRGB); i = 0; while (i != hw_points_num + 1) { -- GitLab From 1f768ba469002d2dcad5c3d667151977417df7d9 Mon Sep 17 00:00:00 2001 From: Yang Li <yang.lee@linux.alibaba.com> Date: Mon, 10 Oct 2022 15:38:03 +0800 Subject: [PATCH 1736/2223] drm/amd/display: Simplify bool conversion The result of 
'pwr_status == 0' is Boolean, and the question mark expression is redundant. Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2354 Reported-by: Abaci Robot <abaci@linux.alibaba.com> Reviewed-by: Harry Wentland <harry.wentland@amd.com> Signed-off-by: Yang Li <yang.lee@linux.alibaba.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c index 426b07edb4267..cf5bd9713f54f 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c @@ -1400,7 +1400,7 @@ bool dcn32_dsc_pg_status( break; } - return pwr_status == 0 ? true : false; + return pwr_status == 0; } void dcn32_update_dsc_pg(struct dc *dc, -- GitLab From 695ddc9318ad45b6a32f902b7c6998c65d575f26 Mon Sep 17 00:00:00 2001 From: Matthew Auld <matthew.auld@intel.com> Date: Tue, 4 Oct 2022 14:19:14 +0100 Subject: [PATCH 1737/2223] drm/i915: allow control over the flags when migrating MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the next patch we want to move the object (if the current resource is not compatible), to the mappable part of lmem for some display buffers. Currently that requires being able to unset the I915_BO_ALLOC_GPU_ONLY hint. 
Signed-off-by: Matthew Auld <matthew.auld@intel.com> Cc: Jianshui Yu <jianshui.yu@intel.com> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> Cc: Nirmoy Das <nirmoy.das@intel.com> Reviewed-by: Nirmoy Das <nirmoy.das@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20221004131916.233474-3-matthew.auld@intel.com (cherry picked from commit 999f4562077208b683f0519e5f1aa1e5c2fd2191) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> --- drivers/gpu/drm/i915/gem/i915_gem_object.c | 37 ++++++++++++++++++- drivers/gpu/drm/i915/gem/i915_gem_object.h | 4 ++ .../gpu/drm/i915/gem/i915_gem_object_types.h | 3 +- drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 5 ++- 4 files changed, 45 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c index 7ff9c7877becf..369006c5317f2 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c @@ -652,6 +652,41 @@ bool i915_gem_object_can_migrate(struct drm_i915_gem_object *obj, int i915_gem_object_migrate(struct drm_i915_gem_object *obj, struct i915_gem_ww_ctx *ww, enum intel_region_id id) +{ + return __i915_gem_object_migrate(obj, ww, id, obj->flags); +} + +/** + * __i915_gem_object_migrate - Migrate an object to the desired region id, with + * control of the extra flags + * @obj: The object to migrate. + * @ww: An optional struct i915_gem_ww_ctx. If NULL, the backend may + * not be successful in evicting other objects to make room for this object. + * @id: The region id to migrate to. + * @flags: The object flags. Normally just obj->flags. + * + * Attempt to migrate the object to the desired memory region. The + * object backend must support migration and the object may not be + * pinned, (explicitly pinned pages or pinned vmas). The object must + * be locked. 
+ * On successful completion, the object will have pages pointing to + * memory in the new region, but an async migration task may not have + * completed yet, and to accomplish that, i915_gem_object_wait_migration() + * must be called. + * + * Note: the @ww parameter is not used yet, but included to make sure + * callers put some effort into obtaining a valid ww ctx if one is + * available. + * + * Return: 0 on success. Negative error code on failure. In particular may + * return -ENXIO on lack of region space, -EDEADLK for deadlock avoidance + * if @ww is set, -EINTR or -ERESTARTSYS if signal pending, and + * -EBUSY if the object is pinned. + */ +int __i915_gem_object_migrate(struct drm_i915_gem_object *obj, + struct i915_gem_ww_ctx *ww, + enum intel_region_id id, + unsigned int flags) { struct drm_i915_private *i915 = to_i915(obj->base.dev); struct intel_memory_region *mr; @@ -672,7 +707,7 @@ int i915_gem_object_migrate(struct drm_i915_gem_object *obj, return 0; } - return obj->ops->migrate(obj, mr); + return obj->ops->migrate(obj, mr, flags); } /** diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h b/drivers/gpu/drm/i915/gem/i915_gem_object.h index 7317d4102955f..1723af9b0f6a2 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h @@ -608,6 +608,10 @@ bool i915_gem_object_migratable(struct drm_i915_gem_object *obj); int i915_gem_object_migrate(struct drm_i915_gem_object *obj, struct i915_gem_ww_ctx *ww, enum intel_region_id id); +int __i915_gem_object_migrate(struct drm_i915_gem_object *obj, + struct i915_gem_ww_ctx *ww, + enum intel_region_id id, + unsigned int flags); bool i915_gem_object_can_migrate(struct drm_i915_gem_object *obj, enum intel_region_id id); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index 40305e2bcd49b..d0d6772e6f36a 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ 
b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -107,7 +107,8 @@ struct drm_i915_gem_object_ops { * pinning or for as long as the object lock is held. */ int (*migrate)(struct drm_i915_gem_object *obj, - struct intel_memory_region *mr); + struct intel_memory_region *mr, + unsigned int flags); void (*release)(struct drm_i915_gem_object *obj); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c index e3fc38dd5db04..4f861782c3e85 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c @@ -848,9 +848,10 @@ static int __i915_ttm_migrate(struct drm_i915_gem_object *obj, } static int i915_ttm_migrate(struct drm_i915_gem_object *obj, - struct intel_memory_region *mr) + struct intel_memory_region *mr, + unsigned int flags) { - return __i915_ttm_migrate(obj, mr, obj->flags); + return __i915_ttm_migrate(obj, mr, flags); } static void i915_ttm_put_pages(struct drm_i915_gem_object *obj, -- GitLab From ea19684afb545605bbcb690c49a91ce2c8e596dd Mon Sep 17 00:00:00 2001 From: Matthew Auld <matthew.auld@intel.com> Date: Tue, 4 Oct 2022 14:19:15 +0100 Subject: [PATCH 1738/2223] drm/i915/display: consider DG2_RC_CCS_CC when migrating buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For these types of display buffers, we need to be able to CPU access some part of the backing memory in prepare_plane_clear_colors(). As a result we need to ensure we always place in the mappable part of lmem, which becomes necessary on small-bar systems. v2(Nirmoy & Ville): - Add some commentary for why we need to CPU access the buffer. - Split out the other changes, so we just consider the display change here. v3: - Handle this in the dpt path. v4(Ville): - Drop the intel_fb_rc_ccs_cc_plane() sanity check in pin_and_fence_fb_obj(), since we can also trigger this on DG1 it seems.
Fixes: eb1c535f0d69 ("drm/i915: turn on small BAR support") Reported-by: Jianshui Yu <jianshui.yu@intel.com> Signed-off-by: Matthew Auld <matthew.auld@intel.com> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> Cc: Nirmoy Das <nirmoy.das@intel.com> Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com> Acked-by: Nirmoy Das <nirmoy.das@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20221004131916.233474-4-matthew.auld@intel.com (cherry picked from commit e3afc690188be8e4385d13d1b0e7f0ba01caea40) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> --- drivers/gpu/drm/i915/display/intel_fb_pin.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_fb_pin.c b/drivers/gpu/drm/i915/display/intel_fb_pin.c index 733972fab07f5..1dddd6abd77b5 100644 --- a/drivers/gpu/drm/i915/display/intel_fb_pin.c +++ b/drivers/gpu/drm/i915/display/intel_fb_pin.c @@ -50,7 +50,18 @@ intel_pin_fb_obj_dpt(struct drm_framebuffer *fb, continue; if (HAS_LMEM(dev_priv)) { - ret = i915_gem_object_migrate(obj, &ww, INTEL_REGION_LMEM_0); + unsigned int flags = obj->flags; + + /* + * For this type of buffer we need to able to read from the CPU + * the clear color value found in the buffer, hence we need to + * ensure it is always in the mappable part of lmem, if this is + * a small-bar device. + */ + if (intel_fb_rc_ccs_cc_plane(fb) >= 0) + flags &= ~I915_BO_ALLOC_GPU_ONLY; + ret = __i915_gem_object_migrate(obj, &ww, INTEL_REGION_LMEM_0, + flags); if (ret) continue; } -- GitLab From a6d1ce5951185ee91bbe6909fe2758f3625561b0 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed <yosryahmed@google.com> Date: Tue, 11 Oct 2022 00:33:58 +0000 Subject: [PATCH 1739/2223] cgroup: add cgroup_v1v2_get_from_[fd/file]() Add cgroup_v1v2_get_from_fd() and cgroup_v1v2_get_from_file() that support both cgroup1 and cgroup2. 
Signed-off-by: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 50 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 398f0bce7c214..a88de5bdeaa9f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -106,6 +106,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup *cgroup_get_from_path(const char *path); struct cgroup *cgroup_get_from_fd(int fd); +struct cgroup *cgroup_v1v2_get_from_fd(int fd); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6d8a5a40c24d8..6349a9fe9ec15 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6224,16 +6224,36 @@ void cgroup_fork(struct task_struct *child) INIT_LIST_HEAD(&child->cg_list); } -static struct cgroup *cgroup_get_from_file(struct file *f) +/** + * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer + * @f: file corresponding to cgroup_dir + * + * Find the cgroup from a file pointer associated with a cgroup directory. + * Returns a pointer to the cgroup on success. ERR_PTR is returned if the + * cgroup cannot be found. + */ +static struct cgroup *cgroup_v1v2_get_from_file(struct file *f) { struct cgroup_subsys_state *css; - struct cgroup *cgrp; css = css_tryget_online_from_dir(f->f_path.dentry, NULL); if (IS_ERR(css)) return ERR_CAST(css); - cgrp = css->cgroup; + return css->cgroup; +} + +/** + * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports + * cgroup2. 
+ */ +static struct cgroup *cgroup_get_from_file(struct file *f) +{ + struct cgroup *cgrp = cgroup_v1v2_get_from_file(f); + + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); + if (!cgroup_on_dfl(cgrp)) { cgroup_put(cgrp); return ERR_PTR(-EBADF); @@ -6734,14 +6754,14 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path); /** * cgroup_get_from_fd - get a cgroup pointer from a fd - * @fd: fd obtained by open(cgroup2_dir) + * @fd: fd obtained by open(cgroup_dir) * * Find the cgroup from a fd which should be obtained * by opening a cgroup directory. Returns a pointer to the * cgroup on success. ERR_PTR is returned if the cgroup * cannot be found. */ -struct cgroup *cgroup_get_from_fd(int fd) +struct cgroup *cgroup_v1v2_get_from_fd(int fd) { struct cgroup *cgrp; struct file *f; @@ -6750,10 +6770,28 @@ struct cgroup *cgroup_get_from_fd(int fd) if (!f) return ERR_PTR(-EBADF); - cgrp = cgroup_get_from_file(f); + cgrp = cgroup_v1v2_get_from_file(f); fput(f); return cgrp; } + +/** + * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports + * cgroup2. + */ +struct cgroup *cgroup_get_from_fd(int fd) +{ + struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd); + + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); + + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + return cgrp; +} EXPORT_SYMBOL_GPL(cgroup_get_from_fd); static u64 power_of_ten(int power) -- GitLab From 35256d673a9cf723d9e2edb5d51e1b1b6b197ba3 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed <yosryahmed@google.com> Date: Tue, 11 Oct 2022 00:33:59 +0000 Subject: [PATCH 1740/2223] bpf: cgroup_iter: support cgroup1 using cgroup fd Use cgroup_v1v2_get_from_fd() in cgroup_iter to support attaching to both cgroup v1 and v2 using fds. 
Signed-off-by: Yosry Ahmed <yosryahmed@google.com> Acked-by: Martin KaFai Lau <martin.lau@kernel.org> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/bpf/cgroup_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index 0d200a993489c..9fcf09f2ef00f 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -196,7 +196,7 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog, return -EINVAL; if (fd) - cgrp = cgroup_get_from_fd(fd); + cgrp = cgroup_v1v2_get_from_fd(fd); else if (id) cgrp = cgroup_get_from_id(id); else /* walk the entire hierarchy by default. */ -- GitLab From 8248fe413216732f98563e8882b6c6ae617c327b Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Mon, 10 Oct 2022 22:28:08 -0700 Subject: [PATCH 1741/2223] perf stat: Support old kernels for bperf cgroup counting The recent change in the cgroup will break the backward compatibility in the BPF program. It should support both old and new kernels using BPF CO-RE technique. Like the task_struct->__state handling in the offcpu analysis, we can check the field name in the cgroup struct.
Acked-by: Jiri Olsa <jolsa@kernel.org> Acked-by: Andrii Nakryiko <andrii@kernel.org> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Tejun Heo <tj@kernel.org> --- tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 29 ++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c index 435a875566881..6a438e0102c5a 100644 --- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c @@ -43,6 +43,18 @@ struct { __uint(value_size, sizeof(struct bpf_perf_event_value)); } cgrp_readings SEC(".maps"); +/* new kernel cgroup definition */ +struct cgroup___new { + int level; + struct cgroup *ancestors[]; +} __attribute__((preserve_access_index)); + +/* old kernel cgroup definition */ +struct cgroup___old { + int level; + u64 ancestor_ids[]; +} __attribute__((preserve_access_index)); + const volatile __u32 num_events = 1; const volatile __u32 num_cpus = 1; @@ -50,6 +62,21 @@ int enabled = 0; int use_cgroup_v2 = 0; int perf_subsys_id = -1; +static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level) +{ + /* recast pointer to capture new type for compiler */ + struct cgroup___new *cgrp_new = (void *)cgrp; + + if (bpf_core_field_exists(cgrp_new->ancestors)) { + return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id); + } else { + /* recast pointer to capture old type for compiler */ + struct cgroup___old *cgrp_old = (void *)cgrp; + + return BPF_CORE_READ(cgrp_old, ancestor_ids[level]); + } +} + static inline int get_cgroup_v1_idx(__u32 *cgrps, int size) { struct task_struct *p = (void *)bpf_get_current_task(); @@ -77,7 +104,7 @@ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size) break; // convert cgroup-id to a map index - cgrp_id = BPF_CORE_READ(cgrp, ancestors[i], kn, id); + cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i); elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id); if (!elem) continue; -- 
GitLab From 10f43fda55404cd6b5e1d422d9ecb570ce49d5de Mon Sep 17 00:00:00 2001 From: Wu XiangCheng <bobwxc@email.cn> Date: Tue, 11 Oct 2022 14:00:45 +0800 Subject: [PATCH 1742/2223] docs/zh_CN: promote the title of zh_CN/process/index.rst update to commit 9d0f5cd16744 ("docs: promote the title of process/index.rst") Signed-off-by: Wu XiangCheng <bobwxc@email.cn> Reviewed-by: Yanteng Si <siyanteng@loongson.cn> Link: https://lore.kernel.org/r/2741340f3b5f131a32d0f295224edd569aab0d98.1665467392.git.bobwxc@email.cn Signed-off-by: Jonathan Corbet <corbet@lwn.net> --- Documentation/translations/zh_CN/process/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/translations/zh_CN/process/index.rst b/Documentation/translations/zh_CN/process/index.rst index a683dbea0c83a..a1a35f88f4ae0 100644 --- a/Documentation/translations/zh_CN/process/index.rst +++ b/Documentation/translations/zh_CN/process/index.rst @@ -10,6 +10,7 @@ .. _cn_process_index: +======================== 与Linux 内核社区一起工作 ======================== -- GitLab From da2e928b2ddbcca793207ca670c6213f925a95f5 Mon Sep 17 00:00:00 2001 From: Wu XiangCheng <bobwxc@email.cn> Date: Tue, 11 Oct 2022 14:01:28 +0800 Subject: [PATCH 1743/2223] docs/zh_CN: add zh_CN/arch.rst Add an entry for all zh arch documents. Signed-off-by: Wu XiangCheng <bobwxc@email.cn> Reviewed-by: Yanteng Si <siyanteng@loongson.cn> Link: https://lore.kernel.org/r/4e9675ac83a06f2597d069f44a94c4e2cbd7ab2b.1665467392.git.bobwxc@email.cn Signed-off-by: Jonathan Corbet <corbet@lwn.net> --- Documentation/translations/zh_CN/arch.rst | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 Documentation/translations/zh_CN/arch.rst diff --git a/Documentation/translations/zh_CN/arch.rst b/Documentation/translations/zh_CN/arch.rst new file mode 100644 index 0000000000000..690e173d8b2a8 --- /dev/null +++ b/Documentation/translations/zh_CN/arch.rst @@ -0,0 +1,29 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +处理器体系结构 +============== + +以下文档提供了具体架构实现的编程细节。 + +.. toctree:: + :maxdepth: 2 + + mips/index + arm64/index + riscv/index + openrisc/index + parisc/index + loongarch/index + +TODOList: + +* arm/index +* ia64/index +* m68k/index +* nios2/index +* powerpc/index +* s390/index +* sh/index +* sparc/index +* x86/index +* xtensa/index -- GitLab From eef24f7054a63058d6400a4386ee8bb2164fec44 Mon Sep 17 00:00:00 2001 From: Wu XiangCheng <bobwxc@email.cn> Date: Tue, 11 Oct 2022 14:02:23 +0800 Subject: [PATCH 1744/2223] docs/zh_CN: Rewrite the Chinese translation front page update to commit 0c7b4366f1ab ("docs: Rewrite the front page") Signed-off-by: Wu XiangCheng <bobwxc@email.cn> Reviewed-by: Yanteng Si <siyanteng@loongson.cn> Link: https://lore.kernel.org/r/440d7cb3c9f1526ed7c2996aa88ba2bc7fdc018c.1665467392.git.bobwxc@email.cn Signed-off-by: Jonathan Corbet <corbet@lwn.net> --- Documentation/translations/zh_CN/index.rst | 169 +++++++-------------- 1 file changed, 51 insertions(+), 118 deletions(-) diff --git a/Documentation/translations/zh_CN/index.rst b/Documentation/translations/zh_CN/index.rst index 2fc60e60feb40..7be728bed46d7 100644 --- a/Documentation/translations/zh_CN/index.rst +++ b/Documentation/translations/zh_CN/index.rst @@ -26,165 +26,98 @@ 顺便说下,中文文档也需要遵守内核编码风格,风格中中文和英文的主要不同就是中文 的字符标点占用两个英文字符宽度, 所以,当英文要求不要超过每行100个字符时, 中文就不要超过50个字符。另外,也要注意'-','=' 等符号与相关标题的对齐。在将 -补丁提交到社区之前,一定要进行必要的checkpatch.pl检查和编译测试。 +补丁提交到社区之前,一定要进行必要的 ``checkpatch.pl`` 检查和编译测试。 -许可证文档 ----------- - -下面的文档介绍了Linux内核源代码的许可证(GPLv2)、如何在源代码树中正确标记 -单个文件的许可证、以及指向完整许可证文本的链接。 +与Linux 内核社区一起工作 +------------------------ -* Documentation/translations/zh_CN/process/license-rules.rst - -用户文档 --------- - -下面的手册是为内核用户编写的——即那些试图让它在给定系统上以最佳方式工作的 -用户。 +与内核开发社区进行协作并将工作推向上游的基本指南。 .. 
toctree:: - :maxdepth: 2 + :maxdepth: 1 - admin-guide/index - -TODOList: - -* kbuild/index + process/development-process + process/submitting-patches + 行为准则 <process/code-of-conduct> + maintainer/index + 完整开发流程文档 <process/index> -固件相关文档 ------------- +内部API文档 +----------- -下列文档描述了内核需要的平台固件相关信息。 +开发人员使用的内核内部交互接口手册。 .. toctree:: - :maxdepth: 2 + :maxdepth: 1 - devicetree/index + core-api/index + driver-api/index + 内核中的锁 <locking/index> TODOList: -* firmware-guide/index - -应用程序开发人员文档 --------------------- - -用户空间API手册涵盖了描述应用程序开发人员可见内核接口方面的文档。 - -TODOlist: - -* userspace-api/index +* subsystem-apis -内核开发简介 ------------- +开发工具和流程 +-------------- -这些手册包含有关如何开发内核的整体信息。内核社区非常庞大,一年下来有数千名 -开发人员做出贡献。与任何大型社区一样,知道如何完成任务将使得更改合并的过程 -变得更加容易。 +为所有内核开发人员提供有用信息的各种其他手册。 .. toctree:: - :maxdepth: 2 + :maxdepth: 1 - process/index - dev-tools/index + process/license-rules doc-guide/index + dev-tools/index + dev-tools/testing-overview kernel-hacking/index - maintainer/index TODOList: * trace/index * fault-injection/index * livepatch/index -* rust/index -内核API文档 ------------ +面向用户的文档 +-------------- -以下手册从内核开发人员的角度详细介绍了特定的内核子系统是如何工作的。这里的 -大部分信息都是直接从内核源代码获取的,并根据需要添加补充材料(或者至少是在 -我们设法添加的时候——可能不是所有的都是有需要的)。 +下列手册针对 +希望内核在给定系统上以最佳方式工作的*用户*, +和查找内核用户空间API信息的程序开发人员。 .. 
toctree:: - :maxdepth: 2 + :maxdepth: 1 - core-api/index - driver-api/index - locking/index - accounting/index - cpu-freq/index - iio/index - infiniband/index - power/index - virt/index - sound/index - filesystems/index - scheduler/index - mm/index - peci/index - PCI/index + admin-guide/index + admin-guide/reporting-issues.rst TODOList: -* block/index -* cdrom/index -* ide/index -* fb/index -* fpga/index -* hid/index -* i2c/index -* isdn/index -* leds/index -* netlabel/index -* networking/index -* pcmcia/index -* target/index -* timers/index -* spi/index -* w1/index -* watchdog/index -* input/index -* hwmon/index -* gpu/index -* security/index -* crypto/index -* bpf/index -* usb/index -* scsi/index -* misc-devices/index -* mhi/index - -体系结构无关文档 ----------------- - -TODOList: +* 内核构建系统 <kbuild/index> +* 用户空间工具 <tools/index> +* userspace-api/index -* asm-annotations +固件相关文档 +------------ -特定体系结构文档 ----------------- +下列文档描述了内核需要的平台固件相关信息。 .. toctree:: :maxdepth: 2 - mips/index - arm64/index - riscv/index - openrisc/index - parisc/index - loongarch/index + devicetree/index TODOList: -* arm/index -* ia64/index -* m68k/index -* nios2/index -* powerpc/index -* s390/index -* sh/index -* sparc/index -* x86/index -* xtensa/index +* firmware-guide/index + +体系结构文档 +------------ + +.. 
toctree:: + :maxdepth: 2 + + arch 其他文档 -------- @@ -195,9 +128,9 @@ TODOList: TODOList: * staging/index -* watch_queue -目录和表格 + +索引和表格 ---------- * :ref:`genindex` -- GitLab From ff2be4420863cc93bd4e0bea333ac9cb090a3415 Mon Sep 17 00:00:00 2001 From: Wu XiangCheng <bobwxc@email.cn> Date: Tue, 11 Oct 2022 14:03:00 +0800 Subject: [PATCH 1745/2223] docs/zh_CN: add a man-pages link to zh_CN/index.rst update to commit 489876063fb1 ("docs: add a man-pages link to the front page") Signed-off-by: Wu XiangCheng <bobwxc@email.cn> Reviewed-by: Yanteng Si <siyanteng@loongson.cn> Link: https://lore.kernel.org/r/6e289528ed1b40c1fcc03ea5e854e7c8ba264e67.1665467392.git.bobwxc@email.cn Signed-off-by: Jonathan Corbet <corbet@lwn.net> --- Documentation/translations/zh_CN/index.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/translations/zh_CN/index.rst b/Documentation/translations/zh_CN/index.rst index 7be728bed46d7..ec99ef5fe9903 100644 --- a/Documentation/translations/zh_CN/index.rst +++ b/Documentation/translations/zh_CN/index.rst @@ -97,6 +97,8 @@ TODOList: * 用户空间工具 <tools/index> * userspace-api/index +也可参考独立于内核文档的 `Linux 手册页 <https://www.kernel.org/doc/man-pages/>`_ 。 + 固件相关文档 ------------ -- GitLab From ae5b6779fa8724628bbad58126a626d0cd599414 Mon Sep 17 00:00:00 2001 From: Joel Stanley <joel@jms.id.au> Date: Tue, 11 Oct 2022 14:29:10 +1030 Subject: [PATCH 1746/2223] powerpc: Fix 85xx build The merge of the kbuild tree dropped the renaming of the FSL_BOOKE kconfig option. 
Fixes: 8afc66e8d43b ("Merge tag 'kbuild-v6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild") Signed-off-by: Joel Stanley <joel@jms.id.au> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/powerpc/kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 68ea30fb373e9..ee2d76cb31878 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -121,7 +121,7 @@ obj-$(CONFIG_PPC_BOOK3S_32) += head_book3s_32.o obj-$(CONFIG_40x) += head_40x.o obj-$(CONFIG_44x) += head_44x.o obj-$(CONFIG_PPC_8xx) += head_8xx.o -obj-$(CONFIG_FSL_BOOKE) += head_85xx.o +obj-$(CONFIG_PPC_85xx) += head_85xx.o extra-y += vmlinux.lds obj-$(CONFIG_RELOCATABLE) += reloc_$(BITS).o -- GitLab From c6cc4f7241d92cfdc36f9a13dfe318492f7eaa73 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn <lukas.bulwahn@gmail.com> Date: Tue, 4 Oct 2022 09:13:02 +0200 Subject: [PATCH 1747/2223] alpha: remove the needless aliases osf_{readv,writev} Commit 987f20a9dcce ("a.out: Remove the a.out implementation") removes CONFIG_OSF4_COMPAT and its functionality. Hence, sys_osf_{readv,writev} are now just aliases of sys_{readv,writev}. Remove these needless aliases. [ Identical patch also posted by Jason A. 
Donenfeld ] Link: https://lore.kernel.org/lkml/CAHk-=wjwvBc3VQMNtUVUrMBVoMPSPu26OuatZ_+1gZ2m-PmmRA@mail.gmail.com/ Link: https://lore.kernel.org/all/20221004135301.1420873-1-Jason@zx2c4.com/ Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/alpha/kernel/osf_sys.c | 12 ------------ arch/alpha/kernel/syscalls/syscall.tbl | 4 ++-- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 68ec314d3fac3..c54469b369cb6 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1278,18 +1278,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, return addr; } -SYSCALL_DEFINE3(osf_readv, unsigned long, fd, - const struct iovec __user *, vector, unsigned long, count) -{ - return sys_readv(fd, vector, count); -} - -SYSCALL_DEFINE3(osf_writev, unsigned long, fd, - const struct iovec __user *, vector, unsigned long, count) -{ - return sys_writev(fd, vector, count); -} - SYSCALL_DEFINE2(osf_getpriority, int, which, int, who) { int prio = sys_getpriority(which, who); diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 3515bc4f16a4f..8ebacf37a8cf4 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -125,8 +125,8 @@ 116 common osf_gettimeofday sys_osf_gettimeofday 117 common osf_getrusage sys_osf_getrusage 118 common getsockopt sys_getsockopt -120 common readv sys_osf_readv -121 common writev sys_osf_writev +120 common readv sys_readv +121 common writev sys_writev 122 common osf_settimeofday sys_osf_settimeofday 123 common fchown sys_fchown 124 common fchmod sys_fchmod -- GitLab From 71c8517e004b950a148581ea2d2abe10aa46e02d Mon Sep 17 00:00:00 2001 From: Conor Dooley <conor.dooley@microchip.com> Date: Mon, 10 Oct 2022 23:17:04 +0100 Subject: [PATCH 
1748/2223] MAINTAINERS: update polarfire soc clock binding The clock binding has been renamed and a new binding added for the clock controllers in the FPGA fabric. Generalise the pattern to cover both. Signed-off-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/20221010221704.2161221-2-conor@kernel.org/ Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 0dc4a769216be..94cf47ea5f806 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17645,7 +17645,7 @@ M: Conor Dooley <conor.dooley@microchip.com> M: Daire McNamara <daire.mcnamara@microchip.com> L: linux-riscv@lists.infradead.org S: Supported -F: Documentation/devicetree/bindings/clock/microchip,mpfs.yaml +F: Documentation/devicetree/bindings/clock/microchip,mpfs*.yaml F: Documentation/devicetree/bindings/gpio/microchip,mpfs-gpio.yaml F: Documentation/devicetree/bindings/i2c/microchip,corei2c.yaml F: Documentation/devicetree/bindings/mailbox/microchip,mpfs-mailbox.yaml -- GitLab From abbb388d335f8c400d1baecb15d360fa0062de77 Mon Sep 17 00:00:00 2001 From: Conor Dooley <conor.dooley@microchip.com> Date: Mon, 10 Oct 2022 23:17:05 +0100 Subject: [PATCH 1749/2223] dt-bindings: riscv: update microchip.yaml's maintainership Daire and I are the platform maintainers for Microchip's RISC-V FPGAs. Update the maintainers in microchip.yaml to reflect this and explicitly add the binding to the SoC's MAINTAINERS entry. 
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org> Signed-off-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/20221010221704.2161221-3-conor@kernel.org/ Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- Documentation/devicetree/bindings/riscv/microchip.yaml | 4 ++-- MAINTAINERS | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/riscv/microchip.yaml b/Documentation/devicetree/bindings/riscv/microchip.yaml index 1aa7336a9672f..9faf8447332bf 100644 --- a/Documentation/devicetree/bindings/riscv/microchip.yaml +++ b/Documentation/devicetree/bindings/riscv/microchip.yaml @@ -7,8 +7,8 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Microchip PolarFire SoC-based boards device tree bindings maintainers: - - Cyril Jean <Cyril.Jean@microchip.com> - - Lewis Hanly <lewis.hanly@microchip.com> + - Conor Dooley <conor.dooley@microchip.com> + - Daire McNamara <daire.mcnamara@microchip.com> description: Microchip PolarFire SoC-based boards diff --git a/MAINTAINERS b/MAINTAINERS index 94cf47ea5f806..b3415857a812c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17651,6 +17651,7 @@ F: Documentation/devicetree/bindings/i2c/microchip,corei2c.yaml F: Documentation/devicetree/bindings/mailbox/microchip,mpfs-mailbox.yaml F: Documentation/devicetree/bindings/net/can/microchip,mpfs-can.yaml F: Documentation/devicetree/bindings/pwm/microchip,corepwm.yaml +F: Documentation/devicetree/bindings/riscv/microchip.yaml F: Documentation/devicetree/bindings/soc/microchip/microchip,mpfs-sys-controller.yaml F: Documentation/devicetree/bindings/spi/microchip,mpfs-spi.yaml F: Documentation/devicetree/bindings/usb/microchip,mpfs-musb.yaml -- GitLab From 780614ce1988f9d8ab05a58b49d5506bca60b935 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg <lsahlber@redhat.com> Date: Wed, 12 Oct 2022 08:07:29 +1000 Subject: [PATCH 1750/2223] cifs: fix skipping to incorrect offset in emit_cached_dirents When 
application has done lseek() to a different offset on a directory fd we skipped one entry too many before we start emitting directory entries from the cache. We need to also make sure that when we are starting to emit directory entries from the cache, the ->pos sequence might have holes and skip some indices. Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> Reviewed-by: Tom Talpey <tom@talpey.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/readdir.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 8e060c00c9690..1bb4624e768bf 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -844,17 +844,34 @@ static bool emit_cached_dirents(struct cached_dirents *cde, struct dir_context *ctx) { struct cached_dirent *dirent; - int rc; + bool rc; list_for_each_entry(dirent, &cde->entries, entry) { - if (ctx->pos >= dirent->pos) + /* + * Skip all early entries prior to the current lseek() + * position. + */ + if (ctx->pos > dirent->pos) continue; + /* + * We recorded the current ->pos value for the dirent + * when we stored it in the cache. + * However, this sequence of ->pos values may have holes + * in it, for example dot-dirs returned from the server + * are suppressed. + * Handle this by forcing ctx->pos to be the same as the + * ->pos of the current dirent we emit from the cache. + * This means that when we emit these entries from the cache + * we now emit them with the same ->pos value as in the + * initial scan. 
+ */ ctx->pos = dirent->pos; rc = dir_emit(ctx, dirent->name, dirent->namelen, dirent->fattr.cf_uniqueid, dirent->fattr.cf_dtype); if (!rc) return rc; + ctx->pos++; } return true; } @@ -1202,10 +1219,10 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) ctx->pos, tmp_buf); cifs_save_resume_key(current_entry, cifsFile); break; - } else - current_entry = - nxt_dir_entry(current_entry, end_of_smb, - cifsFile->srch_inf.info_level); + } + current_entry = + nxt_dir_entry(current_entry, end_of_smb, + cifsFile->srch_inf.info_level); } kfree(tmp_buf); -- GitLab From 81895a65ec63ee1daec3255dc1a06675d2fbe915 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" <Jason@zx2c4.com> Date: Wed, 5 Oct 2022 16:43:38 +0200 Subject: [PATCH 1751/2223] treewide: use prandom_u32_max() when possible, part 1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather than incurring a division or requesting too many random bytes for the given range, use the prandom_u32_max() function, which only takes the minimum required bytes from the RNG and avoids divisions. This was done mechanically with this coccinelle script: @basic@ expression E; type T; identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; typedef u64; @@ ( - ((T)get_random_u32() % (E)) + prandom_u32_max(E) | - ((T)get_random_u32() & ((E) - 1)) + prandom_u32_max(E * XXX_MAKE_SURE_E_IS_POW2) | - ((u64)(E) * get_random_u32() >> 32) + prandom_u32_max(E) | - ((T)get_random_u32() & ~PAGE_MASK) + prandom_u32_max(PAGE_SIZE) ) @multi_line@ identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; identifier RAND; expression E; @@ - RAND = get_random_u32(); ... when != RAND - RAND %= (E); + RAND = prandom_u32_max(E); // Find a potential literal @literal_mask@ expression LITERAL; type T; identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; position p; @@ ((T)get_random_u32()@p & (LITERAL)) // Add one to the literal. 
@script:python add_one@ literal << literal_mask.LITERAL; RESULT; @@ value = None if literal.startswith('0x'): value = int(literal, 16) elif literal[0] in '123456789': value = int(literal, 10) if value is None: print("I don't know how to handle %s" % (literal)) cocci.include_match(False) elif value == 2**32 - 1 or value == 2**31 - 1 or value == 2**24 - 1 or value == 2**16 - 1 or value == 2**8 - 1: print("Skipping 0x%x for cleanup elsewhere" % (value)) cocci.include_match(False) elif value & (value + 1) != 0: print("Skipping 0x%x because it's not a power of two minus one" % (value)) cocci.include_match(False) elif literal.startswith('0x'): coccinelle.RESULT = cocci.make_expr("0x%x" % (value + 1)) else: coccinelle.RESULT = cocci.make_expr("%d" % (value + 1)) // Replace the literal mask with the calculated result. @plus_one@ expression literal_mask.LITERAL; position literal_mask.p; expression add_one.RESULT; identifier FUNC; @@ - (FUNC()@p & (LITERAL)) + prandom_u32_max(RESULT) @collapse_ret@ type T; identifier VAR; expression E; @@ { - T VAR; - VAR = (E); - return VAR; + return E; } @drop_var@ type T; identifier VAR; @@ { - T VAR; ... when != VAR } Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Yury Norov <yury.norov@gmail.com> Reviewed-by: KP Singh <kpsingh@kernel.org> Reviewed-by: Jan Kara <jack@suse.cz> # for ext4 and sbitmap Reviewed-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com> # for drbd Acked-by: Jakub Kicinski <kuba@kernel.org> Acked-by: Heiko Carstens <hca@linux.ibm.com> # for s390 Acked-by: Ulf Hansson <ulf.hansson@linaro.org> # for mmc Acked-by: Darrick J. Wong <djwong@kernel.org> # for xfs Signed-off-by: Jason A. 
Donenfeld <Jason@zx2c4.com> --- arch/arm/kernel/process.c | 2 +- arch/arm64/kernel/process.c | 2 +- arch/loongarch/kernel/process.c | 2 +- arch/loongarch/kernel/vdso.c | 2 +- arch/mips/kernel/process.c | 2 +- arch/mips/kernel/vdso.c | 2 +- arch/parisc/kernel/vdso.c | 2 +- arch/powerpc/kernel/process.c | 2 +- arch/s390/kernel/process.c | 2 +- arch/s390/kernel/vdso.c | 2 +- arch/sparc/vdso/vma.c | 2 +- arch/um/kernel/process.c | 2 +- arch/x86/entry/vdso/vma.c | 2 +- arch/x86/kernel/module.c | 2 +- arch/x86/kernel/process.c | 2 +- arch/x86/mm/pat/cpa-test.c | 4 +- crypto/testmgr.c | 86 +++++++++---------- drivers/block/drbd/drbd_receiver.c | 4 +- .../gpu/drm/i915/gem/i915_gem_execbuffer.c | 2 +- drivers/infiniband/core/cma.c | 2 +- drivers/infiniband/hw/cxgb4/id_table.c | 4 +- drivers/infiniband/hw/hns/hns_roce_ah.c | 5 +- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 3 +- drivers/md/bcache/request.c | 2 +- .../test-drivers/vivid/vivid-touch-cap.c | 2 +- drivers/mmc/core/core.c | 4 +- drivers/mmc/host/dw_mmc.c | 2 +- drivers/mtd/nand/raw/nandsim.c | 4 +- drivers/mtd/tests/mtd_nandecctest.c | 10 +-- drivers/mtd/tests/stresstest.c | 17 +--- drivers/mtd/ubi/debug.c | 2 +- drivers/mtd/ubi/debug.h | 6 +- drivers/net/ethernet/broadcom/cnic.c | 3 +- .../chelsio/inline_crypto/chtls/chtls_io.c | 4 +- drivers/net/hamradio/baycom_epp.c | 2 +- drivers/net/hamradio/hdlcdrv.c | 2 +- drivers/net/hamradio/yam.c | 2 +- drivers/net/phy/at803x.c | 2 +- .../broadcom/brcm80211/brcmfmac/p2p.c | 2 +- .../net/wireless/intel/iwlwifi/mvm/mac-ctxt.c | 2 +- drivers/scsi/fcoe/fcoe_ctlr.c | 4 +- drivers/scsi/qedi/qedi_main.c | 2 +- fs/ceph/inode.c | 2 +- fs/ceph/mdsmap.c | 2 +- fs/ext4/super.c | 7 +- fs/f2fs/gc.c | 2 +- fs/f2fs/segment.c | 8 +- fs/ubifs/debug.c | 8 +- fs/ubifs/lpt_commit.c | 14 +-- fs/ubifs/tnc_commit.c | 2 +- fs/xfs/libxfs/xfs_alloc.c | 2 +- fs/xfs/libxfs/xfs_ialloc.c | 2 +- fs/xfs/xfs_error.c | 2 +- include/linux/nodemask.h | 2 +- kernel/bpf/core.c | 4 +- 
kernel/locking/test-ww_mutex.c | 4 +- kernel/time/clocksource.c | 2 +- lib/fault-inject.c | 2 +- lib/find_bit_benchmark.c | 4 +- lib/kobject.c | 2 +- lib/reed_solomon/test_rslib.c | 6 +- lib/sbitmap.c | 2 +- lib/test-string_helpers.c | 2 +- lib/test_hexdump.c | 10 +-- lib/test_list_sort.c | 2 +- mm/kasan/kasan_test.c | 6 +- mm/slub.c | 2 +- net/802/garp.c | 2 +- net/802/mrp.c | 2 +- net/ceph/mon_client.c | 2 +- net/ceph/osd_client.c | 2 +- net/core/neighbour.c | 2 +- net/core/pktgen.c | 43 +++++----- net/core/stream.c | 2 +- net/ipv4/igmp.c | 6 +- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/inet_hashtables.c | 2 +- net/ipv6/addrconf.c | 8 +- net/ipv6/mcast.c | 10 +-- net/netfilter/ipvs/ip_vs_twos.c | 4 +- net/packet/af_packet.c | 2 +- net/sched/act_gact.c | 2 +- net/sched/act_sample.c | 2 +- net/sched/sch_netem.c | 4 +- net/sctp/socket.c | 2 +- net/sunrpc/cache.c | 2 +- net/sunrpc/xprtsock.c | 2 +- net/tipc/socket.c | 2 +- net/xfrm/xfrm_state.c | 2 +- 89 files changed, 204 insertions(+), 218 deletions(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 96f3fbd517642..129279b33b1d4 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -375,7 +375,7 @@ static unsigned long sigpage_addr(const struct mm_struct *mm, slots = ((last - first) >> PAGE_SHIFT) + 1; - offset = get_random_int() % slots; + offset = prandom_u32_max(slots); addr = first + (offset << PAGE_SHIFT); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 92bcc1768f0b9..87203429f8023 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -595,7 +595,7 @@ unsigned long __get_wchan(struct task_struct *p) unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() & ~PAGE_MASK; + sp -= prandom_u32_max(PAGE_SIZE); return sp & ~0xf; } diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c index 
660492f064e7e..1256e3582475f 100644 --- a/arch/loongarch/kernel/process.c +++ b/arch/loongarch/kernel/process.c @@ -293,7 +293,7 @@ unsigned long stack_top(void) unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() & ~PAGE_MASK; + sp -= prandom_u32_max(PAGE_SIZE); return sp & STACK_ALIGN; } diff --git a/arch/loongarch/kernel/vdso.c b/arch/loongarch/kernel/vdso.c index f32c38abd7915..8c9826062652e 100644 --- a/arch/loongarch/kernel/vdso.c +++ b/arch/loongarch/kernel/vdso.c @@ -78,7 +78,7 @@ static unsigned long vdso_base(void) unsigned long base = STACK_TOP; if (current->flags & PF_RANDOMIZE) { - base += get_random_int() & (VDSO_RANDOMIZE_SIZE - 1); + base += prandom_u32_max(VDSO_RANDOMIZE_SIZE); base = PAGE_ALIGN(base); } diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 35b912bce4297..bbe9ce471791e 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -711,7 +711,7 @@ unsigned long mips_stack_top(void) unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() & ~PAGE_MASK; + sp -= prandom_u32_max(PAGE_SIZE); return sp & ALMASK; } diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index b2cc2c2dd4bfc..5fd9bf1d596c6 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -79,7 +79,7 @@ static unsigned long vdso_base(void) } if (current->flags & PF_RANDOMIZE) { - base += get_random_int() & (VDSO_RANDOMIZE_SIZE - 1); + base += prandom_u32_max(VDSO_RANDOMIZE_SIZE); base = PAGE_ALIGN(base); } diff --git a/arch/parisc/kernel/vdso.c b/arch/parisc/kernel/vdso.c index 63dc44c4c246b..47e5960a2f961 100644 --- a/arch/parisc/kernel/vdso.c +++ b/arch/parisc/kernel/vdso.c @@ -75,7 +75,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, map_base = mm->mmap_base; if (current->flags & PF_RANDOMIZE) - map_base -= 
(get_random_int() & 0x1f) * PAGE_SIZE; + map_base -= prandom_u32_max(0x20) * PAGE_SIZE; vdso_text_start = get_unmapped_area(NULL, map_base, vdso_text_len, 0, 0); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 37df0428e4fbe..599391c235738 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2308,6 +2308,6 @@ void notrace __ppc64_runlatch_off(void) unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() & ~PAGE_MASK; + sp -= prandom_u32_max(PAGE_SIZE); return sp & ~0xf; } diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index d5119e039d855..5ec78555dd2e5 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -224,7 +224,7 @@ unsigned long __get_wchan(struct task_struct *p) unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() & ~PAGE_MASK; + sp -= prandom_u32_max(PAGE_SIZE); return sp & ~0xf; } diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 535099f2736da..3105ca5bd4701 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -227,7 +227,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned long len) end -= len; if (end > start) { - offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); + offset = prandom_u32_max(((end - start) >> PAGE_SHIFT) + 1); addr = start + (offset << PAGE_SHIFT); } else { addr = start; diff --git a/arch/sparc/vdso/vma.c b/arch/sparc/vdso/vma.c index cc19e09b0fa1e..ae9a86cb6f3d9 100644 --- a/arch/sparc/vdso/vma.c +++ b/arch/sparc/vdso/vma.c @@ -354,7 +354,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned int len) unsigned int offset; /* This loses some more bits than a modulo, but is cheaper */ - offset = get_random_int() & (PTRS_PER_PTE - 1); + offset = prandom_u32_max(PTRS_PER_PTE); 
return start + (offset << PAGE_SHIFT); } diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 80b90b1276a19..010bc422a09dd 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -356,7 +356,7 @@ int singlestepping(void * t) unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; + sp -= prandom_u32_max(8192); return sp & ~0xf; } #endif diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 6292b960037b7..311eae30e0894 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -327,7 +327,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) end -= len; if (end > start) { - offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); + offset = prandom_u32_max(((end - start) >> PAGE_SHIFT) + 1); addr = start + (offset << PAGE_SHIFT); } else { addr = start; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index b1abf663417cd..c032edcd3d95e 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -53,7 +53,7 @@ static unsigned long int get_module_load_offset(void) */ if (module_load_offset == 0) module_load_offset = - (get_random_int() % 1024 + 1) * PAGE_SIZE; + (prandom_u32_max(1024) + 1) * PAGE_SIZE; mutex_unlock(&module_kaslr_mutex); } return module_load_offset; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 58a6ea472db92..c21b7347a26dd 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -965,7 +965,7 @@ early_param("idle", idle_setup); unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; + sp -= prandom_u32_max(8192); return sp & ~0xf; } diff --git a/arch/x86/mm/pat/cpa-test.c b/arch/x86/mm/pat/cpa-test.c index 0612a73638a81..423b21e80929a 100644 --- a/arch/x86/mm/pat/cpa-test.c +++ 
b/arch/x86/mm/pat/cpa-test.c @@ -136,10 +136,10 @@ static int pageattr_test(void) failed += print_split(&sa); for (i = 0; i < NTEST; i++) { - unsigned long pfn = prandom_u32() % max_pfn_mapped; + unsigned long pfn = prandom_u32_max(max_pfn_mapped); addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT); - len[i] = prandom_u32() % NPAGES; + len[i] = prandom_u32_max(NPAGES); len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1); if (len[i] == 0) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index e4bb03b8b9245..bff4833dbe7c8 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -855,9 +855,9 @@ static int prepare_keybuf(const u8 *key, unsigned int ksize, /* Generate a random length in range [0, max_len], but prefer smaller values */ static unsigned int generate_random_length(unsigned int max_len) { - unsigned int len = prandom_u32() % (max_len + 1); + unsigned int len = prandom_u32_max(max_len + 1); - switch (prandom_u32() % 4) { + switch (prandom_u32_max(4)) { case 0: return len % 64; case 1: @@ -874,14 +874,14 @@ static void flip_random_bit(u8 *buf, size_t size) { size_t bitpos; - bitpos = prandom_u32() % (size * 8); + bitpos = prandom_u32_max(size * 8); buf[bitpos / 8] ^= 1 << (bitpos % 8); } /* Flip a random byte in the given nonempty data buffer */ static void flip_random_byte(u8 *buf, size_t size) { - buf[prandom_u32() % size] ^= 0xff; + buf[prandom_u32_max(size)] ^= 0xff; } /* Sometimes make some random changes to the given nonempty data buffer */ @@ -891,15 +891,15 @@ static void mutate_buffer(u8 *buf, size_t size) size_t i; /* Sometimes flip some bits */ - if (prandom_u32() % 4 == 0) { - num_flips = min_t(size_t, 1 << (prandom_u32() % 8), size * 8); + if (prandom_u32_max(4) == 0) { + num_flips = min_t(size_t, 1 << prandom_u32_max(8), size * 8); for (i = 0; i < num_flips; i++) flip_random_bit(buf, size); } /* Sometimes flip some bytes */ - if (prandom_u32() % 4 == 0) { - num_flips = min_t(size_t, 1 << (prandom_u32() % 8), size); + if 
(prandom_u32_max(4) == 0) { + num_flips = min_t(size_t, 1 << prandom_u32_max(8), size); for (i = 0; i < num_flips; i++) flip_random_byte(buf, size); } @@ -915,11 +915,11 @@ static void generate_random_bytes(u8 *buf, size_t count) if (count == 0) return; - switch (prandom_u32() % 8) { /* Choose a generation strategy */ + switch (prandom_u32_max(8)) { /* Choose a generation strategy */ case 0: case 1: /* All the same byte, plus optional mutations */ - switch (prandom_u32() % 4) { + switch (prandom_u32_max(4)) { case 0: b = 0x00; break; @@ -959,24 +959,24 @@ static char *generate_random_sgl_divisions(struct test_sg_division *divs, unsigned int this_len; const char *flushtype_str; - if (div == &divs[max_divs - 1] || prandom_u32() % 2 == 0) + if (div == &divs[max_divs - 1] || prandom_u32_max(2) == 0) this_len = remaining; else - this_len = 1 + (prandom_u32() % remaining); + this_len = 1 + prandom_u32_max(remaining); div->proportion_of_total = this_len; - if (prandom_u32() % 4 == 0) - div->offset = (PAGE_SIZE - 128) + (prandom_u32() % 128); - else if (prandom_u32() % 2 == 0) - div->offset = prandom_u32() % 32; + if (prandom_u32_max(4) == 0) + div->offset = (PAGE_SIZE - 128) + prandom_u32_max(128); + else if (prandom_u32_max(2) == 0) + div->offset = prandom_u32_max(32); else - div->offset = prandom_u32() % PAGE_SIZE; - if (prandom_u32() % 8 == 0) + div->offset = prandom_u32_max(PAGE_SIZE); + if (prandom_u32_max(8) == 0) div->offset_relative_to_alignmask = true; div->flush_type = FLUSH_TYPE_NONE; if (gen_flushes) { - switch (prandom_u32() % 4) { + switch (prandom_u32_max(4)) { case 0: div->flush_type = FLUSH_TYPE_REIMPORT; break; @@ -988,7 +988,7 @@ static char *generate_random_sgl_divisions(struct test_sg_division *divs, if (div->flush_type != FLUSH_TYPE_NONE && !(req_flags & CRYPTO_TFM_REQ_MAY_SLEEP) && - prandom_u32() % 2 == 0) + prandom_u32_max(2) == 0) div->nosimd = true; switch (div->flush_type) { @@ -1035,7 +1035,7 @@ static void 
generate_random_testvec_config(struct testvec_config *cfg, p += scnprintf(p, end - p, "random:"); - switch (prandom_u32() % 4) { + switch (prandom_u32_max(4)) { case 0: case 1: cfg->inplace_mode = OUT_OF_PLACE; @@ -1050,12 +1050,12 @@ static void generate_random_testvec_config(struct testvec_config *cfg, break; } - if (prandom_u32() % 2 == 0) { + if (prandom_u32_max(2) == 0) { cfg->req_flags |= CRYPTO_TFM_REQ_MAY_SLEEP; p += scnprintf(p, end - p, " may_sleep"); } - switch (prandom_u32() % 4) { + switch (prandom_u32_max(4)) { case 0: cfg->finalization_type = FINALIZATION_TYPE_FINAL; p += scnprintf(p, end - p, " use_final"); @@ -1071,7 +1071,7 @@ static void generate_random_testvec_config(struct testvec_config *cfg, } if (!(cfg->req_flags & CRYPTO_TFM_REQ_MAY_SLEEP) && - prandom_u32() % 2 == 0) { + prandom_u32_max(2) == 0) { cfg->nosimd = true; p += scnprintf(p, end - p, " nosimd"); } @@ -1084,7 +1084,7 @@ static void generate_random_testvec_config(struct testvec_config *cfg, cfg->req_flags); p += scnprintf(p, end - p, "]"); - if (cfg->inplace_mode == OUT_OF_PLACE && prandom_u32() % 2 == 0) { + if (cfg->inplace_mode == OUT_OF_PLACE && prandom_u32_max(2) == 0) { p += scnprintf(p, end - p, " dst_divs=["); p = generate_random_sgl_divisions(cfg->dst_divs, ARRAY_SIZE(cfg->dst_divs), @@ -1093,13 +1093,13 @@ static void generate_random_testvec_config(struct testvec_config *cfg, p += scnprintf(p, end - p, "]"); } - if (prandom_u32() % 2 == 0) { - cfg->iv_offset = 1 + (prandom_u32() % MAX_ALGAPI_ALIGNMASK); + if (prandom_u32_max(2) == 0) { + cfg->iv_offset = 1 + prandom_u32_max(MAX_ALGAPI_ALIGNMASK); p += scnprintf(p, end - p, " iv_offset=%u", cfg->iv_offset); } - if (prandom_u32() % 2 == 0) { - cfg->key_offset = 1 + (prandom_u32() % MAX_ALGAPI_ALIGNMASK); + if (prandom_u32_max(2) == 0) { + cfg->key_offset = 1 + prandom_u32_max(MAX_ALGAPI_ALIGNMASK); p += scnprintf(p, end - p, " key_offset=%u", cfg->key_offset); } @@ -1652,8 +1652,8 @@ static void 
generate_random_hash_testvec(struct shash_desc *desc, vec->ksize = 0; if (maxkeysize) { vec->ksize = maxkeysize; - if (prandom_u32() % 4 == 0) - vec->ksize = 1 + (prandom_u32() % maxkeysize); + if (prandom_u32_max(4) == 0) + vec->ksize = 1 + prandom_u32_max(maxkeysize); generate_random_bytes((u8 *)vec->key, vec->ksize); vec->setkey_error = crypto_shash_setkey(desc->tfm, vec->key, @@ -2218,13 +2218,13 @@ static void mutate_aead_message(struct aead_testvec *vec, bool aad_iv, const unsigned int aad_tail_size = aad_iv ? ivsize : 0; const unsigned int authsize = vec->clen - vec->plen; - if (prandom_u32() % 2 == 0 && vec->alen > aad_tail_size) { + if (prandom_u32_max(2) == 0 && vec->alen > aad_tail_size) { /* Mutate the AAD */ flip_random_bit((u8 *)vec->assoc, vec->alen - aad_tail_size); - if (prandom_u32() % 2 == 0) + if (prandom_u32_max(2) == 0) return; } - if (prandom_u32() % 2 == 0) { + if (prandom_u32_max(2) == 0) { /* Mutate auth tag (assuming it's at the end of ciphertext) */ flip_random_bit((u8 *)vec->ctext + vec->plen, authsize); } else { @@ -2249,7 +2249,7 @@ static void generate_aead_message(struct aead_request *req, const unsigned int ivsize = crypto_aead_ivsize(tfm); const unsigned int authsize = vec->clen - vec->plen; const bool inauthentic = (authsize >= MIN_COLLISION_FREE_AUTHSIZE) && - (prefer_inauthentic || prandom_u32() % 4 == 0); + (prefer_inauthentic || prandom_u32_max(4) == 0); /* Generate the AAD. */ generate_random_bytes((u8 *)vec->assoc, vec->alen); @@ -2257,7 +2257,7 @@ static void generate_aead_message(struct aead_request *req, /* Avoid implementation-defined behavior. */ memcpy((u8 *)vec->assoc + vec->alen - ivsize, vec->iv, ivsize); - if (inauthentic && prandom_u32() % 2 == 0) { + if (inauthentic && prandom_u32_max(2) == 0) { /* Generate a random ciphertext. 
*/ generate_random_bytes((u8 *)vec->ctext, vec->clen); } else { @@ -2321,8 +2321,8 @@ static void generate_random_aead_testvec(struct aead_request *req, /* Key: length in [0, maxkeysize], but usually choose maxkeysize */ vec->klen = maxkeysize; - if (prandom_u32() % 4 == 0) - vec->klen = prandom_u32() % (maxkeysize + 1); + if (prandom_u32_max(4) == 0) + vec->klen = prandom_u32_max(maxkeysize + 1); generate_random_bytes((u8 *)vec->key, vec->klen); vec->setkey_error = crypto_aead_setkey(tfm, vec->key, vec->klen); @@ -2331,8 +2331,8 @@ static void generate_random_aead_testvec(struct aead_request *req, /* Tag length: in [0, maxauthsize], but usually choose maxauthsize */ authsize = maxauthsize; - if (prandom_u32() % 4 == 0) - authsize = prandom_u32() % (maxauthsize + 1); + if (prandom_u32_max(4) == 0) + authsize = prandom_u32_max(maxauthsize + 1); if (prefer_inauthentic && authsize < MIN_COLLISION_FREE_AUTHSIZE) authsize = MIN_COLLISION_FREE_AUTHSIZE; if (WARN_ON(authsize > maxdatasize)) @@ -2342,7 +2342,7 @@ static void generate_random_aead_testvec(struct aead_request *req, /* AAD, plaintext, and ciphertext lengths */ total_len = generate_random_length(maxdatasize); - if (prandom_u32() % 4 == 0) + if (prandom_u32_max(4) == 0) vec->alen = 0; else vec->alen = generate_random_length(total_len); @@ -2958,8 +2958,8 @@ static void generate_random_cipher_testvec(struct skcipher_request *req, /* Key: length in [0, maxkeysize], but usually choose maxkeysize */ vec->klen = maxkeysize; - if (prandom_u32() % 4 == 0) - vec->klen = prandom_u32() % (maxkeysize + 1); + if (prandom_u32_max(4) == 0) + vec->klen = prandom_u32_max(maxkeysize + 1); generate_random_bytes((u8 *)vec->key, vec->klen); vec->setkey_error = crypto_skcipher_setkey(tfm, vec->key, vec->klen); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c897c45720365..ee69d50ba4fd3 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -781,7 +781,7 
@@ static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, timeo = connect_int * HZ; /* 28.5% random jitter */ - timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7; + timeo += prandom_u32_max(2) ? timeo / 7 : -timeo / 7; err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo); if (err <= 0) @@ -1004,7 +1004,7 @@ retry: drbd_warn(connection, "Error receiving initial packet\n"); sock_release(s); randomize: - if (prandom_u32() & 1) + if (prandom_u32_max(2)) goto retry; } } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index cd75b0ca2555f..845023c14eb36 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -2424,7 +2424,7 @@ gen8_dispatch_bsd_engine(struct drm_i915_private *dev_priv, /* Check whether the file_priv has already selected one ring. */ if ((int)file_priv->bsd_engine < 0) file_priv->bsd_engine = - get_random_int() % num_vcs_engines(dev_priv); + prandom_u32_max(num_vcs_engines(dev_priv)); return file_priv->bsd_engine; } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 70da57ef2eeb6..cc2222b85c881 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -3807,7 +3807,7 @@ static int cma_alloc_any_port(enum rdma_ucm_port_space ps, inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rover = prandom_u32() % remaining + low; + rover = prandom_u32_max(remaining) + low; retry: if (last_used_port != rover) { struct rdma_bind_list *bind_list; diff --git a/drivers/infiniband/hw/cxgb4/id_table.c b/drivers/infiniband/hw/cxgb4/id_table.c index f64e7e02b129f..280d614668556 100644 --- a/drivers/infiniband/hw/cxgb4/id_table.c +++ b/drivers/infiniband/hw/cxgb4/id_table.c @@ -54,7 +54,7 @@ u32 c4iw_id_alloc(struct c4iw_id_table *alloc) if (obj < alloc->max) { if (alloc->flags & C4IW_ID_TABLE_F_RANDOM) - alloc->last += prandom_u32() 
% RANDOM_SKIP; + alloc->last += prandom_u32_max(RANDOM_SKIP); else alloc->last = obj + 1; if (alloc->last >= alloc->max) @@ -85,7 +85,7 @@ int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num, alloc->start = start; alloc->flags = flags; if (flags & C4IW_ID_TABLE_F_RANDOM) - alloc->last = prandom_u32() % RANDOM_SKIP; + alloc->last = prandom_u32_max(RANDOM_SKIP); else alloc->last = 0; alloc->max = num; diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index 492b122d05219..480c062dd04f1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -41,9 +41,8 @@ static inline u16 get_ah_udp_sport(const struct rdma_ah_attr *ah_attr) u16 sport; if (!fl) - sport = get_random_u32() % - (IB_ROCE_UDP_ENCAP_VALID_PORT_MAX + 1 - - IB_ROCE_UDP_ENCAP_VALID_PORT_MIN) + + sport = prandom_u32_max(IB_ROCE_UDP_ENCAP_VALID_PORT_MAX + 1 - + IB_ROCE_UDP_ENCAP_VALID_PORT_MIN) + IB_ROCE_UDP_ENCAP_VALID_PORT_MIN; else sport = rdma_flow_label_to_udp_sport(fl); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 758e1d7ebc365..8546b8816524c 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -1517,8 +1517,7 @@ static void rtrs_clt_err_recovery_work(struct work_struct *work) rtrs_clt_stop_and_destroy_conns(clt_path); queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork, msecs_to_jiffies(delay_ms + - prandom_u32() % - RTRS_RECONNECT_SEED)); + prandom_u32_max(RTRS_RECONNECT_SEED))); } static struct rtrs_clt_path *alloc_path(struct rtrs_clt_sess *clt, diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index f2c5a7e06fa93..3427555b0ccae 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -401,7 +401,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) } if (bypass_torture_test(dc)) { - if ((get_random_int() & 3) == 3) + if 
(prandom_u32_max(4) == 3) goto skip; else goto rescale; diff --git a/drivers/media/test-drivers/vivid/vivid-touch-cap.c b/drivers/media/test-drivers/vivid/vivid-touch-cap.c index 64e3e4cb30c20..792660a85bc11 100644 --- a/drivers/media/test-drivers/vivid/vivid-touch-cap.c +++ b/drivers/media/test-drivers/vivid/vivid-touch-cap.c @@ -221,7 +221,7 @@ static void vivid_fill_buff_noise(__s16 *tch_buf, int size) static inline int get_random_pressure(void) { - return get_random_int() % VIVID_PRESSURE_LIMIT; + return prandom_u32_max(VIVID_PRESSURE_LIMIT); } static void vivid_tch_buf_set(struct v4l2_pix_format *f, diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index ef53a25788248..95fa8fb1d45f2 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -97,8 +97,8 @@ static void mmc_should_fail_request(struct mmc_host *host, !should_fail(&host->fail_mmc_request, data->blksz * data->blocks)) return; - data->error = data_errors[prandom_u32() % ARRAY_SIZE(data_errors)]; - data->bytes_xfered = (prandom_u32() % (data->bytes_xfered >> 9)) << 9; + data->error = data_errors[prandom_u32_max(ARRAY_SIZE(data_errors))]; + data->bytes_xfered = prandom_u32_max(data->bytes_xfered >> 9) << 9; } #else /* CONFIG_FAIL_MMC_REQUEST */ diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 581614196a841..c78bbc22e0d1e 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -1858,7 +1858,7 @@ static void dw_mci_start_fault_timer(struct dw_mci *host) * Try to inject the error at random points during the data transfer. 
*/ hrtimer_start(&host->fault_timer, - ms_to_ktime(prandom_u32() % 25), + ms_to_ktime(prandom_u32_max(25)), HRTIMER_MODE_REL); } diff --git a/drivers/mtd/nand/raw/nandsim.c b/drivers/mtd/nand/raw/nandsim.c index 24beade95c7fb..50bcf745e8164 100644 --- a/drivers/mtd/nand/raw/nandsim.c +++ b/drivers/mtd/nand/raw/nandsim.c @@ -1405,9 +1405,9 @@ static void ns_do_bit_flips(struct nandsim *ns, int num) if (bitflips && prandom_u32() < (1 << 22)) { int flips = 1; if (bitflips > 1) - flips = (prandom_u32() % (int) bitflips) + 1; + flips = prandom_u32_max(bitflips) + 1; while (flips--) { - int pos = prandom_u32() % (num * 8); + int pos = prandom_u32_max(num * 8); ns->buf.byte[pos / 8] ^= (1 << (pos % 8)); NS_WARN("read_page: flipping bit %d in page %d " "reading from %d ecc: corrected=%u failed=%u\n", diff --git a/drivers/mtd/tests/mtd_nandecctest.c b/drivers/mtd/tests/mtd_nandecctest.c index c4f271314f526..1c7201b0f372d 100644 --- a/drivers/mtd/tests/mtd_nandecctest.c +++ b/drivers/mtd/tests/mtd_nandecctest.c @@ -47,7 +47,7 @@ struct nand_ecc_test { static void single_bit_error_data(void *error_data, void *correct_data, size_t size) { - unsigned int offset = prandom_u32() % (size * BITS_PER_BYTE); + unsigned int offset = prandom_u32_max(size * BITS_PER_BYTE); memcpy(error_data, correct_data, size); __change_bit_le(offset, error_data); @@ -58,9 +58,9 @@ static void double_bit_error_data(void *error_data, void *correct_data, { unsigned int offset[2]; - offset[0] = prandom_u32() % (size * BITS_PER_BYTE); + offset[0] = prandom_u32_max(size * BITS_PER_BYTE); do { - offset[1] = prandom_u32() % (size * BITS_PER_BYTE); + offset[1] = prandom_u32_max(size * BITS_PER_BYTE); } while (offset[0] == offset[1]); memcpy(error_data, correct_data, size); @@ -71,7 +71,7 @@ static void double_bit_error_data(void *error_data, void *correct_data, static unsigned int random_ecc_bit(size_t size) { - unsigned int offset = prandom_u32() % (3 * BITS_PER_BYTE); + unsigned int offset = 
prandom_u32_max(3 * BITS_PER_BYTE); if (size == 256) { /* @@ -79,7 +79,7 @@ static unsigned int random_ecc_bit(size_t size) * and 17th bit) in ECC code for 256 byte data block */ while (offset == 16 || offset == 17) - offset = prandom_u32() % (3 * BITS_PER_BYTE); + offset = prandom_u32_max(3 * BITS_PER_BYTE); } return offset; diff --git a/drivers/mtd/tests/stresstest.c b/drivers/mtd/tests/stresstest.c index cb29c8c1b3703..d2faaca7f19d7 100644 --- a/drivers/mtd/tests/stresstest.c +++ b/drivers/mtd/tests/stresstest.c @@ -45,9 +45,8 @@ static int rand_eb(void) unsigned int eb; again: - eb = prandom_u32(); /* Read or write up 2 eraseblocks at a time - hence 'ebcnt - 1' */ - eb %= (ebcnt - 1); + eb = prandom_u32_max(ebcnt - 1); if (bbt[eb]) goto again; return eb; @@ -55,20 +54,12 @@ again: static int rand_offs(void) { - unsigned int offs; - - offs = prandom_u32(); - offs %= bufsize; - return offs; + return prandom_u32_max(bufsize); } static int rand_len(int offs) { - unsigned int len; - - len = prandom_u32(); - len %= (bufsize - offs); - return len; + return prandom_u32_max(bufsize - offs); } static int do_read(void) @@ -127,7 +118,7 @@ static int do_write(void) static int do_operation(void) { - if (prandom_u32() & 1) + if (prandom_u32_max(2)) return do_read(); else return do_write(); diff --git a/drivers/mtd/ubi/debug.c b/drivers/mtd/ubi/debug.c index 31d427ee191a3..908d0e0885574 100644 --- a/drivers/mtd/ubi/debug.c +++ b/drivers/mtd/ubi/debug.c @@ -590,7 +590,7 @@ int ubi_dbg_power_cut(struct ubi_device *ubi, int caller) if (ubi->dbg.power_cut_max > ubi->dbg.power_cut_min) { range = ubi->dbg.power_cut_max - ubi->dbg.power_cut_min; - ubi->dbg.power_cut_counter += prandom_u32() % range; + ubi->dbg.power_cut_counter += prandom_u32_max(range); } return 0; } diff --git a/drivers/mtd/ubi/debug.h b/drivers/mtd/ubi/debug.h index 118248a5d7d48..dc8d8f83657a0 100644 --- a/drivers/mtd/ubi/debug.h +++ b/drivers/mtd/ubi/debug.h @@ -73,7 +73,7 @@ static inline int 
ubi_dbg_is_bgt_disabled(const struct ubi_device *ubi) static inline int ubi_dbg_is_bitflip(const struct ubi_device *ubi) { if (ubi->dbg.emulate_bitflips) - return !(prandom_u32() % 200); + return !prandom_u32_max(200); return 0; } @@ -87,7 +87,7 @@ static inline int ubi_dbg_is_bitflip(const struct ubi_device *ubi) static inline int ubi_dbg_is_write_failure(const struct ubi_device *ubi) { if (ubi->dbg.emulate_io_failures) - return !(prandom_u32() % 500); + return !prandom_u32_max(500); return 0; } @@ -101,7 +101,7 @@ static inline int ubi_dbg_is_write_failure(const struct ubi_device *ubi) static inline int ubi_dbg_is_erase_failure(const struct ubi_device *ubi) { if (ubi->dbg.emulate_io_failures) - return !(prandom_u32() % 400); + return !prandom_u32_max(400); return 0; } diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c index e86503d97f32b..f597b313acaa3 100644 --- a/drivers/net/ethernet/broadcom/cnic.c +++ b/drivers/net/ethernet/broadcom/cnic.c @@ -4105,8 +4105,7 @@ static int cnic_cm_alloc_mem(struct cnic_dev *dev) for (i = 0; i < MAX_CM_SK_TBL_SZ; i++) atomic_set(&cp->csk_tbl[i].ref_count, 0); - port_id = prandom_u32(); - port_id %= CNIC_LOCAL_PORT_RANGE; + port_id = prandom_u32_max(CNIC_LOCAL_PORT_RANGE); if (cnic_init_id_tbl(&cp->csk_port_tbl, CNIC_LOCAL_PORT_RANGE, CNIC_LOCAL_PORT_MIN, port_id)) { cnic_cm_free_mem(dev); diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c index 539992dad8ba3..a4256087ac828 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c @@ -919,8 +919,8 @@ static int csk_wait_memory(struct chtls_dev *cdev, current_timeo = *timeo_p; noblock = (*timeo_p ? 
false : true); if (csk_mem_free(cdev, sk)) { - current_timeo = (prandom_u32() % (HZ / 5)) + 2; - vm_wait = (prandom_u32() % (HZ / 5)) + 2; + current_timeo = prandom_u32_max(HZ / 5) + 2; + vm_wait = prandom_u32_max(HZ / 5) + 2; } add_wait_queue(sk_sleep(sk), &wait); diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 3e69079ed694b..7df78a721b04e 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -438,7 +438,7 @@ static int transmit(struct baycom_state *bc, int cnt, unsigned char stat) if ((--bc->hdlctx.slotcnt) > 0) return 0; bc->hdlctx.slotcnt = bc->ch_params.slottime; - if ((prandom_u32() % 256) > bc->ch_params.ppersist) + if (prandom_u32_max(256) > bc->ch_params.ppersist) return 0; } } diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c index a6184d6c7b15f..bef904325a0fb 100644 --- a/drivers/net/hamradio/hdlcdrv.c +++ b/drivers/net/hamradio/hdlcdrv.c @@ -377,7 +377,7 @@ void hdlcdrv_arbitrate(struct net_device *dev, struct hdlcdrv_state *s) if ((--s->hdlctx.slotcnt) > 0) return; s->hdlctx.slotcnt = s->ch_params.slottime; - if ((prandom_u32() % 256) > s->ch_params.ppersist) + if (prandom_u32_max(256) > s->ch_params.ppersist) return; start_tx(dev, s); } diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index 980f2be32f05a..97a6cc5c7ae89 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c @@ -626,7 +626,7 @@ static void yam_arbitrate(struct net_device *dev) yp->slotcnt = yp->slot / 10; /* is random > persist ? 
*/ - if ((prandom_u32() % 256) > yp->pers) + if (prandom_u32_max(256) > yp->pers) return; yam_start_tx(dev, yp); diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index 9e9adde335c83..349b7b1dbbf29 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -1758,7 +1758,7 @@ static int qca808x_phy_fast_retrain_config(struct phy_device *phydev) static int qca808x_phy_ms_random_seed_set(struct phy_device *phydev) { - u16 seed_value = (prandom_u32() % QCA808X_MASTER_SLAVE_SEED_RANGE); + u16 seed_value = prandom_u32_max(QCA808X_MASTER_SLAVE_SEED_RANGE); return at803x_debug_reg_mask(phydev, QCA808X_PHY_DEBUG_LOCAL_SEED, QCA808X_MASTER_SLAVE_SEED_CFG, diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c index 479041f070f98..10d9d9c63b281 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c @@ -1128,7 +1128,7 @@ static void brcmf_p2p_afx_handler(struct work_struct *work) if (afx_hdl->is_listen && afx_hdl->my_listen_chan) /* 100ms ~ 300ms */ err = brcmf_p2p_discover_listen(p2p, afx_hdl->my_listen_chan, - 100 * (1 + prandom_u32() % 3)); + 100 * (1 + prandom_u32_max(3))); else err = brcmf_p2p_act_frm_search(p2p, afx_hdl->peer_listen_chan); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c index ed586e6d7d64b..de0c545d50fd5 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c @@ -1099,7 +1099,7 @@ static void iwl_mvm_mac_ctxt_cmd_fill_ap(struct iwl_mvm *mvm, iwl_mvm_mac_ap_iterator, &data); if (data.beacon_device_ts) { - u32 rand = (prandom_u32() % (64 - 36)) + 36; + u32 rand = prandom_u32_max(64 - 36) + 36; mvmvif->ap_beacon_time = data.beacon_device_ts + ieee80211_tu_to_usec(data.beacon_int * rand / 100); diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c 
b/drivers/scsi/fcoe/fcoe_ctlr.c index 39e16eab47aad..ddc048069af25 100644 --- a/drivers/scsi/fcoe/fcoe_ctlr.c +++ b/drivers/scsi/fcoe/fcoe_ctlr.c @@ -2233,7 +2233,7 @@ static void fcoe_ctlr_vn_restart(struct fcoe_ctlr *fip) if (fip->probe_tries < FIP_VN_RLIM_COUNT) { fip->probe_tries++; - wait = prandom_u32() % FIP_VN_PROBE_WAIT; + wait = prandom_u32_max(FIP_VN_PROBE_WAIT); } else wait = FIP_VN_RLIM_INT; mod_timer(&fip->timer, jiffies + msecs_to_jiffies(wait)); @@ -3125,7 +3125,7 @@ static void fcoe_ctlr_vn_timeout(struct fcoe_ctlr *fip) fcoe_all_vn2vn, 0); fip->port_ka_time = jiffies + msecs_to_jiffies(FIP_VN_BEACON_INT + - (prandom_u32() % FIP_VN_BEACON_FUZZ)); + prandom_u32_max(FIP_VN_BEACON_FUZZ)); } if (time_before(fip->port_ka_time, next_time)) next_time = fip->port_ka_time; diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c index cecfb2cb4c7be..df2fe7bd26d1b 100644 --- a/drivers/scsi/qedi/qedi_main.c +++ b/drivers/scsi/qedi/qedi_main.c @@ -618,7 +618,7 @@ static int qedi_cm_alloc_mem(struct qedi_ctx *qedi) sizeof(struct qedi_endpoint *)), GFP_KERNEL); if (!qedi->ep_tbl) return -ENOMEM; - port_id = prandom_u32() % QEDI_LOCAL_PORT_RANGE; + port_id = prandom_u32_max(QEDI_LOCAL_PORT_RANGE); if (qedi_init_id_tbl(&qedi->lcl_port_tbl, QEDI_LOCAL_PORT_RANGE, QEDI_LOCAL_PORT_MIN, port_id)) { qedi_cm_free_mem(qedi); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 42351d7a0dd6b..f0c6e7e7b92b9 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -362,7 +362,7 @@ static int ceph_fill_fragtree(struct inode *inode, if (nsplits != ci->i_fragtree_nsplits) { update = true; } else if (nsplits) { - i = prandom_u32() % nsplits; + i = prandom_u32_max(nsplits); id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) update = true; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 8d0a6d2c2da43..3fbabc98e1f70 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -29,7 +29,7 @@ static int __mdsmap_get_random_mds(struct 
ceph_mdsmap *m, bool ignore_laggy) return -1; /* pick */ - n = prandom_u32() % n; + n = prandom_u32_max(n); for (j = 0, i = 0; i < m->possible_max_rank; i++) { if (CEPH_MDS_IS_READY(i, ignore_laggy)) j++; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d733db8a0b026..989365b878a67 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3782,8 +3782,7 @@ cont_thread: } if (!progress) { elr->lr_next_sched = jiffies + - (prandom_u32() - % (EXT4_DEF_LI_MAX_START_DELAY * HZ)); + prandom_u32_max(EXT4_DEF_LI_MAX_START_DELAY * HZ); } if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; @@ -3930,8 +3929,8 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, * spread the inode table initialization requests * better. */ - elr->lr_next_sched = jiffies + (prandom_u32() % - (EXT4_DEF_LI_MAX_START_DELAY * HZ)); + elr->lr_next_sched = jiffies + prandom_u32_max( + EXT4_DEF_LI_MAX_START_DELAY * HZ); return elr; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d36bcb23ccfec..4546e01b2ee08 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -282,7 +282,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, /* let's select beginning hot/small space first in no_heap mode*/ if (f2fs_need_rand_seg(sbi)) - p->offset = prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec); + p->offset = prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); else if (test_opt(sbi, NOHEAP) && (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 289bcb7ca3009..acf3d3fa43635 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2534,7 +2534,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) sanity_check_seg_type(sbi, seg_type); if (f2fs_need_rand_seg(sbi)) - return prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec); + return prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); /* if segs_per_sec is large than 1, we need to keep original policy. 
*/ if (__is_large_section(sbi)) @@ -2588,7 +2588,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->alloc_type = LFS; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) curseg->fragment_remained_chunk = - prandom_u32() % sbi->max_fragment_chunk + 1; + prandom_u32_max(sbi->max_fragment_chunk) + 1; } static int __next_free_blkoff(struct f2fs_sb_info *sbi, @@ -2625,9 +2625,9 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, /* To allocate block chunks in different sizes, use random number */ if (--seg->fragment_remained_chunk <= 0) { seg->fragment_remained_chunk = - prandom_u32() % sbi->max_fragment_chunk + 1; + prandom_u32_max(sbi->max_fragment_chunk) + 1; seg->next_blkoff += - prandom_u32() % sbi->max_fragment_hole + 1; + prandom_u32_max(sbi->max_fragment_hole) + 1; } } } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index fc718f6178f25..f4d3b568aa64a 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2467,7 +2467,7 @@ error_dump: static inline int chance(unsigned int n, unsigned int out_of) { - return !!((prandom_u32() % out_of) + 1 <= n); + return !!(prandom_u32_max(out_of) + 1 <= n); } @@ -2485,13 +2485,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) if (chance(1, 2)) { d->pc_delay = 1; /* Fail within 1 minute */ - delay = prandom_u32() % 60000; + delay = prandom_u32_max(60000); d->pc_timeout = jiffies; d->pc_timeout += msecs_to_jiffies(delay); ubifs_warn(c, "failing after %lums", delay); } else { d->pc_delay = 2; - delay = prandom_u32() % 10000; + delay = prandom_u32_max(10000); /* Fail within 10000 operations */ d->pc_cnt_max = delay; ubifs_warn(c, "failing after %lu calls", delay); @@ -2571,7 +2571,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, unsigned int from, to, ffs = chance(1, 2); unsigned char *p = (void *)buf; - from = prandom_u32() % len; + from = prandom_u32_max(len); /* Corruption span max to end of write unit */ to = min(len, 
ALIGN(from + 1, c->max_write_size)); diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index d76a19e460cd4..cfbc31f709f4b 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1970,28 +1970,28 @@ static int dbg_populate_lsave(struct ubifs_info *c) if (!dbg_is_chk_gen(c)) return 0; - if (prandom_u32() & 3) + if (prandom_u32_max(4)) return 0; for (i = 0; i < c->lsave_cnt; i++) c->lsave[i] = c->main_first; list_for_each_entry(lprops, &c->empty_list, list) - c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; list_for_each_entry(lprops, &c->freeable_list, list) - c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; list_for_each_entry(lprops, &c->frdi_idx_list, list) - c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_DIRTY - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_FREE - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; return 1; } diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 58c92c96ecef2..01362ad5f804a 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -700,7 +700,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt) c->ilebs[c->ileb_cnt++] = lnum; dbg_cmt("LEB %d", lnum); } - if (dbg_is_chk_index(c) && !(prandom_u32() & 7)) + if (dbg_is_chk_index(c) && !prandom_u32_max(8)) return -ENOSPC; return 0; } diff --git a/fs/xfs/libxfs/xfs_alloc.c 
b/fs/xfs/libxfs/xfs_alloc.c index e2bdf089c0a31..6261599bb389a 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1520,7 +1520,7 @@ xfs_alloc_ag_vextent_lastblock( #ifdef DEBUG /* Randomly don't execute the first algorithm. */ - if (prandom_u32() & 1) + if (prandom_u32_max(2)) return 0; #endif diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 6cdfd64bc56bd..7838b31126e22 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -636,7 +636,7 @@ xfs_ialloc_ag_alloc( /* randomly do sparse inode allocations */ if (xfs_has_sparseinodes(tp->t_mountp) && igeo->ialloc_min_blks < igeo->ialloc_blks) - do_sparse = prandom_u32() & 1; + do_sparse = prandom_u32_max(2); #endif /* diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 296faa41d81d5..7db588ed0be59 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -274,7 +274,7 @@ xfs_errortag_test( ASSERT(error_tag < XFS_ERRTAG_MAX); randfactor = mp->m_errortag[error_tag]; - if (!randfactor || prandom_u32() % randfactor) + if (!randfactor || prandom_u32_max(randfactor)) return false; xfs_warn_ratelimited(mp, diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 378956c93c947..efef68c9352a0 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -516,7 +516,7 @@ static inline int node_random(const nodemask_t *maskp) bit = first_node(*maskp); break; default: - bit = find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_int() % w); + bit = find_nth_bit(maskp->bits, MAX_NUMNODES, prandom_u32_max(w)); break; } return bit; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 59cf4dc728a55..53c6c98bda7b6 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1032,7 +1032,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); - start = (get_random_int() % hole) & ~(alignment - 1); + start = 
prandom_u32_max(hole) & ~(alignment - 1); /* Leave a random number of instructions before BPF code. */ *image_ptr = &hdr->image[start]; @@ -1094,7 +1094,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); - start = (get_random_int() % hole) & ~(alignment - 1); + start = prandom_u32_max(hole) & ~(alignment - 1); *image_ptr = &ro_header->image[start]; *rw_image = &(*rw_header)->image[start]; diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 353004155d659..43efb2a041602 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -399,7 +399,7 @@ static int *get_random_order(int count) order[n] = n; for (n = count - 1; n > 1; n--) { - r = get_random_int() % (n + 1); + r = prandom_u32_max(n + 1); if (r != n) { tmp = order[n]; order[n] = order[r]; @@ -538,7 +538,7 @@ static void stress_one_work(struct work_struct *work) { struct stress *stress = container_of(work, typeof(*stress), work); const int nlocks = stress->nlocks; - struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks); + struct ww_mutex *lock = stress->locks + prandom_u32_max(nlocks); int err; do { diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index cee5da1e54c41..8058bec87acee 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -310,7 +310,7 @@ static void clocksource_verify_choose_cpus(void) * CPUs that are currently online. 
*/ for (i = 1; i < n; i++) { - cpu = prandom_u32() % nr_cpu_ids; + cpu = prandom_u32_max(nr_cpu_ids); cpu = cpumask_next(cpu - 1, cpu_online_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(cpu_online_mask); diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 423784d9c058e..96e092de5b723 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -139,7 +139,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size) return false; } - if (attr->probability <= prandom_u32() % 100) + if (attr->probability <= prandom_u32_max(100)) return false; if (!fail_stacktrace(attr)) diff --git a/lib/find_bit_benchmark.c b/lib/find_bit_benchmark.c index 10754586403b1..7c3c011abd294 100644 --- a/lib/find_bit_benchmark.c +++ b/lib/find_bit_benchmark.c @@ -174,8 +174,8 @@ static int __init find_bit_test(void) bitmap_zero(bitmap2, BITMAP_LEN); while (nbits--) { - __set_bit(prandom_u32() % BITMAP_LEN, bitmap); - __set_bit(prandom_u32() % BITMAP_LEN, bitmap2); + __set_bit(prandom_u32_max(BITMAP_LEN), bitmap); + __set_bit(prandom_u32_max(BITMAP_LEN), bitmap2); } test_find_next_bit(bitmap, BITMAP_LEN); diff --git a/lib/kobject.c b/lib/kobject.c index 5f0e71ab292cb..a0b2dbfcfa233 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -694,7 +694,7 @@ static void kobject_release(struct kref *kref) { struct kobject *kobj = container_of(kref, struct kobject, kref); #ifdef CONFIG_DEBUG_KOBJECT_RELEASE - unsigned long delay = HZ + HZ * (get_random_int() & 0x3); + unsigned long delay = HZ + HZ * prandom_u32_max(4); pr_info("kobject: '%s' (%p): %s, parent %p (delayed %ld)\n", kobject_name(kobj), kobj, __func__, kobj->parent, delay); INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup); diff --git a/lib/reed_solomon/test_rslib.c b/lib/reed_solomon/test_rslib.c index d9d1c33aebdae..4d241bdc88aa8 100644 --- a/lib/reed_solomon/test_rslib.c +++ b/lib/reed_solomon/test_rslib.c @@ -183,7 +183,7 @@ static int get_rcw_we(struct rs_control *rs, struct wspace *ws, do { /* Must not choose the same 
location twice */ - errloc = prandom_u32() % len; + errloc = prandom_u32_max(len); } while (errlocs[errloc] != 0); errlocs[errloc] = 1; @@ -194,12 +194,12 @@ static int get_rcw_we(struct rs_control *rs, struct wspace *ws, for (i = 0; i < eras; i++) { do { /* Must not choose the same location twice */ - errloc = prandom_u32() % len; + errloc = prandom_u32_max(len); } while (errlocs[errloc] != 0); derrlocs[i] = errloc; - if (ewsc && (prandom_u32() & 1)) { + if (ewsc && prandom_u32_max(2)) { /* Erasure with the symbol intact */ errlocs[errloc] = 2; } else { diff --git a/lib/sbitmap.c b/lib/sbitmap.c index a8108a962dfd4..055dac069afb9 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -33,7 +33,7 @@ static inline unsigned update_alloc_hint_before_get(struct sbitmap *sb, hint = this_cpu_read(*sb->alloc_hint); if (unlikely(hint >= depth)) { - hint = depth ? prandom_u32() % depth : 0; + hint = depth ? prandom_u32_max(depth) : 0; this_cpu_write(*sb->alloc_hint, hint); } diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c index 437d8e6b7cb12..86fadd3ba08c5 100644 --- a/lib/test-string_helpers.c +++ b/lib/test-string_helpers.c @@ -587,7 +587,7 @@ static int __init test_string_helpers_init(void) for (i = 0; i < UNESCAPE_ALL_MASK + 1; i++) test_string_unescape("unescape", i, false); test_string_unescape("unescape inplace", - get_random_int() % (UNESCAPE_ANY + 1), true); + prandom_u32_max(UNESCAPE_ANY + 1), true); /* Without dictionary */ for (i = 0; i < ESCAPE_ALL_MASK + 1; i++) diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c index 5144899d3c6b8..0927f44cd4787 100644 --- a/lib/test_hexdump.c +++ b/lib/test_hexdump.c @@ -149,7 +149,7 @@ static void __init test_hexdump(size_t len, int rowsize, int groupsize, static void __init test_hexdump_set(int rowsize, bool ascii) { size_t d = min_t(size_t, sizeof(data_b), rowsize); - size_t len = get_random_int() % d + 1; + size_t len = prandom_u32_max(d) + 1; test_hexdump(len, rowsize, 4, ascii); test_hexdump(len, 
rowsize, 2, ascii); @@ -208,11 +208,11 @@ static void __init test_hexdump_overflow(size_t buflen, size_t len, static void __init test_hexdump_overflow_set(size_t buflen, bool ascii) { unsigned int i = 0; - int rs = (get_random_int() % 2 + 1) * 16; + int rs = (prandom_u32_max(2) + 1) * 16; do { int gs = 1 << i; - size_t len = get_random_int() % rs + gs; + size_t len = prandom_u32_max(rs) + gs; test_hexdump_overflow(buflen, rounddown(len, gs), rs, gs, ascii); } while (i++ < 3); @@ -223,11 +223,11 @@ static int __init test_hexdump_init(void) unsigned int i; int rowsize; - rowsize = (get_random_int() % 2 + 1) * 16; + rowsize = (prandom_u32_max(2) + 1) * 16; for (i = 0; i < 16; i++) test_hexdump_set(rowsize, false); - rowsize = (get_random_int() % 2 + 1) * 16; + rowsize = (prandom_u32_max(2) + 1) * 16; for (i = 0; i < 16; i++) test_hexdump_set(rowsize, true); diff --git a/lib/test_list_sort.c b/lib/test_list_sort.c index ade7a1ea0c8e2..19ff229b9c3a7 100644 --- a/lib/test_list_sort.c +++ b/lib/test_list_sort.c @@ -71,7 +71,7 @@ static void list_sort_test(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, el); /* force some equivalencies */ - el->value = prandom_u32() % (TEST_LIST_LEN / 3); + el->value = prandom_u32_max(TEST_LIST_LEN / 3); el->serial = i; el->poison1 = TEST_POISON1; el->poison2 = TEST_POISON2; diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index f25692def7813..2503ae2ae65d3 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -1292,7 +1292,7 @@ static void match_all_not_assigned(struct kunit *test) KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); for (i = 0; i < 256; i++) { - size = (get_random_int() % 1024) + 1; + size = prandom_u32_max(1024) + 1; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); @@ -1301,7 +1301,7 @@ static void match_all_not_assigned(struct kunit *test) } for (i = 0; i < 256; i++) { - order = (get_random_int() % 4) + 
1; + order = prandom_u32_max(4) + 1; pages = alloc_pages(GFP_KERNEL, order); ptr = page_address(pages); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); @@ -1314,7 +1314,7 @@ static void match_all_not_assigned(struct kunit *test) return; for (i = 0; i < 256; i++) { - size = (get_random_int() % 1024) + 1; + size = prandom_u32_max(1024) + 1; ptr = vmalloc(size); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); diff --git a/mm/slub.c b/mm/slub.c index 96dd392d7f99f..157527d7101be 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1881,7 +1881,7 @@ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) return false; freelist_count = oo_objects(s->oo); - pos = get_random_int() % freelist_count; + pos = prandom_u32_max(freelist_count); page_limit = slab->objects * s->size; start = fixup_red_left(s, slab_address(slab)); diff --git a/net/802/garp.c b/net/802/garp.c index f6012f8e59f00..fc9eb02a912f8 100644 --- a/net/802/garp.c +++ b/net/802/garp.c @@ -407,7 +407,7 @@ static void garp_join_timer_arm(struct garp_applicant *app) { unsigned long delay; - delay = (u64)msecs_to_jiffies(garp_join_time) * prandom_u32() >> 32; + delay = prandom_u32_max(msecs_to_jiffies(garp_join_time)); mod_timer(&app->join_timer, jiffies + delay); } diff --git a/net/802/mrp.c b/net/802/mrp.c index 35e04cc5390c4..155f74d8b14f4 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -592,7 +592,7 @@ static void mrp_join_timer_arm(struct mrp_applicant *app) { unsigned long delay; - delay = (u64)msecs_to_jiffies(mrp_join_time) * prandom_u32() >> 32; + delay = prandom_u32_max(msecs_to_jiffies(mrp_join_time)); mod_timer(&app->join_timer, jiffies + delay); } diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 6a6898ee40495..db60217f911b3 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -222,7 +222,7 @@ static void pick_new_mon(struct ceph_mon_client *monc) max--; } - n = prandom_u32() % max; + n = prandom_u32_max(max); if (o >= 0 
&& n >= o) n++; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 87b883c7bfd64..4e4f1e4bc265a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1479,7 +1479,7 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, static int pick_random_replica(const struct ceph_osds *acting) { - int i = prandom_u32() % acting->size; + int i = prandom_u32_max(acting->size); dout("%s picked osd%d, primary osd%d\n", __func__, acting->osds[i], acting->primary); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e93edb8101036..3c4786b999070 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -111,7 +111,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh) unsigned long neigh_rand_reach_time(unsigned long base) { - return base ? (prandom_u32() % base) + (base >> 1) : 0; + return base ? prandom_u32_max(base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 88906ba6d9a78..5ca4f953034ca 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2324,7 +2324,7 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) pkt_dev->curfl = 0; /*reset */ } } else { - flow = prandom_u32() % pkt_dev->cflows; + flow = prandom_u32_max(pkt_dev->cflows); pkt_dev->curfl = flow; if (pkt_dev->flows[flow].count > pkt_dev->lflow) { @@ -2380,10 +2380,9 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev) else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { __u16 t; if (pkt_dev->flags & F_QUEUE_MAP_RND) { - t = prandom_u32() % - (pkt_dev->queue_map_max - - pkt_dev->queue_map_min + 1) - + pkt_dev->queue_map_min; + t = prandom_u32_max(pkt_dev->queue_map_max - + pkt_dev->queue_map_min + 1) + + pkt_dev->queue_map_min; } else { t = pkt_dev->cur_queue_map + 1; if (t > pkt_dev->queue_map_max) @@ -2412,7 +2411,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACSRC_RND) - mc = prandom_u32() % 
pkt_dev->src_mac_count; + mc = prandom_u32_max(pkt_dev->src_mac_count); else { mc = pkt_dev->cur_src_mac_offset++; if (pkt_dev->cur_src_mac_offset >= @@ -2438,7 +2437,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACDST_RND) - mc = prandom_u32() % pkt_dev->dst_mac_count; + mc = prandom_u32_max(pkt_dev->dst_mac_count); else { mc = pkt_dev->cur_dst_mac_offset++; @@ -2470,18 +2469,18 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) } if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) { - pkt_dev->vlan_id = prandom_u32() & (4096 - 1); + pkt_dev->vlan_id = prandom_u32_max(4096); } if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) { - pkt_dev->svlan_id = prandom_u32() & (4096 - 1); + pkt_dev->svlan_id = prandom_u32_max(4096); } if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) { if (pkt_dev->flags & F_UDPSRC_RND) - pkt_dev->cur_udp_src = prandom_u32() % - (pkt_dev->udp_src_max - pkt_dev->udp_src_min) - + pkt_dev->udp_src_min; + pkt_dev->cur_udp_src = prandom_u32_max( + pkt_dev->udp_src_max - pkt_dev->udp_src_min) + + pkt_dev->udp_src_min; else { pkt_dev->cur_udp_src++; @@ -2492,9 +2491,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) { if (pkt_dev->flags & F_UDPDST_RND) { - pkt_dev->cur_udp_dst = prandom_u32() % - (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) - + pkt_dev->udp_dst_min; + pkt_dev->cur_udp_dst = prandom_u32_max( + pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) + + pkt_dev->udp_dst_min; } else { pkt_dev->cur_udp_dst++; if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max) @@ -2509,7 +2508,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (imn < imx) { __u32 t; if (pkt_dev->flags & F_IPSRC_RND) - t = prandom_u32() % (imx - imn) + imn; + t = prandom_u32_max(imx - imn) + imn; else { t = ntohl(pkt_dev->cur_saddr); t++; @@ -2531,8 +2530,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if 
(pkt_dev->flags & F_IPDST_RND) { do { - t = prandom_u32() % - (imx - imn) + imn; + t = prandom_u32_max(imx - imn) + + imn; s = htonl(t); } while (ipv4_is_loopback(s) || ipv4_is_multicast(s) || @@ -2579,9 +2578,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { __u32 t; if (pkt_dev->flags & F_TXSIZE_RND) { - t = prandom_u32() % - (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size) - + pkt_dev->min_pkt_size; + t = prandom_u32_max(pkt_dev->max_pkt_size - + pkt_dev->min_pkt_size) + + pkt_dev->min_pkt_size; } else { t = pkt_dev->cur_pkt_size + 1; if (t > pkt_dev->max_pkt_size) @@ -2590,7 +2589,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->cur_pkt_size = t; } else if (pkt_dev->n_imix_entries > 0) { struct imix_pkt *entry; - __u32 t = prandom_u32() % IMIX_PRECISION; + __u32 t = prandom_u32_max(IMIX_PRECISION); __u8 entry_index = pkt_dev->imix_distribution[t]; entry = &pkt_dev->imix_entries[entry_index]; diff --git a/net/core/stream.c b/net/core/stream.c index 1105057ce00a5..75fded8495f5b 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -123,7 +123,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) DEFINE_WAIT_FUNC(wait, woken_wake_function); if (sk_stream_memory_free(sk)) - current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2; + current_timeo = vm_wait = prandom_u32_max(HZ / 5) + 2; add_wait_queue(sk_sleep(sk), &wait); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index df0660d818ac5..81be3e0f0e704 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -213,7 +213,7 @@ static void igmp_stop_timer(struct ip_mc_list *im) /* It must be called with locked im->lock */ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) { - int tv = prandom_u32() % max_delay; + int tv = prandom_u32_max(max_delay); im->tm_running = 1; if (!mod_timer(&im->timer, jiffies+tv+2)) @@ -222,7 +222,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) static void 
igmp_gq_start_timer(struct in_device *in_dev) { - int tv = prandom_u32() % in_dev->mr_maxdelay; + int tv = prandom_u32_max(in_dev->mr_maxdelay); unsigned long exp = jiffies + tv + 2; if (in_dev->mr_gq_running && @@ -236,7 +236,7 @@ static void igmp_gq_start_timer(struct in_device *in_dev) static void igmp_ifc_start_timer(struct in_device *in_dev, int delay) { - int tv = prandom_u32() % delay; + int tv = prandom_u32_max(delay); if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2)) in_dev_hold(in_dev); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index ebca860e113f9..4e84ed21d16fe 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -314,7 +314,7 @@ other_half_scan: if (likely(remaining > 1)) remaining &= ~1U; - offset = prandom_u32() % remaining; + offset = prandom_u32_max(remaining); /* __inet_hash_connect() favors ports having @low parity * We do the opposite to not pollute connect() users. */ diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index a0ad34e4f044b..d3dc281566229 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -1037,7 +1037,7 @@ ok: * on low contention the randomness is maximal and on high contention * it may be inexistent. */ - i = max_t(int, i, (prandom_u32() & 7) * 2); + i = max_t(int, i, prandom_u32_max(8) * 2); WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); /* Head lock still held and bh's disabled */ diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 10ce86bf228e1..417834b7169d7 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -104,7 +104,7 @@ static inline u32 cstamp_delta(unsigned long cstamp) static inline s32 rfc3315_s14_backoff_init(s32 irt) { /* multiply 'initial retransmission time' by 0.9 .. 
1.1 */ - u64 tmp = (900000 + prandom_u32() % 200001) * (u64)irt; + u64 tmp = (900000 + prandom_u32_max(200001)) * (u64)irt; do_div(tmp, 1000000); return (s32)tmp; } @@ -112,11 +112,11 @@ static inline s32 rfc3315_s14_backoff_init(s32 irt) static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt) { /* multiply 'retransmission timeout' by 1.9 .. 2.1 */ - u64 tmp = (1900000 + prandom_u32() % 200001) * (u64)rt; + u64 tmp = (1900000 + prandom_u32_max(200001)) * (u64)rt; do_div(tmp, 1000000); if ((s32)tmp > mrt) { /* multiply 'maximum retransmission time' by 0.9 .. 1.1 */ - tmp = (900000 + prandom_u32() % 200001) * (u64)mrt; + tmp = (900000 + prandom_u32_max(200001)) * (u64)mrt; do_div(tmp, 1000000); } return (s32)tmp; @@ -3967,7 +3967,7 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp) if (ifp->flags & IFA_F_OPTIMISTIC) rand_num = 0; else - rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1); + rand_num = prandom_u32_max(idev->cnf.rtr_solicit_delay ?: 1); nonce = 0; if (idev->cnf.enhanced_dad || diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 0566ab03ddbee..7860383295d84 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1050,7 +1050,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, /* called with mc_lock */ static void mld_gq_start_work(struct inet6_dev *idev) { - unsigned long tv = prandom_u32() % idev->mc_maxdelay; + unsigned long tv = prandom_u32_max(idev->mc_maxdelay); idev->mc_gq_running = 1; if (!mod_delayed_work(mld_wq, &idev->mc_gq_work, tv + 2)) @@ -1068,7 +1068,7 @@ static void mld_gq_stop_work(struct inet6_dev *idev) /* called with mc_lock */ static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay) { - unsigned long tv = prandom_u32() % delay; + unsigned long tv = prandom_u32_max(delay); if (!mod_delayed_work(mld_wq, &idev->mc_ifc_work, tv + 2)) in6_dev_hold(idev); @@ -1085,7 +1085,7 @@ static void mld_ifc_stop_work(struct inet6_dev *idev) /* called with mc_lock */ 
static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay) { - unsigned long tv = prandom_u32() % delay; + unsigned long tv = prandom_u32_max(delay); if (!mod_delayed_work(mld_wq, &idev->mc_dad_work, tv + 2)) in6_dev_hold(idev); @@ -1130,7 +1130,7 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) } if (delay >= resptime) - delay = prandom_u32() % resptime; + delay = prandom_u32_max(resptime); if (!mod_delayed_work(mld_wq, &ma->mca_work, delay)) refcount_inc(&ma->mca_refcnt); @@ -2574,7 +2574,7 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); - delay = prandom_u32() % unsolicited_report_interval(ma->idev); + delay = prandom_u32_max(unsolicited_report_interval(ma->idev)); if (cancel_delayed_work(&ma->mca_work)) { refcount_dec(&ma->mca_refcnt); diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c index acb55d8393ef6..f2579fc9c75bd 100644 --- a/net/netfilter/ipvs/ip_vs_twos.c +++ b/net/netfilter/ipvs/ip_vs_twos.c @@ -71,8 +71,8 @@ static struct ip_vs_dest *ip_vs_twos_schedule(struct ip_vs_service *svc, * from 0 to total_weight */ total_weight += 1; - rweight1 = prandom_u32() % total_weight; - rweight2 = prandom_u32() % total_weight; + rweight1 = prandom_u32_max(total_weight); + rweight2 = prandom_u32_max(total_weight); /* Pick two weighted servers */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d3f6db350de77..6ce8dd19f33c3 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1350,7 +1350,7 @@ static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) if (READ_ONCE(history[i]) == rxhash) count++; - victim = prandom_u32() % ROLLOVER_HLEN; + victim = prandom_u32_max(ROLLOVER_HLEN); /* Avoid dirtying the cache line if possible */ if (READ_ONCE(history[victim]) != rxhash) diff --git a/net/sched/act_gact.c 
b/net/sched/act_gact.c index abe1bcc5c7971..62d682b96b885 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -25,7 +25,7 @@ static struct tc_action_ops act_gact_ops; static int gact_net_rand(struct tcf_gact *gact) { smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */ - if (prandom_u32() % gact->tcfg_pval) + if (prandom_u32_max(gact->tcfg_pval)) return gact->tcf_action; return gact->tcfg_paction; } diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 5ba36f70e3a13..7a25477f5d996 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -168,7 +168,7 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, psample_group = rcu_dereference_bh(s->psample_group); /* randomly sample packets according to rate */ - if (psample_group && (prandom_u32() % s->rate == 0)) { + if (psample_group && (prandom_u32_max(s->rate) == 0)) { if (!skb_at_tc_ingress(skb)) { md.in_ifindex = skb->skb_iif; md.out_ifindex = skb->dev->ifindex; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 18f4273a835b9..bab45b3b1fdb5 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -513,8 +513,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, goto finish_segs; } - skb->data[prandom_u32() % skb_headlen(skb)] ^= - 1<<(prandom_u32() % 8); + skb->data[prandom_u32_max(skb_headlen(skb))] ^= + 1<<prandom_u32_max(8); } if (unlikely(sch->q.qlen >= sch->limit)) { diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 171f1a35d2052..1e354ba449607 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -8319,7 +8319,7 @@ static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr) inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rover = prandom_u32() % remaining + low; + rover = prandom_u32_max(remaining) + low; do { rover++; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index c3c693b51c945..f075a9fb5ccc6 100644 --- a/net/sunrpc/cache.c +++ 
b/net/sunrpc/cache.c @@ -677,7 +677,7 @@ static void cache_limit_defers(void) /* Consider removing either the first or the last */ if (cache_defer_cnt > DFR_MAX) { - if (prandom_u32() & 1) + if (prandom_u32_max(2)) discard = list_entry(cache_defer_list.next, struct cache_deferred_req, recent); else diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e976007f4fd00..f55ff5155b6e2 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1619,7 +1619,7 @@ static int xs_get_random_port(void) if (max < min) return -EADDRINUSE; range = max - min + 1; - rand = (unsigned short) prandom_u32() % range; + rand = prandom_u32_max(range); return rand + min; } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index f1c3b8eb4b3d3..e902b01ea3cb1 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3010,7 +3010,7 @@ static int tipc_sk_insert(struct tipc_sock *tsk) struct net *net = sock_net(sk); struct tipc_net *tn = net_generic(net, tipc_net_id); u32 remaining = (TIPC_MAX_PORT - TIPC_MIN_PORT) + 1; - u32 portid = prandom_u32() % remaining + TIPC_MIN_PORT; + u32 portid = prandom_u32_max(remaining) + TIPC_MIN_PORT; while (remaining--) { portid++; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 81df34b3da6ed..3d2fe7712ac5b 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2072,7 +2072,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) } else { u32 spi = 0; for (h = 0; h < high-low+1; h++) { - spi = low + prandom_u32()%(high-low+1); + spi = low + prandom_u32_max(high - low + 1); x0 = xfrm_state_lookup(net, mark, &x->id.daddr, htonl(spi), x->id.proto, x->props.family); if (x0 == NULL) { newspi = htonl(spi); -- GitLab From 8b3ccbc1f1f91847160951aa15dd27c22dddcb49 Mon Sep 17 00:00:00 2001 From: "Jason A. 
Donenfeld" <Jason@zx2c4.com> Date: Wed, 5 Oct 2022 16:43:38 +0200 Subject: [PATCH 1752/2223] treewide: use prandom_u32_max() when possible, part 2 Rather than incurring a division or requesting too many random bytes for the given range, use the prandom_u32_max() function, which only takes the minimum required bytes from the RNG and avoids divisions. This was done by hand, covering things that coccinelle could not do on its own. Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Yury Norov <yury.norov@gmail.com> Reviewed-by: Jan Kara <jack@suse.cz> # for ext2, ext4, and sbitmap Acked-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> --- fs/ext2/ialloc.c | 3 +-- fs/ext4/ialloc.c | 5 ++--- lib/sbitmap.c | 2 +- lib/test_vmalloc.c | 17 ++++------------- 4 files changed, 8 insertions(+), 19 deletions(-) diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 998dd2ac80089..f4944c4dee607 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -277,8 +277,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) int best_ndir = inodes_per_group; int best_group = -1; - group = prandom_u32(); - parent_group = (unsigned)group % ngroups; + parent_group = prandom_u32_max(ngroups); for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; desc = ext2_get_group_desc (sb, group, NULL); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 208b87ce88588..7575aa3596751 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -463,10 +463,9 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, hinfo.hash_version = DX_HASH_HALF_MD4; hinfo.seed = sbi->s_hash_seed; ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo); - grp = hinfo.hash; + parent_group = hinfo.hash % ngroups; } else - grp = prandom_u32(); - parent_group = (unsigned)grp % ngroups; + parent_group = prandom_u32_max(ngroups); for (i = 0; i < ngroups; i++) { g = 
(parent_group + i) % ngroups; get_orlov_stats(sb, g, flex_size, &stats); diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 055dac069afb9..7280ae8ca88c7 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -21,7 +21,7 @@ static int init_alloc_hint(struct sbitmap *sb, gfp_t flags) int i; for_each_possible_cpu(i) - *per_cpu_ptr(sb->alloc_hint, i) = prandom_u32() % depth; + *per_cpu_ptr(sb->alloc_hint, i) = prandom_u32_max(depth); } return 0; } diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 4f2f2d1bac562..a26bbbf20e62d 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -151,9 +151,7 @@ static int random_size_alloc_test(void) int i; for (i = 0; i < test_loop_count; i++) { - n = prandom_u32(); - n = (n % 100) + 1; - + n = prandom_u32_max(100) + 1; p = vmalloc(n * PAGE_SIZE); if (!p) @@ -293,16 +291,12 @@ pcpu_alloc_test(void) return -1; for (i = 0; i < 35000; i++) { - unsigned int r; - - r = prandom_u32(); - size = (r % (PAGE_SIZE / 4)) + 1; + size = prandom_u32_max(PAGE_SIZE / 4) + 1; /* * Maximum PAGE_SIZE */ - r = prandom_u32(); - align = 1 << ((r % 11) + 1); + align = 1 << (prandom_u32_max(11) + 1); pcpu[i] = __alloc_percpu(size, align); if (!pcpu[i]) @@ -393,14 +387,11 @@ static struct test_driver { static void shuffle_array(int *arr, int n) { - unsigned int rnd; int i, j; for (i = n - 1; i > 0; i--) { - rnd = prandom_u32(); - /* Cut the range. */ - j = rnd % i; + j = prandom_u32_max(i); /* Swap indexes. */ swap(arr[i], arr[j]); -- GitLab From 7e3cf0843fe505491baa05e355e83e6997e089dd Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" <Jason@zx2c4.com> Date: Wed, 5 Oct 2022 17:23:53 +0200 Subject: [PATCH 1753/2223] treewide: use get_random_{u8,u16}() when possible, part 1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather than truncate a 32-bit value to a 16-bit value or an 8-bit value, simply use the get_random_{u8,u16}() functions, which are faster than wasting the additional bytes from a 32-bit value. 
This was done mechanically with this coccinelle script: @@ expression E; identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; typedef u16; typedef __be16; typedef __le16; typedef u8; @@ ( - (get_random_u32() & 0xffff) + get_random_u16() | - (get_random_u32() & 0xff) + get_random_u8() | - (get_random_u32() % 65536) + get_random_u16() | - (get_random_u32() % 256) + get_random_u8() | - (get_random_u32() >> 16) + get_random_u16() | - (get_random_u32() >> 24) + get_random_u8() | - (u16)get_random_u32() + get_random_u16() | - (u8)get_random_u32() + get_random_u8() | - (__be16)get_random_u32() + (__be16)get_random_u16() | - (__le16)get_random_u32() + (__le16)get_random_u16() | - prandom_u32_max(65536) + get_random_u16() | - prandom_u32_max(256) + get_random_u8() | - E->inet_id = get_random_u32() + E->inet_id = get_random_u16() ) @@ identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; typedef u16; identifier v; @@ - u16 v = get_random_u32(); + u16 v = get_random_u16(); @@ identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; typedef u8; identifier v; @@ - u8 v = get_random_u32(); + u8 v = get_random_u8(); @@ identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; typedef u16; u16 v; @@ - v = get_random_u32(); + v = get_random_u16(); @@ identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; typedef u8; u8 v; @@ - v = get_random_u32(); + v = get_random_u8(); // Find a potential literal @literal_mask@ expression LITERAL; type T; identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; position p; @@ ((T)get_random_u32()@p & (LITERAL)) // Examine limits @script:python add_one@ literal << literal_mask.LITERAL; RESULT; @@ value = None if literal.startswith('0x'): value = int(literal, 16) elif literal[0] in '123456789': value = int(literal, 10) if value is None: print("I don't know how to handle %s" % (literal)) cocci.include_match(False) elif value < 256: 
coccinelle.RESULT = cocci.make_ident("get_random_u8") elif value < 65536: coccinelle.RESULT = cocci.make_ident("get_random_u16") else: print("Skipping large mask of %s" % (literal)) cocci.include_match(False) // Replace the literal mask with the calculated result. @plus_one@ expression literal_mask.LITERAL; position literal_mask.p; identifier add_one.RESULT; identifier FUNC; @@ - (FUNC()@p & (LITERAL)) + (RESULT() & LITERAL) Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Yury Norov <yury.norov@gmail.com> Acked-by: Jakub Kicinski <kuba@kernel.org> Acked-by: Toke Høiland-Jørgensen <toke@toke.dk> # for sch_cake Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> --- arch/arm/kernel/signal.c | 2 +- arch/arm64/kernel/syscall.c | 2 +- crypto/testmgr.c | 8 ++++---- drivers/media/common/v4l2-tpg/v4l2-tpg-core.c | 2 +- drivers/media/test-drivers/vivid/vivid-radio-rx.c | 4 ++-- .../net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 2 +- drivers/net/hamradio/baycom_epp.c | 2 +- drivers/net/hamradio/hdlcdrv.c | 2 +- drivers/net/hamradio/yam.c | 2 +- drivers/net/wireguard/selftest/allowedips.c | 4 ++-- drivers/net/wireless/st/cw1200/wsm.c | 2 +- drivers/scsi/lpfc/lpfc_hbadisc.c | 6 +++--- lib/cmdline_kunit.c | 4 ++-- net/dccp/ipv4.c | 4 ++-- net/ipv4/datagram.c | 2 +- net/ipv4/ip_output.c | 2 +- net/ipv4/tcp_ipv4.c | 4 ++-- net/mac80211/scan.c | 2 +- net/netfilter/nf_nat_core.c | 4 ++-- net/sched/sch_cake.c | 6 +++--- net/sctp/socket.c | 2 +- 21 files changed, 34 insertions(+), 34 deletions(-) diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index ea128e32e8ca8..e07f359254c3c 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -655,7 +655,7 @@ struct page *get_signal_page(void) PAGE_SIZE / sizeof(u32)); /* Give the signal return code some randomness */ - offset = 0x200 + (get_random_int() & 0x7fc); + offset = 0x200 + (get_random_u16() & 0x7fc); signal_return_offset = 
offset; /* Copy signal return handlers into the page */ diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index 733451fe7e41f..d72e8f23422da 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -67,7 +67,7 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, * * The resulting 5 bits of entropy is seen in SP[8:4]. */ - choose_random_kstack_offset(get_random_int() & 0x1FF); + choose_random_kstack_offset(get_random_u16() & 0x1FF); } static inline bool has_syscall_work(unsigned long flags) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index bff4833dbe7c8..bcd059caa1c81 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -927,7 +927,7 @@ static void generate_random_bytes(u8 *buf, size_t count) b = 0xff; break; default: - b = (u8)prandom_u32(); + b = get_random_u8(); break; } memset(buf, b, count); @@ -935,8 +935,8 @@ static void generate_random_bytes(u8 *buf, size_t count) break; case 2: /* Ascending or descending bytes, plus optional mutations */ - increment = (u8)prandom_u32(); - b = (u8)prandom_u32(); + increment = get_random_u8(); + b = get_random_u8(); for (i = 0; i < count; i++, b += increment) buf[i] = b; mutate_buffer(buf, count); @@ -944,7 +944,7 @@ static void generate_random_bytes(u8 *buf, size_t count) default: /* Fully random bytes */ for (i = 0; i < count; i++) - buf[i] = (u8)prandom_u32(); + buf[i] = get_random_u8(); } } diff --git a/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c b/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c index 9b7bcdce6e44e..303d02b1d71c9 100644 --- a/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c +++ b/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c @@ -870,7 +870,7 @@ static void precalculate_color(struct tpg_data *tpg, int k) g = tpg_colors[col].g; b = tpg_colors[col].b; } else if (tpg->pattern == TPG_PAT_NOISE) { - r = g = b = prandom_u32_max(256); + r = g = b = get_random_u8(); } else if (k == TPG_COLOR_RANDOM) { r = g = b = tpg->qual_offset + 
prandom_u32_max(196); } else if (k >= TPG_COLOR_RAMP) { diff --git a/drivers/media/test-drivers/vivid/vivid-radio-rx.c b/drivers/media/test-drivers/vivid/vivid-radio-rx.c index 232cab508f48b..8bd09589fb153 100644 --- a/drivers/media/test-drivers/vivid/vivid-radio-rx.c +++ b/drivers/media/test-drivers/vivid/vivid-radio-rx.c @@ -104,8 +104,8 @@ retry: break; case 2: rds.block |= V4L2_RDS_BLOCK_ERROR; - rds.lsb = prandom_u32_max(256); - rds.msb = prandom_u32_max(256); + rds.lsb = get_random_u8(); + rds.msb = get_random_u8(); break; case 3: /* Skip block altogether */ if (i) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index f90bfba4b3034..eda129d0143e7 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -1466,7 +1466,7 @@ static void make_established(struct sock *sk, u32 snd_isn, unsigned int opt) tp->write_seq = snd_isn; tp->snd_nxt = snd_isn; tp->snd_una = snd_isn; - inet_sk(sk)->inet_id = prandom_u32(); + inet_sk(sk)->inet_id = get_random_u16(); assign_rxopt(sk, opt); if (tp->rcv_wnd > (RCV_BUFSIZ_M << 10)) diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 7df78a721b04e..791b4a53d69fd 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -438,7 +438,7 @@ static int transmit(struct baycom_state *bc, int cnt, unsigned char stat) if ((--bc->hdlctx.slotcnt) > 0) return 0; bc->hdlctx.slotcnt = bc->ch_params.slottime; - if (prandom_u32_max(256) > bc->ch_params.ppersist) + if (get_random_u8() > bc->ch_params.ppersist) return 0; } } diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c index bef904325a0fb..2263029d1a20e 100644 --- a/drivers/net/hamradio/hdlcdrv.c +++ b/drivers/net/hamradio/hdlcdrv.c @@ -377,7 +377,7 @@ void hdlcdrv_arbitrate(struct net_device *dev, struct hdlcdrv_state *s) if 
((--s->hdlctx.slotcnt) > 0) return; s->hdlctx.slotcnt = s->ch_params.slottime; - if (prandom_u32_max(256) > s->ch_params.ppersist) + if (get_random_u8() > s->ch_params.ppersist) return; start_tx(dev, s); } diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index 97a6cc5c7ae89..2ed2f836f09af 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c @@ -626,7 +626,7 @@ static void yam_arbitrate(struct net_device *dev) yp->slotcnt = yp->slot / 10; /* is random > persist ? */ - if (prandom_u32_max(256) > yp->pers) + if (get_random_u8() > yp->pers) return; yam_start_tx(dev, yp); diff --git a/drivers/net/wireguard/selftest/allowedips.c b/drivers/net/wireguard/selftest/allowedips.c index 41db10f9be498..dd897c0740a28 100644 --- a/drivers/net/wireguard/selftest/allowedips.c +++ b/drivers/net/wireguard/selftest/allowedips.c @@ -310,7 +310,7 @@ static __init bool randomized_test(void) for (k = 0; k < 4; ++k) mutated[k] = (mutated[k] & mutate_mask[k]) | (~mutate_mask[k] & - prandom_u32_max(256)); + get_random_u8()); cidr = prandom_u32_max(32) + 1; peer = peers[prandom_u32_max(NUM_PEERS)]; if (wg_allowedips_insert_v4(&t, @@ -354,7 +354,7 @@ static __init bool randomized_test(void) for (k = 0; k < 4; ++k) mutated[k] = (mutated[k] & mutate_mask[k]) | (~mutate_mask[k] & - prandom_u32_max(256)); + get_random_u8()); cidr = prandom_u32_max(128) + 1; peer = peers[prandom_u32_max(NUM_PEERS)]; if (wg_allowedips_insert_v6(&t, diff --git a/drivers/net/wireless/st/cw1200/wsm.c b/drivers/net/wireless/st/cw1200/wsm.c index 5a3e7a626702d..4a9e4b5d3547a 100644 --- a/drivers/net/wireless/st/cw1200/wsm.c +++ b/drivers/net/wireless/st/cw1200/wsm.c @@ -1594,7 +1594,7 @@ static int cw1200_get_prio_queue(struct cw1200_common *priv, edca = &priv->edca.params[i]; score = ((edca->aifns + edca->cwmin) << 16) + ((edca->cwmax - edca->cwmin) * - (get_random_int() & 0xFFFF)); + get_random_u16()); if (score < best && (winner < 0 || i != 3)) { best = score; winner = i; diff 
--git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index c7f834ba8edbb..d38ebd7281b9b 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -2156,8 +2156,8 @@ lpfc_check_pending_fcoe_event(struct lpfc_hba *phba, uint8_t unreg_fcf) * This function makes an running random selection decision on FCF record to * use through a sequence of @fcf_cnt eligible FCF records with equal * probability. To perform integer manunipulation of random numbers with - * size unit32_t, the lower 16 bits of the 32-bit random number returned - * from prandom_u32() are taken as the random random number generated. + * size unit32_t, a 16-bit random number returned from get_random_u16() is + * taken as the random random number generated. * * Returns true when outcome is for the newly read FCF record should be * chosen; otherwise, return false when outcome is for keeping the previously @@ -2169,7 +2169,7 @@ lpfc_sli4_new_fcf_random_select(struct lpfc_hba *phba, uint32_t fcf_cnt) uint32_t rand_num; /* Get 16-bit uniform random number */ - rand_num = 0xFFFF & prandom_u32(); + rand_num = get_random_u16(); /* Decision with probability 1/fcf_cnt */ if ((fcf_cnt * rand_num) < 0xFFFF) diff --git a/lib/cmdline_kunit.c b/lib/cmdline_kunit.c index a72a2c16066ef..d4572dbc91453 100644 --- a/lib/cmdline_kunit.c +++ b/lib/cmdline_kunit.c @@ -76,7 +76,7 @@ static void cmdline_test_lead_int(struct kunit *test) int rc = cmdline_test_values[i]; int offset; - sprintf(in, "%u%s", get_random_int() % 256, str); + sprintf(in, "%u%s", get_random_u8(), str); /* Only first '-' after the number will advance the pointer */ offset = strlen(in) - strlen(str) + !!(rc == 2); cmdline_do_one_test(test, in, rc, offset); @@ -94,7 +94,7 @@ static void cmdline_test_tail_int(struct kunit *test) int rc = strcmp(str, "") ? (strcmp(str, "-") ? 
0 : 1) : 1; int offset; - sprintf(in, "%s%u", str, get_random_int() % 256); + sprintf(in, "%s%u", str, get_random_u8()); /* * Only first and leading '-' not followed by integer * will advance the pointer. diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 6a6e121dc00c0..713b7b8dad7e5 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -144,7 +144,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr, inet->inet_sport, inet->inet_dport); - inet->inet_id = prandom_u32(); + inet->inet_id = get_random_u16(); err = dccp_connect(sk); rt = NULL; @@ -443,7 +443,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt)); newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; - newinet->inet_id = prandom_u32(); + newinet->inet_id = get_random_u16(); if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) goto put_and_exit; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 405a8c2aea641..0ee7fd2597300 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -73,7 +73,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len reuseport_has_conns(sk, true); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); - inet->inet_id = prandom_u32(); + inet->inet_id = get_random_u16(); sk_dst_set(sk, &rt->dst); err = 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 1ae83ad629b25..922c87ef1ab58 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -172,7 +172,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, * Avoid using the hashed IP ident generator. 
*/ if (sk->sk_protocol == IPPROTO_TCP) - iph->id = (__force __be16)prandom_u32(); + iph->id = (__force __be16)get_random_u16(); else __ip_select_ident(net, iph, 1); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6376ad9157654..7a250ef9d1b7b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -323,7 +323,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr); } - inet->inet_id = prandom_u32(); + inet->inet_id = get_random_u16(); if (tcp_fastopen_defer_connect(sk, &err)) return err; @@ -1543,7 +1543,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; - newinet->inet_id = prandom_u32(); + newinet->inet_id = get_random_u16(); /* Set ToS of the new socket based upon the value of incoming SYN. * ECT bits are set later in tcp_init_transfer(). diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 0e8c4f48c36d7..dc3cdee51e660 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -641,7 +641,7 @@ static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) { struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - u16 sn = get_random_u32(); + u16 sn = get_random_u16(); info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; hdr->seq_ctrl = diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index d8e6380f63371..18319a6e68062 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -468,7 +468,7 @@ find_free_id: if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); else - off = prandom_u32(); + off = get_random_u16(); attempts = range_size; if (attempts > max_attempts) @@ -490,7 +490,7 @@ another_round: if (attempts >= range_size || attempts < 16) return; attempts /= 
2; - off = prandom_u32(); + off = get_random_u16(); goto another_round; } diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 55c6879d2c7e7..7193d25932ce2 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -2092,11 +2092,11 @@ retry: WARN_ON(host_load > CAKE_QUEUES); - /* The shifted prandom_u32() is a way to apply dithering to - * avoid accumulating roundoff errors + /* The get_random_u16() is a way to apply dithering to avoid + * accumulating roundoff errors */ flow->deficit += (b->flow_quantum * quantum_div[host_load] + - (prandom_u32() >> 16)) >> 16; + get_random_u16()) >> 16; list_move_tail(&flow->flowchain, &b->old_flows); goto retry; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 1e354ba449607..83628c347744b 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9448,7 +9448,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newinet->inet_rcv_saddr = inet->inet_rcv_saddr; newinet->inet_dport = htons(asoc->peer.port); newinet->pmtudisc = inet->pmtudisc; - newinet->inet_id = prandom_u32(); + newinet->inet_id = get_random_u16(); newinet->uc_ttl = inet->uc_ttl; newinet->mc_loop = 1; -- GitLab From f743f16c548b1a2633e8b6034058d6475d7f26a3 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" <Jason@zx2c4.com> Date: Wed, 5 Oct 2022 17:23:53 +0200 Subject: [PATCH 1754/2223] treewide: use get_random_{u8,u16}() when possible, part 2 Rather than truncate a 32-bit value to a 16-bit value or an 8-bit value, simply use the get_random_{u8,u16}() functions, which are faster than wasting the additional bytes from a 32-bit value. This was done by hand, identifying all of the places where one of the random integer functions was used in a non-32-bit context. Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Yury Norov <yury.norov@gmail.com> Acked-by: Jakub Kicinski <kuba@kernel.org> Acked-by: Heiko Carstens <hca@linux.ibm.com> # for s390 Signed-off-by: Jason A. 
Donenfeld <Jason@zx2c4.com> --- arch/s390/kernel/process.c | 2 +- drivers/mtd/nand/raw/nandsim.c | 2 +- drivers/net/wireless/broadcom/brcm80211/brcmfmac/pno.c | 2 +- lib/test_vmalloc.c | 2 +- net/rds/bind.c | 2 +- net/sched/sch_sfb.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 5ec78555dd2e5..42af4b3aa02b8 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -230,7 +230,7 @@ unsigned long arch_align_stack(unsigned long sp) static inline unsigned long brk_rnd(void) { - return (get_random_int() & BRK_RND_MASK) << PAGE_SHIFT; + return (get_random_u16() & BRK_RND_MASK) << PAGE_SHIFT; } unsigned long arch_randomize_brk(struct mm_struct *mm) diff --git a/drivers/mtd/nand/raw/nandsim.c b/drivers/mtd/nand/raw/nandsim.c index 50bcf745e8164..d211939c8bdd5 100644 --- a/drivers/mtd/nand/raw/nandsim.c +++ b/drivers/mtd/nand/raw/nandsim.c @@ -1402,7 +1402,7 @@ static int ns_do_read_error(struct nandsim *ns, int num) static void ns_do_bit_flips(struct nandsim *ns, int num) { - if (bitflips && prandom_u32() < (1 << 22)) { + if (bitflips && get_random_u16() < (1 << 6)) { int flips = 1; if (bitflips > 1) flips = prandom_u32_max(bitflips) + 1; diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pno.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pno.c index d0a7465be586d..170c61c8136cc 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pno.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pno.c @@ -177,7 +177,7 @@ static int brcmf_pno_set_random(struct brcmf_if *ifp, struct brcmf_pno_info *pi) memcpy(pfn_mac.mac, mac_addr, ETH_ALEN); for (i = 0; i < ETH_ALEN; i++) { pfn_mac.mac[i] &= mac_mask[i]; - pfn_mac.mac[i] |= get_random_int() & ~(mac_mask[i]); + pfn_mac.mac[i] |= get_random_u8() & ~(mac_mask[i]); } /* Clear multi bit */ pfn_mac.mac[0] &= 0xFE; diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index a26bbbf20e62d..cf7780572f5b4 100644 
--- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -80,7 +80,7 @@ static int random_size_align_alloc_test(void) int i; for (i = 0; i < test_loop_count; i++) { - rnd = prandom_u32(); + rnd = get_random_u8(); /* * Maximum 1024 pages, if PAGE_SIZE is 4096. diff --git a/net/rds/bind.c b/net/rds/bind.c index 5b5fb4ca8d3e5..97a29172a8eec 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -104,7 +104,7 @@ static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr, return -EINVAL; last = rover; } else { - rover = max_t(u16, prandom_u32(), 2); + rover = max_t(u16, get_random_u16(), 2); last = rover - 1; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index e2389fa3cff8a..0366a1a029a9e 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -379,7 +379,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, goto enqueue; } - r = prandom_u32() & SFB_MAX_PROB; + r = get_random_u16() & SFB_MAX_PROB; if (unlikely(r < p_min)) { if (unlikely(p_min > SFB_MAX_PROB / 2)) { -- GitLab From a251c17aa558d8e3128a528af5cf8b9d7caae4fd Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" <Jason@zx2c4.com> Date: Wed, 5 Oct 2022 17:43:22 +0200 Subject: [PATCH 1755/2223] treewide: use get_random_u32() when possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prandom_u32() function has been a deprecated inline wrapper around get_random_u32() for several releases now, and compiles down to the exact same code. Replace the deprecated wrapper with a direct call to the real function. The same also applies to get_random_int(), which is just a wrapper around get_random_u32(). This was done as a basic find and replace. 
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Yury Norov <yury.norov@gmail.com> Reviewed-by: Jan Kara <jack@suse.cz> # for ext4 Acked-by: Toke Høiland-Jørgensen <toke@toke.dk> # for sch_cake Acked-by: Chuck Lever <chuck.lever@oracle.com> # for nfsd Acked-by: Jakub Kicinski <kuba@kernel.org> Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com> # for thunderbolt Acked-by: Darrick J. Wong <djwong@kernel.org> # for xfs Acked-by: Helge Deller <deller@gmx.de> # for parisc Acked-by: Heiko Carstens <hca@linux.ibm.com> # for s390 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> --- Documentation/networking/filter.rst | 2 +- arch/parisc/kernel/process.c | 2 +- arch/parisc/kernel/sys_parisc.c | 4 ++-- arch/s390/mm/mmap.c | 2 +- arch/x86/kernel/cpu/amd.c | 2 +- drivers/gpu/drm/i915/i915_gem_gtt.c | 6 +++--- drivers/gpu/drm/i915/selftests/i915_selftest.c | 2 +- drivers/gpu/drm/tests/drm_buddy_test.c | 2 +- drivers/gpu/drm/tests/drm_mm_test.c | 2 +- drivers/infiniband/hw/cxgb4/cm.c | 4 ++-- drivers/infiniband/hw/hfi1/tid_rdma.c | 2 +- drivers/infiniband/hw/mlx4/mad.c | 2 +- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 2 +- drivers/md/raid5-cache.c | 2 +- .../media/test-drivers/vivid/vivid-touch-cap.c | 4 ++-- drivers/misc/habanalabs/gaudi2/gaudi2.c | 2 +- drivers/net/bonding/bond_main.c | 2 +- drivers/net/ethernet/broadcom/cnic.c | 2 +- .../chelsio/inline_crypto/chtls/chtls_cm.c | 2 +- drivers/net/ethernet/rocker/rocker_main.c | 6 +++--- .../net/wireless/marvell/mwifiex/cfg80211.c | 4 ++-- .../net/wireless/microchip/wilc1000/cfg80211.c | 2 +- .../net/wireless/quantenna/qtnfmac/cfg80211.c | 2 +- drivers/net/wireless/ti/wlcore/main.c | 2 +- drivers/nvme/common/auth.c | 2 +- drivers/scsi/cxgbi/cxgb4i/cxgb4i.c | 4 ++-- drivers/target/iscsi/cxgbit/cxgbit_cm.c | 2 +- drivers/thunderbolt/xdomain.c | 2 +- drivers/video/fbdev/uvesafb.c | 2 +- fs/exfat/inode.c | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/ioctl.c | 
4 ++-- fs/ext4/mmp.c | 2 +- fs/f2fs/namei.c | 2 +- fs/fat/inode.c | 2 +- fs/nfsd/nfs4state.c | 4 ++-- fs/ntfs3/fslog.c | 6 +++--- fs/ubifs/journal.c | 2 +- fs/xfs/libxfs/xfs_ialloc.c | 2 +- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_log.c | 2 +- include/net/netfilter/nf_queue.h | 2 +- include/net/red.h | 2 +- include/net/sock.h | 2 +- kernel/bpf/bloom_filter.c | 2 +- kernel/bpf/core.c | 2 +- kernel/bpf/hashtab.c | 2 +- kernel/bpf/verifier.c | 2 +- kernel/kcsan/selftest.c | 2 +- lib/random32.c | 2 +- lib/reed_solomon/test_rslib.c | 6 +++--- lib/test_fprobe.c | 2 +- lib/test_kprobes.c | 2 +- lib/test_min_heap.c | 6 +++--- lib/test_rhashtable.c | 6 +++--- mm/shmem.c | 2 +- mm/slab.c | 2 +- net/core/pktgen.c | 4 ++-- net/ipv4/route.c | 2 +- net/ipv4/tcp_cdg.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv6/ip6_flowlabel.c | 2 +- net/ipv6/output_core.c | 2 +- net/netfilter/ipvs/ip_vs_conn.c | 2 +- net/netfilter/xt_statistic.c | 2 +- net/openvswitch/actions.c | 2 +- net/sched/sch_cake.c | 2 +- net/sched/sch_netem.c | 18 +++++++++--------- net/sunrpc/auth_gss/gss_krb5_wrap.c | 4 ++-- net/sunrpc/xprt.c | 2 +- net/unix/af_unix.c | 2 +- 71 files changed, 100 insertions(+), 100 deletions(-) diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index 43cdc4d34745c..f69da50748609 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -305,7 +305,7 @@ Possible BPF extensions are shown in the following table: vlan_tci skb_vlan_tag_get(skb) vlan_avail skb_vlan_tag_present(skb) vlan_tpid skb->vlan_proto - rand prandom_u32() + rand get_random_u32() =================================== ================================================= These extensions can also be prefixed with '#'. 
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 7c37e09c92da6..18c4f0e3e906c 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -288,7 +288,7 @@ __get_wchan(struct task_struct *p) static inline unsigned long brk_rnd(void) { - return (get_random_int() & BRK_RND_MASK) << PAGE_SHIFT; + return (get_random_u32() & BRK_RND_MASK) << PAGE_SHIFT; } unsigned long arch_randomize_brk(struct mm_struct *mm) diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index 2b34294517a15..848b0702005d6 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -239,14 +239,14 @@ static unsigned long mmap_rnd(void) unsigned long rnd = 0; if (current->flags & PF_RANDOMIZE) - rnd = get_random_int() & MMAP_RND_MASK; + rnd = get_random_u32() & MMAP_RND_MASK; return rnd << PAGE_SHIFT; } unsigned long arch_mmap_rnd(void) { - return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT; + return (get_random_u32() & MMAP_RND_MASK) << PAGE_SHIFT; } static unsigned long mmap_legacy_base(void) diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 5980ce3488325..3327c47bc1814 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -37,7 +37,7 @@ static inline int mmap_is_legacy(struct rlimit *rlim_stack) unsigned long arch_mmap_rnd(void) { - return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT; + return (get_random_u32() & MMAP_RND_MASK) << PAGE_SHIFT; } static unsigned long mmap_base_legacy(unsigned long rnd) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 48276c0e479d8..860b60273df3f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -503,7 +503,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; /* A random value per boot for bit slice [12:upper_bit) */ - va_align.bits = get_random_int() & va_align.mask; + va_align.bits = get_random_u32() & va_align.mask; } if (cpu_has(c, 
X86_FEATURE_MWAITX)) diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 329ff75b80b97..7bd1861ddbdfb 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -137,12 +137,12 @@ static u64 random_offset(u64 start, u64 end, u64 len, u64 align) range = round_down(end - len, align) - round_up(start, align); if (range) { if (sizeof(unsigned long) == sizeof(u64)) { - addr = get_random_long(); + addr = get_random_u64(); } else { - addr = get_random_int(); + addr = get_random_u32(); if (range > U32_MAX) { addr <<= 32; - addr |= get_random_int(); + addr |= get_random_u32(); } } div64_u64_rem(addr, range, &addr); diff --git a/drivers/gpu/drm/i915/selftests/i915_selftest.c b/drivers/gpu/drm/i915/selftests/i915_selftest.c index c4e932368b37e..39da0fb0d6d26 100644 --- a/drivers/gpu/drm/i915/selftests/i915_selftest.c +++ b/drivers/gpu/drm/i915/selftests/i915_selftest.c @@ -135,7 +135,7 @@ static int __run_selftests(const char *name, int err = 0; while (!i915_selftest.random_seed) - i915_selftest.random_seed = get_random_int(); + i915_selftest.random_seed = get_random_u32(); i915_selftest.timeout_jiffies = i915_selftest.timeout_ms ? 
diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c b/drivers/gpu/drm/tests/drm_buddy_test.c index 7a2b2d6bc3fe9..62f69589a72d3 100644 --- a/drivers/gpu/drm/tests/drm_buddy_test.c +++ b/drivers/gpu/drm/tests/drm_buddy_test.c @@ -729,7 +729,7 @@ static void drm_test_buddy_alloc_limit(struct kunit *test) static int drm_buddy_init_test(struct kunit *test) { while (!random_seed) - random_seed = get_random_int(); + random_seed = get_random_u32(); return 0; } diff --git a/drivers/gpu/drm/tests/drm_mm_test.c b/drivers/gpu/drm/tests/drm_mm_test.c index 659d1af4dca78..c4b66eeae2039 100644 --- a/drivers/gpu/drm/tests/drm_mm_test.c +++ b/drivers/gpu/drm/tests/drm_mm_test.c @@ -2212,7 +2212,7 @@ err_nodes: static int drm_mm_init_test(struct kunit *test) { while (!random_seed) - random_seed = get_random_int(); + random_seed = get_random_u32(); return 0; } diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 14392c942f492..499a425a33791 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -734,7 +734,7 @@ static int send_connect(struct c4iw_ep *ep) &ep->com.remote_addr; int ret; enum chip_type adapter_type = ep->com.dev->rdev.lldi.adapter_type; - u32 isn = (prandom_u32() & ~7UL) - 1; + u32 isn = (get_random_u32() & ~7UL) - 1; struct net_device *netdev; u64 params; @@ -2469,7 +2469,7 @@ static int accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, } if (!is_t4(adapter_type)) { - u32 isn = (prandom_u32() & ~7UL) - 1; + u32 isn = (get_random_u32() & ~7UL) - 1; skb = get_skb(skb, roundup(sizeof(*rpl5), 16), GFP_KERNEL); rpl5 = __skb_put_zero(skb, roundup(sizeof(*rpl5), 16)); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 2a7abf7a1f7fb..18b05ffb415a3 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -850,7 +850,7 @@ void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd) int i; for (i = 0; i < RXE_NUM_TID_FLOWS; 
i++) { - rcd->flows[i].generation = mask_generation(prandom_u32()); + rcd->flows[i].generation = mask_generation(get_random_u32()); kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i); } } diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index d13ecbdd43917..a37cfac5e23f9 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -96,7 +96,7 @@ static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, __be64 mlx4_ib_gen_node_guid(void) { #define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40)) - return cpu_to_be64(NODE_GUID_HI | prandom_u32()); + return cpu_to_be64(NODE_GUID_HI | get_random_u32()); } __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index ebb35b809f26e..b610d36295bb2 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -465,7 +465,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, goto err_qp; } - psn = prandom_u32() & 0xffffff; + psn = get_random_u32() & 0xffffff; ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn); if (ret) goto err_modify; diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 79c73330020b1..832d8566e1656 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2994,7 +2994,7 @@ static int r5l_load_log(struct r5l_log *log) } create: if (create_super) { - log->last_cp_seq = prandom_u32(); + log->last_cp_seq = get_random_u32(); cp = 0; r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq); /* diff --git a/drivers/media/test-drivers/vivid/vivid-touch-cap.c b/drivers/media/test-drivers/vivid/vivid-touch-cap.c index 792660a85bc11..6cc32eb54f9d0 100644 --- a/drivers/media/test-drivers/vivid/vivid-touch-cap.c +++ b/drivers/media/test-drivers/vivid/vivid-touch-cap.c @@ -210,7 +210,7 @@ static void vivid_fill_buff_noise(__s16 *tch_buf, int size) /* Fill 10% of the 
values within range -3 and 3, zero the others */ for (i = 0; i < size; i++) { - unsigned int rand = get_random_int(); + unsigned int rand = get_random_u32(); if (rand % 10) tch_buf[i] = 0; @@ -272,7 +272,7 @@ void vivid_fillbuff_tch(struct vivid_dev *dev, struct vivid_buffer *buf) return; if (test_pat_idx == 0) - dev->tch_pat_random = get_random_int(); + dev->tch_pat_random = get_random_u32(); rand = dev->tch_pat_random; switch (test_pattern) { diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index 75c4bef7841cb..65e6cae6100a4 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -2948,7 +2948,7 @@ static void gaudi2_user_interrupt_setup(struct hl_device *hdev) static inline int gaudi2_get_non_zero_random_int(void) { - int rand = get_random_int(); + int rand = get_random_u32(); return rand ? rand : 1; } diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 24bb50dfd362a..e84c49bf4d0c3 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4806,7 +4806,7 @@ static u32 bond_rr_gen_slave_id(struct bonding *bond) switch (packets_per_slave) { case 0: - slave_id = prandom_u32(); + slave_id = get_random_u32(); break; case 1: slave_id = this_cpu_inc_return(*bond->rr_tx_counter); diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c index f597b313acaa3..2198e35d9e181 100644 --- a/drivers/net/ethernet/broadcom/cnic.c +++ b/drivers/net/ethernet/broadcom/cnic.c @@ -4164,7 +4164,7 @@ static int cnic_cm_init_bnx2_hw(struct cnic_dev *dev) { u32 seed; - seed = prandom_u32(); + seed = get_random_u32(); cnic_ctx_wr(dev, 45, 0, seed); return 0; } diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index eda129d0143e7..c2e7037c7ba1c 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ 
b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -1063,7 +1063,7 @@ static void chtls_pass_accept_rpl(struct sk_buff *skb, opt2 |= WND_SCALE_EN_V(WSCALE_OK(tp)); rpl5->opt0 = cpu_to_be64(opt0); rpl5->opt2 = cpu_to_be32(opt2); - rpl5->iss = cpu_to_be32((prandom_u32() & ~7UL) - 1); + rpl5->iss = cpu_to_be32((get_random_u32() & ~7UL) - 1); set_wr_txq(skb, CPL_PRIORITY_SETUP, csk->port_id); t4_set_arp_err_handler(skb, sk, chtls_accept_rpl_arp_failure); cxgb4_l2t_send(csk->egress_dev, skb, csk->l2t_entry); diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index 023682cd27687..5672d952452fd 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -129,7 +129,7 @@ static int rocker_reg_test(const struct rocker *rocker) u64 test_reg; u64 rnd; - rnd = prandom_u32(); + rnd = get_random_u32(); rnd >>= 1; rocker_write32(rocker, TEST_REG, rnd); test_reg = rocker_read32(rocker, TEST_REG); @@ -139,9 +139,9 @@ static int rocker_reg_test(const struct rocker *rocker) return -EIO; } - rnd = prandom_u32(); + rnd = get_random_u32(); rnd <<= 31; - rnd |= prandom_u32(); + rnd |= get_random_u32(); rocker_write64(rocker, TEST_REG64, rnd); test_reg = rocker_read64(rocker, TEST_REG64); if (test_reg != rnd * 2) { diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index 535995e8279f4..bcd564dc3554a 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -239,7 +239,7 @@ mwifiex_cfg80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, tx_info->pkt_len = pkt_len; mwifiex_form_mgmt_frame(skb, buf, len); - *cookie = prandom_u32() | 1; + *cookie = get_random_u32() | 1; if (ieee80211_is_action(mgmt->frame_control)) skb = mwifiex_clone_skb_for_tx_status(priv, @@ -303,7 +303,7 @@ mwifiex_cfg80211_remain_on_channel(struct wiphy *wiphy, duration); if (!ret) { - 
*cookie = prandom_u32() | 1; + *cookie = get_random_u32() | 1; priv->roc_cfg.cookie = *cookie; priv->roc_cfg.chan = *chan; diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c index b89047965e78e..9bbfff8033578 100644 --- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c +++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c @@ -1161,7 +1161,7 @@ static int mgmt_tx(struct wiphy *wiphy, const u8 *vendor_ie; int ret = 0; - *cookie = prandom_u32(); + *cookie = get_random_u32(); priv->tx_cookie = *cookie; mgmt = (const struct ieee80211_mgmt *)buf; diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c index bfdf03bfa6c57..73e6f9408b515 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c @@ -449,7 +449,7 @@ qtnf_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, { struct qtnf_vif *vif = qtnf_netdev_get_priv(wdev->netdev); const struct ieee80211_mgmt *mgmt_frame = (void *)params->buf; - u32 short_cookie = prandom_u32(); + u32 short_cookie = get_random_u32(); u16 flags = 0; u16 freq; diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index 3e3922d4c7880..28c0f06e311f7 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -6100,7 +6100,7 @@ static int wl1271_register_hw(struct wl1271 *wl) wl1271_warning("Fuse mac address is zero. 
using random mac"); /* Use TI oui and a random nic */ oui_addr = WLCORE_TI_OUI_ADDRESS; - nic_addr = get_random_int(); + nic_addr = get_random_u32(); } else { oui_addr = wl->fuse_oui_addr; /* fuse has the BD_ADDR, the WLAN addresses are the next two */ diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 04bd28f17dcce..d90e4f0c08b7b 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -23,7 +23,7 @@ u32 nvme_auth_get_seqnum(void) mutex_lock(&nvme_dhchap_mutex); if (!nvme_dhchap_seqnum) - nvme_dhchap_seqnum = prandom_u32(); + nvme_dhchap_seqnum = get_random_u32(); else { nvme_dhchap_seqnum++; if (!nvme_dhchap_seqnum) diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c index 53d91bf9c12a8..c07d2e3b4bcff 100644 --- a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c +++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c @@ -254,7 +254,7 @@ static void send_act_open_req(struct cxgbi_sock *csk, struct sk_buff *skb, } else if (is_t5(lldi->adapter_type)) { struct cpl_t5_act_open_req *req = (struct cpl_t5_act_open_req *)skb->head; - u32 isn = (prandom_u32() & ~7UL) - 1; + u32 isn = (get_random_u32() & ~7UL) - 1; INIT_TP_WR(req, 0); OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, @@ -282,7 +282,7 @@ static void send_act_open_req(struct cxgbi_sock *csk, struct sk_buff *skb, } else { struct cpl_t6_act_open_req *req = (struct cpl_t6_act_open_req *)skb->head; - u32 isn = (prandom_u32() & ~7UL) - 1; + u32 isn = (get_random_u32() & ~7UL) - 1; INIT_TP_WR(req, 0); OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, diff --git a/drivers/target/iscsi/cxgbit/cxgbit_cm.c b/drivers/target/iscsi/cxgbit/cxgbit_cm.c index 3336d2b78bf77..d9204c590d9ab 100644 --- a/drivers/target/iscsi/cxgbit/cxgbit_cm.c +++ b/drivers/target/iscsi/cxgbit/cxgbit_cm.c @@ -1202,7 +1202,7 @@ cxgbit_pass_accept_rpl(struct cxgbit_sock *csk, struct cpl_pass_accept_req *req) opt2 |= CONG_CNTRL_V(CONG_ALG_NEWRENO); opt2 |= T5_ISS_F; - rpl5->iss 
= cpu_to_be32((prandom_u32() & ~7UL) - 1); + rpl5->iss = cpu_to_be32((get_random_u32() & ~7UL) - 1); opt2 |= T5_OPT_2_VALID_F; diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c index bbb248a2686fb..f00b2f62d8e3c 100644 --- a/drivers/thunderbolt/xdomain.c +++ b/drivers/thunderbolt/xdomain.c @@ -2437,7 +2437,7 @@ int tb_xdomain_init(void) tb_property_add_immediate(xdomain_property_dir, "deviceid", 0x1); tb_property_add_immediate(xdomain_property_dir, "devicerv", 0x80000100); - xdomain_property_block_gen = prandom_u32(); + xdomain_property_block_gen = get_random_u32(); return 0; } diff --git a/drivers/video/fbdev/uvesafb.c b/drivers/video/fbdev/uvesafb.c index fd5d701106e1d..00d789b6c0faf 100644 --- a/drivers/video/fbdev/uvesafb.c +++ b/drivers/video/fbdev/uvesafb.c @@ -167,7 +167,7 @@ static int uvesafb_exec(struct uvesafb_ktask *task) memcpy(&m->id, &uvesafb_cn_id, sizeof(m->id)); m->seq = seq; m->len = len; - m->ack = prandom_u32(); + m->ack = get_random_u32(); /* uvesafb_task structure */ memcpy(m + 1, &task->t, sizeof(task->t)); diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index a795437b86d06..5590a1e83126c 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -552,7 +552,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); if (info->attr & ATTR_SUBDIR) { /* directory */ inode->i_generation &= ~1; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 7575aa3596751..e9bc46684106b 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1279,7 +1279,7 @@ got: EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto out; } - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 
4d49c5cfb690f..ded535535b27b 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -454,8 +454,8 @@ static long swap_inode_boot_loader(struct super_block *sb, inode->i_ctime = inode_bl->i_ctime = current_time(inode); inode_inc_iversion(inode); - inode->i_generation = prandom_u32(); - inode_bl->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); + inode_bl->i_generation = get_random_u32(); ext4_reset_inode_seed(inode); ext4_reset_inode_seed(inode_bl); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 9af68a7ecdcf3..588cb09c5291f 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -265,7 +265,7 @@ static unsigned int mmp_new_seq(void) u32 new_seq; do { - new_seq = prandom_u32(); + new_seq = get_random_u32(); } while (new_seq > EXT4_MMP_SEQ_MAX); return new_seq; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index d5065a5af1f8a..a389772fd212a 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -50,7 +50,7 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); F2FS_I(inode)->i_crtime = inode->i_mtime; - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); if (S_ISDIR(inode->i_mode)) F2FS_I(inode)->i_current_depth = 1; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a38238d75c08e..1cbcc4608dc78 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -523,7 +523,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) { inode->i_generation &= ~1; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 198d7abf34e45..4e718500a00c4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4375,8 +4375,8 @@ nfsd4_init_leases_net(struct nfsd_net *nn) nn->nfsd4_grace = 90; 
nn->somebody_reclaimed = false; nn->track_reclaim_completes = false; - nn->clverifier_counter = prandom_u32(); - nn->clientid_base = prandom_u32(); + nn->clverifier_counter = get_random_u32(); + nn->clientid_base = get_random_u32(); nn->clientid_counter = nn->clientid_base + 1; nn->s2s_cp_cl_id = nn->clientid_counter++; diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index e7c494005122c..0d611a6c5511f 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -3819,7 +3819,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) } log_init_pg_hdr(log, page_size, page_size, 1, 1); - log_create(log, l_size, 0, get_random_int(), false, false); + log_create(log, l_size, 0, get_random_u32(), false, false); log->ra = ra; @@ -3893,7 +3893,7 @@ check_restart_area: /* Do some checks based on whether we have a valid log page. */ if (!rst_info.valid_page) { - open_log_count = get_random_int(); + open_log_count = get_random_u32(); goto init_log_instance; } open_log_count = le32_to_cpu(ra2->open_log_count); @@ -4044,7 +4044,7 @@ find_oldest: memcpy(ra->clients, Add2Ptr(ra2, t16), le16_to_cpu(ra2->ra_len) - t16); - log->current_openlog_count = get_random_int(); + log->current_openlog_count = get_random_u32(); ra->open_log_count = cpu_to_le32(log->current_openlog_count); log->ra_size = offsetof(struct RESTART_AREA, clients) + sizeof(struct CLIENT_REC); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 75dab0ae3939d..4619652046cf8 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -503,7 +503,7 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) { if (c->double_hash) - dent->cookie = (__force __le32) prandom_u32(); + dent->cookie = (__force __le32) get_random_u32(); else dent->cookie = 0; } diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 7838b31126e22..94db50eb706ac 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ 
b/fs/xfs/libxfs/xfs_ialloc.c @@ -805,7 +805,7 @@ sparse_alloc: * number from being easily guessable. */ error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno, - args.agbno, args.len, prandom_u32()); + args.agbno, args.len, get_random_u32()); if (error) return error; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 2bbe7916a998d..eae7427062cf9 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -596,7 +596,7 @@ xfs_iget_cache_miss( */ if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) { - VFS_I(ip)->i_generation = prandom_u32(); + VFS_I(ip)->i_generation = get_random_u32(); } else { struct xfs_buf *bp; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f6e7e4fd72ae7..f02a0dd522b3d 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3544,7 +3544,7 @@ xlog_ticket_alloc( tic->t_curr_res = unit_res; tic->t_cnt = cnt; tic->t_ocnt = cnt; - tic->t_tid = prandom_u32(); + tic->t_tid = get_random_u32(); if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 980daa6e1e3aa..c81021ab07aa1 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -43,7 +43,7 @@ void nf_queue_entry_free(struct nf_queue_entry *entry); static inline void init_hashrandom(u32 *jhash_initval) { while (*jhash_initval == 0) - *jhash_initval = prandom_u32(); + *jhash_initval = get_random_u32(); } static inline u32 hash_v4(const struct iphdr *iph, u32 initval) diff --git a/include/net/red.h b/include/net/red.h index 454ac2b65d8ca..425364de0df79 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -363,7 +363,7 @@ static inline unsigned long red_calc_qavg(const struct red_parms *p, static inline u32 red_random(const struct red_parms *p) { - return reciprocal_divide(prandom_u32(), p->max_P_reciprocal); + return reciprocal_divide(get_random_u32(), p->max_P_reciprocal); } static inline int red_mark_probability(const 
struct red_parms *p, diff --git a/include/net/sock.h b/include/net/sock.h index 08038a385ef21..9e464f6409a71 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2109,7 +2109,7 @@ static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) static inline u32 net_tx_rndhash(void) { - u32 v = prandom_u32(); + u32 v = get_random_u32(); return v ?: 1; } diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index b9ea539a55614..48ee750849f25 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -158,7 +158,7 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) attr->value_size / sizeof(u32); if (!(attr->map_flags & BPF_F_ZERO_SEED)) - bloom->hash_seed = get_random_int(); + bloom->hash_seed = get_random_u32(); return &bloom->map; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 53c6c98bda7b6..25a54e04560e5 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1216,7 +1216,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, bool emit_zext) { struct bpf_insn *to = to_buff; - u32 imm_rnd = get_random_int(); + u32 imm_rnd = get_random_u32(); s16 off; BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index ed3f8a53603b9..f39ee3e055897 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -527,7 +527,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; else - htab->hashrnd = get_random_int(); + htab->hashrnd = get_random_u32(); htab_init_buckets(htab); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6f6d2d511c06f..014ee0953dbde 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13350,7 +13350,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, aux[adj_idx].ptr_type == PTR_TO_CTX) continue; - imm_rnd = get_random_int(); + imm_rnd = get_random_u32(); rnd_hi32_patch[0] = insn; 
rnd_hi32_patch[1].imm = imm_rnd; rnd_hi32_patch[3].dst_reg = load_reg; diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index 75712959c84e0..58b94deae5c0b 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -26,7 +26,7 @@ static bool __init test_requires(void) { /* random should be initialized for the below tests */ - return prandom_u32() + prandom_u32() != 0; + return get_random_u32() + get_random_u32() != 0; } /* diff --git a/lib/random32.c b/lib/random32.c index d5d9029362cbb..d4f19e1a69d4e 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -47,7 +47,7 @@ * @state: pointer to state structure holding seeded state. * * This is used for pseudo-randomness with no outside seeding. - * For more random results, use prandom_u32(). + * For more random results, use get_random_u32(). */ u32 prandom_u32_state(struct rnd_state *state) { diff --git a/lib/reed_solomon/test_rslib.c b/lib/reed_solomon/test_rslib.c index 4d241bdc88aa8..848e7eb5da921 100644 --- a/lib/reed_solomon/test_rslib.c +++ b/lib/reed_solomon/test_rslib.c @@ -164,7 +164,7 @@ static int get_rcw_we(struct rs_control *rs, struct wspace *ws, /* Load c with random data and encode */ for (i = 0; i < dlen; i++) - c[i] = prandom_u32() & nn; + c[i] = get_random_u32() & nn; memset(c + dlen, 0, nroots * sizeof(*c)); encode_rs16(rs, c, dlen, c + dlen, 0); @@ -178,7 +178,7 @@ static int get_rcw_we(struct rs_control *rs, struct wspace *ws, for (i = 0; i < errs; i++) { do { /* Error value must be nonzero */ - errval = prandom_u32() & nn; + errval = get_random_u32() & nn; } while (errval == 0); do { @@ -206,7 +206,7 @@ static int get_rcw_we(struct rs_control *rs, struct wspace *ws, /* Erasure with corrupted symbol */ do { /* Error value must be nonzero */ - errval = prandom_u32() & nn; + errval = get_random_u32() & nn; } while (errval == 0); errlocs[errloc] = 1; diff --git a/lib/test_fprobe.c b/lib/test_fprobe.c index ed70637a2ffa4..e0381b3ec410c 100644 --- a/lib/test_fprobe.c +++ 
b/lib/test_fprobe.c @@ -145,7 +145,7 @@ static unsigned long get_ftrace_location(void *func) static int fprobe_test_init(struct kunit *test) { do { - rand1 = prandom_u32(); + rand1 = get_random_u32(); } while (rand1 <= div_factor); target = fprobe_selftest_target; diff --git a/lib/test_kprobes.c b/lib/test_kprobes.c index a5edc2ebc947a..eeb1d728d9746 100644 --- a/lib/test_kprobes.c +++ b/lib/test_kprobes.c @@ -341,7 +341,7 @@ static int kprobes_test_init(struct kunit *test) stacktrace_driver = kprobe_stacktrace_driver; do { - rand1 = prandom_u32(); + rand1 = get_random_u32(); } while (rand1 <= div_factor); return 0; } diff --git a/lib/test_min_heap.c b/lib/test_min_heap.c index d19c8080fd4d1..7b01b4387cfbc 100644 --- a/lib/test_min_heap.c +++ b/lib/test_min_heap.c @@ -83,7 +83,7 @@ static __init int test_heapify_all(bool min_heap) /* Test with randomly generated values. */ heap.nr = ARRAY_SIZE(values); for (i = 0; i < heap.nr; i++) - values[i] = get_random_int(); + values[i] = get_random_u32(); min_heapify_all(&heap, &funcs); err += pop_verify_heap(min_heap, &heap, &funcs); @@ -116,7 +116,7 @@ static __init int test_heap_push(bool min_heap) /* Test with randomly generated values. */ while (heap.nr < heap.size) { - temp = get_random_int(); + temp = get_random_u32(); min_heap_push(&heap, &temp, &funcs); } err += pop_verify_heap(min_heap, &heap, &funcs); @@ -158,7 +158,7 @@ static __init int test_heap_pop_push(bool min_heap) /* Test with randomly generated values. 
*/ for (i = 0; i < ARRAY_SIZE(data); i++) { - temp = get_random_int(); + temp = get_random_u32(); min_heap_pop_push(&heap, &temp, &funcs); } err += pop_verify_heap(min_heap, &heap, &funcs); diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 5a1dd4736b56f..b358a74ed7ed8 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -291,7 +291,7 @@ static int __init test_rhltable(unsigned int entries) if (WARN_ON(err)) goto out_free; - k = prandom_u32(); + k = get_random_u32(); ret = 0; for (i = 0; i < entries; i++) { rhl_test_objects[i].value.id = k; @@ -369,12 +369,12 @@ static int __init test_rhltable(unsigned int entries) pr_info("test %d random rhlist add/delete operations\n", entries); for (j = 0; j < entries; j++) { u32 i = prandom_u32_max(entries); - u32 prand = prandom_u32(); + u32 prand = get_random_u32(); cond_resched(); if (prand == 0) - prand = prandom_u32(); + prand = get_random_u32(); if (prand & 1) { prand >>= 1; diff --git a/mm/shmem.c b/mm/shmem.c index 86214d48dd099..8280a5cb48dfc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2332,7 +2332,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); diff --git a/mm/slab.c b/mm/slab.c index a5486ff8362a1..60cd19b9ee047 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2380,7 +2380,7 @@ static bool freelist_state_initialize(union freelist_init_state *state, unsigned int rand; /* Use best entropy available to define a random shift */ - rand = get_random_int(); + rand = get_random_u32(); /* Use a random state if the pre-computed list is not available */ if (!cachep->random_seq) { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 
5ca4f953034ca..c3763056c554a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2464,7 +2464,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) for (i = 0; i < pkt_dev->nr_labels; i++) if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) pkt_dev->labels[i] = MPLS_STACK_BOTTOM | - ((__force __be32)prandom_u32() & + ((__force __be32)get_random_u32() & htonl(0x000fffff)); } @@ -2568,7 +2568,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) for (i = 0; i < 4; i++) { pkt_dev->cur_in6_daddr.s6_addr32[i] = - (((__force __be32)prandom_u32() | + (((__force __be32)get_random_u32() | pkt_dev->min_in6_daddr.s6_addr32[i]) & pkt_dev->max_in6_daddr.s6_addr32[i]); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 795cbe1de9124..1a37a07c7163c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3664,7 +3664,7 @@ static __net_init int rt_genid_init(struct net *net) { atomic_set(&net->ipv4.rt_genid, 0); atomic_set(&net->fnhe_genid, 0); - atomic_set(&net->ipv4.dev_addr_genid, get_random_int()); + atomic_set(&net->ipv4.dev_addr_genid, get_random_u32()); return 0; } diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index ddc7ba0554bdd..efcd145f06db1 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -243,7 +243,7 @@ static bool tcp_cdg_backoff(struct sock *sk, u32 grad) struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - if (prandom_u32() <= nexp_u32(grad * backoff_factor)) + if (get_random_u32() <= nexp_u32(grad * backoff_factor)) return false; if (use_ineff) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d63118ce59006..9f2688246deeb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -246,7 +246,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rand = prandom_u32(); + rand = get_random_u32(); first = reciprocal_scale(rand, remaining) + low; /* * force rand to be an odd multiple of UDP_HTABLE_SIZE diff --git 
a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index ceb85c67ce395..18481eb76a0a4 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -220,7 +220,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { - fl->label = htonl(prandom_u32())&IPV6_FLOWLABEL_MASK; + fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK; if (fl->label) { lfl = __fl_lookup(net, fl->label); if (!lfl) diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 2880dc7d9a491..2685c3f15e9d3 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -18,7 +18,7 @@ static u32 __ipv6_select_ident(struct net *net, u32 id; do { - id = prandom_u32(); + id = get_random_u32(); } while (!id); return id; diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index fb67f1ca2495b..8c04bb57dd6fe 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1308,7 +1308,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) * Randomly scan 1/32 of the whole table every second */ for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { - unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; + unsigned int hash = get_random_u32() & ip_vs_conn_tab_mask; hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { if (cp->ipvs != ipvs) diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 203e24ae472c2..b26c1dcfc27b5 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -34,7 +34,7 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) switch (info->mode) { case XT_STATISTIC_MODE_RANDOM: - if ((prandom_u32() & 0x7FFFFFFF) < info->u.random.probability) + if ((get_random_u32() & 0x7FFFFFFF) < info->u.random.probability) ret = !ret; break; case XT_STATISTIC_MODE_NTH: diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 868db4669a291..ca3ebfdb30231 100644 --- 
a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1033,7 +1033,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, actions = nla_next(sample_arg, &rem); if ((arg->probability != U32_MAX) && - (!arg->probability || prandom_u32() > arg->probability)) { + (!arg->probability || get_random_u32() > arg->probability)) { if (last) consume_skb(skb); return 0; diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 7193d25932ce2..817cd0695b350 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -573,7 +573,7 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, /* Simple BLUE implementation. Lack of ECN is deliberate. */ if (vars->p_drop) - drop |= (prandom_u32() < vars->p_drop); + drop |= (get_random_u32() < vars->p_drop); /* Overload the drop_next field as an activity timeout */ if (!vars->count) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index bab45b3b1fdb5..fb00ac40ecb72 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -171,7 +171,7 @@ static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) static void init_crandom(struct crndstate *state, unsigned long rho) { state->rho = rho; - state->last = prandom_u32(); + state->last = get_random_u32(); } /* get_crandom - correlated random number generator @@ -184,9 +184,9 @@ static u32 get_crandom(struct crndstate *state) unsigned long answer; if (!state || state->rho == 0) /* no correlation */ - return prandom_u32(); + return get_random_u32(); - value = prandom_u32(); + value = get_random_u32(); rho = (u64)state->rho + 1; answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; state->last = answer; @@ -200,7 +200,7 @@ static u32 get_crandom(struct crndstate *state) static bool loss_4state(struct netem_sched_data *q) { struct clgstate *clg = &q->clg; - u32 rnd = prandom_u32(); + u32 rnd = get_random_u32(); /* * Makes a comparison between rnd and the transition @@ -268,15 +268,15 @@ static bool loss_gilb_ell(struct 
netem_sched_data *q) switch (clg->state) { case GOOD_STATE: - if (prandom_u32() < clg->a1) + if (get_random_u32() < clg->a1) clg->state = BAD_STATE; - if (prandom_u32() < clg->a4) + if (get_random_u32() < clg->a4) return true; break; case BAD_STATE: - if (prandom_u32() < clg->a2) + if (get_random_u32() < clg->a2) clg->state = GOOD_STATE; - if (prandom_u32() > clg->a3) + if (get_random_u32() > clg->a3) return true; } @@ -632,7 +632,7 @@ static void get_slot_next(struct netem_sched_data *q, u64 now) if (!q->slot_dist) next_delay = q->slot_config.min_delay + - (prandom_u32() * + (get_random_u32() * (q->slot_config.max_delay - q->slot_config.min_delay) >> 32); else diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 5f96e75f9eecf..48337687848c6 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -130,8 +130,8 @@ gss_krb5_make_confounder(char *p, u32 conflen) /* initialize to random value */ if (i == 0) { - i = prandom_u32(); - i = (i << 32) | prandom_u32(); + i = get_random_u32(); + i = (i << 32) | get_random_u32(); } switch (conflen) { diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index f8fae78156494..9407007f47aee 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1868,7 +1868,7 @@ xprt_alloc_xid(struct rpc_xprt *xprt) static void xprt_init_xid(struct rpc_xprt *xprt) { - xprt->xid = prandom_u32(); + xprt->xid = get_random_u32(); } static void diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 15dbb392c875b..b3545fc680979 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1147,7 +1147,7 @@ static int unix_autobind(struct sock *sk) addr->name->sun_family = AF_UNIX; refcount_set(&addr->refcnt, 1); - ordernum = prandom_u32(); + ordernum = get_random_u32(); lastnum = ordernum & 0xFFFFF; retry: ordernum = (ordernum + 1) & 0xFFFFF; -- GitLab From 197173db990cad244221ba73c43b1df6170ae278 Mon Sep 17 00:00:00 2001 From: "Jason A. 
Donenfeld" <Jason@zx2c4.com> Date: Wed, 5 Oct 2022 17:49:46 +0200 Subject: [PATCH 1756/2223] treewide: use get_random_bytes() when possible The prandom_bytes() function has been a deprecated inline wrapper around get_random_bytes() for several releases now, and compiles down to the exact same code. Replace the deprecated wrapper with a direct call to the real function. This was done as a basic find and replace. Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Yury Norov <yury.norov@gmail.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> # powerpc Acked-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> --- arch/powerpc/crypto/crc-vpmsum_test.c | 2 +- block/blk-crypto-fallback.c | 2 +- crypto/async_tx/raid6test.c | 2 +- drivers/dma/dmatest.c | 2 +- drivers/mtd/nand/raw/nandsim.c | 2 +- drivers/mtd/tests/mtd_nandecctest.c | 2 +- drivers/mtd/tests/speedtest.c | 2 +- drivers/mtd/tests/stresstest.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/rocker/rocker_main.c | 2 +- drivers/net/wireguard/selftest/allowedips.c | 12 ++++++------ fs/ubifs/debug.c | 2 +- kernel/kcsan/selftest.c | 2 +- lib/random32.c | 2 +- lib/test_objagg.c | 2 +- lib/uuid.c | 2 +- net/ipv4/route.c | 2 +- net/mac80211/rc80211_minstrel_ht.c | 2 +- net/sched/sch_pie.c | 2 +- 19 files changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/crypto/crc-vpmsum_test.c b/arch/powerpc/crypto/crc-vpmsum_test.c index c1c1ef9457fb4..273c527868db2 100644 --- a/arch/powerpc/crypto/crc-vpmsum_test.c +++ b/arch/powerpc/crypto/crc-vpmsum_test.c @@ -82,7 +82,7 @@ static int __init crc_test_init(void) if (len <= offset) continue; - prandom_bytes(data, len); + get_random_bytes(data, len); len -= offset; crypto_shash_update(crct10dif_shash, data+offset, len); diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 
621abd1b0e4d3..ad9844c5b40cb 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -539,7 +539,7 @@ static int blk_crypto_fallback_init(void) if (blk_crypto_fallback_inited) return 0; - prandom_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE); + get_random_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE); err = bioset_init(&crypto_bio_split, 64, 0, 0); if (err) diff --git a/crypto/async_tx/raid6test.c b/crypto/async_tx/raid6test.c index 9719c75206618..d3fbee1e03e55 100644 --- a/crypto/async_tx/raid6test.c +++ b/crypto/async_tx/raid6test.c @@ -37,7 +37,7 @@ static void makedata(int disks) int i; for (i = 0; i < disks; i++) { - prandom_bytes(page_address(data[i]), PAGE_SIZE); + get_random_bytes(page_address(data[i]), PAGE_SIZE); dataptrs[i] = data[i]; dataoffs[i] = 0; } diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index 9fe2ae7943169..ffe621695e472 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -312,7 +312,7 @@ static unsigned long dmatest_random(void) { unsigned long buf; - prandom_bytes(&buf, sizeof(buf)); + get_random_bytes(&buf, sizeof(buf)); return buf; } diff --git a/drivers/mtd/nand/raw/nandsim.c b/drivers/mtd/nand/raw/nandsim.c index d211939c8bdd5..672719023241b 100644 --- a/drivers/mtd/nand/raw/nandsim.c +++ b/drivers/mtd/nand/raw/nandsim.c @@ -1393,7 +1393,7 @@ static int ns_do_read_error(struct nandsim *ns, int num) unsigned int page_no = ns->regs.row; if (ns_read_error(page_no)) { - prandom_bytes(ns->buf.byte, num); + get_random_bytes(ns->buf.byte, num); NS_WARN("simulating read error in page %u\n", page_no); return 1; } diff --git a/drivers/mtd/tests/mtd_nandecctest.c b/drivers/mtd/tests/mtd_nandecctest.c index 1c7201b0f372d..440988562cfdc 100644 --- a/drivers/mtd/tests/mtd_nandecctest.c +++ b/drivers/mtd/tests/mtd_nandecctest.c @@ -266,7 +266,7 @@ static int nand_ecc_test_run(const size_t size) goto error; } - prandom_bytes(correct_data, size); + get_random_bytes(correct_data, size); 
ecc_sw_hamming_calculate(correct_data, size, correct_ecc, sm_order); for (i = 0; i < ARRAY_SIZE(nand_ecc_test); i++) { nand_ecc_test[i].prepare(error_data, error_ecc, diff --git a/drivers/mtd/tests/speedtest.c b/drivers/mtd/tests/speedtest.c index c9ec7086bfa1d..075bce32caa51 100644 --- a/drivers/mtd/tests/speedtest.c +++ b/drivers/mtd/tests/speedtest.c @@ -223,7 +223,7 @@ static int __init mtd_speedtest_init(void) if (!iobuf) goto out; - prandom_bytes(iobuf, mtd->erasesize); + get_random_bytes(iobuf, mtd->erasesize); bbt = kzalloc(ebcnt, GFP_KERNEL); if (!bbt) diff --git a/drivers/mtd/tests/stresstest.c b/drivers/mtd/tests/stresstest.c index d2faaca7f19d7..75b6ddc5dc4da 100644 --- a/drivers/mtd/tests/stresstest.c +++ b/drivers/mtd/tests/stresstest.c @@ -183,7 +183,7 @@ static int __init mtd_stresstest_init(void) goto out; for (i = 0; i < ebcnt; i++) offsets[i] = mtd->erasesize; - prandom_bytes(writebuf, bufsize); + get_random_bytes(writebuf, bufsize); bbt = kzalloc(ebcnt, GFP_KERNEL); if (!bbt) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index eed98c10ca9d6..04cf7684f1b0c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3874,7 +3874,7 @@ static void bnxt_init_vnics(struct bnxt *bp) if (bp->vnic_info[i].rss_hash_key) { if (i == 0) - prandom_bytes(vnic->rss_hash_key, + get_random_bytes(vnic->rss_hash_key, HW_HASH_KEY_SIZE); else memcpy(vnic->rss_hash_key, diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index 5672d952452fd..9e59669a93dd3 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -224,7 +224,7 @@ static int rocker_dma_test_offset(const struct rocker *rocker, if (err) goto unmap; - prandom_bytes(buf, ROCKER_TEST_DMA_BUF_SIZE); + get_random_bytes(buf, ROCKER_TEST_DMA_BUF_SIZE); for (i = 0; i < ROCKER_TEST_DMA_BUF_SIZE; i++) expect[i] = ~buf[i]; 
err = rocker_dma_test_one(rocker, wait, ROCKER_TEST_DMA_CTRL_INVERT, diff --git a/drivers/net/wireguard/selftest/allowedips.c b/drivers/net/wireguard/selftest/allowedips.c index dd897c0740a28..19eac00b23814 100644 --- a/drivers/net/wireguard/selftest/allowedips.c +++ b/drivers/net/wireguard/selftest/allowedips.c @@ -284,7 +284,7 @@ static __init bool randomized_test(void) mutex_lock(&mutex); for (i = 0; i < NUM_RAND_ROUTES; ++i) { - prandom_bytes(ip, 4); + get_random_bytes(ip, 4); cidr = prandom_u32_max(32) + 1; peer = peers[prandom_u32_max(NUM_PEERS)]; if (wg_allowedips_insert_v4(&t, (struct in_addr *)ip, cidr, @@ -299,7 +299,7 @@ static __init bool randomized_test(void) } for (j = 0; j < NUM_MUTATED_ROUTES; ++j) { memcpy(mutated, ip, 4); - prandom_bytes(mutate_mask, 4); + get_random_bytes(mutate_mask, 4); mutate_amount = prandom_u32_max(32); for (k = 0; k < mutate_amount / 8; ++k) mutate_mask[k] = 0xff; @@ -328,7 +328,7 @@ static __init bool randomized_test(void) } for (i = 0; i < NUM_RAND_ROUTES; ++i) { - prandom_bytes(ip, 16); + get_random_bytes(ip, 16); cidr = prandom_u32_max(128) + 1; peer = peers[prandom_u32_max(NUM_PEERS)]; if (wg_allowedips_insert_v6(&t, (struct in6_addr *)ip, cidr, @@ -343,7 +343,7 @@ static __init bool randomized_test(void) } for (j = 0; j < NUM_MUTATED_ROUTES; ++j) { memcpy(mutated, ip, 16); - prandom_bytes(mutate_mask, 16); + get_random_bytes(mutate_mask, 16); mutate_amount = prandom_u32_max(128); for (k = 0; k < mutate_amount / 8; ++k) mutate_mask[k] = 0xff; @@ -381,13 +381,13 @@ static __init bool randomized_test(void) for (j = 0;; ++j) { for (i = 0; i < NUM_QUERIES; ++i) { - prandom_bytes(ip, 4); + get_random_bytes(ip, 4); if (lookup(t.root4, 32, ip) != horrible_allowedips_lookup_v4(&h, (struct in_addr *)ip)) { horrible_allowedips_lookup_v4(&h, (struct in_addr *)ip); pr_err("allowedips random v4 self-test: FAIL\n"); goto free; } - prandom_bytes(ip, 16); + get_random_bytes(ip, 16); if (lookup(t.root6, 128, ip) != 
horrible_allowedips_lookup_v6(&h, (struct in6_addr *)ip)) { pr_err("allowedips random v6 self-test: FAIL\n"); goto free; diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index f4d3b568aa64a..3f128b9fdfbb2 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2581,7 +2581,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, if (ffs) memset(p + from, 0xFF, to - from); else - prandom_bytes(p + from, to - from); + get_random_bytes(p + from, to - from); return to; } diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index 58b94deae5c0b..00cdf8fa56936 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -46,7 +46,7 @@ static bool __init test_encode_decode(void) unsigned long addr; size_t verif_size; - prandom_bytes(&addr, sizeof(addr)); + get_random_bytes(&addr, sizeof(addr)); if (addr < PAGE_SIZE) addr = PAGE_SIZE; diff --git a/lib/random32.c b/lib/random32.c index d4f19e1a69d4e..32060b8526681 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -69,7 +69,7 @@ EXPORT_SYMBOL(prandom_u32_state); * @bytes: the requested number of bytes * * This is used for pseudo-randomness with no outside seeding. - * For more random results, use prandom_bytes(). + * For more random results, use get_random_bytes(). 
*/ void prandom_bytes_state(struct rnd_state *state, void *buf, size_t bytes) { diff --git a/lib/test_objagg.c b/lib/test_objagg.c index da137939a4100..c0c957c506354 100644 --- a/lib/test_objagg.c +++ b/lib/test_objagg.c @@ -157,7 +157,7 @@ static int test_nodelta_obj_get(struct world *world, struct objagg *objagg, int err; if (should_create_root) - prandom_bytes(world->next_root_buf, + get_random_bytes(world->next_root_buf, sizeof(world->next_root_buf)); objagg_obj = world_obj_get(world, objagg, key_id); diff --git a/lib/uuid.c b/lib/uuid.c index 562d53977cabb..e309b4c5be3df 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -52,7 +52,7 @@ EXPORT_SYMBOL(generate_random_guid); static void __uuid_gen_common(__u8 b[16]) { - prandom_bytes(b, 16); + get_random_bytes(b, 16); /* reversion 0b10 */ b[8] = (b[8] & 0x3F) | 0x80; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1a37a07c7163c..cd1fa9f70f1a1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3719,7 +3719,7 @@ int __init ip_rt_init(void) ip_idents = idents_hash; - prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); + get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents); diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 7f3f5f51081d5..3d91b98db0996 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -2036,7 +2036,7 @@ static void __init init_sample_table(void) memset(sample_table, 0xff, sizeof(sample_table)); for (col = 0; col < SAMPLE_COLUMNS; col++) { - prandom_bytes(rnd, sizeof(rnd)); + get_random_bytes(rnd, sizeof(rnd)); for (i = 0; i < MCS_GROUP_RATES; i++) { new_idx = (i + rnd[i]) % MCS_GROUP_RATES; while (sample_table[col][new_idx] != 0xff) diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 974038ba6c7b8..265c238047a42 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -72,7 +72,7 @@ bool 
pie_drop_early(struct Qdisc *sch, struct pie_params *params, if (vars->accu_prob >= (MAX_PROB / 2) * 17) return true; - prandom_bytes(&rnd, 8); + get_random_bytes(&rnd, 8); if ((rnd >> BITS_PER_BYTE) < local_prob) { vars->accu_prob = 0; return true; -- GitLab From de492c83cae0af72de370b9404aacda93dafcad5 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" <Jason@zx2c4.com> Date: Wed, 5 Oct 2022 17:50:20 +0200 Subject: [PATCH 1757/2223] prandom: remove unused functions With no callers left of prandom_u32() and prandom_bytes(), as well as get_random_int(), remove these deprecated wrappers, in favor of get_random_u32() and get_random_bytes(). Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Yury Norov <yury.norov@gmail.com> Acked-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> --- drivers/char/random.c | 11 +++++------ include/linux/prandom.h | 12 ------------ include/linux/random.h | 5 ----- 3 files changed, 5 insertions(+), 23 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 01acf235f2635..2fe28eeb2f387 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -97,7 +97,7 @@ MODULE_PARM_DESC(ratelimit_disable, "Disable random ratelimit suppression"); * Returns whether or not the input pool has been seeded and thus guaranteed * to supply cryptographically secure random numbers. This applies to: the * /dev/urandom device, the get_random_bytes function, and the get_random_{u8, - * u16,u32,u64,int,long} family of functions. + * u16,u32,u64,long} family of functions. * * Returns: true if the input pool has been seeded. * false if the input pool has not been seeded. 
@@ -161,15 +161,14 @@ EXPORT_SYMBOL(wait_for_random_bytes); * u16 get_random_u16() * u32 get_random_u32() * u64 get_random_u64() - * unsigned int get_random_int() * unsigned long get_random_long() * * These interfaces will return the requested number of random bytes * into the given buffer or as a return value. This is equivalent to - * a read from /dev/urandom. The u8, u16, u32, u64, int, and long - * family of functions may be higher performance for one-off random - * integers, because they do a bit of buffering and do not invoke - * reseeding until the buffer is emptied. + * a read from /dev/urandom. The u8, u16, u32, u64, long family of + * functions may be higher performance for one-off random integers, + * because they do a bit of buffering and do not invoke reseeding + * until the buffer is emptied. * *********************************************************************/ diff --git a/include/linux/prandom.h b/include/linux/prandom.h index 78db003bc290b..e0a0759dd09c0 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -12,18 +12,6 @@ #include <linux/percpu.h> #include <linux/random.h> -/* Deprecated: use get_random_u32 instead. */ -static inline u32 prandom_u32(void) -{ - return get_random_u32(); -} - -/* Deprecated: use get_random_bytes instead. 
*/ -static inline void prandom_bytes(void *buf, size_t nbytes) -{ - return get_random_bytes(buf, nbytes); -} - struct rnd_state { __u32 s1, s2, s3, s4; }; diff --git a/include/linux/random.h b/include/linux/random.h index 08322f700cdcc..147a5e0d0b8ed 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -42,10 +42,6 @@ u8 get_random_u8(void); u16 get_random_u16(void); u32 get_random_u32(void); u64 get_random_u64(void); -static inline unsigned int get_random_int(void) -{ - return get_random_u32(); -} static inline unsigned long get_random_long(void) { #if BITS_PER_LONG == 64 @@ -100,7 +96,6 @@ declare_get_random_var_wait(u8, u8) declare_get_random_var_wait(u16, u16) declare_get_random_var_wait(u32, u32) declare_get_random_var_wait(u64, u32) -declare_get_random_var_wait(int, unsigned int) declare_get_random_var_wait(long, unsigned long) #undef declare_get_random_var -- GitLab From ef79361b265dd229725ad62bc850f6913ef2f94a Mon Sep 17 00:00:00 2001 From: Xu Panda <xu.panda@zte.com.cn> Date: Mon, 12 Sep 2022 07:15:57 +0000 Subject: [PATCH 1758/2223] fork: remove duplicate included header files linux/sched/mm.h is included more than once. Link: https://lkml.kernel.org/r/20220912071556.16811-1-xu.panda@zte.com.cn Signed-off-by: Xu Panda <xu.panda@zte.com.cn> Reported-by: Zeal Robot <zealci@zte.com.cn> Cc: Andy Lutomirski <luto@kernel.org> Cc: Christian Brauner (Microsoft) <brauner@kernel.org> Cc: "Eric W . 
Biederman" <ebiederm@xmission.com> Cc: Fenghua Yu <fenghua.yu@intel.com> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index 90c85b17bf698..3b8784e3ce97d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -97,7 +97,6 @@ #include <linux/scs.h> #include <linux/io_uring.h> #include <linux/bpf.h> -#include <linux/sched/mm.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> -- GitLab From 723ac751208f6d6540191689cfbf6c77135a7a1b Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi <konishi.ryusuke@gmail.com> Date: Thu, 29 Sep 2022 21:33:30 +0900 Subject: [PATCH 1759/2223] nilfs2: replace WARN_ONs by nilfs_error for checkpoint acquisition failure If creation or finalization of a checkpoint fails due to anomalies in the checkpoint metadata on disk, a kernel warning is generated. This patch replaces the WARN_ONs by nilfs_error, so that a kernel, booted with panic_on_warn, does not panic. A nilfs_error is appropriate here to handle the abnormal filesystem condition. This also replaces the detected error codes with an I/O error so that neither of the internal error codes is returned to callers. 
Link: https://lkml.kernel.org/r/20220929123330.19658-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Reported-by: syzbot+fbb3e0b24e8dae5a16ee@syzkaller.appspotmail.com Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/nilfs2/segment.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 9abae2c9120ed..6e4a7c4228c79 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -875,9 +875,11 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) nilfs_mdt_mark_dirty(nilfs->ns_cpfile); nilfs_cpfile_put_checkpoint( nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); - } else - WARN_ON(err == -EINVAL || err == -ENOENT); - + } else if (err == -EINVAL || err == -ENOENT) { + nilfs_error(sci->sc_super, + "checkpoint creation failed due to metadata corruption."); + err = -EIO; + } return err; } @@ -891,7 +893,11 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0, &raw_cp, &bh_cp); if (unlikely(err)) { - WARN_ON(err == -EINVAL || err == -ENOENT); + if (err == -EINVAL || err == -ENOENT) { + nilfs_error(sci->sc_super, + "checkpoint finalization failed due to metadata corruption."); + err = -EIO; + } goto failed_ibh; } raw_cp->cp_snapshot_list.ssl_next = 0; -- GitLab From 329028e04a0b4d6a2e37b75ea90335436b7c3c8c Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn <lukas.bulwahn@gmail.com> Date: Thu, 29 Sep 2022 12:14:41 +0200 Subject: [PATCH 1760/2223] ia64: update config files Clean up config files by: - removing configs that were deleted in the past - removing configs not in tree and without recently pending patches - adding new configs that are replacements for old configs in the file For some detailed information, see Link. 
Link: https://lore.kernel.org/kernel-janitors/20220929090645.1389-1-lukas.bulwahn@gmail.com/ Link: https://lkml.kernel.org/r/20220929101441.32009-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- arch/ia64/configs/bigsur_defconfig | 2 -- arch/ia64/configs/generic_defconfig | 2 -- arch/ia64/configs/gensparse_defconfig | 3 --- arch/ia64/configs/tiger_defconfig | 2 -- arch/ia64/configs/zx1_defconfig | 1 - 5 files changed, 10 deletions(-) diff --git a/arch/ia64/configs/bigsur_defconfig b/arch/ia64/configs/bigsur_defconfig index a3724882295cd..3e1337aceb371 100644 --- a/arch/ia64/configs/bigsur_defconfig +++ b/arch/ia64/configs/bigsur_defconfig @@ -20,7 +20,6 @@ CONFIG_UNIX=y CONFIG_INET=y # CONFIG_IPV6 is not set CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m CONFIG_ATA=m @@ -91,7 +90,6 @@ CONFIG_NFS_V4=m CONFIG_NFSD=m CONFIG_NFSD_V4=y CONFIG_CIFS=m -CONFIG_CIFS_STATS=y CONFIG_CIFS_XATTR=y CONFIG_CIFS_POSIX=y CONFIG_NLS_CODEPAGE_437=y diff --git a/arch/ia64/configs/generic_defconfig b/arch/ia64/configs/generic_defconfig index a3dff482a3d70..f8033bacea89e 100644 --- a/arch/ia64/configs/generic_defconfig +++ b/arch/ia64/configs/generic_defconfig @@ -39,7 +39,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_CONNECTOR=y # CONFIG_PNP_DEBUG_MESSAGES is not set CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y CONFIG_SGI_XP=m @@ -91,7 +90,6 @@ CONFIG_SERIAL_8250_SHARE_IRQ=y # CONFIG_HW_RANDOM is not set CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_EFI=y -CONFIG_RAW_DRIVER=m CONFIG_HPET=y CONFIG_AGP=m CONFIG_AGP_I460=m diff --git a/arch/ia64/configs/gensparse_defconfig b/arch/ia64/configs/gensparse_defconfig index 4cd46105b0201..ffebe6c503f51 100644 --- a/arch/ia64/configs/gensparse_defconfig +++ b/arch/ia64/configs/gensparse_defconfig @@ -31,11 +31,9 @@ CONFIG_IP_MULTICAST=y CONFIG_SYN_COOKIES=y # 
CONFIG_IPV6 is not set CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y CONFIG_ATA=y -CONFIG_BLK_DEV_IDECD=y CONFIG_ATA_GENERIC=y CONFIG_PATA_CMD64X=y CONFIG_ATA_PIIX=y @@ -81,7 +79,6 @@ CONFIG_SERIAL_8250_SHARE_IRQ=y # CONFIG_HW_RANDOM is not set CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_EFI=y -CONFIG_RAW_DRIVER=m CONFIG_HPET=y CONFIG_AGP=m CONFIG_AGP_I460=m diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig index a2045d73adfad..45f5d6e2da0af 100644 --- a/arch/ia64/configs/tiger_defconfig +++ b/arch/ia64/configs/tiger_defconfig @@ -36,7 +36,6 @@ CONFIG_IP_MULTICAST=y CONFIG_SYN_COOKIES=y # CONFIG_IPV6 is not set CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y CONFIG_ATA=y @@ -85,7 +84,6 @@ CONFIG_SERIAL_8250_SHARE_IRQ=y # CONFIG_HW_RANDOM is not set CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_EFI=y -CONFIG_RAW_DRIVER=m CONFIG_HPET=y CONFIG_AGP=m CONFIG_AGP_I460=m diff --git a/arch/ia64/configs/zx1_defconfig b/arch/ia64/configs/zx1_defconfig index 99f8b2a0332bc..ed104550d0d51 100644 --- a/arch/ia64/configs/zx1_defconfig +++ b/arch/ia64/configs/zx1_defconfig @@ -30,7 +30,6 @@ CONFIG_PATA_CMD64X=y CONFIG_SCSI=y CONFIG_BLK_DEV_SD=y CONFIG_CHR_DEV_ST=y -CONFIG_CHR_DEV_OSST=y CONFIG_BLK_DEV_SR=y CONFIG_CHR_DEV_SG=y CONFIG_SCSI_CONSTANTS=y -- GitLab From 30341ec95af4f6e85b981c975c23929bbea8b58a Mon Sep 17 00:00:00 2001 From: Ren Zhijie <renzhijie2@huawei.com> Date: Thu, 29 Sep 2022 07:00:57 +0000 Subject: [PATCH 1761/2223] init/Kconfig: fix unmet direct dependencies Commit 3c07bfce92a5 ("proc: make config PROC_CHILDREN depend on PROC_FS") make config PROC_CHILDREN depend on PROC_FS. 
When CONFIG_PROC_FS is not set and CONFIG_CHECKPOINT_RESTORE=y, make menuconfig screams like this: WARNING: unmet direct dependencies detected for PROC_CHILDREN Depends on [n]: PROC_FS [=n] Selected by [y]: - CHECKPOINT_RESTORE [=y] CHECKPOINT_RESTORE would select PROC_CHILDREN which depends on PROC_FS, so add depends on PROC_FS to CHECKPOINT_RESTORE to fix this. Link: https://lkml.kernel.org/r/20220929070057.59044-1-renzhijie2@huawei.com Fixes: 3c07bfce92a5 ("proc: make config PROC_CHILDREN depend on PROC_FS") Signed-off-by: Ren Zhijie <renzhijie2@huawei.com> Reviewed-by: Lukas Bulwahn <lukas.bulwahn@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- init/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/init/Kconfig b/init/Kconfig index 532362fcfe31f..b5799347fb52e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1261,6 +1261,7 @@ endif # NAMESPACES config CHECKPOINT_RESTORE bool "Checkpoint/restore support" + depends on PROC_FS select PROC_CHILDREN select KCMP default n -- GitLab From 95e9a8552e85a7b7c885d3458c7c74c28dfe359b Mon Sep 17 00:00:00 2001 From: xu xin <xu.xin16@zte.com.cn> Date: Fri, 30 Sep 2022 06:19:50 +0000 Subject: [PATCH 1762/2223] ia64: mca: use strscpy() is more robust and safer The implementation of strscpy() is more robust and safer. That's now the recommended way to copy NUL terminated strings. 
Link: https://lkml.kernel.org/r/20220930061950.288290-1-xu.xin16@zte.com.cn Reported-by: Zeal Robot <zealci@zte.com.cn> Signed-off-by: Xu Panda <xu.panda@zte.com.cn> Signed-off-by: xu xin <xu.xin16@zte.com.cn> Cc: Haowen Bai <baihaowen@meizu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- arch/ia64/kernel/mca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index c62a66710ad6d..92ede80d17fea 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1793,7 +1793,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset, p->parent = p->real_parent = p->group_leader = p; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); - strncpy(p->comm, type, sizeof(p->comm)-1); + strscpy(p->comm, type, sizeof(p->comm)-1); } /* Caller prevents this from being called after init */ -- GitLab From 0f4107d1798f7ee603845b2a6699c9559a1fec9f Mon Sep 17 00:00:00 2001 From: Frank Rowand <frank.rowand@sony.com> Date: Fri, 30 Sep 2022 20:50:09 -0500 Subject: [PATCH 1763/2223] mailmap: update Frank Rowand email address Frank is no longer at Sony, add an entry for his latest Sony email Link: https://lkml.kernel.org/r/20221001015009.3994518-1-frowand.list@gmail.com Signed-off-by: Frank Rowand <frank.rowand@sony.com> Cc: Tim Bird <Tim.Bird@sony.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 8ded2e7c2906f..bd18574892cde 100644 --- a/.mailmap +++ b/.mailmap @@ -134,6 +134,7 @@ Filipe Lautert <filipe@icewall.org> Finn Thain <fthain@linux-m68k.org> <fthain@telegraphics.com.au> Franck Bui-Huu <vagabon.xyz@gmail.com> Frank Rowand <frowand.list@gmail.com> <frank.rowand@am.sony.com> +Frank Rowand <frowand.list@gmail.com> <frank.rowand@sony.com> Frank Rowand <frowand.list@gmail.com> <frank.rowand@sonymobile.com> Frank Rowand <frowand.list@gmail.com> <frowand@mvista.com> Frank Zago 
<fzago@systemfabricworks.com> -- GitLab From 5bc73bb3451b9e449828694733a4c6b413ceeb3b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan <adobriyan@gmail.com> Date: Wed, 5 Oct 2022 23:14:00 +0300 Subject: [PATCH 1764/2223] proc: test how it holds up with mapping'less process Create process without mappings and check /proc/*/maps /proc/*/numa_maps /proc/*/smaps /proc/*/smaps_rollup They must be empty (excluding vsyscall page) or full of zeroes. Retroactively this test should've caught embarrassing /proc/*/smaps_rollup oops: [17752.703567] BUG: kernel NULL pointer dereference, address: 0000000000000000 [17752.703580] #PF: supervisor read access in kernel mode [17752.703583] #PF: error_code(0x0000) - not-present page [17752.703587] PGD 0 P4D 0 [17752.703593] Oops: 0000 [#1] PREEMPT SMP PTI [17752.703598] CPU: 0 PID: 60649 Comm: cat Tainted: G W 5.19.9-100.fc35.x86_64 #1 [17752.703603] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./X99 Extreme6/3.1, BIOS P3.30 08/05/2016 [17752.703607] RIP: 0010:show_smaps_rollup+0x159/0x2e0 Note 1: ProtectionKey field in /proc/*/smaps is optional, so check most of its contents, not everything. Note 2: due to the nature of this test, child process hardly can signal its readiness (after unmapping everything!) to parent. I feel like "sleep(1)" is justified. If you know how to do it without sleep please tell me. Note 3: /proc/*/statm is not tested but can be.
Link: https://lkml.kernel.org/r/Yz3liL6Dn+n2SD8Q@localhost.localdomain Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- tools/testing/selftests/proc/.gitignore | 1 + tools/testing/selftests/proc/Makefile | 1 + tools/testing/selftests/proc/proc-empty-vm.c | 386 +++++++++++++++++++ 3 files changed, 388 insertions(+) create mode 100644 tools/testing/selftests/proc/proc-empty-vm.c diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index c4e6a34f9657b..a156ac5dd2c6a 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -5,6 +5,7 @@ /proc-fsconfig-hidepid /proc-loadavg-001 /proc-multiple-procfs +/proc-empty-vm /proc-pid-vm /proc-self-map-files-001 /proc-self-map-files-002 diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 219fc61138473..cd95369254c08 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -8,6 +8,7 @@ TEST_GEN_PROGS += fd-001-lookup TEST_GEN_PROGS += fd-002-posix-eq TEST_GEN_PROGS += fd-003-kthread TEST_GEN_PROGS += proc-loadavg-001 +TEST_GEN_PROGS += proc-empty-vm TEST_GEN_PROGS += proc-pid-vm TEST_GEN_PROGS += proc-self-map-files-001 TEST_GEN_PROGS += proc-self-map-files-002 diff --git a/tools/testing/selftests/proc/proc-empty-vm.c b/tools/testing/selftests/proc/proc-empty-vm.c new file mode 100644 index 0000000000000..d95b1cb43d9d0 --- /dev/null +++ b/tools/testing/selftests/proc/proc-empty-vm.c @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2022 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* + * Create a process without mappings by unmapping everything at once and + * holding it with ptrace(2). See what happens to + * + * /proc/${pid}/maps + * /proc/${pid}/numa_maps + * /proc/${pid}/smaps + * /proc/${pid}/smaps_rollup + */ +#undef NDEBUG +#include <assert.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/ptrace.h> +#include <sys/resource.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +/* + * 0: vsyscall VMA doesn't exist vsyscall=none + * 1: vsyscall VMA is --xp vsyscall=xonly + * 2: vsyscall VMA is r-xp vsyscall=emulate + */ +static int g_vsyscall; +static const char *g_proc_pid_maps_vsyscall; +static const char *g_proc_pid_smaps_vsyscall; + +static const char proc_pid_maps_vsyscall_0[] = ""; +static const char proc_pid_maps_vsyscall_1[] = +"ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n"; +static const char proc_pid_maps_vsyscall_2[] = +"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; + +static const char proc_pid_smaps_vsyscall_0[] = ""; + +static const char proc_pid_smaps_vsyscall_1[] = +"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n" +"Size: 4 kB\n" +"KernelPageSize: 4 kB\n" +"MMUPageSize: 4 kB\n" +"Rss: 0 kB\n" +"Pss: 0 kB\n" +"Pss_Dirty: 0 kB\n" +"Shared_Clean: 0 kB\n" +"Shared_Dirty: 0 kB\n" +"Private_Clean: 0 kB\n" +"Private_Dirty: 0 kB\n" 
+"Referenced: 0 kB\n" +"Anonymous: 0 kB\n" +"LazyFree: 0 kB\n" +"AnonHugePages: 0 kB\n" +"ShmemPmdMapped: 0 kB\n" +"FilePmdMapped: 0 kB\n" +"Shared_Hugetlb: 0 kB\n" +"Private_Hugetlb: 0 kB\n" +"Swap: 0 kB\n" +"SwapPss: 0 kB\n" +"Locked: 0 kB\n" +"THPeligible: 0\n" +/* + * "ProtectionKey:" field is conditional. It is possible to check it as well, + * but I don't have such machine. + */ +; + +static const char proc_pid_smaps_vsyscall_2[] = +"ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n" +"Size: 4 kB\n" +"KernelPageSize: 4 kB\n" +"MMUPageSize: 4 kB\n" +"Rss: 0 kB\n" +"Pss: 0 kB\n" +"Pss_Dirty: 0 kB\n" +"Shared_Clean: 0 kB\n" +"Shared_Dirty: 0 kB\n" +"Private_Clean: 0 kB\n" +"Private_Dirty: 0 kB\n" +"Referenced: 0 kB\n" +"Anonymous: 0 kB\n" +"LazyFree: 0 kB\n" +"AnonHugePages: 0 kB\n" +"ShmemPmdMapped: 0 kB\n" +"FilePmdMapped: 0 kB\n" +"Shared_Hugetlb: 0 kB\n" +"Private_Hugetlb: 0 kB\n" +"Swap: 0 kB\n" +"SwapPss: 0 kB\n" +"Locked: 0 kB\n" +"THPeligible: 0\n" +/* + * "ProtectionKey:" field is conditional. It is possible to check it as well, + * but I'm too tired. + */ +; + +static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___) +{ + _exit(EXIT_FAILURE); +} + +static void sigaction_SIGSEGV_vsyscall(int _, siginfo_t *__, void *___) +{ + _exit(g_vsyscall); +} + +/* + * vsyscall page can't be unmapped, probe it directly. + */ +static void vsyscall(void) +{ + pid_t pid; + int wstatus; + + pid = fork(); + if (pid < 0) { + fprintf(stderr, "fork, errno %d\n", errno); + exit(1); + } + if (pid == 0) { + setrlimit(RLIMIT_CORE, &(struct rlimit){}); + + /* Hide "segfault at ffffffffff600000" messages. 
*/ + struct sigaction act = {}; + act.sa_flags = SA_SIGINFO; + act.sa_sigaction = sigaction_SIGSEGV_vsyscall; + sigaction(SIGSEGV, &act, NULL); + + g_vsyscall = 0; + /* gettimeofday(NULL, NULL); */ + asm volatile ( + "call %P0" + : + : "i" (0xffffffffff600000), "D" (NULL), "S" (NULL) + : "rax", "rcx", "r11" + ); + + g_vsyscall = 1; + *(volatile int *)0xffffffffff600000UL; + + g_vsyscall = 2; + exit(g_vsyscall); + } + waitpid(pid, &wstatus, 0); + if (WIFEXITED(wstatus)) { + g_vsyscall = WEXITSTATUS(wstatus); + } else { + fprintf(stderr, "error: vsyscall wstatus %08x\n", wstatus); + exit(1); + } +} + +static int test_proc_pid_maps(pid_t pid) +{ + char buf[4096]; + snprintf(buf, sizeof(buf), "/proc/%u/maps", pid); + int fd = open(buf, O_RDONLY); + if (fd == -1) { + perror("open /proc/${pid}/maps"); + return EXIT_FAILURE; + } else { + ssize_t rv = read(fd, buf, sizeof(buf)); + close(fd); + if (g_vsyscall == 0) { + assert(rv == 0); + } else { + size_t len = strlen(g_proc_pid_maps_vsyscall); + assert(rv == len); + assert(memcmp(buf, g_proc_pid_maps_vsyscall, len) == 0); + } + return EXIT_SUCCESS; + } +} + +static int test_proc_pid_numa_maps(pid_t pid) +{ + char buf[4096]; + snprintf(buf, sizeof(buf), "/proc/%u/numa_maps", pid); + int fd = open(buf, O_RDONLY); + if (fd == -1) { + if (errno == ENOENT) { + /* + * /proc/${pid}/numa_maps is under CONFIG_NUMA, + * it doesn't necessarily exist. + */ + return EXIT_SUCCESS; + } + perror("open /proc/${pid}/numa_maps"); + return EXIT_FAILURE; + } else { + ssize_t rv = read(fd, buf, sizeof(buf)); + close(fd); + assert(rv == 0); + return EXIT_SUCCESS; + } +} + +static int test_proc_pid_smaps(pid_t pid) +{ + char buf[4096]; + snprintf(buf, sizeof(buf), "/proc/%u/smaps", pid); + int fd = open(buf, O_RDONLY); + if (fd == -1) { + if (errno == ENOENT) { + /* + * /proc/${pid}/smaps is under CONFIG_PROC_PAGE_MONITOR, + * it doesn't necessarily exist. 
+ */ + return EXIT_SUCCESS; + } + perror("open /proc/${pid}/smaps"); + return EXIT_FAILURE; + } else { + ssize_t rv = read(fd, buf, sizeof(buf)); + close(fd); + if (g_vsyscall == 0) { + assert(rv == 0); + } else { + size_t len = strlen(g_proc_pid_maps_vsyscall); + /* TODO "ProtectionKey:" */ + assert(rv > len); + assert(memcmp(buf, g_proc_pid_maps_vsyscall, len) == 0); + } + return EXIT_SUCCESS; + } +} + +static const char g_smaps_rollup[] = +"00000000-00000000 ---p 00000000 00:00 0 [rollup]\n" +"Rss: 0 kB\n" +"Pss: 0 kB\n" +"Pss_Dirty: 0 kB\n" +"Pss_Anon: 0 kB\n" +"Pss_File: 0 kB\n" +"Pss_Shmem: 0 kB\n" +"Shared_Clean: 0 kB\n" +"Shared_Dirty: 0 kB\n" +"Private_Clean: 0 kB\n" +"Private_Dirty: 0 kB\n" +"Referenced: 0 kB\n" +"Anonymous: 0 kB\n" +"LazyFree: 0 kB\n" +"AnonHugePages: 0 kB\n" +"ShmemPmdMapped: 0 kB\n" +"FilePmdMapped: 0 kB\n" +"Shared_Hugetlb: 0 kB\n" +"Private_Hugetlb: 0 kB\n" +"Swap: 0 kB\n" +"SwapPss: 0 kB\n" +"Locked: 0 kB\n" +; + +static int test_proc_pid_smaps_rollup(pid_t pid) +{ + char buf[4096]; + snprintf(buf, sizeof(buf), "/proc/%u/smaps_rollup", pid); + int fd = open(buf, O_RDONLY); + if (fd == -1) { + if (errno == ENOENT) { + /* + * /proc/${pid}/smaps_rollup is under CONFIG_PROC_PAGE_MONITOR, + * it doesn't necessarily exist. 
+ */ + return EXIT_SUCCESS; + } + perror("open /proc/${pid}/smaps_rollup"); + return EXIT_FAILURE; + } else { + ssize_t rv = read(fd, buf, sizeof(buf)); + close(fd); + assert(rv == sizeof(g_smaps_rollup) - 1); + assert(memcmp(buf, g_smaps_rollup, sizeof(g_smaps_rollup) - 1) == 0); + return EXIT_SUCCESS; + } +} + +int main(void) +{ + int rv = EXIT_SUCCESS; + + vsyscall(); + + switch (g_vsyscall) { + case 0: + g_proc_pid_maps_vsyscall = proc_pid_maps_vsyscall_0; + g_proc_pid_smaps_vsyscall = proc_pid_smaps_vsyscall_0; + break; + case 1: + g_proc_pid_maps_vsyscall = proc_pid_maps_vsyscall_1; + g_proc_pid_smaps_vsyscall = proc_pid_smaps_vsyscall_1; + break; + case 2: + g_proc_pid_maps_vsyscall = proc_pid_maps_vsyscall_2; + g_proc_pid_smaps_vsyscall = proc_pid_smaps_vsyscall_2; + break; + default: + abort(); + } + + pid_t pid = fork(); + if (pid == -1) { + perror("fork"); + return EXIT_FAILURE; + } else if (pid == 0) { + rv = ptrace(PTRACE_TRACEME, 0, NULL, NULL); + if (rv != 0) { + if (errno == EPERM) { + fprintf(stderr, +"Did you know? ptrace(PTRACE_TRACEME) doesn't work under strace.\n" + ); + kill(getppid(), SIGTERM); + return EXIT_FAILURE; + } + perror("ptrace PTRACE_TRACEME"); + return EXIT_FAILURE; + } + + /* + * Hide "segfault at ..." messages. Signal handler won't run. + */ + struct sigaction act = {}; + act.sa_flags = SA_SIGINFO; + act.sa_sigaction = sigaction_SIGSEGV; + sigaction(SIGSEGV, &act, NULL); + +#ifdef __amd64__ + munmap(NULL, ((size_t)1 << 47) - 4096); +#else +#error "implement 'unmap everything'" +#endif + return EXIT_FAILURE; + } else { + /* + * TODO find reliable way to signal parent that munmap(2) completed. + * Child can't do it directly because it effectively doesn't exist + * anymore. Looking at child's VM files isn't 100% reliable either: + * due to a bug they may not become empty or empty-like. 
+ */ + sleep(1); + + if (rv == EXIT_SUCCESS) { + rv = test_proc_pid_maps(pid); + } + if (rv == EXIT_SUCCESS) { + rv = test_proc_pid_numa_maps(pid); + } + if (rv == EXIT_SUCCESS) { + rv = test_proc_pid_smaps(pid); + } + if (rv == EXIT_SUCCESS) { + rv = test_proc_pid_smaps_rollup(pid); + } + /* + * TODO test /proc/${pid}/statm, task_statm() + * ->start_code, ->end_code aren't updated by munmap(). + * Output can be "0 0 0 2 0 0 0\n" where "2" can be anything. + */ + + /* Cut the rope. */ + int wstatus; + waitpid(pid, &wstatus, 0); + assert(WIFSTOPPED(wstatus)); + assert(WSTOPSIG(wstatus) == SIGSEGV); + } + + return rv; +} -- GitLab From 6a961bffd1c3505c13b4d33bbb8385fe08239cb8 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang <yangtiezhu@loongson.cn> Date: Fri, 2 Sep 2022 11:41:46 +0800 Subject: [PATCH 1765/2223] include/linux/entry-common.h: remove has_signal comment of arch_do_signal_or_restart() prototype The argument has_signal of arch_do_signal_or_restart() has been removed in commit 8ba62d37949e ("task_work: Call tracehook_notify_signal from get_signal on all architectures"), let us remove the related comment. 
Link: https://lkml.kernel.org/r/1662090106-5545-1-git-send-email-yangtiezhu@loongson.cn Fixes: 8ba62d37949e ("task_work: Call tracehook_notify_signal from get_signal on all architectures") Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn> Reviewed-by: Kees Cook <keescook@chromium.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- include/linux/entry-common.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 84a466b176cf4..d95ab85f96ba5 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -253,7 +253,6 @@ static __always_inline void arch_exit_to_user_mode(void) { } /** * arch_do_signal_or_restart - Architecture specific signal delivery function * @regs: Pointer to currents pt_regs - * @has_signal: actual signal to handle * * Invoked from exit_to_user_mode_loop(). */ -- GitLab From fac35ba763ed07ba93154c95ffc0c4a55023707f Mon Sep 17 00:00:00 2001 From: Baolin Wang <baolin.wang@linux.alibaba.com> Date: Thu, 1 Sep 2022 18:41:31 +0800 Subject: [PATCH 1766/2223] mm/hugetlb: fix races when looking up a CONT-PTE/PMD size hugetlb page On some architectures (like ARM64), it can support CONT-PTE/PMD size hugetlb, which means it can support not only PMD/PUD size hugetlb (2M and 1G), but also CONT-PTE/PMD size(64K and 32M) if a 4K page size specified. So when looking up a CONT-PTE size hugetlb page by follow_page(), it will use pte_offset_map_lock() to get the pte entry lock for the CONT-PTE size hugetlb in follow_page_pte(). However this pte entry lock is incorrect for the CONT-PTE size hugetlb, since we should use huge_pte_lock() to get the correct lock, which is mm->page_table_lock. That means the pte entry of the CONT-PTE size hugetlb under current pte lock is unstable in follow_page_pte(), we can continue to migrate or poison the pte entry of the CONT-PTE size hugetlb, which can cause some potential race issues, even though they are under the 'pte lock'. 
For example, suppose thread A is trying to look up a CONT-PTE size hugetlb page by move_pages() syscall under the lock, however another thread B can migrate the CONT-PTE hugetlb page at the same time, which will cause thread A to get an incorrect page, if thread A also wants to do page migration, then data inconsistency error occurs. Moreover we have the same issue for CONT-PMD size hugetlb in follow_huge_pmd(). To fix above issues, rename the follow_huge_pmd() as follow_huge_pmd_pte() to handle PMD and PTE level size hugetlb, which uses huge_pte_lock() to get the correct pte entry lock to make the pte entry stable. Mike said: Support for CONT_PMD/_PTE was added with bb9dd3df8ee9 ("arm64: hugetlb: refactor find_num_contig()"). Patch series "Support for contiguous pte hugepages", v4. However, I do not believe these code paths were executed until migration support was added with 5480280d3f2d ("arm64/mm: enable HugeTLB migration for contiguous bit HugeTLB pages") I would go with 5480280d3f2d for the Fixes: target.
Link: https://lkml.kernel.org/r/635f43bdd85ac2615a58405da82b4d33c6e5eb05.1662017562.git.baolin.wang@linux.alibaba.com Fixes: 5480280d3f2d ("arm64/mm: enable HugeTLB migration for contiguous bit HugeTLB pages") Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com> Suggested-by: Mike Kravetz <mike.kravetz@oracle.com> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> Cc: David Hildenbrand <david@redhat.com> Cc: Muchun Song <songmuchun@bytedance.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- include/linux/hugetlb.h | 8 ++++---- mm/gup.c | 14 +++++++++++++- mm/hugetlb.c | 27 +++++++++++++-------------- 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3ec981a0d8b3a..67c88b82fc32d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -207,8 +207,8 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, struct page *follow_huge_pd(struct vm_area_struct *vma, unsigned long address, hugepd_t hpd, int flags, int pdshift); -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int flags); +struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, + int flags); struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int flags); struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, @@ -312,8 +312,8 @@ static inline struct page *follow_huge_pd(struct vm_area_struct *vma, return NULL; } -static inline struct page *follow_huge_pmd(struct mm_struct *mm, - unsigned long address, pmd_t *pmd, int flags) +static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, + unsigned long address, int flags) { return NULL; } diff --git a/mm/gup.c b/mm/gup.c index 00926abb44263..251cb6a10bc0d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -530,6 +530,18 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, 
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); + + /* + * Considering PTE level hugetlb, like continuous-PTE hugetlb on + * ARM64 architecture. + */ + if (is_vm_hugetlb_page(vma)) { + page = follow_huge_pmd_pte(vma, address, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); @@ -662,7 +674,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, if (pmd_none(pmdval)) return no_page_table(vma, flags); if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) { - page = follow_huge_pmd(mm, address, pmd, flags); + page = follow_huge_pmd_pte(vma, address, flags); if (page) return page; return no_page_table(vma, flags); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0bdfc7e1c933f..9564bf817e6a8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6946,12 +6946,13 @@ follow_huge_pd(struct vm_area_struct *vma, } struct page * __weak -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int flags) +follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags) { + struct hstate *h = hstate_vma(vma); + struct mm_struct *mm = vma->vm_mm; struct page *page = NULL; spinlock_t *ptl; - pte_t pte; + pte_t *ptep, pte; /* * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via @@ -6961,17 +6962,15 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; retry: - ptl = pmd_lockptr(mm, pmd); - spin_lock(ptl); - /* - * make sure that the address range covered by this pmd is not - * unmapped from other threads. 
- */ - if (!pmd_huge(*pmd)) - goto out; - pte = huge_ptep_get((pte_t *)pmd); + ptep = huge_pte_offset(mm, address, huge_page_size(h)); + if (!ptep) + return NULL; + + ptl = huge_pte_lock(h, mm, ptep); + pte = huge_ptep_get(ptep); if (pte_present(pte)) { - page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); + page = pte_page(pte) + + ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); /* * try_grab_page() should always succeed here, because: a) we * hold the pmd (ptl) lock, and b) we've just checked that the @@ -6987,7 +6986,7 @@ retry: } else { if (is_hugetlb_entry_migration(pte)) { spin_unlock(ptl); - __migration_entry_wait_huge((pte_t *)pmd, ptl); + __migration_entry_wait_huge(ptep, ptl); goto retry; } /* -- GitLab From b1f44cdabad8c50cd72d6b6731e9fdf3730a8f4f Mon Sep 17 00:00:00 2001 From: SeongJae Park <sj@kernel.org> Date: Sun, 2 Oct 2022 19:31:30 +0000 Subject: [PATCH 1767/2223] mm/damon/core: initialize damon_target->list in damon_new_target() 'struct damon_target' creation function, 'damon_new_target()' is not initializing its '->list' field, unlike other DAMON structs creator functions such as 'damon_new_region()'. Normal users of 'damon_new_target()' initializes the field by adding the target to DAMON context's targets list, but some code could access the uninitialized field. This commit avoids the case by initializing the field in 'damon_new_target()'. 
Link: https://lkml.kernel.org/r/20221002193130.8227-1-sj@kernel.org Fixes: f23b8eee1871 ("mm/damon/core: implement region-based sampling") Signed-off-by: SeongJae Park <sj@kernel.org> Reported-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/damon/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 7d25dc582fe34..4cbe7867b547c 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -313,6 +313,7 @@ struct damon_target *damon_new_target(void) t->pid = NULL; t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); + INIT_LIST_HEAD(&t->list); return t; } -- GitLab From d325dc6eb763c10f591c239550b8c7e5466a5d09 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi <konishi.ryusuke@gmail.com> Date: Tue, 4 Oct 2022 00:05:19 +0900 Subject: [PATCH 1768/2223] nilfs2: fix use-after-free bug of struct nilfs_root If the beginning of the inode bitmap area is corrupted on disk, an inode with the same inode number as the root inode can be allocated and fail soon after. In this case, the subsequent call to nilfs_clear_inode() on that bogus root inode will wrongly decrement the reference counter of struct nilfs_root, and this will erroneously free struct nilfs_root, causing kernel oopses. This fixes the problem by changing nilfs_new_inode() to skip reserved inode numbers while repairing the inode bitmap. 
Link: https://lkml.kernel.org/r/20221003150519.39789-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Reported-by: syzbot+b8c672b0e22615c80fe0@syzkaller.appspotmail.com Reported-by: Khalid Masum <khalid.masum.92@gmail.com> Tested-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/nilfs2/inode.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 67f63cfeade5c..b074144f6f834 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -328,6 +328,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) struct inode *inode; struct nilfs_inode_info *ii; struct nilfs_root *root; + struct buffer_head *bh; int err = -ENOMEM; ino_t ino; @@ -343,11 +344,25 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) ii->i_state = BIT(NILFS_I_NEW); ii->i_root = root; - err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh); + err = nilfs_ifile_create_inode(root->ifile, &ino, &bh); if (unlikely(err)) goto failed_ifile_create_inode; /* reference count of i_bh inherits from nilfs_mdt_read_block() */ + if (unlikely(ino < NILFS_USER_INO)) { + nilfs_warn(sb, + "inode bitmap is inconsistent for reserved inodes"); + do { + brelse(bh); + err = nilfs_ifile_create_inode(root->ifile, &ino, &bh); + if (unlikely(err)) + goto failed_ifile_create_inode; + } while (ino < NILFS_USER_INO); + + nilfs_info(sb, "repaired inode bitmap for reserved inodes"); + } + ii->i_bh = bh; + atomic64_inc(&root->inodes_count); inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_ino = ino; -- GitLab From 21a87d88c2253350e115029f14fe2a10a7e6c856 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi <konishi.ryusuke@gmail.com> Date: Sun, 2 Oct 2022 12:08:04 +0900 Subject: [PATCH 1769/2223] nilfs2: fix NULL pointer dereference at nilfs_bmap_lookup_at_level() If the i_mode field in inode of 
metadata files is corrupted on disk, it can cause the initialization of bmap structure, which should have been called from nilfs_read_inode_common(), not to be called. This causes a lockdep warning followed by a NULL pointer dereference at nilfs_bmap_lookup_at_level(). This patch fixes these issues by adding a missing sanity check for the i_mode field of metadata file's inode. Link: https://lkml.kernel.org/r/20221002030804.29978-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Reported-by: syzbot+2b32eb36c1a825b7a74c@syzkaller.appspotmail.com Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Tested-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/nilfs2/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b074144f6f834..232dd7b6cca14 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -455,6 +455,8 @@ int nilfs_read_inode_common(struct inode *inode, inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode)) + return -EIO; /* this inode is for metadata and corrupted */ if (inode->i_nlink == 0) return -ESTALE; /* this inode is deleted */ -- GitLab From d0d51a97063db4704a5ef6bc978dddab1636a306 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi <konishi.ryusuke@gmail.com> Date: Fri, 7 Oct 2022 17:52:26 +0900 Subject: [PATCH 1770/2223] nilfs2: fix leak of nilfs_root in case of writer thread creation failure If nilfs_attach_log_writer() failed to create a log writer thread, it frees a data structure of the log writer without any cleanup. 
After commit e912a5b66837 ("nilfs2: use root object to get ifile"), this causes a leak of struct nilfs_root, which started to leak an ifile metadata inode and a kobject on that struct. In addition, if the kernel is booted with panic_on_warn, the above ifile metadata inode leak will cause the following panic when the nilfs2 kernel module is removed: kmem_cache_destroy nilfs2_inode_cache: Slab cache still has objects when called from nilfs_destroy_cachep+0x16/0x3a [nilfs2] WARNING: CPU: 8 PID: 1464 at mm/slab_common.c:494 kmem_cache_destroy+0x138/0x140 ... RIP: 0010:kmem_cache_destroy+0x138/0x140 Code: 00 20 00 00 e8 a9 55 d8 ff e9 76 ff ff ff 48 8b 53 60 48 c7 c6 20 70 65 86 48 c7 c7 d8 69 9c 86 48 8b 4c 24 28 e8 ef 71 c7 00 <0f> 0b e9 53 ff ff ff c3 48 81 ff ff 0f 00 00 77 03 31 c0 c3 53 48 ... Call Trace: <TASK> ? nilfs_palloc_freev.cold.24+0x58/0x58 [nilfs2] nilfs_destroy_cachep+0x16/0x3a [nilfs2] exit_nilfs_fs+0xa/0x1b [nilfs2] __x64_sys_delete_module+0x1d9/0x3a0 ? __sanitizer_cov_trace_pc+0x1a/0x50 ? syscall_trace_enter.isra.19+0x119/0x190 do_syscall_64+0x34/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd ... </TASK> Kernel panic - not syncing: panic_on_warn set ... This patch fixes these issues by calling nilfs_detach_log_writer() cleanup function if spawning the log writer thread fails. 
Link: https://lkml.kernel.org/r/20221007085226.57667-1-konishi.ryusuke@gmail.com Fixes: e912a5b66837 ("nilfs2: use root object to get ifile") Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Reported-by: syzbot+7381dc4ad60658ca4c05@syzkaller.appspotmail.com Tested-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/nilfs2/segment.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 0afe0832c7547..5276ab525f010 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2786,10 +2786,9 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root) inode_attach_wb(nilfs->ns_bdev->bd_inode, NULL); err = nilfs_segctor_start_thread(nilfs->ns_writer); - if (err) { - kfree(nilfs->ns_writer); - nilfs->ns_writer = NULL; - } + if (unlikely(err)) + nilfs_detach_log_writer(sb); + return err; } -- GitLab From 10f6913c548b32ecb73801a16b120e761c6957ea Mon Sep 17 00:00:00 2001 From: Wenting Zhang <zephray@outlook.com> Date: Fri, 8 Jul 2022 16:38:22 -0400 Subject: [PATCH 1771/2223] riscv: always honor the CONFIG_CMDLINE_FORCE when parsing dtb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_CMDLINE_FORCE is enabled, the cmdline provided by CONFIG_CMDLINE is always used. This allows CONFIG_CMDLINE to be used regardless of the result of device tree scanning. This especially fixes the case where a device tree without the chosen node is supplied to the kernel. In such cases, early_init_dt_scan would return true. But inside early_init_dt_scan_chosen, the cmdline won't be updated as there is no chosen node in the device tree. As a result, CONFIG_CMDLINE is not copied into boot_command_line even if CONFIG_CMDLINE_FORCE is enabled. This commit allows boot_command_line to be properly updated in this situation. 
Fixes: 8fd6e05c7463 ("arch: riscv: support kernel command line forcing when no DTB passed") Signed-off-by: Wenting Zhang <zephray@outlook.com> Reviewed-by: Björn Töpel <bjorn@kernel.org> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/PSBPR04MB399135DFC54928AB958D0638B1829@PSBPR04MB3991.apcprd04.prod.outlook.com Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- arch/riscv/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 2dfc463b86bb3..ad76bb59b0590 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -252,10 +252,10 @@ static void __init parse_dtb(void) pr_info("Machine model: %s\n", name); dump_stack_set_arch_desc("%s (DT)", name); } - return; + } else { + pr_err("No DTB passed to the kernel\n"); } - pr_err("No DTB passed to the kernel\n"); #ifdef CONFIG_CMDLINE_FORCE strscpy(boot_command_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); pr_info("Forcing kernel command line to: %s\n", boot_command_line); -- GitLab From 49b0dea1eb5e0fd5e498a2c2ce50d2e036494072 Mon Sep 17 00:00:00 2001 From: Stefan Binding <sbinding@opensource.cirrus.com> Date: Tue, 11 Oct 2022 15:35:48 +0100 Subject: [PATCH 1772/2223] ALSA: hda: hda_cs_dsp_ctl: Minor clean and redundant code removal The cs_dsp core will return an error if passed a NULL cs_dsp struct so there is no need for the hda_cs_dsp_write|read_ctl functions to manually check that. The cs_dsp core will also check the data is within bounds of the control so the additional bounds check is redundant too. Simplify things a bit by removing said code. 
Signed-off-by: Stefan Binding <sbinding@opensource.cirrus.com> Link: https://lore.kernel.org/r/20221011143552.621792-2-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai <tiwai@suse.de> --- sound/pci/hda/hda_cs_dsp_ctl.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/sound/pci/hda/hda_cs_dsp_ctl.c b/sound/pci/hda/hda_cs_dsp_ctl.c index 89ee549cb7d50..41d3e8fd289d7 100644 --- a/sound/pci/hda/hda_cs_dsp_ctl.c +++ b/sound/pci/hda/hda_cs_dsp_ctl.c @@ -199,16 +199,10 @@ EXPORT_SYMBOL_NS_GPL(hda_cs_dsp_control_remove, SND_HDA_CS_DSP_CONTROLS); int hda_cs_dsp_write_ctl(struct cs_dsp *dsp, const char *name, int type, unsigned int alg, const void *buf, size_t len) { - struct cs_dsp_coeff_ctl *cs_ctl; + struct cs_dsp_coeff_ctl *cs_ctl = cs_dsp_get_ctl(dsp, name, type, alg); struct hda_cs_dsp_coeff_ctl *ctl; int ret; - cs_ctl = cs_dsp_get_ctl(dsp, name, type, alg); - if (!cs_ctl) - return -EINVAL; - - ctl = cs_ctl->priv; - ret = cs_dsp_coeff_write_ctrl(cs_ctl, 0, buf, len); if (ret) return ret; @@ -216,6 +210,8 @@ int hda_cs_dsp_write_ctl(struct cs_dsp *dsp, const char *name, int type, if (cs_ctl->flags & WMFW_CTL_FLAG_SYS) return 0; + ctl = cs_ctl->priv; + snd_ctl_notify(ctl->card, SNDRV_CTL_EVENT_MASK_VALUE, &ctl->kctl->id); return 0; @@ -225,13 +221,8 @@ EXPORT_SYMBOL_NS_GPL(hda_cs_dsp_write_ctl, SND_HDA_CS_DSP_CONTROLS); int hda_cs_dsp_read_ctl(struct cs_dsp *dsp, const char *name, int type, unsigned int alg, void *buf, size_t len) { - struct cs_dsp_coeff_ctl *cs_ctl; - - cs_ctl = cs_dsp_get_ctl(dsp, name, type, alg); - if (!cs_ctl) - return -EINVAL; + return cs_dsp_coeff_read_ctrl(cs_dsp_get_ctl(dsp, name, type, alg), 0, buf, len); - return cs_dsp_coeff_read_ctrl(cs_ctl, 0, buf, len); } EXPORT_SYMBOL_NS_GPL(hda_cs_dsp_read_ctl, SND_HDA_CS_DSP_CONTROLS); -- GitLab From 06f3a0a758c4246dc644e22fb33f85c6e5f92af6 Mon Sep 17 00:00:00 2001 From: Stefan Binding <sbinding@opensource.cirrus.com> Date: Tue, 11 Oct 2022 15:35:49 +0100 
Subject: [PATCH 1773/2223] ALSA: hda: hda_cs_dsp_ctl: Ensure pwr_lock is held before reading/writing controls These apis require the pwr_lock to be held. Signed-off-by: Stefan Binding <sbinding@opensource.cirrus.com> Link: https://lore.kernel.org/r/20221011143552.621792-3-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai <tiwai@suse.de> --- sound/pci/hda/hda_cs_dsp_ctl.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/hda_cs_dsp_ctl.c b/sound/pci/hda/hda_cs_dsp_ctl.c index 41d3e8fd289d7..75fb691858172 100644 --- a/sound/pci/hda/hda_cs_dsp_ctl.c +++ b/sound/pci/hda/hda_cs_dsp_ctl.c @@ -199,11 +199,14 @@ EXPORT_SYMBOL_NS_GPL(hda_cs_dsp_control_remove, SND_HDA_CS_DSP_CONTROLS); int hda_cs_dsp_write_ctl(struct cs_dsp *dsp, const char *name, int type, unsigned int alg, const void *buf, size_t len) { - struct cs_dsp_coeff_ctl *cs_ctl = cs_dsp_get_ctl(dsp, name, type, alg); + struct cs_dsp_coeff_ctl *cs_ctl; struct hda_cs_dsp_coeff_ctl *ctl; int ret; + mutex_lock(&dsp->pwr_lock); + cs_ctl = cs_dsp_get_ctl(dsp, name, type, alg); ret = cs_dsp_coeff_write_ctrl(cs_ctl, 0, buf, len); + mutex_unlock(&dsp->pwr_lock); if (ret) return ret; @@ -221,7 +224,13 @@ EXPORT_SYMBOL_NS_GPL(hda_cs_dsp_write_ctl, SND_HDA_CS_DSP_CONTROLS); int hda_cs_dsp_read_ctl(struct cs_dsp *dsp, const char *name, int type, unsigned int alg, void *buf, size_t len) { - return cs_dsp_coeff_read_ctrl(cs_dsp_get_ctl(dsp, name, type, alg), 0, buf, len); + int ret; + + mutex_lock(&dsp->pwr_lock); + ret = cs_dsp_coeff_read_ctrl(cs_dsp_get_ctl(dsp, name, type, alg), 0, buf, len); + mutex_unlock(&dsp->pwr_lock); + + return ret; } EXPORT_SYMBOL_NS_GPL(hda_cs_dsp_read_ctl, SND_HDA_CS_DSP_CONTROLS); -- GitLab From 2176c6b599dba55a640cffec0182c0b6bab680d1 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald <rf@opensource.cirrus.com> Date: Tue, 11 Oct 2022 15:35:50 +0100 Subject: [PATCH 1774/2223] ALSA: hda/cs_dsp_ctl: Fix mutex inversion when creating controls 
Redesign the creation of ALSA controls so that the cs_dsp pwr_lock is not held when calling snd_ctl_add(). Instead of creating the ALSA control from the cs_dsp control_add callback, do it after cs_dsp_power_up() has completed. The existing functions are changed to return void instead of passing errors back - this duplicates the original behaviour, as cs_dsp does not abort firmware load if creation of a control fails. It is safe to walk the control list without taking any mutex provided that the caller is not trying to load a new firmware or remove the driver in parallel. There is no other situation that the list can change. So the caller can trigger creation of ALSA controls after cs_dsp_power_up() has returned. A cs_dsp control will have a non-NULL priv pointer if we have created an ALSA control. With the previous code the ALSA controls were created from the cs_dsp control_add callback. But this is called with pwr_lock held (as it is part of the DSP power-up sequence). The kernel lock checking will show a mutex inversion between this and the control creation path: control_add pwr_lock held, takes controls_rwsem (in snd_ctl_add) get/put controls_rwsem held, takes pwr_lock to call cs_dsp. This is not completely theoretical. Although the time window is very small, it is possible for these to run in parallel and deadlock the old implementation. 
Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com> Signed-off-by: Stefan Binding <sbinding@opensource.cirrus.com> Link: https://lore.kernel.org/r/20221011143552.621792-4-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai <tiwai@suse.de> --- sound/pci/hda/cs35l41_hda.c | 8 ++--- sound/pci/hda/hda_cs_dsp_ctl.c | 59 ++++++++++++++++++++-------------- sound/pci/hda/hda_cs_dsp_ctl.h | 2 +- 3 files changed, 40 insertions(+), 29 deletions(-) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index 3952f28537034..102ac4a94a9d6 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -91,20 +91,18 @@ static const struct reg_sequence cs35l41_hda_mute[] = { { CS35L41_AMP_DIG_VOL_CTRL, 0x0000A678 }, // AMP_VOL_PCM Mute }; -static int cs35l41_control_add(struct cs_dsp_coeff_ctl *cs_ctl) +static void cs35l41_add_controls(struct cs35l41_hda *cs35l41) { - struct cs35l41_hda *cs35l41 = container_of(cs_ctl->dsp, struct cs35l41_hda, cs_dsp); struct hda_cs_dsp_ctl_info info; info.device_name = cs35l41->amp_name; info.fw_type = cs35l41->firmware_type; info.card = cs35l41->codec->card; - return hda_cs_dsp_control_add(cs_ctl, &info); + hda_cs_dsp_add_controls(&cs35l41->cs_dsp, &info); } static const struct cs_dsp_client_ops client_ops = { - .control_add = cs35l41_control_add, .control_remove = hda_cs_dsp_control_remove, }; @@ -435,6 +433,8 @@ static int cs35l41_init_dsp(struct cs35l41_hda *cs35l41) if (ret) goto err_release; + cs35l41_add_controls(cs35l41); + ret = cs35l41_save_calibration(cs35l41); err_release: diff --git a/sound/pci/hda/hda_cs_dsp_ctl.c b/sound/pci/hda/hda_cs_dsp_ctl.c index 75fb691858172..1622a22f96f6a 100644 --- a/sound/pci/hda/hda_cs_dsp_ctl.c +++ b/sound/pci/hda/hda_cs_dsp_ctl.c @@ -97,7 +97,7 @@ static unsigned int wmfw_convert_flags(unsigned int in) return out; } -static int hda_cs_dsp_add_kcontrol(struct hda_cs_dsp_coeff_ctl *ctl, const char *name) +static void hda_cs_dsp_add_kcontrol(struct 
hda_cs_dsp_coeff_ctl *ctl, const char *name) { struct cs_dsp_coeff_ctl *cs_ctl = ctl->cs_ctl; struct snd_kcontrol_new kcontrol = {0}; @@ -107,7 +107,7 @@ static int hda_cs_dsp_add_kcontrol(struct hda_cs_dsp_coeff_ctl *ctl, const char if (cs_ctl->len > ADSP_MAX_STD_CTRL_SIZE) { dev_err(cs_ctl->dsp->dev, "KControl %s: length %zu exceeds maximum %d\n", name, cs_ctl->len, ADSP_MAX_STD_CTRL_SIZE); - return -EINVAL; + return; } kcontrol.name = name; @@ -120,24 +120,21 @@ static int hda_cs_dsp_add_kcontrol(struct hda_cs_dsp_coeff_ctl *ctl, const char /* Save ctl inside private_data, ctl is owned by cs_dsp, * and will be freed when cs_dsp removes the control */ kctl = snd_ctl_new1(&kcontrol, (void *)ctl); - if (!kctl) { - ret = -ENOMEM; - return ret; - } + if (!kctl) + return; ret = snd_ctl_add(ctl->card, kctl); if (ret) { dev_err(cs_ctl->dsp->dev, "Failed to add KControl %s = %d\n", kcontrol.name, ret); - return ret; + return; } dev_dbg(cs_ctl->dsp->dev, "Added KControl: %s\n", kcontrol.name); ctl->kctl = kctl; - - return 0; } -int hda_cs_dsp_control_add(struct cs_dsp_coeff_ctl *cs_ctl, struct hda_cs_dsp_ctl_info *info) +static void hda_cs_dsp_control_add(struct cs_dsp_coeff_ctl *cs_ctl, + const struct hda_cs_dsp_ctl_info *info) { struct cs_dsp *cs_dsp = cs_ctl->dsp; char name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN]; @@ -145,13 +142,10 @@ int hda_cs_dsp_control_add(struct cs_dsp_coeff_ctl *cs_ctl, struct hda_cs_dsp_ct const char *region_name; int ret; - if (cs_ctl->flags & WMFW_CTL_FLAG_SYS) - return 0; - region_name = cs_dsp_mem_region_name(cs_ctl->alg_region.type); if (!region_name) { - dev_err(cs_dsp->dev, "Unknown region type: %d\n", cs_ctl->alg_region.type); - return -EINVAL; + dev_warn(cs_dsp->dev, "Unknown region type: %d\n", cs_ctl->alg_region.type); + return; } ret = scnprintf(name, SNDRV_CTL_ELEM_ID_NAME_MAXLEN, "%s %s %.12s %x", info->device_name, @@ -171,22 +165,39 @@ int hda_cs_dsp_control_add(struct cs_dsp_coeff_ctl *cs_ctl, struct hda_cs_dsp_ct ctl = 
kzalloc(sizeof(*ctl), GFP_KERNEL); if (!ctl) - return -ENOMEM; + return; ctl->cs_ctl = cs_ctl; ctl->card = info->card; cs_ctl->priv = ctl; - ret = hda_cs_dsp_add_kcontrol(ctl, name); - if (ret) { - dev_err(cs_dsp->dev, "Error (%d) adding control %s\n", ret, name); - kfree(ctl); - return ret; - } + hda_cs_dsp_add_kcontrol(ctl, name); +} - return 0; +void hda_cs_dsp_add_controls(struct cs_dsp *dsp, const struct hda_cs_dsp_ctl_info *info) +{ + struct cs_dsp_coeff_ctl *cs_ctl; + + /* + * pwr_lock would cause mutex inversion with ALSA control lock compared + * to the get/put functions. + * It is safe to walk the list without holding a mutex because entries + * are persistent and only cs_dsp_power_up() or cs_dsp_remove() can + * change the list. + */ + lockdep_assert_not_held(&dsp->pwr_lock); + + list_for_each_entry(cs_ctl, &dsp->ctl_list, list) { + if (cs_ctl->flags & WMFW_CTL_FLAG_SYS) + continue; + + if (cs_ctl->priv) + continue; + + hda_cs_dsp_control_add(cs_ctl, info); + } } -EXPORT_SYMBOL_NS_GPL(hda_cs_dsp_control_add, SND_HDA_CS_DSP_CONTROLS); +EXPORT_SYMBOL_NS_GPL(hda_cs_dsp_add_controls, SND_HDA_CS_DSP_CONTROLS); void hda_cs_dsp_control_remove(struct cs_dsp_coeff_ctl *cs_ctl) { diff --git a/sound/pci/hda/hda_cs_dsp_ctl.h b/sound/pci/hda/hda_cs_dsp_ctl.h index 4babc69cf2f0c..2cf93359c4f23 100644 --- a/sound/pci/hda/hda_cs_dsp_ctl.h +++ b/sound/pci/hda/hda_cs_dsp_ctl.h @@ -29,7 +29,7 @@ struct hda_cs_dsp_ctl_info { extern const char * const hda_cs_dsp_fw_ids[HDA_CS_DSP_NUM_FW]; -int hda_cs_dsp_control_add(struct cs_dsp_coeff_ctl *cs_ctl, struct hda_cs_dsp_ctl_info *info); +void hda_cs_dsp_add_controls(struct cs_dsp *dsp, const struct hda_cs_dsp_ctl_info *info); void hda_cs_dsp_control_remove(struct cs_dsp_coeff_ctl *cs_ctl); int hda_cs_dsp_write_ctl(struct cs_dsp *dsp, const char *name, int type, unsigned int alg, const void *buf, size_t len); -- GitLab From 23904f7b2518e9b6bbfe2ac7bbe9e284bcdda18e Mon Sep 17 00:00:00 2001 From: Stefan Binding 
<sbinding@opensource.cirrus.com> Date: Tue, 11 Oct 2022 15:35:51 +0100 Subject: [PATCH 1775/2223] ALSA: hda: cs35l41: Remove suspend/resume hda hooks The current code uses calls from the HDA Codec driver to determine when to suspend/resume by calling hooks via the hda_component binding. However, this means the cs35l41 driver relies on the HDA Codec driver to tell it when to suspend or resume, creating an additional external dependency, and potentially creating race conditions in the future. It is better for the cs35l41 hda driver to decide for itself when the part should be suspended or resumed. This makes supporting system suspend easier. Signed-off-by: Stefan Binding <sbinding@opensource.cirrus.com> Link: https://lore.kernel.org/r/20221011143552.621792-5-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai <tiwai@suse.de> --- sound/pci/hda/cs35l41_hda.c | 31 ++++++++++++------------------- sound/pci/hda/hda_component.h | 2 -- sound/pci/hda/patch_realtek.c | 19 +------------------ 3 files changed, 13 insertions(+), 39 deletions(-) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index 102ac4a94a9d6..89f6b4a28d3d7 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -487,10 +487,10 @@ static void cs35l41_hda_playback_hook(struct device *dev, int action) struct regmap *reg = cs35l41->regmap; int ret = 0; - mutex_lock(&cs35l41->fw_mutex); - switch (action) { case HDA_GEN_PCM_ACT_OPEN: + pm_runtime_get_sync(dev); + mutex_lock(&cs35l41->fw_mutex); cs35l41->playback_started = true; if (cs35l41->firmware_running) { regmap_multi_reg_write(reg, cs35l41_hda_config_dsp, @@ -508,15 +508,21 @@ static void cs35l41_hda_playback_hook(struct device *dev, int action) CS35L41_AMP_EN_MASK, 1 << CS35L41_AMP_EN_SHIFT); if (cs35l41->hw_cfg.bst_type == CS35L41_EXT_BOOST) regmap_write(reg, CS35L41_GPIO1_CTRL1, 0x00008001); + mutex_unlock(&cs35l41->fw_mutex); break; case HDA_GEN_PCM_ACT_PREPARE: + mutex_lock(&cs35l41->fw_mutex); ret = 
cs35l41_global_enable(reg, cs35l41->hw_cfg.bst_type, 1); + mutex_unlock(&cs35l41->fw_mutex); break; case HDA_GEN_PCM_ACT_CLEANUP: + mutex_lock(&cs35l41->fw_mutex); regmap_multi_reg_write(reg, cs35l41_hda_mute, ARRAY_SIZE(cs35l41_hda_mute)); ret = cs35l41_global_enable(reg, cs35l41->hw_cfg.bst_type, 0); + mutex_unlock(&cs35l41->fw_mutex); break; case HDA_GEN_PCM_ACT_CLOSE: + mutex_lock(&cs35l41->fw_mutex); ret = regmap_update_bits(reg, CS35L41_PWR_CTRL2, CS35L41_AMP_EN_MASK, 0 << CS35L41_AMP_EN_SHIFT); if (cs35l41->hw_cfg.bst_type == CS35L41_EXT_BOOST) @@ -530,14 +536,16 @@ static void cs35l41_hda_playback_hook(struct device *dev, int action) } cs35l41_irq_release(cs35l41); cs35l41->playback_started = false; + mutex_unlock(&cs35l41->fw_mutex); + + pm_runtime_mark_last_busy(dev); + pm_runtime_put_autosuspend(dev); break; default: dev_warn(cs35l41->dev, "Playback action not supported: %d\n", action); break; } - mutex_unlock(&cs35l41->fw_mutex); - if (ret) dev_err(cs35l41->dev, "Regmap access fail: %d\n", ret); } @@ -618,19 +626,6 @@ static int cs35l41_runtime_resume(struct device *dev) return 0; } -static int cs35l41_hda_suspend_hook(struct device *dev) -{ - dev_dbg(dev, "Request Suspend\n"); - pm_runtime_mark_last_busy(dev); - return pm_runtime_put_autosuspend(dev); -} - -static int cs35l41_hda_resume_hook(struct device *dev) -{ - dev_dbg(dev, "Request Resume\n"); - return pm_runtime_get_sync(dev); -} - static int cs35l41_smart_amp(struct cs35l41_hda *cs35l41) { int halo_sts; @@ -863,8 +858,6 @@ static int cs35l41_hda_bind(struct device *dev, struct device *master, void *mas ret = cs35l41_create_controls(cs35l41); comps->playback_hook = cs35l41_hda_playback_hook; - comps->suspend_hook = cs35l41_hda_suspend_hook; - comps->resume_hook = cs35l41_hda_resume_hook; pm_runtime_mark_last_busy(dev); pm_runtime_put_autosuspend(dev); diff --git a/sound/pci/hda/hda_component.h b/sound/pci/hda/hda_component.h index 1223621bd62ca..534e845b9cd1d 100644 --- 
a/sound/pci/hda/hda_component.h +++ b/sound/pci/hda/hda_component.h @@ -16,6 +16,4 @@ struct hda_component { char name[HDA_MAX_NAME_SIZE]; struct hda_codec *codec; void (*playback_hook)(struct device *dev, int action); - int (*suspend_hook)(struct device *dev); - int (*resume_hook)(struct device *dev); }; diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 4b076912bbf4b..e6c4bb5fa041a 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -4022,22 +4022,16 @@ static void alc5505_dsp_init(struct hda_codec *codec) static int alc269_suspend(struct hda_codec *codec) { struct alc_spec *spec = codec->spec; - int i; if (spec->has_alc5505_dsp) alc5505_dsp_suspend(codec); - for (i = 0; i < HDA_MAX_COMPONENTS; i++) - if (spec->comps[i].suspend_hook) - spec->comps[i].suspend_hook(spec->comps[i].dev); - return alc_suspend(codec); } static int alc269_resume(struct hda_codec *codec) { struct alc_spec *spec = codec->spec; - int i; if (spec->codec_variant == ALC269_TYPE_ALC269VB) alc269vb_toggle_power_output(codec, 0); @@ -4068,10 +4062,6 @@ static int alc269_resume(struct hda_codec *codec) if (spec->has_alc5505_dsp) alc5505_dsp_resume(codec); - for (i = 0; i < HDA_MAX_COMPONENTS; i++) - if (spec->comps[i].resume_hook) - spec->comps[i].resume_hook(spec->comps[i].dev); - return 0; } #endif /* CONFIG_PM */ @@ -6664,19 +6654,12 @@ static int comp_bind(struct device *dev) { struct hda_codec *cdc = dev_to_hda_codec(dev); struct alc_spec *spec = cdc->spec; - int ret, i; + int ret; ret = component_bind_all(dev, spec->comps); if (ret) return ret; - if (snd_hdac_is_power_on(&cdc->core)) { - codec_dbg(cdc, "Resuming after bind.\n"); - for (i = 0; i < HDA_MAX_COMPONENTS; i++) - if (spec->comps[i].resume_hook) - spec->comps[i].resume_hook(spec->comps[i].dev); - } - return 0; } -- GitLab From 88672826e2a465d2f4c0a50fb5ced2956f4ffcbc Mon Sep 17 00:00:00 2001 From: Stefan Binding <sbinding@opensource.cirrus.com> Date: Tue, 11 Oct 2022 
15:35:52 +0100 Subject: [PATCH 1776/2223] ALSA: hda: cs35l41: Support System Suspend Add support for system suspend into the CS35L41 HDA Driver. Since S4 suspend may power off the system, it is required that the driver ensure the part is safe to be shutdown before system suspend, as well as ensuring that the firmware is unloaded before shutdown. The part must then be restored on system resume, including re-downloading the firmware. Signed-off-by: Stefan Binding <sbinding@opensource.cirrus.com> Link: https://lore.kernel.org/r/20221011143552.621792-6-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai <tiwai@suse.de> --- sound/pci/hda/cs35l41_hda.c | 160 ++++++++++++++++++++++++++++++------ 1 file changed, 136 insertions(+), 24 deletions(-) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index 89f6b4a28d3d7..e5f0549bf06d0 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -461,9 +461,12 @@ static void cs35l41_remove_dsp(struct cs35l41_hda *cs35l41) struct cs_dsp *dsp = &cs35l41->cs_dsp; cancel_work_sync(&cs35l41->fw_load_work); + + mutex_lock(&cs35l41->fw_mutex); cs35l41_shutdown_dsp(cs35l41); cs_dsp_remove(dsp); cs35l41->halo_initialized = false; + mutex_unlock(&cs35l41->fw_mutex); } /* Protection release cycle to get the speaker out of Safe-Mode */ @@ -570,45 +573,148 @@ static int cs35l41_hda_channel_map(struct device *dev, unsigned int tx_num, unsi rx_slot); } +static void cs35l41_ready_for_reset(struct cs35l41_hda *cs35l41) +{ + mutex_lock(&cs35l41->fw_mutex); + if (cs35l41->firmware_running) { + + regcache_cache_only(cs35l41->regmap, false); + + cs35l41_exit_hibernate(cs35l41->dev, cs35l41->regmap); + cs35l41_shutdown_dsp(cs35l41); + cs35l41_safe_reset(cs35l41->regmap, cs35l41->hw_cfg.bst_type); + + regcache_cache_only(cs35l41->regmap, true); + regcache_mark_dirty(cs35l41->regmap); + } + mutex_unlock(&cs35l41->fw_mutex); +} + +static int cs35l41_system_suspend(struct device *dev) +{ + struct cs35l41_hda 
*cs35l41 = dev_get_drvdata(dev); + int ret; + + dev_dbg(cs35l41->dev, "System Suspend\n"); + + if (cs35l41->hw_cfg.bst_type == CS35L41_EXT_BOOST_NO_VSPK_SWITCH) { + dev_err(cs35l41->dev, "System Suspend not supported\n"); + return -EINVAL; + } + + ret = pm_runtime_force_suspend(dev); + if (ret) + return ret; + + /* Shutdown DSP before system suspend */ + cs35l41_ready_for_reset(cs35l41); + + /* + * Reset GPIO may be shared, so cannot reset here. + * However beyond this point, amps may be powered down. + */ + return 0; +} + +static int cs35l41_system_resume(struct device *dev) +{ + struct cs35l41_hda *cs35l41 = dev_get_drvdata(dev); + int ret; + + dev_dbg(cs35l41->dev, "System Resume\n"); + + if (cs35l41->hw_cfg.bst_type == CS35L41_EXT_BOOST_NO_VSPK_SWITCH) { + dev_err(cs35l41->dev, "System Resume not supported\n"); + return -EINVAL; + } + + if (cs35l41->reset_gpio) { + usleep_range(2000, 2100); + gpiod_set_value_cansleep(cs35l41->reset_gpio, 1); + } + + usleep_range(2000, 2100); + + ret = pm_runtime_force_resume(dev); + + mutex_lock(&cs35l41->fw_mutex); + if (!ret && cs35l41->request_fw_load && !cs35l41->fw_request_ongoing) { + cs35l41->fw_request_ongoing = true; + schedule_work(&cs35l41->fw_load_work); + } + mutex_unlock(&cs35l41->fw_mutex); + + return ret; +} + static int cs35l41_runtime_suspend(struct device *dev) { struct cs35l41_hda *cs35l41 = dev_get_drvdata(dev); + int ret = 0; - dev_dbg(cs35l41->dev, "Suspend\n"); + dev_dbg(cs35l41->dev, "Runtime Suspend\n"); - if (!cs35l41->firmware_running) + if (cs35l41->hw_cfg.bst_type == CS35L41_EXT_BOOST_NO_VSPK_SWITCH) { + dev_dbg(cs35l41->dev, "Runtime Suspend not supported\n"); return 0; + } - if (cs35l41_enter_hibernate(cs35l41->dev, cs35l41->regmap, cs35l41->hw_cfg.bst_type) < 0) - return 0; + mutex_lock(&cs35l41->fw_mutex); + + if (cs35l41->playback_started) { + regmap_multi_reg_write(cs35l41->regmap, cs35l41_hda_mute, + ARRAY_SIZE(cs35l41_hda_mute)); + cs35l41_global_enable(cs35l41->regmap, 
cs35l41->hw_cfg.bst_type, 0); + regmap_update_bits(cs35l41->regmap, CS35L41_PWR_CTRL2, + CS35L41_AMP_EN_MASK, 0 << CS35L41_AMP_EN_SHIFT); + if (cs35l41->hw_cfg.bst_type == CS35L41_EXT_BOOST) + regmap_write(cs35l41->regmap, CS35L41_GPIO1_CTRL1, 0x00000001); + regmap_update_bits(cs35l41->regmap, CS35L41_PWR_CTRL2, + CS35L41_VMON_EN_MASK | CS35L41_IMON_EN_MASK, + 0 << CS35L41_VMON_EN_SHIFT | 0 << CS35L41_IMON_EN_SHIFT); + cs35l41->playback_started = false; + } + + if (cs35l41->firmware_running) { + ret = cs35l41_enter_hibernate(cs35l41->dev, cs35l41->regmap, + cs35l41->hw_cfg.bst_type); + if (ret) + goto err; + } else { + cs35l41_safe_reset(cs35l41->regmap, cs35l41->hw_cfg.bst_type); + } regcache_cache_only(cs35l41->regmap, true); regcache_mark_dirty(cs35l41->regmap); - return 0; +err: + mutex_unlock(&cs35l41->fw_mutex); + + return ret; } static int cs35l41_runtime_resume(struct device *dev) { struct cs35l41_hda *cs35l41 = dev_get_drvdata(dev); - int ret; + int ret = 0; - dev_dbg(cs35l41->dev, "Resume.\n"); + dev_dbg(cs35l41->dev, "Runtime Resume\n"); if (cs35l41->hw_cfg.bst_type == CS35L41_EXT_BOOST_NO_VSPK_SWITCH) { - dev_dbg(cs35l41->dev, "System does not support Resume\n"); + dev_dbg(cs35l41->dev, "Runtime Resume not supported\n"); return 0; } - if (!cs35l41->firmware_running) - return 0; + mutex_lock(&cs35l41->fw_mutex); regcache_cache_only(cs35l41->regmap, false); - ret = cs35l41_exit_hibernate(cs35l41->dev, cs35l41->regmap); - if (ret) { - regcache_cache_only(cs35l41->regmap, true); - return ret; + if (cs35l41->firmware_running) { + ret = cs35l41_exit_hibernate(cs35l41->dev, cs35l41->regmap); + if (ret) { + dev_warn(cs35l41->dev, "Unable to exit Hibernate."); + goto err; + } } /* Test key needs to be unlocked to allow the OTP settings to re-apply */ @@ -617,13 +723,16 @@ static int cs35l41_runtime_resume(struct device *dev) cs35l41_test_key_lock(cs35l41->dev, cs35l41->regmap); if (ret) { dev_err(cs35l41->dev, "Failed to restore register cache: %d\n", ret); - 
return ret; + goto err; } if (cs35l41->hw_cfg.bst_type == CS35L41_EXT_BOOST) cs35l41_init_boost(cs35l41->dev, cs35l41->regmap, &cs35l41->hw_cfg); - return 0; +err: + mutex_unlock(&cs35l41->fw_mutex); + + return ret; } static int cs35l41_smart_amp(struct cs35l41_hda *cs35l41) @@ -673,8 +782,6 @@ clean_dsp: static void cs35l41_load_firmware(struct cs35l41_hda *cs35l41, bool load) { - pm_runtime_get_sync(cs35l41->dev); - if (cs35l41->firmware_running && !load) { dev_dbg(cs35l41->dev, "Unloading Firmware\n"); cs35l41_shutdown_dsp(cs35l41); @@ -684,9 +791,6 @@ static void cs35l41_load_firmware(struct cs35l41_hda *cs35l41, bool load) } else { dev_dbg(cs35l41->dev, "Unable to Load firmware.\n"); } - - pm_runtime_mark_last_busy(cs35l41->dev); - pm_runtime_put_autosuspend(cs35l41->dev); } static int cs35l41_fw_load_ctl_get(struct snd_kcontrol *kcontrol, @@ -702,16 +806,21 @@ static void cs35l41_fw_load_work(struct work_struct *work) { struct cs35l41_hda *cs35l41 = container_of(work, struct cs35l41_hda, fw_load_work); + pm_runtime_get_sync(cs35l41->dev); + mutex_lock(&cs35l41->fw_mutex); /* Recheck if playback is ongoing, mutex will block playback during firmware loading */ if (cs35l41->playback_started) - dev_err(cs35l41->dev, "Cannot Load/Unload firmware during Playback\n"); + dev_err(cs35l41->dev, "Cannot Load/Unload firmware during Playback. 
Retrying...\n"); else cs35l41_load_firmware(cs35l41, cs35l41->request_fw_load); cs35l41->fw_request_ongoing = false; mutex_unlock(&cs35l41->fw_mutex); + + pm_runtime_mark_last_busy(cs35l41->dev); + pm_runtime_put_autosuspend(cs35l41->dev); } static int cs35l41_fw_load_ctl_put(struct snd_kcontrol *kcontrol, @@ -835,6 +944,8 @@ static int cs35l41_hda_bind(struct device *dev, struct device *master, void *mas pm_runtime_get_sync(dev); + mutex_lock(&cs35l41->fw_mutex); + comps->dev = dev; if (!cs35l41->acpi_subsystem_id) cs35l41->acpi_subsystem_id = kasprintf(GFP_KERNEL, "%.8x", @@ -847,10 +958,8 @@ static int cs35l41_hda_bind(struct device *dev, struct device *master, void *mas if (firmware_autostart) { dev_dbg(cs35l41->dev, "Firmware Autostart.\n"); cs35l41->request_fw_load = true; - mutex_lock(&cs35l41->fw_mutex); if (cs35l41_smart_amp(cs35l41) < 0) dev_warn(cs35l41->dev, "Cannot Run Firmware, reverting to dsp bypass...\n"); - mutex_unlock(&cs35l41->fw_mutex); } else { dev_dbg(cs35l41->dev, "Firmware Autostart is disabled.\n"); } @@ -859,6 +968,8 @@ static int cs35l41_hda_bind(struct device *dev, struct device *master, void *mas comps->playback_hook = cs35l41_hda_playback_hook; + mutex_unlock(&cs35l41->fw_mutex); + pm_runtime_mark_last_busy(dev); pm_runtime_put_autosuspend(dev); @@ -1426,6 +1537,7 @@ EXPORT_SYMBOL_NS_GPL(cs35l41_hda_remove, SND_HDA_SCODEC_CS35L41); const struct dev_pm_ops cs35l41_hda_pm_ops = { RUNTIME_PM_OPS(cs35l41_runtime_suspend, cs35l41_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(cs35l41_system_suspend, cs35l41_system_resume) }; EXPORT_SYMBOL_NS_GPL(cs35l41_hda_pm_ops, SND_HDA_SCODEC_CS35L41); -- GitLab From 7880672bdc975daa586e8256714d9906d30c615e Mon Sep 17 00:00:00 2001 From: Colin Ian King <colin.i.king@gmail.com> Date: Fri, 7 Oct 2022 21:35:00 +0100 Subject: [PATCH 1777/2223] xen: Kconfig: Fix spelling mistake "Maxmium" -> "Maximum" There is a spelling mistake in a Kconfig description. Fix it. 
Signed-off-by: Colin Ian King <colin.i.king@gmail.com> Acked-by: Stefano Stabellini <sstabellini@kernel.org> Link: https://lore.kernel.org/r/20221007203500.2756787-1-colin.i.king@gmail.com Signed-off-by: Juergen Gross <jgross@suse.com> --- drivers/xen/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index a65bd92121a5d..d5d7c402b6511 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -56,7 +56,7 @@ config XEN_MEMORY_HOTPLUG_LIMIT depends on XEN_HAVE_PVMMU depends on MEMORY_HOTPLUG help - Maxmium amount of memory (in GiB) that a PV guest can be + Maximum amount of memory (in GiB) that a PV guest can be expanded to when using memory hotplug. A PV guest can have more memory than this limit if is -- GitLab From 6c9f7434159b96231f5b27ab938f4766e3586b48 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven <geert+renesas@glider.be> Date: Tue, 4 Oct 2022 18:22:30 +0200 Subject: [PATCH 1778/2223] irqchip: IMX_MU_MSI should depend on ARCH_MXC The Freescale/NXP i.MX Messaging Unit is only present on Freescale/NXP i.MX SoCs. Hence add a dependency on ARCH_MXC, to prevent asking the user about this driver when configuring a kernel without Freescale/NXP i.MX SoC family support. While at it, expand "MU" to "Messaging Unit" in the help text. 
Fixes: 70afdab904d2d1e6 ("irqchip: Add IMX MU MSI controller driver") Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be> Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/7f3bd932614ddbff46a1b750ef45b231130364ad.1664900434.git.geert+renesas@glider.be --- drivers/irqchip/Kconfig | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 4d85a1870c43a..93d990133f9ad 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -484,14 +484,15 @@ config IMX_INTMUX config IMX_MU_MSI tristate "i.MX MU used as MSI controller" depends on OF && HAS_IOMEM + depends on ARCH_MXC || COMPILE_TEST default m if ARCH_MXC select IRQ_DOMAIN select IRQ_DOMAIN_HIERARCHY select GENERIC_MSI_IRQ_DOMAIN help - Provide a driver for the MU block used as a CPU-to-CPU MSI - controller. This requires a specially crafted DT to make use - of this driver. + Provide a driver for the i.MX Messaging Unit block used as a + CPU-to-CPU MSI controller. This requires a specially crafted DT + to make use of this driver. If unsure, say N -- GitLab From e25b091bed4946078c0998e4be77bc56824a9adf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de> Date: Sat, 8 Oct 2022 19:46:02 +0200 Subject: [PATCH 1779/2223] watchdog: Add tracing events for the most usual watchdog events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To simplify debugging which process touches a watchdog and when, add tracing events for .start(), .set_timeout(), .ping() and .stop(). 
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org> Reviewed-by: Guenter Roeck <linux@roeck-us.net> Link: https://lore.kernel.org/r/20221008174602.3972859-1-u.kleine-koenig@pengutronix.de Signed-off-by: Guenter Roeck <linux@roeck-us.net> Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org> --- MAINTAINERS | 1 + drivers/watchdog/watchdog_core.c | 4 ++ drivers/watchdog/watchdog_dev.c | 12 +++++- include/trace/events/watchdog.h | 66 ++++++++++++++++++++++++++++++++ 4 files changed, 81 insertions(+), 2 deletions(-) create mode 100644 include/trace/events/watchdog.h diff --git a/MAINTAINERS b/MAINTAINERS index 589517372408c..9751746559dc9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21864,6 +21864,7 @@ F: Documentation/watchdog/ F: drivers/watchdog/ F: include/linux/watchdog.h F: include/uapi/linux/watchdog.h +F: include/trace/events/watchdog.h WHISKEYCOVE PMIC GPIO DRIVER M: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com> diff --git a/drivers/watchdog/watchdog_core.c b/drivers/watchdog/watchdog_core.c index 3fe8a7edc252f..c777a612d932d 100644 --- a/drivers/watchdog/watchdog_core.c +++ b/drivers/watchdog/watchdog_core.c @@ -38,6 +38,9 @@ #include "watchdog_core.h" /* For watchdog_dev_register/... 
*/ +#define CREATE_TRACE_POINTS +#include <trace/events/watchdog.h> + static DEFINE_IDA(watchdog_ida); static int stop_on_reboot = -1; @@ -163,6 +166,7 @@ static int watchdog_reboot_notifier(struct notifier_block *nb, int ret; ret = wdd->ops->stop(wdd); + trace_watchdog_stop(wdd, ret); if (ret) return NOTIFY_BAD; } diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c index 744b2ab75288d..55574ed425042 100644 --- a/drivers/watchdog/watchdog_dev.c +++ b/drivers/watchdog/watchdog_dev.c @@ -47,6 +47,8 @@ #include "watchdog_core.h" #include "watchdog_pretimeout.h" +#include <trace/events/watchdog.h> + /* the dev_t structure to store the dynamically allocated watchdog devices */ static dev_t watchdog_devt; /* Reference to watchdog device behind /dev/watchdog */ @@ -157,10 +159,13 @@ static int __watchdog_ping(struct watchdog_device *wdd) wd_data->last_hw_keepalive = now; - if (wdd->ops->ping) + if (wdd->ops->ping) { err = wdd->ops->ping(wdd); /* ping the watchdog */ - else + trace_watchdog_ping(wdd, err); + } else { err = wdd->ops->start(wdd); /* restart watchdog */ + trace_watchdog_start(wdd, err); + } if (err == 0) watchdog_hrtimer_pretimeout_start(wdd); @@ -259,6 +264,7 @@ static int watchdog_start(struct watchdog_device *wdd) } } else { err = wdd->ops->start(wdd); + trace_watchdog_start(wdd, err); if (err == 0) { set_bit(WDOG_ACTIVE, &wdd->status); wd_data->last_keepalive = started_at; @@ -297,6 +303,7 @@ static int watchdog_stop(struct watchdog_device *wdd) if (wdd->ops->stop) { clear_bit(WDOG_HW_RUNNING, &wdd->status); err = wdd->ops->stop(wdd); + trace_watchdog_stop(wdd, err); } else { set_bit(WDOG_HW_RUNNING, &wdd->status); } @@ -369,6 +376,7 @@ static int watchdog_set_timeout(struct watchdog_device *wdd, if (wdd->ops->set_timeout) { err = wdd->ops->set_timeout(wdd, timeout); + trace_watchdog_set_timeout(wdd, timeout, err); } else { wdd->timeout = timeout; /* Disable pretimeout if it doesn't fit the new timeout */ diff --git 
a/include/trace/events/watchdog.h b/include/trace/events/watchdog.h new file mode 100644 index 0000000000000..beb9bb3424c8e --- /dev/null +++ b/include/trace/events/watchdog.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM watchdog + +#if !defined(_TRACE_WATCHDOG_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_WATCHDOG_H + +#include <linux/watchdog.h> +#include <linux/tracepoint.h> + +DECLARE_EVENT_CLASS(watchdog_template, + + TP_PROTO(struct watchdog_device *wdd, int err), + + TP_ARGS(wdd, err), + + TP_STRUCT__entry( + __field(int, id) + __field(int, err) + ), + + TP_fast_assign( + __entry->id = wdd->id; + __entry->err = err; + ), + + TP_printk("watchdog%d err=%d", __entry->id, __entry->err) +); + +DEFINE_EVENT(watchdog_template, watchdog_start, + TP_PROTO(struct watchdog_device *wdd, int err), + TP_ARGS(wdd, err)); + +DEFINE_EVENT(watchdog_template, watchdog_ping, + TP_PROTO(struct watchdog_device *wdd, int err), + TP_ARGS(wdd, err)); + +DEFINE_EVENT(watchdog_template, watchdog_stop, + TP_PROTO(struct watchdog_device *wdd, int err), + TP_ARGS(wdd, err)); + +TRACE_EVENT(watchdog_set_timeout, + + TP_PROTO(struct watchdog_device *wdd, unsigned int timeout, int err), + + TP_ARGS(wdd, timeout, err), + + TP_STRUCT__entry( + __field(int, id) + __field(unsigned int, timeout) + __field(int, err) + ), + + TP_fast_assign( + __entry->id = wdd->id; + __entry->timeout = timeout; + __entry->err = err; + ), + + TP_printk("watchdog%d timeout=%u err=%d", __entry->id, __entry->timeout, __entry->err) +); + +#endif /* !defined(_TRACE_WATCHDOG_H) || defined(TRACE_HEADER_MULTI_READ) */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> -- GitLab From b675d4bdfefac2fd46838383ecb3c06ad0f4c94d Mon Sep 17 00:00:00 2001 From: Yosry Ahmed <yosryahmed@google.com> Date: Tue, 11 Oct 2022 22:51:55 +0000 Subject: [PATCH 1780/2223] mm: cgroup: fix comments for get from fd/file helpers Fix the documentation 
comments for cgroup_[v1v2_]get_from_[fd/file](). Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cgroup/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6349a9fe9ec15..d922773fa90bc 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6246,6 +6246,7 @@ static struct cgroup *cgroup_v1v2_get_from_file(struct file *f) /** * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports * cgroup2. + * @f: file corresponding to cgroup2_dir */ static struct cgroup *cgroup_get_from_file(struct file *f) { @@ -6753,7 +6754,7 @@ out: EXPORT_SYMBOL_GPL(cgroup_get_from_path); /** - * cgroup_get_from_fd - get a cgroup pointer from a fd + * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd * @fd: fd obtained by open(cgroup_dir) * * Find the cgroup from a fd which should be obtained @@ -6778,6 +6779,7 @@ struct cgroup *cgroup_v1v2_get_from_fd(int fd) /** * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports * cgroup2. + * @fd: fd obtained by open(cgroup2_dir) */ struct cgroup *cgroup_get_from_fd(int fd) { -- GitLab From 7e777b1b012e977cfd04347fb347f3f5d097f99e Mon Sep 17 00:00:00 2001 From: Matthias Schiffer <matthias.schiffer@ew.tq-group.com> Date: Tue, 11 Oct 2022 09:50:02 +0200 Subject: [PATCH 1781/2223] net: ethernet: ti: am65-cpsw: set correct devlink flavour for unused ports am65_cpsw_nuss_register_ndevs() skips calling devlink_port_type_eth_set() for ports without assigned netdev, triggering the following warning when DEVLINK_PORT_TYPE_WARN_TIMEOUT elapses after 3600s: Type was not set for devlink port. 
WARNING: CPU: 0 PID: 129 at net/core/devlink.c:8095 devlink_port_type_warn+0x18/0x30 Fixes: 0680e20af5fb ("net: ethernet: ti: am65-cpsw: Fix devlink port register sequence") Signed-off-by: Matthias Schiffer <matthias.schiffer@ew.tq-group.com> Reviewed-by: Andrew Lunn <andrew@lunn.ch> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 3cbe4ec462344..7f86068f3ff63 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -2476,7 +2476,10 @@ static int am65_cpsw_nuss_register_devlink(struct am65_cpsw_common *common) port = am65_common_get_port(common, i); dl_port = &port->devlink_port; - attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; + if (port->ndev) + attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; + else + attrs.flavour = DEVLINK_PORT_FLAVOUR_UNUSED; attrs.phys.port_number = port->port_id; attrs.switch_id.id_len = sizeof(resource_size_t); memcpy(attrs.switch_id.id, common->switch_id, attrs.switch_id.id_len); -- GitLab From 87445f369cca2965620e79f87145d3d7fa35befd Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Tue, 11 Oct 2022 14:27:28 -0700 Subject: [PATCH 1782/2223] ipv6: ping: fix wrong checksum for large frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For a given ping datagram, ping_getfrag() is called once per skb fragment. A large datagram requiring more than one page fragment is currently getting the checksum of the last fragment, instead of the cumulative one. After this patch, "ping -s 35000 ::1" is working correctly. 
Fixes: 6d0bfe226116 ("net: ipv6: Add IPv6 support to the ping socket.") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Lorenzo Colitti <lorenzo@google.com> Cc: Maciej Żenczykowski <maze@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/ping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 517042caf6dc1..705672f319e16 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -639,7 +639,7 @@ int ping_getfrag(void *from, char *to, * wcheck, it will be finalized in ping_v4_push_pending_frames. */ if (pfh->family == AF_INET6) { - skb->csum = pfh->wcheck; + skb->csum = csum_block_add(skb->csum, pfh->wcheck, odd); skb->ip_summed = CHECKSUM_NONE; pfh->wcheck = 0; } -- GitLab From 0d24148bd276ead5708ef56a4725580555bb48a3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Tue, 11 Oct 2022 14:27:29 -0700 Subject: [PATCH 1783/2223] inet: ping: fix recent breakage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Blamed commit broke the assumption used by ping sendmsg() that allocated skb would have MAX_HEADER bytes in skb->head. This patch changes the way ping works, by making sure the skb head contains space for the icmp header, and adjusting ping_getfrag() which was desperate about going past the icmp header :/ This is adopting what UDP does, mostly. syzbot is able to crash a host using both kfence and following repro in a loop. 
fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_ICMPV6) connect(fd, {sa_family=AF_INET6, sin6_port=htons(0), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_scope_id=0}, 28 sendmsg(fd, {msg_name=NULL, msg_namelen=0, msg_iov=[ {iov_base="\200\0\0\0\23\0\0\0\0\0\0\0\0\0"..., iov_len=65496}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0 When kfence triggers, skb->head only has 64 bytes, immediately followed by struct skb_shared_info (no extra headroom based on ksize(ptr)) Then icmpv6_push_pending_frames() is overwriting first bytes of skb_shinfo(skb), making nr_frags bigger than MAX_SKB_FRAGS, and/or setting shinfo->gso_size to a non zero value. If nr_frags is mangled, a crash happens in skb_release_data() If gso_size is mangled, we have the following report: lo: caps=(0x00000516401d7c69, 0x00000516401d7c69) WARNING: CPU: 0 PID: 7548 at net/core/dev.c:3239 skb_warn_bad_offload+0x119/0x230 net/core/dev.c:3239 Modules linked in: CPU: 0 PID: 7548 Comm: syz-executor268 Not tainted 6.0.0-syzkaller-02754-g557f050166e5 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/22/2022 RIP: 0010:skb_warn_bad_offload+0x119/0x230 net/core/dev.c:3239 Code: 70 03 00 00 e8 58 c3 24 fa 4c 8d a5 e8 00 00 00 e8 4c c3 24 fa 4c 89 e9 4c 89 e2 4c 89 f6 48 c7 c7 00 53 f5 8a e8 13 ac e7 01 <0f> 0b 5b 5d 41 5c 41 5d 41 5e e9 28 c3 24 fa e8 23 c3 24 fa 48 89 RSP: 0018:ffffc9000366f3e8 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff88807a9d9d00 RCX: 0000000000000000 RDX: ffff8880780c0000 RSI: ffffffff8160f6f8 RDI: fffff520006cde6f RBP: ffff888079952000 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000000000400 R11: 0000000000000000 R12: ffff8880799520e8 R13: ffff88807a9da070 R14: ffff888079952000 R15: 0000000000000000 FS: 0000555556be6300(0000) GS:ffff8880b9a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020010000 CR3: 000000006eb7b000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 
0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> gso_features_check net/core/dev.c:3521 [inline] netif_skb_features+0x83e/0xb90 net/core/dev.c:3554 validate_xmit_skb+0x2b/0xf10 net/core/dev.c:3659 __dev_queue_xmit+0x998/0x3ad0 net/core/dev.c:4248 dev_queue_xmit include/linux/netdevice.h:3008 [inline] neigh_hh_output include/net/neighbour.h:530 [inline] neigh_output include/net/neighbour.h:544 [inline] ip6_finish_output2+0xf97/0x1520 net/ipv6/ip6_output.c:134 __ip6_finish_output net/ipv6/ip6_output.c:195 [inline] ip6_finish_output+0x690/0x1160 net/ipv6/ip6_output.c:206 NF_HOOK_COND include/linux/netfilter.h:291 [inline] ip6_output+0x1ed/0x540 net/ipv6/ip6_output.c:227 dst_output include/net/dst.h:445 [inline] ip6_local_out+0xaf/0x1a0 net/ipv6/output_core.c:161 ip6_send_skb+0xb7/0x340 net/ipv6/ip6_output.c:1966 ip6_push_pending_frames+0xdd/0x100 net/ipv6/ip6_output.c:1986 icmpv6_push_pending_frames+0x2af/0x490 net/ipv6/icmp.c:303 ping_v6_sendmsg+0xc44/0x1190 net/ipv6/ping.c:190 inet_sendmsg+0x99/0xe0 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:734 ____sys_sendmsg+0x712/0x8c0 net/socket.c:2482 ___sys_sendmsg+0x110/0x1b0 net/socket.c:2536 __sys_sendmsg+0xf3/0x1c0 net/socket.c:2565 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f21aab42b89 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 41 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fff1729d038 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f21aab42b89 RDX: 0000000000000000 RSI: 0000000020000180 RDI: 0000000000000003 RBP: 0000000000000000 R08: 000000000000000d R09: 000000000000000d R10: 000000000000000d R11: 
0000000000000246 R12: 00007fff1729d050 R13: 00000000000f4240 R14: 0000000000021dd1 R15: 00007fff1729d044 </TASK> Fixes: 47cf88993c91 ("net: unify alloclen calculation for paged requests") Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Pavel Begunkov <asml.silence@gmail.com> Cc: Lorenzo Colitti <lorenzo@google.com> Cc: Willem de Bruijn <willemb@google.com> Cc: Maciej Żenczykowski <maze@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/ping.c | 21 +++++---------------- net/ipv6/ping.c | 2 +- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 705672f319e16..bde333b24837a 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -617,21 +617,9 @@ int ping_getfrag(void *from, char *to, { struct pingfakehdr *pfh = from; - if (offset == 0) { - fraglen -= sizeof(struct icmphdr); - if (fraglen < 0) - BUG(); - if (!csum_and_copy_from_iter_full(to + sizeof(struct icmphdr), - fraglen, &pfh->wcheck, - &pfh->msg->msg_iter)) - return -EFAULT; - } else if (offset < sizeof(struct icmphdr)) { - BUG(); - } else { - if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck, - &pfh->msg->msg_iter)) - return -EFAULT; - } + if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck, + &pfh->msg->msg_iter)) + return -EFAULT; #if IS_ENABLED(CONFIG_IPV6) /* For IPv6, checksum each skb as we go along, as expected by @@ -842,7 +830,8 @@ back_from_confirm: pfh.family = AF_INET; err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len, - 0, &ipc, &rt, msg->msg_flags); + sizeof(struct icmphdr), &ipc, &rt, + msg->msg_flags); if (err) ip_flush_pending_frames(sk); else diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 5f2ef84937142..86c26e48d065a 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -179,7 +179,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); err = ip6_append_data(sk, ping_getfrag, &pfh, len, - 0, 
&ipc6, &fl6, rt, + sizeof(struct icmp6hdr), &ipc6, &fl6, rt, MSG_DONTWAIT); if (err) { -- GitLab From 72e560cb8c6f80fc2b4afc5d3634a32465e13a51 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Tue, 11 Oct 2022 15:07:48 -0700 Subject: [PATCH 1784/2223] tcp: cdg: allow tcp_cdg_release() to be called multiple times Apparently, mptcp is able to call tcp_disconnect() on an already disconnected flow. This is generally fine, unless current congestion control is CDG, because it might trigger a double-free [1] Instead of fixing MPTCP, and future bugs, we can make tcp_disconnect() more resilient. [1] BUG: KASAN: double-free in slab_free mm/slub.c:3539 [inline] BUG: KASAN: double-free in kfree+0xe2/0x580 mm/slub.c:4567 CPU: 0 PID: 3645 Comm: kworker/0:7 Not tainted 6.0.0-syzkaller-02734-g0326074ff465 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/22/2022 Workqueue: events mptcp_worker Call Trace: <TASK> __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:317 [inline] print_report.cold+0x2ba/0x719 mm/kasan/report.c:433 kasan_report_invalid_free+0x81/0x190 mm/kasan/report.c:462 ____kasan_slab_free+0x18b/0x1c0 mm/kasan/common.c:356 kasan_slab_free include/linux/kasan.h:200 [inline] slab_free_hook mm/slub.c:1759 [inline] slab_free_freelist_hook+0x8b/0x1c0 mm/slub.c:1785 slab_free mm/slub.c:3539 [inline] kfree+0xe2/0x580 mm/slub.c:4567 tcp_disconnect+0x980/0x1e20 net/ipv4/tcp.c:3145 __mptcp_close_ssk+0x5ca/0x7e0 net/mptcp/protocol.c:2327 mptcp_do_fastclose net/mptcp/protocol.c:2592 [inline] mptcp_worker+0x78c/0xff0 net/mptcp/protocol.c:2627 process_one_work+0x991/0x1610 kernel/workqueue.c:2289 worker_thread+0x665/0x1080 kernel/workqueue.c:2436 kthread+0x2e4/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 </TASK> Allocated by task 3671: kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38 kasan_set_track 
mm/kasan/common.c:45 [inline] set_alloc_info mm/kasan/common.c:437 [inline] ____kasan_kmalloc mm/kasan/common.c:516 [inline] ____kasan_kmalloc mm/kasan/common.c:475 [inline] __kasan_kmalloc+0xa9/0xd0 mm/kasan/common.c:525 kmalloc_array include/linux/slab.h:640 [inline] kcalloc include/linux/slab.h:671 [inline] tcp_cdg_init+0x10d/0x170 net/ipv4/tcp_cdg.c:380 tcp_init_congestion_control+0xab/0x550 net/ipv4/tcp_cong.c:193 tcp_reinit_congestion_control net/ipv4/tcp_cong.c:217 [inline] tcp_set_congestion_control+0x96c/0xaa0 net/ipv4/tcp_cong.c:391 do_tcp_setsockopt+0x505/0x2320 net/ipv4/tcp.c:3513 tcp_setsockopt+0xd4/0x100 net/ipv4/tcp.c:3801 mptcp_setsockopt+0x35f/0x2570 net/mptcp/sockopt.c:844 __sys_setsockopt+0x2d6/0x690 net/socket.c:2252 __do_sys_setsockopt net/socket.c:2263 [inline] __se_sys_setsockopt net/socket.c:2260 [inline] __x64_sys_setsockopt+0xba/0x150 net/socket.c:2260 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Freed by task 16: kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38 kasan_set_track+0x21/0x30 mm/kasan/common.c:45 kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:370 ____kasan_slab_free mm/kasan/common.c:367 [inline] ____kasan_slab_free+0x166/0x1c0 mm/kasan/common.c:329 kasan_slab_free include/linux/kasan.h:200 [inline] slab_free_hook mm/slub.c:1759 [inline] slab_free_freelist_hook+0x8b/0x1c0 mm/slub.c:1785 slab_free mm/slub.c:3539 [inline] kfree+0xe2/0x580 mm/slub.c:4567 tcp_cleanup_congestion_control+0x70/0x120 net/ipv4/tcp_cong.c:226 tcp_v4_destroy_sock+0xdd/0x750 net/ipv4/tcp_ipv4.c:2254 tcp_v6_destroy_sock+0x11/0x20 net/ipv6/tcp_ipv6.c:1969 inet_csk_destroy_sock+0x196/0x440 net/ipv4/inet_connection_sock.c:1157 tcp_done+0x23b/0x340 net/ipv4/tcp.c:4649 tcp_rcv_state_process+0x40e7/0x4990 net/ipv4/tcp_input.c:6624 tcp_v6_do_rcv+0x3fc/0x13c0 net/ipv6/tcp_ipv6.c:1525 tcp_v6_rcv+0x2e8e/0x3830 net/ipv6/tcp_ipv6.c:1759 
ip6_protocol_deliver_rcu+0x2db/0x1950 net/ipv6/ip6_input.c:439 ip6_input_finish+0x14c/0x2c0 net/ipv6/ip6_input.c:484 NF_HOOK include/linux/netfilter.h:302 [inline] NF_HOOK include/linux/netfilter.h:296 [inline] ip6_input+0x9c/0xd0 net/ipv6/ip6_input.c:493 dst_input include/net/dst.h:455 [inline] ip6_rcv_finish+0x193/0x2c0 net/ipv6/ip6_input.c:79 ip_sabotage_in net/bridge/br_netfilter_hooks.c:874 [inline] ip_sabotage_in+0x1fa/0x260 net/bridge/br_netfilter_hooks.c:865 nf_hook_entry_hookfn include/linux/netfilter.h:142 [inline] nf_hook_slow+0xc5/0x1f0 net/netfilter/core.c:614 nf_hook.constprop.0+0x3ac/0x650 include/linux/netfilter.h:257 NF_HOOK include/linux/netfilter.h:300 [inline] ipv6_rcv+0x9e/0x380 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core+0x114/0x180 net/core/dev.c:5485 __netif_receive_skb+0x1f/0x1c0 net/core/dev.c:5599 netif_receive_skb_internal net/core/dev.c:5685 [inline] netif_receive_skb+0x12f/0x8d0 net/core/dev.c:5744 NF_HOOK include/linux/netfilter.h:302 [inline] NF_HOOK include/linux/netfilter.h:296 [inline] br_pass_frame_up+0x303/0x410 net/bridge/br_input.c:68 br_handle_frame_finish+0x909/0x1aa0 net/bridge/br_input.c:199 br_nf_hook_thresh+0x2f8/0x3d0 net/bridge/br_netfilter_hooks.c:1041 br_nf_pre_routing_finish_ipv6+0x695/0xef0 net/bridge/br_netfilter_ipv6.c:207 NF_HOOK include/linux/netfilter.h:302 [inline] br_nf_pre_routing_ipv6+0x417/0x7c0 net/bridge/br_netfilter_ipv6.c:237 br_nf_pre_routing+0x1496/0x1fe0 net/bridge/br_netfilter_hooks.c:507 nf_hook_entry_hookfn include/linux/netfilter.h:142 [inline] nf_hook_bridge_pre net/bridge/br_input.c:255 [inline] br_handle_frame+0x9c9/0x12d0 net/bridge/br_input.c:399 __netif_receive_skb_core+0x9fe/0x38f0 net/core/dev.c:5379 __netif_receive_skb_one_core+0xae/0x180 net/core/dev.c:5483 __netif_receive_skb+0x1f/0x1c0 net/core/dev.c:5599 process_backlog+0x3a0/0x7c0 net/core/dev.c:5927 __napi_poll+0xb3/0x6d0 net/core/dev.c:6494 napi_poll net/core/dev.c:6561 [inline] net_rx_action+0x9c1/0xd90 
net/core/dev.c:6672 __do_softirq+0x1d0/0x9c8 kernel/softirq.c:571 Fixes: 2b0a8c9eee81 ("tcp: add CDG congestion control") Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/tcp_cdg.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index ddc7ba0554bdd..112f28f936934 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -375,6 +375,7 @@ static void tcp_cdg_init(struct sock *sk) struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); + ca->gradients = NULL; /* We silently fall back to window = 1 if allocation fails. */ if (window > 1) ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), @@ -388,6 +389,7 @@ static void tcp_cdg_release(struct sock *sk) struct cdg *ca = inet_csk_ca(sk); kfree(ca->gradients); + ca->gradients = NULL; } static struct tcp_congestion_ops tcp_cdg __read_mostly = { -- GitLab From 739cfa34518ef3a6789f5f77239073972a387359 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky <leonro@nvidia.com> Date: Tue, 11 Oct 2022 16:14:55 +0300 Subject: [PATCH 1785/2223] net/mlx5: Make ASO poll CQ usable in atomic context Poll CQ functions shouldn't sleep as they are called in atomic context. The following splat appears once the mlx5_aso_poll_cq() is used in such flow. 
BUG: scheduling while atomic: swapper/17/0/0x00000100 Modules linked in: sch_ingress openvswitch nsh mlx5_vdpa vringh vhost_iotlb vdpa mlx5_ib mlx5_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_uverbs ib_core fuse [last unloaded: mlx5_core] CPU: 17 PID: 0 Comm: swapper/17 Tainted: G W 6.0.0-rc2+ #13 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: <IRQ> dump_stack_lvl+0x34/0x44 __schedule_bug.cold+0x47/0x53 __schedule+0x4b6/0x670 ? hrtimer_start_range_ns+0x28d/0x360 schedule+0x50/0x90 schedule_hrtimeout_range_clock+0x98/0x120 ? __hrtimer_init+0xb0/0xb0 usleep_range_state+0x60/0x90 mlx5_aso_poll_cq+0xad/0x190 [mlx5_core] mlx5e_ipsec_aso_update_curlft+0x81/0xb0 [mlx5_core] xfrm_timer_handler+0x6b/0x360 ? xfrm_find_acq_byseq+0x50/0x50 __hrtimer_run_queues+0x139/0x290 hrtimer_run_softirq+0x7d/0xe0 __do_softirq+0xc7/0x272 irq_exit_rcu+0x87/0xb0 sysvec_apic_timer_interrupt+0x72/0x90 </IRQ> <TASK> asm_sysvec_apic_timer_interrupt+0x16/0x20 RIP: 0010:default_idle+0x18/0x20 Code: ae 7d ff ff cc cc cc cc cc cc cc cc cc cc cc cc cc cc 0f 1f 44 00 00 8b 05 b5 30 0d 01 85 c0 7e 07 0f 00 2d 0a e3 53 00 fb f4 <c3> 0f 1f 80 00 00 00 00 0f 1f 44 00 00 65 48 8b 04 25 80 ad 01 00 RSP: 0018:ffff888100883ee0 EFLAGS: 00000242 RAX: 0000000000000001 RBX: ffff888100849580 RCX: 4000000000000000 RDX: 0000000000000001 RSI: 0000000000000083 RDI: 000000000008863c RBP: 0000000000000011 R08: 00000064e6977fa9 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 default_idle_call+0x37/0xb0 do_idle+0x1cd/0x1e0 cpu_startup_entry+0x19/0x20 start_secondary+0xfe/0x120 secondary_startup_64_no_verify+0xcd/0xdb </TASK> softirq: huh, entered softirq 8 HRTIMER 
00000000a97c08cb with preempt_count 00000100, exited with 00000000? Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c | 8 +++++++- .../net/ethernet/mellanox/mlx5/core/en_accel/macsec.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c | 10 +--------- drivers/net/ethernet/mellanox/mlx5/core/lib/aso.h | 2 +- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c index a53e205f4a895..be74e14033283 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c @@ -115,6 +115,7 @@ mlx5e_tc_meter_modify(struct mlx5_core_dev *mdev, struct mlx5e_flow_meters *flow_meters; u8 cir_man, cir_exp, cbs_man, cbs_exp; struct mlx5_aso_wqe *aso_wqe; + unsigned long expires; struct mlx5_aso *aso; u64 rate, burst; u8 ds_cnt; @@ -187,7 +188,12 @@ mlx5e_tc_meter_modify(struct mlx5_core_dev *mdev, mlx5_aso_post_wqe(aso, true, &aso_wqe->ctrl); /* With newer FW, the wait for the first ASO WQE is more than 2us, put the wait 10ms. 
*/ - err = mlx5_aso_poll_cq(aso, true, 10); + expires = jiffies + msecs_to_jiffies(10); + do { + err = mlx5_aso_poll_cq(aso, true); + if (err) + usleep_range(2, 10); + } while (err && time_is_after_jiffies(expires)); mutex_unlock(&flow_meters->aso_lock); return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c index 5da746da898d4..41970067917bf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c @@ -1405,7 +1405,7 @@ static int macsec_aso_set_arm_event(struct mlx5_core_dev *mdev, struct mlx5e_mac MLX5_ACCESS_ASO_OPC_MOD_MACSEC); macsec_aso_build_ctrl(aso, &aso_wqe->aso_ctrl, in); mlx5_aso_post_wqe(maso, false, &aso_wqe->ctrl); - err = mlx5_aso_poll_cq(maso, false, 10); + err = mlx5_aso_poll_cq(maso, false); mutex_unlock(&aso->aso_lock); return err; @@ -1430,7 +1430,7 @@ static int macsec_aso_query(struct mlx5_core_dev *mdev, struct mlx5e_macsec *mac macsec_aso_build_wqe_ctrl_seg(aso, &aso_wqe->aso_ctrl, NULL); mlx5_aso_post_wqe(maso, false, &aso_wqe->ctrl); - err = mlx5_aso_poll_cq(maso, false, 10); + err = mlx5_aso_poll_cq(maso, false); if (err) goto err_out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c index 21e14507ff5c0..baa8092f335e3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c @@ -381,20 +381,12 @@ void mlx5_aso_post_wqe(struct mlx5_aso *aso, bool with_data, WRITE_ONCE(doorbell_cseg, NULL); } -int mlx5_aso_poll_cq(struct mlx5_aso *aso, bool with_data, u32 interval_ms) +int mlx5_aso_poll_cq(struct mlx5_aso *aso, bool with_data) { struct mlx5_aso_cq *cq = &aso->cq; struct mlx5_cqe64 *cqe; - unsigned long expires; cqe = mlx5_cqwq_get_cqe(&cq->wq); - - expires = jiffies + msecs_to_jiffies(interval_ms); - while (!cqe && time_is_after_jiffies(expires)) { - 
usleep_range(2, 10); - cqe = mlx5_cqwq_get_cqe(&cq->wq); - } - if (!cqe) return -ETIMEDOUT; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.h index d854e01d7fc57..2d40dcf9d42ed 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.h @@ -83,7 +83,7 @@ void mlx5_aso_build_wqe(struct mlx5_aso *aso, u8 ds_cnt, u32 obj_id, u32 opc_mode); void mlx5_aso_post_wqe(struct mlx5_aso *aso, bool with_data, struct mlx5_wqe_ctrl_seg *doorbell_cseg); -int mlx5_aso_poll_cq(struct mlx5_aso *aso, bool with_data, u32 interval_ms); +int mlx5_aso_poll_cq(struct mlx5_aso *aso, bool with_data); struct mlx5_aso *mlx5_aso_create(struct mlx5_core_dev *mdev, u32 pdn); void mlx5_aso_destroy(struct mlx5_aso *aso); -- GitLab From 4b2edd38282a42742d8f2039767fa4f1919330f0 Mon Sep 17 00:00:00 2001 From: Jianmin Lv <lvjianmin@loongson.cn> Date: Wed, 12 Oct 2022 16:36:08 +0800 Subject: [PATCH 1786/2223] LoongArch: Fix cpu name after CPU-hotplug Don't overwrite the SMBIOS-provided CPU name on coming back from CPU- hotplug (including S3/S4) if it is already initialized. 
Reviewed-by: WANG Xuerui <git@xen0n.name> Signed-off-by: Jianmin Lv <lvjianmin@loongson.cn> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/kernel/cpu-probe.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/kernel/cpu-probe.c b/arch/loongarch/kernel/cpu-probe.c index 529ab8f44ec6d..255a09876ef28 100644 --- a/arch/loongarch/kernel/cpu-probe.c +++ b/arch/loongarch/kernel/cpu-probe.c @@ -187,7 +187,9 @@ static inline void cpu_probe_loongson(struct cpuinfo_loongarch *c, unsigned int uint64_t *vendor = (void *)(&cpu_full_name[VENDOR_OFFSET]); uint64_t *cpuname = (void *)(&cpu_full_name[CPUNAME_OFFSET]); - __cpu_full_name[cpu] = cpu_full_name; + if (!__cpu_full_name[cpu]) + __cpu_full_name[cpu] = cpu_full_name; + *vendor = iocsr_read64(LOONGARCH_IOCSR_VENDOR); *cpuname = iocsr_read64(LOONGARCH_IOCSR_CPUNAME); -- GitLab From a522b7ad8e66fd9021d354844c1c6bd7893bde6f Mon Sep 17 00:00:00 2001 From: Tiezhu Yang <yangtiezhu@loongson.cn> Date: Wed, 12 Oct 2022 16:36:08 +0800 Subject: [PATCH 1787/2223] LoongArch: Do not create sysfs control file for io master CPUs Now io master CPUs are not hotpluggable on LoongArch, but in the current code only /sys/devices/system/cpu/cpu0/online is not created. Let us set the hotpluggable field of all the io master CPUs as 0, then prevent to create sysfs control file for all the io master CPUs which confuses some user space tools. This is similar with commit 9cce844abf07 ("MIPS: CPU#0 is not hotpluggable"). 
Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/include/asm/bootinfo.h | 5 +++++ arch/loongarch/kernel/smp.c | 5 ----- arch/loongarch/kernel/topology.c | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/loongarch/include/asm/bootinfo.h b/arch/loongarch/include/asm/bootinfo.h index 8e5881bc5ad19..ed0910e8b856b 100644 --- a/arch/loongarch/include/asm/bootinfo.h +++ b/arch/loongarch/include/asm/bootinfo.h @@ -40,4 +40,9 @@ extern unsigned long fw_arg0, fw_arg1, fw_arg2; extern struct loongson_board_info b_info; extern struct loongson_system_configuration loongson_sysconf; +static inline bool io_master(int cpu) +{ + return test_bit(cpu, &loongson_sysconf.cores_io_master); +} + #endif /* _ASM_BOOTINFO_H */ diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index b5fab308dcf25..781a4d4bdddc9 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -240,11 +240,6 @@ void loongson3_smp_finish(void) #ifdef CONFIG_HOTPLUG_CPU -static bool io_master(int cpu) -{ - return test_bit(cpu, &loongson_sysconf.cores_io_master); -} - int loongson3_cpu_disable(void) { unsigned long flags; diff --git a/arch/loongarch/kernel/topology.c b/arch/loongarch/kernel/topology.c index ab1a75c0b5a64..caa7cd8590788 100644 --- a/arch/loongarch/kernel/topology.c +++ b/arch/loongarch/kernel/topology.c @@ -5,6 +5,7 @@ #include <linux/node.h> #include <linux/nodemask.h> #include <linux/percpu.h> +#include <asm/bootinfo.h> static DEFINE_PER_CPU(struct cpu, cpu_devices); @@ -40,7 +41,7 @@ static int __init topology_init(void) for_each_present_cpu(i) { struct cpu *c = &per_cpu(cpu_devices, i); - c->hotpluggable = !!i; + c->hotpluggable = !io_master(i); ret = register_cpu(c, i); if (ret < 0) pr_warn("topology_init: register_cpu %d failed (%d)\n", i, ret); -- GitLab From 1299a129a9f927433ba792b242c1f287a96059e7 Mon Sep 17 00:00:00 2001 From: Huacai Chen 
<chenhuacai@loongson.cn> Date: Wed, 12 Oct 2022 16:36:08 +0800 Subject: [PATCH 1788/2223] LoongArch: Flush TLB earlier at initialization Move local_flush_tlb_all() earlier (just after setup_ptwalker() and before page allocation). This can avoid stale TLB entries misguiding the later page allocation. Without this patch the second kernel of kexec/kdump fails to boot SMP. BTW, move output_pgtable_bits_defines() into tlb_init() since it has nothing to do with tlb handler setup. Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/mm/tlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/mm/tlb.c b/arch/loongarch/mm/tlb.c index 9818ce11546bc..da3681f131c8d 100644 --- a/arch/loongarch/mm/tlb.c +++ b/arch/loongarch/mm/tlb.c @@ -258,7 +258,7 @@ extern long exception_handlers[VECSIZE * 128 / sizeof(long)]; void setup_tlb_handler(int cpu) { setup_ptwalker(); - output_pgtable_bits_defines(); + local_flush_tlb_all(); /* The tlb handlers are generated only once */ if (cpu == 0) { @@ -301,6 +301,7 @@ void tlb_init(int cpu) write_csr_pagesize(PS_DEFAULT_SIZE); write_csr_stlbpgsize(PS_DEFAULT_SIZE); write_csr_tlbrefill_pagesize(PS_DEFAULT_SIZE); + setup_tlb_handler(cpu); - local_flush_tlb_all(); + output_pgtable_bits_defines(); } -- GitLab From ddf502717da029c9f065ade7e9bce90a1890e7df Mon Sep 17 00:00:00 2001 From: Huacai Chen <chenhuacai@loongson.cn> Date: Wed, 12 Oct 2022 16:36:08 +0800 Subject: [PATCH 1789/2223] LoongArch: Mark __xchg() and __cmpxchg() as __always_inline Commit ac7c3e4ff401 ("compiler: enable CONFIG_OPTIMIZE_INLINING forcibly") allows compiler to uninline functions marked as 'inline'. In case of __xchg()/__cmpxchg() this would cause to reference BUILD_BUG(), which is an error case for catching bugs and will not happen for correct code, if __xchg()/__cmpxchg() is inlined. 
This bug can be produced with CONFIG_DEBUG_SECTION_MISMATCH enabled, and the solution is similar to below commits: 46f1619500d0225 ("MIPS: include: Mark __xchg as __always_inline"), 88356d09904bc60 ("MIPS: include: Mark __cmpxchg as __always_inline"). Acked-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/include/asm/cmpxchg.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/include/asm/cmpxchg.h b/arch/loongarch/include/asm/cmpxchg.h index ae19e33c77548..ecfa6cf79806e 100644 --- a/arch/loongarch/include/asm/cmpxchg.h +++ b/arch/loongarch/include/asm/cmpxchg.h @@ -61,8 +61,8 @@ static inline unsigned int __xchg_small(volatile void *ptr, unsigned int val, return (old32 & mask) >> shift; } -static inline unsigned long __xchg(volatile void *ptr, unsigned long x, - int size) +static __always_inline unsigned long +__xchg(volatile void *ptr, unsigned long x, int size) { switch (size) { case 1: @@ -159,8 +159,8 @@ static inline unsigned int __cmpxchg_small(volatile void *ptr, unsigned int old, return (old32 & mask) >> shift; } -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, - unsigned long new, unsigned int size) +static __always_inline unsigned long +__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, unsigned int size) { switch (size) { case 1: -- GitLab From 9550dfde5eb83558c9c21f664d9b622a26bacf7d Mon Sep 17 00:00:00 2001 From: Colin Ian King <colin.i.king@gmail.com> Date: Wed, 12 Oct 2022 16:36:08 +0800 Subject: [PATCH 1790/2223] LoongArch: Kconfig: Fix spelling mistake "delibrately" -> "deliberately" There is a spelling mistake in a commented section. Fix it. 
Signed-off-by: Colin Ian King <colin.i.king@gmail.com> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 80e13869e5b87..f65c39eb37253 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -158,7 +158,7 @@ config STACKTRACE_SUPPORT bool default y -# MACH_LOONGSON32 and MACH_LOONGSON64 are delibrately carried over from the +# MACH_LOONGSON32 and MACH_LOONGSON64 are deliberately carried over from the # MIPS Loongson code, to preserve Loongson-specific code paths in drivers that # are shared between architectures, and specifically expecting the symbols. config MACH_LOONGSON32 -- GitLab From 0d8dad7048611e5ba02ae8519539ce4b8b1482d3 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao <xry111@xry111.site> Date: Wed, 12 Oct 2022 16:36:08 +0800 Subject: [PATCH 1791/2223] LoongArch: Add Kconfig option AS_HAS_EXPLICIT_RELOCS GNU as >= 2.40 and GCC >= 13 will support using explicit relocation hints in the assembly code, instead of la.* macros. The usage of explicit relocation hints can improve code generation so it's enabled by default by GCC >= 13. Introduce a Kconfig option AS_HAS_EXPLICIT_RELOCS as the switch for "use explicit relocation hints or not". 
Tested-by: WANG Xuerui <git@xen0n.name> Signed-off-by: Xi Ruoyao <xry111@xry111.site> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index f65c39eb37253..9aeecc83b4807 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -195,6 +195,9 @@ config SCHED_OMIT_FRAME_POINTER bool default y +config AS_HAS_EXPLICIT_RELOCS + def_bool $(as-instr,x:pcalau12i \$t0$(comma)%pc_hi20(x)) + menu "Kernel type and options" source "kernel/Kconfig.hz" -- GitLab From 11cd8a648301af0ad6937ed9493519d1e93fd4c8 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao <xry111@xry111.site> Date: Wed, 12 Oct 2022 16:36:08 +0800 Subject: [PATCH 1792/2223] LoongArch: Adjust symbol addressing for AS_HAS_EXPLICIT_RELOCS If explicit relocation hints are used by the toolchain, -Wa,-mla-* options will be useless for the C code. So only use them for the !CONFIG_AS_HAS_EXPLICIT_RELOCS case. Replace "la" with "la.pcrel" in head.S to keep the semantic consistent with new and old toolchains for the low level startup code. For per-CPU variables, the "address" of the symbol is actually an offset from $r21. The value is near the loading address of main kernel image, but far from the loading address of modules. So we use model("extreme") attribute to tell the compiler that a PC-relative addressing with 32-bit offset is not sufficient for local per-CPU variables. The behavior with different assemblers and compilers is summarized in the following table: AS has CC has explicit relocs explicit relocs * Behavior ============================================================== No No Use la.* macros. No change from Linux 6.0. -------------------------------------------------------------- No Yes Disable explicit relocs. No change from Linux 6.0. -------------------------------------------------------------- Yes No Not supported.
-------------------------------------------------------------- Yes Yes Enable explicit relocs. No -Wa,-mla* options used. ============================================================== *: We assume CC must have model attribute if it has explicit relocs. Both features are added in GCC 13 development cycle, so any GCC release >= 13 should be OK. Using early GCC 13 development snapshots may produce modules with unsupported relocations. Link: https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=f09482a Link: https://gcc.gnu.org/r13-1834 Link: https://gcc.gnu.org/r13-2199 Tested-by: WANG Xuerui <git@xen0n.name> Signed-off-by: Xi Ruoyao <xry111@xry111.site> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/Makefile | 18 ++++++++++++++++++ arch/loongarch/include/asm/percpu.h | 9 +++++++++ arch/loongarch/kernel/head.S | 12 ++++++------ arch/loongarch/kernel/vmlinux.lds.S | 4 ++++ 4 files changed, 37 insertions(+), 6 deletions(-) diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index 84689c3ee3af4..42352f9058582 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -43,10 +43,28 @@ endif cflags-y += -G0 -pipe -msoft-float LDFLAGS_vmlinux += -G0 -static -n -nostdlib + +# When the assembler supports explicit relocation hint, we must use it. +# GCC may have -mexplicit-relocs off by default if it was built with an old +# assembler, so we force it via an option. +# +# When the assembler does not supports explicit relocation hint, we can't use +# it. Disable it if the compiler supports it. +# +# If you've seen "unknown reloc hint" message building the kernel and you are +# now wondering why "-mexplicit-relocs" is not wrapped with cc-option: the +# combination of a "new" assembler and "old" compiler is not supported. Either +# upgrade the compiler or downgrade the assembler. 
+ifdef CONFIG_AS_HAS_EXPLICIT_RELOCS +cflags-y += -mexplicit-relocs +KBUILD_CFLAGS_KERNEL += -mdirect-extern-access +else +cflags-y += $(call cc-option,-mno-explicit-relocs) KBUILD_AFLAGS_KERNEL += -Wa,-mla-global-with-pcrel KBUILD_CFLAGS_KERNEL += -Wa,-mla-global-with-pcrel KBUILD_AFLAGS_MODULE += -Wa,-mla-global-with-abs KBUILD_CFLAGS_MODULE += -fplt -Wa,-mla-global-with-abs,-mla-local-with-abs +endif cflags-y += -ffreestanding cflags-y += $(call cc-option, -mno-check-zero-division) diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h index 0bd6b0110198f..ad8d88494554a 100644 --- a/arch/loongarch/include/asm/percpu.h +++ b/arch/loongarch/include/asm/percpu.h @@ -8,6 +8,15 @@ #include <asm/cmpxchg.h> #include <asm/loongarch.h> +/* + * The "address" (in fact, offset from $r21) of a per-CPU variable is close to + * the loading address of main kernel image, but far from where the modules are + * loaded. Tell the compiler this fact when using explicit relocs. 
+ */ +#if defined(MODULE) && defined(CONFIG_AS_HAS_EXPLICIT_RELOCS) +#define PER_CPU_ATTRIBUTES __attribute__((model("extreme"))) +#endif + /* Use r21 for fast access */ register unsigned long __my_cpu_offset __asm__("$r21"); diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S index 7e57ae8741b1a..0c67c24ce0878 100644 --- a/arch/loongarch/kernel/head.S +++ b/arch/loongarch/kernel/head.S @@ -57,19 +57,19 @@ SYM_CODE_START(kernel_entry) # kernel entry point li.w t0, 0x00 # FPE=0, SXE=0, ASXE=0, BTE=0 csrwr t0, LOONGARCH_CSR_EUEN - la t0, __bss_start # clear .bss + la.pcrel t0, __bss_start # clear .bss st.d zero, t0, 0 - la t1, __bss_stop - LONGSIZE + la.pcrel t1, __bss_stop - LONGSIZE 1: addi.d t0, t0, LONGSIZE st.d zero, t0, 0 bne t0, t1, 1b - la t0, fw_arg0 + la.pcrel t0, fw_arg0 st.d a0, t0, 0 # firmware arguments - la t0, fw_arg1 + la.pcrel t0, fw_arg1 st.d a1, t0, 0 - la t0, fw_arg2 + la.pcrel t0, fw_arg2 st.d a2, t0, 0 /* KSave3 used for percpu base, initialized as 0 */ @@ -77,7 +77,7 @@ SYM_CODE_START(kernel_entry) # kernel entry point /* GPR21 used for percpu base (runtime), initialized as 0 */ move u0, zero - la tp, init_thread_union + la.pcrel tp, init_thread_union /* Set the SP after an empty pt_regs. */ PTR_LI sp, (_THREAD_SIZE - 32 - PT_SIZE) PTR_ADD sp, sp, tp diff --git a/arch/loongarch/kernel/vmlinux.lds.S b/arch/loongarch/kernel/vmlinux.lds.S index e5890bec2bf6b..b3309a5e695b2 100644 --- a/arch/loongarch/kernel/vmlinux.lds.S +++ b/arch/loongarch/kernel/vmlinux.lds.S @@ -55,6 +55,10 @@ SECTIONS EXCEPTION_TABLE(16) + .got : ALIGN(16) { *(.got) } + .plt : ALIGN(16) { *(.plt) } + .got.plt : ALIGN(16) { *(.got.plt) } + . 
= ALIGN(PECOFF_SEGMENT_ALIGN); __init_begin = .; __inittext_begin = .; -- GitLab From 0a75e5d1a1845db94b9c462e7b0ee755642febfe Mon Sep 17 00:00:00 2001 From: Xi Ruoyao <xry111@xry111.site> Date: Wed, 12 Oct 2022 16:36:14 +0800 Subject: [PATCH 1793/2223] LoongArch: Define ELF relocation types added in ABIv2.0 These relocation types are used by GNU binutils >= 2.40 and GCC >= 13. Add their definitions so we will be able to use them in later patches. Link: https://github.com/loongson/LoongArch-Documentation/pull/57 Tested-by: WANG Xuerui <git@xen0n.name> Signed-off-by: Xi Ruoyao <xry111@xry111.site> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/include/asm/elf.h | 37 ++++++++++++++++++++++++++++++++ arch/loongarch/kernel/module.c | 2 +- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/include/asm/elf.h b/arch/loongarch/include/asm/elf.h index 5f3ff4781fda8..7af0cebf28d73 100644 --- a/arch/loongarch/include/asm/elf.h +++ b/arch/loongarch/include/asm/elf.h @@ -74,6 +74,43 @@ #define R_LARCH_SUB64 56 #define R_LARCH_GNU_VTINHERIT 57 #define R_LARCH_GNU_VTENTRY 58 +#define R_LARCH_B16 64 +#define R_LARCH_B21 65 +#define R_LARCH_B26 66 +#define R_LARCH_ABS_HI20 67 +#define R_LARCH_ABS_LO12 68 +#define R_LARCH_ABS64_LO20 69 +#define R_LARCH_ABS64_HI12 70 +#define R_LARCH_PCALA_HI20 71 +#define R_LARCH_PCALA_LO12 72 +#define R_LARCH_PCALA64_LO20 73 +#define R_LARCH_PCALA64_HI12 74 +#define R_LARCH_GOT_PC_HI20 75 +#define R_LARCH_GOT_PC_LO12 76 +#define R_LARCH_GOT64_PC_LO20 77 +#define R_LARCH_GOT64_PC_HI12 78 +#define R_LARCH_GOT_HI20 79 +#define R_LARCH_GOT_LO12 80 +#define R_LARCH_GOT64_LO20 81 +#define R_LARCH_GOT64_HI12 82 +#define R_LARCH_TLS_LE_HI20 83 +#define R_LARCH_TLS_LE_LO12 84 +#define R_LARCH_TLS_LE64_LO20 85 +#define R_LARCH_TLS_LE64_HI12 86 +#define R_LARCH_TLS_IE_PC_HI20 87 +#define R_LARCH_TLS_IE_PC_LO12 88 +#define R_LARCH_TLS_IE64_PC_LO20 89 +#define R_LARCH_TLS_IE64_PC_HI12 90 +#define 
R_LARCH_TLS_IE_HI20 91 +#define R_LARCH_TLS_IE_LO12 92 +#define R_LARCH_TLS_IE64_LO20 93 +#define R_LARCH_TLS_IE64_HI12 94 +#define R_LARCH_TLS_LD_PC_HI20 95 +#define R_LARCH_TLS_LD_HI20 96 +#define R_LARCH_TLS_GD_PC_HI20 97 +#define R_LARCH_TLS_GD_HI20 98 +#define R_LARCH_32_PCREL 99 +#define R_LARCH_RELAX 100 #ifndef ELF_ARCH diff --git a/arch/loongarch/kernel/module.c b/arch/loongarch/kernel/module.c index 638427ff0d515..755d91ef8d856 100644 --- a/arch/loongarch/kernel/module.c +++ b/arch/loongarch/kernel/module.c @@ -296,7 +296,7 @@ typedef int (*reloc_rela_handler)(struct module *mod, u32 *location, Elf_Addr v, /* The handlers for known reloc types */ static reloc_rela_handler reloc_rela_handlers[] = { - [R_LARCH_NONE ... R_LARCH_SUB64] = apply_r_larch_error, + [R_LARCH_NONE ... R_LARCH_RELAX] = apply_r_larch_error, [R_LARCH_NONE] = apply_r_larch_none, [R_LARCH_32] = apply_r_larch_32, -- GitLab From 9bd1e38032fb72982d9efe11948037cfa01eaa50 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao <xry111@xry111.site> Date: Wed, 12 Oct 2022 16:36:14 +0800 Subject: [PATCH 1794/2223] LoongArch: Support PC-relative relocations in modules Binutils >= 2.40 uses R_LARCH_B26 instead of R_LARCH_SOP_PUSH_PLT_PCREL, and R_LARCH_PCALA* instead of R_LARCH_SOP_PUSH_PCREL. Handle R_LARCH_B26 and R_LARCH_PCALA* in the module loader. For R_LARCH_ B26, also create a PLT entry as needed. 
Tested-by: WANG Xuerui <git@xen0n.name> Signed-off-by: Xi Ruoyao <xry111@xry111.site> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/kernel/module-sections.c | 7 ++- arch/loongarch/kernel/module.c | 69 +++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/kernel/module-sections.c b/arch/loongarch/kernel/module-sections.c index 6d498288977d2..e25c2ccf26657 100644 --- a/arch/loongarch/kernel/module-sections.c +++ b/arch/loongarch/kernel/module-sections.c @@ -56,9 +56,14 @@ static void count_max_entries(Elf_Rela *relas, int num, unsigned int *plts) for (i = 0; i < num; i++) { type = ELF_R_TYPE(relas[i].r_info); - if (type == R_LARCH_SOP_PUSH_PLT_PCREL) { + switch (type) { + case R_LARCH_SOP_PUSH_PLT_PCREL: + case R_LARCH_B26: if (!duplicate_rela(relas, i)) (*plts)++; + break; + default: + break; /* Do nothing. */ } } } diff --git a/arch/loongarch/kernel/module.c b/arch/loongarch/kernel/module.c index 755d91ef8d856..543ab2dc7ba08 100644 --- a/arch/loongarch/kernel/module.c +++ b/arch/loongarch/kernel/module.c @@ -281,6 +281,73 @@ static int apply_r_larch_add_sub(struct module *mod, u32 *location, Elf_Addr v, } } +static int apply_r_larch_b26(struct module *mod, u32 *location, Elf_Addr v, + s64 *rela_stack, size_t *rela_stack_top, unsigned int type) +{ + ptrdiff_t offset = (void *)v - (void *)location; + union loongarch_instruction *insn = (union loongarch_instruction *)location; + + if (offset >= SZ_128M) + v = module_emit_plt_entry(mod, v); + + if (offset < -SZ_128M) + v = module_emit_plt_entry(mod, v); + + offset = (void *)v - (void *)location; + + if (offset & 3) { + pr_err("module %s: jump offset = 0x%llx unaligned! dangerous R_LARCH_B26 (%u) relocation\n", + mod->name, (long long)offset, type); + return -ENOEXEC; + } + + if (!signed_imm_check(offset, 28)) { + pr_err("module %s: jump offset = 0x%llx overflow! 
dangerous R_LARCH_B26 (%u) relocation\n", + mod->name, (long long)offset, type); + return -ENOEXEC; + } + + offset >>= 2; + insn->reg0i26_format.immediate_l = offset & 0xffff; + insn->reg0i26_format.immediate_h = (offset >> 16) & 0x3ff; + + return 0; +} + +static int apply_r_larch_pcala(struct module *mod, u32 *location, Elf_Addr v, + s64 *rela_stack, size_t *rela_stack_top, unsigned int type) +{ + union loongarch_instruction *insn = (union loongarch_instruction *)location; + /* Use s32 for a sign-extension deliberately. */ + s32 offset_hi20 = (void *)((v + 0x800) & ~0xfff) - + (void *)((Elf_Addr)location & ~0xfff); + Elf_Addr anchor = (((Elf_Addr)location) & ~0xfff) + offset_hi20; + ptrdiff_t offset_rem = (void *)v - (void *)anchor; + + switch (type) { + case R_LARCH_PCALA_LO12: + insn->reg2i12_format.immediate = v & 0xfff; + break; + case R_LARCH_PCALA_HI20: + v = offset_hi20 >> 12; + insn->reg1i20_format.immediate = v & 0xfffff; + break; + case R_LARCH_PCALA64_LO20: + v = offset_rem >> 32; + insn->reg1i20_format.immediate = v & 0xfffff; + break; + case R_LARCH_PCALA64_HI12: + v = offset_rem >> 52; + insn->reg2i12_format.immediate = v & 0xfff; + break; + default: + pr_err("%s: Unsupport relocation type %u\n", mod->name, type); + return -EINVAL; + } + + return 0; +} + /* * reloc_handlers_rela() - Apply a particular relocation to a module * @mod: the module to apply the reloc to @@ -310,6 +377,8 @@ static reloc_rela_handler reloc_rela_handlers[] = { [R_LARCH_SOP_SUB ... R_LARCH_SOP_IF_ELSE] = apply_r_larch_sop, [R_LARCH_SOP_POP_32_S_10_5 ... R_LARCH_SOP_POP_32_U] = apply_r_larch_sop_imm_field, [R_LARCH_ADD32 ... 
R_LARCH_SUB64] = apply_r_larch_add_sub, + [R_LARCH_B26] = apply_r_larch_b26, + [R_LARCH_PCALA_HI20...R_LARCH_PCALA64_HI12] = apply_r_larch_pcala, }; int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, -- GitLab From 59b3d4a9b0cc065a6a88446f8dd9b6d4659cc3df Mon Sep 17 00:00:00 2001 From: Xi Ruoyao <xry111@xry111.site> Date: Wed, 12 Oct 2022 16:36:14 +0800 Subject: [PATCH 1795/2223] LoongArch: Support R_LARCH_GOT_PC_{LO12,HI20} in modules GCC >= 13 and GNU assembler >= 2.40 use these relocations to address external symbols, so we need to add them. Let the module loader emit GOT entries for data symbols so we would be able to handle GOT relocations. The GOT entry is just the data's symbol address. In module.lds, emit a stub .got section for a section header entry. The actual content of the section entry will be filled at runtime by module_ frob_arch_sections(). Tested-by: WANG Xuerui <git@xen0n.name> Signed-off-by: Xi Ruoyao <xry111@xry111.site> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/include/asm/module.h | 27 ++++++++++++- arch/loongarch/include/asm/module.lds.h | 1 + arch/loongarch/kernel/module-sections.c | 54 ++++++++++++++++++++++--- arch/loongarch/kernel/module.c | 24 +++++++++++ 4 files changed, 99 insertions(+), 7 deletions(-) diff --git a/arch/loongarch/include/asm/module.h b/arch/loongarch/include/asm/module.h index 9f6718df18547..b29b19a46f427 100644 --- a/arch/loongarch/include/asm/module.h +++ b/arch/loongarch/include/asm/module.h @@ -17,10 +17,15 @@ struct mod_section { }; struct mod_arch_specific { + struct mod_section got; struct mod_section plt; struct mod_section plt_idx; }; +struct got_entry { + Elf_Addr symbol_addr; +}; + struct plt_entry { u32 inst_lu12iw; u32 inst_lu32id; @@ -29,10 +34,16 @@ struct plt_entry { }; struct plt_idx_entry { - unsigned long symbol_addr; + Elf_Addr symbol_addr; }; -Elf_Addr module_emit_plt_entry(struct module *mod, unsigned long val); +Elf_Addr module_emit_got_entry(struct module 
*mod, Elf_Addr val); +Elf_Addr module_emit_plt_entry(struct module *mod, Elf_Addr val); + +static inline struct got_entry emit_got_entry(Elf_Addr val) +{ + return (struct got_entry) { val }; +} static inline struct plt_entry emit_plt_entry(unsigned long val) { @@ -77,4 +88,16 @@ static inline struct plt_entry *get_plt_entry(unsigned long val, return plt + plt_idx; } +static inline struct got_entry *get_got_entry(Elf_Addr val, + const struct mod_section *sec) +{ + struct got_entry *got = (struct got_entry *)sec->shdr->sh_addr; + int i; + + for (i = 0; i < sec->num_entries; i++) + if (got[i].symbol_addr == val) + return &got[i]; + return NULL; +} + #endif /* _ASM_MODULE_H */ diff --git a/arch/loongarch/include/asm/module.lds.h b/arch/loongarch/include/asm/module.lds.h index 31c1c0db11a3a..a3d1bc0fcc72e 100644 --- a/arch/loongarch/include/asm/module.lds.h +++ b/arch/loongarch/include/asm/module.lds.h @@ -2,6 +2,7 @@ /* Copyright (C) 2020-2022 Loongson Technology Corporation Limited */ SECTIONS { . = ALIGN(4); + .got : { BYTE(0) } .plt : { BYTE(0) } .plt.idx : { BYTE(0) } } diff --git a/arch/loongarch/kernel/module-sections.c b/arch/loongarch/kernel/module-sections.c index e25c2ccf26657..d296a70b758fd 100644 --- a/arch/loongarch/kernel/module-sections.c +++ b/arch/loongarch/kernel/module-sections.c @@ -7,7 +7,33 @@ #include <linux/kernel.h> #include <linux/module.h> -Elf_Addr module_emit_plt_entry(struct module *mod, unsigned long val) +Elf_Addr module_emit_got_entry(struct module *mod, Elf_Addr val) +{ + struct mod_section *got_sec = &mod->arch.got; + int i = got_sec->num_entries; + struct got_entry *got = get_got_entry(val, got_sec); + + if (got) + return (Elf_Addr)got; + + /* There is no GOT entry for val yet, create a new one. 
*/ + got = (struct got_entry *)got_sec->shdr->sh_addr; + got[i] = emit_got_entry(val); + + got_sec->num_entries++; + if (got_sec->num_entries > got_sec->max_entries) { + /* + * This may happen when the module contains a GOT_HI20 without + * a paired GOT_LO12. Such a module is broken, reject it. + */ + pr_err("%s: module contains bad GOT relocation\n", mod->name); + return 0; + } + + return (Elf_Addr)&got[i]; +} + +Elf_Addr module_emit_plt_entry(struct module *mod, Elf_Addr val) { int nr; struct mod_section *plt_sec = &mod->arch.plt; @@ -50,7 +76,8 @@ static bool duplicate_rela(const Elf_Rela *rela, int idx) return false; } -static void count_max_entries(Elf_Rela *relas, int num, unsigned int *plts) +static void count_max_entries(Elf_Rela *relas, int num, + unsigned int *plts, unsigned int *gots) { unsigned int i, type; @@ -62,6 +89,10 @@ static void count_max_entries(Elf_Rela *relas, int num, unsigned int *plts) if (!duplicate_rela(relas, i)) (*plts)++; break; + case R_LARCH_GOT_PC_HI20: + if (!duplicate_rela(relas, i)) + (*gots)++; + break; default: break; /* Do nothing. */ } @@ -71,18 +102,24 @@ static void count_max_entries(Elf_Rela *relas, int num, unsigned int *plts) int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { - unsigned int i, num_plts = 0; + unsigned int i, num_plts = 0, num_gots = 0; /* * Find the empty .plt sections. 
*/ for (i = 0; i < ehdr->e_shnum; i++) { - if (!strcmp(secstrings + sechdrs[i].sh_name, ".plt")) + if (!strcmp(secstrings + sechdrs[i].sh_name, ".got")) + mod->arch.got.shdr = sechdrs + i; + else if (!strcmp(secstrings + sechdrs[i].sh_name, ".plt")) mod->arch.plt.shdr = sechdrs + i; else if (!strcmp(secstrings + sechdrs[i].sh_name, ".plt.idx")) mod->arch.plt_idx.shdr = sechdrs + i; } + if (!mod->arch.got.shdr) { + pr_err("%s: module GOT section(s) missing\n", mod->name); + return -ENOEXEC; + } if (!mod->arch.plt.shdr) { pr_err("%s: module PLT section(s) missing\n", mod->name); return -ENOEXEC; @@ -105,9 +142,16 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, if (!(dst_sec->sh_flags & SHF_EXECINSTR)) continue; - count_max_entries(relas, num_rela, &num_plts); + count_max_entries(relas, num_rela, &num_plts, &num_gots); } + mod->arch.got.shdr->sh_type = SHT_NOBITS; + mod->arch.got.shdr->sh_flags = SHF_ALLOC; + mod->arch.got.shdr->sh_addralign = L1_CACHE_BYTES; + mod->arch.got.shdr->sh_size = (num_gots + 1) * sizeof(struct got_entry); + mod->arch.got.num_entries = 0; + mod->arch.got.max_entries = num_gots; + mod->arch.plt.shdr->sh_type = SHT_NOBITS; mod->arch.plt.shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC; mod->arch.plt.shdr->sh_addralign = L1_CACHE_BYTES; diff --git a/arch/loongarch/kernel/module.c b/arch/loongarch/kernel/module.c index 543ab2dc7ba08..bee7457db8043 100644 --- a/arch/loongarch/kernel/module.c +++ b/arch/loongarch/kernel/module.c @@ -348,6 +348,29 @@ static int apply_r_larch_pcala(struct module *mod, u32 *location, Elf_Addr v, return 0; } +static int apply_r_larch_got_pc(struct module *mod, u32 *location, Elf_Addr v, + s64 *rela_stack, size_t *rela_stack_top, unsigned int type) +{ + Elf_Addr got = module_emit_got_entry(mod, v); + + if (!got) + return -EINVAL; + + switch (type) { + case R_LARCH_GOT_PC_LO12: + type = R_LARCH_PCALA_LO12; + break; + case R_LARCH_GOT_PC_HI20: + type = R_LARCH_PCALA_HI20; + break; + default: + pr_err("%s: 
Unsupport relocation type %u\n", mod->name, type); + return -EINVAL; + } + + return apply_r_larch_pcala(mod, location, got, rela_stack, rela_stack_top, type); +} + /* * reloc_handlers_rela() - Apply a particular relocation to a module * @mod: the module to apply the reloc to @@ -379,6 +402,7 @@ static reloc_rela_handler reloc_rela_handlers[] = { [R_LARCH_ADD32 ... R_LARCH_SUB64] = apply_r_larch_add_sub, [R_LARCH_B26] = apply_r_larch_b26, [R_LARCH_PCALA_HI20...R_LARCH_PCALA64_HI12] = apply_r_larch_pcala, + [R_LARCH_GOT_PC_HI20...R_LARCH_GOT_PC_LO12] = apply_r_larch_got_pc, }; int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, -- GitLab From a2a84e36331af3b000ad12b552c5485b8282b366 Mon Sep 17 00:00:00 2001 From: Rui Wang <wangrui@loongson.cn> Date: Wed, 12 Oct 2022 16:36:14 +0800 Subject: [PATCH 1796/2223] LoongArch: mm: Refactor TLB exception handlers This patch simplifies TLB load, store and modify exception handlers: 1. Reduce instructions, such as alu/csr and memory access; 2. Execute tlb search instruction only in the fast path; 3. Return directly from the fast path for both normal and huge pages; 4. Re-tab the assembly for better vertical alignment. And fixes the concurrent modification issue of fast path for huge pages. This issue will occur in the following steps: CPU-1 (In TLB exception) CPU-2 (In THP splitting) 1: Load PMD entry (HUGE=1) 2: Goto huge path 3: Store PMD entry (HUGE=0) 4: Reload PMD entry (HUGE=0) 5: Fill TLB entry (PA is incorrect) This patch also slightly improves the TLB processing performance: * Normal pages: 2.15%, Huge pages: 1.70%. 
#include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <sys/mman.h> int main(int argc, char *argv[]) { size_t page_size; size_t mem_size; size_t off; void *base; int flags; int i; if (argc < 2) { fprintf(stderr, "%s MEM_SIZE [HUGE]\n", argv[0]); return -1; } page_size = sysconf(_SC_PAGESIZE); flags = MAP_PRIVATE | MAP_ANONYMOUS; mem_size = strtoul(argv[1], NULL, 10); if (argc > 2) flags |= MAP_HUGETLB; for (i = 0; i < 10; i++) { base = mmap(NULL, mem_size, PROT_READ, flags, -1, 0); if (base == MAP_FAILED) { fprintf(stderr, "Map memory failed!\n"); return -1; } for (off = 0; off < mem_size; off += page_size) *(volatile int *)(base + off); munmap(base, mem_size); } return 0; } Signed-off-by: Rui Wang <wangrui@loongson.cn> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/mm/tlbex.S | 537 ++++++++++++++++++-------------------- 1 file changed, 247 insertions(+), 290 deletions(-) diff --git a/arch/loongarch/mm/tlbex.S b/arch/loongarch/mm/tlbex.S index 39743337999e9..d8ee8fbc8c673 100644 --- a/arch/loongarch/mm/tlbex.S +++ b/arch/loongarch/mm/tlbex.S @@ -10,15 +10,20 @@ #include <asm/regdef.h> #include <asm/stackframe.h> +#define PTRS_PER_PGD_BITS (PAGE_SHIFT - 3) +#define PTRS_PER_PUD_BITS (PAGE_SHIFT - 3) +#define PTRS_PER_PMD_BITS (PAGE_SHIFT - 3) +#define PTRS_PER_PTE_BITS (PAGE_SHIFT - 3) + .macro tlb_do_page_fault, write SYM_FUNC_START(tlb_do_page_fault_\write) SAVE_ALL - csrrd a2, LOONGARCH_CSR_BADV - move a0, sp - REG_S a2, sp, PT_BVADDR - li.w a1, \write - la.abs t0, do_page_fault - jirl ra, t0, 0 + csrrd a2, LOONGARCH_CSR_BADV + move a0, sp + REG_S a2, sp, PT_BVADDR + li.w a1, \write + la.abs t0, do_page_fault + jirl ra, t0, 0 RESTORE_ALL_AND_RET SYM_FUNC_END(tlb_do_page_fault_\write) .endm @@ -29,133 +34,115 @@ SYM_FUNC_START(handle_tlb_protect) BACKUP_T0T1 SAVE_ALL - move a0, sp - move a1, zero - csrrd a2, LOONGARCH_CSR_BADV - REG_S a2, sp, PT_BVADDR - la.abs t0, do_page_fault - jirl ra, t0, 0 + move a0, sp + move a1, zero + 
csrrd a2, LOONGARCH_CSR_BADV + REG_S a2, sp, PT_BVADDR + la.abs t0, do_page_fault + jirl ra, t0, 0 RESTORE_ALL_AND_RET SYM_FUNC_END(handle_tlb_protect) SYM_FUNC_START(handle_tlb_load) - csrwr t0, EXCEPTION_KS0 - csrwr t1, EXCEPTION_KS1 - csrwr ra, EXCEPTION_KS2 + csrwr t0, EXCEPTION_KS0 + csrwr t1, EXCEPTION_KS1 + csrwr ra, EXCEPTION_KS2 /* * The vmalloc handling is not in the hotpath. */ - csrrd t0, LOONGARCH_CSR_BADV - bltz t0, vmalloc_load - csrrd t1, LOONGARCH_CSR_PGDL + csrrd t0, LOONGARCH_CSR_BADV + bltz t0, vmalloc_load + csrrd t1, LOONGARCH_CSR_PGDL vmalloc_done_load: /* Get PGD offset in bytes */ - srli.d t0, t0, PGDIR_SHIFT - andi t0, t0, (PTRS_PER_PGD - 1) - slli.d t0, t0, 3 - add.d t1, t1, t0 + bstrpick.d ra, t0, PTRS_PER_PGD_BITS + PGDIR_SHIFT - 1, PGDIR_SHIFT + alsl.d t1, ra, t1, 3 #if CONFIG_PGTABLE_LEVELS > 3 - csrrd t0, LOONGARCH_CSR_BADV - ld.d t1, t1, 0 - srli.d t0, t0, PUD_SHIFT - andi t0, t0, (PTRS_PER_PUD - 1) - slli.d t0, t0, 3 - add.d t1, t1, t0 + ld.d t1, t1, 0 + bstrpick.d ra, t0, PTRS_PER_PUD_BITS + PUD_SHIFT - 1, PUD_SHIFT + alsl.d t1, ra, t1, 3 #endif #if CONFIG_PGTABLE_LEVELS > 2 - csrrd t0, LOONGARCH_CSR_BADV - ld.d t1, t1, 0 - srli.d t0, t0, PMD_SHIFT - andi t0, t0, (PTRS_PER_PMD - 1) - slli.d t0, t0, 3 - add.d t1, t1, t0 + ld.d t1, t1, 0 + bstrpick.d ra, t0, PTRS_PER_PMD_BITS + PMD_SHIFT - 1, PMD_SHIFT + alsl.d t1, ra, t1, 3 #endif - ld.d ra, t1, 0 + ld.d ra, t1, 0 /* * For huge tlb entries, pmde doesn't contain an address but * instead contains the tlb pte. Check the PAGE_HUGE bit and * see if we need to jump to huge tlb processing. 
*/ - andi t0, ra, _PAGE_HUGE - bnez t0, tlb_huge_update_load + rotri.d ra, ra, _PAGE_HUGE_SHIFT + 1 + bltz ra, tlb_huge_update_load - csrrd t0, LOONGARCH_CSR_BADV - srli.d t0, t0, PAGE_SHIFT - andi t0, t0, (PTRS_PER_PTE - 1) - slli.d t0, t0, _PTE_T_LOG2 - add.d t1, ra, t0 + rotri.d ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1) + bstrpick.d t0, t0, PTRS_PER_PTE_BITS + PAGE_SHIFT - 1, PAGE_SHIFT + alsl.d t1, t0, ra, _PTE_T_LOG2 #ifdef CONFIG_SMP smp_pgtable_change_load: -#endif -#ifdef CONFIG_SMP - ll.d t0, t1, 0 + ll.d t0, t1, 0 #else - ld.d t0, t1, 0 + ld.d t0, t1, 0 #endif - tlbsrch - - srli.d ra, t0, _PAGE_PRESENT_SHIFT - andi ra, ra, 1 - beqz ra, nopage_tlb_load + andi ra, t0, _PAGE_PRESENT + beqz ra, nopage_tlb_load - ori t0, t0, _PAGE_VALID + ori t0, t0, _PAGE_VALID #ifdef CONFIG_SMP - sc.d t0, t1, 0 - beqz t0, smp_pgtable_change_load + sc.d t0, t1, 0 + beqz t0, smp_pgtable_change_load #else - st.d t0, t1, 0 + st.d t0, t1, 0 #endif - ori t1, t1, 8 - xori t1, t1, 8 - ld.d t0, t1, 0 - ld.d t1, t1, 8 - csrwr t0, LOONGARCH_CSR_TLBELO0 - csrwr t1, LOONGARCH_CSR_TLBELO1 + tlbsrch + bstrins.d t1, zero, 3, 3 + ld.d t0, t1, 0 + ld.d t1, t1, 8 + csrwr t0, LOONGARCH_CSR_TLBELO0 + csrwr t1, LOONGARCH_CSR_TLBELO1 tlbwr -leave_load: - csrrd t0, EXCEPTION_KS0 - csrrd t1, EXCEPTION_KS1 - csrrd ra, EXCEPTION_KS2 + + csrrd t0, EXCEPTION_KS0 + csrrd t1, EXCEPTION_KS1 + csrrd ra, EXCEPTION_KS2 ertn + #ifdef CONFIG_64BIT vmalloc_load: - la.abs t1, swapper_pg_dir - b vmalloc_done_load + la.abs t1, swapper_pg_dir + b vmalloc_done_load #endif - /* - * This is the entry point when build_tlbchange_handler_head - * spots a huge page. - */ + /* This is the entry point of a huge page. 
*/ tlb_huge_update_load: #ifdef CONFIG_SMP - ll.d t0, t1, 0 -#else - ld.d t0, t1, 0 + ll.d ra, t1, 0 #endif - srli.d ra, t0, _PAGE_PRESENT_SHIFT - andi ra, ra, 1 - beqz ra, nopage_tlb_load - tlbsrch + andi t0, ra, _PAGE_PRESENT + beqz t0, nopage_tlb_load - ori t0, t0, _PAGE_VALID #ifdef CONFIG_SMP - sc.d t0, t1, 0 - beqz t0, tlb_huge_update_load - ld.d t0, t1, 0 + ori t0, ra, _PAGE_VALID + sc.d t0, t1, 0 + beqz t0, tlb_huge_update_load + ori t0, ra, _PAGE_VALID #else - st.d t0, t1, 0 + rotri.d ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1) + ori t0, ra, _PAGE_VALID + st.d t0, t1, 0 #endif + tlbsrch addu16i.d t1, zero, -(CSR_TLBIDX_EHINV >> 16) addi.d ra, t1, 0 csrxchg ra, t1, LOONGARCH_CSR_TLBIDX tlbwr - csrxchg zero, t1, LOONGARCH_CSR_TLBIDX + csrxchg zero, t1, LOONGARCH_CSR_TLBIDX /* * A huge PTE describes an area the size of the @@ -167,21 +154,20 @@ tlb_huge_update_load: * address space. */ /* Huge page: Move Global bit */ - xori t0, t0, _PAGE_HUGE - lu12i.w t1, _PAGE_HGLOBAL >> 12 - and t1, t0, t1 - srli.d t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT) - or t0, t0, t1 + xori t0, t0, _PAGE_HUGE + lu12i.w t1, _PAGE_HGLOBAL >> 12 + and t1, t0, t1 + srli.d t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT) + or t0, t0, t1 - addi.d ra, t0, 0 - csrwr t0, LOONGARCH_CSR_TLBELO0 - addi.d t0, ra, 0 + move ra, t0 + csrwr ra, LOONGARCH_CSR_TLBELO0 /* Convert to entrylo1 */ - addi.d t1, zero, 1 - slli.d t1, t1, (HPAGE_SHIFT - 1) - add.d t0, t0, t1 - csrwr t0, LOONGARCH_CSR_TLBELO1 + addi.d t1, zero, 1 + slli.d t1, t1, (HPAGE_SHIFT - 1) + add.d t0, t0, t1 + csrwr t0, LOONGARCH_CSR_TLBELO1 /* Set huge page tlb entry size */ addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16) @@ -194,136 +180,120 @@ tlb_huge_update_load: addu16i.d t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16)) csrxchg t1, t0, LOONGARCH_CSR_TLBIDX + csrrd t0, EXCEPTION_KS0 + csrrd t1, EXCEPTION_KS1 + csrrd ra, EXCEPTION_KS2 + ertn + nopage_tlb_load: - dbar 0 - csrrd ra, EXCEPTION_KS2 - la.abs t0, tlb_do_page_fault_0 
- jr t0 + dbar 0 + csrrd ra, EXCEPTION_KS2 + la.abs t0, tlb_do_page_fault_0 + jr t0 SYM_FUNC_END(handle_tlb_load) SYM_FUNC_START(handle_tlb_store) - csrwr t0, EXCEPTION_KS0 - csrwr t1, EXCEPTION_KS1 - csrwr ra, EXCEPTION_KS2 + csrwr t0, EXCEPTION_KS0 + csrwr t1, EXCEPTION_KS1 + csrwr ra, EXCEPTION_KS2 /* * The vmalloc handling is not in the hotpath. */ - csrrd t0, LOONGARCH_CSR_BADV - bltz t0, vmalloc_store - csrrd t1, LOONGARCH_CSR_PGDL + csrrd t0, LOONGARCH_CSR_BADV + bltz t0, vmalloc_store + csrrd t1, LOONGARCH_CSR_PGDL vmalloc_done_store: /* Get PGD offset in bytes */ - srli.d t0, t0, PGDIR_SHIFT - andi t0, t0, (PTRS_PER_PGD - 1) - slli.d t0, t0, 3 - add.d t1, t1, t0 - + bstrpick.d ra, t0, PTRS_PER_PGD_BITS + PGDIR_SHIFT - 1, PGDIR_SHIFT + alsl.d t1, ra, t1, 3 #if CONFIG_PGTABLE_LEVELS > 3 - csrrd t0, LOONGARCH_CSR_BADV - ld.d t1, t1, 0 - srli.d t0, t0, PUD_SHIFT - andi t0, t0, (PTRS_PER_PUD - 1) - slli.d t0, t0, 3 - add.d t1, t1, t0 + ld.d t1, t1, 0 + bstrpick.d ra, t0, PTRS_PER_PUD_BITS + PUD_SHIFT - 1, PUD_SHIFT + alsl.d t1, ra, t1, 3 #endif #if CONFIG_PGTABLE_LEVELS > 2 - csrrd t0, LOONGARCH_CSR_BADV - ld.d t1, t1, 0 - srli.d t0, t0, PMD_SHIFT - andi t0, t0, (PTRS_PER_PMD - 1) - slli.d t0, t0, 3 - add.d t1, t1, t0 + ld.d t1, t1, 0 + bstrpick.d ra, t0, PTRS_PER_PMD_BITS + PMD_SHIFT - 1, PMD_SHIFT + alsl.d t1, ra, t1, 3 #endif - ld.d ra, t1, 0 + ld.d ra, t1, 0 /* * For huge tlb entries, pmde doesn't contain an address but * instead contains the tlb pte. Check the PAGE_HUGE bit and * see if we need to jump to huge tlb processing. 
*/ - andi t0, ra, _PAGE_HUGE - bnez t0, tlb_huge_update_store + rotri.d ra, ra, _PAGE_HUGE_SHIFT + 1 + bltz ra, tlb_huge_update_store - csrrd t0, LOONGARCH_CSR_BADV - srli.d t0, t0, PAGE_SHIFT - andi t0, t0, (PTRS_PER_PTE - 1) - slli.d t0, t0, _PTE_T_LOG2 - add.d t1, ra, t0 + rotri.d ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1) + bstrpick.d t0, t0, PTRS_PER_PTE_BITS + PAGE_SHIFT - 1, PAGE_SHIFT + alsl.d t1, t0, ra, _PTE_T_LOG2 #ifdef CONFIG_SMP smp_pgtable_change_store: -#endif -#ifdef CONFIG_SMP - ll.d t0, t1, 0 + ll.d t0, t1, 0 #else - ld.d t0, t1, 0 + ld.d t0, t1, 0 #endif - tlbsrch - - srli.d ra, t0, _PAGE_PRESENT_SHIFT - andi ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT) - xori ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT) - bnez ra, nopage_tlb_store + andi ra, t0, _PAGE_PRESENT | _PAGE_WRITE + xori ra, ra, _PAGE_PRESENT | _PAGE_WRITE + bnez ra, nopage_tlb_store - ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED) + ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED) #ifdef CONFIG_SMP - sc.d t0, t1, 0 - beqz t0, smp_pgtable_change_store + sc.d t0, t1, 0 + beqz t0, smp_pgtable_change_store #else - st.d t0, t1, 0 + st.d t0, t1, 0 #endif - - ori t1, t1, 8 - xori t1, t1, 8 - ld.d t0, t1, 0 - ld.d t1, t1, 8 - csrwr t0, LOONGARCH_CSR_TLBELO0 - csrwr t1, LOONGARCH_CSR_TLBELO1 + tlbsrch + bstrins.d t1, zero, 3, 3 + ld.d t0, t1, 0 + ld.d t1, t1, 8 + csrwr t0, LOONGARCH_CSR_TLBELO0 + csrwr t1, LOONGARCH_CSR_TLBELO1 tlbwr -leave_store: - csrrd t0, EXCEPTION_KS0 - csrrd t1, EXCEPTION_KS1 - csrrd ra, EXCEPTION_KS2 + + csrrd t0, EXCEPTION_KS0 + csrrd t1, EXCEPTION_KS1 + csrrd ra, EXCEPTION_KS2 ertn + #ifdef CONFIG_64BIT vmalloc_store: - la.abs t1, swapper_pg_dir - b vmalloc_done_store + la.abs t1, swapper_pg_dir + b vmalloc_done_store #endif - /* - * This is the entry point when build_tlbchange_handler_head - * spots a huge page. - */ + /* This is the entry point of a huge page. 
*/ tlb_huge_update_store: #ifdef CONFIG_SMP - ll.d t0, t1, 0 -#else - ld.d t0, t1, 0 + ll.d ra, t1, 0 #endif - srli.d ra, t0, _PAGE_PRESENT_SHIFT - andi ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT) - xori ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT) - bnez ra, nopage_tlb_store - - tlbsrch - ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED) + andi t0, ra, _PAGE_PRESENT | _PAGE_WRITE + xori t0, t0, _PAGE_PRESENT | _PAGE_WRITE + bnez t0, nopage_tlb_store #ifdef CONFIG_SMP - sc.d t0, t1, 0 - beqz t0, tlb_huge_update_store - ld.d t0, t1, 0 + ori t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED) + sc.d t0, t1, 0 + beqz t0, tlb_huge_update_store + ori t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED) #else - st.d t0, t1, 0 + rotri.d ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1) + ori t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED) + st.d t0, t1, 0 #endif + tlbsrch addu16i.d t1, zero, -(CSR_TLBIDX_EHINV >> 16) addi.d ra, t1, 0 csrxchg ra, t1, LOONGARCH_CSR_TLBIDX tlbwr - csrxchg zero, t1, LOONGARCH_CSR_TLBIDX + csrxchg zero, t1, LOONGARCH_CSR_TLBIDX /* * A huge PTE describes an area the size of the * configured huge page size. This is twice the @@ -334,21 +304,20 @@ tlb_huge_update_store: * address space. 
*/ /* Huge page: Move Global bit */ - xori t0, t0, _PAGE_HUGE - lu12i.w t1, _PAGE_HGLOBAL >> 12 - and t1, t0, t1 - srli.d t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT) - or t0, t0, t1 + xori t0, t0, _PAGE_HUGE + lu12i.w t1, _PAGE_HGLOBAL >> 12 + and t1, t0, t1 + srli.d t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT) + or t0, t0, t1 - addi.d ra, t0, 0 - csrwr t0, LOONGARCH_CSR_TLBELO0 - addi.d t0, ra, 0 + move ra, t0 + csrwr ra, LOONGARCH_CSR_TLBELO0 /* Convert to entrylo1 */ - addi.d t1, zero, 1 - slli.d t1, t1, (HPAGE_SHIFT - 1) - add.d t0, t0, t1 - csrwr t0, LOONGARCH_CSR_TLBELO1 + addi.d t1, zero, 1 + slli.d t1, t1, (HPAGE_SHIFT - 1) + add.d t0, t0, t1 + csrwr t0, LOONGARCH_CSR_TLBELO1 /* Set huge page tlb entry size */ addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16) @@ -362,126 +331,110 @@ tlb_huge_update_store: addu16i.d t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16)) csrxchg t1, t0, LOONGARCH_CSR_TLBIDX + csrrd t0, EXCEPTION_KS0 + csrrd t1, EXCEPTION_KS1 + csrrd ra, EXCEPTION_KS2 + ertn + nopage_tlb_store: - dbar 0 - csrrd ra, EXCEPTION_KS2 - la.abs t0, tlb_do_page_fault_1 - jr t0 + dbar 0 + csrrd ra, EXCEPTION_KS2 + la.abs t0, tlb_do_page_fault_1 + jr t0 SYM_FUNC_END(handle_tlb_store) SYM_FUNC_START(handle_tlb_modify) - csrwr t0, EXCEPTION_KS0 - csrwr t1, EXCEPTION_KS1 - csrwr ra, EXCEPTION_KS2 + csrwr t0, EXCEPTION_KS0 + csrwr t1, EXCEPTION_KS1 + csrwr ra, EXCEPTION_KS2 /* * The vmalloc handling is not in the hotpath. 
*/ - csrrd t0, LOONGARCH_CSR_BADV - bltz t0, vmalloc_modify - csrrd t1, LOONGARCH_CSR_PGDL + csrrd t0, LOONGARCH_CSR_BADV + bltz t0, vmalloc_modify + csrrd t1, LOONGARCH_CSR_PGDL vmalloc_done_modify: /* Get PGD offset in bytes */ - srli.d t0, t0, PGDIR_SHIFT - andi t0, t0, (PTRS_PER_PGD - 1) - slli.d t0, t0, 3 - add.d t1, t1, t0 + bstrpick.d ra, t0, PTRS_PER_PGD_BITS + PGDIR_SHIFT - 1, PGDIR_SHIFT + alsl.d t1, ra, t1, 3 #if CONFIG_PGTABLE_LEVELS > 3 - csrrd t0, LOONGARCH_CSR_BADV - ld.d t1, t1, 0 - srli.d t0, t0, PUD_SHIFT - andi t0, t0, (PTRS_PER_PUD - 1) - slli.d t0, t0, 3 - add.d t1, t1, t0 + ld.d t1, t1, 0 + bstrpick.d ra, t0, PTRS_PER_PUD_BITS + PUD_SHIFT - 1, PUD_SHIFT + alsl.d t1, ra, t1, 3 #endif #if CONFIG_PGTABLE_LEVELS > 2 - csrrd t0, LOONGARCH_CSR_BADV - ld.d t1, t1, 0 - srli.d t0, t0, PMD_SHIFT - andi t0, t0, (PTRS_PER_PMD - 1) - slli.d t0, t0, 3 - add.d t1, t1, t0 + ld.d t1, t1, 0 + bstrpick.d ra, t0, PTRS_PER_PMD_BITS + PMD_SHIFT - 1, PMD_SHIFT + alsl.d t1, ra, t1, 3 #endif - ld.d ra, t1, 0 + ld.d ra, t1, 0 /* * For huge tlb entries, pmde doesn't contain an address but * instead contains the tlb pte. Check the PAGE_HUGE bit and * see if we need to jump to huge tlb processing. 
*/ - andi t0, ra, _PAGE_HUGE - bnez t0, tlb_huge_update_modify + rotri.d ra, ra, _PAGE_HUGE_SHIFT + 1 + bltz ra, tlb_huge_update_modify - csrrd t0, LOONGARCH_CSR_BADV - srli.d t0, t0, PAGE_SHIFT - andi t0, t0, (PTRS_PER_PTE - 1) - slli.d t0, t0, _PTE_T_LOG2 - add.d t1, ra, t0 + rotri.d ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1) + bstrpick.d t0, t0, PTRS_PER_PTE_BITS + PAGE_SHIFT - 1, PAGE_SHIFT + alsl.d t1, t0, ra, _PTE_T_LOG2 #ifdef CONFIG_SMP smp_pgtable_change_modify: -#endif -#ifdef CONFIG_SMP - ll.d t0, t1, 0 + ll.d t0, t1, 0 #else - ld.d t0, t1, 0 + ld.d t0, t1, 0 #endif - tlbsrch - - srli.d ra, t0, _PAGE_WRITE_SHIFT - andi ra, ra, 1 - beqz ra, nopage_tlb_modify + andi ra, t0, _PAGE_WRITE + beqz ra, nopage_tlb_modify - ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED) + ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED) #ifdef CONFIG_SMP - sc.d t0, t1, 0 - beqz t0, smp_pgtable_change_modify + sc.d t0, t1, 0 + beqz t0, smp_pgtable_change_modify #else - st.d t0, t1, 0 + st.d t0, t1, 0 #endif - ori t1, t1, 8 - xori t1, t1, 8 - ld.d t0, t1, 0 - ld.d t1, t1, 8 - csrwr t0, LOONGARCH_CSR_TLBELO0 - csrwr t1, LOONGARCH_CSR_TLBELO1 + tlbsrch + bstrins.d t1, zero, 3, 3 + ld.d t0, t1, 0 + ld.d t1, t1, 8 + csrwr t0, LOONGARCH_CSR_TLBELO0 + csrwr t1, LOONGARCH_CSR_TLBELO1 tlbwr -leave_modify: - csrrd t0, EXCEPTION_KS0 - csrrd t1, EXCEPTION_KS1 - csrrd ra, EXCEPTION_KS2 + + csrrd t0, EXCEPTION_KS0 + csrrd t1, EXCEPTION_KS1 + csrrd ra, EXCEPTION_KS2 ertn + #ifdef CONFIG_64BIT vmalloc_modify: - la.abs t1, swapper_pg_dir - b vmalloc_done_modify + la.abs t1, swapper_pg_dir + b vmalloc_done_modify #endif - /* - * This is the entry point when - * build_tlbchange_handler_head spots a huge page. - */ + /* This is the entry point of a huge page. 
*/ tlb_huge_update_modify: #ifdef CONFIG_SMP - ll.d t0, t1, 0 -#else - ld.d t0, t1, 0 + ll.d ra, t1, 0 #endif - - srli.d ra, t0, _PAGE_WRITE_SHIFT - andi ra, ra, 1 - beqz ra, nopage_tlb_modify - - tlbsrch - ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED) + andi t0, ra, _PAGE_WRITE + beqz t0, nopage_tlb_modify #ifdef CONFIG_SMP - sc.d t0, t1, 0 - beqz t0, tlb_huge_update_modify - ld.d t0, t1, 0 + ori t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED) + sc.d t0, t1, 0 + beqz t0, tlb_huge_update_modify + ori t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED) #else - st.d t0, t1, 0 + rotri.d ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1) + ori t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED) + st.d t0, t1, 0 #endif /* * A huge PTE describes an area the size of the @@ -493,21 +446,20 @@ tlb_huge_update_modify: * address space. */ /* Huge page: Move Global bit */ - xori t0, t0, _PAGE_HUGE - lu12i.w t1, _PAGE_HGLOBAL >> 12 - and t1, t0, t1 - srli.d t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT) - or t0, t0, t1 + xori t0, t0, _PAGE_HUGE + lu12i.w t1, _PAGE_HGLOBAL >> 12 + and t1, t0, t1 + srli.d t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT) + or t0, t0, t1 - addi.d ra, t0, 0 - csrwr t0, LOONGARCH_CSR_TLBELO0 - addi.d t0, ra, 0 + move ra, t0 + csrwr ra, LOONGARCH_CSR_TLBELO0 /* Convert to entrylo1 */ - addi.d t1, zero, 1 - slli.d t1, t1, (HPAGE_SHIFT - 1) - add.d t0, t0, t1 - csrwr t0, LOONGARCH_CSR_TLBELO1 + addi.d t1, zero, 1 + slli.d t1, t1, (HPAGE_SHIFT - 1) + add.d t0, t0, t1 + csrwr t0, LOONGARCH_CSR_TLBELO1 /* Set huge page tlb entry size */ addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16) @@ -521,26 +473,31 @@ tlb_huge_update_modify: addu16i.d t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16)) csrxchg t1, t0, LOONGARCH_CSR_TLBIDX + csrrd t0, EXCEPTION_KS0 + csrrd t1, EXCEPTION_KS1 + csrrd ra, EXCEPTION_KS2 + ertn + nopage_tlb_modify: - dbar 0 - csrrd ra, EXCEPTION_KS2 - la.abs t0, tlb_do_page_fault_1 - jr t0 + dbar 0 + csrrd ra, EXCEPTION_KS2 + la.abs 
t0, tlb_do_page_fault_1 + jr t0 SYM_FUNC_END(handle_tlb_modify) SYM_FUNC_START(handle_tlb_refill) - csrwr t0, LOONGARCH_CSR_TLBRSAVE - csrrd t0, LOONGARCH_CSR_PGD - lddir t0, t0, 3 + csrwr t0, LOONGARCH_CSR_TLBRSAVE + csrrd t0, LOONGARCH_CSR_PGD + lddir t0, t0, 3 #if CONFIG_PGTABLE_LEVELS > 3 - lddir t0, t0, 2 + lddir t0, t0, 2 #endif #if CONFIG_PGTABLE_LEVELS > 2 - lddir t0, t0, 1 + lddir t0, t0, 1 #endif - ldpte t0, 0 - ldpte t0, 1 + ldpte t0, 0 + ldpte t0, 1 tlbfill - csrrd t0, LOONGARCH_CSR_TLBRSAVE + csrrd t0, LOONGARCH_CSR_TLBRSAVE ertn SYM_FUNC_END(handle_tlb_refill) -- GitLab From b61a40afca164a9bd066f749beff3bf209c5e209 Mon Sep 17 00:00:00 2001 From: Huacai Chen <chenhuacai@loongson.cn> Date: Wed, 12 Oct 2022 16:36:14 +0800 Subject: [PATCH 1797/2223] LoongArch: Refactor cache probe and flush methods Current cache probe and flush methods have some drawbacks: 1, Assume there are 3 cache levels and only 3 levels; 2, Assume L1 = I + D, L2 = V, L3 = S, V is exclusive, S is inclusive. However, the fact is I + D, I + D + V, I + D + S and I + D + V + S are all valid. So, refactor the cache probe and flush methods to adapt more types of cache hierarchy. 
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/include/asm/cacheflush.h | 87 +++++---- arch/loongarch/include/asm/cacheops.h | 36 ++-- arch/loongarch/include/asm/cpu-features.h | 5 - arch/loongarch/include/asm/cpu-info.h | 21 ++- arch/loongarch/include/asm/loongarch.h | 33 +--- arch/loongarch/include/asm/setup.h | 2 + arch/loongarch/kernel/cacheinfo.c | 98 +++------- arch/loongarch/kernel/traps.c | 3 - arch/loongarch/mm/cache.c | 211 ++++++++++++---------- arch/loongarch/pci/pci.c | 7 +- 10 files changed, 236 insertions(+), 267 deletions(-) diff --git a/arch/loongarch/include/asm/cacheflush.h b/arch/loongarch/include/asm/cacheflush.h index 670900141b7c8..0681788eb474a 100644 --- a/arch/loongarch/include/asm/cacheflush.h +++ b/arch/loongarch/include/asm/cacheflush.h @@ -6,10 +6,33 @@ #define _ASM_CACHEFLUSH_H #include <linux/mm.h> -#include <asm/cpu-features.h> +#include <asm/cpu-info.h> #include <asm/cacheops.h> -extern void local_flush_icache_range(unsigned long start, unsigned long end); +static inline bool cache_present(struct cache_desc *cdesc) +{ + return cdesc->flags & CACHE_PRESENT; +} + +static inline bool cache_private(struct cache_desc *cdesc) +{ + return cdesc->flags & CACHE_PRIVATE; +} + +static inline bool cache_inclusive(struct cache_desc *cdesc) +{ + return cdesc->flags & CACHE_INCLUSIVE; +} + +static inline unsigned int cpu_last_level_cache_line_size(void) +{ + int cache_present = boot_cpu_data.cache_leaves_present; + + return boot_cpu_data.cache_leaves[cache_present - 1].linesz; +} + +asmlinkage void __flush_cache_all(void); +void local_flush_icache_range(unsigned long start, unsigned long end); #define flush_icache_range local_flush_icache_range #define flush_icache_user_range local_flush_icache_range @@ -35,44 +58,30 @@ extern void local_flush_icache_range(unsigned long start, unsigned long end); : \ : "i" (op), "ZC" (*(unsigned char *)(addr))) -static inline void flush_icache_line_indexed(unsigned long addr) -{ - 
cache_op(Index_Invalidate_I, addr); -} - -static inline void flush_dcache_line_indexed(unsigned long addr) -{ - cache_op(Index_Writeback_Inv_D, addr); -} - -static inline void flush_vcache_line_indexed(unsigned long addr) -{ - cache_op(Index_Writeback_Inv_V, addr); -} - -static inline void flush_scache_line_indexed(unsigned long addr) -{ - cache_op(Index_Writeback_Inv_S, addr); -} - -static inline void flush_icache_line(unsigned long addr) -{ - cache_op(Hit_Invalidate_I, addr); -} - -static inline void flush_dcache_line(unsigned long addr) -{ - cache_op(Hit_Writeback_Inv_D, addr); -} - -static inline void flush_vcache_line(unsigned long addr) -{ - cache_op(Hit_Writeback_Inv_V, addr); -} - -static inline void flush_scache_line(unsigned long addr) +static inline void flush_cache_line(int leaf, unsigned long addr) { - cache_op(Hit_Writeback_Inv_S, addr); + switch (leaf) { + case Cache_LEAF0: + cache_op(Index_Writeback_Inv_LEAF0, addr); + break; + case Cache_LEAF1: + cache_op(Index_Writeback_Inv_LEAF1, addr); + break; + case Cache_LEAF2: + cache_op(Index_Writeback_Inv_LEAF2, addr); + break; + case Cache_LEAF3: + cache_op(Index_Writeback_Inv_LEAF3, addr); + break; + case Cache_LEAF4: + cache_op(Index_Writeback_Inv_LEAF4, addr); + break; + case Cache_LEAF5: + cache_op(Index_Writeback_Inv_LEAF5, addr); + break; + default: + break; + } } #include <asm-generic/cacheflush.h> diff --git a/arch/loongarch/include/asm/cacheops.h b/arch/loongarch/include/asm/cacheops.h index dc280efecebd8..0f4a86f8e2bea 100644 --- a/arch/loongarch/include/asm/cacheops.h +++ b/arch/loongarch/include/asm/cacheops.h @@ -8,16 +8,18 @@ #define __ASM_CACHEOPS_H /* - * Most cache ops are split into a 2 bit field identifying the cache, and a 3 + * Most cache ops are split into a 3 bit field identifying the cache, and a 2 * bit field identifying the cache operation. 
*/ -#define CacheOp_Cache 0x03 -#define CacheOp_Op 0x1c +#define CacheOp_Cache 0x07 +#define CacheOp_Op 0x18 -#define Cache_I 0x00 -#define Cache_D 0x01 -#define Cache_V 0x02 -#define Cache_S 0x03 +#define Cache_LEAF0 0x00 +#define Cache_LEAF1 0x01 +#define Cache_LEAF2 0x02 +#define Cache_LEAF3 0x03 +#define Cache_LEAF4 0x04 +#define Cache_LEAF5 0x05 #define Index_Invalidate 0x08 #define Index_Writeback_Inv 0x08 @@ -25,13 +27,17 @@ #define Hit_Writeback_Inv 0x10 #define CacheOp_User_Defined 0x18 -#define Index_Invalidate_I (Cache_I | Index_Invalidate) -#define Index_Writeback_Inv_D (Cache_D | Index_Writeback_Inv) -#define Index_Writeback_Inv_V (Cache_V | Index_Writeback_Inv) -#define Index_Writeback_Inv_S (Cache_S | Index_Writeback_Inv) -#define Hit_Invalidate_I (Cache_I | Hit_Invalidate) -#define Hit_Writeback_Inv_D (Cache_D | Hit_Writeback_Inv) -#define Hit_Writeback_Inv_V (Cache_V | Hit_Writeback_Inv) -#define Hit_Writeback_Inv_S (Cache_S | Hit_Writeback_Inv) +#define Index_Writeback_Inv_LEAF0 (Cache_LEAF0 | Index_Writeback_Inv) +#define Index_Writeback_Inv_LEAF1 (Cache_LEAF1 | Index_Writeback_Inv) +#define Index_Writeback_Inv_LEAF2 (Cache_LEAF2 | Index_Writeback_Inv) +#define Index_Writeback_Inv_LEAF3 (Cache_LEAF3 | Index_Writeback_Inv) +#define Index_Writeback_Inv_LEAF4 (Cache_LEAF4 | Index_Writeback_Inv) +#define Index_Writeback_Inv_LEAF5 (Cache_LEAF5 | Index_Writeback_Inv) +#define Hit_Writeback_Inv_LEAF0 (Cache_LEAF0 | Hit_Writeback_Inv) +#define Hit_Writeback_Inv_LEAF1 (Cache_LEAF1 | Hit_Writeback_Inv) +#define Hit_Writeback_Inv_LEAF2 (Cache_LEAF2 | Hit_Writeback_Inv) +#define Hit_Writeback_Inv_LEAF3 (Cache_LEAF3 | Hit_Writeback_Inv) +#define Hit_Writeback_Inv_LEAF4 (Cache_LEAF4 | Hit_Writeback_Inv) +#define Hit_Writeback_Inv_LEAF5 (Cache_LEAF5 | Hit_Writeback_Inv) #endif /* __ASM_CACHEOPS_H */ diff --git a/arch/loongarch/include/asm/cpu-features.h b/arch/loongarch/include/asm/cpu-features.h index a8d87c40a0eb0..b07974218393d 100644 --- 
a/arch/loongarch/include/asm/cpu-features.h +++ b/arch/loongarch/include/asm/cpu-features.h @@ -19,11 +19,6 @@ #define cpu_has_loongarch32 (cpu_data[0].isa_level & LOONGARCH_CPU_ISA_32BIT) #define cpu_has_loongarch64 (cpu_data[0].isa_level & LOONGARCH_CPU_ISA_64BIT) -#define cpu_icache_line_size() cpu_data[0].icache.linesz -#define cpu_dcache_line_size() cpu_data[0].dcache.linesz -#define cpu_vcache_line_size() cpu_data[0].vcache.linesz -#define cpu_scache_line_size() cpu_data[0].scache.linesz - #ifdef CONFIG_32BIT # define cpu_has_64bits (cpu_data[0].isa_level & LOONGARCH_CPU_ISA_64BIT) # define cpu_vabits 31 diff --git a/arch/loongarch/include/asm/cpu-info.h b/arch/loongarch/include/asm/cpu-info.h index b6c4f96079dfe..cd73a6f57fe37 100644 --- a/arch/loongarch/include/asm/cpu-info.h +++ b/arch/loongarch/include/asm/cpu-info.h @@ -10,18 +10,28 @@ #include <asm/loongarch.h> +/* cache_desc->flags */ +enum { + CACHE_PRESENT = (1 << 0), + CACHE_PRIVATE = (1 << 1), /* core private cache */ + CACHE_INCLUSIVE = (1 << 2), /* include the inner level caches */ +}; + /* * Descriptor for a cache */ struct cache_desc { - unsigned int waysize; /* Bytes per way */ + unsigned char type; + unsigned char level; unsigned short sets; /* Number of lines per set */ unsigned char ways; /* Number of ways */ unsigned char linesz; /* Size of line in bytes */ - unsigned char waybit; /* Bits to select in a cache set */ unsigned char flags; /* Flags describing cache properties */ }; +#define CACHE_LEVEL_MAX 3 +#define CACHE_LEAVES_MAX 6 + struct cpuinfo_loongarch { u64 asid_cache; unsigned long asid_mask; @@ -40,11 +50,8 @@ struct cpuinfo_loongarch { int tlbsizemtlb; int tlbsizestlbsets; int tlbsizestlbways; - struct cache_desc icache; /* Primary I-cache */ - struct cache_desc dcache; /* Primary D or combined I/D cache */ - struct cache_desc vcache; /* Victim cache, between pcache and scache */ - struct cache_desc scache; /* Secondary cache */ - struct cache_desc tcache; /* Tertiary/split 
secondary cache */ + int cache_leaves_present; /* number of cache_leaves[] elements */ + struct cache_desc cache_leaves[CACHE_LEAVES_MAX]; int core; /* physical core number in package */ int package;/* physical package number */ int vabits; /* Virtual Address size in bits */ diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 3ba4f7e87cd25..7f8d57a61c8bd 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -187,36 +187,15 @@ static inline u32 read_cpucfg(u32 reg) #define CPUCFG16_L3_DINCL BIT(16) #define LOONGARCH_CPUCFG17 0x11 -#define CPUCFG17_L1I_WAYS_M GENMASK(15, 0) -#define CPUCFG17_L1I_SETS_M GENMASK(23, 16) -#define CPUCFG17_L1I_SIZE_M GENMASK(30, 24) -#define CPUCFG17_L1I_WAYS 0 -#define CPUCFG17_L1I_SETS 16 -#define CPUCFG17_L1I_SIZE 24 - #define LOONGARCH_CPUCFG18 0x12 -#define CPUCFG18_L1D_WAYS_M GENMASK(15, 0) -#define CPUCFG18_L1D_SETS_M GENMASK(23, 16) -#define CPUCFG18_L1D_SIZE_M GENMASK(30, 24) -#define CPUCFG18_L1D_WAYS 0 -#define CPUCFG18_L1D_SETS 16 -#define CPUCFG18_L1D_SIZE 24 - #define LOONGARCH_CPUCFG19 0x13 -#define CPUCFG19_L2_WAYS_M GENMASK(15, 0) -#define CPUCFG19_L2_SETS_M GENMASK(23, 16) -#define CPUCFG19_L2_SIZE_M GENMASK(30, 24) -#define CPUCFG19_L2_WAYS 0 -#define CPUCFG19_L2_SETS 16 -#define CPUCFG19_L2_SIZE 24 - #define LOONGARCH_CPUCFG20 0x14 -#define CPUCFG20_L3_WAYS_M GENMASK(15, 0) -#define CPUCFG20_L3_SETS_M GENMASK(23, 16) -#define CPUCFG20_L3_SIZE_M GENMASK(30, 24) -#define CPUCFG20_L3_WAYS 0 -#define CPUCFG20_L3_SETS 16 -#define CPUCFG20_L3_SIZE 24 +#define CPUCFG_CACHE_WAYS_M GENMASK(15, 0) +#define CPUCFG_CACHE_SETS_M GENMASK(23, 16) +#define CPUCFG_CACHE_LSIZE_M GENMASK(30, 24) +#define CPUCFG_CACHE_WAYS 0 +#define CPUCFG_CACHE_SETS 16 +#define CPUCFG_CACHE_LSIZE 24 #define LOONGARCH_CPUCFG48 0x30 #define CPUCFG48_MCSR_LCK BIT(0) diff --git a/arch/loongarch/include/asm/setup.h b/arch/loongarch/include/asm/setup.h index 
6d7d2a3e23dd6..ca373f8e3c4db 100644 --- a/arch/loongarch/include/asm/setup.h +++ b/arch/loongarch/include/asm/setup.h @@ -13,7 +13,9 @@ extern unsigned long eentry; extern unsigned long tlbrentry; +extern void tlb_init(int cpu); extern void cpu_cache_init(void); +extern void cache_error_setup(void); extern void per_cpu_trap_init(int cpu); extern void set_handler(unsigned long offset, void *addr, unsigned long len); extern void set_merr_handler(unsigned long offset, void *addr, unsigned long len); diff --git a/arch/loongarch/kernel/cacheinfo.c b/arch/loongarch/kernel/cacheinfo.c index 4662b06269f42..c7988f757281c 100644 --- a/arch/loongarch/kernel/cacheinfo.c +++ b/arch/loongarch/kernel/cacheinfo.c @@ -5,73 +5,34 @@ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited */ #include <linux/cacheinfo.h> +#include <linux/topology.h> #include <asm/bootinfo.h> #include <asm/cpu-info.h> -/* Populates leaf and increments to next leaf */ -#define populate_cache(cache, leaf, c_level, c_type) \ -do { \ - leaf->type = c_type; \ - leaf->level = c_level; \ - leaf->coherency_line_size = c->cache.linesz; \ - leaf->number_of_sets = c->cache.sets; \ - leaf->ways_of_associativity = c->cache.ways; \ - leaf->size = c->cache.linesz * c->cache.sets * \ - c->cache.ways; \ - if (leaf->level > 2) \ - leaf->size *= nodes_per_package; \ - leaf++; \ -} while (0) - int init_cache_level(unsigned int cpu) { - struct cpuinfo_loongarch *c = &current_cpu_data; + int cache_present = current_cpu_data.cache_leaves_present; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - int levels = 0, leaves = 0; - - /* - * If Dcache is not set, we assume the cache structures - * are not properly initialized. - */ - if (c->dcache.waysize) - levels += 1; - else - return -ENOENT; - - - leaves += (c->icache.waysize) ? 
2 : 1; - - if (c->vcache.waysize) { - levels++; - leaves++; - } - if (c->scache.waysize) { - levels++; - leaves++; - } + this_cpu_ci->num_levels = + current_cpu_data.cache_leaves[cache_present - 1].level; + this_cpu_ci->num_leaves = cache_present; - if (c->tcache.waysize) { - levels++; - leaves++; - } - - this_cpu_ci->num_levels = levels; - this_cpu_ci->num_leaves = leaves; return 0; } static inline bool cache_leaves_are_shared(struct cacheinfo *this_leaf, struct cacheinfo *sib_leaf) { - return !((this_leaf->level == 1) || (this_leaf->level == 2)); + return (!(*(unsigned char *)(this_leaf->priv) & CACHE_PRIVATE) + && !(*(unsigned char *)(sib_leaf->priv) & CACHE_PRIVATE)); } static void cache_cpumap_setup(unsigned int cpu) { - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf, *sib_leaf; unsigned int index; + struct cacheinfo *this_leaf, *sib_leaf; + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); for (index = 0; index < this_cpu_ci->num_leaves; index++) { unsigned int i; @@ -85,8 +46,10 @@ static void cache_cpumap_setup(unsigned int cpu) for_each_online_cpu(i) { struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i); - if (i == cpu || !sib_cpu_ci->info_list) - continue;/* skip if itself or no cacheinfo */ + if (i == cpu || !sib_cpu_ci->info_list || + (cpu_to_node(i) != cpu_to_node(cpu))) + continue; + sib_leaf = sib_cpu_ci->info_list + index; if (cache_leaves_are_shared(this_leaf, sib_leaf)) { cpumask_set_cpu(cpu, &sib_leaf->shared_cpu_map); @@ -98,31 +61,24 @@ static void cache_cpumap_setup(unsigned int cpu) int populate_cache_leaves(unsigned int cpu) { - int level = 1, nodes_per_package = 1; - struct cpuinfo_loongarch *c = &current_cpu_data; + int i, cache_present = current_cpu_data.cache_leaves_present; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct cacheinfo *this_leaf = this_cpu_ci->info_list; - - if (loongson_sysconf.nr_nodes > 1) - nodes_per_package = loongson_sysconf.cores_per_package - / 
loongson_sysconf.cores_per_node; - - if (c->icache.waysize) { - populate_cache(dcache, this_leaf, level, CACHE_TYPE_DATA); - populate_cache(icache, this_leaf, level++, CACHE_TYPE_INST); - } else { - populate_cache(dcache, this_leaf, level++, CACHE_TYPE_UNIFIED); + struct cache_desc *cd, *cdesc = current_cpu_data.cache_leaves; + + for (i = 0; i < cache_present; i++) { + cd = cdesc + i; + + this_leaf->type = cd->type; + this_leaf->level = cd->level; + this_leaf->coherency_line_size = cd->linesz; + this_leaf->number_of_sets = cd->sets; + this_leaf->ways_of_associativity = cd->ways; + this_leaf->size = cd->linesz * cd->sets * cd->ways; + this_leaf->priv = &cd->flags; + this_leaf++; } - if (c->vcache.waysize) - populate_cache(vcache, this_leaf, level++, CACHE_TYPE_UNIFIED); - - if (c->scache.waysize) - populate_cache(scache, this_leaf, level++, CACHE_TYPE_UNIFIED); - - if (c->tcache.waysize) - populate_cache(tcache, this_leaf, level++, CACHE_TYPE_UNIFIED); - cache_cpumap_setup(cpu); this_cpu_ci->cpu_map_populated = true; diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 5010e95cef847..a5e8bd5d79484 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -620,9 +620,6 @@ asmlinkage void noinstr do_vint(struct pt_regs *regs, unsigned long sp) irqentry_exit(regs, state); } -extern void tlb_init(int cpu); -extern void cache_error_setup(void); - unsigned long eentry; unsigned long tlbrentry; diff --git a/arch/loongarch/mm/cache.c b/arch/loongarch/mm/cache.c index e8c68dcf6ab20..72685a48eaf08 100644 --- a/arch/loongarch/mm/cache.c +++ b/arch/loongarch/mm/cache.c @@ -6,8 +6,8 @@ * Copyright (C) 1994 - 2003, 06, 07 by Ralf Baechle (ralf@linux-mips.org) * Copyright (C) 2007 MIPS Technologies, Inc. 
*/ +#include <linux/cacheinfo.h> #include <linux/export.h> -#include <linux/fcntl.h> #include <linux/fs.h> #include <linux/highmem.h> #include <linux/kernel.h> @@ -16,14 +16,21 @@ #include <linux/sched.h> #include <linux/syscalls.h> +#include <asm/bootinfo.h> #include <asm/cacheflush.h> #include <asm/cpu.h> #include <asm/cpu-features.h> -#include <asm/dma.h> #include <asm/loongarch.h> +#include <asm/numa.h> #include <asm/processor.h> #include <asm/setup.h> +void cache_error_setup(void) +{ + extern char __weak except_vec_cex; + set_merr_handler(0x0, &except_vec_cex, 0x80); +} + /* * LoongArch maintains ICache/DCache coherency by hardware, * we just need "ibar" to avoid instruction hazard here. @@ -34,109 +41,121 @@ void local_flush_icache_range(unsigned long start, unsigned long end) } EXPORT_SYMBOL(local_flush_icache_range); -void cache_error_setup(void) -{ - extern char __weak except_vec_cex; - set_merr_handler(0x0, &except_vec_cex, 0x80); -} - -static unsigned long icache_size __read_mostly; -static unsigned long dcache_size __read_mostly; -static unsigned long vcache_size __read_mostly; -static unsigned long scache_size __read_mostly; - -static char *way_string[] = { NULL, "direct mapped", "2-way", - "3-way", "4-way", "5-way", "6-way", "7-way", "8-way", - "9-way", "10-way", "11-way", "12-way", - "13-way", "14-way", "15-way", "16-way", -}; - -static void probe_pcache(void) +static void flush_cache_leaf(unsigned int leaf) { - struct cpuinfo_loongarch *c = &current_cpu_data; - unsigned int lsize, sets, ways; - unsigned int config; - - config = read_cpucfg(LOONGARCH_CPUCFG17); - lsize = 1 << ((config & CPUCFG17_L1I_SIZE_M) >> CPUCFG17_L1I_SIZE); - sets = 1 << ((config & CPUCFG17_L1I_SETS_M) >> CPUCFG17_L1I_SETS); - ways = ((config & CPUCFG17_L1I_WAYS_M) >> CPUCFG17_L1I_WAYS) + 1; - - c->icache.linesz = lsize; - c->icache.sets = sets; - c->icache.ways = ways; - icache_size = sets * ways * lsize; - c->icache.waysize = icache_size / c->icache.ways; - - config = 
read_cpucfg(LOONGARCH_CPUCFG18); - lsize = 1 << ((config & CPUCFG18_L1D_SIZE_M) >> CPUCFG18_L1D_SIZE); - sets = 1 << ((config & CPUCFG18_L1D_SETS_M) >> CPUCFG18_L1D_SETS); - ways = ((config & CPUCFG18_L1D_WAYS_M) >> CPUCFG18_L1D_WAYS) + 1; - - c->dcache.linesz = lsize; - c->dcache.sets = sets; - c->dcache.ways = ways; - dcache_size = sets * ways * lsize; - c->dcache.waysize = dcache_size / c->dcache.ways; - - c->options |= LOONGARCH_CPU_PREFETCH; - - pr_info("Primary instruction cache %ldkB, %s, %s, linesize %d bytes.\n", - icache_size >> 10, way_string[c->icache.ways], "VIPT", c->icache.linesz); - - pr_info("Primary data cache %ldkB, %s, %s, %s, linesize %d bytes\n", - dcache_size >> 10, way_string[c->dcache.ways], "VIPT", "no aliases", c->dcache.linesz); + int i, j, nr_nodes; + uint64_t addr = CSR_DMW0_BASE; + struct cache_desc *cdesc = current_cpu_data.cache_leaves + leaf; + + nr_nodes = cache_private(cdesc) ? 1 : loongson_sysconf.nr_nodes; + + do { + for (i = 0; i < cdesc->sets; i++) { + for (j = 0; j < cdesc->ways; j++) { + flush_cache_line(leaf, addr); + addr++; + } + + addr -= cdesc->ways; + addr += cdesc->linesz; + } + addr += (1ULL << NODE_ADDRSPACE_SHIFT); + } while (--nr_nodes > 0); } -static void probe_vcache(void) +asmlinkage __visible void __flush_cache_all(void) { - struct cpuinfo_loongarch *c = &current_cpu_data; - unsigned int lsize, sets, ways; - unsigned int config; - - config = read_cpucfg(LOONGARCH_CPUCFG19); - lsize = 1 << ((config & CPUCFG19_L2_SIZE_M) >> CPUCFG19_L2_SIZE); - sets = 1 << ((config & CPUCFG19_L2_SETS_M) >> CPUCFG19_L2_SETS); - ways = ((config & CPUCFG19_L2_WAYS_M) >> CPUCFG19_L2_WAYS) + 1; - - c->vcache.linesz = lsize; - c->vcache.sets = sets; - c->vcache.ways = ways; - vcache_size = lsize * sets * ways; - c->vcache.waysize = vcache_size / c->vcache.ways; - - pr_info("Unified victim cache %ldkB %s, linesize %d bytes.\n", - vcache_size >> 10, way_string[c->vcache.ways], c->vcache.linesz); + int leaf; + struct cache_desc *cdesc = 
current_cpu_data.cache_leaves; + unsigned int cache_present = current_cpu_data.cache_leaves_present; + + leaf = cache_present - 1; + if (cache_inclusive(cdesc + leaf)) { + flush_cache_leaf(leaf); + return; + } + + for (leaf = 0; leaf < cache_present; leaf++) + flush_cache_leaf(leaf); } -static void probe_scache(void) -{ - struct cpuinfo_loongarch *c = &current_cpu_data; - unsigned int lsize, sets, ways; - unsigned int config; - - config = read_cpucfg(LOONGARCH_CPUCFG20); - lsize = 1 << ((config & CPUCFG20_L3_SIZE_M) >> CPUCFG20_L3_SIZE); - sets = 1 << ((config & CPUCFG20_L3_SETS_M) >> CPUCFG20_L3_SETS); - ways = ((config & CPUCFG20_L3_WAYS_M) >> CPUCFG20_L3_WAYS) + 1; - - c->scache.linesz = lsize; - c->scache.sets = sets; - c->scache.ways = ways; - /* 4 cores. scaches are shared */ - scache_size = lsize * sets * ways; - c->scache.waysize = scache_size / c->scache.ways; - - pr_info("Unified secondary cache %ldkB %s, linesize %d bytes.\n", - scache_size >> 10, way_string[c->scache.ways], c->scache.linesz); -} +#define L1IUPRE (1 << 0) +#define L1IUUNIFY (1 << 1) +#define L1DPRE (1 << 2) + +#define LXIUPRE (1 << 0) +#define LXIUUNIFY (1 << 1) +#define LXIUPRIV (1 << 2) +#define LXIUINCL (1 << 3) +#define LXDPRE (1 << 4) +#define LXDPRIV (1 << 5) +#define LXDINCL (1 << 6) + +#define populate_cache_properties(cfg0, cdesc, level, leaf) \ +do { \ + unsigned int cfg1; \ + \ + cfg1 = read_cpucfg(LOONGARCH_CPUCFG17 + leaf); \ + if (level == 1) { \ + cdesc->flags |= CACHE_PRIVATE; \ + } else { \ + if (cfg0 & LXIUPRIV) \ + cdesc->flags |= CACHE_PRIVATE; \ + if (cfg0 & LXIUINCL) \ + cdesc->flags |= CACHE_INCLUSIVE; \ + } \ + cdesc->level = level; \ + cdesc->flags |= CACHE_PRESENT; \ + cdesc->ways = ((cfg1 & CPUCFG_CACHE_WAYS_M) >> CPUCFG_CACHE_WAYS) + 1; \ + cdesc->sets = 1 << ((cfg1 & CPUCFG_CACHE_SETS_M) >> CPUCFG_CACHE_SETS); \ + cdesc->linesz = 1 << ((cfg1 & CPUCFG_CACHE_LSIZE_M) >> CPUCFG_CACHE_LSIZE); \ + cdesc++; leaf++; \ +} while (0) void cpu_cache_init(void) { - 
probe_pcache(); - probe_vcache(); - probe_scache(); - + unsigned int leaf = 0, level = 1; + unsigned int config = read_cpucfg(LOONGARCH_CPUCFG16); + struct cache_desc *cdesc = current_cpu_data.cache_leaves; + + if (config & L1IUPRE) { + if (config & L1IUUNIFY) + cdesc->type = CACHE_TYPE_UNIFIED; + else + cdesc->type = CACHE_TYPE_INST; + populate_cache_properties(config, cdesc, level, leaf); + } + + if (config & L1DPRE) { + cdesc->type = CACHE_TYPE_DATA; + populate_cache_properties(config, cdesc, level, leaf); + } + + config = config >> 3; + for (level = 2; level <= CACHE_LEVEL_MAX; level++) { + if (!config) + break; + + if (config & LXIUPRE) { + if (config & LXIUUNIFY) + cdesc->type = CACHE_TYPE_UNIFIED; + else + cdesc->type = CACHE_TYPE_INST; + populate_cache_properties(config, cdesc, level, leaf); + } + + if (config & LXDPRE) { + cdesc->type = CACHE_TYPE_DATA; + populate_cache_properties(config, cdesc, level, leaf); + } + + config = config >> 7; + } + + BUG_ON(leaf > CACHE_LEAVES_MAX); + + current_cpu_data.cache_leaves_present = leaf; + current_cpu_data.options |= LOONGARCH_CPU_PREFETCH; shm_align_mask = PAGE_SIZE - 1; } diff --git a/arch/loongarch/pci/pci.c b/arch/loongarch/pci/pci.c index e9b7c34d9b6d8..2726639150bc7 100644 --- a/arch/loongarch/pci/pci.c +++ b/arch/loongarch/pci/pci.c @@ -9,6 +9,7 @@ #include <linux/types.h> #include <linux/pci.h> #include <linux/vgaarb.h> +#include <asm/cacheflush.h> #include <asm/loongson.h> #define PCI_DEVICE_ID_LOONGSON_HOST 0x7a00 @@ -45,12 +46,10 @@ static int __init pcibios_init(void) unsigned int lsize; /* - * Set PCI cacheline size to that of the highest level in the + * Set PCI cacheline size to that of the last level in the * cache hierarchy. */ - lsize = cpu_dcache_line_size(); - lsize = cpu_vcache_line_size() ? : lsize; - lsize = cpu_scache_line_size() ? 
: lsize; + lsize = cpu_last_level_cache_line_size(); BUG_ON(!lsize); -- GitLab From 235d074fdc9a69e3720b8bb6efeb7c6d30c12d8e Mon Sep 17 00:00:00 2001 From: Huacai Chen <chenhuacai@loongson.cn> Date: Wed, 12 Oct 2022 16:36:14 +0800 Subject: [PATCH 1798/2223] LoongArch: Support access filter to /dev/mem interface Accidental access to /dev/mem is obviously disastrous, but specific access can be used by people debugging the kernel. So select GENERIC_LIB_DEVMEM_IS_ALLOWED, as well as define ARCH_HAS_VALID_PHYS_ADDR_RANGE and related helpers, to support access filter to /dev/mem interface. Signed-off-by: Weihao Li <liweihao@loongson.cn> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/io.h | 4 ++++ arch/loongarch/mm/mmap.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 9aeecc83b4807..cf9deec016394 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -70,6 +70,7 @@ config LOONGARCH select GENERIC_LIB_CMPDI2 select GENERIC_LIB_LSHRDI3 select GENERIC_LIB_UCMPDI2 + select GENERIC_LIB_DEVMEM_IS_ALLOWED select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h index 999944ea1cea4..398d1a7b3dd64 100644 --- a/arch/loongarch/include/asm/io.h +++ b/arch/loongarch/include/asm/io.h @@ -107,4 +107,8 @@ extern void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t #include <asm-generic/io.h> +#define ARCH_HAS_VALID_PHYS_ADDR_RANGE +extern int valid_phys_addr_range(phys_addr_t addr, size_t size); +extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size); + #endif /* _ASM_IO_H */ diff --git a/arch/loongarch/mm/mmap.c b/arch/loongarch/mm/mmap.c index 381a569635a9d..fbe1a4856fc42 100644 --- a/arch/loongarch/mm/mmap.c +++ b/arch/loongarch/mm/mmap.c @@ -3,6 +3,8 @@ * Copyright (C) 2020-2022 
Loongson Technology Corporation Limited */ #include <linux/export.h> +#include <linux/io.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <linux/mman.h> @@ -116,3 +118,30 @@ int __virt_addr_valid(volatile void *kaddr) return pfn_valid(PFN_DOWN(PHYSADDR(kaddr))); } EXPORT_SYMBOL_GPL(__virt_addr_valid); + +/* + * You really shouldn't be using read() or write() on /dev/mem. This might go + * away in the future. + */ +int valid_phys_addr_range(phys_addr_t addr, size_t size) +{ + /* + * Check whether addr is covered by a memory region without the + * MEMBLOCK_NOMAP attribute, and whether that region covers the + * entire range. In theory, this could lead to false negatives + * if the range is covered by distinct but adjacent memory regions + * that only differ in other attributes. However, few of such + * attributes have been defined, and it is debatable whether it + * follows that /dev/mem read() calls should be able traverse + * such boundaries. + */ + return memblock_is_region_memory(addr, size) && memblock_is_map_memory(addr); +} + +/* + * Do not allow /dev/mem mappings beyond the supported physical range. + */ +int valid_mmap_phys_addr_range(unsigned long pfn, size_t size) +{ + return !(((pfn << PAGE_SHIFT) + size) & ~(GENMASK_ULL(cpu_pabits, 0))); +} -- GitLab From d279134168c78ac2caa1f7cd2a846579da1c93ac Mon Sep 17 00:00:00 2001 From: Huacai Chen <chenhuacai@loongson.cn> Date: Wed, 12 Oct 2022 16:36:14 +0800 Subject: [PATCH 1799/2223] LoongArch: Use TLB for ioremap() We can support more cache attributes (e.g., CC, SUC and WUC) and page protection when we use TLB for ioremap(). The implementation is based on GENERIC_IOREMAP. The existing simple ioremap() implementation has better performance so we keep it and introduce ARCH_IOREMAP to control the selection. We move pagetable_init() earlier to make early ioremap() work, and we modify the PCI ecam mapping because the TLB-based version of ioremap() will actually take the size into account. 
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/Kconfig | 11 ++++ arch/loongarch/include/asm/fixmap.h | 15 +++++ arch/loongarch/include/asm/io.h | 69 ++++++-------------- arch/loongarch/include/asm/pgtable-bits.h | 3 + arch/loongarch/kernel/setup.c | 2 +- arch/loongarch/mm/init.c | 64 +++++++++++++++++++ arch/loongarch/pci/acpi.c | 76 +++++++++++++++++++++-- 7 files changed, 184 insertions(+), 56 deletions(-) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index cf9deec016394..d126c50b2310c 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -62,6 +62,7 @@ config LOONGARCH select GENERIC_CPU_AUTOPROBE select GENERIC_ENTRY select GENERIC_GETTIMEOFDAY + select GENERIC_IOREMAP if !ARCH_IOREMAP select GENERIC_IRQ_MULTI_HANDLER select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW @@ -168,6 +169,9 @@ config MACH_LOONGSON32 config MACH_LOONGSON64 def_bool 64BIT +config FIX_EARLYCON_MEM + def_bool y + config PAGE_SIZE_4KB bool @@ -404,6 +408,13 @@ config FORCE_MAX_ZONEORDER The page size is not necessarily 4KB. Keep this in mind when choosing a value for this option. +config ARCH_IOREMAP + bool "Enable LoongArch DMW-based ioremap()" + help + We use generic TLB-based ioremap() by default since it has page + protection support. However, you can enable LoongArch DMW-based + ioremap() for better performance. 
+ config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS diff --git a/arch/loongarch/include/asm/fixmap.h b/arch/loongarch/include/asm/fixmap.h index b3541dfa20138..d2e55ae55bb9c 100644 --- a/arch/loongarch/include/asm/fixmap.h +++ b/arch/loongarch/include/asm/fixmap.h @@ -10,4 +10,19 @@ #define NR_FIX_BTMAPS 64 +enum fixed_addresses { + FIX_HOLE, + FIX_EARLYCON_MEM_BASE, + __end_of_fixed_addresses +}; + +#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) +#define FIXMAP_PAGE_IO PAGE_KERNEL_SUC + +extern void __set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags); + +#include <asm-generic/fixmap.h> + #endif diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h index 398d1a7b3dd64..402a7d9e3a53e 100644 --- a/arch/loongarch/include/asm/io.h +++ b/arch/loongarch/include/asm/io.h @@ -27,71 +27,38 @@ extern void __init early_iounmap(void __iomem *addr, unsigned long size); #define early_memremap early_ioremap #define early_memunmap early_iounmap +#ifdef CONFIG_ARCH_IOREMAP + static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size, unsigned long prot_val) { - if (prot_val == _CACHE_CC) + if (prot_val & _CACHE_CC) return (void __iomem *)(unsigned long)(CACHE_BASE + offset); else return (void __iomem *)(unsigned long)(UNCACHE_BASE + offset); } -/* - * ioremap - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. 
- */ -#define ioremap(offset, size) \ - ioremap_prot((offset), (size), _CACHE_SUC) +#define ioremap(offset, size) \ + ioremap_prot((offset), (size), pgprot_val(PAGE_KERNEL_SUC)) -/* - * ioremap_wc - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_wc performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - * - * This version of ioremap ensures that the memory is marked uncachable - * but accelerated by means of write-combining feature. It is specifically - * useful for PCIe prefetchable windows, which may vastly improve a - * communications performance. If it was determined on boot stage, what - * CPU CCA doesn't support WUC, the method shall fall-back to the - * _CACHE_SUC option (see cpu_probe() method). - */ -#define ioremap_wc(offset, size) \ - ioremap_prot((offset), (size), _CACHE_WUC) +#define iounmap(addr) ((void)(addr)) + +#endif /* - * ioremap_cache - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_cache performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. + * On LoongArch, ioremap() has two variants, ioremap_wc() and ioremap_cache(). + * They map bus memory into CPU space, the mapped memory is marked uncachable + * (_CACHE_SUC), uncachable but accelerated by write-combine (_CACHE_WUC) and + * cachable (_CACHE_CC) respectively for CPU access. * - * This version of ioremap ensures that the memory is marked cachable by - * the CPU. Also enables full write-combining. 
Useful for some - * memory-like regions on I/O busses. + * @offset: bus address of the memory + * @size: size of the resource to map */ -#define ioremap_cache(offset, size) \ - ioremap_prot((offset), (size), _CACHE_CC) +#define ioremap_wc(offset, size) \ + ioremap_prot((offset), (size), pgprot_val(PAGE_KERNEL_WUC)) -static inline void iounmap(const volatile void __iomem *addr) -{ -} +#define ioremap_cache(offset, size) \ + ioremap_prot((offset), (size), pgprot_val(PAGE_KERNEL)) #define mmiowb() asm volatile ("dbar 0" ::: "memory") diff --git a/arch/loongarch/include/asm/pgtable-bits.h b/arch/loongarch/include/asm/pgtable-bits.h index 9ca147a29bab8..3d1e0a69975a5 100644 --- a/arch/loongarch/include/asm/pgtable-bits.h +++ b/arch/loongarch/include/asm/pgtable-bits.h @@ -83,8 +83,11 @@ _PAGE_GLOBAL | _PAGE_KERN | _CACHE_SUC) #define PAGE_KERNEL_WUC __pgprot(_PAGE_PRESENT | __READABLE | __WRITEABLE | \ _PAGE_GLOBAL | _PAGE_KERN | _CACHE_WUC) + #ifndef __ASSEMBLY__ +#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL_SUC) + #define pgprot_noncached pgprot_noncached static inline pgprot_t pgprot_noncached(pgprot_t _prot) diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 7fabf2306e801..05af1102fee75 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -348,10 +348,10 @@ void __init setup_arch(char **cmdline_p) init_environ(); efi_init(); memblock_init(); + pagetable_init(); parse_early_param(); platform_init(); - pagetable_init(); arch_mem_init(cmdline_p); resource_init(); diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 0532ed5ba43de..080061793c859 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -152,6 +152,70 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif #endif +static pte_t *fixmap_pte(unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + + if (pgd_none(*pgd)) { + pud_t *new 
__maybe_unused; + + new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); + pgd_populate(&init_mm, pgd, new); +#ifndef __PAGETABLE_PUD_FOLDED + pud_init((unsigned long)new, (unsigned long)invalid_pmd_table); +#endif + } + + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + pmd_t *new __maybe_unused; + + new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); + pud_populate(&init_mm, pud, new); +#ifndef __PAGETABLE_PMD_FOLDED + pmd_init((unsigned long)new, (unsigned long)invalid_pte_table); +#endif + } + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + pte_t *new __maybe_unused; + + new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); + pmd_populate_kernel(&init_mm, pmd, new); + } + + return pte_offset_kernel(pmd, addr); +} + +void __init __set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) +{ + unsigned long addr = __fix_to_virt(idx); + pte_t *ptep; + + BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses); + + ptep = fixmap_pte(addr); + if (!pte_none(*ptep)) { + pte_ERROR(*ptep); + return; + } + + if (pgprot_val(flags)) + set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); + else { + pte_clear(&init_mm, addr, ptep); + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + } +} + /* * Align swapper_pg_dir in to 64K, allows its address to be loaded * with a single LUI instruction in the TLB handlers. 
If we used diff --git a/arch/loongarch/pci/acpi.c b/arch/loongarch/pci/acpi.c index bf921487333c6..8235ec92b41fe 100644 --- a/arch/loongarch/pci/acpi.c +++ b/arch/loongarch/pci/acpi.c @@ -82,6 +82,69 @@ static int acpi_prepare_root_resources(struct acpi_pci_root_info *ci) return 0; } +/* + * Create a PCI config space window + * - reserve mem region + * - alloc struct pci_config_window with space for all mappings + * - ioremap the config space + */ +static struct pci_config_window *arch_pci_ecam_create(struct device *dev, + struct resource *cfgres, struct resource *busr, const struct pci_ecam_ops *ops) +{ + int bsz, bus_range, err; + struct resource *conflict; + struct pci_config_window *cfg; + + if (busr->start > busr->end) + return ERR_PTR(-EINVAL); + + cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + if (!cfg) + return ERR_PTR(-ENOMEM); + + cfg->parent = dev; + cfg->ops = ops; + cfg->busr.start = busr->start; + cfg->busr.end = busr->end; + cfg->busr.flags = IORESOURCE_BUS; + bus_range = resource_size(cfgres) >> ops->bus_shift; + + bsz = 1 << ops->bus_shift; + + cfg->res.start = cfgres->start; + cfg->res.end = cfgres->end; + cfg->res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + cfg->res.name = "PCI ECAM"; + + conflict = request_resource_conflict(&iomem_resource, &cfg->res); + if (conflict) { + err = -EBUSY; + dev_err(dev, "can't claim ECAM area %pR: address conflict with %s %pR\n", + &cfg->res, conflict->name, conflict); + goto err_exit; + } + + cfg->win = pci_remap_cfgspace(cfgres->start, bus_range * bsz); + if (!cfg->win) + goto err_exit_iomap; + + if (ops->init) { + err = ops->init(cfg); + if (err) + goto err_exit; + } + dev_info(dev, "ECAM at %pR for %pR\n", &cfg->res, &cfg->busr); + + return cfg; + +err_exit_iomap: + err = -ENOMEM; + dev_err(dev, "ECAM ioremap failed\n"); +err_exit: + pci_ecam_free(cfg); + return ERR_PTR(err); +} + /* * Lookup the bus range for the domain in MCFG, and set up config space * mapping. 
@@ -106,11 +169,16 @@ pci_acpi_setup_ecam_mapping(struct acpi_pci_root *root) bus_shift = ecam_ops->bus_shift ? : 20; - cfgres.start = root->mcfg_addr + (bus_res->start << bus_shift); - cfgres.end = cfgres.start + (resource_size(bus_res) << bus_shift) - 1; - cfgres.flags = IORESOURCE_MEM; + if (bus_shift == 20) + cfg = pci_ecam_create(dev, &cfgres, bus_res, ecam_ops); + else { + cfgres.start = root->mcfg_addr + (bus_res->start << bus_shift); + cfgres.end = cfgres.start + (resource_size(bus_res) << bus_shift) - 1; + cfgres.end |= BIT(28) + (((PCI_CFG_SPACE_EXP_SIZE - 1) & 0xf00) << 16); + cfgres.flags = IORESOURCE_MEM; + cfg = arch_pci_ecam_create(dev, &cfgres, bus_res, ecam_ops); + } - cfg = pci_ecam_create(dev, &cfgres, bus_res, ecam_ops); if (IS_ERR(cfg)) { dev_err(dev, "%04x:%pR error %ld mapping ECAM\n", seg, bus_res, PTR_ERR(cfg)); return NULL; -- GitLab From 5f1e001be579c2b7f37e7d5ff87c208c33e90fca Mon Sep 17 00:00:00 2001 From: Huacai Chen <chenhuacai@loongson.cn> Date: Wed, 12 Oct 2022 16:36:14 +0800 Subject: [PATCH 1800/2223] LoongArch: Add qspinlock support On NUMA system, the performance of qspinlock is better than generic spinlock. Below is the UnixBench test results on a 8 nodes (4 cores per node, 32 cores in total) machine. A. With generic spinlock: System Benchmarks Index Values BASELINE RESULT INDEX Dhrystone 2 using register variables 116700.0 449574022.5 38523.9 Double-Precision Whetstone 55.0 85190.4 15489.2 Execl Throughput 43.0 14696.2 3417.7 File Copy 1024 bufsize 2000 maxblocks 3960.0 143157.8 361.5 File Copy 256 bufsize 500 maxblocks 1655.0 37631.8 227.4 File Copy 4096 bufsize 8000 maxblocks 5800.0 444814.2 766.9 Pipe Throughput 12440.0 5047490.7 4057.5 Pipe-based Context Switching 4000.0 2021545.7 5053.9 Process Creation 126.0 23829.8 1891.3 Shell Scripts (1 concurrent) 42.4 33756.7 7961.5 Shell Scripts (8 concurrent) 6.0 4062.9 6771.5 System Call Overhead 15000.0 2479748.6 1653.2 ======== System Benchmarks Index Score 2955.6 B. 
With qspinlock: System Benchmarks Index Values BASELINE RESULT INDEX Dhrystone 2 using register variables 116700.0 449467876.9 38514.8 Double-Precision Whetstone 55.0 85174.6 15486.3 Execl Throughput 43.0 14769.1 3434.7 File Copy 1024 bufsize 2000 maxblocks 3960.0 146150.5 369.1 File Copy 256 bufsize 500 maxblocks 1655.0 37496.8 226.6 File Copy 4096 bufsize 8000 maxblocks 5800.0 447527.0 771.6 Pipe Throughput 12440.0 5175989.2 4160.8 Pipe-based Context Switching 4000.0 2207747.8 5519.4 Process Creation 126.0 25125.5 1994.1 Shell Scripts (1 concurrent) 42.4 33461.2 7891.8 Shell Scripts (8 concurrent) 6.0 4024.7 6707.8 System Call Overhead 15000.0 2917278.6 1944.9 ======== System Benchmarks Index Score 3040.1 Signed-off-by: Rui Wang <wangrui@loongson.cn> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/Kbuild | 5 ++--- arch/loongarch/include/asm/spinlock.h | 12 ++++++++++++ arch/loongarch/include/asm/spinlock_types.h | 11 +++++++++++ 4 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 arch/loongarch/include/asm/spinlock.h create mode 100644 arch/loongarch/include/asm/spinlock_types.h diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index d126c50b2310c..b36156a1896f7 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -51,6 +51,7 @@ config LOONGARCH select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_QUEUED_RWLOCKS + select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_NO_INSTR diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild index 83bc0681e72b4..a0eed6076c79a 100644 --- a/arch/loongarch/include/asm/Kbuild +++ b/arch/loongarch/include/asm/Kbuild @@ -1,12 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 generic-y += dma-contiguous.h generic-y += export.h +generic-y += mcs_spinlock.h generic-y += parport.h generic-y += early_ioremap.h 
generic-y += qrwlock.h -generic-y += qrwlock_types.h -generic-y += spinlock.h -generic-y += spinlock_types.h +generic-y += qspinlock.h generic-y += rwsem.h generic-y += segment.h generic-y += user.h diff --git a/arch/loongarch/include/asm/spinlock.h b/arch/loongarch/include/asm/spinlock.h new file mode 100644 index 0000000000000..7cb3476999bec --- /dev/null +++ b/arch/loongarch/include/asm/spinlock.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited + */ +#ifndef _ASM_SPINLOCK_H +#define _ASM_SPINLOCK_H + +#include <asm/processor.h> +#include <asm/qspinlock.h> +#include <asm/qrwlock.h> + +#endif /* _ASM_SPINLOCK_H */ diff --git a/arch/loongarch/include/asm/spinlock_types.h b/arch/loongarch/include/asm/spinlock_types.h new file mode 100644 index 0000000000000..7458d036c161d --- /dev/null +++ b/arch/loongarch/include/asm/spinlock_types.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited + */ +#ifndef _ASM_SPINLOCK_TYPES_H +#define _ASM_SPINLOCK_TYPES_H + +#include <asm-generic/qspinlock_types.h> +#include <asm-generic/qrwlock_types.h> + +#endif -- GitLab From b37042b2bb7cd751f03b73afb90364a418d870f4 Mon Sep 17 00:00:00 2001 From: Huacai Chen <chenhuacai@loongson.cn> Date: Wed, 12 Oct 2022 16:36:14 +0800 Subject: [PATCH 1801/2223] LoongArch: Add perf events support The perf events infrastructure of LoongArch is very similar to old MIPS-based Loongson, so most of the code is derived from MIPS. 
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/Kconfig | 2 + arch/loongarch/include/asm/perf_event.h | 4 +- arch/loongarch/include/uapi/asm/perf_regs.h | 40 + arch/loongarch/kernel/Makefile | 2 + arch/loongarch/kernel/perf_event.c | 887 ++++++++++++++++++++ arch/loongarch/kernel/perf_regs.c | 53 ++ 6 files changed, 987 insertions(+), 1 deletion(-) create mode 100644 arch/loongarch/include/uapi/asm/perf_regs.h create mode 100644 arch/loongarch/kernel/perf_event.c create mode 100644 arch/loongarch/kernel/perf_regs.c diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index b36156a1896f7..223edbb8fe847 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -97,6 +97,8 @@ config LOONGARCH select HAVE_NMI select HAVE_PCI select HAVE_PERF_EVENTS + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ select HAVE_SETUP_PER_CPU_AREA if NUMA diff --git a/arch/loongarch/include/asm/perf_event.h b/arch/loongarch/include/asm/perf_event.h index dcb3b17053a83..2a35a0bc2aaab 100644 --- a/arch/loongarch/include/asm/perf_event.h +++ b/arch/loongarch/include/asm/perf_event.h @@ -6,5 +6,7 @@ #ifndef __LOONGARCH_PERF_EVENT_H__ #define __LOONGARCH_PERF_EVENT_H__ -/* Nothing to show here; the file is required by linux/perf_event.h. 
*/ + +#define perf_arch_bpf_user_pt_regs(regs) (struct user_pt_regs *)regs + #endif /* __LOONGARCH_PERF_EVENT_H__ */ diff --git a/arch/loongarch/include/uapi/asm/perf_regs.h b/arch/loongarch/include/uapi/asm/perf_regs.h new file mode 100644 index 0000000000000..29d69c00fc7a6 --- /dev/null +++ b/arch/loongarch/include/uapi/asm/perf_regs.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _ASM_LOONGARCH_PERF_REGS_H +#define _ASM_LOONGARCH_PERF_REGS_H + +enum perf_event_loongarch_regs { + PERF_REG_LOONGARCH_PC, + PERF_REG_LOONGARCH_R1, + PERF_REG_LOONGARCH_R2, + PERF_REG_LOONGARCH_R3, + PERF_REG_LOONGARCH_R4, + PERF_REG_LOONGARCH_R5, + PERF_REG_LOONGARCH_R6, + PERF_REG_LOONGARCH_R7, + PERF_REG_LOONGARCH_R8, + PERF_REG_LOONGARCH_R9, + PERF_REG_LOONGARCH_R10, + PERF_REG_LOONGARCH_R11, + PERF_REG_LOONGARCH_R12, + PERF_REG_LOONGARCH_R13, + PERF_REG_LOONGARCH_R14, + PERF_REG_LOONGARCH_R15, + PERF_REG_LOONGARCH_R16, + PERF_REG_LOONGARCH_R17, + PERF_REG_LOONGARCH_R18, + PERF_REG_LOONGARCH_R19, + PERF_REG_LOONGARCH_R20, + PERF_REG_LOONGARCH_R21, + PERF_REG_LOONGARCH_R22, + PERF_REG_LOONGARCH_R23, + PERF_REG_LOONGARCH_R24, + PERF_REG_LOONGARCH_R25, + PERF_REG_LOONGARCH_R26, + PERF_REG_LOONGARCH_R27, + PERF_REG_LOONGARCH_R28, + PERF_REG_LOONGARCH_R29, + PERF_REG_LOONGARCH_R30, + PERF_REG_LOONGARCH_R31, + PERF_REG_LOONGARCH_MAX, +}; +#endif /* _ASM_LOONGARCH_PERF_REGS_H */ diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile index e5be17009fe8a..a213e994db68c 100644 --- a/arch/loongarch/kernel/Makefile +++ b/arch/loongarch/kernel/Makefile @@ -26,4 +26,6 @@ obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o obj-$(CONFIG_UNWINDER_PROLOGUE) += unwind_prologue.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_regs.o + CPPFLAGS_vmlinux.lds := $(KBUILD_CFLAGS) diff --git a/arch/loongarch/kernel/perf_event.c b/arch/loongarch/kernel/perf_event.c new file mode 100644 index 
0000000000000..707bd32e5c4ff --- /dev/null +++ b/arch/loongarch/kernel/perf_event.c @@ -0,0 +1,887 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Linux performance counter support for LoongArch. + * + * Copyright (C) 2022 Loongson Technology Corporation Limited + * + * Derived from MIPS: + * Copyright (C) 2010 MIPS Technologies, Inc. + * Copyright (C) 2011 Cavium Networks, Inc. + * Author: Deng-Cheng Zhu + */ + +#include <linux/cpumask.h> +#include <linux/interrupt.h> +#include <linux/smp.h> +#include <linux/kernel.h> +#include <linux/perf_event.h> +#include <linux/uaccess.h> +#include <linux/sched/task_stack.h> + +#include <asm/irq.h> +#include <asm/irq_regs.h> +#include <asm/stacktrace.h> +#include <asm/unwind.h> + +/* + * Get the return address for a single stackframe and return a pointer to the + * next frame tail. + */ +static unsigned long +user_backtrace(struct perf_callchain_entry_ctx *entry, unsigned long fp) +{ + unsigned long err; + unsigned long __user *user_frame_tail; + struct stack_frame buftail; + + user_frame_tail = (unsigned long __user *)(fp - sizeof(struct stack_frame)); + + /* Also check accessibility of one struct frame_tail beyond */ + if (!access_ok(user_frame_tail, sizeof(buftail))) + return 0; + + pagefault_disable(); + err = __copy_from_user_inatomic(&buftail, user_frame_tail, sizeof(buftail)); + pagefault_enable(); + + if (err || (unsigned long)user_frame_tail >= buftail.fp) + return 0; + + perf_callchain_store(entry, buftail.ra); + + return buftail.fp; +} + +void perf_callchain_user(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + unsigned long fp; + + if (perf_guest_state()) { + /* We don't support guest os callchain now */ + return; + } + + perf_callchain_store(entry, regs->csr_era); + + fp = regs->regs[22]; + + while (entry->nr < entry->max_stack && fp && !((unsigned long)fp & 0xf)) + fp = user_backtrace(entry, fp); +} + +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) 
+{ + struct unwind_state state; + unsigned long addr; + + for (unwind_start(&state, current, regs); + !unwind_done(&state); unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr || perf_callchain_store(entry, addr)) + return; + } +} + +#define LOONGARCH_MAX_HWEVENTS 32 + +struct cpu_hw_events { + /* Array of events on this cpu. */ + struct perf_event *events[LOONGARCH_MAX_HWEVENTS]; + + /* + * Set the bit (indexed by the counter number) when the counter + * is used for an event. + */ + unsigned long used_mask[BITS_TO_LONGS(LOONGARCH_MAX_HWEVENTS)]; + + /* + * Software copy of the control register for each performance counter. + */ + unsigned int saved_ctrl[LOONGARCH_MAX_HWEVENTS]; +}; +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { + .saved_ctrl = {0}, +}; + +/* The description of LoongArch performance events. */ +struct loongarch_perf_event { + unsigned int event_id; +}; + +static struct loongarch_perf_event raw_event; +static DEFINE_MUTEX(raw_event_mutex); + +#define C(x) PERF_COUNT_HW_CACHE_##x +#define HW_OP_UNSUPPORTED 0xffffffff +#define CACHE_OP_UNSUPPORTED 0xffffffff + +#define PERF_MAP_ALL_UNSUPPORTED \ + [0 ... PERF_COUNT_HW_MAX - 1] = {HW_OP_UNSUPPORTED} + +#define PERF_CACHE_MAP_ALL_UNSUPPORTED \ +[0 ... C(MAX) - 1] = { \ + [0 ... C(OP_MAX) - 1] = { \ + [0 ... 
C(RESULT_MAX) - 1] = {CACHE_OP_UNSUPPORTED}, \ + }, \ +} + +struct loongarch_pmu { + u64 max_period; + u64 valid_count; + u64 overflow; + const char *name; + unsigned int num_counters; + u64 (*read_counter)(unsigned int idx); + void (*write_counter)(unsigned int idx, u64 val); + const struct loongarch_perf_event *(*map_raw_event)(u64 config); + const struct loongarch_perf_event (*general_event_map)[PERF_COUNT_HW_MAX]; + const struct loongarch_perf_event (*cache_event_map) + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +}; + +static struct loongarch_pmu loongarch_pmu; + +#define M_PERFCTL_EVENT(event) (event & CSR_PERFCTRL_EVENT) + +#define M_PERFCTL_COUNT_EVENT_WHENEVER (CSR_PERFCTRL_PLV0 | \ + CSR_PERFCTRL_PLV1 | \ + CSR_PERFCTRL_PLV2 | \ + CSR_PERFCTRL_PLV3 | \ + CSR_PERFCTRL_IE) + +#define M_PERFCTL_CONFIG_MASK 0x1f0000 + +static void pause_local_counters(void); +static void resume_local_counters(void); + +static u64 loongarch_pmu_read_counter(unsigned int idx) +{ + u64 val = -1; + + switch (idx) { + case 0: + val = read_csr_perfcntr0(); + break; + case 1: + val = read_csr_perfcntr1(); + break; + case 2: + val = read_csr_perfcntr2(); + break; + case 3: + val = read_csr_perfcntr3(); + break; + default: + WARN_ONCE(1, "Invalid performance counter number (%d)\n", idx); + return 0; + } + + return val; +} + +static void loongarch_pmu_write_counter(unsigned int idx, u64 val) +{ + switch (idx) { + case 0: + write_csr_perfcntr0(val); + return; + case 1: + write_csr_perfcntr1(val); + return; + case 2: + write_csr_perfcntr2(val); + return; + case 3: + write_csr_perfcntr3(val); + return; + default: + WARN_ONCE(1, "Invalid performance counter number (%d)\n", idx); + return; + } +} + +static unsigned int loongarch_pmu_read_control(unsigned int idx) +{ + unsigned int val = -1; + + switch (idx) { + case 0: + val = read_csr_perfctrl0(); + break; + case 1: + val = read_csr_perfctrl1(); + break; + case 2: + val = 
read_csr_perfctrl2(); + break; + case 3: + val = read_csr_perfctrl3(); + break; + default: + WARN_ONCE(1, "Invalid performance counter number (%d)\n", idx); + return 0; + } + + return val; +} + +static void loongarch_pmu_write_control(unsigned int idx, unsigned int val) +{ + switch (idx) { + case 0: + write_csr_perfctrl0(val); + return; + case 1: + write_csr_perfctrl1(val); + return; + case 2: + write_csr_perfctrl2(val); + return; + case 3: + write_csr_perfctrl3(val); + return; + default: + WARN_ONCE(1, "Invalid performance counter number (%d)\n", idx); + return; + } +} + +static int loongarch_pmu_alloc_counter(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) +{ + int i; + + for (i = 0; i < loongarch_pmu.num_counters; i++) { + if (!test_and_set_bit(i, cpuc->used_mask)) + return i; + } + + return -EAGAIN; +} + +static void loongarch_pmu_enable_event(struct hw_perf_event *evt, int idx) +{ + unsigned int cpu; + struct perf_event *event = container_of(evt, struct perf_event, hw); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + WARN_ON(idx < 0 || idx >= loongarch_pmu.num_counters); + + /* Make sure interrupt enabled. */ + cpuc->saved_ctrl[idx] = M_PERFCTL_EVENT(evt->event_base & 0xff) | + (evt->config_base & M_PERFCTL_CONFIG_MASK) | CSR_PERFCTRL_IE; + + cpu = (event->cpu >= 0) ? event->cpu : smp_processor_id(); + + /* + * We do not actually let the counter run. Leave it until start(). 
+ */ + pr_debug("Enabling perf counter for CPU%d\n", cpu); +} + +static void loongarch_pmu_disable_event(int idx) +{ + unsigned long flags; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + WARN_ON(idx < 0 || idx >= loongarch_pmu.num_counters); + + local_irq_save(flags); + cpuc->saved_ctrl[idx] = loongarch_pmu_read_control(idx) & + ~M_PERFCTL_COUNT_EVENT_WHENEVER; + loongarch_pmu_write_control(idx, cpuc->saved_ctrl[idx]); + local_irq_restore(flags); +} + +static int loongarch_pmu_event_set_period(struct perf_event *event, + struct hw_perf_event *hwc, + int idx) +{ + int ret = 0; + u64 left = local64_read(&hwc->period_left); + u64 period = hwc->sample_period; + + if (unlikely((left + period) & (1ULL << 63))) { + /* left underflowed by more than period. */ + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } else if (unlikely((left + period) <= period)) { + /* left underflowed by less than period. */ + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (left > loongarch_pmu.max_period) { + left = loongarch_pmu.max_period; + local64_set(&hwc->period_left, left); + } + + local64_set(&hwc->prev_count, loongarch_pmu.overflow - left); + + loongarch_pmu.write_counter(idx, loongarch_pmu.overflow - left); + + perf_event_update_userpage(event); + + return ret; +} + +static void loongarch_pmu_event_update(struct perf_event *event, + struct hw_perf_event *hwc, + int idx) +{ + u64 delta; + u64 prev_raw_count, new_raw_count; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + new_raw_count = loongarch_pmu.read_counter(idx); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + delta = new_raw_count - prev_raw_count; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); +} + +static void loongarch_pmu_start(struct perf_event *event, int flags) +{ + struct hw_perf_event 
*hwc = &event->hw; + + if (flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + + hwc->state = 0; + + /* Set the period for the event. */ + loongarch_pmu_event_set_period(event, hwc, hwc->idx); + + /* Enable the event. */ + loongarch_pmu_enable_event(hwc, hwc->idx); +} + +static void loongarch_pmu_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!(hwc->state & PERF_HES_STOPPED)) { + /* We are working on a local event. */ + loongarch_pmu_disable_event(hwc->idx); + barrier(); + loongarch_pmu_event_update(event, hwc, hwc->idx); + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; + } +} + +static int loongarch_pmu_add(struct perf_event *event, int flags) +{ + int idx, err = 0; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + perf_pmu_disable(event->pmu); + + /* To look for a free counter for this event. */ + idx = loongarch_pmu_alloc_counter(cpuc, hwc); + if (idx < 0) { + err = idx; + goto out; + } + + /* + * If there is an event in the counter we are going to use then + * make sure it is disabled. + */ + event->hw.idx = idx; + loongarch_pmu_disable_event(idx); + cpuc->events[idx] = event; + + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + if (flags & PERF_EF_START) + loongarch_pmu_start(event, PERF_EF_RELOAD); + + /* Propagate our changes to the userspace mapping. 
*/ + perf_event_update_userpage(event); + +out: + perf_pmu_enable(event->pmu); + return err; +} + +static void loongarch_pmu_del(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + WARN_ON(idx < 0 || idx >= loongarch_pmu.num_counters); + + loongarch_pmu_stop(event, PERF_EF_UPDATE); + cpuc->events[idx] = NULL; + clear_bit(idx, cpuc->used_mask); + + perf_event_update_userpage(event); +} + +static void loongarch_pmu_read(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* Don't read disabled counters! */ + if (hwc->idx < 0) + return; + + loongarch_pmu_event_update(event, hwc, hwc->idx); +} + +static void loongarch_pmu_enable(struct pmu *pmu) +{ + resume_local_counters(); +} + +static void loongarch_pmu_disable(struct pmu *pmu) +{ + pause_local_counters(); +} + +static DEFINE_MUTEX(pmu_reserve_mutex); +static atomic_t active_events = ATOMIC_INIT(0); + +static int get_pmc_irq(void) +{ + struct irq_domain *d = irq_find_matching_fwnode(cpuintc_handle, DOMAIN_BUS_ANY); + + if (d) + return irq_create_mapping(d, EXCCODE_PMC - EXCCODE_INT_START); + + return -EINVAL; +} + +static void reset_counters(void *arg); +static int __hw_perf_event_init(struct perf_event *event); + +static void hw_perf_event_destroy(struct perf_event *event) +{ + if (atomic_dec_and_mutex_lock(&active_events, &pmu_reserve_mutex)) { + on_each_cpu(reset_counters, NULL, 1); + free_irq(get_pmc_irq(), &loongarch_pmu); + mutex_unlock(&pmu_reserve_mutex); + } +} + +static void handle_associated_event(struct cpu_hw_events *cpuc, int idx, + struct perf_sample_data *data, struct pt_regs *regs) +{ + struct perf_event *event = cpuc->events[idx]; + struct hw_perf_event *hwc = &event->hw; + + loongarch_pmu_event_update(event, hwc, idx); + data->period = event->hw.last_period; + if (!loongarch_pmu_event_set_period(event, hwc, idx)) + return; + + if (perf_event_overflow(event, 
data, regs)) + loongarch_pmu_disable_event(idx); +} + +static irqreturn_t pmu_handle_irq(int irq, void *dev) +{ + int n; + int handled = IRQ_NONE; + uint64_t counter; + struct pt_regs *regs; + struct perf_sample_data data; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * First we pause the local counters, so that when we are locked + * here, the counters are all paused. When it gets locked due to + * perf_disable(), the timer interrupt handler will be delayed. + * + * See also loongarch_pmu_start(). + */ + pause_local_counters(); + + regs = get_irq_regs(); + + perf_sample_data_init(&data, 0, 0); + + for (n = 0; n < loongarch_pmu.num_counters; n++) { + if (test_bit(n, cpuc->used_mask)) { + counter = loongarch_pmu.read_counter(n); + if (counter & loongarch_pmu.overflow) { + handle_associated_event(cpuc, n, &data, regs); + handled = IRQ_HANDLED; + } + } + } + + resume_local_counters(); + + /* + * Do all the work for the pending perf events. We can do this + * in here because the performance counter interrupt is a regular + * interrupt, not NMI. 
+ */ + if (handled == IRQ_HANDLED) + irq_work_run(); + + return handled; +} + +static int loongarch_pmu_event_init(struct perf_event *event) +{ + int r, irq; + unsigned long flags; + + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + switch (event->attr.type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + break; + + default: + /* Init it to avoid false validate_group */ + event->hw.event_base = 0xffffffff; + return -ENOENT; + } + + if (event->cpu >= 0 && !cpu_online(event->cpu)) + return -ENODEV; + + irq = get_pmc_irq(); + flags = IRQF_PERCPU | IRQF_NOBALANCING | IRQF_NO_THREAD | IRQF_NO_SUSPEND | IRQF_SHARED; + if (!atomic_inc_not_zero(&active_events)) { + mutex_lock(&pmu_reserve_mutex); + if (atomic_read(&active_events) == 0) { + r = request_irq(irq, pmu_handle_irq, flags, "Perf_PMU", &loongarch_pmu); + if (r < 0) { + mutex_unlock(&pmu_reserve_mutex); + pr_warn("PMU IRQ request failed\n"); + return -ENODEV; + } + } + atomic_inc(&active_events); + mutex_unlock(&pmu_reserve_mutex); + } + + return __hw_perf_event_init(event); +} + +static struct pmu pmu = { + .pmu_enable = loongarch_pmu_enable, + .pmu_disable = loongarch_pmu_disable, + .event_init = loongarch_pmu_event_init, + .add = loongarch_pmu_add, + .del = loongarch_pmu_del, + .start = loongarch_pmu_start, + .stop = loongarch_pmu_stop, + .read = loongarch_pmu_read, +}; + +static unsigned int loongarch_pmu_perf_event_encode(const struct loongarch_perf_event *pev) +{ + return (pev->event_id & 0xff); +} + +static const struct loongarch_perf_event *loongarch_pmu_map_general_event(int idx) +{ + const struct loongarch_perf_event *pev; + + pev = &(*loongarch_pmu.general_event_map)[idx]; + + if (pev->event_id == HW_OP_UNSUPPORTED) + return ERR_PTR(-ENOENT); + + return pev; +} + +static const struct loongarch_perf_event *loongarch_pmu_map_cache_event(u64 config) +{ + unsigned int cache_type, cache_op, cache_result; + const struct 
loongarch_perf_event *pev; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return ERR_PTR(-EINVAL); + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + return ERR_PTR(-EINVAL); + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return ERR_PTR(-EINVAL); + + pev = &((*loongarch_pmu.cache_event_map) + [cache_type] + [cache_op] + [cache_result]); + + if (pev->event_id == CACHE_OP_UNSUPPORTED) + return ERR_PTR(-ENOENT); + + return pev; +} + +static int validate_group(struct perf_event *event) +{ + struct cpu_hw_events fake_cpuc; + struct perf_event *sibling, *leader = event->group_leader; + + memset(&fake_cpuc, 0, sizeof(fake_cpuc)); + + if (loongarch_pmu_alloc_counter(&fake_cpuc, &leader->hw) < 0) + return -EINVAL; + + for_each_sibling_event(sibling, leader) { + if (loongarch_pmu_alloc_counter(&fake_cpuc, &sibling->hw) < 0) + return -EINVAL; + } + + if (loongarch_pmu_alloc_counter(&fake_cpuc, &event->hw) < 0) + return -EINVAL; + + return 0; +} + +static void reset_counters(void *arg) +{ + int n; + int counters = loongarch_pmu.num_counters; + + for (n = 0; n < counters; n++) { + loongarch_pmu_write_control(n, 0); + loongarch_pmu.write_counter(n, 0); + } +} + +static const struct loongarch_perf_event loongson_event_map[PERF_COUNT_HW_MAX] = { + PERF_MAP_ALL_UNSUPPORTED, + [PERF_COUNT_HW_CPU_CYCLES] = { 0x00 }, + [PERF_COUNT_HW_INSTRUCTIONS] = { 0x01 }, + [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x08 }, + [PERF_COUNT_HW_CACHE_MISSES] = { 0x09 }, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { 0x02 }, + [PERF_COUNT_HW_BRANCH_MISSES] = { 0x03 }, +}; + +static const struct loongarch_perf_event loongson_cache_map + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +PERF_CACHE_MAP_ALL_UNSUPPORTED, +[C(L1D)] = { + /* + * Like some other architectures (e.g. 
ARM), the performance + * counters don't differentiate between read and write + * accesses/misses, so this isn't strictly correct, but it's the + * best we can do. Writes and reads get combined. + */ + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { 0x8 }, + [C(RESULT_MISS)] = { 0x9 }, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = { 0x8 }, + [C(RESULT_MISS)] = { 0x9 }, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = { 0xaa }, + [C(RESULT_MISS)] = { 0xa9 }, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { 0x6 }, + [C(RESULT_MISS)] = { 0x7 }, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { 0xc }, + [C(RESULT_MISS)] = { 0xd }, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = { 0xc }, + [C(RESULT_MISS)] = { 0xd }, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_MISS)] = { 0x3b }, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { 0x4 }, + [C(RESULT_MISS)] = { 0x3c }, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = { 0x4 }, + [C(RESULT_MISS)] = { 0x3c }, + }, +}, +[C(BPU)] = { + /* Using the same code for *HW_BRANCH* */ + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = { 0x02 }, + [C(RESULT_MISS)] = { 0x03 }, + }, +}, +}; + +static int __hw_perf_event_init(struct perf_event *event) +{ + int err; + struct hw_perf_event *hwc = &event->hw; + struct perf_event_attr *attr = &event->attr; + const struct loongarch_perf_event *pev; + + /* Returning LoongArch event descriptor for generic perf event. */ + if (PERF_TYPE_HARDWARE == event->attr.type) { + if (event->attr.config >= PERF_COUNT_HW_MAX) + return -EINVAL; + pev = loongarch_pmu_map_general_event(event->attr.config); + } else if (PERF_TYPE_HW_CACHE == event->attr.type) { + pev = loongarch_pmu_map_cache_event(event->attr.config); + } else if (PERF_TYPE_RAW == event->attr.type) { + /* We are working on the global raw event. */ + mutex_lock(&raw_event_mutex); + pev = loongarch_pmu.map_raw_event(event->attr.config); + } else { + /* The event type is not (yet) supported. 
*/ + return -EOPNOTSUPP; + } + + if (IS_ERR(pev)) { + if (PERF_TYPE_RAW == event->attr.type) + mutex_unlock(&raw_event_mutex); + return PTR_ERR(pev); + } + + /* + * We allow max flexibility on how each individual counter shared + * by the single CPU operates (the mode exclusion and the range). + */ + hwc->config_base = CSR_PERFCTRL_IE; + + hwc->event_base = loongarch_pmu_perf_event_encode(pev); + if (PERF_TYPE_RAW == event->attr.type) + mutex_unlock(&raw_event_mutex); + + if (!attr->exclude_user) { + hwc->config_base |= CSR_PERFCTRL_PLV3; + hwc->config_base |= CSR_PERFCTRL_PLV2; + } + if (!attr->exclude_kernel) { + hwc->config_base |= CSR_PERFCTRL_PLV0; + } + if (!attr->exclude_hv) { + hwc->config_base |= CSR_PERFCTRL_PLV1; + } + + hwc->config_base &= M_PERFCTL_CONFIG_MASK; + /* + * The event can belong to another cpu. We do not assign a local + * counter for it for now. + */ + hwc->idx = -1; + hwc->config = 0; + + if (!hwc->sample_period) { + hwc->sample_period = loongarch_pmu.max_period; + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + } + + err = 0; + if (event->group_leader != event) + err = validate_group(event); + + event->destroy = hw_perf_event_destroy; + + if (err) + event->destroy(event); + + return err; +} + +static void pause_local_counters(void) +{ + unsigned long flags; + int ctr = loongarch_pmu.num_counters; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + local_irq_save(flags); + do { + ctr--; + cpuc->saved_ctrl[ctr] = loongarch_pmu_read_control(ctr); + loongarch_pmu_write_control(ctr, cpuc->saved_ctrl[ctr] & + ~M_PERFCTL_COUNT_EVENT_WHENEVER); + } while (ctr > 0); + local_irq_restore(flags); +} + +static void resume_local_counters(void) +{ + int ctr = loongarch_pmu.num_counters; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + do { + ctr--; + loongarch_pmu_write_control(ctr, cpuc->saved_ctrl[ctr]); + } while (ctr > 0); +} + +static const struct loongarch_perf_event 
*loongarch_pmu_map_raw_event(u64 config) +{ + raw_event.event_id = config & 0xff; + + return &raw_event; +} + +static int __init init_hw_perf_events(void) +{ + int counters; + + if (!cpu_has_pmp) + return -ENODEV; + + pr_info("Performance counters: "); + counters = ((read_cpucfg(LOONGARCH_CPUCFG6) & CPUCFG6_PMNUM) >> 4) + 1; + + loongarch_pmu.num_counters = counters; + loongarch_pmu.max_period = (1ULL << 63) - 1; + loongarch_pmu.valid_count = (1ULL << 63) - 1; + loongarch_pmu.overflow = 1ULL << 63; + loongarch_pmu.name = "loongarch/loongson64"; + loongarch_pmu.read_counter = loongarch_pmu_read_counter; + loongarch_pmu.write_counter = loongarch_pmu_write_counter; + loongarch_pmu.map_raw_event = loongarch_pmu_map_raw_event; + loongarch_pmu.general_event_map = &loongson_event_map; + loongarch_pmu.cache_event_map = &loongson_cache_map; + + on_each_cpu(reset_counters, NULL, 1); + + pr_cont("%s PMU enabled, %d %d-bit counters available to each CPU.\n", + loongarch_pmu.name, counters, 64); + + perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); + + return 0; +} +early_initcall(init_hw_perf_events); diff --git a/arch/loongarch/kernel/perf_regs.c b/arch/loongarch/kernel/perf_regs.c new file mode 100644 index 0000000000000..263ac4ab5af68 --- /dev/null +++ b/arch/loongarch/kernel/perf_regs.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 Loongson Technology Corporation Limited + * + * Derived from MIPS: + * Copyright (C) 2013 Cavium, Inc. 
+ */ + +#include <linux/perf_event.h> + +#include <asm/ptrace.h> + +#ifdef CONFIG_32BIT +u64 perf_reg_abi(struct task_struct *tsk) +{ + return PERF_SAMPLE_REGS_ABI_32; +} +#else /* Must be CONFIG_64BIT */ +u64 perf_reg_abi(struct task_struct *tsk) +{ + if (test_tsk_thread_flag(tsk, TIF_32BIT_REGS)) + return PERF_SAMPLE_REGS_ABI_32; + else + return PERF_SAMPLE_REGS_ABI_64; +} +#endif /* CONFIG_32BIT */ + +int perf_reg_validate(u64 mask) +{ + if (!mask) + return -EINVAL; + if (mask & ~((1ull << PERF_REG_LOONGARCH_MAX) - 1)) + return -EINVAL; + return 0; +} + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ + if (WARN_ON_ONCE((u32)idx >= PERF_REG_LOONGARCH_MAX)) + return 0; + + if ((u32)idx == PERF_REG_LOONGARCH_PC) + return regs->csr_era; + + return regs->regs[idx]; +} + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs) +{ + regs_user->regs = task_pt_regs(current); + regs_user->abi = perf_reg_abi(current); +} -- GitLab From dea2df3cc72555633cc7858ce1daa4b757f843ad Mon Sep 17 00:00:00 2001 From: Huacai Chen <chenhuacai@loongson.cn> Date: Wed, 12 Oct 2022 16:36:14 +0800 Subject: [PATCH 1802/2223] LoongArch: Add SysRq-x (TLB Dump) support Add SysRq-x (TLB Dump) support for LoongArch, which is useful for debugging. 
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/kernel/Makefile | 2 ++ arch/loongarch/kernel/sysrq.c | 65 ++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 arch/loongarch/kernel/sysrq.c diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile index a213e994db68c..7225916dd3781 100644 --- a/arch/loongarch/kernel/Makefile +++ b/arch/loongarch/kernel/Makefile @@ -23,6 +23,8 @@ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_NUMA) += numa.o +obj-$(CONFIG_MAGIC_SYSRQ) += sysrq.o + obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o obj-$(CONFIG_UNWINDER_PROLOGUE) += unwind_prologue.o diff --git a/arch/loongarch/kernel/sysrq.c b/arch/loongarch/kernel/sysrq.c new file mode 100644 index 0000000000000..366baef72d297 --- /dev/null +++ b/arch/loongarch/kernel/sysrq.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * LoongArch specific sysrq operations. + * + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited + */ +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/spinlock.h> +#include <linux/sysrq.h> +#include <linux/workqueue.h> + +#include <asm/cpu-features.h> +#include <asm/tlb.h> + +/* + * Dump TLB entries on all CPUs. 
+ */ + +static DEFINE_SPINLOCK(show_lock); + +static void sysrq_tlbdump_single(void *dummy) +{ + unsigned long flags; + + spin_lock_irqsave(&show_lock, flags); + + pr_info("CPU%d:\n", smp_processor_id()); + dump_tlb_regs(); + pr_info("\n"); + dump_tlb_all(); + pr_info("\n"); + + spin_unlock_irqrestore(&show_lock, flags); +} + +#ifdef CONFIG_SMP +static void sysrq_tlbdump_othercpus(struct work_struct *dummy) +{ + smp_call_function(sysrq_tlbdump_single, NULL, 0); +} + +static DECLARE_WORK(sysrq_tlbdump, sysrq_tlbdump_othercpus); +#endif + +static void sysrq_handle_tlbdump(int key) +{ + sysrq_tlbdump_single(NULL); +#ifdef CONFIG_SMP + schedule_work(&sysrq_tlbdump); +#endif +} + +static struct sysrq_key_op sysrq_tlbdump_op = { + .handler = sysrq_handle_tlbdump, + .help_msg = "show-tlbs(x)", + .action_msg = "Show TLB entries", + .enable_mask = SYSRQ_ENABLE_DUMP, +}; + +static int __init loongarch_sysrq_init(void) +{ + return register_sysrq_key('x', &sysrq_tlbdump_op); +} +arch_initcall(loongarch_sysrq_init); -- GitLab From 2d2c395217d2233e752dbddcae3c5d94050b48c1 Mon Sep 17 00:00:00 2001 From: Youling Tang <tangyouling@loongson.cn> Date: Wed, 12 Oct 2022 16:36:19 +0800 Subject: [PATCH 1803/2223] LoongArch: Use generic BUG() handler Inspired by commit 9fb7410f955("arm64/BUG: Use BRK instruction for generic BUG traps"), do similar for LoongArch to use generic BUG() handler. This patch uses the BREAK software breakpoint instruction to generate a trap instead, similarly to most other arches, with the generic BUG code generating the dmesg boilerplate. This allows bug metadata to be moved to a separate table and reduces the amount of inline code at BUG() and WARN() sites. This also avoids clobbering any registers before they can be dumped. To mitigate the size of the bug table further, this patch makes use of the existing infrastructure for encoding addresses within the bug table as 32-bit relative pointers instead of absolute pointers. 
(Note: this limits the max kernel size to 2GB.) Before patch: [ 3018.338013] lkdtm: Performing direct entry BUG [ 3018.342445] Kernel bug detected[#5]: [ 3018.345992] CPU: 2 PID: 865 Comm: cat Tainted: G D 6.0.0-rc6+ #35 After patch: [ 125.585985] lkdtm: Performing direct entry BUG [ 125.590433] ------------[ cut here ]------------ [ 125.595020] kernel BUG at drivers/misc/lkdtm/bugs.c:78! [ 125.600211] Oops - BUG[#1]: [ 125.602980] CPU: 3 PID: 410 Comm: cat Not tainted 6.0.0-rc6+ #36 Compared to before, the report now includes the file/line information of the BUG() site, which is stored out of line in the bug table. Signed-off-by: Youling Tang <tangyouling@loongson.cn> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/Kconfig | 8 +++++ arch/loongarch/include/asm/bug.h | 58 ++++++++++++++++++++++++++------ arch/loongarch/kernel/head.S | 4 +++ arch/loongarch/kernel/traps.c | 26 ++++++++++++-- 4 files changed, 84 insertions(+), 12 deletions(-) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 223edbb8fe847..cc3ba53242b63 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -142,6 +142,14 @@ config CPU_HAS_PREFETCH bool default y +config GENERIC_BUG + def_bool y + depends on BUG + +config GENERIC_BUG_RELATIVE_POINTERS + def_bool y + depends on GENERIC_BUG + config GENERIC_CALIBRATE_DELAY def_bool y diff --git a/arch/loongarch/include/asm/bug.h b/arch/loongarch/include/asm/bug.h index bda49108a76d0..d4ca3ba254188 100644 --- a/arch/loongarch/include/asm/bug.h +++ b/arch/loongarch/include/asm/bug.h @@ -2,21 +2,59 @@ #ifndef __ASM_BUG_H #define __ASM_BUG_H -#include <linux/compiler.h> +#include <asm/break.h> +#include <linux/stringify.h> + +#ifndef CONFIG_DEBUG_BUGVERBOSE +#define _BUGVERBOSE_LOCATION(file, line) +#else +#define __BUGVERBOSE_LOCATION(file, line) \ + .pushsection .rodata.str, "aMS", @progbits, 1; \ + 10002: .string file; \ + .popsection; \ + \ + .long 10002b - .; \ + .short line; +#define _BUGVERBOSE_LOCATION(file, line) __BUGVERBOSE_LOCATION(file, line) +#endif -#ifdef
CONFIG_BUG +#ifndef CONFIG_GENERIC_BUG +#define __BUG_ENTRY(flags) +#else +#define __BUG_ENTRY(flags) \ + .pushsection __bug_table, "aw"; \ + .align 2; \ + 10000: .long 10001f - .; \ + _BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ + .short flags; \ + .popsection; \ + 10001: +#endif -#include <asm/break.h> +#define ASM_BUG_FLAGS(flags) \ + __BUG_ENTRY(flags) \ + break BRK_BUG -static inline void __noreturn BUG(void) -{ - __asm__ __volatile__("break %0" : : "i" (BRK_BUG)); - unreachable(); -} +#define ASM_BUG() ASM_BUG_FLAGS(0) -#define HAVE_ARCH_BUG +#define __BUG_FLAGS(flags) \ + asm_inline volatile (__stringify(ASM_BUG_FLAGS(flags))); -#endif +#define __WARN_FLAGS(flags) \ +do { \ + instrumentation_begin(); \ + __BUG_FLAGS(BUGFLAG_WARNING|(flags)); \ + instrumentation_end(); \ +} while (0) + +#define BUG() \ +do { \ + instrumentation_begin(); \ + __BUG_FLAGS(0); \ + unreachable(); \ +} while (0) + +#define HAVE_ARCH_BUG #include <asm-generic/bug.h> diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S index 0c67c24ce0878..d32128f1d3c49 100644 --- a/arch/loongarch/kernel/head.S +++ b/arch/loongarch/kernel/head.S @@ -8,6 +8,7 @@ #include <asm/addrspace.h> #include <asm/asm.h> #include <asm/asmmacro.h> +#include <asm/bug.h> #include <asm/regdef.h> #include <asm/loongarch.h> #include <asm/stackframe.h> @@ -85,6 +86,7 @@ SYM_CODE_START(kernel_entry) # kernel entry point PTR_ADDI sp, sp, -4 * SZREG # init stack pointer bl start_kernel + ASM_BUG() SYM_CODE_END(kernel_entry) @@ -116,6 +118,8 @@ SYM_CODE_START(smpboot_entry) ld.d tp, t0, CPU_BOOT_TINFO bl start_secondary + ASM_BUG() + SYM_CODE_END(smpboot_entry) #endif /* CONFIG_SMP */ diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index a5e8bd5d79484..66c2849b26e5c 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -374,6 +374,29 @@ asmlinkage void noinstr do_ale(struct pt_regs *regs) irqentry_exit(regs, state); } +#ifdef CONFIG_GENERIC_BUG +int 
is_valid_bugaddr(unsigned long addr) +{ + return 1; +} +#endif /* CONFIG_GENERIC_BUG */ + +static void bug_handler(struct pt_regs *regs) +{ + switch (report_bug(regs->csr_era, regs)) { + case BUG_TRAP_TYPE_BUG: + case BUG_TRAP_TYPE_NONE: + die_if_kernel("Oops - BUG", regs); + force_sig(SIGTRAP); + break; + + case BUG_TRAP_TYPE_WARN: + /* Skip the BUG instruction and continue */ + regs->csr_era += LOONGARCH_INSN_SIZE; + break; + } +} + asmlinkage void noinstr do_bp(struct pt_regs *regs) { bool user = user_mode(regs); @@ -427,8 +450,7 @@ asmlinkage void noinstr do_bp(struct pt_regs *regs) switch (bcode) { case BRK_BUG: - die_if_kernel("Kernel bug detected", regs); - force_sig(SIGTRAP); + bug_handler(regs); break; case BRK_DIVZERO: die_if_kernel("Break instruction in kernel code", regs); -- GitLab From 4a03b2ac06a5bcae29371866d9d11f5bfd4c9188 Mon Sep 17 00:00:00 2001 From: Youling Tang <tangyouling@loongson.cn> Date: Wed, 12 Oct 2022 16:36:19 +0800 Subject: [PATCH 1804/2223] LoongArch: Add kexec support Add three new files, kexec.h, machine_kexec.c and relocate_kernel.S to the LoongArch architecture, so as to add support for the kexec re-boot mechanism (CONFIG_KEXEC) on LoongArch platforms. Kexec supports loading vmlinux.elf in ELF format and vmlinux.efi in PE format. 
I tested kexec on LoongArch machines (Loongson-3A5000) and it works as expected: $ sudo kexec -l /boot/vmlinux.efi --reuse-cmdline $ sudo kexec -e Signed-off-by: Youling Tang <tangyouling@loongson.cn> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/Kconfig | 11 ++ arch/loongarch/include/asm/kexec.h | 60 +++++++ arch/loongarch/kernel/Makefile | 2 + arch/loongarch/kernel/head.S | 6 +- arch/loongarch/kernel/machine_kexec.c | 216 ++++++++++++++++++++++++ arch/loongarch/kernel/relocate_kernel.S | 106 ++++++++++++ 6 files changed, 400 insertions(+), 1 deletion(-) create mode 100644 arch/loongarch/include/asm/kexec.h create mode 100644 arch/loongarch/kernel/machine_kexec.c create mode 100644 arch/loongarch/kernel/relocate_kernel.S diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index cc3ba53242b63..cbbb82b0c4fe9 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -426,6 +426,17 @@ config ARCH_IOREMAP protection support. However, you can enable LoongArch DMW-based ioremap() for better performance. +config KEXEC + bool "Kexec system call" + select KEXEC_CORE + help + kexec is a system call that implements the ability to shutdown your + current kernel, and to start another kernel. It is like a reboot + but it is independent of the system firmware. And like a reboot + you can start any kernel with it, not just Linux. + + The name comes from the similarity to the exec system call. 
+ config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS diff --git a/arch/loongarch/include/asm/kexec.h b/arch/loongarch/include/asm/kexec.h new file mode 100644 index 0000000000000..cf95cd3eb2dea --- /dev/null +++ b/arch/loongarch/include/asm/kexec.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * kexec.h for kexec + * + * Copyright (C) 2022 Loongson Technology Corporation Limited + */ + +#ifndef _ASM_KEXEC_H +#define _ASM_KEXEC_H + +#include <asm/stacktrace.h> +#include <asm/page.h> + +/* Maximum physical address we can use pages from */ +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) +/* Maximum address we can reach in physical address mode */ +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL) + /* Maximum address we can use for the control code buffer */ +#define KEXEC_CONTROL_MEMORY_LIMIT (-1UL) + +/* Reserve a page for the control code buffer */ +#define KEXEC_CONTROL_PAGE_SIZE PAGE_SIZE + +/* The native architecture */ +#define KEXEC_ARCH KEXEC_ARCH_LOONGARCH + +static inline void crash_setup_regs(struct pt_regs *newregs, + struct pt_regs *oldregs) +{ + if (oldregs) + memcpy(newregs, oldregs, sizeof(*newregs)); + else + prepare_frametrace(newregs); +} + +#define ARCH_HAS_KIMAGE_ARCH + +struct kimage_arch { + unsigned long efi_boot; + unsigned long cmdline_ptr; + unsigned long systable_ptr; +}; + +typedef void (*do_kexec_t)(unsigned long efi_boot, + unsigned long cmdline_ptr, + unsigned long systable_ptr, + unsigned long start_addr, + unsigned long first_ind_entry); + +struct kimage; +extern const unsigned char relocate_new_kernel[]; +extern const size_t relocate_new_kernel_size; +extern void kexec_reboot(void); + +#ifdef CONFIG_SMP +extern atomic_t kexec_ready_to_reboot; +extern const unsigned char kexec_smp_wait[]; +#endif + +#endif /* !_ASM_KEXEC_H */ diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile index 7225916dd3781..0bad6272a55e6 100644 --- a/arch/loongarch/kernel/Makefile +++ 
b/arch/loongarch/kernel/Makefile @@ -25,6 +25,8 @@ obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_MAGIC_SYSRQ) += sysrq.o +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o + obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o obj-$(CONFIG_UNWINDER_PROLOGUE) += unwind_prologue.o diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S index d32128f1d3c49..97425779ce9f3 100644 --- a/arch/loongarch/kernel/head.S +++ b/arch/loongarch/kernel/head.S @@ -21,7 +21,11 @@ _head: .word MZ_MAGIC /* "MZ", MS-DOS header */ - .org 0x3c /* 0x04 ~ 0x3b reserved */ + .org 0x8 + .dword kernel_entry /* Kernel entry point */ + .dword _end - _text /* Kernel image effective size */ + .quad 0 /* Kernel image load offset from start of RAM */ + .org 0x3c /* 0x20 ~ 0x3b reserved */ .long pe_header - _head /* Offset to the PE header */ pe_header: diff --git a/arch/loongarch/kernel/machine_kexec.c b/arch/loongarch/kernel/machine_kexec.c new file mode 100644 index 0000000000000..d5037573ed660 --- /dev/null +++ b/arch/loongarch/kernel/machine_kexec.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * machine_kexec.c for kexec + * + * Copyright (C) 2022 Loongson Technology Corporation Limited + */ +#include <linux/compiler.h> +#include <linux/cpu.h> +#include <linux/kexec.h> +#include <linux/mm.h> +#include <linux/delay.h> +#include <linux/libfdt.h> +#include <linux/of_fdt.h> + +#include <asm/bootinfo.h> +#include <asm/cacheflush.h> +#include <asm/page.h> + +/* 0x100000 ~ 0x200000 is safe */ +#define KEXEC_CONTROL_CODE TO_CACHE(0x100000UL) +#define KEXEC_CMDLINE_ADDR TO_CACHE(0x108000UL) + +static unsigned long reboot_code_buffer; + +#ifdef CONFIG_SMP +static void (*relocated_kexec_smp_wait)(void *); +atomic_t kexec_ready_to_reboot = ATOMIC_INIT(0); +#endif + +static unsigned long efi_boot; +static unsigned long cmdline_ptr; +static unsigned long systable_ptr; +static unsigned long start_addr; +static unsigned long first_ind_entry; + +static void 
kexec_image_info(const struct kimage *kimage) +{ + unsigned long i; + + pr_debug("kexec kimage info:\n"); + pr_debug("\ttype: %d\n", kimage->type); + pr_debug("\tstart: %lx\n", kimage->start); + pr_debug("\thead: %lx\n", kimage->head); + pr_debug("\tnr_segments: %lu\n", kimage->nr_segments); + + for (i = 0; i < kimage->nr_segments; i++) { + pr_debug("\t segment[%lu]: %016lx - %016lx", i, + kimage->segment[i].mem, + kimage->segment[i].mem + kimage->segment[i].memsz); + pr_debug("\t\t0x%lx bytes, %lu pages\n", + (unsigned long)kimage->segment[i].memsz, + (unsigned long)kimage->segment[i].memsz / PAGE_SIZE); + } +} + +int machine_kexec_prepare(struct kimage *kimage) +{ + int i; + char *bootloader = "kexec"; + void *cmdline_ptr = (void *)KEXEC_CMDLINE_ADDR; + + kexec_image_info(kimage); + + kimage->arch.efi_boot = fw_arg0; + kimage->arch.systable_ptr = fw_arg2; + + /* Find the command line */ + for (i = 0; i < kimage->nr_segments; i++) { + if (!strncmp(bootloader, (char __user *)kimage->segment[i].buf, strlen(bootloader))) { + if (!copy_from_user(cmdline_ptr, kimage->segment[i].buf, COMMAND_LINE_SIZE)) + kimage->arch.cmdline_ptr = (unsigned long)cmdline_ptr; + break; + } + } + + if (!kimage->arch.cmdline_ptr) { + pr_err("Command line not included in the provided image\n"); + return -EINVAL; + } + + /* kexec need a safe page to save reboot_code_buffer */ + kimage->control_code_page = virt_to_page((void *)KEXEC_CONTROL_CODE); + + reboot_code_buffer = (unsigned long)page_address(kimage->control_code_page); + memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size); + +#ifdef CONFIG_SMP + /* All secondary cpus now may jump to kexec_smp_wait cycle */ + relocated_kexec_smp_wait = reboot_code_buffer + (void *)(kexec_smp_wait - relocate_new_kernel); +#endif + + return 0; +} + +void machine_kexec_cleanup(struct kimage *kimage) +{ +} + +void kexec_reboot(void) +{ + do_kexec_t do_kexec = NULL; + + /* + * We know we were online, and there will be no 
incoming IPIs at + * this point. + */ + set_cpu_online(smp_processor_id(), true); + + /* Ensure remote CPUs observe that we're online before rebooting. */ + smp_mb__after_atomic(); + + /* + * Make sure we get correct instructions written by the + * machine_kexec_prepare() CPU. + */ + __asm__ __volatile__ ("\tibar 0\n"::); + +#ifdef CONFIG_SMP + /* All secondary cpus go to kexec_smp_wait */ + if (smp_processor_id() > 0) { + relocated_kexec_smp_wait(NULL); + unreachable(); + } +#endif + + do_kexec = (void *)reboot_code_buffer; + do_kexec(efi_boot, cmdline_ptr, systable_ptr, start_addr, first_ind_entry); + + unreachable(); +} + + +#ifdef CONFIG_SMP +static void kexec_shutdown_secondary(void *regs) +{ + int cpu = smp_processor_id(); + + if (!cpu_online(cpu)) + return; + + /* We won't be sent IPIs any more. */ + set_cpu_online(cpu, false); + + local_irq_disable(); + while (!atomic_read(&kexec_ready_to_reboot)) + cpu_relax(); + + kexec_reboot(); +} +#endif + +void machine_shutdown(void) +{ + int cpu; + + /* All CPUs go to reboot_code_buffer */ + for_each_possible_cpu(cpu) + if (!cpu_online(cpu)) + cpu_device_up(get_cpu_device(cpu)); + +#ifdef CONFIG_SMP + smp_call_function(kexec_shutdown_secondary, NULL, 0); +#endif +} + +void machine_crash_shutdown(struct pt_regs *regs) +{ +} + +void machine_kexec(struct kimage *image) +{ + unsigned long entry, *ptr; + struct kimage_arch *internal = &image->arch; + + efi_boot = internal->efi_boot; + cmdline_ptr = internal->cmdline_ptr; + systable_ptr = internal->systable_ptr; + + start_addr = (unsigned long)phys_to_virt(image->start); + + first_ind_entry = (unsigned long)phys_to_virt(image->head & PAGE_MASK); + + /* + * The generic kexec code builds a page list with physical + * addresses. they are directly accessible through XKPRANGE + * hence the phys_to_virt() call. + */ + for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); + ptr = (entry & IND_INDIRECTION) ? 
+ phys_to_virt(entry & PAGE_MASK) : ptr + 1) { + if (*ptr & IND_SOURCE || *ptr & IND_INDIRECTION || + *ptr & IND_DESTINATION) + *ptr = (unsigned long) phys_to_virt(*ptr); + } + + /* Mark offline before disabling local irq. */ + set_cpu_online(smp_processor_id(), false); + + /* We do not want to be bothered. */ + local_irq_disable(); + + pr_notice("EFI boot flag 0x%lx\n", efi_boot); + pr_notice("Command line at 0x%lx\n", cmdline_ptr); + pr_notice("System table at 0x%lx\n", systable_ptr); + pr_notice("We will call new kernel at 0x%lx\n", start_addr); + pr_notice("Bye ...\n"); + + /* Make reboot code buffer available to the boot CPU. */ + flush_cache_all(); + +#ifdef CONFIG_SMP + atomic_set(&kexec_ready_to_reboot, 1); +#endif + + kexec_reboot(); +} diff --git a/arch/loongarch/kernel/relocate_kernel.S b/arch/loongarch/kernel/relocate_kernel.S new file mode 100644 index 0000000000000..48362f38c6bf6 --- /dev/null +++ b/arch/loongarch/kernel/relocate_kernel.S @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * relocate_kernel.S for kexec + * + * Copyright (C) 2022 Loongson Technology Corporation Limited + */ + +#include <linux/kexec.h> + +#include <asm/asm.h> +#include <asm/asmmacro.h> +#include <asm/regdef.h> +#include <asm/loongarch.h> +#include <asm/stackframe.h> +#include <asm/addrspace.h> + +SYM_CODE_START(relocate_new_kernel) + /* + * a0: EFI boot flag for the new kernel + * a1: Command line pointer for the new kernel + * a2: System table pointer for the new kernel + * a3: Start address to jump to after relocation + * a4: Pointer to the current indirection page entry + */ + move s0, a4 + +process_entry: + PTR_L s1, s0, 0 + PTR_ADDI s0, s0, SZREG + + /* destination page */ + andi s2, s1, IND_DESTINATION + beqz s2, 1f + li.w t0, ~0x1 + and s3, s1, t0 /* store destination addr in s3 */ + b process_entry + +1: + /* indirection page, update s0 */ + andi s2, s1, IND_INDIRECTION + beqz s2, 1f + li.w t0, ~0x2 + and s0, s1, t0 + b process_entry + +1: + /* done 
page */ + andi s2, s1, IND_DONE + beqz s2, 1f + b done + +1: + /* source page */ + andi s2, s1, IND_SOURCE + beqz s2, process_entry + li.w t0, ~0x8 + and s1, s1, t0 + li.w s5, (1 << _PAGE_SHIFT) / SZREG + +copy_word: + /* copy page word by word */ + REG_L s4, s1, 0 + REG_S s4, s3, 0 + PTR_ADDI s3, s3, SZREG + PTR_ADDI s1, s1, SZREG + LONG_ADDI s5, s5, -1 + beqz s5, process_entry + b copy_word + b process_entry + +done: + ibar 0 + dbar 0 + + /* + * Jump to the new kernel, + * make sure the values of a0, a1, a2 and a3 are not changed. + */ + jr a3 +SYM_CODE_END(relocate_new_kernel) + +#ifdef CONFIG_SMP +/* + * Other CPUs should wait until code is relocated and + * then start at the entry point from LOONGARCH_IOCSR_MBUF0. + */ +SYM_CODE_START(kexec_smp_wait) +1: li.w t0, 0x100 /* wait for init loop */ +2: addi.w t0, t0, -1 /* limit mailbox access */ + bnez t0, 2b + li.w t1, LOONGARCH_IOCSR_MBUF0 + iocsrrd.w s0, t1 /* check PC as an indicator */ + beqz s0, 1b + iocsrrd.d s0, t1 /* get PC via mailbox */ + + li.d t0, CACHE_BASE + or s0, s0, t0 /* s0 = TO_CACHE(s0) */ + jr s0 /* jump to initial PC */ +SYM_CODE_END(kexec_smp_wait) +#endif + +relocate_new_kernel_end: + +SYM_DATA_START(relocate_new_kernel_size) + PTR relocate_new_kernel_end - relocate_new_kernel +SYM_DATA_END(relocate_new_kernel_size) -- GitLab From 4e62d1d86585e1b62b4f96ee586881dd45a443dc Mon Sep 17 00:00:00 2001 From: Youling Tang <tangyouling@loongson.cn> Date: Wed, 12 Oct 2022 16:36:19 +0800 Subject: [PATCH 1805/2223] LoongArch: Add kdump support This patch adds support for kdump. In the kdump case the normal kernel will reserve a region for the crash kernel and jump there on panic. Arch-specific functions are added to allow for implementing a crash dump file interface, /proc/vmcore, which can be viewed as an ELF file. 
A user-space tool, such as kexec-tools, is responsible for allocating a separate region for the core's ELF header within the crash kdump kernel memory and filling it in when executing kexec_load(). Then, its location will be advertised to the crash dump kernel via a command line argument "elfcorehdr=", and the crash dump kernel will preserve this region for later use with arch_reserve_vmcore() at boot time. At the same time, the crash kdump kernel is also limited within the "crashkernel" area via a command line argument "mem=", so as not to destroy the original kernel dump data. In the crash dump kernel environment, /proc/vmcore is used to access the primary kernel's memory with copy_oldmem_page(). I tested kdump on LoongArch machines (Loongson-3A5000) and it works as expected (suggested crashkernel parameter is "crashkernel=512M@2560M"), you may test it by triggering a crash through /proc/sysrq-trigger: $ sudo kexec -p /boot/vmlinux-kdump --reuse-cmdline --append="nr_cpus=1" # echo c > /proc/sysrq-trigger Signed-off-by: Youling Tang <tangyouling@loongson.cn> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/Kconfig | 22 ++++++ arch/loongarch/Makefile | 4 + arch/loongarch/kernel/Makefile | 1 + arch/loongarch/kernel/crash_dump.c | 23 ++++++ arch/loongarch/kernel/machine_kexec.c | 98 +++++++++++++++++++++++-- arch/loongarch/kernel/mem.c | 3 - arch/loongarch/kernel/relocate_kernel.S | 6 ++ arch/loongarch/kernel/setup.c | 74 +++++++++++++++++++ arch/loongarch/kernel/traps.c | 4 + 9 files changed, 227 insertions(+), 8 deletions(-) create mode 100644 arch/loongarch/kernel/crash_dump.c diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index cbbb82b0c4fe9..2837ac5413c08 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -437,6 +437,28 @@ config KEXEC The name comes from the similarity to the exec system call. +config CRASH_DUMP + bool "Build kdump crash kernel" + help + Generate crash dump after being started by kexec. 
This should + be normally only set in special crash dump kernels which are + loaded in the main kernel with kexec-tools into a specially + reserved region and then later executed after a crash by + kdump/kexec. + + For more details see Documentation/admin-guide/kdump/kdump.rst + +config PHYSICAL_START + hex "Physical address where the kernel is loaded" + default "0x90000000a0000000" + depends on CRASH_DUMP + help + This gives the XKPRANGE address where the kernel is loaded. + If you plan to use kernel for capturing the crash dump change + this value to start of the reserved region (the "X" value as + specified in the "crashkernel=YM@XM" command line boot parameter + passed to the panic-ed kernel). + config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index 42352f9058582..ea17e692684ef 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -69,7 +69,11 @@ endif cflags-y += -ffreestanding cflags-y += $(call cc-option, -mno-check-zero-division) +ifndef CONFIG_PHYSICAL_START load-y = 0x9000000000200000 +else +load-y = $(CONFIG_PHYSICAL_START) +endif bootvars-y = VMLINUX_LOAD_ADDRESS=$(load-y) drivers-$(CONFIG_PCI) += arch/loongarch/pci/ diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile index 0bad6272a55e6..b8cca9c22a9ff 100644 --- a/arch/loongarch/kernel/Makefile +++ b/arch/loongarch/kernel/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_MAGIC_SYSRQ) += sysrq.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o obj-$(CONFIG_UNWINDER_PROLOGUE) += unwind_prologue.o diff --git a/arch/loongarch/kernel/crash_dump.c b/arch/loongarch/kernel/crash_dump.c new file mode 100644 index 0000000000000..e559307c10929 --- /dev/null +++ b/arch/loongarch/kernel/crash_dump.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 
+#include <linux/crash_dump.h> +#include <linux/io.h> +#include <linux/uio.h> + +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) +{ + void *vaddr; + + if (!csize) + return 0; + + vaddr = memremap(__pfn_to_phys(pfn), PAGE_SIZE, MEMREMAP_WB); + if (!vaddr) + return -ENOMEM; + + csize = copy_to_iter(vaddr + offset, csize, iter); + + memunmap(vaddr); + + return csize; +} diff --git a/arch/loongarch/kernel/machine_kexec.c b/arch/loongarch/kernel/machine_kexec.c index d5037573ed660..2dcb9e003657c 100644 --- a/arch/loongarch/kernel/machine_kexec.c +++ b/arch/loongarch/kernel/machine_kexec.c @@ -7,10 +7,15 @@ #include <linux/compiler.h> #include <linux/cpu.h> #include <linux/kexec.h> -#include <linux/mm.h> +#include <linux/crash_dump.h> #include <linux/delay.h> +#include <linux/irq.h> #include <linux/libfdt.h> +#include <linux/mm.h> #include <linux/of_fdt.h> +#include <linux/reboot.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <asm/bootinfo.h> #include <asm/cacheflush.h> @@ -21,6 +26,7 @@ #define KEXEC_CMDLINE_ADDR TO_CACHE(0x108000UL) static unsigned long reboot_code_buffer; +static cpumask_t cpus_in_crash = CPU_MASK_NONE; #ifdef CONFIG_SMP static void (*relocated_kexec_smp_wait)(void *); @@ -78,7 +84,7 @@ int machine_kexec_prepare(struct kimage *kimage) return -EINVAL; } - /* kexec need a safe page to save reboot_code_buffer */ + /* kexec/kdump need a safe page to save reboot_code_buffer */ kimage->control_code_page = virt_to_page((void *)KEXEC_CONTROL_CODE); reboot_code_buffer = (unsigned long)page_address(kimage->control_code_page); @@ -102,7 +108,8 @@ void kexec_reboot(void) /* * We know we were online, and there will be no incoming IPIs at - * this point. + * this point. Mark online again before rebooting so that the crash + * analysis tool will see us correctly. 
*/ set_cpu_online(smp_processor_id(), true); @@ -147,7 +154,74 @@ static void kexec_shutdown_secondary(void *regs) kexec_reboot(); } -#endif + +static void crash_shutdown_secondary(void *passed_regs) +{ + int cpu = smp_processor_id(); + struct pt_regs *regs = passed_regs; + + /* + * If we are passed registers, use those. Otherwise get the + * regs from the last interrupt, which should be correct, as + * we are in an interrupt. But if the regs are not there, + * pull them from the top of the stack. They are probably + * wrong, but we need something to keep from crashing again. + */ + if (!regs) + regs = get_irq_regs(); + if (!regs) + regs = task_pt_regs(current); + + if (!cpu_online(cpu)) + return; + + /* We won't be sent IPIs any more. */ + set_cpu_online(cpu, false); + + local_irq_disable(); + if (!cpumask_test_cpu(cpu, &cpus_in_crash)) + crash_save_cpu(regs, cpu); + cpumask_set_cpu(cpu, &cpus_in_crash); + + while (!atomic_read(&kexec_ready_to_reboot)) + cpu_relax(); + + kexec_reboot(); +} + +void crash_smp_send_stop(void) +{ + unsigned int ncpus; + unsigned long timeout; + static int cpus_stopped; + + /* + * This function can be called twice in panic path, but obviously + * we should execute this only once. + */ + if (cpus_stopped) + return; + + cpus_stopped = 1; + + /* Excluding the panic cpu */ + ncpus = num_online_cpus() - 1; + + smp_call_function(crash_shutdown_secondary, NULL, 0); + smp_wmb(); + + /* + * The crash CPU sends an IPI and wait for other CPUs to + * respond. Delay of at least 10 seconds. 
+ */ + timeout = MSEC_PER_SEC * 10; + pr_emerg("Sending IPI to other cpus...\n"); + while ((cpumask_weight(&cpus_in_crash) < ncpus) && timeout--) { + mdelay(1); + cpu_relax(); + } +} +#endif /* defined(CONFIG_SMP) */ void machine_shutdown(void) { @@ -165,6 +239,19 @@ void machine_shutdown(void) void machine_crash_shutdown(struct pt_regs *regs) { + int crashing_cpu; + + local_irq_disable(); + + crashing_cpu = smp_processor_id(); + crash_save_cpu(regs, crashing_cpu); + +#ifdef CONFIG_SMP + crash_smp_send_stop(); +#endif + cpumask_set_cpu(crashing_cpu, &cpus_in_crash); + + pr_info("Starting crashdump kernel...\n"); } void machine_kexec(struct kimage *image) @@ -178,7 +265,8 @@ void machine_kexec(struct kimage *image) start_addr = (unsigned long)phys_to_virt(image->start); - first_ind_entry = (unsigned long)phys_to_virt(image->head & PAGE_MASK); + first_ind_entry = (image->type == KEXEC_TYPE_DEFAULT) ? + (unsigned long)phys_to_virt(image->head & PAGE_MASK) : 0; /* * The generic kexec code builds a page list with physical diff --git a/arch/loongarch/kernel/mem.c b/arch/loongarch/kernel/mem.c index 7423361b0ebc9..4a4107a6a9651 100644 --- a/arch/loongarch/kernel/mem.c +++ b/arch/loongarch/kernel/mem.c @@ -58,7 +58,4 @@ void __init memblock_init(void) /* Reserve the kernel text/data/bss */ memblock_reserve(__pa_symbol(&_text), __pa_symbol(&_end) - __pa_symbol(&_text)); - - /* Reserve the initrd */ - reserve_initrd_mem(); } diff --git a/arch/loongarch/kernel/relocate_kernel.S b/arch/loongarch/kernel/relocate_kernel.S index 48362f38c6bf6..d13252553a7c7 100644 --- a/arch/loongarch/kernel/relocate_kernel.S +++ b/arch/loongarch/kernel/relocate_kernel.S @@ -24,6 +24,12 @@ SYM_CODE_START(relocate_new_kernel) */ move s0, a4 + /* + * In case of a kdump/crash kernel, the indirection page is not + * populated as the kernel is directly copied to a reserved location + */ + beqz s0, done + process_entry: PTR_L s1, s0, 0 PTR_ADDI s0, s0, SZREG diff --git a/arch/loongarch/kernel/setup.c 
b/arch/loongarch/kernel/setup.c index 05af1102fee75..837111292ec62 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -19,6 +19,8 @@ #include <linux/memblock.h> #include <linux/initrd.h> #include <linux/ioport.h> +#include <linux/kexec.h> +#include <linux/crash_dump.h> #include <linux/root_dev.h> #include <linux/console.h> #include <linux/pfn.h> @@ -185,8 +187,70 @@ static int __init early_parse_mem(char *p) } early_param("mem", early_parse_mem); +static void __init arch_reserve_vmcore(void) +{ +#ifdef CONFIG_PROC_VMCORE + u64 i; + phys_addr_t start, end; + + if (!is_kdump_kernel()) + return; + + if (!elfcorehdr_size) { + for_each_mem_range(i, &start, &end) { + if (elfcorehdr_addr >= start && elfcorehdr_addr < end) { + /* + * Reserve from the elf core header to the end of + * the memory segment, that should all be kdump + * reserved memory. + */ + elfcorehdr_size = end - elfcorehdr_addr; + break; + } + } + } + + if (memblock_is_region_reserved(elfcorehdr_addr, elfcorehdr_size)) { + pr_warn("elfcorehdr is overlapped\n"); + return; + } + + memblock_reserve(elfcorehdr_addr, elfcorehdr_size); + + pr_info("Reserving %llu KiB of memory at 0x%llx for elfcorehdr\n", + elfcorehdr_size >> 10, elfcorehdr_addr); +#endif +} + +static void __init arch_parse_crashkernel(void) +{ +#ifdef CONFIG_KEXEC + int ret; + unsigned long long start; + unsigned long long total_mem; + unsigned long long crash_base, crash_size; + + total_mem = memblock_phys_mem_size(); + ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); + if (ret < 0 || crash_size <= 0) + return; + + start = memblock_phys_alloc_range(crash_size, 1, crash_base, crash_base + crash_size); + if (start != crash_base) { + pr_warn("Invalid memory region reserved for crash kernel\n"); + return; + } + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; +#endif +} + void __init platform_init(void) { + arch_reserve_vmcore(); + 
arch_parse_crashkernel(); + #ifdef CONFIG_ACPI_TABLE_UPGRADE acpi_table_upgrade(); #endif @@ -289,6 +353,15 @@ static void __init resource_init(void) request_resource(res, &data_resource); request_resource(res, &bss_resource); } + +#ifdef CONFIG_KEXEC + if (crashk_res.start < crashk_res.end) { + insert_resource(&iomem_resource, &crashk_res); + pr_info("Reserving %ldMB of memory at %ldMB for crashkernel\n", + (unsigned long)((crashk_res.end - crashk_res.start + 1) >> 20), + (unsigned long)(crashk_res.start >> 20)); + } +#endif } static int __init reserve_memblock_reserved_regions(void) @@ -350,6 +423,7 @@ void __init setup_arch(char **cmdline_p) memblock_init(); pagetable_init(); parse_early_param(); + reserve_initrd_mem(); platform_init(); arch_mem_init(cmdline_p); diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 66c2849b26e5c..1a4dce84ebc60 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -10,6 +10,7 @@ #include <linux/entry-common.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/kexec.h> #include <linux/module.h> #include <linux/extable.h> #include <linux/mm.h> @@ -246,6 +247,9 @@ void __noreturn die(const char *str, struct pt_regs *regs) oops_exit(); + if (regs && kexec_should_crash(current)) + crash_kexec(regs); + if (in_interrupt()) panic("Fatal exception in interrupt"); -- GitLab From 8a34228eb30308f6e223c6f2b87e2381d45056e2 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang <yangtiezhu@loongson.cn> Date: Wed, 12 Oct 2022 16:36:19 +0800 Subject: [PATCH 1806/2223] LoongArch: Move {signed,unsigned}_imm_check() to inst.h {signed,unsigned}_imm_check() will also be used in the bpf jit, so move them from module.c to inst.h, this is preparation for later patches. 
Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/include/asm/inst.h | 10 ++++++++++ arch/loongarch/kernel/module.c | 10 ---------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/loongarch/include/asm/inst.h b/arch/loongarch/include/asm/inst.h index 7b07cbb3188c0..7b3750907ad10 100644 --- a/arch/loongarch/include/asm/inst.h +++ b/arch/loongarch/include/asm/inst.h @@ -166,4 +166,14 @@ u32 larch_insn_gen_lu32id(enum loongarch_gpr rd, int imm); u32 larch_insn_gen_lu52id(enum loongarch_gpr rd, enum loongarch_gpr rj, int imm); u32 larch_insn_gen_jirl(enum loongarch_gpr rd, enum loongarch_gpr rj, unsigned long pc, unsigned long dest); +static inline bool signed_imm_check(long val, unsigned int bit) +{ + return -(1L << (bit - 1)) <= val && val < (1L << (bit - 1)); +} + +static inline bool unsigned_imm_check(unsigned long val, unsigned int bit) +{ + return val < (1UL << bit); +} + #endif /* _ASM_INST_H */ diff --git a/arch/loongarch/kernel/module.c b/arch/loongarch/kernel/module.c index bee7457db8043..097595b2fc14b 100644 --- a/arch/loongarch/kernel/module.c +++ b/arch/loongarch/kernel/module.c @@ -18,16 +18,6 @@ #include <linux/string.h> #include <linux/kernel.h> -static inline bool signed_imm_check(long val, unsigned int bit) -{ - return -(1L << (bit - 1)) <= val && val < (1L << (bit - 1)); -} - -static inline bool unsigned_imm_check(unsigned long val, unsigned int bit) -{ - return val < (1UL << bit); -} - static int rela_stack_push(s64 stack_value, s64 *rela_stack, size_t *rela_stack_top) { if (*rela_stack_top >= RELA_STACK_DEPTH) -- GitLab From 4e59e5a46936dd649208f348ead678c35197203d Mon Sep 17 00:00:00 2001 From: Tiezhu Yang <yangtiezhu@loongson.cn> Date: Wed, 12 Oct 2022 16:36:19 +0800 Subject: [PATCH 1807/2223] LoongArch: Add some instruction opcodes and formats According to the "Table of Instruction Encoding" in LoongArch Reference Manual [1], add some instruction 
opcodes and formats which are used in the BPF JIT for LoongArch. [1] https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#table-of-instruction-encoding Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/include/asm/inst.h | 179 +++++++++++++++++++++++++++++- 1 file changed, 174 insertions(+), 5 deletions(-) diff --git a/arch/loongarch/include/asm/inst.h b/arch/loongarch/include/asm/inst.h index 7b3750907ad10..63df7efff2766 100644 --- a/arch/loongarch/include/asm/inst.h +++ b/arch/loongarch/include/asm/inst.h @@ -8,6 +8,8 @@ #include <linux/types.h> #include <asm/asm.h> +#define INSN_BREAK 0x002a0000 + #define ADDR_IMMMASK_LU52ID 0xFFF0000000000000 #define ADDR_IMMMASK_LU32ID 0x000FFFFF00000000 #define ADDR_IMMMASK_ADDU16ID 0x00000000FFFF0000 @@ -18,9 +20,16 @@ #define ADDR_IMM(addr, INSN) ((addr & ADDR_IMMMASK_##INSN) >> ADDR_IMMSHIFT_##INSN) +enum reg0i26_op { + b_op = 0x14, + bl_op = 0x15, +}; + enum reg1i20_op { lu12iw_op = 0x0a, lu32id_op = 0x0b, + pcaddu12i_op = 0x0e, + pcaddu18i_op = 0x0f, }; enum reg1i21_op { @@ -28,10 +37,34 @@ enum reg1i21_op { bnez_op = 0x11, }; +enum reg2_op { + revb2h_op = 0x0c, + revb4h_op = 0x0d, + revb2w_op = 0x0e, + revbd_op = 0x0f, + revh2w_op = 0x10, + revhd_op = 0x11, +}; + +enum reg2i5_op { + slliw_op = 0x81, + srliw_op = 0x89, + sraiw_op = 0x91, +}; + +enum reg2i6_op { + sllid_op = 0x41, + srlid_op = 0x45, + sraid_op = 0x49, +}; + enum reg2i12_op { addiw_op = 0x0a, addid_op = 0x0b, lu52id_op = 0x0c, + andi_op = 0x0d, + ori_op = 0x0e, + xori_op = 0x0f, ldb_op = 0xa0, ldh_op = 0xa1, ldw_op = 0xa2, @@ -40,6 +73,20 @@ enum reg2i12_op { sth_op = 0xa5, stw_op = 0xa6, std_op = 0xa7, + ldbu_op = 0xa8, + ldhu_op = 0xa9, + ldwu_op = 0xaa, +}; + +enum reg2i14_op { + llw_op = 0x20, + scw_op = 0x21, + lld_op = 0x22, + scd_op = 0x23, + ldptrw_op = 0x24, + stptrw_op = 0x25, + ldptrd_op = 0x26, + stptrd_op = 0x27, }; enum reg2i16_op { @@ -52,6 +99,71 
@@ enum reg2i16_op { bgeu_op = 0x1b, }; +enum reg2bstrd_op { + bstrinsd_op = 0x2, + bstrpickd_op = 0x3, +}; + +enum reg3_op { + addw_op = 0x20, + addd_op = 0x21, + subw_op = 0x22, + subd_op = 0x23, + nor_op = 0x28, + and_op = 0x29, + or_op = 0x2a, + xor_op = 0x2b, + orn_op = 0x2c, + andn_op = 0x2d, + sllw_op = 0x2e, + srlw_op = 0x2f, + sraw_op = 0x30, + slld_op = 0x31, + srld_op = 0x32, + srad_op = 0x33, + mulw_op = 0x38, + mulhw_op = 0x39, + mulhwu_op = 0x3a, + muld_op = 0x3b, + mulhd_op = 0x3c, + mulhdu_op = 0x3d, + divw_op = 0x40, + modw_op = 0x41, + divwu_op = 0x42, + modwu_op = 0x43, + divd_op = 0x44, + modd_op = 0x45, + divdu_op = 0x46, + moddu_op = 0x47, + ldxb_op = 0x7000, + ldxh_op = 0x7008, + ldxw_op = 0x7010, + ldxd_op = 0x7018, + stxb_op = 0x7020, + stxh_op = 0x7028, + stxw_op = 0x7030, + stxd_op = 0x7038, + ldxbu_op = 0x7040, + ldxhu_op = 0x7048, + ldxwu_op = 0x7050, + amswapw_op = 0x70c0, + amswapd_op = 0x70c1, + amaddw_op = 0x70c2, + amaddd_op = 0x70c3, + amandw_op = 0x70c4, + amandd_op = 0x70c5, + amorw_op = 0x70c6, + amord_op = 0x70c7, + amxorw_op = 0x70c8, + amxord_op = 0x70c9, +}; + +enum reg3sa2_op { + alslw_op = 0x02, + alslwu_op = 0x03, + alsld_op = 0x16, +}; + struct reg0i26_format { unsigned int immediate_h : 10; unsigned int immediate_l : 16; @@ -71,6 +183,26 @@ struct reg1i21_format { unsigned int opcode : 6; }; +struct reg2_format { + unsigned int rd : 5; + unsigned int rj : 5; + unsigned int opcode : 22; +}; + +struct reg2i5_format { + unsigned int rd : 5; + unsigned int rj : 5; + unsigned int immediate : 5; + unsigned int opcode : 17; +}; + +struct reg2i6_format { + unsigned int rd : 5; + unsigned int rj : 5; + unsigned int immediate : 6; + unsigned int opcode : 16; +}; + struct reg2i12_format { unsigned int rd : 5; unsigned int rj : 5; @@ -78,6 +210,13 @@ struct reg2i12_format { unsigned int opcode : 10; }; +struct reg2i14_format { + unsigned int rd : 5; + unsigned int rj : 5; + unsigned int immediate : 14; + unsigned int opcode : 8; 
+}; + struct reg2i16_format { unsigned int rd : 5; unsigned int rj : 5; @@ -85,13 +224,43 @@ struct reg2i16_format { unsigned int opcode : 6; }; +struct reg2bstrd_format { + unsigned int rd : 5; + unsigned int rj : 5; + unsigned int lsbd : 6; + unsigned int msbd : 6; + unsigned int opcode : 10; +}; + +struct reg3_format { + unsigned int rd : 5; + unsigned int rj : 5; + unsigned int rk : 5; + unsigned int opcode : 17; +}; + +struct reg3sa2_format { + unsigned int rd : 5; + unsigned int rj : 5; + unsigned int rk : 5; + unsigned int immediate : 2; + unsigned int opcode : 15; +}; + union loongarch_instruction { unsigned int word; - struct reg0i26_format reg0i26_format; - struct reg1i20_format reg1i20_format; - struct reg1i21_format reg1i21_format; - struct reg2i12_format reg2i12_format; - struct reg2i16_format reg2i16_format; + struct reg0i26_format reg0i26_format; + struct reg1i20_format reg1i20_format; + struct reg1i21_format reg1i21_format; + struct reg2_format reg2_format; + struct reg2i5_format reg2i5_format; + struct reg2i6_format reg2i6_format; + struct reg2i12_format reg2i12_format; + struct reg2i14_format reg2i14_format; + struct reg2i16_format reg2i16_format; + struct reg2bstrd_format reg2bstrd_format; + struct reg3_format reg3_format; + struct reg3sa2_format reg3sa2_format; }; #define LOONGARCH_INSN_SIZE sizeof(union loongarch_instruction) -- GitLab From 5dc615520c4dfb358245680f1904bad61116648e Mon Sep 17 00:00:00 2001 From: Tiezhu Yang <yangtiezhu@loongson.cn> Date: Wed, 12 Oct 2022 16:36:20 +0800 Subject: [PATCH 1808/2223] LoongArch: Add BPF JIT support BPF programs are normally handled by a BPF interpreter, add BPF JIT support for LoongArch to allow the kernel to generate native code when a program is loaded into the kernel. This will significantly speed-up processing of BPF programs. 
Co-developed-by: Youling Tang <tangyouling@loongson.cn> Signed-off-by: Youling Tang <tangyouling@loongson.cn> Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/Kbuild | 1 + arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/inst.h | 221 +++ .../include/uapi/asm/bpf_perf_event.h | 9 + arch/loongarch/net/Makefile | 7 + arch/loongarch/net/bpf_jit.c | 1179 +++++++++++++++++ arch/loongarch/net/bpf_jit.h | 282 ++++ 7 files changed, 1700 insertions(+) create mode 100644 arch/loongarch/include/uapi/asm/bpf_perf_event.h create mode 100644 arch/loongarch/net/Makefile create mode 100644 arch/loongarch/net/bpf_jit.c create mode 100644 arch/loongarch/net/bpf_jit.h diff --git a/arch/loongarch/Kbuild b/arch/loongarch/Kbuild index ab5373d0a24ff..b01f5cdb27e03 100644 --- a/arch/loongarch/Kbuild +++ b/arch/loongarch/Kbuild @@ -1,5 +1,6 @@ obj-y += kernel/ obj-y += mm/ +obj-y += net/ obj-y += vdso/ # for cleaning diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 2837ac5413c08..5c7c2a7762b79 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -87,6 +87,7 @@ config LOONGARCH select HAVE_CONTEXT_TRACKING_USER select HAVE_DEBUG_STACKOVERFLOW select HAVE_DMA_CONTIGUOUS + select HAVE_EBPF_JIT select HAVE_EXIT_THREAD select HAVE_FAST_GUP select HAVE_GENERIC_VDSO diff --git a/arch/loongarch/include/asm/inst.h b/arch/loongarch/include/asm/inst.h index 63df7efff2766..fce1843ceebb3 100644 --- a/arch/loongarch/include/asm/inst.h +++ b/arch/loongarch/include/asm/inst.h @@ -345,4 +345,225 @@ static inline bool unsigned_imm_check(unsigned long val, unsigned int bit) return val < (1UL << bit); } +#define DEF_EMIT_REG0I26_FORMAT(NAME, OP) \ +static inline void emit_##NAME(union loongarch_instruction *insn, \ + int offset) \ +{ \ + unsigned int immediate_l, immediate_h; \ + \ + immediate_l = offset & 0xffff; \ + offset >>= 16; \ + immediate_h = offset & 0x3ff; \ + \ + 
insn->reg0i26_format.opcode = OP; \ + insn->reg0i26_format.immediate_l = immediate_l; \ + insn->reg0i26_format.immediate_h = immediate_h; \ +} + +DEF_EMIT_REG0I26_FORMAT(b, b_op) + +#define DEF_EMIT_REG1I20_FORMAT(NAME, OP) \ +static inline void emit_##NAME(union loongarch_instruction *insn, \ + enum loongarch_gpr rd, int imm) \ +{ \ + insn->reg1i20_format.opcode = OP; \ + insn->reg1i20_format.immediate = imm; \ + insn->reg1i20_format.rd = rd; \ +} + +DEF_EMIT_REG1I20_FORMAT(lu12iw, lu12iw_op) +DEF_EMIT_REG1I20_FORMAT(lu32id, lu32id_op) +DEF_EMIT_REG1I20_FORMAT(pcaddu18i, pcaddu18i_op) + +#define DEF_EMIT_REG2_FORMAT(NAME, OP) \ +static inline void emit_##NAME(union loongarch_instruction *insn, \ + enum loongarch_gpr rd, \ + enum loongarch_gpr rj) \ +{ \ + insn->reg2_format.opcode = OP; \ + insn->reg2_format.rd = rd; \ + insn->reg2_format.rj = rj; \ +} + +DEF_EMIT_REG2_FORMAT(revb2h, revb2h_op) +DEF_EMIT_REG2_FORMAT(revb2w, revb2w_op) +DEF_EMIT_REG2_FORMAT(revbd, revbd_op) + +#define DEF_EMIT_REG2I5_FORMAT(NAME, OP) \ +static inline void emit_##NAME(union loongarch_instruction *insn, \ + enum loongarch_gpr rd, \ + enum loongarch_gpr rj, \ + int imm) \ +{ \ + insn->reg2i5_format.opcode = OP; \ + insn->reg2i5_format.immediate = imm; \ + insn->reg2i5_format.rd = rd; \ + insn->reg2i5_format.rj = rj; \ +} + +DEF_EMIT_REG2I5_FORMAT(slliw, slliw_op) +DEF_EMIT_REG2I5_FORMAT(srliw, srliw_op) +DEF_EMIT_REG2I5_FORMAT(sraiw, sraiw_op) + +#define DEF_EMIT_REG2I6_FORMAT(NAME, OP) \ +static inline void emit_##NAME(union loongarch_instruction *insn, \ + enum loongarch_gpr rd, \ + enum loongarch_gpr rj, \ + int imm) \ +{ \ + insn->reg2i6_format.opcode = OP; \ + insn->reg2i6_format.immediate = imm; \ + insn->reg2i6_format.rd = rd; \ + insn->reg2i6_format.rj = rj; \ +} + +DEF_EMIT_REG2I6_FORMAT(sllid, sllid_op) +DEF_EMIT_REG2I6_FORMAT(srlid, srlid_op) +DEF_EMIT_REG2I6_FORMAT(sraid, sraid_op) + +#define DEF_EMIT_REG2I12_FORMAT(NAME, OP) \ +static inline void emit_##NAME(union 
loongarch_instruction *insn, \ + enum loongarch_gpr rd, \ + enum loongarch_gpr rj, \ + int imm) \ +{ \ + insn->reg2i12_format.opcode = OP; \ + insn->reg2i12_format.immediate = imm; \ + insn->reg2i12_format.rd = rd; \ + insn->reg2i12_format.rj = rj; \ +} + +DEF_EMIT_REG2I12_FORMAT(addiw, addiw_op) +DEF_EMIT_REG2I12_FORMAT(addid, addid_op) +DEF_EMIT_REG2I12_FORMAT(lu52id, lu52id_op) +DEF_EMIT_REG2I12_FORMAT(andi, andi_op) +DEF_EMIT_REG2I12_FORMAT(ori, ori_op) +DEF_EMIT_REG2I12_FORMAT(xori, xori_op) +DEF_EMIT_REG2I12_FORMAT(ldbu, ldbu_op) +DEF_EMIT_REG2I12_FORMAT(ldhu, ldhu_op) +DEF_EMIT_REG2I12_FORMAT(ldwu, ldwu_op) +DEF_EMIT_REG2I12_FORMAT(ldd, ldd_op) +DEF_EMIT_REG2I12_FORMAT(stb, stb_op) +DEF_EMIT_REG2I12_FORMAT(sth, sth_op) +DEF_EMIT_REG2I12_FORMAT(stw, stw_op) +DEF_EMIT_REG2I12_FORMAT(std, std_op) + +#define DEF_EMIT_REG2I14_FORMAT(NAME, OP) \ +static inline void emit_##NAME(union loongarch_instruction *insn, \ + enum loongarch_gpr rd, \ + enum loongarch_gpr rj, \ + int imm) \ +{ \ + insn->reg2i14_format.opcode = OP; \ + insn->reg2i14_format.immediate = imm; \ + insn->reg2i14_format.rd = rd; \ + insn->reg2i14_format.rj = rj; \ +} + +DEF_EMIT_REG2I14_FORMAT(llw, llw_op) +DEF_EMIT_REG2I14_FORMAT(scw, scw_op) +DEF_EMIT_REG2I14_FORMAT(lld, lld_op) +DEF_EMIT_REG2I14_FORMAT(scd, scd_op) +DEF_EMIT_REG2I14_FORMAT(ldptrw, ldptrw_op) +DEF_EMIT_REG2I14_FORMAT(stptrw, stptrw_op) +DEF_EMIT_REG2I14_FORMAT(ldptrd, ldptrd_op) +DEF_EMIT_REG2I14_FORMAT(stptrd, stptrd_op) + +#define DEF_EMIT_REG2I16_FORMAT(NAME, OP) \ +static inline void emit_##NAME(union loongarch_instruction *insn, \ + enum loongarch_gpr rj, \ + enum loongarch_gpr rd, \ + int offset) \ +{ \ + insn->reg2i16_format.opcode = OP; \ + insn->reg2i16_format.immediate = offset; \ + insn->reg2i16_format.rj = rj; \ + insn->reg2i16_format.rd = rd; \ +} + +DEF_EMIT_REG2I16_FORMAT(beq, beq_op) +DEF_EMIT_REG2I16_FORMAT(bne, bne_op) +DEF_EMIT_REG2I16_FORMAT(blt, blt_op) +DEF_EMIT_REG2I16_FORMAT(bge, bge_op) 
+DEF_EMIT_REG2I16_FORMAT(bltu, bltu_op) +DEF_EMIT_REG2I16_FORMAT(bgeu, bgeu_op) +DEF_EMIT_REG2I16_FORMAT(jirl, jirl_op) + +#define DEF_EMIT_REG2BSTRD_FORMAT(NAME, OP) \ +static inline void emit_##NAME(union loongarch_instruction *insn, \ + enum loongarch_gpr rd, \ + enum loongarch_gpr rj, \ + int msbd, \ + int lsbd) \ +{ \ + insn->reg2bstrd_format.opcode = OP; \ + insn->reg2bstrd_format.msbd = msbd; \ + insn->reg2bstrd_format.lsbd = lsbd; \ + insn->reg2bstrd_format.rj = rj; \ + insn->reg2bstrd_format.rd = rd; \ +} + +DEF_EMIT_REG2BSTRD_FORMAT(bstrpickd, bstrpickd_op) + +#define DEF_EMIT_REG3_FORMAT(NAME, OP) \ +static inline void emit_##NAME(union loongarch_instruction *insn, \ + enum loongarch_gpr rd, \ + enum loongarch_gpr rj, \ + enum loongarch_gpr rk) \ +{ \ + insn->reg3_format.opcode = OP; \ + insn->reg3_format.rd = rd; \ + insn->reg3_format.rj = rj; \ + insn->reg3_format.rk = rk; \ +} + +DEF_EMIT_REG3_FORMAT(addd, addd_op) +DEF_EMIT_REG3_FORMAT(subd, subd_op) +DEF_EMIT_REG3_FORMAT(muld, muld_op) +DEF_EMIT_REG3_FORMAT(divdu, divdu_op) +DEF_EMIT_REG3_FORMAT(moddu, moddu_op) +DEF_EMIT_REG3_FORMAT(and, and_op) +DEF_EMIT_REG3_FORMAT(or, or_op) +DEF_EMIT_REG3_FORMAT(xor, xor_op) +DEF_EMIT_REG3_FORMAT(sllw, sllw_op) +DEF_EMIT_REG3_FORMAT(slld, slld_op) +DEF_EMIT_REG3_FORMAT(srlw, srlw_op) +DEF_EMIT_REG3_FORMAT(srld, srld_op) +DEF_EMIT_REG3_FORMAT(sraw, sraw_op) +DEF_EMIT_REG3_FORMAT(srad, srad_op) +DEF_EMIT_REG3_FORMAT(ldxbu, ldxbu_op) +DEF_EMIT_REG3_FORMAT(ldxhu, ldxhu_op) +DEF_EMIT_REG3_FORMAT(ldxwu, ldxwu_op) +DEF_EMIT_REG3_FORMAT(ldxd, ldxd_op) +DEF_EMIT_REG3_FORMAT(stxb, stxb_op) +DEF_EMIT_REG3_FORMAT(stxh, stxh_op) +DEF_EMIT_REG3_FORMAT(stxw, stxw_op) +DEF_EMIT_REG3_FORMAT(stxd, stxd_op) +DEF_EMIT_REG3_FORMAT(amaddw, amaddw_op) +DEF_EMIT_REG3_FORMAT(amaddd, amaddd_op) +DEF_EMIT_REG3_FORMAT(amandw, amandw_op) +DEF_EMIT_REG3_FORMAT(amandd, amandd_op) +DEF_EMIT_REG3_FORMAT(amorw, amorw_op) +DEF_EMIT_REG3_FORMAT(amord, amord_op) +DEF_EMIT_REG3_FORMAT(amxorw, 
amxorw_op) +DEF_EMIT_REG3_FORMAT(amxord, amxord_op) +DEF_EMIT_REG3_FORMAT(amswapw, amswapw_op) +DEF_EMIT_REG3_FORMAT(amswapd, amswapd_op) + +#define DEF_EMIT_REG3SA2_FORMAT(NAME, OP) \ +static inline void emit_##NAME(union loongarch_instruction *insn, \ + enum loongarch_gpr rd, \ + enum loongarch_gpr rj, \ + enum loongarch_gpr rk, \ + int imm) \ +{ \ + insn->reg3sa2_format.opcode = OP; \ + insn->reg3sa2_format.immediate = imm; \ + insn->reg3sa2_format.rd = rd; \ + insn->reg3sa2_format.rj = rj; \ + insn->reg3sa2_format.rk = rk; \ +} + +DEF_EMIT_REG3SA2_FORMAT(alsld, alsld_op) + #endif /* _ASM_INST_H */ diff --git a/arch/loongarch/include/uapi/asm/bpf_perf_event.h b/arch/loongarch/include/uapi/asm/bpf_perf_event.h new file mode 100644 index 0000000000000..eb6e2fd2a1f01 --- /dev/null +++ b/arch/loongarch/include/uapi/asm/bpf_perf_event.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__ASM_BPF_PERF_EVENT_H__ +#define _UAPI__ASM_BPF_PERF_EVENT_H__ + +#include <linux/ptrace.h> + +typedef struct user_pt_regs bpf_user_pt_regs_t; + +#endif /* _UAPI__ASM_BPF_PERF_EVENT_H__ */ diff --git a/arch/loongarch/net/Makefile b/arch/loongarch/net/Makefile new file mode 100644 index 0000000000000..1ec12a0c324a5 --- /dev/null +++ b/arch/loongarch/net/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for arch/loongarch/net +# +# Copyright (C) 2022 Loongson Technology Corporation Limited +# +obj-$(CONFIG_BPF_JIT) += bpf_jit.o diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c new file mode 100644 index 0000000000000..43f0a98efe380 --- /dev/null +++ b/arch/loongarch/net/bpf_jit.c @@ -0,0 +1,1179 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * BPF JIT compiler for LoongArch + * + * Copyright (C) 2022 Loongson Technology Corporation Limited + */ +#include "bpf_jit.h" + +#define REG_TCC LOONGARCH_GPR_A6 +#define TCC_SAVED LOONGARCH_GPR_S5 + +#define SAVE_RA BIT(0) +#define SAVE_TCC BIT(1) 
+ +static const int regmap[] = { + /* return value from in-kernel function, and exit value for eBPF program */ + [BPF_REG_0] = LOONGARCH_GPR_A5, + /* arguments from eBPF program to in-kernel function */ + [BPF_REG_1] = LOONGARCH_GPR_A0, + [BPF_REG_2] = LOONGARCH_GPR_A1, + [BPF_REG_3] = LOONGARCH_GPR_A2, + [BPF_REG_4] = LOONGARCH_GPR_A3, + [BPF_REG_5] = LOONGARCH_GPR_A4, + /* callee saved registers that in-kernel function will preserve */ + [BPF_REG_6] = LOONGARCH_GPR_S0, + [BPF_REG_7] = LOONGARCH_GPR_S1, + [BPF_REG_8] = LOONGARCH_GPR_S2, + [BPF_REG_9] = LOONGARCH_GPR_S3, + /* read-only frame pointer to access stack */ + [BPF_REG_FP] = LOONGARCH_GPR_S4, + /* temporary register for blinding constants */ + [BPF_REG_AX] = LOONGARCH_GPR_T0, +}; + +static void mark_call(struct jit_ctx *ctx) +{ + ctx->flags |= SAVE_RA; +} + +static void mark_tail_call(struct jit_ctx *ctx) +{ + ctx->flags |= SAVE_TCC; +} + +static bool seen_call(struct jit_ctx *ctx) +{ + return (ctx->flags & SAVE_RA); +} + +static bool seen_tail_call(struct jit_ctx *ctx) +{ + return (ctx->flags & SAVE_TCC); +} + +static u8 tail_call_reg(struct jit_ctx *ctx) +{ + if (seen_call(ctx)) + return TCC_SAVED; + + return REG_TCC; +} + +/* + * eBPF prog stack layout: + * + * high + * original $sp ------------> +-------------------------+ <--LOONGARCH_GPR_FP + * | $ra | + * +-------------------------+ + * | $fp | + * +-------------------------+ + * | $s0 | + * +-------------------------+ + * | $s1 | + * +-------------------------+ + * | $s2 | + * +-------------------------+ + * | $s3 | + * +-------------------------+ + * | $s4 | + * +-------------------------+ + * | $s5 | + * +-------------------------+ <--BPF_REG_FP + * | prog->aux->stack_depth | + * | (optional) | + * current $sp -------------> +-------------------------+ + * low + */ +static void build_prologue(struct jit_ctx *ctx) +{ + int stack_adjust = 0, store_offset, bpf_stack_adjust; + + bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16); + + /* To 
store ra, fp, s0, s1, s2, s3, s4 and s5. */ + stack_adjust += sizeof(long) * 8; + + stack_adjust = round_up(stack_adjust, 16); + stack_adjust += bpf_stack_adjust; + + /* + * First instruction initializes the tail call count (TCC). + * On tail call we skip this instruction, and the TCC is + * passed in REG_TCC from the caller. + */ + emit_insn(ctx, addid, REG_TCC, LOONGARCH_GPR_ZERO, MAX_TAIL_CALL_CNT); + + emit_insn(ctx, addid, LOONGARCH_GPR_SP, LOONGARCH_GPR_SP, -stack_adjust); + + store_offset = stack_adjust - sizeof(long); + emit_insn(ctx, std, LOONGARCH_GPR_RA, LOONGARCH_GPR_SP, store_offset); + + store_offset -= sizeof(long); + emit_insn(ctx, std, LOONGARCH_GPR_FP, LOONGARCH_GPR_SP, store_offset); + + store_offset -= sizeof(long); + emit_insn(ctx, std, LOONGARCH_GPR_S0, LOONGARCH_GPR_SP, store_offset); + + store_offset -= sizeof(long); + emit_insn(ctx, std, LOONGARCH_GPR_S1, LOONGARCH_GPR_SP, store_offset); + + store_offset -= sizeof(long); + emit_insn(ctx, std, LOONGARCH_GPR_S2, LOONGARCH_GPR_SP, store_offset); + + store_offset -= sizeof(long); + emit_insn(ctx, std, LOONGARCH_GPR_S3, LOONGARCH_GPR_SP, store_offset); + + store_offset -= sizeof(long); + emit_insn(ctx, std, LOONGARCH_GPR_S4, LOONGARCH_GPR_SP, store_offset); + + store_offset -= sizeof(long); + emit_insn(ctx, std, LOONGARCH_GPR_S5, LOONGARCH_GPR_SP, store_offset); + + emit_insn(ctx, addid, LOONGARCH_GPR_FP, LOONGARCH_GPR_SP, stack_adjust); + + if (bpf_stack_adjust) + emit_insn(ctx, addid, regmap[BPF_REG_FP], LOONGARCH_GPR_SP, bpf_stack_adjust); + + /* + * Program contains calls and tail calls, so REG_TCC need + * to be saved across calls. 
+ */ + if (seen_tail_call(ctx) && seen_call(ctx)) + move_reg(ctx, TCC_SAVED, REG_TCC); + + ctx->stack_size = stack_adjust; +} + +static void __build_epilogue(struct jit_ctx *ctx, bool is_tail_call) +{ + int stack_adjust = ctx->stack_size; + int load_offset; + + load_offset = stack_adjust - sizeof(long); + emit_insn(ctx, ldd, LOONGARCH_GPR_RA, LOONGARCH_GPR_SP, load_offset); + + load_offset -= sizeof(long); + emit_insn(ctx, ldd, LOONGARCH_GPR_FP, LOONGARCH_GPR_SP, load_offset); + + load_offset -= sizeof(long); + emit_insn(ctx, ldd, LOONGARCH_GPR_S0, LOONGARCH_GPR_SP, load_offset); + + load_offset -= sizeof(long); + emit_insn(ctx, ldd, LOONGARCH_GPR_S1, LOONGARCH_GPR_SP, load_offset); + + load_offset -= sizeof(long); + emit_insn(ctx, ldd, LOONGARCH_GPR_S2, LOONGARCH_GPR_SP, load_offset); + + load_offset -= sizeof(long); + emit_insn(ctx, ldd, LOONGARCH_GPR_S3, LOONGARCH_GPR_SP, load_offset); + + load_offset -= sizeof(long); + emit_insn(ctx, ldd, LOONGARCH_GPR_S4, LOONGARCH_GPR_SP, load_offset); + + load_offset -= sizeof(long); + emit_insn(ctx, ldd, LOONGARCH_GPR_S5, LOONGARCH_GPR_SP, load_offset); + + emit_insn(ctx, addid, LOONGARCH_GPR_SP, LOONGARCH_GPR_SP, stack_adjust); + + if (!is_tail_call) { + /* Set return value */ + move_reg(ctx, LOONGARCH_GPR_A0, regmap[BPF_REG_0]); + /* Return to the caller */ + emit_insn(ctx, jirl, LOONGARCH_GPR_RA, LOONGARCH_GPR_ZERO, 0); + } else { + /* + * Call the next bpf prog and skip the first instruction + * of TCC initialization. 
+ */ + emit_insn(ctx, jirl, LOONGARCH_GPR_T3, LOONGARCH_GPR_ZERO, 1); + } +} + +static void build_epilogue(struct jit_ctx *ctx) +{ + __build_epilogue(ctx, false); +} + +bool bpf_jit_supports_kfunc_call(void) +{ + return true; +} + +/* initialized on the first pass of build_body() */ +static int out_offset = -1; +static int emit_bpf_tail_call(struct jit_ctx *ctx) +{ + int off; + u8 tcc = tail_call_reg(ctx); + u8 a1 = LOONGARCH_GPR_A1; + u8 a2 = LOONGARCH_GPR_A2; + u8 t1 = LOONGARCH_GPR_T1; + u8 t2 = LOONGARCH_GPR_T2; + u8 t3 = LOONGARCH_GPR_T3; + const int idx0 = ctx->idx; + +#define cur_offset (ctx->idx - idx0) +#define jmp_offset (out_offset - (cur_offset)) + + /* + * a0: &ctx + * a1: &array + * a2: index + * + * if (index >= array->map.max_entries) + * goto out; + */ + off = offsetof(struct bpf_array, map.max_entries); + emit_insn(ctx, ldwu, t1, a1, off); + /* bgeu $a2, $t1, jmp_offset */ + if (emit_tailcall_jmp(ctx, BPF_JGE, a2, t1, jmp_offset) < 0) + goto toofar; + + /* + * if (--TCC < 0) + * goto out; + */ + emit_insn(ctx, addid, REG_TCC, tcc, -1); + if (emit_tailcall_jmp(ctx, BPF_JSLT, REG_TCC, LOONGARCH_GPR_ZERO, jmp_offset) < 0) + goto toofar; + + /* + * prog = array->ptrs[index]; + * if (!prog) + * goto out; + */ + emit_insn(ctx, alsld, t2, a2, a1, 2); + off = offsetof(struct bpf_array, ptrs); + emit_insn(ctx, ldd, t2, t2, off); + /* beq $t2, $zero, jmp_offset */ + if (emit_tailcall_jmp(ctx, BPF_JEQ, t2, LOONGARCH_GPR_ZERO, jmp_offset) < 0) + goto toofar; + + /* goto *(prog->bpf_func + 4); */ + off = offsetof(struct bpf_prog, bpf_func); + emit_insn(ctx, ldd, t3, t2, off); + __build_epilogue(ctx, true); + + /* out: */ + if (out_offset == -1) + out_offset = cur_offset; + if (cur_offset != out_offset) { + pr_err_once("tail_call out_offset = %d, expected %d!\n", + cur_offset, out_offset); + return -1; + } + + return 0; + +toofar: + pr_info_once("tail_call: jump too far\n"); + return -1; +#undef cur_offset +#undef jmp_offset +} + +static void emit_atomic(const 
struct bpf_insn *insn, struct jit_ctx *ctx) +{ + const u8 t1 = LOONGARCH_GPR_T1; + const u8 t2 = LOONGARCH_GPR_T2; + const u8 t3 = LOONGARCH_GPR_T3; + const u8 src = regmap[insn->src_reg]; + const u8 dst = regmap[insn->dst_reg]; + const s16 off = insn->off; + const s32 imm = insn->imm; + const bool isdw = BPF_SIZE(insn->code) == BPF_DW; + + move_imm(ctx, t1, off, false); + emit_insn(ctx, addd, t1, dst, t1); + move_reg(ctx, t3, src); + + switch (imm) { + /* lock *(size *)(dst + off) <op>= src */ + case BPF_ADD: + if (isdw) + emit_insn(ctx, amaddd, t2, t1, src); + else + emit_insn(ctx, amaddw, t2, t1, src); + break; + case BPF_AND: + if (isdw) + emit_insn(ctx, amandd, t2, t1, src); + else + emit_insn(ctx, amandw, t2, t1, src); + break; + case BPF_OR: + if (isdw) + emit_insn(ctx, amord, t2, t1, src); + else + emit_insn(ctx, amorw, t2, t1, src); + break; + case BPF_XOR: + if (isdw) + emit_insn(ctx, amxord, t2, t1, src); + else + emit_insn(ctx, amxorw, t2, t1, src); + break; + /* src = atomic_fetch_<op>(dst + off, src) */ + case BPF_ADD | BPF_FETCH: + if (isdw) { + emit_insn(ctx, amaddd, src, t1, t3); + } else { + emit_insn(ctx, amaddw, src, t1, t3); + emit_zext_32(ctx, src, true); + } + break; + case BPF_AND | BPF_FETCH: + if (isdw) { + emit_insn(ctx, amandd, src, t1, t3); + } else { + emit_insn(ctx, amandw, src, t1, t3); + emit_zext_32(ctx, src, true); + } + break; + case BPF_OR | BPF_FETCH: + if (isdw) { + emit_insn(ctx, amord, src, t1, t3); + } else { + emit_insn(ctx, amorw, src, t1, t3); + emit_zext_32(ctx, src, true); + } + break; + case BPF_XOR | BPF_FETCH: + if (isdw) { + emit_insn(ctx, amxord, src, t1, t3); + } else { + emit_insn(ctx, amxorw, src, t1, t3); + emit_zext_32(ctx, src, true); + } + break; + /* src = atomic_xchg(dst + off, src); */ + case BPF_XCHG: + if (isdw) { + emit_insn(ctx, amswapd, src, t1, t3); + } else { + emit_insn(ctx, amswapw, src, t1, t3); + emit_zext_32(ctx, src, true); + } + break; + /* r0 = atomic_cmpxchg(dst + off, r0, src); */ + case 
BPF_CMPXCHG: + u8 r0 = regmap[BPF_REG_0]; + + move_reg(ctx, t2, r0); + if (isdw) { + emit_insn(ctx, lld, r0, t1, 0); + emit_insn(ctx, bne, t2, r0, 4); + move_reg(ctx, t3, src); + emit_insn(ctx, scd, t3, t1, 0); + emit_insn(ctx, beq, t3, LOONGARCH_GPR_ZERO, -4); + } else { + emit_insn(ctx, llw, r0, t1, 0); + emit_zext_32(ctx, t2, true); + emit_zext_32(ctx, r0, true); + emit_insn(ctx, bne, t2, r0, 4); + move_reg(ctx, t3, src); + emit_insn(ctx, scw, t3, t1, 0); + emit_insn(ctx, beq, t3, LOONGARCH_GPR_ZERO, -6); + emit_zext_32(ctx, r0, true); + } + break; + } +} + +static bool is_signed_bpf_cond(u8 cond) +{ + return cond == BPF_JSGT || cond == BPF_JSLT || + cond == BPF_JSGE || cond == BPF_JSLE; +} + +static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, bool extra_pass) +{ + const bool is32 = BPF_CLASS(insn->code) == BPF_ALU || + BPF_CLASS(insn->code) == BPF_JMP32; + const u8 code = insn->code; + const u8 cond = BPF_OP(code); + const u8 t1 = LOONGARCH_GPR_T1; + const u8 t2 = LOONGARCH_GPR_T2; + const u8 src = regmap[insn->src_reg]; + const u8 dst = regmap[insn->dst_reg]; + const s16 off = insn->off; + const s32 imm = insn->imm; + int jmp_offset; + int i = insn - ctx->prog->insnsi; + + switch (code) { + /* dst = src */ + case BPF_ALU | BPF_MOV | BPF_X: + case BPF_ALU64 | BPF_MOV | BPF_X: + move_reg(ctx, dst, src); + emit_zext_32(ctx, dst, is32); + break; + + /* dst = imm */ + case BPF_ALU | BPF_MOV | BPF_K: + case BPF_ALU64 | BPF_MOV | BPF_K: + move_imm(ctx, dst, imm, is32); + break; + + /* dst = dst + src */ + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU64 | BPF_ADD | BPF_X: + emit_insn(ctx, addd, dst, dst, src); + emit_zext_32(ctx, dst, is32); + break; + + /* dst = dst + imm */ + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU64 | BPF_ADD | BPF_K: + if (is_signed_imm12(imm)) { + emit_insn(ctx, addid, dst, dst, imm); + } else { + move_imm(ctx, t1, imm, is32); + emit_insn(ctx, addd, dst, dst, t1); + } + emit_zext_32(ctx, dst, is32); + break; + + /* 
dst = dst - src */ + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU64 | BPF_SUB | BPF_X: + emit_insn(ctx, subd, dst, dst, src); + emit_zext_32(ctx, dst, is32); + break; + + /* dst = dst - imm */ + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU64 | BPF_SUB | BPF_K: + if (is_signed_imm12(-imm)) { + emit_insn(ctx, addid, dst, dst, -imm); + } else { + move_imm(ctx, t1, imm, is32); + emit_insn(ctx, subd, dst, dst, t1); + } + emit_zext_32(ctx, dst, is32); + break; + + /* dst = dst * src */ + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU64 | BPF_MUL | BPF_X: + emit_insn(ctx, muld, dst, dst, src); + emit_zext_32(ctx, dst, is32); + break; + + /* dst = dst * imm */ + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU64 | BPF_MUL | BPF_K: + move_imm(ctx, t1, imm, is32); + emit_insn(ctx, muld, dst, dst, t1); + emit_zext_32(ctx, dst, is32); + break; + + /* dst = dst / src */ + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU64 | BPF_DIV | BPF_X: + emit_zext_32(ctx, dst, is32); + move_reg(ctx, t1, src); + emit_zext_32(ctx, t1, is32); + emit_insn(ctx, divdu, dst, dst, t1); + emit_zext_32(ctx, dst, is32); + break; + + /* dst = dst / imm */ + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU64 | BPF_DIV | BPF_K: + move_imm(ctx, t1, imm, is32); + emit_zext_32(ctx, dst, is32); + emit_insn(ctx, divdu, dst, dst, t1); + emit_zext_32(ctx, dst, is32); + break; + + /* dst = dst % src */ + case BPF_ALU | BPF_MOD | BPF_X: + case BPF_ALU64 | BPF_MOD | BPF_X: + emit_zext_32(ctx, dst, is32); + move_reg(ctx, t1, src); + emit_zext_32(ctx, t1, is32); + emit_insn(ctx, moddu, dst, dst, t1); + emit_zext_32(ctx, dst, is32); + break; + + /* dst = dst % imm */ + case BPF_ALU | BPF_MOD | BPF_K: + case BPF_ALU64 | BPF_MOD | BPF_K: + move_imm(ctx, t1, imm, is32); + emit_zext_32(ctx, dst, is32); + emit_insn(ctx, moddu, dst, dst, t1); + emit_zext_32(ctx, dst, is32); + break; + + /* dst = -dst */ + case BPF_ALU | BPF_NEG: + case BPF_ALU64 | BPF_NEG: + move_imm(ctx, t1, imm, is32); + emit_insn(ctx, subd, dst, 
LOONGARCH_GPR_ZERO, dst); + emit_zext_32(ctx, dst, is32); + break; + + /* dst = dst & src */ + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU64 | BPF_AND | BPF_X: + emit_insn(ctx, and, dst, dst, src); + emit_zext_32(ctx, dst, is32); + break; + + /* dst = dst & imm */ + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU64 | BPF_AND | BPF_K: + if (is_unsigned_imm12(imm)) { + emit_insn(ctx, andi, dst, dst, imm); + } else { + move_imm(ctx, t1, imm, is32); + emit_insn(ctx, and, dst, dst, t1); + } + emit_zext_32(ctx, dst, is32); + break; + + /* dst = dst | src */ + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU64 | BPF_OR | BPF_X: + emit_insn(ctx, or, dst, dst, src); + emit_zext_32(ctx, dst, is32); + break; + + /* dst = dst | imm */ + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU64 | BPF_OR | BPF_K: + if (is_unsigned_imm12(imm)) { + emit_insn(ctx, ori, dst, dst, imm); + } else { + move_imm(ctx, t1, imm, is32); + emit_insn(ctx, or, dst, dst, t1); + } + emit_zext_32(ctx, dst, is32); + break; + + /* dst = dst ^ src */ + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU64 | BPF_XOR | BPF_X: + emit_insn(ctx, xor, dst, dst, src); + emit_zext_32(ctx, dst, is32); + break; + + /* dst = dst ^ imm */ + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU64 | BPF_XOR | BPF_K: + if (is_unsigned_imm12(imm)) { + emit_insn(ctx, xori, dst, dst, imm); + } else { + move_imm(ctx, t1, imm, is32); + emit_insn(ctx, xor, dst, dst, t1); + } + emit_zext_32(ctx, dst, is32); + break; + + /* dst = dst << src (logical) */ + case BPF_ALU | BPF_LSH | BPF_X: + emit_insn(ctx, sllw, dst, dst, src); + emit_zext_32(ctx, dst, is32); + break; + + case BPF_ALU64 | BPF_LSH | BPF_X: + emit_insn(ctx, slld, dst, dst, src); + break; + + /* dst = dst << imm (logical) */ + case BPF_ALU | BPF_LSH | BPF_K: + emit_insn(ctx, slliw, dst, dst, imm); + emit_zext_32(ctx, dst, is32); + break; + + case BPF_ALU64 | BPF_LSH | BPF_K: + emit_insn(ctx, sllid, dst, dst, imm); + break; + + /* dst = dst >> src (logical) */ + case BPF_ALU | BPF_RSH 
| BPF_X: + emit_insn(ctx, srlw, dst, dst, src); + emit_zext_32(ctx, dst, is32); + break; + + case BPF_ALU64 | BPF_RSH | BPF_X: + emit_insn(ctx, srld, dst, dst, src); + break; + + /* dst = dst >> imm (logical) */ + case BPF_ALU | BPF_RSH | BPF_K: + emit_insn(ctx, srliw, dst, dst, imm); + emit_zext_32(ctx, dst, is32); + break; + + case BPF_ALU64 | BPF_RSH | BPF_K: + emit_insn(ctx, srlid, dst, dst, imm); + break; + + /* dst = dst >> src (arithmetic) */ + case BPF_ALU | BPF_ARSH | BPF_X: + emit_insn(ctx, sraw, dst, dst, src); + emit_zext_32(ctx, dst, is32); + break; + + case BPF_ALU64 | BPF_ARSH | BPF_X: + emit_insn(ctx, srad, dst, dst, src); + break; + + /* dst = dst >> imm (arithmetic) */ + case BPF_ALU | BPF_ARSH | BPF_K: + emit_insn(ctx, sraiw, dst, dst, imm); + emit_zext_32(ctx, dst, is32); + break; + + case BPF_ALU64 | BPF_ARSH | BPF_K: + emit_insn(ctx, sraid, dst, dst, imm); + break; + + /* dst = BSWAP##imm(dst) */ + case BPF_ALU | BPF_END | BPF_FROM_LE: + switch (imm) { + case 16: + /* zero-extend 16 bits into 64 bits */ + emit_insn(ctx, bstrpickd, dst, dst, 15, 0); + break; + case 32: + /* zero-extend 32 bits into 64 bits */ + emit_zext_32(ctx, dst, is32); + break; + case 64: + /* do nothing */ + break; + } + break; + + case BPF_ALU | BPF_END | BPF_FROM_BE: + switch (imm) { + case 16: + emit_insn(ctx, revb2h, dst, dst); + /* zero-extend 16 bits into 64 bits */ + emit_insn(ctx, bstrpickd, dst, dst, 15, 0); + break; + case 32: + emit_insn(ctx, revb2w, dst, dst); + /* zero-extend 32 bits into 64 bits */ + emit_zext_32(ctx, dst, is32); + break; + case 64: + emit_insn(ctx, revbd, dst, dst); + break; + } + break; + + /* PC += off if dst cond src */ + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JNE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: + case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: + case BPF_JMP 
| BPF_JSLE | BPF_X: + case BPF_JMP32 | BPF_JEQ | BPF_X: + case BPF_JMP32 | BPF_JNE | BPF_X: + case BPF_JMP32 | BPF_JGT | BPF_X: + case BPF_JMP32 | BPF_JGE | BPF_X: + case BPF_JMP32 | BPF_JLT | BPF_X: + case BPF_JMP32 | BPF_JLE | BPF_X: + case BPF_JMP32 | BPF_JSGT | BPF_X: + case BPF_JMP32 | BPF_JSGE | BPF_X: + case BPF_JMP32 | BPF_JSLT | BPF_X: + case BPF_JMP32 | BPF_JSLE | BPF_X: + jmp_offset = bpf2la_offset(i, off, ctx); + move_reg(ctx, t1, dst); + move_reg(ctx, t2, src); + if (is_signed_bpf_cond(BPF_OP(code))) { + emit_sext_32(ctx, t1, is32); + emit_sext_32(ctx, t2, is32); + } else { + emit_zext_32(ctx, t1, is32); + emit_zext_32(ctx, t2, is32); + } + if (emit_cond_jmp(ctx, cond, t1, t2, jmp_offset) < 0) + goto toofar; + break; + + /* PC += off if dst cond imm */ + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JNE | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: + case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: + case BPF_JMP32 | BPF_JEQ | BPF_K: + case BPF_JMP32 | BPF_JNE | BPF_K: + case BPF_JMP32 | BPF_JGT | BPF_K: + case BPF_JMP32 | BPF_JGE | BPF_K: + case BPF_JMP32 | BPF_JLT | BPF_K: + case BPF_JMP32 | BPF_JLE | BPF_K: + case BPF_JMP32 | BPF_JSGT | BPF_K: + case BPF_JMP32 | BPF_JSGE | BPF_K: + case BPF_JMP32 | BPF_JSLT | BPF_K: + case BPF_JMP32 | BPF_JSLE | BPF_K: + u8 t7 = -1; + jmp_offset = bpf2la_offset(i, off, ctx); + if (imm) { + move_imm(ctx, t1, imm, false); + t7 = t1; + } else { + /* If imm is 0, simply use zero register. 
*/ + t7 = LOONGARCH_GPR_ZERO; + } + move_reg(ctx, t2, dst); + if (is_signed_bpf_cond(BPF_OP(code))) { + emit_sext_32(ctx, t7, is32); + emit_sext_32(ctx, t2, is32); + } else { + emit_zext_32(ctx, t7, is32); + emit_zext_32(ctx, t2, is32); + } + if (emit_cond_jmp(ctx, cond, t2, t7, jmp_offset) < 0) + goto toofar; + break; + + /* PC += off if dst & src */ + case BPF_JMP | BPF_JSET | BPF_X: + case BPF_JMP32 | BPF_JSET | BPF_X: + jmp_offset = bpf2la_offset(i, off, ctx); + emit_insn(ctx, and, t1, dst, src); + emit_zext_32(ctx, t1, is32); + if (emit_cond_jmp(ctx, cond, t1, LOONGARCH_GPR_ZERO, jmp_offset) < 0) + goto toofar; + break; + + /* PC += off if dst & imm */ + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP32 | BPF_JSET | BPF_K: + jmp_offset = bpf2la_offset(i, off, ctx); + move_imm(ctx, t1, imm, is32); + emit_insn(ctx, and, t1, dst, t1); + emit_zext_32(ctx, t1, is32); + if (emit_cond_jmp(ctx, cond, t1, LOONGARCH_GPR_ZERO, jmp_offset) < 0) + goto toofar; + break; + + /* PC += off */ + case BPF_JMP | BPF_JA: + jmp_offset = bpf2la_offset(i, off, ctx); + if (emit_uncond_jmp(ctx, jmp_offset) < 0) + goto toofar; + break; + + /* function call */ + case BPF_JMP | BPF_CALL: + int ret; + u64 func_addr; + bool func_addr_fixed; + + mark_call(ctx); + ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass, + &func_addr, &func_addr_fixed); + if (ret < 0) + return ret; + + move_imm(ctx, t1, func_addr, is32); + emit_insn(ctx, jirl, t1, LOONGARCH_GPR_RA, 0); + move_reg(ctx, regmap[BPF_REG_0], LOONGARCH_GPR_A0); + break; + + /* tail call */ + case BPF_JMP | BPF_TAIL_CALL: + mark_tail_call(ctx); + if (emit_bpf_tail_call(ctx) < 0) + return -EINVAL; + break; + + /* function return */ + case BPF_JMP | BPF_EXIT: + emit_sext_32(ctx, regmap[BPF_REG_0], true); + + if (i == ctx->prog->len - 1) + break; + + jmp_offset = epilogue_offset(ctx); + if (emit_uncond_jmp(ctx, jmp_offset) < 0) + goto toofar; + break; + + /* dst = imm64 */ + case BPF_LD | BPF_IMM | BPF_DW: + u64 imm64 = (u64)(insn + 
1)->imm << 32 | (u32)insn->imm; + + move_imm(ctx, dst, imm64, is32); + return 1; + + /* dst = *(size *)(src + off) */ + case BPF_LDX | BPF_MEM | BPF_B: + case BPF_LDX | BPF_MEM | BPF_H: + case BPF_LDX | BPF_MEM | BPF_W: + case BPF_LDX | BPF_MEM | BPF_DW: + switch (BPF_SIZE(code)) { + case BPF_B: + if (is_signed_imm12(off)) { + emit_insn(ctx, ldbu, dst, src, off); + } else { + move_imm(ctx, t1, off, is32); + emit_insn(ctx, ldxbu, dst, src, t1); + } + break; + case BPF_H: + if (is_signed_imm12(off)) { + emit_insn(ctx, ldhu, dst, src, off); + } else { + move_imm(ctx, t1, off, is32); + emit_insn(ctx, ldxhu, dst, src, t1); + } + break; + case BPF_W: + if (is_signed_imm12(off)) { + emit_insn(ctx, ldwu, dst, src, off); + } else if (is_signed_imm14(off)) { + emit_insn(ctx, ldptrw, dst, src, off); + } else { + move_imm(ctx, t1, off, is32); + emit_insn(ctx, ldxwu, dst, src, t1); + } + break; + case BPF_DW: + if (is_signed_imm12(off)) { + emit_insn(ctx, ldd, dst, src, off); + } else if (is_signed_imm14(off)) { + emit_insn(ctx, ldptrd, dst, src, off); + } else { + move_imm(ctx, t1, off, is32); + emit_insn(ctx, ldxd, dst, src, t1); + } + break; + } + break; + + /* *(size *)(dst + off) = imm */ + case BPF_ST | BPF_MEM | BPF_B: + case BPF_ST | BPF_MEM | BPF_H: + case BPF_ST | BPF_MEM | BPF_W: + case BPF_ST | BPF_MEM | BPF_DW: + switch (BPF_SIZE(code)) { + case BPF_B: + move_imm(ctx, t1, imm, is32); + if (is_signed_imm12(off)) { + emit_insn(ctx, stb, t1, dst, off); + } else { + move_imm(ctx, t2, off, is32); + emit_insn(ctx, stxb, t1, dst, t2); + } + break; + case BPF_H: + move_imm(ctx, t1, imm, is32); + if (is_signed_imm12(off)) { + emit_insn(ctx, sth, t1, dst, off); + } else { + move_imm(ctx, t2, off, is32); + emit_insn(ctx, stxh, t1, dst, t2); + } + break; + case BPF_W: + move_imm(ctx, t1, imm, is32); + if (is_signed_imm12(off)) { + emit_insn(ctx, stw, t1, dst, off); + } else if (is_signed_imm14(off)) { + emit_insn(ctx, stptrw, t1, dst, off); + } else { + move_imm(ctx, t2, off, 
is32); + emit_insn(ctx, stxw, t1, dst, t2); + } + break; + case BPF_DW: + move_imm(ctx, t1, imm, is32); + if (is_signed_imm12(off)) { + emit_insn(ctx, std, t1, dst, off); + } else if (is_signed_imm14(off)) { + emit_insn(ctx, stptrd, t1, dst, off); + } else { + move_imm(ctx, t2, off, is32); + emit_insn(ctx, stxd, t1, dst, t2); + } + break; + } + break; + + /* *(size *)(dst + off) = src */ + case BPF_STX | BPF_MEM | BPF_B: + case BPF_STX | BPF_MEM | BPF_H: + case BPF_STX | BPF_MEM | BPF_W: + case BPF_STX | BPF_MEM | BPF_DW: + switch (BPF_SIZE(code)) { + case BPF_B: + if (is_signed_imm12(off)) { + emit_insn(ctx, stb, src, dst, off); + } else { + move_imm(ctx, t1, off, is32); + emit_insn(ctx, stxb, src, dst, t1); + } + break; + case BPF_H: + if (is_signed_imm12(off)) { + emit_insn(ctx, sth, src, dst, off); + } else { + move_imm(ctx, t1, off, is32); + emit_insn(ctx, stxh, src, dst, t1); + } + break; + case BPF_W: + if (is_signed_imm12(off)) { + emit_insn(ctx, stw, src, dst, off); + } else if (is_signed_imm14(off)) { + emit_insn(ctx, stptrw, src, dst, off); + } else { + move_imm(ctx, t1, off, is32); + emit_insn(ctx, stxw, src, dst, t1); + } + break; + case BPF_DW: + if (is_signed_imm12(off)) { + emit_insn(ctx, std, src, dst, off); + } else if (is_signed_imm14(off)) { + emit_insn(ctx, stptrd, src, dst, off); + } else { + move_imm(ctx, t1, off, is32); + emit_insn(ctx, stxd, src, dst, t1); + } + break; + } + break; + + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + emit_atomic(insn, ctx); + break; + + default: + pr_err("bpf_jit: unknown opcode %02x\n", code); + return -EINVAL; + } + + return 0; + +toofar: + pr_info_once("bpf_jit: opcode %02x, jump too far\n", code); + return -E2BIG; +} + +static int build_body(struct jit_ctx *ctx, bool extra_pass) +{ + int i; + const struct bpf_prog *prog = ctx->prog; + + for (i = 0; i < prog->len; i++) { + const struct bpf_insn *insn = &prog->insnsi[i]; + int ret; + + if (ctx->image == NULL) + ctx->offset[i] = 
ctx->idx; + + ret = build_insn(insn, ctx, extra_pass); + if (ret > 0) { + i++; + if (ctx->image == NULL) + ctx->offset[i] = ctx->idx; + continue; + } + if (ret) + return ret; + } + + if (ctx->image == NULL) + ctx->offset[i] = ctx->idx; + + return 0; +} + +/* Fill space with break instructions */ +static void jit_fill_hole(void *area, unsigned int size) +{ + u32 *ptr; + + /* We are guaranteed to have aligned memory */ + for (ptr = area; size >= sizeof(u32); size -= sizeof(u32)) + *ptr++ = INSN_BREAK; +} + +static int validate_code(struct jit_ctx *ctx) +{ + int i; + union loongarch_instruction insn; + + for (i = 0; i < ctx->idx; i++) { + insn = ctx->image[i]; + /* Check INSN_BREAK */ + if (insn.word == INSN_BREAK) + return -1; + } + + return 0; +} + +struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) +{ + bool tmp_blinded = false, extra_pass = false; + u8 *image_ptr; + int image_size; + struct jit_ctx ctx; + struct jit_data *jit_data; + struct bpf_binary_header *header; + struct bpf_prog *tmp, *orig_prog = prog; + + /* + * If BPF JIT was not enabled then we must fall back to + * the interpreter. + */ + if (!prog->jit_requested) + return orig_prog; + + tmp = bpf_jit_blind_constants(prog); + /* + * If blinding was requested and we failed during blinding, + * we must fall back to the interpreter. Otherwise, we save + * the new JITed code. 
+ */ + if (IS_ERR(tmp)) + return orig_prog; + + if (tmp != prog) { + tmp_blinded = true; + prog = tmp; + } + + jit_data = prog->aux->jit_data; + if (!jit_data) { + jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); + if (!jit_data) { + prog = orig_prog; + goto out; + } + prog->aux->jit_data = jit_data; + } + if (jit_data->ctx.offset) { + ctx = jit_data->ctx; + image_ptr = jit_data->image; + header = jit_data->header; + extra_pass = true; + image_size = sizeof(u32) * ctx.idx; + goto skip_init_ctx; + } + + memset(&ctx, 0, sizeof(ctx)); + ctx.prog = prog; + + ctx.offset = kvcalloc(prog->len + 1, sizeof(u32), GFP_KERNEL); + if (ctx.offset == NULL) { + prog = orig_prog; + goto out_offset; + } + + /* 1. Initial fake pass to compute ctx->idx and set ctx->flags */ + build_prologue(&ctx); + if (build_body(&ctx, extra_pass)) { + prog = orig_prog; + goto out_offset; + } + ctx.epilogue_offset = ctx.idx; + build_epilogue(&ctx); + + /* Now we know the actual image size. + * As each LoongArch instruction is of length 32bit, + * we are translating number of JITed intructions into + * the size required to store these JITed code. + */ + image_size = sizeof(u32) * ctx.idx; + /* Now we know the size of the structure to make */ + header = bpf_jit_binary_alloc(image_size, &image_ptr, + sizeof(u32), jit_fill_hole); + if (header == NULL) { + prog = orig_prog; + goto out_offset; + } + + /* 2. Now, the actual pass to generate final JIT code */ + ctx.image = (union loongarch_instruction *)image_ptr; + +skip_init_ctx: + ctx.idx = 0; + + build_prologue(&ctx); + if (build_body(&ctx, extra_pass)) { + bpf_jit_binary_free(header); + prog = orig_prog; + goto out_offset; + } + build_epilogue(&ctx); + + /* 3. 
Extra pass to validate JITed code */ + if (validate_code(&ctx)) { + bpf_jit_binary_free(header); + prog = orig_prog; + goto out_offset; + } + + /* And we're done */ + if (bpf_jit_enable > 1) + bpf_jit_dump(prog->len, image_size, 2, ctx.image); + + /* Update the icache */ + flush_icache_range((unsigned long)header, (unsigned long)(ctx.image + ctx.idx)); + + if (!prog->is_func || extra_pass) { + if (extra_pass && ctx.idx != jit_data->ctx.idx) { + pr_err_once("multi-func JIT bug %d != %d\n", + ctx.idx, jit_data->ctx.idx); + bpf_jit_binary_free(header); + prog->bpf_func = NULL; + prog->jited = 0; + prog->jited_len = 0; + goto out_offset; + } + bpf_jit_binary_lock_ro(header); + } else { + jit_data->ctx = ctx; + jit_data->image = image_ptr; + jit_data->header = header; + } + prog->jited = 1; + prog->jited_len = image_size; + prog->bpf_func = (void *)ctx.image; + + if (!prog->is_func || extra_pass) { + int i; + + /* offset[prog->len] is the size of program */ + for (i = 0; i <= prog->len; i++) + ctx.offset[i] *= LOONGARCH_INSN_SIZE; + bpf_prog_fill_jited_linfo(prog, ctx.offset + 1); + +out_offset: + kvfree(ctx.offset); + kfree(jit_data); + prog->aux->jit_data = NULL; + } + +out: + if (tmp_blinded) + bpf_jit_prog_release_other(prog, prog == orig_prog ? 
tmp : orig_prog); + + out_offset = -1; + + return prog; +} diff --git a/arch/loongarch/net/bpf_jit.h b/arch/loongarch/net/bpf_jit.h new file mode 100644 index 0000000000000..e665ddb0aeb85 --- /dev/null +++ b/arch/loongarch/net/bpf_jit.h @@ -0,0 +1,282 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * BPF JIT compiler for LoongArch + * + * Copyright (C) 2022 Loongson Technology Corporation Limited + */ +#include <linux/bpf.h> +#include <linux/filter.h> +#include <asm/cacheflush.h> +#include <asm/inst.h> + +struct jit_ctx { + const struct bpf_prog *prog; + unsigned int idx; + unsigned int flags; + unsigned int epilogue_offset; + u32 *offset; + union loongarch_instruction *image; + u32 stack_size; +}; + +struct jit_data { + struct bpf_binary_header *header; + u8 *image; + struct jit_ctx ctx; +}; + +#define emit_insn(ctx, func, ...) \ +do { \ + if (ctx->image != NULL) { \ + union loongarch_instruction *insn = &ctx->image[ctx->idx]; \ + emit_##func(insn, ##__VA_ARGS__); \ + } \ + ctx->idx++; \ +} while (0) + +#define is_signed_imm12(val) signed_imm_check(val, 12) +#define is_signed_imm14(val) signed_imm_check(val, 14) +#define is_signed_imm16(val) signed_imm_check(val, 16) +#define is_signed_imm26(val) signed_imm_check(val, 26) +#define is_signed_imm32(val) signed_imm_check(val, 32) +#define is_signed_imm52(val) signed_imm_check(val, 52) +#define is_unsigned_imm12(val) unsigned_imm_check(val, 12) + +static inline int bpf2la_offset(int bpf_insn, int off, const struct jit_ctx *ctx) +{ + /* BPF JMP offset is relative to the next instruction */ + bpf_insn++; + /* + * Whereas LoongArch branch instructions encode the offset + * from the branch itself, so we must subtract 1 from the + * instruction offset. 
+ */ + return (ctx->offset[bpf_insn + off] - (ctx->offset[bpf_insn] - 1)); +} + +static inline int epilogue_offset(const struct jit_ctx *ctx) +{ + int from = ctx->idx; + int to = ctx->epilogue_offset; + + return (to - from); +} + +/* Zero-extend 32 bits into 64 bits */ +static inline void emit_zext_32(struct jit_ctx *ctx, enum loongarch_gpr reg, bool is32) +{ + if (!is32) + return; + + emit_insn(ctx, lu32id, reg, 0); +} + +/* Signed-extend 32 bits into 64 bits */ +static inline void emit_sext_32(struct jit_ctx *ctx, enum loongarch_gpr reg, bool is32) +{ + if (!is32) + return; + + emit_insn(ctx, addiw, reg, reg, 0); +} + +static inline void move_imm(struct jit_ctx *ctx, enum loongarch_gpr rd, long imm, bool is32) +{ + long imm_11_0, imm_31_12, imm_51_32, imm_63_52, imm_51_0, imm_51_31; + + /* or rd, $zero, $zero */ + if (imm == 0) { + emit_insn(ctx, or, rd, LOONGARCH_GPR_ZERO, LOONGARCH_GPR_ZERO); + return; + } + + /* addiw rd, $zero, imm_11_0 */ + if (is_signed_imm12(imm)) { + emit_insn(ctx, addiw, rd, LOONGARCH_GPR_ZERO, imm); + goto zext; + } + + /* ori rd, $zero, imm_11_0 */ + if (is_unsigned_imm12(imm)) { + emit_insn(ctx, ori, rd, LOONGARCH_GPR_ZERO, imm); + goto zext; + } + + /* lu52id rd, $zero, imm_63_52 */ + imm_63_52 = (imm >> 52) & 0xfff; + imm_51_0 = imm & 0xfffffffffffff; + if (imm_63_52 != 0 && imm_51_0 == 0) { + emit_insn(ctx, lu52id, rd, LOONGARCH_GPR_ZERO, imm_63_52); + return; + } + + /* lu12iw rd, imm_31_12 */ + imm_31_12 = (imm >> 12) & 0xfffff; + emit_insn(ctx, lu12iw, rd, imm_31_12); + + /* ori rd, rd, imm_11_0 */ + imm_11_0 = imm & 0xfff; + if (imm_11_0 != 0) + emit_insn(ctx, ori, rd, rd, imm_11_0); + + if (!is_signed_imm32(imm)) { + if (imm_51_0 != 0) { + /* + * If bit[51:31] is all 0 or all 1, + * it means bit[51:32] is sign extended by lu12iw, + * no need to call lu32id to do a new filled operation. 
+			 */
+			imm_51_31 = (imm >> 31) & 0x1fffff;
+			if (imm_51_31 != 0 && imm_51_31 != 0x1fffff) {
+				/* lu32id rd, imm_51_32 (bit[51:31] is neither all 0 nor all 1) */
+				imm_51_32 = (imm >> 32) & 0xfffff;
+				emit_insn(ctx, lu32id, rd, imm_51_32);
+			}
+		}
+
+		/* lu52id rd, rd, imm_63_52 */
+		if (!is_signed_imm52(imm))
+			emit_insn(ctx, lu52id, rd, rd, imm_63_52);
+	}
+
+zext:
+	emit_zext_32(ctx, rd, is32);
+}
+
+static inline void move_reg(struct jit_ctx *ctx, enum loongarch_gpr rd,
+			    enum loongarch_gpr rj)
+{
+	emit_insn(ctx, or, rd, rj, LOONGARCH_GPR_ZERO);
+}
+
+static inline int invert_jmp_cond(u8 cond)
+{
+	switch (cond) {
+	case BPF_JEQ:
+		return BPF_JNE;
+	case BPF_JNE:
+	case BPF_JSET:
+		return BPF_JEQ;
+	case BPF_JGT:
+		return BPF_JLE;
+	case BPF_JGE:
+		return BPF_JLT;
+	case BPF_JLT:
+		return BPF_JGE;
+	case BPF_JLE:
+		return BPF_JGT;
+	case BPF_JSGT:
+		return BPF_JSLE;
+	case BPF_JSGE:
+		return BPF_JSLT;
+	case BPF_JSLT:
+		return BPF_JSGE;
+	case BPF_JSLE:
+		return BPF_JSGT;
+	}
+	return -1;
+}
+
+static inline void cond_jmp_offset(struct jit_ctx *ctx, u8 cond, enum loongarch_gpr rj,
+				   enum loongarch_gpr rd, int jmp_offset)
+{
+	switch (cond) {
+	case BPF_JEQ:
+		/* PC += jmp_offset if rj == rd */
+		emit_insn(ctx, beq, rj, rd, jmp_offset);
+		return;
+	case BPF_JNE:
+	case BPF_JSET:
+		/* PC += jmp_offset if rj != rd */
+		emit_insn(ctx, bne, rj, rd, jmp_offset);
+		return;
+	case BPF_JGT:
+		/* PC += jmp_offset if rj > rd (unsigned) */
+		emit_insn(ctx, bltu, rd, rj, jmp_offset);
+		return;
+	case BPF_JLT:
+		/* PC += jmp_offset if rj < rd (unsigned) */
+		emit_insn(ctx, bltu, rj, rd, jmp_offset);
+		return;
+	case BPF_JGE:
+		/* PC += jmp_offset if rj >= rd (unsigned) */
+		emit_insn(ctx, bgeu, rj, rd, jmp_offset);
+		return;
+	case BPF_JLE:
+		/* PC += jmp_offset if rj <= rd (unsigned) */
+		emit_insn(ctx, bgeu, rd, rj, jmp_offset);
+		return;
+	case BPF_JSGT:
+		/* PC += jmp_offset if rj > rd (signed) */
+		emit_insn(ctx, blt, rd, rj, jmp_offset);
+		return;
+	case BPF_JSLT:
+		/* PC += jmp_offset if rj < rd (signed) */
+ emit_insn(ctx, blt, rj, rd, jmp_offset); + return; + case BPF_JSGE: + /* PC += jmp_offset if rj >= rd (signed) */ + emit_insn(ctx, bge, rj, rd, jmp_offset); + return; + case BPF_JSLE: + /* PC += jmp_offset if rj <= rd (signed) */ + emit_insn(ctx, bge, rd, rj, jmp_offset); + return; + } +} + +static inline void cond_jmp_offs26(struct jit_ctx *ctx, u8 cond, enum loongarch_gpr rj, + enum loongarch_gpr rd, int jmp_offset) +{ + cond = invert_jmp_cond(cond); + cond_jmp_offset(ctx, cond, rj, rd, 2); + emit_insn(ctx, b, jmp_offset); +} + +static inline void uncond_jmp_offs26(struct jit_ctx *ctx, int jmp_offset) +{ + emit_insn(ctx, b, jmp_offset); +} + +static inline int emit_cond_jmp(struct jit_ctx *ctx, u8 cond, enum loongarch_gpr rj, + enum loongarch_gpr rd, int jmp_offset) +{ + /* + * A large PC-relative jump offset may overflow the immediate field of + * the native conditional branch instruction, triggering a conversion + * to use an absolute jump instead, this jump sequence is particularly + * nasty. For now, use cond_jmp_offs26() directly to keep it simple. + * In the future, maybe we can add support for far branching, the branch + * relaxation requires more than two passes to converge, the code seems + * too complex to understand, not quite sure whether it is necessary and + * worth the extra pain. Anyway, just leave it as it is to enhance code + * readability now. 
+ */ + if (is_signed_imm26(jmp_offset)) { + cond_jmp_offs26(ctx, cond, rj, rd, jmp_offset); + return 0; + } + + return -EINVAL; +} + +static inline int emit_uncond_jmp(struct jit_ctx *ctx, int jmp_offset) +{ + if (is_signed_imm26(jmp_offset)) { + uncond_jmp_offs26(ctx, jmp_offset); + return 0; + } + + return -EINVAL; +} + +static inline int emit_tailcall_jmp(struct jit_ctx *ctx, u8 cond, enum loongarch_gpr rj, + enum loongarch_gpr rd, int jmp_offset) +{ + if (is_signed_imm16(jmp_offset)) { + cond_jmp_offset(ctx, cond, rj, rd, jmp_offset); + return 0; + } + + return -EINVAL; +} -- GitLab From 6246ed09111fbb17168619006b4380103c6673c3 Mon Sep 17 00:00:00 2001 From: Jianmin Lv <lvjianmin@loongson.cn> Date: Wed, 12 Oct 2022 16:36:20 +0800 Subject: [PATCH 1809/2223] LoongArch: Add ACPI-based generic laptop driver This add ACPI-based generic laptop driver for Loongson-3. Some of the codes are derived from drivers/platform/x86/thinkpad_acpi.c. Signed-off-by: Jianmin Lv <lvjianmin@loongson.cn> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- drivers/platform/Kconfig | 2 + drivers/platform/Makefile | 1 + drivers/platform/loongarch/Kconfig | 31 + drivers/platform/loongarch/Makefile | 1 + drivers/platform/loongarch/loongson-laptop.c | 624 +++++++++++++++++++ 5 files changed, 659 insertions(+) create mode 100644 drivers/platform/loongarch/Kconfig create mode 100644 drivers/platform/loongarch/Makefile create mode 100644 drivers/platform/loongarch/loongson-laptop.c diff --git a/drivers/platform/Kconfig b/drivers/platform/Kconfig index b437847b62377..dbd3277122052 100644 --- a/drivers/platform/Kconfig +++ b/drivers/platform/Kconfig @@ -3,6 +3,8 @@ if MIPS source "drivers/platform/mips/Kconfig" endif +source "drivers/platform/loongarch/Kconfig" + source "drivers/platform/goldfish/Kconfig" source "drivers/platform/chrome/Kconfig" diff --git a/drivers/platform/Makefile b/drivers/platform/Makefile index 4de08ef4ec9d0..41640172975a7 100644 --- a/drivers/platform/Makefile +++ 
b/drivers/platform/Makefile @@ -4,6 +4,7 @@ # obj-$(CONFIG_X86) += x86/ +obj-$(CONFIG_LOONGARCH) += loongarch/ obj-$(CONFIG_MELLANOX_PLATFORM) += mellanox/ obj-$(CONFIG_MIPS) += mips/ obj-$(CONFIG_OLPC_EC) += olpc/ diff --git a/drivers/platform/loongarch/Kconfig b/drivers/platform/loongarch/Kconfig new file mode 100644 index 0000000000000..5633e4d73991a --- /dev/null +++ b/drivers/platform/loongarch/Kconfig @@ -0,0 +1,31 @@ +# +# LoongArch Platform Specific Drivers +# + +menuconfig LOONGARCH_PLATFORM_DEVICES + bool "LoongArch Platform Specific Device Drivers" + default y + depends on LOONGARCH + help + Say Y here to get to see options for device drivers of various + LoongArch platforms, including vendor-specific laptop/desktop + extension and hardware monitor drivers. This option itself does + not add any kernel code. + + If you say N, all options in this submenu will be skipped and disabled. + +if LOONGARCH_PLATFORM_DEVICES + +config LOONGSON_LAPTOP + tristate "Generic Loongson-3 Laptop Driver" + depends on ACPI + depends on BACKLIGHT_CLASS_DEVICE + depends on INPUT + depends on MACH_LOONGSON64 + select ACPI_VIDEO + select INPUT_SPARSEKMAP + default y + help + ACPI-based Loongson-3 family laptops generic driver. 
+ +endif # LOONGARCH_PLATFORM_DEVICES diff --git a/drivers/platform/loongarch/Makefile b/drivers/platform/loongarch/Makefile new file mode 100644 index 0000000000000..f43ab03db1a2d --- /dev/null +++ b/drivers/platform/loongarch/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_LOONGSON_LAPTOP) += loongson-laptop.o diff --git a/drivers/platform/loongarch/loongson-laptop.c b/drivers/platform/loongarch/loongson-laptop.c new file mode 100644 index 0000000000000..f0166ad5d2c28 --- /dev/null +++ b/drivers/platform/loongarch/loongson-laptop.c @@ -0,0 +1,624 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generic Loongson processor based LAPTOP/ALL-IN-ONE driver + * + * Jianmin Lv <lvjianmin@loongson.cn> + * Huacai Chen <chenhuacai@loongson.cn> + * + * Copyright (C) 2022 Loongson Technology Corporation Limited + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/acpi.h> +#include <linux/backlight.h> +#include <linux/device.h> +#include <linux/input.h> +#include <linux/input/sparse-keymap.h> +#include <linux/platform_device.h> +#include <linux/string.h> +#include <linux/types.h> +#include <acpi/video.h> + +/* 1. Driver-wide structs and misc. 
variables */ + +/* ACPI HIDs */ +#define LOONGSON_ACPI_EC_HID "PNP0C09" +#define LOONGSON_ACPI_HKEY_HID "LOON0000" + +#define ACPI_LAPTOP_NAME "loongson-laptop" +#define ACPI_LAPTOP_ACPI_EVENT_PREFIX "loongson" + +#define MAX_ACPI_ARGS 3 +#define GENERIC_HOTKEY_MAP_MAX 64 + +#define GENERIC_EVENT_TYPE_OFF 12 +#define GENERIC_EVENT_TYPE_MASK 0xF000 +#define GENERIC_EVENT_CODE_MASK 0x0FFF + +struct generic_sub_driver { + u32 type; + char *name; + acpi_handle *handle; + struct acpi_device *device; + struct platform_driver *driver; + int (*init)(struct generic_sub_driver *sub_driver); + void (*notify)(struct generic_sub_driver *sub_driver, u32 event); + u8 acpi_notify_installed; +}; + +static u32 input_device_registered; +static struct input_dev *generic_inputdev; + +static acpi_handle hotkey_handle; +static struct key_entry hotkey_keycode_map[GENERIC_HOTKEY_MAP_MAX]; + +int loongson_laptop_turn_on_backlight(void); +int loongson_laptop_turn_off_backlight(void); +static int loongson_laptop_backlight_update(struct backlight_device *bd); + +/* 2. ACPI Helpers and device model */ + +static int acpi_evalf(acpi_handle handle, int *res, char *method, char *fmt, ...) 
+{
+	char res_type;
+	char *fmt0 = fmt;
+	va_list ap;
+	int success, quiet;
+	acpi_status status;
+	struct acpi_object_list params;
+	struct acpi_buffer result, *resultp;
+	union acpi_object in_objs[MAX_ACPI_ARGS], out_obj;
+
+	if (!*fmt) {
+		pr_err("acpi_evalf() called with empty format\n");
+		return 0;
+	}
+
+	if (*fmt == 'q') {	/* leading 'q': do not log a failed evaluation */
+		quiet = 1;
+		fmt++;
+	} else
+		quiet = 0;
+
+	res_type = *(fmt++);	/* 'd': integer result, 'v': no result */
+
+	params.count = 0;
+	params.pointer = &in_objs[0];
+
+	va_start(ap, fmt);
+	while (*fmt) {
+		char c = *(fmt++);
+		switch (c) {
+		case 'd':	/* int */
+			in_objs[params.count].integer.value = va_arg(ap, int);
+			in_objs[params.count++].type = ACPI_TYPE_INTEGER;
+			break;
+		/* add more types as needed */
+		default:
+			pr_err("acpi_evalf() called with invalid format character '%c'\n", c);
+			va_end(ap);
+			return 0;
+		}
+	}
+	va_end(ap);
+
+	if (res_type != 'v') {
+		result.length = sizeof(out_obj);
+		result.pointer = &out_obj;
+		resultp = &result;
+	} else
+		resultp = NULL;
+
+	status = acpi_evaluate_object(handle, method, &params, resultp);
+
+	switch (res_type) {
+	case 'd':		/* int */
+		success = (status == AE_OK && out_obj.type == ACPI_TYPE_INTEGER);
+		if (success && res)
+			*res = out_obj.integer.value;
+		break;
+	case 'v':		/* void */
+		success = status == AE_OK;
+		break;
+	/* add more types as needed */
+	default:
+		pr_err("acpi_evalf() called with invalid format character '%c'\n", res_type);
+		return 0;
+	}
+
+	if (!success && !quiet)
+		pr_err("acpi_evalf(%s, %s, ...)
failed: %s\n", + method, fmt0, acpi_format_exception(status)); + + return success; +} + +static int hotkey_status_get(int *status) +{ + if (!acpi_evalf(hotkey_handle, status, "GSWS", "d")) + return -EIO; + + return 0; +} + +static void dispatch_acpi_notify(acpi_handle handle, u32 event, void *data) +{ + struct generic_sub_driver *sub_driver = data; + + if (!sub_driver || !sub_driver->notify) + return; + sub_driver->notify(sub_driver, event); +} + +static int __init setup_acpi_notify(struct generic_sub_driver *sub_driver) +{ + acpi_status status; + + if (!*sub_driver->handle) + return 0; + + sub_driver->device = acpi_fetch_acpi_dev(*sub_driver->handle); + if (!sub_driver->device) { + pr_err("acpi_fetch_acpi_dev(%s) failed\n", sub_driver->name); + return -ENODEV; + } + + sub_driver->device->driver_data = sub_driver; + sprintf(acpi_device_class(sub_driver->device), "%s/%s", + ACPI_LAPTOP_ACPI_EVENT_PREFIX, sub_driver->name); + + status = acpi_install_notify_handler(*sub_driver->handle, + sub_driver->type, dispatch_acpi_notify, sub_driver); + if (ACPI_FAILURE(status)) { + if (status == AE_ALREADY_EXISTS) { + pr_notice("Another device driver is already " + "handling %s events\n", sub_driver->name); + } else { + pr_err("acpi_install_notify_handler(%s) failed: %s\n", + sub_driver->name, acpi_format_exception(status)); + } + return -ENODEV; + } + sub_driver->acpi_notify_installed = 1; + + return 0; +} + +static int loongson_hotkey_suspend(struct device *dev) +{ + return 0; +} + +static int loongson_hotkey_resume(struct device *dev) +{ + int status = 0; + struct key_entry ke; + struct backlight_device *bd; + + /* + * Only if the firmware supports SW_LID event model, we can handle the + * event. This is for the consideration of development board without EC. + */ + if (test_bit(SW_LID, generic_inputdev->swbit)) { + if (hotkey_status_get(&status) < 0) + return -EIO; + /* + * The input device sw element records the last lid status. 
+ * When the system is awakened by other wake-up sources, + * the lid event will also be reported. The judgment of + * adding SW_LID bit which in sw element can avoid this + * case. + * + * Input system will drop lid event when current lid event + * value and last lid status in the same. So laptop driver + * doesn't report repeated events. + * + * Lid status is generally 0, but hardware exception is + * considered. So add lid status confirmation. + */ + if (test_bit(SW_LID, generic_inputdev->sw) && !(status & (1 << SW_LID))) { + ke.type = KE_SW; + ke.sw.value = (u8)status; + ke.sw.code = SW_LID; + sparse_keymap_report_entry(generic_inputdev, &ke, 1, true); + } + } + + bd = backlight_device_get_by_type(BACKLIGHT_PLATFORM); + if (bd) { + loongson_laptop_backlight_update(bd) ? + pr_warn("Loongson_backlight: resume brightness failed") : + pr_info("Loongson_backlight: resume brightness %d\n", bd->props.brightness); + } + + return 0; +} + +static DEFINE_SIMPLE_DEV_PM_OPS(loongson_hotkey_pm, + loongson_hotkey_suspend, loongson_hotkey_resume); + +static int loongson_hotkey_probe(struct platform_device *pdev) +{ + hotkey_handle = ACPI_HANDLE(&pdev->dev); + + if (!hotkey_handle) + return -ENODEV; + + return 0; +} + +static const struct acpi_device_id loongson_device_ids[] = { + {LOONGSON_ACPI_HKEY_HID, 0}, + {"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, loongson_device_ids); + +static struct platform_driver loongson_hotkey_driver = { + .probe = loongson_hotkey_probe, + .driver = { + .name = "loongson-hotkey", + .owner = THIS_MODULE, + .pm = pm_ptr(&loongson_hotkey_pm), + .acpi_match_table = loongson_device_ids, + }, +}; + +static int hotkey_map(void) +{ + u32 index; + acpi_status status; + struct acpi_buffer buf; + union acpi_object *pack; + + buf.length = ACPI_ALLOCATE_BUFFER; + status = acpi_evaluate_object_typed(hotkey_handle, "KMAP", NULL, &buf, ACPI_TYPE_PACKAGE); + if (status != AE_OK) { + pr_err("ACPI exception: %s\n", acpi_format_exception(status)); + return -1; + } + pack 
= buf.pointer;
+	for (index = 0; index < pack->package.count &&
+			index < GENERIC_HOTKEY_MAP_MAX - 1; index++) { /* last slot stays zeroed (KE_END) */
+		union acpi_object *element, *sub_pack;
+
+		sub_pack = &pack->package.elements[index];
+		element = &sub_pack->package.elements[0];
+		hotkey_keycode_map[index].type = element->integer.value;
+		element = &sub_pack->package.elements[1];
+		hotkey_keycode_map[index].code = element->integer.value;
+		element = &sub_pack->package.elements[2];
+		hotkey_keycode_map[index].keycode = element->integer.value;
+	}
+	ACPI_FREE(buf.pointer);	/* ACPI_ALLOCATE_BUFFER result is owned by the caller */
+	return 0;
+}
+
+static int hotkey_backlight_set(bool enable)
+{
+	if (!acpi_evalf(hotkey_handle, NULL, "VCBL", "vd", enable ? 1 : 0))
+		return -EIO;
+
+	return 0;
+}
+
+static int ec_get_brightness(void)
+{
+	int status = 0;
+
+	if (!hotkey_handle)
+		return -ENXIO;
+
+	if (!acpi_evalf(hotkey_handle, &status, "ECBG", "d"))
+		return -EIO;
+
+	return status;
+}
+
+static int ec_set_brightness(int level)
+{
+
+	int ret = 0;
+
+	if (!hotkey_handle)
+		return -ENXIO;
+
+	if (!acpi_evalf(hotkey_handle, NULL, "ECBS", "vd", level))
+		ret = -EIO;
+
+	return ret;
+}
+
+static int ec_backlight_level(u8 level)
+{
+	int status = 0;
+
+	if (!hotkey_handle)
+		return -ENXIO;
+
+	if (!acpi_evalf(hotkey_handle, &status, "ECLL", "d"))
+		return -EIO;
+
+	if ((status < 0) || (level > status))
+		return status;
+
+	if (!acpi_evalf(hotkey_handle, &status, "ECSL", "d"))
+		return -EIO;
+
+	if ((status < 0) || (level < status))
+		return status;
+
+	return level;
+}
+
+static int loongson_laptop_backlight_update(struct backlight_device *bd)
+{
+	int lvl = ec_backlight_level(bd->props.brightness);
+
+	if (lvl < 0)
+		return -EIO;
+	if (ec_set_brightness(lvl))
+		return -EIO;
+
+	return 0;
+}
+
+static int loongson_laptop_get_brightness(struct backlight_device *bd)
+{
+	int level;
+
+	level = ec_get_brightness();
+	if (level < 0)
+		return -EIO;
+
+	return level;
+}
+
+static const struct backlight_ops backlight_laptop_ops = {
+	.update_status = loongson_laptop_backlight_update,
+	.get_brightness =
loongson_laptop_get_brightness,
+};
+
+static int laptop_backlight_register(void)
+{
+	int status = 0;
+	struct backlight_properties props;
+
+	memset(&props, 0, sizeof(props));
+
+	if (!acpi_evalf(hotkey_handle, &status, "ECLL", "d"))
+		return -EIO;
+
+	props.brightness = 1;
+	props.max_brightness = status;
+	props.type = BACKLIGHT_PLATFORM;
+
+	backlight_device_register("loongson_laptop",
+				NULL, NULL, &backlight_laptop_ops, &props);
+
+	return 0;
+}
+
+int loongson_laptop_turn_on_backlight(void)
+{
+	int status;
+	union acpi_object arg0 = { ACPI_TYPE_INTEGER };
+	struct acpi_object_list args = { 1, &arg0 };
+
+	arg0.integer.value = 1;
+	status = acpi_evaluate_object(NULL, "\\BLSW", &args, NULL);
+	if (ACPI_FAILURE(status)) {
+		pr_info("Loongson lvds error: 0x%x\n", status);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+int loongson_laptop_turn_off_backlight(void)
+{
+	int status;
+	union acpi_object arg0 = { ACPI_TYPE_INTEGER };
+	struct acpi_object_list args = { 1, &arg0 };
+
+	arg0.integer.value = 0;
+	status = acpi_evaluate_object(NULL, "\\BLSW", &args, NULL);
+	if (ACPI_FAILURE(status)) {
+		pr_info("Loongson lvds error: 0x%x\n", status);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+static int __init event_init(struct generic_sub_driver *sub_driver)
+{
+	int ret;
+
+	ret = hotkey_map();
+	if (ret < 0) {
+		pr_err("Failed to parse keymap from DSDT\n");
+		return ret;
+	}
+
+	ret = sparse_keymap_setup(generic_inputdev, hotkey_keycode_map, NULL);
+	if (ret < 0) {
+		pr_err("Failed to setup input device keymap\n");
+		input_free_device(generic_inputdev);
+		generic_inputdev = NULL;	/* the caller's error path frees it again */
+		return ret;
+	}
+
+	/*
+	 * This hotkey driver handles backlight events only when
+	 * acpi_video_get_backlight_type() returns acpi_backlight_vendor
+	 */
+	if (acpi_video_get_backlight_type() == acpi_backlight_vendor)
+		hotkey_backlight_set(true);
+	else
+		hotkey_backlight_set(false);
+
+	pr_info("ACPI: enabling firmware HKEY event interface...\n");
+
+	return ret;
+}
+
+static void event_notify(struct generic_sub_driver *sub_driver, u32 event)
+{
+	int type, scan_code;
+	struct key_entry *ke = NULL;
+
+	scan_code = event & GENERIC_EVENT_CODE_MASK;
+	type = (event & GENERIC_EVENT_TYPE_MASK) >> GENERIC_EVENT_TYPE_OFF;
+	ke = sparse_keymap_entry_from_scancode(generic_inputdev, scan_code);
+	if (ke) {
+		if (type == KE_SW) {
+			int status = 0;
+
+			if (hotkey_status_get(&status) < 0)
+				return;
+
+			ke->sw.value = !!(status & (1 << ke->sw.code));
+		}
+		sparse_keymap_report_entry(generic_inputdev, ke, 1, true);
+	}
+}
+
+/* 3. Infrastructure */
+
+static void generic_subdriver_exit(struct generic_sub_driver *sub_driver);
+
+static int __init generic_subdriver_init(struct generic_sub_driver *sub_driver)
+{
+	int ret;
+
+	if (!sub_driver || !sub_driver->driver)
+		return -EINVAL;
+
+	ret = platform_driver_register(sub_driver->driver);
+	if (ret)
+		return -EINVAL;
+
+	ret = sub_driver->init ? sub_driver->init(sub_driver) : 0;
+	if (ret < 0)	/* a failed ->init() must not be ignored: it may have freed the input device */
+		goto err_out;
+	if (sub_driver->notify) {
+		ret = setup_acpi_notify(sub_driver);
+		if (ret == -ENODEV) {
+			ret = 0;
+			goto err_out;
+		}
+		if (ret < 0)
+			goto err_out;
+	}
+
+	return 0;
+
+err_out:
+	generic_subdriver_exit(sub_driver);
+	return (ret < 0) ?
ret : 0; +} + +static void generic_subdriver_exit(struct generic_sub_driver *sub_driver) +{ + + if (sub_driver->acpi_notify_installed) { + acpi_remove_notify_handler(*sub_driver->handle, + sub_driver->type, dispatch_acpi_notify); + sub_driver->acpi_notify_installed = 0; + } + platform_driver_unregister(sub_driver->driver); +} + +static struct generic_sub_driver generic_sub_drivers[] __refdata = { + { + .name = "hotkey", + .init = event_init, + .notify = event_notify, + .handle = &hotkey_handle, + .type = ACPI_DEVICE_NOTIFY, + .driver = &loongson_hotkey_driver, + }, +}; + +static int __init generic_acpi_laptop_init(void) +{ + bool ec_found; + int i, ret, status; + + if (acpi_disabled) + return -ENODEV; + + /* The EC device is required */ + ec_found = acpi_dev_found(LOONGSON_ACPI_EC_HID); + if (!ec_found) + return -ENODEV; + + /* Enable SCI for EC */ + acpi_write_bit_register(ACPI_BITREG_SCI_ENABLE, 1); + + generic_inputdev = input_allocate_device(); + if (!generic_inputdev) { + pr_err("Unable to allocate input device\n"); + return -ENOMEM; + } + + /* Prepare input device, but don't register */ + generic_inputdev->name = + "Loongson Generic Laptop/All-in-One Extra Buttons"; + generic_inputdev->phys = ACPI_LAPTOP_NAME "/input0"; + generic_inputdev->id.bustype = BUS_HOST; + generic_inputdev->dev.parent = NULL; + + /* Init subdrivers */ + for (i = 0; i < ARRAY_SIZE(generic_sub_drivers); i++) { + ret = generic_subdriver_init(&generic_sub_drivers[i]); + if (ret < 0) { + input_free_device(generic_inputdev); + while (--i >= 0) + generic_subdriver_exit(&generic_sub_drivers[i]); + return ret; + } + } + + ret = input_register_device(generic_inputdev); + if (ret < 0) { + input_free_device(generic_inputdev); + while (--i >= 0) + generic_subdriver_exit(&generic_sub_drivers[i]); + pr_err("Unable to register input device\n"); + return ret; + } + + input_device_registered = 1; + + if (acpi_evalf(hotkey_handle, &status, "ECBG", "d")) { + pr_info("Loongson Laptop used, init brightness 
is 0x%x\n", status); + ret = laptop_backlight_register(); + if (ret < 0) + pr_err("Loongson Laptop: laptop-backlight device register failed\n"); + } + + return 0; +} + +static void __exit generic_acpi_laptop_exit(void) +{ + if (generic_inputdev) { + if (input_device_registered) + input_unregister_device(generic_inputdev); + else + input_free_device(generic_inputdev); + } +} + +module_init(generic_acpi_laptop_init); +module_exit(generic_acpi_laptop_exit); + +MODULE_AUTHOR("Jianmin Lv <lvjianmin@loongson.cn>"); +MODULE_AUTHOR("Huacai Chen <chenhuacai@loongson.cn>"); +MODULE_DESCRIPTION("Loongson Laptop/All-in-One ACPI Driver"); +MODULE_LICENSE("GPL"); -- GitLab From 2c8577f5e455b149f3ecb24e9a9f48f372a5d71a Mon Sep 17 00:00:00 2001 From: Huacai Chen <chenhuacai@loongson.cn> Date: Wed, 12 Oct 2022 16:36:23 +0800 Subject: [PATCH 1810/2223] LoongArch: Update Loongson-3 default config file 1, Enable ZBOOT, KEXEC and BPF_JIT; 2, Add more partition types; 3, Add some USB Type-C options; 4, Add some common network options; 5, Add some Bluetooth device drivers; 6, Remove obsolete config options (for some detailed information, see Link). 
Link: https://lore.kernel.org/kernel-janitors/20220929090645.1389-1-lukas.bulwahn@gmail.com/ Co-developed-by: Tiezhu Yang <yangtiezhu@loongson.cn> Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn> Co-developed-by: Youling Tang <tangyouling@loongson.cn> Signed-off-by: Youling Tang <tangyouling@loongson.cn> Co-developed-by: Lukas Bulwahn <lukas.bulwahn@gmail.com> Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn> --- arch/loongarch/configs/loongson3_defconfig | 63 +++++++++++++++++++--- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index 3712552e18d39..3540e9c0a6310 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -4,6 +4,7 @@ CONFIG_POSIX_MQUEUE=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y CONFIG_PREEMPT=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y @@ -45,6 +46,7 @@ CONFIG_SMP=y CONFIG_HOTPLUG_CPU=y CONFIG_NR_CPUS=64 CONFIG_NUMA=y +CONFIG_KEXEC=y CONFIG_PAGE_SIZE_16KB=y CONFIG_HZ_250=y CONFIG_ACPI=y @@ -55,6 +57,7 @@ CONFIG_ACPI_DOCK=y CONFIG_ACPI_IPMI=m CONFIG_ACPI_PCI_SLOT=y CONFIG_ACPI_HOTPLUG_MEMORY=y +CONFIG_EFI_ZBOOT=y CONFIG_EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER=y CONFIG_EFI_CAPSULE_LOADER=m CONFIG_EFI_TEST=m @@ -65,6 +68,8 @@ CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODVERSIONS=y CONFIG_BLK_DEV_THROTTLING=y CONFIG_PARTITION_ADVANCED=y +CONFIG_BSD_DISKLABEL=y +CONFIG_UNIXWARE_DISKLABEL=y CONFIG_IOSCHED_BFQ=y CONFIG_BFQ_GROUP_IOSCHED=y CONFIG_BINFMT_MISC=m @@ -82,8 +87,11 @@ CONFIG_ZSMALLOC=m CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y +CONFIG_TLS=m +CONFIG_TLS_DEVICE=y CONFIG_XFRM_USER=y CONFIG_NET_KEY=y +CONFIG_XDP_SOCKETS=y CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y @@ -95,6 +103,7 @@ CONFIG_IP_PNP_DHCP=y CONFIG_IP_PNP_BOOTP=y CONFIG_IP_PNP_RARP=y CONFIG_NET_IPIP=m 
+CONFIG_NET_IPGRE_DEMUX=m CONFIG_IP_MROUTE=y CONFIG_INET_ESP=m CONFIG_INET_UDP_DIAG=y @@ -102,6 +111,7 @@ CONFIG_TCP_CONG_ADVANCED=y CONFIG_TCP_CONG_BBR=m CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y +CONFIG_INET6_ESP=m CONFIG_IPV6_MROUTE=y CONFIG_NETWORK_PHY_TIMESTAMPING=y CONFIG_NETFILTER=y @@ -112,10 +122,11 @@ CONFIG_NF_LOG_NETDEV=m CONFIG_NF_CONNTRACK_AMANDA=m CONFIG_NF_CONNTRACK_FTP=m CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_SNMP=m +CONFIG_NF_CONNTRACK_PPTP=m CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_CT_NETLINK=m CONFIG_NF_TABLES=m -CONFIG_NFT_COUNTER=m CONFIG_NFT_CONNLIMIT=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m @@ -200,7 +211,6 @@ CONFIG_NF_TABLES_IPV4=y CONFIG_NFT_DUP_IPV4=m CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y -CONFIG_NF_LOG_ARP=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -254,10 +264,14 @@ CONFIG_BPFILTER=y CONFIG_IP_SCTP=m CONFIG_RDS=y CONFIG_L2TP=m +CONFIG_L2TP_V3=y +CONFIG_L2TP_IP=m +CONFIG_L2TP_ETH=m CONFIG_BRIDGE=m CONFIG_VLAN_8021Q=m CONFIG_VLAN_8021Q_GVRP=y CONFIG_VLAN_8021Q_MVRP=y +CONFIG_LLC2=m CONFIG_NET_SCHED=y CONFIG_NET_SCH_HTB=m CONFIG_NET_SCH_PRIO=m @@ -282,9 +296,33 @@ CONFIG_VSOCKETS=m CONFIG_VIRTIO_VSOCKETS=m CONFIG_NETLINK_DIAG=y CONFIG_CGROUP_NET_PRIO=y +CONFIG_BPF_STREAM_PARSER=y CONFIG_BT=m +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y +CONFIG_BT_BNEP=m +CONFIG_BT_BNEP_MC_FILTER=y +CONFIG_BT_BNEP_PROTO_FILTER=y +CONFIG_BT_HIDP=m +CONFIG_BT_HS=y CONFIG_BT_HCIBTUSB=m -# CONFIG_BT_HCIBTUSB_BCM is not set +CONFIG_BT_HCIBTUSB_AUTOSUSPEND=y +CONFIG_BT_HCIBTUSB_MTK=y +CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_BCSP=y +CONFIG_BT_HCIUART_ATH3K=y +CONFIG_BT_HCIUART_INTEL=y +CONFIG_BT_HCIUART_AG6XX=y +CONFIG_BT_HCIBCM203X=m +CONFIG_BT_HCIBPA10X=m +CONFIG_BT_HCIBFUSB=m +CONFIG_BT_HCIDTL1=m +CONFIG_BT_HCIBT3C=m +CONFIG_BT_HCIBLUECARD=m +CONFIG_BT_HCIVHCI=m +CONFIG_BT_MRVL=m +CONFIG_BT_ATH3K=m +CONFIG_BT_VIRTIO=m CONFIG_CFG80211=m CONFIG_CFG80211_WEXT=y CONFIG_MAC80211=m @@ -329,7 +367,6 @@ 
CONFIG_PARPORT_PC_FIFO=y CONFIG_ZRAM=m CONFIG_ZRAM_DEF_COMP_ZSTD=y CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_CRYPTOLOOP=y CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 @@ -486,6 +523,7 @@ CONFIG_PPP_FILTER=y CONFIG_PPP_MPPE=m CONFIG_PPP_MULTILINK=y CONFIG_PPPOE=m +CONFIG_PPTP=m CONFIG_PPPOL2TP=m CONFIG_PPP_ASYNC=m CONFIG_PPP_SYNC_TTY=m @@ -505,7 +543,6 @@ CONFIG_ATH9K_HTC=m CONFIG_IWLWIFI=m CONFIG_IWLDVM=m CONFIG_IWLMVM=m -CONFIG_IWLWIFI_BCAST_FILTERING=y CONFIG_HOSTAP=m CONFIG_MT7601U=m CONFIG_RT2X00=m @@ -521,6 +558,14 @@ CONFIG_RTL8821AE=m CONFIG_RTL8192CU=m # CONFIG_RTLWIFI_DEBUG is not set CONFIG_RTL8XXXU=m +CONFIG_RTW88=m +CONFIG_RTW88_8822BE=m +CONFIG_RTW88_8822CE=m +CONFIG_RTW88_8723DE=m +CONFIG_RTW88_8821CE=m +CONFIG_RTW89=m +CONFIG_RTW89_8852AE=m +CONFIG_RTW89_8852CE=m CONFIG_ZD1211RW=m CONFIG_USB_NET_RNDIS_WLAN=m CONFIG_INPUT_MOUSEDEV=y @@ -651,6 +696,11 @@ CONFIG_USB_SERIAL_FTDI_SIO=m CONFIG_USB_SERIAL_PL2303=m CONFIG_USB_SERIAL_OPTION=m CONFIG_USB_GADGET=y +CONFIG_TYPEC=m +CONFIG_TYPEC_TCPM=m +CONFIG_TYPEC_TCPCI=m +CONFIG_TYPEC_UCSI=m +CONFIG_UCSI_ACPI=m CONFIG_INFINIBAND=m CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_EFI=y @@ -688,7 +738,6 @@ CONFIG_COMEDI_NI_PCIDIO=m CONFIG_COMEDI_NI_PCIMIO=m CONFIG_STAGING=y CONFIG_R8188EU=m -# CONFIG_88EU_AP_MODE is not set CONFIG_PM_DEVFREQ=y CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=y CONFIG_DEVFREQ_GOV_PERFORMANCE=y @@ -772,14 +821,12 @@ CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_VMAC=m -CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_ANUBIS=m CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m -- GitLab From a8e5e5146ad08d794c58252bab00b261045ef16d Mon Sep 17 00:00:00 2001 From: Catalin Marinas <catalin.marinas@arm.com> Date: Thu, 6 Oct 2022 17:33:54 +0100 Subject: [PATCH 1811/2223] arm64: mte: Avoid setting PG_mte_tagged 
if no tags cleared or restored Prior to commit 69e3b846d8a7 ("arm64: mte: Sync tags for pages where PTE is untagged"), mte_sync_tags() was only called for pte_tagged() entries (those mapped with PROT_MTE). Therefore mte_sync_tags() could safely use test_and_set_bit(PG_mte_tagged, &page->flags) without inadvertently setting PG_mte_tagged on an untagged page. The above commit was required as guests may enable MTE without any control at the stage 2 mapping, nor a PROT_MTE mapping in the VMM. However, the side-effect was that any page with a PTE that looked like swap (or migration) was getting PG_mte_tagged set automatically. A subsequent page copy (e.g. migration) copied the tags to the destination page even if the tags were owned by KASAN. This issue was masked by the page_kasan_tag_reset() call introduced in commit e5b8d9218951 ("arm64: mte: reset the page tag in page->flags"). When this commit was reverted (20794545c146), KASAN started reporting access faults because the overriding tags in a page did not match the original page->flags (with CONFIG_KASAN_HW_TAGS=y): BUG: KASAN: invalid-access in copy_page+0x10/0xd0 arch/arm64/lib/copy_page.S:26 Read at addr f5ff000017f2e000 by task syz-executor.1/2218 Pointer tag: [f5], memory tag: [f2] Move the PG_mte_tagged bit setting from mte_sync_tags() to the actual place where tags are cleared (mte_sync_page_tags()) or restored (mte_restore_tags()). 
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> Reported-by: syzbot+c2c79c6d6eddc5262b77@syzkaller.appspotmail.com Fixes: 69e3b846d8a7 ("arm64: mte: Sync tags for pages where PTE is untagged") Cc: <stable@vger.kernel.org> # 5.14.x Cc: Steven Price <steven.price@arm.com> Cc: Andrey Konovalov <andreyknvl@gmail.com> Cc: Vincenzo Frascino <vincenzo.frascino@arm.com> Cc: Will Deacon <will@kernel.org> Link: https://lore.kernel.org/r/0000000000004387dc05e5888ae5@google.com/ Reviewed-by: Steven Price <steven.price@arm.com> Link: https://lore.kernel.org/r/20221006163354.3194102-1-catalin.marinas@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> --- arch/arm64/kernel/mte.c | 9 +++++++-- arch/arm64/mm/mteswap.c | 7 ++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index aca88470fb69d..7467217c1eaf3 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -48,7 +48,12 @@ static void mte_sync_page_tags(struct page *page, pte_t old_pte, if (!pte_is_tagged) return; - mte_clear_page_tags(page_address(page)); + /* + * Test PG_mte_tagged again in case it was racing with another + * set_pte_at(). 
+ */ + if (!test_and_set_bit(PG_mte_tagged, &page->flags)) + mte_clear_page_tags(page_address(page)); } void mte_sync_tags(pte_t old_pte, pte_t pte) @@ -64,7 +69,7 @@ void mte_sync_tags(pte_t old_pte, pte_t pte) /* if PG_mte_tagged is set, tags have already been initialised */ for (i = 0; i < nr_pages; i++, page++) { - if (!test_and_set_bit(PG_mte_tagged, &page->flags)) + if (!test_bit(PG_mte_tagged, &page->flags)) mte_sync_page_tags(page, old_pte, check_swap, pte_is_tagged); } diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c index 4334dec93bd44..bed803d8e1585 100644 --- a/arch/arm64/mm/mteswap.c +++ b/arch/arm64/mm/mteswap.c @@ -53,7 +53,12 @@ bool mte_restore_tags(swp_entry_t entry, struct page *page) if (!tags) return false; - mte_restore_page_tags(page_address(page), tags); + /* + * Test PG_mte_tagged again in case it was racing with another + * set_pte_at(). + */ + if (!test_and_set_bit(PG_mte_tagged, &page->flags)) + mte_restore_page_tags(page_address(page), tags); return true; } -- GitLab From a1ae8d4d9be0178132df7c4931a1ba77d0e76039 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg <sagi@grimberg.me> Date: Wed, 28 Sep 2022 09:23:26 +0300 Subject: [PATCH 1812/2223] nvme-rdma: fix possible hang caused during ctrl deletion When we delete a controller, we execute the following: 1. nvme_stop_ctrl() - stop some work elements that may be inflight or scheduled (specifically also .stop_ctrl which cancels ctrl error recovery work) 2. nvme_remove_namespaces() - which first flushes scan_work to avoid competing ns addition/removal 3. continue to teardown the controller However, if err_work was scheduled to run in (1), it is designed to cancel any inflight I/O, particularly I/O that is originating from ns scan_work in (2), but because it is cancelled in .stop_ctrl(), we can prevent forward progress of (2) as ns scanning is blocking on I/O (that will never be cancelled). The race is: 1. transport layer error observed -> err_work is scheduled 2. 
scan_work executes, discovers ns, generates I/O to it 3. nvme_stop_ctrl() -> .stop_ctrl() -> cancel_work_sync(err_work) - err_work never executed 4. nvme_remove_namespaces() -> flush_work(scan_work) --> deadlock, because scan_work is blocked on I/O that was supposed to be cancelled by err_work, but was cancelled before executing. Fix this by flushing err_work instead of cancelling it, to force it to execute and cancel all inflight I/O. Fixes: b435ecea2a4d ("nvme: Add .stop_ctrl to nvme ctrl ops") Fixes: f6c8e432cb04 ("nvme: flush namespace scanning work just before removing namespaces") Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Christoph Hellwig <hch@lst.de> --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 5ad0ab2853a49..6e079abb22ee9 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -996,7 +996,7 @@ static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - cancel_work_sync(&ctrl->err_work); + flush_work(&ctrl->err_work); cancel_delayed_work_sync(&ctrl->reconnect_work); } -- GitLab From c4abd8757189c7ca5803828f9c892328d7d94943 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg <sagi@grimberg.me> Date: Wed, 28 Sep 2022 09:23:25 +0300 Subject: [PATCH 1813/2223] nvme-tcp: fix possible hang caused during ctrl deletion When we delete a controller, we execute the following: 1. nvme_stop_ctrl() - stop some work elements that may be inflight or scheduled (specifically also .stop_ctrl which cancels ctrl error recovery work) 2. nvme_remove_namespaces() - which first flushes scan_work to avoid competing ns addition/removal 3. 
continue to teardown the controller However, if err_work was scheduled to run in (1), it is designed to cancel any inflight I/O, particularly I/O that is originating from ns scan_work in (2), but because it is cancelled in .stop_ctrl(), we can prevent forward progress of (2) as ns scanning is blocking on I/O (that will never be cancelled). The race is: 1. transport layer error observed -> err_work is scheduled 2. scan_work executes, discovers ns, generates I/O to it 3. nvme_stop_ctrl() -> .stop_ctrl() -> cancel_work_sync(err_work) - err_work never executed 4. nvme_remove_namespaces() -> flush_work(scan_work) --> deadlock, because scan_work is blocked on I/O that was supposed to be cancelled by err_work, but was cancelled before executing (see stack trace [1]). Fix this by flushing err_work instead of cancelling it, to force it to execute and cancel all inflight I/O. [1]: -- Call Trace: <TASK> __schedule+0x390/0x910 ? scan_shadow_nodes+0x40/0x40 schedule+0x55/0xe0 io_schedule+0x16/0x40 do_read_cache_page+0x55d/0x850 ? __page_cache_alloc+0x90/0x90 read_cache_page+0x12/0x20 read_part_sector+0x3f/0x110 amiga_partition+0x3d/0x3e0 ? osf_partition+0x33/0x220 ? put_partition+0x90/0x90 bdev_disk_changed+0x1fe/0x4d0 blkdev_get_whole+0x7b/0x90 blkdev_get_by_dev+0xda/0x2d0 device_add_disk+0x356/0x3b0 nvme_mpath_set_live+0x13c/0x1a0 [nvme_core] ? nvme_parse_ana_log+0xae/0x1a0 [nvme_core] nvme_update_ns_ana_state+0x3a/0x40 [nvme_core] nvme_mpath_add_disk+0x120/0x160 [nvme_core] nvme_alloc_ns+0x594/0xa00 [nvme_core] nvme_validate_or_alloc_ns+0xb9/0x1a0 [nvme_core] ? __nvme_submit_sync_cmd+0x1d2/0x210 [nvme_core] nvme_scan_work+0x281/0x410 [nvme_core] process_one_work+0x1be/0x380 worker_thread+0x37/0x3b0 ? process_one_work+0x380/0x380 kthread+0x12d/0x150 ? set_kthread_struct+0x50/0x50 ret_from_fork+0x1f/0x30 </TASK> INFO: task nvme:6725 blocked for more than 491 seconds. Not tainted 5.15.65-f0.el7.x86_64 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
task:nvme state:D stack: 0 pid: 6725 ppid: 1761 flags:0x00004000 Call Trace: <TASK> __schedule+0x390/0x910 ? sched_clock+0x9/0x10 schedule+0x55/0xe0 schedule_timeout+0x24b/0x2e0 ? try_to_wake_up+0x358/0x510 ? finish_task_switch+0x88/0x2c0 wait_for_completion+0xa5/0x110 __flush_work+0x144/0x210 ? worker_attach_to_pool+0xc0/0xc0 flush_work+0x10/0x20 nvme_remove_namespaces+0x41/0xf0 [nvme_core] nvme_do_delete_ctrl+0x47/0x66 [nvme_core] nvme_sysfs_delete.cold.96+0x8/0xd [nvme_core] dev_attr_store+0x14/0x30 sysfs_kf_write+0x38/0x50 kernfs_fop_write_iter+0x146/0x1d0 new_sync_write+0x114/0x1b0 ? intel_pmu_handle_irq+0xe0/0x420 vfs_write+0x18d/0x270 ksys_write+0x61/0xe0 __x64_sys_write+0x1a/0x20 do_syscall_64+0x37/0x90 entry_SYSCALL_64_after_hwframe+0x61/0xcb -- Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Reported-by: Jonathan Nicklin <jnicklin@blockbridge.com> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Jonathan Nicklin <jnicklin@blockbridge.com> Signed-off-by: Christoph Hellwig <hch@lst.de> --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 93e2e313fa70f..1eed0fc26b3ae 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2181,7 +2181,7 @@ out_fail: static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl) { - cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work); + flush_work(&to_tcp_ctrl(ctrl)->err_work); cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); } -- GitLab From 80b2624094c8d369a3c6eab515e8f1564d2e5db2 Mon Sep 17 00:00:00 2001 From: Abhijit <abhijit@abhijittomar.com> Date: Mon, 10 Oct 2022 10:30:05 +0200 Subject: [PATCH 1814/2223] nvme-pci: add NVME_QUIRK_BOGUS_NID for Lexar NM760 Add a quirk to fix Lexar NM760 SSD drives reporting duplicate nsids. 
Signed-off-by: Abhijit <abhijit@abhijittomar.com> Signed-off-by: Christoph Hellwig <hch@lst.de> --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5b796efa325b7..fd2c0231d2e20 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3527,6 +3527,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1d97, 0x2263), /* Lexar NM610 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1d97, 0x2269), /* Lexar NM760 */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061), .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065), -- GitLab From d5d3c100ac40dcb03959a6f1d2f0f13204c4f145 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao <xry111@xry111.site> Date: Wed, 28 Sep 2022 17:39:13 +0800 Subject: [PATCH 1815/2223] nvme-pci: avoid the deepest sleep state on ZHITAI TiPro5000 SSDs ZHITAI TiPro5000 SSDs have the same APST sleep problem as their cousin, TiPro7000. The quirk for TiPro7000 has been added in commit 6b961bce50e4 ("nvme-pci: avoid the deepest sleep state on ZHITAI TiPro7000 SSDs"), use the same quirk for TiPro5000. The APST data from "nvme id-ctrl /dev/nvme1": vid : 0x1e49 ssvid : 0x1e49 sn : ZTA21T0KA2227304LM mn : ZHITAI TiPlus5000 1TB fr : ZTA09139 [...] 
ps 0 : mp:6.50W operational enlat:0 exlat:0 rrt:0 rrl:0 rwt:0 rwl:0 idle_power:- active_power:- ps 1 : mp:5.80W operational enlat:0 exlat:0 rrt:1 rrl:1 rwt:1 rwl:1 idle_power:- active_power:- ps 2 : mp:3.60W operational enlat:0 exlat:0 rrt:2 rrl:2 rwt:2 rwl:2 idle_power:- active_power:- ps 3 : mp:0.0500W non-operational enlat:5000 exlat:10000 rrt:3 rrl:3 rwt:3 rwl:3 idle_power:- active_power:- ps 4 : mp:0.0025W non-operational enlat:8000 exlat:45000 rrt:4 rrl:4 rwt:4 rwl:4 idle_power:- active_power:- Reported-and-tested-by: Chang Feng <flukehn@gmail.com> Signed-off-by: Xi Ruoyao <xry111@xry111.site> Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com> Signed-off-by: Christoph Hellwig <hch@lst.de> --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index fd2c0231d2e20..bcbef6bc5672f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3521,6 +3521,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1dbe, 0x5236), /* ADATA XPG GAMMIX S70 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1e49, 0x0021), /* ZHITAI TiPro5000 NVMe SSD */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, { PCI_DEVICE(0x1e49, 0x0041), /* ZHITAI TiPro7000 NVMe SSD */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, { PCI_DEVICE(0xc0a9, 0x540a), /* Crucial P2 */ -- GitLab From 72e3b8883a36e80ebfa41015c7b6926ce31ace05 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg <sagi@grimberg.me> Date: Thu, 29 Sep 2022 10:36:47 +0300 Subject: [PATCH 1816/2223] nvme-multipath: fix possible hang in live ns resize with ANA access When we revalidate paths as part of ns size change (as of commit e7d65803e2bb), it is possible that during the path revalidation, the only paths that are IO capable (i.e. 
optimized/non-optimized) are the ones that ns resize was not yet informed to the host, which will cause inflight requests to be requeued (as we have available paths but none are IO capable). These requests on the requeue list are waiting for someone to resubmit them at some point. The IO capable paths will eventually notify the ns resize change to the host, but there is nothing that will kick the requeue list to resubmit the queued requests. Fix this by always kicking the requeue list, and if no IO capable path exists, these requests will be queued again. A typical log that indicates that IOs are requeued: -- nvme nvme1: creating 4 I/O queues. nvme nvme1: new ctrl: "testnqn1" nvme nvme2: creating 4 I/O queues. nvme nvme2: mapped 4/0/0 default/read/poll queues. nvme nvme2: new ctrl: NQN "testnqn1", addr 127.0.0.1:8009 nvme nvme1: rescanning namespaces. nvme1n1: detected capacity change from 2097152 to 4194304 block nvme1n1: no usable path - requeuing I/O block nvme1n1: no usable path - requeuing I/O block nvme1n1: no usable path - requeuing I/O block nvme1n1: no usable path - requeuing I/O block nvme1n1: no usable path - requeuing I/O block nvme1n1: no usable path - requeuing I/O block nvme1n1: no usable path - requeuing I/O block nvme1n1: no usable path - requeuing I/O block nvme1n1: no usable path - requeuing I/O block nvme1n1: no usable path - requeuing I/O nvme nvme2: rescanning namespaces. 
-- Reported-by: Yogev Cohen <yogev@lightbitslabs.com> Fixes: e7d65803e2bb ("nvme-multipath: revalidate paths during rescan") Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Cc: <stable@vger.kernel.org> # v5.15+ Signed-off-by: Christoph Hellwig <hch@lst.de> --- drivers/nvme/host/multipath.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 00f2f81e20fa1..0ea7e441e080f 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -182,6 +182,7 @@ void nvme_mpath_revalidate_paths(struct nvme_ns *ns) for_each_node(node) rcu_assign_pointer(head->current_path[node], NULL); + kblockd_schedule_work(&head->requeue_work); } static bool nvme_path_is_disabled(struct nvme_ns *ns) -- GitLab From 30f7d1cac2aab8fec560a388ad31ca5e5d04a822 Mon Sep 17 00:00:00 2001 From: Zheng Yejian <zhengyejian1@huawei.com> Date: Tue, 11 Oct 2022 12:03:52 +0000 Subject: [PATCH 1817/2223] ftrace: Fix char print issue in print_ip_ins() When ftrace bug happened, following log shows every hex data in problematic ip address: actual: ffffffe8:6b:ffffffd9:01:21 But so many 'f's seem a little confusing, and that is because format '%x' being used to print signed chars in array 'ins'. As suggested by Joe, change to use format "%*phC" to print array 'ins'. 
After this patch, the log is like: actual: e8:6b:d9:01:21 Link: https://lkml.kernel.org/r/20221011120352.1878494-1-zhengyejian1@huawei.com Fixes: 6c14133d2d3f ("ftrace: Do not blindly read the ip address in ftrace_bug()") Suggested-by: Joe Perches <joe@perches.com> Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/ftrace.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 83362a1557916..75c16215d065d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2028,7 +2028,6 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, static void print_ip_ins(const char *fmt, const unsigned char *p) { char ins[MCOUNT_INSN_SIZE]; - int i; if (copy_from_kernel_nofault(ins, p, MCOUNT_INSN_SIZE)) { printk(KERN_CONT "%s[FAULT] %px\n", fmt, p); @@ -2036,9 +2035,7 @@ static void print_ip_ins(const char *fmt, const unsigned char *p) } printk(KERN_CONT "%s", fmt); - - for (i = 0; i < MCOUNT_INSN_SIZE; i++) - printk(KERN_CONT "%s%02x", i ? ":" : "", ins[i]); + pr_cont("%*phC", MCOUNT_INSN_SIZE, ins); } enum ftrace_bug_type ftrace_bug_type; -- GitLab From 6e31ce831c63bd7aec8ff9cc2a6d50ee8c4d4e04 Mon Sep 17 00:00:00 2001 From: Phil Sutter <phil@nwl.cc> Date: Wed, 5 Oct 2022 18:07:04 +0200 Subject: [PATCH 1818/2223] selftests: netfilter: Test reverse path filtering Test reverse path (filter) matches in iptables, ip6tables and nftables. Both with a regular interface and a VRF. 
Signed-off-by: Phil Sutter <phil@nwl.cc> Reviewed-by: Guillaume Nault <gnault@redhat.com> Signed-off-by: Florian Westphal <fw@strlen.de> --- tools/testing/selftests/netfilter/Makefile | 2 +- tools/testing/selftests/netfilter/rpath.sh | 147 +++++++++++++++++++++ 2 files changed, 148 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/netfilter/rpath.sh diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index 600e3a19d5e28..4504ee07be08d 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -6,7 +6,7 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \ nft_concat_range.sh nft_conntrack_helper.sh \ nft_queue.sh nft_meta.sh nf_nat_edemux.sh \ ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh \ - conntrack_vrf.sh nft_synproxy.sh + conntrack_vrf.sh nft_synproxy.sh rpath.sh CFLAGS += $(shell pkg-config --cflags libmnl 2>/dev/null || echo "-I/usr/include/libmnl") LDLIBS = -lmnl diff --git a/tools/testing/selftests/netfilter/rpath.sh b/tools/testing/selftests/netfilter/rpath.sh new file mode 100755 index 0000000000000..2d8da7bd8ab74 --- /dev/null +++ b/tools/testing/selftests/netfilter/rpath.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# return code to signal skipped test +ksft_skip=4 + +# search for legacy iptables (it uses the xtables extensions) +if iptables-legacy --version >/dev/null 2>&1; then + iptables='iptables-legacy' +elif iptables --version >/dev/null 2>&1; then + iptables='iptables' +else + iptables='' +fi + +if ip6tables-legacy --version >/dev/null 2>&1; then + ip6tables='ip6tables-legacy' +elif 
ip6tables --version >/dev/null 2>&1; then + ip6tables='ip6tables' +else + ip6tables='' +fi + +if nft --version >/dev/null 2>&1; then + nft='nft' +else + nft='' +fi + +if [ -z "$iptables$ip6tables$nft" ]; then + echo "SKIP: Test needs iptables, ip6tables or nft" + exit $ksft_skip +fi + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" +trap "ip netns del $ns1; ip netns del $ns2" EXIT + +# create two netns, disable rp_filter in ns2 and +# keep IPv6 address when moving into VRF +ip netns add "$ns1" +ip netns add "$ns2" +ip netns exec "$ns2" sysctl -q net.ipv4.conf.all.rp_filter=0 +ip netns exec "$ns2" sysctl -q net.ipv4.conf.default.rp_filter=0 +ip netns exec "$ns2" sysctl -q net.ipv6.conf.all.keep_addr_on_down=1 + +# a standard connection between the netns, should not trigger rp filter +ip -net "$ns1" link add v0 type veth peer name v0 netns "$ns2" +ip -net "$ns1" link set v0 up; ip -net "$ns2" link set v0 up +ip -net "$ns1" a a 192.168.23.2/24 dev v0 +ip -net "$ns2" a a 192.168.23.1/24 dev v0 +ip -net "$ns1" a a fec0:23::2/64 dev v0 nodad +ip -net "$ns2" a a fec0:23::1/64 dev v0 nodad + +# rp filter testing: ns1 sends packets via v0 which ns2 would route back via d0 +ip -net "$ns2" link add d0 type dummy +ip -net "$ns2" link set d0 up +ip -net "$ns1" a a 192.168.42.2/24 dev v0 +ip -net "$ns2" a a 192.168.42.1/24 dev d0 +ip -net "$ns1" a a fec0:42::2/64 dev v0 nodad +ip -net "$ns2" a a fec0:42::1/64 dev d0 nodad + +# firewall matches to test +ip netns exec "$ns2" "$iptables" -t raw -A PREROUTING -s 192.168.0.0/16 -m rpfilter +ip netns exec "$ns2" "$ip6tables" -t raw -A PREROUTING -s fec0::/16 -m rpfilter +ip netns exec "$ns2" nft -f - <<EOF +table inet t { + chain c { + type filter hook prerouting priority raw; + ip saddr 192.168.0.0/16 fib saddr . iif oif exists counter + ip6 saddr fec0::/16 fib saddr . 
iif oif exists counter + } +} +EOF + +die() { + echo "FAIL: $*" + #ip netns exec "$ns2" "$iptables" -t raw -vS + #ip netns exec "$ns2" "$ip6tables" -t raw -vS + #ip netns exec "$ns2" nft list ruleset + exit 1 +} + +# check rule counters, return true if rule did not match +ipt_zero_rule() { # (command) + [ -n "$1" ] || return 0 + ip netns exec "$ns2" "$1" -t raw -vS | grep -q -- "-m rpfilter -c 0 0" +} +nft_zero_rule() { # (family) + [ -n "$nft" ] || return 0 + ip netns exec "$ns2" "$nft" list chain inet t c | \ + grep -q "$1 saddr .* counter packets 0 bytes 0" +} + +netns_ping() { # (netns, args...) + local netns="$1" + shift + ip netns exec "$netns" ping -q -c 1 -W 1 "$@" >/dev/null +} + +testrun() { + # clear counters first + [ -n "$iptables" ] && ip netns exec "$ns2" "$iptables" -t raw -Z + [ -n "$ip6tables" ] && ip netns exec "$ns2" "$ip6tables" -t raw -Z + if [ -n "$nft" ]; then + ( + echo "delete table inet t"; + ip netns exec "$ns2" nft -s list table inet t; + ) | ip netns exec "$ns2" nft -f - + fi + + # test 1: martian traffic should fail rpfilter matches + netns_ping "$ns1" -I v0 192.168.42.1 && \ + die "martian ping 192.168.42.1 succeeded" + netns_ping "$ns1" -I v0 fec0:42::1 && \ + die "martian ping fec0:42::1 succeeded" + + ipt_zero_rule "$iptables" || die "iptables matched martian" + ipt_zero_rule "$ip6tables" || die "ip6tables matched martian" + nft_zero_rule ip || die "nft IPv4 matched martian" + nft_zero_rule ip6 || die "nft IPv6 matched martian" + + # test 2: rpfilter match should pass for regular traffic + netns_ping "$ns1" 192.168.23.1 || \ + die "regular ping 192.168.23.1 failed" + netns_ping "$ns1" fec0:23::1 || \ + die "regular ping fec0:23::1 failed" + + ipt_zero_rule "$iptables" && die "iptables match not effective" + ipt_zero_rule "$ip6tables" && die "ip6tables match not effective" + nft_zero_rule ip && die "nft IPv4 match not effective" + nft_zero_rule ip6 && die "nft IPv6 match not effective" + +} + +testrun + +# repeat test with vrf 
device in $ns2 +ip -net "$ns2" link add vrf0 type vrf table 10 +ip -net "$ns2" link set vrf0 up +ip -net "$ns2" link set v0 master vrf0 + +testrun + +echo "PASS: netfilter reverse path match works as intended" +exit 0 -- GitLab From acc641ab95b66b813c1ce856c377a2bbe71e7f52 Mon Sep 17 00:00:00 2001 From: Phil Sutter <phil@nwl.cc> Date: Wed, 5 Oct 2022 18:07:05 +0200 Subject: [PATCH 1819/2223] netfilter: rpfilter/fib: Populate flowic_l3mdev field Use the introduced field for correct operation with VRF devices instead of conditionally overwriting flowic_oif. This is a partial revert of commit b575b24b8eee3 ("netfilter: Fix rpfilter dropping vrf packets by mistake"), implementing a simpler solution. Signed-off-by: Phil Sutter <phil@nwl.cc> Reviewed-by: David Ahern <dsahern@kernel.org> Reviewed-by: Guillaume Nault <gnault@redhat.com> Signed-off-by: Florian Westphal <fw@strlen.de> --- net/ipv4/netfilter/ipt_rpfilter.c | 2 +- net/ipv4/netfilter/nft_fib_ipv4.c | 2 +- net/ipv6/netfilter/ip6t_rpfilter.c | 9 +++------ net/ipv6/netfilter/nft_fib_ipv6.c | 5 ++--- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 8183bbcabb4af..ff85db52b2e56 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -77,7 +77,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? 
skb->mark : 0; flow.flowi4_tos = iph->tos & IPTOS_RT_MASK; flow.flowi4_scope = RT_SCOPE_UNIVERSE; - flow.flowi4_oif = l3mdev_master_ifindex_rcu(xt_in(par)); + flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par)); return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert; } diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 7ade04ff972d7..e886147eed11d 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -84,7 +84,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, oif = NULL; if (priv->flags & NFTA_FIB_F_IIF) - fl4.flowi4_oif = l3mdev_master_ifindex_rcu(oif); + fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(oif); if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index d800801a5dd27..69d86b040a6af 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -37,6 +37,7 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, bool ret = false; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, + .flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev), .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK, .flowi6_proto = iph->nexthdr, .daddr = iph->saddr, @@ -55,9 +56,7 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, if (rpfilter_addr_linklocal(&iph->saddr)) { lookup_flags |= RT6_LOOKUP_F_IFACE; fl6.flowi6_oif = dev->ifindex; - /* Set flowi6_oif for vrf devices to lookup route in l3mdev domain. 
*/ - } else if (netif_is_l3_master(dev) || netif_is_l3_slave(dev) || - (flags & XT_RPFILTER_LOOSE) == 0) + } else if ((flags & XT_RPFILTER_LOOSE) == 0) fl6.flowi6_oif = dev->ifindex; rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags); @@ -72,9 +71,7 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, goto out; } - if (rt->rt6i_idev->dev == dev || - l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == dev->ifindex || - (flags & XT_RPFILTER_LOOSE)) + if (rt->rt6i_idev->dev == dev || (flags & XT_RPFILTER_LOOSE)) ret = true; out: ip6_rt_put(rt); diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 1d7e520d9966c..91faac610e03d 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -41,9 +41,8 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, if (ipv6_addr_type(&fl6->daddr) & IPV6_ADDR_LINKLOCAL) { lookup_flags |= RT6_LOOKUP_F_IFACE; fl6->flowi6_oif = get_ifindex(dev ? dev : pkt->skb->dev); - } else if ((priv->flags & NFTA_FIB_F_IIF) && - (netif_is_l3_master(dev) || netif_is_l3_slave(dev))) { - fl6->flowi6_oif = dev->ifindex; + } else if (priv->flags & NFTA_FIB_F_IIF) { + fl6->flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev); } if (ipv6_addr_type(&fl6->saddr) & IPV6_ADDR_UNICAST) -- GitLab From 6a91e7270936c5a504af7e0a197d7021e169d281 Mon Sep 17 00:00:00 2001 From: Phil Sutter <phil@nwl.cc> Date: Wed, 5 Oct 2022 17:34:36 +0200 Subject: [PATCH 1820/2223] selftests: netfilter: Fix nft_fib.sh for all.rp_filter=1 If net.ipv4.conf.all.rp_filter is set, it overrides the per-interface setting and thus defeats the fix from bbe4c0896d250 ("selftests: netfilter: disable rp_filter on router"). Unset it as well to cover that case. 
Fixes: bbe4c0896d250 ("selftests: netfilter: disable rp_filter on router") Signed-off-by: Phil Sutter <phil@nwl.cc> Signed-off-by: Florian Westphal <fw@strlen.de> --- tools/testing/selftests/netfilter/nft_fib.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/netfilter/nft_fib.sh b/tools/testing/selftests/netfilter/nft_fib.sh index fd76b69635a44..dff476e45e772 100755 --- a/tools/testing/selftests/netfilter/nft_fib.sh +++ b/tools/testing/selftests/netfilter/nft_fib.sh @@ -188,6 +188,7 @@ test_ping() { ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null +ip netns exec ${nsrouter} sysctl net.ipv4.conf.all.rp_filter=0 > /dev/null ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.rp_filter=0 > /dev/null sleep 3 -- GitLab From 3a732b46736cd8a29092e4b0b1a9ba83e672bf89 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr <jk@codeconstruct.com.au> Date: Wed, 12 Oct 2022 10:08:51 +0800 Subject: [PATCH 1821/2223] mctp: prevent double key removal and unref Currently, we have a bug where a simultaneous DROPTAG ioctl and socket close may race, as we attempt to remove a key from lists twice, and perform an unref for each removal operation. This may result in a uaf when we attempt the second unref. This change fixes the race by making __mctp_key_remove tolerant to being called on a key that has already been removed from the socket/net lists, and only performs the unref when we do the actual remove. We also need to hold the list lock on the ioctl cleanup path. This fix is based on a bug report and comprehensive analysis from butt3rflyh4ck <butterflyhuangxx@gmail.com>, found via syzkaller. 
Cc: stable@vger.kernel.org Fixes: 63ed1aab3d40 ("mctp: Add SIOCMCTP{ALLOC,DROP}TAG ioctls for tag control") Reported-by: butt3rflyh4ck <butterflyhuangxx@gmail.com> Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/mctp/af_mctp.c | 23 ++++++++++++++++------- net/mctp/route.c | 10 +++++----- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index c2fc2a7b25285..b6b5e496fa403 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -295,11 +295,12 @@ __must_hold(&net->mctp.keys_lock) mctp_dev_release_key(key->dev, key); spin_unlock_irqrestore(&key->lock, flags); - hlist_del(&key->hlist); - hlist_del(&key->sklist); - - /* unref for the lists */ - mctp_key_unref(key); + if (!hlist_unhashed(&key->hlist)) { + hlist_del_init(&key->hlist); + hlist_del_init(&key->sklist); + /* unref for the lists */ + mctp_key_unref(key); + } kfree_skb(skb); } @@ -373,9 +374,17 @@ static int mctp_ioctl_alloctag(struct mctp_sock *msk, unsigned long arg) ctl.tag = tag | MCTP_TAG_OWNER | MCTP_TAG_PREALLOC; if (copy_to_user((void __user *)arg, &ctl, sizeof(ctl))) { - spin_lock_irqsave(&key->lock, flags); - __mctp_key_remove(key, net, flags, MCTP_TRACE_KEY_DROPPED); + unsigned long fl2; + /* Unwind our key allocation: the keys list lock needs to be + * taken before the individual key locks, and we need a valid + * flags value (fl2) to pass to __mctp_key_remove, hence the + * second spin_lock_irqsave() rather than a plain spin_lock(). 
+ */ + spin_lock_irqsave(&net->mctp.keys_lock, flags); + spin_lock_irqsave(&key->lock, fl2); + __mctp_key_remove(key, net, fl2, MCTP_TRACE_KEY_DROPPED); mctp_key_unref(key); + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); return -EFAULT; } diff --git a/net/mctp/route.c b/net/mctp/route.c index 3b24b8d18b5b5..2155f15a074cd 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -228,12 +228,12 @@ __releases(&key->lock) if (!key->manual_alloc) { spin_lock_irqsave(&net->mctp.keys_lock, flags); - hlist_del(&key->hlist); - hlist_del(&key->sklist); + if (!hlist_unhashed(&key->hlist)) { + hlist_del_init(&key->hlist); + hlist_del_init(&key->sklist); + mctp_key_unref(key); + } spin_unlock_irqrestore(&net->mctp.keys_lock, flags); - - /* unref for the lists */ - mctp_key_unref(key); } /* and one for the local reference */ -- GitLab From b7085b6ffe71ac2668f27a2ced6a1e516f66f8c1 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com> Date: Sun, 9 Oct 2022 10:06:42 +0800 Subject: [PATCH 1822/2223] ring-buffer: Fix kernel-doc kernel/trace/ring_buffer.c:895: warning: expecting prototype for ring_buffer_nr_pages_dirty(). Prototype was for ring_buffer_nr_dirty_pages() instead. kernel/trace/ring_buffer.c:5313: warning: expecting prototype for ring_buffer_reset_cpu(). Prototype was for ring_buffer_reset_online_cpus() instead. kernel/trace/ring_buffer.c:5382: warning: expecting prototype for rind_buffer_empty(). Prototype was for ring_buffer_empty() instead. 
Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2340 Link: https://lkml.kernel.org/r/20221009020642.12506-1-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot <abaci@linux.alibaba.com> Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c3f354cfc5ba1..199759c735196 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -885,7 +885,7 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu) } /** - * ring_buffer_nr_pages_dirty - get the number of used pages in the ring buffer + * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from * @cpu: The cpu of the ring_buffer to get the number of pages from * @@ -5305,7 +5305,7 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); /** - * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer + * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of * @cpu: The CPU buffer to be reset */ @@ -5375,7 +5375,7 @@ void ring_buffer_reset(struct trace_buffer *buffer) EXPORT_SYMBOL_GPL(ring_buffer_reset); /** - * rind_buffer_empty - is the ring buffer empty? + * ring_buffer_empty - is the ring buffer empty? 
* @buffer: The ring buffer to test */ bool ring_buffer_empty(struct trace_buffer *buffer) -- GitLab From e237506238352f3bfa9cf3983cdab873e35651eb Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Wed, 12 Oct 2022 13:53:34 +1000 Subject: [PATCH 1823/2223] powerpc/32: fix syscall wrappers with 64-bit arguments of unaligned register-pairs powerpc 32-bit system call (and function) calling convention for 64-bit arguments requires the next available odd-pair (two sequential registers with the first being odd-numbered) from the standard register argument allocation. The first argument register is r3, so a 64-bit argument that appears at an even position in the argument list must skip a register (unless there were preceding 64-bit arguments, which might throw things off). This requires non-standard compat definitions to deal with the holes in the argument register allocation. With pt_regs syscall wrappers which use a standard mapper to map pt_regs GPRs to function arguments, 32-bit kernels hit the same basic problem, the standard definitions don't cope with the unused argument registers. Fix this by having 32-bit kernels share those syscall definitions with compat. Thanks to Jason for spending a lot of time finding and bisecting this and developing a trivial reproducer. The perfect bug report. Reported-by: Jason A. 
Donenfeld <Jason@zx2c4.com> Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Fixes: 7e92e01b72452 ("powerpc: Provide syscall wrapper") Reviewed-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221012035335.866440-1-npiggin@gmail.com --- arch/powerpc/include/asm/syscalls.h | 16 ++++++++++ arch/powerpc/kernel/Makefile | 1 + arch/powerpc/kernel/sys_ppc32.c | 38 ++++++++++++++++++------ arch/powerpc/kernel/syscalls/syscall.tbl | 16 ++++++---- 4 files changed, 56 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index 9840d572da553..a1142496cd588 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -89,6 +89,22 @@ long compat_sys_rt_sigreturn(void); * responsible for combining parameter pairs. */ +#ifdef CONFIG_PPC32 +long sys_ppc_pread64(unsigned int fd, + char __user *ubuf, compat_size_t count, + u32 reg6, u32 pos1, u32 pos2); +long sys_ppc_pwrite64(unsigned int fd, + const char __user *ubuf, compat_size_t count, + u32 reg6, u32 pos1, u32 pos2); +long sys_ppc_readahead(int fd, u32 r4, + u32 offset1, u32 offset2, u32 count); +long sys_ppc_truncate64(const char __user *path, u32 reg4, + unsigned long len1, unsigned long len2); +long sys_ppc_ftruncate64(unsigned int fd, u32 reg4, + unsigned long len1, unsigned long len2); +long sys_ppc32_fadvise64(int fd, u32 unused, u32 offset1, u32 offset2, + size_t len, int advice); +#endif #ifdef CONFIG_COMPAT long compat_sys_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index ee2d76cb31878..9b6146056e48b 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -73,6 +73,7 @@ obj-y := cputable.o syscalls.o \ obj-y += ptrace/ obj-$(CONFIG_PPC64) += setup_64.o irq_64.o\ paca.o nvram_64.o note.o +obj-$(CONFIG_PPC32) += sys_ppc32.o 
obj-$(CONFIG_COMPAT) += sys_ppc32.o signal_32.o obj-$(CONFIG_VDSO32) += vdso32_wrapper.o obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index dcc3c9fd4cfd1..1ab4a4d95abaf 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -1,13 +1,23 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * sys_ppc32.c: Conversion between 32bit and 64bit native syscalls. + * sys_ppc32.c: 32-bit system calls with complex calling conventions. * * Copyright (C) 2001 IBM * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) * - * These routines maintain argument size conversion between 32bit and 64bit - * environment. + * 32-bit system calls with 64-bit arguments pass those in register pairs. + * This must be specially dealt with on 64-bit kernels. The compat_arg_u64_dual + * in generic compat syscalls is not always usable because the register + * pairing is constrained depending on preceding arguments. + * + * An analogous problem exists on 32-bit kernels with ARCH_HAS_SYSCALL_WRAPPER, + * the defined system call functions take the pt_regs as an argument, and there + * is a mapping macro which maps registers to arguments + * (SC_POWERPC_REGS_TO_ARGS) which also does not deal with these 64-bit + * arguments. + * + * This file contains these system calls. 
*/ #include <linux/kernel.h> @@ -47,7 +57,17 @@ #include <asm/syscalls.h> #include <asm/switch_to.h> -COMPAT_SYSCALL_DEFINE6(ppc_pread64, +#ifdef CONFIG_PPC32 +#define PPC32_SYSCALL_DEFINE4 SYSCALL_DEFINE4 +#define PPC32_SYSCALL_DEFINE5 SYSCALL_DEFINE5 +#define PPC32_SYSCALL_DEFINE6 SYSCALL_DEFINE6 +#else +#define PPC32_SYSCALL_DEFINE4 COMPAT_SYSCALL_DEFINE4 +#define PPC32_SYSCALL_DEFINE5 COMPAT_SYSCALL_DEFINE5 +#define PPC32_SYSCALL_DEFINE6 COMPAT_SYSCALL_DEFINE6 +#endif + +PPC32_SYSCALL_DEFINE6(ppc_pread64, unsigned int, fd, char __user *, ubuf, compat_size_t, count, u32, reg6, u32, pos1, u32, pos2) @@ -55,7 +75,7 @@ COMPAT_SYSCALL_DEFINE6(ppc_pread64, return ksys_pread64(fd, ubuf, count, merge_64(pos1, pos2)); } -COMPAT_SYSCALL_DEFINE6(ppc_pwrite64, +PPC32_SYSCALL_DEFINE6(ppc_pwrite64, unsigned int, fd, const char __user *, ubuf, compat_size_t, count, u32, reg6, u32, pos1, u32, pos2) @@ -63,28 +83,28 @@ COMPAT_SYSCALL_DEFINE6(ppc_pwrite64, return ksys_pwrite64(fd, ubuf, count, merge_64(pos1, pos2)); } -COMPAT_SYSCALL_DEFINE5(ppc_readahead, +PPC32_SYSCALL_DEFINE5(ppc_readahead, int, fd, u32, r4, u32, offset1, u32, offset2, u32, count) { return ksys_readahead(fd, merge_64(offset1, offset2), count); } -COMPAT_SYSCALL_DEFINE4(ppc_truncate64, +PPC32_SYSCALL_DEFINE4(ppc_truncate64, const char __user *, path, u32, reg4, unsigned long, len1, unsigned long, len2) { return ksys_truncate(path, merge_64(len1, len2)); } -COMPAT_SYSCALL_DEFINE4(ppc_ftruncate64, +PPC32_SYSCALL_DEFINE4(ppc_ftruncate64, unsigned int, fd, u32, reg4, unsigned long, len1, unsigned long, len2) { return ksys_ftruncate(fd, merge_64(len1, len2)); } -COMPAT_SYSCALL_DEFINE6(ppc32_fadvise64, +PPC32_SYSCALL_DEFINE6(ppc32_fadvise64, int, fd, u32, unused, u32, offset1, u32, offset2, size_t, len, int, advice) { diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 2bca64f96164a..e9e0df4f9a61a 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ 
b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -228,8 +228,10 @@ 176 64 rt_sigtimedwait sys_rt_sigtimedwait 177 nospu rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 178 nospu rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend -179 common pread64 sys_pread64 compat_sys_ppc_pread64 -180 common pwrite64 sys_pwrite64 compat_sys_ppc_pwrite64 +179 32 pread64 sys_ppc_pread64 compat_sys_ppc_pread64 +179 64 pread64 sys_pread64 +180 32 pwrite64 sys_ppc_pwrite64 compat_sys_ppc_pwrite64 +180 64 pwrite64 sys_pwrite64 181 common chown sys_chown 182 common getcwd sys_getcwd 183 common capget sys_capget @@ -242,10 +244,11 @@ 188 common putpmsg sys_ni_syscall 189 nospu vfork sys_vfork 190 common ugetrlimit sys_getrlimit compat_sys_getrlimit -191 common readahead sys_readahead compat_sys_ppc_readahead +191 32 readahead sys_ppc_readahead compat_sys_ppc_readahead +191 64 readahead sys_readahead 192 32 mmap2 sys_mmap2 compat_sys_mmap2 -193 32 truncate64 sys_truncate64 compat_sys_ppc_truncate64 -194 32 ftruncate64 sys_ftruncate64 compat_sys_ppc_ftruncate64 +193 32 truncate64 sys_ppc_truncate64 compat_sys_ppc_truncate64 +194 32 ftruncate64 sys_ppc_ftruncate64 compat_sys_ppc_ftruncate64 195 32 stat64 sys_stat64 196 32 lstat64 sys_lstat64 197 32 fstat64 sys_fstat64 @@ -288,7 +291,8 @@ 230 common io_submit sys_io_submit compat_sys_io_submit 231 common io_cancel sys_io_cancel 232 nospu set_tid_address sys_set_tid_address -233 common fadvise64 sys_fadvise64 compat_sys_ppc32_fadvise64 +233 32 fadvise64 sys_ppc32_fadvise64 compat_sys_ppc32_fadvise64 +233 64 fadvise64 sys_fadvise64 234 nospu exit_group sys_exit_group 235 nospu lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie 236 common epoll_create sys_epoll_create -- GitLab From 37a3a3278516eae364006d5597f2b9d40580a450 Mon Sep 17 00:00:00 2001 From: ChiaEn Wu <chiaen_wu@richtek.com> Date: Thu, 6 Oct 2022 11:16:13 +0800 Subject: [PATCH 1824/2223] dt-bindings: leds: mt6370: Fix MT6370 LED indicator DT warning Add 
'$ref' and 'unevaluatedProperties: false' in 'multi-led', and remove unused 'allOf' property. Fixes: 440c57dabb45 ("dt-bindings: leds: mt6370: Add MediaTek MT6370 current sink type LED indicator") Signed-off-by: ChiaEn Wu <chiaen_wu@richtek.com> Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org> Link: https://lore.kernel.org/r/435f6888ebc20c5abae63eb9cb3a055b60db2ed1.1665050503.git.chiaen_wu@richtek.com Signed-off-by: Rob Herring <robh@kernel.org> --- .../devicetree/bindings/leds/mediatek,mt6370-indicator.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/leds/mediatek,mt6370-indicator.yaml b/Documentation/devicetree/bindings/leds/mediatek,mt6370-indicator.yaml index 204b103ffc2c1..16b3abc2af3a7 100644 --- a/Documentation/devicetree/bindings/leds/mediatek,mt6370-indicator.yaml +++ b/Documentation/devicetree/bindings/leds/mediatek,mt6370-indicator.yaml @@ -13,9 +13,6 @@ description: | This module is part of the MT6370 MFD device. Add MT6370 LED driver include 4-channel RGB LED support Register/PWM/Breath Mode -allOf: - - $ref: leds-class-multicolor.yaml# - properties: compatible: const: mediatek,mt6370-indicator @@ -29,6 +26,8 @@ properties: patternProperties: "^multi-led@[0-3]$": type: object + $ref: leds-class-multicolor.yaml# + unevaluatedProperties: false properties: reg: -- GitLab From 6127dab7a126387744290101514d31b79bb62b8e Mon Sep 17 00:00:00 2001 From: ChiaEn Wu <chiaen_wu@richtek.com> Date: Wed, 12 Oct 2022 15:08:14 +0800 Subject: [PATCH 1825/2223] dt-bindings: mfd: mt6370: fix the interrupt order of the charger in the example Fix the interrupt order of the charger in the binding example. 
Fixes: 76f52f815f1a ("dt-bindings: mfd: Add MediaTek MT6370") Signed-off-by: ChiaEn Wu <chiaen_wu@richtek.com> Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org> Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com> Link: https://lore.kernel.org/r/fcf4e7e7594070a8698dc0d4b96e031bcaa9b3a3.1665585952.git.chiaen_wu@richtek.com Signed-off-by: Rob Herring <robh@kernel.org> --- Documentation/devicetree/bindings/mfd/mediatek,mt6370.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/mfd/mediatek,mt6370.yaml b/Documentation/devicetree/bindings/mfd/mediatek,mt6370.yaml index 250484d59ecdb..5644882db2e8a 100644 --- a/Documentation/devicetree/bindings/mfd/mediatek,mt6370.yaml +++ b/Documentation/devicetree/bindings/mfd/mediatek,mt6370.yaml @@ -139,8 +139,8 @@ examples: charger { compatible = "mediatek,mt6370-charger"; - interrupts = <48>, <68>, <6>; - interrupt-names = "attach_i", "uvp_d_evt", "mivr"; + interrupts = <68>, <48>, <6>; + interrupt-names = "uvp_d_evt", "attach_i", "mivr"; io-channels = <&mt6370_adc MT6370_CHAN_IBUS>; mt6370_otg_vbus: usb-otg-vbus-regulator { -- GitLab From 0811b9e4530d7c46542a8993ce6b725d042c6154 Mon Sep 17 00:00:00 2001 From: Aurabindo Pillai <aurabindo.pillai@amd.com> Date: Thu, 6 Oct 2022 17:17:40 -0400 Subject: [PATCH 1826/2223] drm/amd/display: Add HUBP surface flip interrupt handler Add the hubp surface flip handler. This fixes some flip timeout issues. 
Acked-by: Alex Deucher <alexander.deucher@amd.com> Reviewed-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com> Signed-off-by: Aurabindo Pillai <aurabindo.pillai@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org # 6.0.x --- drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubp.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubp.c index f4b901d393ebc..ac1c6458dd55a 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubp.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubp.c @@ -181,6 +181,7 @@ static struct hubp_funcs dcn32_hubp_funcs = { .hubp_init = hubp3_init, .set_unbounded_requesting = hubp31_set_unbounded_requesting, .hubp_soft_reset = hubp31_soft_reset, + .hubp_set_flip_int = hubp1_set_flip_int, .hubp_in_blank = hubp1_in_blank, .hubp_update_force_pstate_disallow = hubp32_update_force_pstate_disallow, .phantom_hubp_post_enable = hubp32_phantom_hubp_post_enable, -- GitLab From 06267eb2decaa6baac81bbd882265a8e7782dba4 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt <palmer@rivosinc.com> Date: Thu, 28 Jul 2022 14:07:15 -0700 Subject: [PATCH 1827/2223] doc: RISC-V: Document that misaligned accesses are supported The RISC-V ISA manual used to mandate that misaligned accesses were supported in user mode, but that requirement was removed in 2018 via riscv-isa-manual commit 61cadb9 ("Provide new description of misaligned load/store behavior compatible with privileged architecture."). Since the Linux uABI was already frozen at that point it's just been demoted to part of the uABI, but that was never written down. 
Link: https://lore.kernel.org/r/20220728210715.17214-1-palmer@rivosinc.com Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- Documentation/riscv/index.rst | 1 + Documentation/riscv/uabi.rst | 6 ++++++ 2 files changed, 7 insertions(+) create mode 100644 Documentation/riscv/uabi.rst diff --git a/Documentation/riscv/index.rst b/Documentation/riscv/index.rst index e23b876ad6ebb..2e5b18fbb1451 100644 --- a/Documentation/riscv/index.rst +++ b/Documentation/riscv/index.rst @@ -8,6 +8,7 @@ RISC-V architecture boot-image-header vm-layout patch-acceptance + uabi features diff --git a/Documentation/riscv/uabi.rst b/Documentation/riscv/uabi.rst new file mode 100644 index 0000000000000..21a82cfb6c4dd --- /dev/null +++ b/Documentation/riscv/uabi.rst @@ -0,0 +1,6 @@ +.. SPDX-License-Identifier: GPL-2.0 + +RISC-V Linux User ABI +===================== + +Misaligned accesses are supported in userspace, but they may perform poorly. -- GitLab From 0e5d5ae837c8ce04d2ddb874ec5f920118bd9d31 Mon Sep 17 00:00:00 2001 From: D Scott Phillips <scott@os.amperecomputing.com> Date: Mon, 10 Oct 2022 19:21:40 -0700 Subject: [PATCH 1828/2223] arm64: Add AMPERE1 to the Spectre-BHB affected list Per AmpereOne erratum AC03_CPU_12, "Branch history may allow control of speculative execution across software contexts," the AMPERE1 core needs the bhb clearing loop to mitigate Spectre-BHB, with a loop iteration count of 11. 
Signed-off-by: D Scott Phillips <scott@os.amperecomputing.com> Link: https://lore.kernel.org/r/20221011022140.432370-1-scott@os.amperecomputing.com Reviewed-by: James Morse <james.morse@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> --- arch/arm64/include/asm/cputype.h | 4 ++++ arch/arm64/kernel/proton-pack.c | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 8aa0d276a6362..abc418650fec0 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -60,6 +60,7 @@ #define ARM_CPU_IMP_FUJITSU 0x46 #define ARM_CPU_IMP_HISI 0x48 #define ARM_CPU_IMP_APPLE 0x61 +#define ARM_CPU_IMP_AMPERE 0xC0 #define ARM_CPU_PART_AEM_V8 0xD0F #define ARM_CPU_PART_FOUNDATION 0xD00 @@ -123,6 +124,8 @@ #define APPLE_CPU_PART_M1_ICESTORM_MAX 0x028 #define APPLE_CPU_PART_M1_FIRESTORM_MAX 0x029 +#define AMPERE_CPU_PART_AMPERE1 0xAC3 + #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) #define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) @@ -172,6 +175,7 @@ #define MIDR_APPLE_M1_FIRESTORM_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_PRO) #define MIDR_APPLE_M1_ICESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM_MAX) #define MIDR_APPLE_M1_FIRESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_MAX) +#define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1) /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ #define MIDR_FUJITSU_ERRATUM_010001 MIDR_FUJITSU_A64FX diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index a8ea1637b1379..bfce41c2a53b3 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -868,6 +868,10 @@ u8 spectre_bhb_loop_affected(int scope) 
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), {}, }; + static const struct midr_range spectre_bhb_k11_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + {}, + }; static const struct midr_range spectre_bhb_k8_list[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), @@ -878,6 +882,8 @@ u8 spectre_bhb_loop_affected(int scope) k = 32; else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list)) k = 24; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k11_list)) + k = 11; else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k8_list)) k = 8; -- GitLab From 4f001a21080ff2e2f0e1c3692f5e119aedbb3bc1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada <masahiroy@kernel.org> Date: Wed, 5 Oct 2022 01:29:03 +0900 Subject: [PATCH 1829/2223] Kconfig.debug: simplify the dependency of DEBUG_INFO_DWARF4/5 Commit c0a5c81ca9be ("Kconfig.debug: drop GCC 5+ version check for DWARF5") could have cleaned up the code a bit more. "CC_IS_CLANG &&" is unneeded. No functional change is intended. Signed-off-by: Masahiro Yamada <masahiroy@kernel.org> Reviewed-by: Nathan Chancellor <nathan@kernel.org> --- lib/Kconfig.debug | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3761118d1879a..72f01edec00e4 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -264,7 +264,7 @@ config DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT config DEBUG_INFO_DWARF4 bool "Generate DWARF Version 4 debuginfo" select DEBUG_INFO - depends on !CC_IS_CLANG || (CC_IS_CLANG && (AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502))) + depends on !CC_IS_CLANG || AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502) help Generate DWARF v4 debug info. This requires gcc 4.5+, binutils 2.35.2 if using clang without clang's integrated assembler, and gdb 7.0+. 
@@ -276,7 +276,7 @@ config DEBUG_INFO_DWARF4 config DEBUG_INFO_DWARF5 bool "Generate DWARF Version 5 debuginfo" select DEBUG_INFO - depends on !CC_IS_CLANG || (CC_IS_CLANG && (AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502))) + depends on !CC_IS_CLANG || AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502) help Generate DWARF v5 debug info. Requires binutils 2.35.2, gcc 5.0+ (gcc 5.0+ accepts the -gdwarf-5 flag but only had partial support for some -- GitLab From bb1435f3f575b5213eaf27434efa3971f51c01de Mon Sep 17 00:00:00 2001 From: Masahiro Yamada <masahiroy@kernel.org> Date: Wed, 5 Oct 2022 01:29:04 +0900 Subject: [PATCH 1830/2223] Kconfig.debug: add toolchain checks for DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT does not give explicit -gdwarf-* flag. The actual DWARF version is up to the toolchain. The combination of GCC and GAS works fine, and Clang with the integrated assembler is good too. The combination of Clang and GAS is tricky, but at least, the -g flag works for Clang <=13, which defaults to DWARF v4. Clang 14 switched its default to DWARF v5. Now, CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT has the same issue as addressed by commit 98cd6f521f10 ("Kconfig: allow explicit opt in to DWARF v5"). CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y for Clang >= 14 and GAS < 2.35 produces a ton of errors like follows: /tmp/main-c2741c.s: Assembler messages: /tmp/main-c2741c.s:109: Error: junk at end of line, first unrecognized character is `"' /tmp/main-c2741c.s:109: Error: file number less than one Add 'depends on' to check toolchains. 
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org> Reviewed-by: Nathan Chancellor <nathan@kernel.org> --- lib/Kconfig.debug | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 72f01edec00e4..db8d9271cabf5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -253,6 +253,7 @@ config DEBUG_INFO_NONE config DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT bool "Rely on the toolchain's implicit default DWARF version" select DEBUG_INFO + depends on !CC_IS_CLANG || AS_IS_LLVM || CLANG_VERSION < 140000 || (AS_IS_GNU && AS_VERSION >= 23502) help The implicit default version of DWARF debug info produced by a toolchain changes over time. -- GitLab From fc8c2d8ff20651f887e574767533d1176e3a479c Mon Sep 17 00:00:00 2001 From: Zack Rusin <zackr@vmware.com> Date: Thu, 6 Oct 2022 10:33:19 -0400 Subject: [PATCH 1831/2223] kbuild: Stop including vmlinux.bz2 in the rpm's vmlinux.bz2 was added to the rpm packages in 2009 in the fc370ecfdb37 ("kbuild: add vmlinux to kernel rpm") but seemingly hasn't been used since. Originally this should have been split up in a separate debugging package because it massively increases the size of the generated rpm's e.g. kernel rpm built using binrpm-pkg on Fedora 36 default 5.19.8 kernel config and localmodconfig is ~255MB with vmlinux.bz2 and only ~65MB without it. Make the kernel built rpms about 4x smaller by not including the unused vmlinux.bz2 in them. 
Signed-off-by: Zack Rusin <zackr@vmware.com> Signed-off-by: Masahiro Yamada <masahiroy@kernel.org> --- scripts/package/mkspec | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/package/mkspec b/scripts/package/mkspec index c920c1b18e7ad..70392fd2fd29c 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -97,8 +97,6 @@ $M $MAKE %{?_smp_mflags} INSTALL_MOD_PATH=%{buildroot} modules_install $MAKE %{?_smp_mflags} INSTALL_HDR_PATH=%{buildroot}/usr headers_install cp System.map %{buildroot}/boot/System.map-$KERNELRELEASE cp .config %{buildroot}/boot/config-$KERNELRELEASE - bzip2 -9 --keep vmlinux - mv vmlinux.bz2 %{buildroot}/boot/vmlinux-$KERNELRELEASE.bz2 $S$M rm -f %{buildroot}/lib/modules/$KERNELRELEASE/build $S$M rm -f %{buildroot}/lib/modules/$KERNELRELEASE/source $S$M mkdir -p %{buildroot}/usr/src/kernels/$KERNELRELEASE -- GitLab From f1d3cbfaafc10464550c6d3a125f4fc802bbaed5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" <rostedt@goodmis.org> Date: Wed, 12 Oct 2022 06:40:56 -0400 Subject: [PATCH 1832/2223] tracing: Move duplicate code of trace_kprobe/eprobe.c into header The functions: fetch_store_strlen_user() fetch_store_strlen() fetch_store_string_user() fetch_store_string() are identical in both trace_kprobe.c and trace_eprobe.c. Move them into a new header file trace_probe_kernel.h to share it. This code will later be used by the synthetic events as well. Marked for stable as a fix for a crash in synthetic events requires it. 
Link: https://lkml.kernel.org/r/20221012104534.467668078@goodmis.org Cc: stable@vger.kernel.org Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Tom Zanussi <zanussi@kernel.org> Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> Reviewed-by: Tom Zanussi <zanussi@kernel.org> Fixes: bd82631d7ccdc ("tracing: Add support for dynamic strings to synthetic events") Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/trace_eprobe.c | 60 ++----------------- kernel/trace/trace_kprobe.c | 60 ++----------------- kernel/trace/trace_probe_kernel.h | 96 +++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 110 deletions(-) create mode 100644 kernel/trace/trace_probe_kernel.h diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index c08bde9871ec5..5dd0617e5df6d 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -16,6 +16,7 @@ #include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" +#include "trace_probe_kernel.h" #define EPROBE_EVENT_SYSTEM "eprobes" @@ -456,29 +457,14 @@ NOKPROBE_SYMBOL(process_fetch_insn) static nokprobe_inline int fetch_store_strlen_user(unsigned long addr) { - const void __user *uaddr = (__force const void __user *)addr; - - return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); + return kern_fetch_store_strlen_user(addr); } /* Return the length of string -- including null terminal byte */ static nokprobe_inline int fetch_store_strlen(unsigned long addr) { - int ret, len = 0; - u8 c; - -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if (addr < TASK_SIZE) - return fetch_store_strlen_user(addr); -#endif - - do { - ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - - return (ret < 0) ? 
ret : len; + return kern_fetch_store_strlen(addr); } /* @@ -488,21 +474,7 @@ fetch_store_strlen(unsigned long addr) static nokprobe_inline int fetch_store_string_user(unsigned long addr, void *dest, void *base) { - const void __user *uaddr = (__force const void __user *)addr; - int maxlen = get_loc_len(*(u32 *)dest); - void *__dest; - long ret; - - if (unlikely(!maxlen)) - return -ENOMEM; - - __dest = get_loc_data(dest, base); - - ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); - - return ret; + return kern_fetch_store_string_user(addr, dest, base); } /* @@ -512,29 +484,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) static nokprobe_inline int fetch_store_string(unsigned long addr, void *dest, void *base) { - int maxlen = get_loc_len(*(u32 *)dest); - void *__dest; - long ret; - -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if ((unsigned long)addr < TASK_SIZE) - return fetch_store_string_user(addr, dest, base); -#endif - - if (unlikely(!maxlen)) - return -ENOMEM; - - __dest = get_loc_data(dest, base); - - /* - * Try to get string again, since the string can be changed while - * probing. 
- */ - ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); - - return ret; + return kern_fetch_store_string(addr, dest, base); } static nokprobe_inline int diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 23f7f0ec4f4cf..5a75b039e5860 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -20,6 +20,7 @@ #include "trace_kprobe_selftest.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" +#include "trace_probe_kernel.h" #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 @@ -1223,29 +1224,14 @@ static const struct file_operations kprobe_profile_ops = { static nokprobe_inline int fetch_store_strlen_user(unsigned long addr) { - const void __user *uaddr = (__force const void __user *)addr; - - return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); + return kern_fetch_store_strlen_user(addr); } /* Return the length of string -- including null terminal byte */ static nokprobe_inline int fetch_store_strlen(unsigned long addr) { - int ret, len = 0; - u8 c; - -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if (addr < TASK_SIZE) - return fetch_store_strlen_user(addr); -#endif - - do { - ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - - return (ret < 0) ? 
ret : len; + return kern_fetch_store_strlen(addr); } /* @@ -1255,21 +1241,7 @@ fetch_store_strlen(unsigned long addr) static nokprobe_inline int fetch_store_string_user(unsigned long addr, void *dest, void *base) { - const void __user *uaddr = (__force const void __user *)addr; - int maxlen = get_loc_len(*(u32 *)dest); - void *__dest; - long ret; - - if (unlikely(!maxlen)) - return -ENOMEM; - - __dest = get_loc_data(dest, base); - - ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); - - return ret; + return kern_fetch_store_string_user(addr, dest, base); } /* @@ -1279,29 +1251,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) static nokprobe_inline int fetch_store_string(unsigned long addr, void *dest, void *base) { - int maxlen = get_loc_len(*(u32 *)dest); - void *__dest; - long ret; - -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if ((unsigned long)addr < TASK_SIZE) - return fetch_store_string_user(addr, dest, base); -#endif - - if (unlikely(!maxlen)) - return -ENOMEM; - - __dest = get_loc_data(dest, base); - - /* - * Try to get string again, since the string can be changed while - * probing. - */ - ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); - - return ret; + return kern_fetch_store_string(addr, dest, base); } static nokprobe_inline int diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h new file mode 100644 index 0000000000000..1d43df29a1f8e --- /dev/null +++ b/kernel/trace/trace_probe_kernel.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TRACE_PROBE_KERNEL_H_ +#define __TRACE_PROBE_KERNEL_H_ + +/* + * This depends on trace_probe.h, but can not include it due to + * the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c. 
+ * Which means that any other user must include trace_probe.h before including + * this file. + */ +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +kern_fetch_store_strlen_user(unsigned long addr) +{ + const void __user *uaddr = (__force const void __user *)addr; + + return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); +} + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +kern_fetch_store_strlen(unsigned long addr) +{ + int ret, len = 0; + u8 c; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if (addr < TASK_SIZE) + return kern_fetch_store_strlen_user(addr); +#endif + + do { + ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + return (ret < 0) ? ret : len; +} + +/* + * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf + * with max length and relative data location. + */ +static nokprobe_inline int +kern_fetch_store_string_user(unsigned long addr, void *dest, void *base) +{ + const void __user *uaddr = (__force const void __user *)addr; + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); + + return ret; +} + +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. 
+ */ +static nokprobe_inline int +kern_fetch_store_string(unsigned long addr, void *dest, void *base) +{ + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)addr < TASK_SIZE) + return kern_fetch_store_string_user(addr, dest, base); +#endif + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + /* + * Try to get string again, since the string can be changed while + * probing. + */ + ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); + + return ret; +} + +#endif /* __TRACE_PROBE_KERNEL_H_ */ -- GitLab From 2e9906f84fc7c99388bb7123ade167250d50f1c0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" <rostedt@goodmis.org> Date: Wed, 12 Oct 2022 06:40:57 -0400 Subject: [PATCH 1833/2223] tracing: Add "(fault)" name injection to kernel probes Have the specific functions for kernel probes that read strings to inject the "(fault)" name directly. trace_probes.c does this too (for uprobes) but as the code to read strings are going to be used by synthetic events (and perhaps other utilities), it simplifies the code by making sure those other uses do not need to implement the "(fault)" name injection as well. 
Link: https://lkml.kernel.org/r/20221012104534.644803645@goodmis.org Cc: stable@vger.kernel.org Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Tom Zanussi <zanussi@kernel.org> Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> Reviewed-by: Tom Zanussi <zanussi@kernel.org> Fixes: bd82631d7ccdc ("tracing: Add support for dynamic strings to synthetic events") Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/trace_probe_kernel.h | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h index 1d43df29a1f8e..77dbd9ff97826 100644 --- a/kernel/trace/trace_probe_kernel.h +++ b/kernel/trace/trace_probe_kernel.h @@ -2,6 +2,8 @@ #ifndef __TRACE_PROBE_KERNEL_H_ #define __TRACE_PROBE_KERNEL_H_ +#define FAULT_STRING "(fault)" + /* * This depends on trace_probe.h, but can not include it due to * the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c. @@ -13,8 +15,16 @@ static nokprobe_inline int kern_fetch_store_strlen_user(unsigned long addr) { const void __user *uaddr = (__force const void __user *)addr; + int ret; - return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); + ret = strnlen_user_nofault(uaddr, MAX_STRING_SIZE); + /* + * strnlen_user_nofault returns zero on fault, insert the + * FAULT_STRING when that occurs. + */ + if (ret <= 0) + return strlen(FAULT_STRING) + 1; + return ret; } /* Return the length of string -- including null terminal byte */ @@ -34,7 +44,18 @@ kern_fetch_store_strlen(unsigned long addr) len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); - return (ret < 0) ? ret : len; + /* For faults, return enough to hold the FAULT_STRING */ + return (ret < 0) ? 
strlen(FAULT_STRING) + 1 : len; +} + +static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base, int len) +{ + if (ret >= 0) { + *(u32 *)dest = make_data_loc(ret, __dest - base); + } else { + strscpy(__dest, FAULT_STRING, len); + ret = strlen(__dest) + 1; + } } /* @@ -55,8 +76,7 @@ kern_fetch_store_string_user(unsigned long addr, void *dest, void *base) __dest = get_loc_data(dest, base); ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); + set_data_loc(ret, dest, __dest, base, maxlen); return ret; } @@ -87,8 +107,7 @@ kern_fetch_store_string(unsigned long addr, void *dest, void *base) * probing. */ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); + set_data_loc(ret, dest, __dest, base, maxlen); return ret; } -- GitLab From 0934ae9977c27133449b6dd8c6213970e7eece38 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" <rostedt@goodmis.org> Date: Wed, 12 Oct 2022 06:40:58 -0400 Subject: [PATCH 1834/2223] tracing: Fix reading strings from synthetic events The following commands caused a crash: # cd /sys/kernel/tracing # echo 's:open char file[]' > dynamic_events # echo 'hist:keys=common_pid:file=filename:onchange($file).trace(open,$file)' > events/syscalls/sys_enter_openat/trigger' # echo 1 > events/synthetic/open/enable BOOM! The problem is that the synthetic event field "char file[]" will read the value given to it as a string without any memory checks to make sure the address is valid. The above example will pass in the user space address and the synthetic event code will happily call strlen() on it and then strscpy() where either one will cause an oops when accessing user space addresses. Use the helper functions from trace_kprobe and trace_eprobe that can read strings safely (and actually succeed when the address is from user space and the memory is mapped in). 
Now the above can show: packagekitd-1721 [000] ...2. 104.597170: open: file=/usr/lib/rpm/fileattrs/cmake.attr in:imjournal-978 [006] ...2. 104.599642: open: file=/var/lib/rsyslog/imjournal.state.tmp packagekitd-1721 [000] ...2. 104.626308: open: file=/usr/lib/rpm/fileattrs/debuginfo.attr Link: https://lkml.kernel.org/r/20221012104534.826549315@goodmis.org Cc: stable@vger.kernel.org Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Tom Zanussi <zanussi@kernel.org> Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> Reviewed-by: Tom Zanussi <zanussi@kernel.org> Fixes: bd82631d7ccdc ("tracing: Add support for dynamic strings to synthetic events") Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- kernel/trace/trace_events_synth.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 5e8c07aef071b..e310052dc83ce 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -17,6 +17,8 @@ /* for gfp flag names */ #include <linux/trace_events.h> #include <trace/events/mmflags.h> +#include "trace_probe.h" +#include "trace_probe_kernel.h" #include "trace_synth.h" @@ -409,6 +411,7 @@ static unsigned int trace_string(struct synth_trace_event *entry, { unsigned int len = 0; char *str_field; + int ret; if (is_dynamic) { u32 data_offset; @@ -417,19 +420,27 @@ static unsigned int trace_string(struct synth_trace_event *entry, data_offset += event->n_u64 * sizeof(u64); data_offset += data_size; - str_field = (char *)entry + data_offset; - - len = strlen(str_val) + 1; - strscpy(str_field, str_val, len); + len = kern_fetch_store_strlen((unsigned long)str_val); data_offset |= len << 16; *(u32 *)&entry->fields[*n_u64] = data_offset; + ret = kern_fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry); + (*n_u64)++; } else { str_field = (char *)&entry->fields[*n_u64]; - strscpy(str_field, str_val, 
STR_VAR_LEN_MAX); +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)str_val < TASK_SIZE) + ret = strncpy_from_user_nofault(str_field, str_val, STR_VAR_LEN_MAX); + else +#endif + ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX); + + if (ret < 0) + strcpy(str_field, FAULT_STRING); + (*n_u64) += STR_VAR_LEN_MAX / sizeof(u64); } @@ -462,7 +473,7 @@ static notrace void trace_event_raw_event_synth(void *__data, val_idx = var_ref_idx[field_pos]; str_val = (char *)(long)var_ref_vals[val_idx]; - len = strlen(str_val) + 1; + len = kern_fetch_store_strlen((unsigned long)str_val); fields_size += len; } -- GitLab From d4fa7d772adc02451076b3ad1f990d8b822909fc Mon Sep 17 00:00:00 2001 From: Billy Tsai <billy_tsai@aspeedtech.com> Date: Mon, 26 Sep 2022 18:51:44 +0800 Subject: [PATCH 1835/2223] i3c: master: Free the old_dyn_addr when reattach. This patch is used to free the old_dyn_addr when the caller want to reattach the device to the different dynamic address. If the old_dyn_addr is 0 the function will treat it as no old_dyn_addr is reserved on the bus. Without the patch, when the driver reattach the i3c device after setnewda the old_dyn_addr will be permanently occupied. 
Signed-off-by: Billy Tsai <billy_tsai@aspeedtech.com> Link: https://lore.kernel.org/r/20220926105145.8145-1-billy_tsai@aspeedtech.com Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/i3c/master.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index 7850287dfe7a9..6349ce0ce8357 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -1379,6 +1379,9 @@ static int i3c_master_reattach_i3c_dev(struct i3c_dev_desc *dev, i3c_bus_set_addr_slot_status(&master->bus, dev->info.dyn_addr, I3C_ADDR_SLOT_I3C_DEV); + if (old_dyn_addr) + i3c_bus_set_addr_slot_status(&master->bus, old_dyn_addr, + I3C_ADDR_SLOT_FREE); } if (master->ops->reattach_i3c_dev) { -- GitLab From 90f4a09a15239f4a819b2e90a7a0b92a75060655 Mon Sep 17 00:00:00 2001 From: Billy Tsai <billy_tsai@aspeedtech.com> Date: Mon, 26 Sep 2022 18:51:45 +0800 Subject: [PATCH 1836/2223] i3c: master: Remove the wrong place of reattach. The reattach should be used when an I3C device has its address changed. But at the place modified by this patch, the address of newdev has not changed. This wrong reattach will reserve the same address slot twice and return an unexpected -EBUSY when the bus finds the duplicate device with a different dynamic address. 
Signed-off-by: Billy Tsai <billy_tsai@aspeedtech.com> Link: https://lore.kernel.org/r/20220926105145.8145-2-billy_tsai@aspeedtech.com Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/i3c/master.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index 6349ce0ce8357..351c81a929a6c 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -1911,10 +1911,6 @@ int i3c_master_add_i3c_dev_locked(struct i3c_master_controller *master, i3c_master_free_i3c_dev(olddev); } - ret = i3c_master_reattach_i3c_dev(newdev, old_dyn_addr); - if (ret) - goto err_detach_dev; - /* * Depending on our previous state, the expected dynamic address might * differ: -- GitLab From 0759011157b0d666b02b03b986d3de005d84027e Mon Sep 17 00:00:00 2001 From: Lin Yujun <linyujun809@huawei.com> Date: Thu, 15 Sep 2022 14:52:53 +0800 Subject: [PATCH 1837/2223] rtc: stmp3xxx: Add failure handling for stmp3xxx_wdt_register() Use platform_device_put() to free platform device before print error message when platform_device_add() fails to run. 
Fixes: 1a71fb84fda6 ("rtc: stmp3xxx: add wdt-accessor function") Signed-off-by: Lin Yujun <linyujun809@huawei.com> Reviewed-by: Wolfram Sang <wsa@kernel.org> Link: https://lore.kernel.org/r/20220915065253.43668-1-linyujun809@huawei.com Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/rtc/rtc-stmp3xxx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/rtc/rtc-stmp3xxx.c b/drivers/rtc/rtc-stmp3xxx.c index 40c0f7ed36e06..aae40d20d0868 100644 --- a/drivers/rtc/rtc-stmp3xxx.c +++ b/drivers/rtc/rtc-stmp3xxx.c @@ -107,6 +107,8 @@ static void stmp3xxx_wdt_register(struct platform_device *rtc_pdev) wdt_pdev->dev.parent = &rtc_pdev->dev; wdt_pdev->dev.platform_data = &wdt_pdata; rc = platform_device_add(wdt_pdev); + if (rc) + platform_device_put(wdt_pdev); } if (rc) -- GitLab From a35a2ad2b88a66732ac442ad5f86dc49af51673f Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes <linux@rasmusvillemoes.dk> Date: Wed, 21 Sep 2022 13:46:16 +0200 Subject: [PATCH 1838/2223] rtc: isl12022: stop using deprecated devm_rtc_device_register() The comments say that devm_rtc_device_register() is deprecated and that one should instead use devm_rtc_allocate_device() and [devm_]rtc_register_device. So do that. 
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk> Link: https://lore.kernel.org/r/20220921114624.3250848-2-linux@rasmusvillemoes.dk Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/rtc/rtc-isl12022.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/rtc/rtc-isl12022.c b/drivers/rtc/rtc-isl12022.c index 79461ded1a486..2dc19061cf5f2 100644 --- a/drivers/rtc/rtc-isl12022.c +++ b/drivers/rtc/rtc-isl12022.c @@ -246,10 +246,13 @@ static int isl12022_probe(struct i2c_client *client) i2c_set_clientdata(client, isl12022); - isl12022->rtc = devm_rtc_device_register(&client->dev, - isl12022_driver.driver.name, - &isl12022_rtc_ops, THIS_MODULE); - return PTR_ERR_OR_ZERO(isl12022->rtc); + isl12022->rtc = devm_rtc_allocate_device(&client->dev); + if (IS_ERR(isl12022->rtc)) + return PTR_ERR(isl12022->rtc); + + isl12022->rtc->ops = &isl12022_rtc_ops; + + return devm_rtc_register_device(isl12022->rtc); } #ifdef CONFIG_OF -- GitLab From ca03b7a2c0b098321365f69538823d1bcc860552 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes <linux@rasmusvillemoes.dk> Date: Wed, 21 Sep 2022 13:46:17 +0200 Subject: [PATCH 1839/2223] rtc: isl12022: specify range_min and range_max The isl12022 can (only) keep track of times in the range 2000-2099. The data sheet says The calendar registers track date, month, year, and day of the week and are accurate through 2099, with automatic leap year correction. The lower bound of 2000 is obtained by simply observing that its YR register only counts from 00 through 99. 
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk> Link: https://lore.kernel.org/r/20220921114624.3250848-3-linux@rasmusvillemoes.dk Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/rtc/rtc-isl12022.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/rtc/rtc-isl12022.c b/drivers/rtc/rtc-isl12022.c index 2dc19061cf5f2..3bc197f5548f0 100644 --- a/drivers/rtc/rtc-isl12022.c +++ b/drivers/rtc/rtc-isl12022.c @@ -251,6 +251,8 @@ static int isl12022_probe(struct i2c_client *client) return PTR_ERR(isl12022->rtc); isl12022->rtc->ops = &isl12022_rtc_ops; + isl12022->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; + isl12022->rtc->range_max = RTC_TIMESTAMP_END_2099; return devm_rtc_register_device(isl12022->rtc); } -- GitLab From 43a96b9cf67770d4bb46267e1554d3d8b4cf78ac Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes <linux@rasmusvillemoes.dk> Date: Wed, 21 Sep 2022 13:46:18 +0200 Subject: [PATCH 1840/2223] rtc: isl12022: drop a dev_info() This dev_info() seems to be a debug leftover, and it would only get printed once (or, once per battery change). Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk> Link: https://lore.kernel.org/r/20220921114624.3250848-4-linux@rasmusvillemoes.dk Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/rtc/rtc-isl12022.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/rtc/rtc-isl12022.c b/drivers/rtc/rtc-isl12022.c index 3bc197f5548f0..2fc9fbefc6fc4 100644 --- a/drivers/rtc/rtc-isl12022.c +++ b/drivers/rtc/rtc-isl12022.c @@ -173,9 +173,6 @@ static int isl12022_rtc_set_time(struct device *dev, struct rtc_time *tm) /* Check if WRTC (write rtc enable) is set factory default is * 0 (not set) */ if (!(buf[0] & ISL12022_INT_WRTC)) { - dev_info(&client->dev, - "init write enable and 24 hour format\n"); - /* Set the write enable bit. 
*/ ret = isl12022_write_reg(client, ISL12022_REG_INT, -- GitLab From ca35887186b7c53f26c42aee1285ba213adb4365 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes <linux@rasmusvillemoes.dk> Date: Wed, 21 Sep 2022 13:46:19 +0200 Subject: [PATCH 1841/2223] rtc: isl12022: simplify some expressions These instances of '&client->dev' might as well be spelled 'dev', since 'client' has been computed from 'dev' via 'client = to_i2c_client(dev)'. Later patches will get rid of that local variable 'client', so remove these unnecessary references so those later patches become easier to read. Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk> Link: https://lore.kernel.org/r/20220921114624.3250848-5-linux@rasmusvillemoes.dk Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/rtc/rtc-isl12022.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/rtc/rtc-isl12022.c b/drivers/rtc/rtc-isl12022.c index 2fc9fbefc6fc4..7efe23fa74df1 100644 --- a/drivers/rtc/rtc-isl12022.c +++ b/drivers/rtc/rtc-isl12022.c @@ -112,13 +112,13 @@ static int isl12022_rtc_read_time(struct device *dev, struct rtc_time *tm) return ret; if (buf[ISL12022_REG_SR] & (ISL12022_SR_LBAT85 | ISL12022_SR_LBAT75)) { - dev_warn(&client->dev, + dev_warn(dev, "voltage dropped below %u%%, " "date and time is not reliable.\n", buf[ISL12022_REG_SR] & ISL12022_SR_LBAT85 ? 
85 : 75); } - dev_dbg(&client->dev, + dev_dbg(dev, "%s: raw data is sec=%02x, min=%02x, hr=%02x, " "mday=%02x, mon=%02x, year=%02x, wday=%02x, " "sr=%02x, int=%02x", @@ -141,7 +141,7 @@ static int isl12022_rtc_read_time(struct device *dev, struct rtc_time *tm) tm->tm_mon = bcd2bin(buf[ISL12022_REG_MO] & 0x1F) - 1; tm->tm_year = bcd2bin(buf[ISL12022_REG_YR]) + 100; - dev_dbg(&client->dev, "%s: secs=%d, mins=%d, hours=%d, " + dev_dbg(dev, "%s: secs=%d, mins=%d, hours=%d, " "mday=%d, mon=%d, year=%d, wday=%d\n", __func__, tm->tm_sec, tm->tm_min, tm->tm_hour, @@ -158,7 +158,7 @@ static int isl12022_rtc_set_time(struct device *dev, struct rtc_time *tm) int ret; uint8_t buf[ISL12022_REG_DW + 1]; - dev_dbg(&client->dev, "%s: secs=%d, mins=%d, hours=%d, " + dev_dbg(dev, "%s: secs=%d, mins=%d, hours=%d, " "mday=%d, mon=%d, year=%d, wday=%d\n", __func__, tm->tm_sec, tm->tm_min, tm->tm_hour, -- GitLab From 7093b8a471f48d49891da2108f44fd64742408cb Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes <linux@rasmusvillemoes.dk> Date: Wed, 21 Sep 2022 13:46:20 +0200 Subject: [PATCH 1842/2223] rtc: isl12022: use %ptR Simplify the code and make the output format consistent with other RTC drivers by standardizing on using the %ptR printf extension. 
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk> Link: https://lore.kernel.org/r/20220921114624.3250848-6-linux@rasmusvillemoes.dk Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/rtc/rtc-isl12022.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/rtc/rtc-isl12022.c b/drivers/rtc/rtc-isl12022.c index 7efe23fa74df1..d396d6076db52 100644 --- a/drivers/rtc/rtc-isl12022.c +++ b/drivers/rtc/rtc-isl12022.c @@ -141,11 +141,7 @@ static int isl12022_rtc_read_time(struct device *dev, struct rtc_time *tm) tm->tm_mon = bcd2bin(buf[ISL12022_REG_MO] & 0x1F) - 1; tm->tm_year = bcd2bin(buf[ISL12022_REG_YR]) + 100; - dev_dbg(dev, "%s: secs=%d, mins=%d, hours=%d, " - "mday=%d, mon=%d, year=%d, wday=%d\n", - __func__, - tm->tm_sec, tm->tm_min, tm->tm_hour, - tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday); + dev_dbg(dev, "%s: %ptR\n", __func__, tm); return 0; } @@ -158,11 +154,7 @@ static int isl12022_rtc_set_time(struct device *dev, struct rtc_time *tm) int ret; uint8_t buf[ISL12022_REG_DW + 1]; - dev_dbg(dev, "%s: secs=%d, mins=%d, hours=%d, " - "mday=%d, mon=%d, year=%d, wday=%d\n", - __func__, - tm->tm_sec, tm->tm_min, tm->tm_hour, - tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday); + dev_dbg(dev, "%s: %ptR\n", __func__, tm); if (!isl12022->write_enabled) { -- GitLab From 31b108acc50cddf3d16472ead45c4cd0d1337289 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes <linux@rasmusvillemoes.dk> Date: Wed, 21 Sep 2022 13:46:21 +0200 Subject: [PATCH 1843/2223] rtc: isl12022: use dev_set_drvdata() instead of i2c_set_clientdata() As another preparation for removing direct references to the i2c_client in the helper functions, stash a pointer to the private data via dev_set_drvdata() instead of i2c_set_clientdata(). 
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk> Link: https://lore.kernel.org/r/20220921114624.3250848-7-linux@rasmusvillemoes.dk Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/rtc/rtc-isl12022.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-isl12022.c b/drivers/rtc/rtc-isl12022.c index d396d6076db52..df6d91f4e8f37 100644 --- a/drivers/rtc/rtc-isl12022.c +++ b/drivers/rtc/rtc-isl12022.c @@ -149,7 +149,7 @@ static int isl12022_rtc_read_time(struct device *dev, struct rtc_time *tm) static int isl12022_rtc_set_time(struct device *dev, struct rtc_time *tm) { struct i2c_client *client = to_i2c_client(dev); - struct isl12022 *isl12022 = i2c_get_clientdata(client); + struct isl12022 *isl12022 = dev_get_drvdata(dev); size_t i; int ret; uint8_t buf[ISL12022_REG_DW + 1]; @@ -232,8 +232,7 @@ static int isl12022_probe(struct i2c_client *client) GFP_KERNEL); if (!isl12022) return -ENOMEM; - - i2c_set_clientdata(client, isl12022); + dev_set_drvdata(&client->dev, isl12022); isl12022->rtc = devm_rtc_allocate_device(&client->dev); if (IS_ERR(isl12022->rtc)) -- GitLab From 0a2abbfd8586d396a8581ebf9b96fd5746f08b14 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes <linux@rasmusvillemoes.dk> Date: Wed, 21 Sep 2022 13:46:22 +0200 Subject: [PATCH 1844/2223] rtc: isl12022: drop redundant write to HR register There's nothing in the data sheet that says writing to one of the time keeping registers is necessary to start the RTC. It does so at the stop condition of the i2c transfer setting the WRTC bit: Upon initialization or power-up, the WRTC must be set to "1" to enable the RTC. Upon the completion of a valid write (STOP), the RTC starts counting. Moreover, even if such a write to one of the timekeeping registers was necessary, that's exactly what we do anyway just below when we actually write the given struct rtc_time to the device. 
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk> Link: https://lore.kernel.org/r/20220921114624.3250848-8-linux@rasmusvillemoes.dk Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/rtc/rtc-isl12022.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/rtc/rtc-isl12022.c b/drivers/rtc/rtc-isl12022.c index df6d91f4e8f37..6fb13a5d17f12 100644 --- a/drivers/rtc/rtc-isl12022.c +++ b/drivers/rtc/rtc-isl12022.c @@ -171,20 +171,6 @@ static int isl12022_rtc_set_time(struct device *dev, struct rtc_time *tm) buf[0] | ISL12022_INT_WRTC); if (ret) return ret; - - /* Write to any RTC register to start RTC, we use the - * HR register, setting the MIL bit to use the 24 hour - * format. */ - ret = isl12022_read_regs(client, ISL12022_REG_HR, - buf, 1); - if (ret) - return ret; - - ret = isl12022_write_reg(client, - ISL12022_REG_HR, - buf[0] | ISL12022_HR_MIL); - if (ret) - return ret; } isl12022->write_enabled = true; -- GitLab From b1a1baa657c738e8bb0107ce304f5e78b9847f37 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes <linux@rasmusvillemoes.dk> Date: Wed, 21 Sep 2022 13:46:23 +0200 Subject: [PATCH 1845/2223] rtc: isl12022: switch to using regmap API The regmap abstraction allows us to avoid the private i2c transfer helpers, and also offers some nice utility functions such as the regmap_update_bits family. While at it, simplify the code even more by not keeping track of ->write_enabled: rtc_set_time is not a hot path, so one extra i2c read doesn't hurt (regmap_update_bits elides the write when the bits are already as desired). 
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk> Link: https://lore.kernel.org/r/20220921114624.3250848-9-linux@rasmusvillemoes.dk Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/rtc/Kconfig | 1 + drivers/rtc/rtc-isl12022.c | 110 +++++++++---------------------------- 2 files changed, 26 insertions(+), 85 deletions(-) diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index b8de25118ad09..bb63edb507da4 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -423,6 +423,7 @@ config RTC_DRV_ISL1208 config RTC_DRV_ISL12022 tristate "Intersil ISL12022" + select REGMAP_I2C help If you say yes here you get support for the Intersil ISL12022 RTC chip. diff --git a/drivers/rtc/rtc-isl12022.c b/drivers/rtc/rtc-isl12022.c index 6fb13a5d17f12..ca677c4265e6c 100644 --- a/drivers/rtc/rtc-isl12022.c +++ b/drivers/rtc/rtc-isl12022.c @@ -16,6 +16,7 @@ #include <linux/err.h> #include <linux/of.h> #include <linux/of_device.h> +#include <linux/regmap.h> /* ISL register offsets */ #define ISL12022_REG_SC 0x00 @@ -42,72 +43,21 @@ static struct i2c_driver isl12022_driver; struct isl12022 { struct rtc_device *rtc; - - bool write_enabled; /* true if write enable is set */ + struct regmap *regmap; }; - -static int isl12022_read_regs(struct i2c_client *client, uint8_t reg, - uint8_t *data, size_t n) -{ - struct i2c_msg msgs[] = { - { - .addr = client->addr, - .flags = 0, - .len = 1, - .buf = data - }, /* setup read ptr */ - { - .addr = client->addr, - .flags = I2C_M_RD, - .len = n, - .buf = data - } - }; - - int ret; - - data[0] = reg; - ret = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs)); - if (ret != ARRAY_SIZE(msgs)) { - dev_err(&client->dev, "%s: read error, ret=%d\n", - __func__, ret); - return -EIO; - } - - return 0; -} - - -static int isl12022_write_reg(struct i2c_client *client, - uint8_t reg, uint8_t val) -{ - uint8_t data[2] = { reg, val }; - int err; - - err = i2c_master_send(client, data, sizeof(data)); - if (err != 
sizeof(data)) { - dev_err(&client->dev, - "%s: err=%d addr=%02x, data=%02x\n", - __func__, err, data[0], data[1]); - return -EIO; - } - - return 0; -} - - /* * In the routines that deal directly with the isl12022 hardware, we use * rtc_time -- month 0-11, hour 0-23, yr = calendar year-epoch. */ static int isl12022_rtc_read_time(struct device *dev, struct rtc_time *tm) { - struct i2c_client *client = to_i2c_client(dev); + struct isl12022 *isl12022 = dev_get_drvdata(dev); + struct regmap *regmap = isl12022->regmap; uint8_t buf[ISL12022_REG_INT + 1]; int ret; - ret = isl12022_read_regs(client, ISL12022_REG_SC, buf, sizeof(buf)); + ret = regmap_bulk_read(regmap, ISL12022_REG_SC, buf, sizeof(buf)); if (ret) return ret; @@ -148,33 +98,18 @@ static int isl12022_rtc_read_time(struct device *dev, struct rtc_time *tm) static int isl12022_rtc_set_time(struct device *dev, struct rtc_time *tm) { - struct i2c_client *client = to_i2c_client(dev); struct isl12022 *isl12022 = dev_get_drvdata(dev); - size_t i; + struct regmap *regmap = isl12022->regmap; int ret; uint8_t buf[ISL12022_REG_DW + 1]; dev_dbg(dev, "%s: %ptR\n", __func__, tm); - if (!isl12022->write_enabled) { - - ret = isl12022_read_regs(client, ISL12022_REG_INT, buf, 1); - if (ret) - return ret; - - /* Check if WRTC (write rtc enable) is set factory default is - * 0 (not set) */ - if (!(buf[0] & ISL12022_INT_WRTC)) { - /* Set the write enable bit. */ - ret = isl12022_write_reg(client, - ISL12022_REG_INT, - buf[0] | ISL12022_INT_WRTC); - if (ret) - return ret; - } - - isl12022->write_enabled = true; - } + /* Ensure the write enable bit is set. 
*/ + ret = regmap_update_bits(regmap, ISL12022_REG_INT, + ISL12022_INT_WRTC, ISL12022_INT_WRTC); + if (ret) + return ret; /* hours, minutes and seconds */ buf[ISL12022_REG_SC] = bin2bcd(tm->tm_sec); @@ -191,15 +126,8 @@ static int isl12022_rtc_set_time(struct device *dev, struct rtc_time *tm) buf[ISL12022_REG_DW] = tm->tm_wday & 0x07; - /* write register's data */ - for (i = 0; i < ARRAY_SIZE(buf); i++) { - ret = isl12022_write_reg(client, ISL12022_REG_SC + i, - buf[ISL12022_REG_SC + i]); - if (ret) - return -EIO; - } - - return 0; + return regmap_bulk_write(isl12022->regmap, ISL12022_REG_SC, + buf, sizeof(buf)); } static const struct rtc_class_ops isl12022_rtc_ops = { @@ -207,6 +135,12 @@ static const struct rtc_class_ops isl12022_rtc_ops = { .set_time = isl12022_rtc_set_time, }; +static const struct regmap_config regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .use_single_write = true, +}; + static int isl12022_probe(struct i2c_client *client) { struct isl12022 *isl12022; @@ -220,6 +154,12 @@ static int isl12022_probe(struct i2c_client *client) return -ENOMEM; dev_set_drvdata(&client->dev, isl12022); + isl12022->regmap = devm_regmap_init_i2c(client, &regmap_config); + if (IS_ERR(isl12022->regmap)) { + dev_err(&client->dev, "regmap allocation failed\n"); + return PTR_ERR(isl12022->regmap); + } + isl12022->rtc = devm_rtc_allocate_device(&client->dev); if (IS_ERR(isl12022->rtc)) return PTR_ERR(isl12022->rtc); -- GitLab From d73d66c0e05741b35b7398e647b8c4f2aaea9b09 Mon Sep 17 00:00:00 2001 From: Colin Ian King <colin.i.king@gmail.com> Date: Mon, 3 Oct 2022 16:37:11 +0100 Subject: [PATCH 1846/2223] rtc: ds1685: Fix spelling of function name in comment block The function name is missing the letter 'd' in the comment block. Fix it.
Signed-off-by: Colin Ian King <colin.i.king@gmail.com> Acked-by: Joshua Kinard <kumba@gentoo.org> Link: https://lore.kernel.org/r/20221003153711.271630-1-colin.i.king@gmail.com Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/rtc/rtc-ds1685.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c index a24331ba8a5fc..5db9c737c022f 100644 --- a/drivers/rtc/rtc-ds1685.c +++ b/drivers/rtc/rtc-ds1685.c @@ -132,7 +132,7 @@ ds1685_rtc_bin2bcd(struct ds1685_priv *rtc, u8 val, u8 bin_mask, u8 bcd_mask) } /** - * s1685_rtc_check_mday - check validity of the day of month. + * ds1685_rtc_check_mday - check validity of the day of month. * @rtc: pointer to the ds1685 rtc structure. * @mday: day of month. * -- GitLab From 0091bfc81741b8d3aeb3b7ab8636f911b2de6e80 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov <asml.silence@gmail.com> Date: Mon, 3 Oct 2022 13:59:47 +0100 Subject: [PATCH 1847/2223] io_uring/af_unix: defer registered files gc to io_uring release Instead of putting io_uring's registered files in unix_gc() we want it to be done by io_uring itself. The trick here is to consider io_uring registered files for cycle detection but not actually putting them down. Because io_uring can't register other ring instances, this will remove all refs to the ring file triggering the ->release path and clean up with io_ring_ctx_free(). 
Cc: stable@vger.kernel.org Fixes: 6b06314c47e1 ("io_uring: add file set registration") Reported-and-tested-by: David Bouman <dbouman03@gmail.com> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com> [axboe: add kerneldoc comment to skb, fold in skb leak fix] Signed-off-by: Jens Axboe <axboe@kernel.dk> --- include/linux/skbuff.h | 2 ++ io_uring/rsrc.c | 1 + net/unix/garbage.c | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9fcf534f2d927..7be5bb4c94b6d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -803,6 +803,7 @@ typedef unsigned char *sk_buff_data_t; * @csum_level: indicates the number of consecutive checksums found in * the packet minus one that have been verified as * CHECKSUM_UNNECESSARY (max 3) + * @scm_io_uring: SKB holds io_uring registered files * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @slow_gro: state present at GRO time, slower prepare step required @@ -982,6 +983,7 @@ struct sk_buff { #endif __u8 slow_gro:1; __u8 csum_not_inet:1; + __u8 scm_io_uring:1; #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 6f88ded0e7e56..012fdb04ec238 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -855,6 +855,7 @@ int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) UNIXCB(skb).fp = fpl; skb->sk = sk; + skb->scm_io_uring = 1; skb->destructor = unix_destruct_scm; refcount_add(skb->truesize, &sk->sk_wmem_alloc); } diff --git a/net/unix/garbage.c b/net/unix/garbage.c index d45d5366115a7..dc27635403932 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -204,6 +204,7 @@ void wait_for_unix_gc(void) /* The external entry point: unix_gc() */ void unix_gc(void) { + struct sk_buff *next_skb, *skb; struct unix_sock *u; struct unix_sock *next; struct sk_buff_head hitlist; @@ 
-297,11 +298,30 @@ void unix_gc(void) spin_unlock(&unix_gc_lock); + /* We need io_uring to clean its registered files, ignore all io_uring + * originated skbs. It's fine as io_uring doesn't keep references to + * other io_uring instances and so killing all other files in the cycle + * will put all io_uring references forcing it to go through normal + * release.path eventually putting registered files. + */ + skb_queue_walk_safe(&hitlist, skb, next_skb) { + if (skb->scm_io_uring) { + __skb_unlink(skb, &hitlist); + skb_queue_tail(&skb->sk->sk_receive_queue, skb); + } + } + /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist); spin_lock(&unix_gc_lock); + /* There could be io_uring registered files, just push them back to + * the inflight list + */ + list_for_each_entry_safe(u, next, &gc_candidates, link) + list_move_tail(&u->link, &gc_inflight_list); + /* All candidates should have been detached by now. */ BUG_ON(!list_empty(&gc_candidates)); -- GitLab From 42b6419d0aba47c5d8644cdc0b68502254671de5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov <asml.silence@gmail.com> Date: Tue, 4 Oct 2022 03:19:08 +0100 Subject: [PATCH 1848/2223] io_uring: correct pinned_vm accounting ->mm_account should be released only after we free all registered buffers, otherwise __io_sqe_buffers_unregister() will see a NULL ->mm_account and skip locked_vm accounting. 
Cc: <Stable@vger.kernel.org> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/6d798f65ed4ab8db3664c4d3397d4af16ca98846.1664849932.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/io_uring.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 63f6ce5e53551..ea5cee593bbdc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2585,12 +2585,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); - - if (ctx->mm_account) { - mmdrop(ctx->mm_account); - ctx->mm_account = NULL; - } - io_rsrc_refs_drop(ctx); /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ io_wait_rsrc_data(ctx->buf_data); @@ -2633,6 +2627,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots); + if (ctx->mm_account) { + mmdrop(ctx->mm_account); + ctx->mm_account = NULL; + } io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); -- GitLab From b7a817752efc850603c4c23ed78da2b990a6a34a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov <asml.silence@gmail.com> Date: Tue, 4 Oct 2022 03:19:25 +0100 Subject: [PATCH 1849/2223] io_uring: remove notif leftovers Notifications were killed but there is a couple of fields and struct declarations left, remove them. 
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/8df8877d677be5a2b43afd936d600e60105ea960.1664849941.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- include/linux/io_uring_types.h | 5 ----- io_uring/io_uring.c | 1 - 2 files changed, 6 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index aa4d90a538663..f5b687a787a34 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -34,9 +34,6 @@ struct io_file_table { unsigned int alloc_hint; }; -struct io_notif; -struct io_notif_slot; - struct io_hash_bucket { spinlock_t lock; struct hlist_head list; @@ -242,8 +239,6 @@ struct io_ring_ctx { unsigned nr_user_files; unsigned nr_user_bufs; struct io_mapped_ubuf **user_bufs; - struct io_notif_slot *notif_slots; - unsigned nr_notif_slots; struct io_submit_state submit_state; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ea5cee593bbdc..b12ec6b5a4640 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2625,7 +2625,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) } #endif WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); - WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots); if (ctx->mm_account) { mmdrop(ctx->mm_account); -- GitLab From 3fb1bd68817288729179444caf1fd5c5c4d2d65d Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Tue, 4 Oct 2022 20:29:48 -0600 Subject: [PATCH 1850/2223] io_uring/net: handle -EINPROGRESS correct for IORING_OP_CONNECT We treat EINPROGRESS like EAGAIN, but if we're retrying post getting EINPROGRESS, then we just need to check the socket for errors and terminate the request. This was exposed on a bluetooth connection request which ends up taking a while and hitting EINPROGRESS, and yields a CQE result of -EBADFD because we're retrying a connect on a socket that is now connected. 
Cc: stable@vger.kernel.org Fixes: 87f80d623c6c ("io_uring: handle connect -EINPROGRESS like -EAGAIN") Link: https://github.com/axboe/liburing/issues/671 Reported-by: Aidan Sun <aidansun05@gmail.com> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/net.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index caa6a803cb72c..8c7226b5bf413 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -46,6 +46,7 @@ struct io_connect { struct file *file; struct sockaddr __user *addr; int addr_len; + bool in_progress; }; struct io_sr_msg { @@ -1386,6 +1387,7 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); conn->addr_len = READ_ONCE(sqe->addr2); + conn->in_progress = false; return 0; } @@ -1397,6 +1399,16 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + if (connect->in_progress) { + struct socket *socket; + + ret = -ENOTSOCK; + socket = sock_from_file(req->file); + if (socket) + ret = sock_error(socket->sk); + goto out; + } + if (req_has_async_data(req)) { io = req->async_data; } else { @@ -1413,13 +1425,17 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) ret = __sys_connect_file(req->file, &io->address, connect->addr_len, file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { - if (req_has_async_data(req)) - return -EAGAIN; - if (io_alloc_async_data(req)) { - ret = -ENOMEM; - goto out; + if (ret == -EINPROGRESS) { + connect->in_progress = true; + } else { + if (req_has_async_data(req)) + return -EAGAIN; + if (io_alloc_async_data(req)) { + ret = -ENOMEM; + goto out; + } + memcpy(req->async_data, &__io, sizeof(__io)); } - memcpy(req->async_data, &__io, sizeof(__io)); return -EAGAIN; } if (ret == -ERESTARTSYS) -- GitLab From fc86f9d3bb4904117eea70347d323fde34a47c79 Mon Sep 17 00:00:00 2001 From: 
Pavel Begunkov <asml.silence@gmail.com> Date: Thu, 6 Oct 2022 02:06:10 +0100 Subject: [PATCH 1851/2223] io_uring: remove redundant memory barrier in io_req_local_work_add io_cqring_wake() needs a barrier for the waitqueue_active() check. However, in the case of io_req_local_work_add(), we call llist_add() first, which implies an atomic. Hence we can replace smp_mb() with smp_mb__after_atomic(). Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/43983bc8bc507172adda7a0f00cab1aff09fd238.1665018309.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/io_uring.c | 5 +++-- io_uring/io_uring.h | 11 +++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b12ec6b5a4640..12870cd7cb07c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1106,6 +1106,8 @@ static void io_req_local_work_add(struct io_kiocb *req) if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) return; + /* need it for the following io_cqring_wake() */ + smp_mb__after_atomic(); if (unlikely(atomic_read(&req->task->io_uring->in_idle))) { io_move_task_work_from_local(ctx); @@ -1117,8 +1119,7 @@ static void io_req_local_work_add(struct io_kiocb *req) if (ctx->has_evfd) io_eventfd_signal(ctx); - io_cqring_wake(ctx); - + __io_cqring_wake(ctx); } static inline void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 48ce2348c8c1e..47d4cad1e9c46 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -203,17 +203,24 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } -static inline void io_cqring_wake(struct io_ring_ctx *ctx) +/* requires smb_mb() prior, see wq_has_sleeper() */ +static inline void __io_cqring_wake(struct io_ring_ctx *ctx) { /* * wake_up_all() may seem excessive, but io_wake_function() and *
io_should_wake() handle the termination of the loop and only * wake as many waiters as we need to. */ - if (wq_has_sleeper(&ctx->cq_wait)) + if (waitqueue_active(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); } +static inline void io_cqring_wake(struct io_ring_ctx *ctx) +{ + smp_mb(); + __io_cqring_wake(ctx); +} + static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; -- GitLab From 44f87745d5f24a3cdf0548bf1d84fbb7316ce229 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov <asml.silence@gmail.com> Date: Thu, 6 Oct 2022 21:42:33 +0100 Subject: [PATCH 1852/2223] io_uring: optimise locking for local tw with submit_wait Running local task_work requires taking uring_lock, for submit + wait we can try to run them right after submit while we still hold the lock and save one lock/unlock pair. The optimisation was implemented in the first local tw patches but got dropped for simplicity. Suggested-by: Dylan Yudaken <dylany@fb.com> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/281fc79d98b5d91fe4778c5137a17a2ab4693e5c.1665088876.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/io_uring.c | 12 ++++++++++-- io_uring/io_uring.h | 7 +++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 12870cd7cb07c..de08d9902b30b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3227,8 +3227,16 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, mutex_unlock(&ctx->uring_lock); goto out; } - if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll) - goto iopoll_locked; + if (flags & IORING_ENTER_GETEVENTS) { + if (ctx->syscall_iopoll) + goto iopoll_locked; + /* + * Ignore errors, we'll soon call io_cqring_wait() and + * it should handle ownership problems if any.
+ */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + (void)io_run_local_work_locked(ctx); + } mutex_unlock(&ctx->uring_lock); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 47d4cad1e9c46..ef77d2aa3172c 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -275,6 +275,13 @@ static inline int io_run_task_work_ctx(struct io_ring_ctx *ctx) return ret; } +static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) +{ + if (llist_empty(&ctx->work_llist)) + return 0; + return __io_run_local_work(ctx, true); +} + static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { -- GitLab From 11528491c65a493050c682786c6b7cfd9e9b4a8f Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Fri, 7 Oct 2022 12:26:02 -0600 Subject: [PATCH 1853/2223] io_uring/opdef: remove 'audit_skip' from SENDMSG_ZC The msg variants of sending aren't audited separately, so we should not be setting audit_skip for the zerocopy sendmsg variant either. Fixes: 493108d95f14 ("io_uring/net: zerocopy sendmsg") Reported-by: Paul Moore <paul@paul-moore.com> Reviewed-by: Paul Moore <paul@paul-moore.com> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/opdef.c | 1 - 1 file changed, 1 deletion(-) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 2330f6da791e1..83dc0f9ad3b2f 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -510,7 +510,6 @@ const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .audit_skip = 1, .ioprio = 1, .manual_alloc = 1, #if defined(CONFIG_NET) -- GitLab From c86416c6ff5ba7f7e5f3ff1dd8a9d1b3d0be827c Mon Sep 17 00:00:00 2001 From: Stefan Roesch <shr@devkernel.io> Date: Mon, 10 Oct 2022 16:43:30 -0700 Subject: [PATCH 1854/2223] io_uring: local variable rw shadows outer variable in io_write This fixes the shadowing of the outer variable rw in the function io_write(). No issue is caused by this, but let's silence the shadowing warning anyway. 
Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Stefan Roesch <shr@devkernel.io> Link: https://lore.kernel.org/r/20221010234330.244244-1-shr@devkernel.io Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/rw.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index a25cd44cd4153..453e0ae921609 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -916,7 +916,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) goto copy_iov; if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { - struct io_async_rw *rw; + struct io_async_rw *io; trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2, req->cqe.res, ret2); @@ -929,9 +929,9 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) iov_iter_save_state(&s->iter, &s->iter_state); ret = io_setup_async_rw(req, iovec, s, true); - rw = req->async_data; - if (rw) - rw->bytes_done += ret2; + io = req->async_data; + if (io) + io->bytes_done += ret2; if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); -- GitLab From 00927931cb630bbf8edb6d7f4dadb25139fc5e16 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov <asml.silence@gmail.com> Date: Tue, 11 Oct 2022 01:59:57 +0100 Subject: [PATCH 1855/2223] io_uring: fix fdinfo sqe offsets calculation Only with the big sqe feature they take 128 bytes per entry, but we unconditionally advance by 128B. Fix it by using sq_shift. 
Fixes: 3b8fdd1dc35e3 ("io_uring/fdinfo: fix sqe dumping for IORING_SETUP_SQE128") Reported-and-tested-by: syzbot+e5198737e8a2d23d958c@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/8b41287cb75d5efb8fcb5cccde845ddbbadd8372.1665449983.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/fdinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 4eae088046d0d..2e04850a657b0 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -94,7 +94,7 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); if (sq_idx > sq_mask) continue; - sqe = &ctx->sq_sqes[sq_idx << 1]; + sqe = &ctx->sq_sqes[sq_idx << sq_shift]; seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, " "addr:0x%llx, rw_flags:0x%x, buf_index:%d " "user_data:%llu", -- GitLab From 2ec33a6c3cca9fe2465e82050c81f5ffdc508b36 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Tue, 11 Oct 2022 09:06:23 -0600 Subject: [PATCH 1856/2223] io_uring/rw: ensure kiocb_end_write() is always called A previous commit moved the notifications and end-write handling, but it is now missing a few spots where we also want to call both of those. Without that, we can potentially be missing file notifications, and more importantly, have an imbalance in the super_block writers sem accounting. 
Fixes: b000145e9907 ("io_uring/rw: defer fsnotify calls to task context") Reported-by: Dave Chinner <david@fromorbit.com> Link: https://lore.kernel.org/all/20221010050319.GC2703033@dread.disaster.area/ Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/rw.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 453e0ae921609..100de2626e478 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -234,11 +234,34 @@ static void kiocb_end_write(struct io_kiocb *req) } } +/* + * Trigger the notifications after having done some IO, and finish the write + * accounting, if any. + */ +static void io_req_io_end(struct io_kiocb *req) +{ + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + + WARN_ON(!in_task()); + + if (rw->kiocb.ki_flags & IOCB_WRITE) { + kiocb_end_write(req); + fsnotify_modify(req->file); + } else { + fsnotify_access(req->file); + } +} + static bool __io_complete_rw_common(struct io_kiocb *req, long res) { if (unlikely(res != req->cqe.res)) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { + /* + * Reissue will start accounting again, finish the + * current cycle. 
+ */ + io_req_io_end(req); req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; return true; } @@ -264,15 +287,7 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) static void io_req_rw_complete(struct io_kiocb *req, bool *locked) { - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - - if (rw->kiocb.ki_flags & IOCB_WRITE) { - kiocb_end_write(req); - fsnotify_modify(req->file); - } else { - fsnotify_access(req->file); - } - + io_req_io_end(req); io_req_task_complete(req, locked); } @@ -317,6 +332,11 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, req->file->f_pos = rw->kiocb.ki_pos; if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) { if (!__io_complete_rw_common(req, ret)) { + /* + * Safe to call io_end from here as we're inline + * from the submission path. + */ + io_req_io_end(req); io_req_set_res(req, final_ret, io_put_kbuf(req, issue_flags)); return IOU_OK; -- GitLab From 24fb316155a5f6ba278a8b110c60e67b79900356 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Date: Wed, 24 Aug 2022 10:18:25 +0200 Subject: [PATCH 1857/2223] rtc: mpfs: Use devm_clk_get_enabled() helper The devm_clk_get_enabled() helper: - calls devm_clk_get() - calls clk_prepare_enable() and registers what is needed in order to call clk_disable_unprepare() when needed, as a managed resource. This simplifies the code, the error handling paths and avoid the need of a dedicated function used with devm_add_action_or_reset(). That said, mpfs_rtc_init_clk() is the same as devm_clk_get_enabled(), so use this function directly instead. This also fixes an (unlikely) unchecked devm_add_action_or_reset() error. 
Based on my test with allyesconfig, this reduces the .o size from: text data bss dec hex filename 5330 2208 0 7538 1d72 drivers/rtc/rtc-mpfs.o down to: 5074 2208 0 7282 1c72 drivers/rtc/rtc-mpfs.o Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/e55c959f2821a2c367a4c5de529a638b1cc6b8cd.1661329086.git.christophe.jaillet@wanadoo.fr Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/rtc/rtc-mpfs.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/drivers/rtc/rtc-mpfs.c b/drivers/rtc/rtc-mpfs.c index 944ad10365164..2a479d44f1981 100644 --- a/drivers/rtc/rtc-mpfs.c +++ b/drivers/rtc/rtc-mpfs.c @@ -193,23 +193,6 @@ static int mpfs_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static inline struct clk *mpfs_rtc_init_clk(struct device *dev) -{ - struct clk *clk; - int ret; - - clk = devm_clk_get(dev, "rtc"); - if (IS_ERR(clk)) - return clk; - - ret = clk_prepare_enable(clk); - if (ret) - return ERR_PTR(ret); - - devm_add_action_or_reset(dev, (void (*) (void *))clk_disable_unprepare, clk); - return clk; -} - static irqreturn_t mpfs_rtc_wakeup_irq_handler(int irq, void *dev) { struct mpfs_rtc_dev *rtcdev = dev; @@ -251,7 +234,7 @@ static int mpfs_rtc_probe(struct platform_device *pdev) /* range is capped by alarm max, lower reg is 31:0 & upper is 10:0 */ rtcdev->rtc->range_max = GENMASK_ULL(42, 0); - clk = mpfs_rtc_init_clk(&pdev->dev); + clk = devm_clk_get_enabled(&pdev->dev, "rtc"); if (IS_ERR(clk)) return PTR_ERR(clk); -- GitLab From 94e4603d1a262f8d79f6186d0df0379243613b95 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Date: Wed, 24 Aug 2022 10:42:29 +0200 Subject: [PATCH 1858/2223] rtc: jz4740: Use devm_clk_get_enabled() helper The devm_clk_get_enabled() helper: - calls devm_clk_get() - calls clk_prepare_enable() and registers what is needed 
in order to call clk_disable_unprepare() when needed, as a managed resource. This simplifies the code, the error handling paths and avoid the need of a dedicated function used with devm_add_action_or_reset(). As a side effect, some error messages are not logged anymore, so also use dev_err_probe() instead of dev_err() in case of error. At least the error code will be logged (and -EPROBE_DEFER will be filtered) Based on my test with allyesconfig, this reduces the .o size from: text data bss dec hex filename 9025 2488 128 11641 2d79 drivers/rtc/rtc-jz4740.o down to: 8267 2080 128 10475 28eb drivers/rtc/rtc-jz4740.o Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Acked-by: Paul Cercueil <paul@crapouillou.net> Link: https://lore.kernel.org/r/af10570000d7e103d70bbea590ce8df4f8902b67.1661330532.git.christophe.jaillet@wanadoo.fr Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/rtc/rtc-jz4740.c | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/drivers/rtc/rtc-jz4740.c b/drivers/rtc/rtc-jz4740.c index 6e51df72fd658..c383719292c7d 100644 --- a/drivers/rtc/rtc-jz4740.c +++ b/drivers/rtc/rtc-jz4740.c @@ -257,11 +257,6 @@ static void jz4740_rtc_power_off(void) kernel_halt(); } -static void jz4740_rtc_clk_disable(void *data) -{ - clk_disable_unprepare(data); -} - static const struct of_device_id jz4740_rtc_of_match[] = { { .compatible = "ingenic,jz4740-rtc", .data = (void *)ID_JZ4740 }, { .compatible = "ingenic,jz4760-rtc", .data = (void *)ID_JZ4760 }, @@ -329,23 +324,9 @@ static int jz4740_rtc_probe(struct platform_device *pdev) if (IS_ERR(rtc->base)) return PTR_ERR(rtc->base); - clk = devm_clk_get(dev, "rtc"); - if (IS_ERR(clk)) { - dev_err(dev, "Failed to get RTC clock\n"); - return PTR_ERR(clk); - } - - ret = clk_prepare_enable(clk); - if (ret) { - dev_err(dev, "Failed to enable clock\n"); - return ret; - } - - ret = devm_add_action_or_reset(dev, jz4740_rtc_clk_disable, clk); - if (ret) 
{ - dev_err(dev, "Failed to register devm action\n"); - return ret; - } + clk = devm_clk_get_enabled(dev, "rtc"); + if (IS_ERR(clk)) + return dev_err_probe(dev, PTR_ERR(clk), "Failed to get RTC clock\n"); spin_lock_init(&rtc->lock); -- GitLab From 8f08553e7e4370cdb8f55f0e3dc4db91ed6a4931 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Date: Wed, 24 Aug 2022 10:25:11 +0200 Subject: [PATCH 1859/2223] rtc: k3: Use devm_clk_get_enabled() helper The devm_clk_get_enabled() helper: - calls devm_clk_get() - calls clk_prepare_enable() and registers what is needed in order to call clk_disable_unprepare() when needed, as a managed resource. This simplifies the code, the error handling paths and avoid the need of a dedicated function used with devm_add_action_or_reset(). Based on my test with allyesconfig, this reduces the .o size from: text data bss dec hex filename 12843 4804 64 17711 452f drivers/rtc/rtc-ti-k3.o down to: 12523 4804 64 17391 43ef drivers/rtc/rtc-ti-k3.o Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Link: https://lore.kernel.org/r/601288834ab71c0fddde7eedd8cdb8001254ed7e.1661329498.git.christophe.jaillet@wanadoo.fr Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/rtc/rtc-ti-k3.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/drivers/rtc/rtc-ti-k3.c b/drivers/rtc/rtc-ti-k3.c index 68e50c6a72f1d..ba23163cc0428 100644 --- a/drivers/rtc/rtc-ti-k3.c +++ b/drivers/rtc/rtc-ti-k3.c @@ -515,21 +515,12 @@ static struct nvmem_config ti_k3_rtc_nvmem_config = { static int k3rtc_get_32kclk(struct device *dev, struct ti_k3_rtc *priv) { - int ret; struct clk *clk; - clk = devm_clk_get(dev, "osc32k"); + clk = devm_clk_get_enabled(dev, "osc32k"); if (IS_ERR(clk)) return PTR_ERR(clk); - ret = clk_prepare_enable(clk); - if (ret) - return ret; - - ret = devm_add_action_or_reset(dev, (void (*)(void *))clk_disable_unprepare, clk); - if (ret) - return ret; - 
priv->rate_32k = clk_get_rate(clk); /* Make sure we are exact 32k clock. Else, try to compensate delay */ @@ -544,24 +535,19 @@ static int k3rtc_get_32kclk(struct device *dev, struct ti_k3_rtc *priv) */ priv->sync_timeout_us = (u32)(DIV_ROUND_UP_ULL(1000000, priv->rate_32k) * 4); - return ret; + return 0; } static int k3rtc_get_vbusclk(struct device *dev, struct ti_k3_rtc *priv) { - int ret; struct clk *clk; /* Note: VBUS isn't a context clock, it is needed for hardware operation */ - clk = devm_clk_get(dev, "vbus"); + clk = devm_clk_get_enabled(dev, "vbus"); if (IS_ERR(clk)) return PTR_ERR(clk); - ret = clk_prepare_enable(clk); - if (ret) - return ret; - - return devm_add_action_or_reset(dev, (void (*)(void *))clk_disable_unprepare, clk); + return 0; } static int ti_k3_rtc_probe(struct platform_device *pdev) -- GitLab From acfac37851e01b40c30a7afd0d93ad8db8914f25 Mon Sep 17 00:00:00 2001 From: Andrew Morton <akpm@linux-foundation.org> Date: Fri, 7 Oct 2022 12:59:20 -0700 Subject: [PATCH 1860/2223] mm/hugetlb.c: make __hugetlb_vma_unlock_write_put() static Reported-by: kernel test robot <lkp@intel.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0ad53ad98e742..41d3aa0778373 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6804,7 +6804,7 @@ void hugetlb_vma_lock_release(struct kref *kref) kfree(vma_lock); } -void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) +static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) { struct vm_area_struct *vma = vma_lock->vma; -- GitLab From 7efc3b7261030da79001c00d92bc3392fd6c664c Mon Sep 17 00:00:00 2001 From: Chuyi Zhou <zhouchuyi@bytedance.com> Date: Wed, 13 Jul 2022 14:20:09 +0800 Subject: [PATCH 1861/2223] mm/compaction: fix set skip in fast_find_migrateblock When we successfully find a pageblock in 
fast_find_migrateblock(), the block will be set skip-flag through set_pageblock_skip(). However, when entering isolate_migratepages_block(), the whole pageblock will be skipped due to the branch 'if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages))'. Eventually we will goto isolate_abort and isolate nothing. That makes fast_find_migrateblock useless. In this patch, when we find a suitable pageblock in fast_find_migrateblock, we do nothing and let isolate_migratepages_block set the skip flag on the pageblock after scanning it. Normally, we would isolate some pages from the fast-find block. I used mmtest/thpscale-madvhugepage to test it. Here is the result: baseline patch Amean fault-both-1 1331.66 ( 0.00%) 1261.04 * 5.30%* Amean fault-both-3 1383.95 ( 0.00%) 1191.69 * 13.89%* Amean fault-both-5 1568.13 ( 0.00%) 1445.20 * 7.84%* Amean fault-both-7 1819.62 ( 0.00%) 1555.13 * 14.54%* Amean fault-both-12 1106.96 ( 0.00%) 1149.43 * -3.84%* Amean fault-both-18 2196.93 ( 0.00%) 1875.77 * 14.62%* Amean fault-both-24 2642.69 ( 0.00%) 2671.21 * -1.08%* Amean fault-both-30 2901.89 ( 0.00%) 2857.32 * 1.54%* Amean fault-both-32 3747.00 ( 0.00%) 3479.23 * 7.15%* Link: https://lkml.kernel.org/r/20220713062009.597255-1-zhouchuyi@bytedance.com Fixes: 70b44595eafe9 ("mm, compaction: use free lists to quickly locate a migration source") Signed-off-by: zhouchuyi <zhouchuyi@bytedance.com> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/compaction.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index e2a9615f5fded..c4e4453187a2c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1851,7 +1851,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) pfn = cc->zone->zone_start_pfn; cc->fast_search_fail = 0; found_block = true; - set_pageblock_skip(freepage); break; } } -- GitLab From 92b7399695a5cc961c44fc6e4624d3bc3c699ee7 Mon Sep 17 00:00:00 2001 From: Liam Howlett
<liam.howlett@oracle.com> Date: Tue, 11 Oct 2022 20:36:51 +0000 Subject: [PATCH 1862/2223] mmap: fix copy_vma() failure path The anon vma was not unlinked and the file was not closed in the failure path when the machine runs out of memory during the maple tree modification. This caused a memory leak of the anon vma chain and vma since neither would be freed. Link: https://lkml.kernel.org/r/20221011203621.1446507-1-Liam.Howlett@oracle.com Fixes: 524e00b36e8c ("mm: remove rb tree") Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com> Reported-by: Lukas Bulwahn <lukas.bulwahn@gmail.com> Tested-by: Lukas Bulwahn <lukas.bulwahn@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/mmap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/mmap.c b/mm/mmap.c index 6e447544f07dd..fc8581cefef71 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3240,6 +3240,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, out_vma_link: if (new_vma->vm_ops && new_vma->vm_ops->close) new_vma->vm_ops->close(new_vma); + + if (new_vma->vm_file) + fput(new_vma->vm_file); + + unlink_anon_vmas(new_vma); out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: -- GitLab From 7be1c1a3c7b13fb259bb5159662a7b83622013b8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan <adobriyan@gmail.com> Date: Tue, 11 Oct 2022 20:55:31 +0300 Subject: [PATCH 1863/2223] mm: more vma cache removal Link: https://lkml.kernel.org/r/Y0WuE3Riv4iy5Jx8@localhost.localdomain Fixes: 7964cf8caa4d ("mm: remove vmacache") Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com> Acked-by: Liam Howlett <liam.howlett@oracle.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 88a043f7235eb..e0bb85cf8bdd1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -861,8 +861,6 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; - 
/* Per-thread vma caching: */ - #ifdef SPLIT_RSS_COUNTING struct task_rss_stat rss_stat; #endif -- GitLab From 28c5609fb236807910ca347ad3e26c4567998526 Mon Sep 17 00:00:00 2001 From: Liam Howlett <liam.howlett@oracle.com> Date: Tue, 11 Oct 2022 16:08:37 +0000 Subject: [PATCH 1864/2223] mm/mmap: preallocate maple nodes for brk vma expansion If the brk VMA is the last vma in a maple node and meets the rare criteria that it can be expanded, then preallocation is necessary to avoid a potential fs_reclaim circular lock issue on low resources. At the same time use the actual vma start address (unaligned) when calling vma_adjust_trans_huge(). Link: https://lkml.kernel.org/r/20221011160624.1253454-1-Liam.Howlett@oracle.com Fixes: 2e7ce7d354f2 (mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()) Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com> Reported-by: Yu Zhao <yuzhao@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/mmap.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index fc8581cefef71..5855f26639f98 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2942,17 +2942,18 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, if (vma && (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) && ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) { - mas->index = vma->vm_start; - mas->last = addr + len - 1; - vma_adjust_trans_huge(vma, addr, addr + len, 0); + mas_set_range(mas, vma->vm_start, addr + len - 1); + if (mas_preallocate(mas, vma, GFP_KERNEL)) + return -ENOMEM; + + vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); if (vma->anon_vma) { anon_vma_lock_write(vma->anon_vma); anon_vma_interval_tree_pre_update_vma(vma); } vma->vm_end = addr + len; vma->vm_flags |= VM_SOFTDIRTY; - if (mas_store_gfp(mas, vma, GFP_KERNEL)) - goto mas_expand_failed; + mas_store_prealloc(mas, vma); if (vma->anon_vma) { 
anon_vma_interval_tree_post_update_vma(vma); @@ -2993,13 +2994,6 @@ mas_store_fail: vma_alloc_fail: vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; - -mas_expand_failed: - if (vma->anon_vma) { - anon_vma_interval_tree_post_update_vma(vma); - anon_vma_unlock_write(vma->anon_vma); - } - return -ENOMEM; } int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) -- GitLab From 515778e2d790652a38a24554fdb7f21420d91efc Mon Sep 17 00:00:00 2001 From: Peter Xu <peterx@redhat.com> Date: Fri, 30 Sep 2022 20:25:55 -0400 Subject: [PATCH 1865/2223] mm/uffd: fix warning without PTE_MARKER_UFFD_WP compiled in When PTE_MARKER_UFFD_WP is not configured, it's still possible to reach pte marker code and trigger a warning. Add a few CONFIG_PTE_MARKER_UFFD_WP ifdefs to make sure the code won't be reached when not compiled in. Link: https://lkml.kernel.org/r/YzeR+R6b4bwBlBHh@x1n Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: Peter Xu <peterx@redhat.com> Reported-by: <syzbot+2b9b4f0895be09a6dec3@syzkaller.appspotmail.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: Brian Geffon <bgeffon@google.com> Cc: Edward Liaw <edliaw@google.com> Cc: Liu Shixin <liushixin2@huawei.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/hugetlb.c | 4 ++++ mm/memory.c | 2 ++ mm/mprotect.c | 2 ++ 3 files changed, 8 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 41d3aa0778373..9a910612336da 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5096,6 +5096,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct * unmapped and its refcount is dropped, so just clear pte here.
*/ if (unlikely(!pte_present(pte))) { +#ifdef CONFIG_PTE_MARKER_UFFD_WP /* * If the pte was wr-protected by uffd-wp in any of the * swap forms, meanwhile the caller does not want to @@ -5107,6 +5108,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP)); else +#endif huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; @@ -5135,11 +5137,13 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); +#ifdef CONFIG_PTE_MARKER_UFFD_WP /* Leave a uffd-wp pte marker if needed */ if (huge_pte_uffd_wp(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP)); +#endif hugetlb_count_sub(pages_per_huge_page(h), mm); page_remove_rmap(page, vma, true); diff --git a/mm/memory.c b/mm/memory.c index df678fa30cdb9..2c7723ea43714 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1393,10 +1393,12 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, struct zap_details *details, pte_t pteval) { +#ifdef CONFIG_PTE_MARKER_UFFD_WP if (zap_drop_file_uffd_wp(details)) return; pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); +#endif } static unsigned long zap_pte_range(struct mmu_gather *tlb, diff --git a/mm/mprotect.c b/mm/mprotect.c index 461dcbd4f21a6..668bfaa6ed2ae 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -267,6 +267,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, } else { /* It must be an none page, or what else?.. 
*/ WARN_ON_ONCE(!pte_none(oldpte)); +#ifdef CONFIG_PTE_MARKER_UFFD_WP if (unlikely(uffd_wp && !vma_is_anonymous(vma))) { /* * For file-backed mem, we need to be able to @@ -278,6 +279,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, make_pte_marker(PTE_MARKER_UFFD_WP)); pages++; } +#endif } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); -- GitLab From 826249942679a110353e71a1d92764fcf43e7cf7 Mon Sep 17 00:00:00 2001 From: Conor Dooley <conor.dooley@microchip.com> Date: Tue, 23 Aug 2022 19:33:17 +0100 Subject: [PATCH 1866/2223] dt-bindings: timer: sifive,clint: add legacy riscv compatible While "real" hardware might not use the compatible string "riscv,clint0" it is present in the driver & QEMU uses it for automatically generated virt machine dtbs. To avoid dt-validate problems with QEMU produced dtbs, such as the following, add it to the binding. riscv-virt.dtb: clint@2000000: compatible:0: 'sifive,clint0' is not one of ['sifive,fu540-c000-clint', 'starfive,jh7100-clint', 'canaan,k210-clint'] Reported-by: Rob Herring <robh@kernel.org> Link: https://lore.kernel.org/linux-riscv/20220803170552.GA2250266-robh@kernel.org/ Reviewed-by: Rob Herring <robh@kernel.org> Signed-off-by: Conor Dooley <conor.dooley@microchip.com> Reviewed-by: Heiko Stuebner <heiko@sntech.de> Link: https://lore.kernel.org/r/20220823183319.3314940-2-mail@conchuod.ie Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- .../bindings/timer/sifive,clint.yaml | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/timer/sifive,clint.yaml b/Documentation/devicetree/bindings/timer/sifive,clint.yaml index e64f46339079f..bbad241658374 100644 --- a/Documentation/devicetree/bindings/timer/sifive,clint.yaml +++ b/Documentation/devicetree/bindings/timer/sifive,clint.yaml @@ -22,12 +22,18 @@ description: properties: compatible: - items: - - enum: - - sifive,fu540-c000-clint - - starfive,jh7100-clint 
- - canaan,k210-clint - - const: sifive,clint0 + oneOf: + - items: + - enum: + - sifive,fu540-c000-clint + - starfive,jh7100-clint + - canaan,k210-clint + - const: sifive,clint0 + - items: + - const: sifive,clint0 + - const: riscv,clint0 + deprecated: true + description: For the QEMU virt machine only description: Should be "<vendor>,<chip>-clint" and "sifive,clint<version>". -- GitLab From 6e965c9bd7388762b302dca5852eb25cbe9cc085 Mon Sep 17 00:00:00 2001 From: Conor Dooley <conor.dooley@microchip.com> Date: Tue, 23 Aug 2022 19:33:18 +0100 Subject: [PATCH 1867/2223] dt-bindings: interrupt-controller: sifive,plic: add legacy riscv compatible While "real" hardware might not use the compatible string "riscv,plic0" it is present in the driver & QEMU uses it for automatically generated virt machine dtbs. To avoid dt-validate problems with QEMU produced dtbs, such as the following, add it to the binding. riscv-virt.dtb: plic@c000000: compatible: 'oneOf' conditional failed, one must be fixed: 'sifive,plic-1.0.0' is not one of ['sifive,fu540-c000-plic', 'starfive,jh7100-plic', 'canaan,k210-plic'] 'sifive,plic-1.0.0' is not one of ['allwinner,sun20i-d1-plic'] 'sifive,plic-1.0.0' was expected 'thead,c900-plic' was expected riscv-virt.dtb: plic@c000000: '#address-cells' is a required property Reported-by: Rob Herring <robh@kernel.org> Link: https://lore.kernel.org/linux-riscv/20220803170552.GA2250266-robh@kernel.org/ Reviewed-by: Rob Herring <robh@kernel.org> Signed-off-by: Conor Dooley <conor.dooley@microchip.com> Reviewed-by: Heiko Stuebner <heiko@sntech.de> Link: https://lore.kernel.org/r/20220823183319.3314940-3-mail@conchuod.ie Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- .../bindings/interrupt-controller/sifive,plic-1.0.0.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml b/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml index 
92e0f8c3eff2d..99e01f4d0a693 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml @@ -66,6 +66,11 @@ properties: - enum: - allwinner,sun20i-d1-plic - const: thead,c900-plic + - items: + - const: sifive,plic-1.0.0 + - const: riscv,plic0 + deprecated: true + description: For the QEMU virt machine only reg: maxItems: 1 -- GitLab From 299824e68bd0fac60f8352c940fd731fde609de1 Mon Sep 17 00:00:00 2001 From: Conor Dooley <conor.dooley@microchip.com> Date: Tue, 23 Aug 2022 19:33:19 +0100 Subject: [PATCH 1868/2223] dt-bindings: riscv: add new riscv,isa strings for emulators The QEMU virt and spike machines currently export a riscv,isa string of "rv64imafdcsuh". While the RISC-V foundation has been ratifying a bunch of extensions etc, the kernel has remained relatively static with what hardware is supported - but the same is not true of QEMU. Using the virt machine and running dt-validate on the dumped dtb fails, partly due to the unexpected isa string. Rather than enumerate the many many possibilities, change the pattern to a regex, with the following assumptions: - ima are required - the single letter order is fixed & we don't care about things that can't even do "ima" - the standard multi letter extensions are all in a "_z<foo>" format where the first letter of <foo> is a valid single letter extension - _s & _h are used for supervisor and hypervisor extensions - convention says that after the first two chars, a standard multi letter extension name could be an English word (ifencei anyone?)
so it's not worth restricting the charset - as the above is just convention, don't apply any charset restrictions to reduce future churn - vendor ISA extensions begin with _x and have no charset restrictions - we don't care about an e extension from an OS pov - that attempting to validate the contents of the multiletter extensions with dt-validate beyond the formatting is a futile, massively verbose or unwieldy exercise at best The following limitations also apply: - multi letter extension ordering is not enforced. dt-schema does not appear to allow for named match groups, so the resulting regex would be even more of a headache - ditto for the numbered extensions Finally, add me as a maintainer of the binding so that when it breaks in the future, I can be held responsible! Reported-by: Rob Herring <robh@kernel.org> Link: https://lore.kernel.org/linux-riscv/20220803170552.GA2250266-robh@kernel.org/ Reviewed-by: Andrew Jones <ajones@ventanamicro.com> Acked-by: Guo Ren <guoren@kernel.org> Signed-off-by: Conor Dooley <conor.dooley@microchip.com> Acked-by: Heiko Stuebner <heiko@sntech.de> Reviewed-by: Rob Herring <robh@kernel.org> Link: https://lore.kernel.org/r/20220823183319.3314940-4-mail@conchuod.ie Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- Documentation/devicetree/bindings/riscv/cpus.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/riscv/cpus.yaml b/Documentation/devicetree/bindings/riscv/cpus.yaml index 873dd12f6e896..90a7cabf58feb 100644 --- a/Documentation/devicetree/bindings/riscv/cpus.yaml +++ b/Documentation/devicetree/bindings/riscv/cpus.yaml @@ -9,6 +9,7 @@ title: RISC-V bindings for 'cpus' DT nodes maintainers: - Paul Walmsley <paul.walmsley@sifive.com> - Palmer Dabbelt <palmer@sifive.com> + - Conor Dooley <conor@kernel.org> description: | This document uses some terminology common to the RISC-V community @@ -79,9 +80,7 @@ properties: insensitive, letters in the riscv,isa string
must be all lowercase to simplify parsing. $ref: "/schemas/types.yaml#/definitions/string" - enum: - - rv64imac - - rv64imafdc + pattern: ^rv(?:64|32)imaf?d?q?c?b?v?k?h?(?:_[hsxz](?:[a-z])+)*$ # RISC-V requires 'timebase-frequency' in /cpus, so disallow it here timebase-frequency: false -- GitLab From 3c52c6bb831f6335c176a0fc7214e26f43adbd11 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima <kuniyu@amazon.com> Date: Thu, 6 Oct 2022 11:53:45 -0700 Subject: [PATCH 1869/2223] tcp/udp: Fix memory leak in ipv6_renew_options(). syzbot reported a memory leak [0] related to IPV6_ADDRFORM. The scenario is that while one thread is converting an IPv6 socket into IPv4 with IPV6_ADDRFORM, another thread calls do_ipv6_setsockopt() and allocates memory to inet6_sk(sk)->XXX after conversion. Then, the converted sk with (tcp|udp)_prot never frees the IPv6 resources, which inet6_destroy_sock() should have cleaned up. setsockopt(IPV6_ADDRFORM) setsockopt(IPV6_DSTOPTS) +-----------------------+ +----------------------+ - do_ipv6_setsockopt(sk, ...) - sockopt_lock_sock(sk) - do_ipv6_setsockopt(sk, ...) - lock_sock(sk) ^._ called via tcpv6_prot - WRITE_ONCE(sk->sk_prot, &tcp_prot) before WRITE_ONCE() - xchg(&np->opt, NULL) - txopt_put(opt) - sockopt_release_sock(sk) - release_sock(sk) - sockopt_lock_sock(sk) - lock_sock(sk) - ipv6_set_opt_hdr(sk, ...) - ipv6_update_options(sk, opt) - xchg(&inet6_sk(sk)->opt, opt) ^._ opt is never freed. - sockopt_release_sock(sk) - release_sock(sk) Since IPV6_DSTOPTS allocates options under lock_sock(), we can avoid this memory leak by testing whether sk_family is changed by IPV6_ADDRFORM after acquiring the lock. This issue exists from the initial commit between IPV6_ADDRFORM and IPV6_PKTOPTIONS. [0]: BUG: memory leak unreferenced object 0xffff888009ab9f80 (size 96): comm "syz-executor583", pid 328, jiffies 4294916198 (age 13.034s) hex dump (first 32 bytes): 01 00 00 00 48 00 00 00 08 00 00 00 00 00 00 00 ....H........... 
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<000000002ee98ae1>] kmalloc include/linux/slab.h:605 [inline] [<000000002ee98ae1>] sock_kmalloc+0xb3/0x100 net/core/sock.c:2566 [<0000000065d7b698>] ipv6_renew_options+0x21e/0x10b0 net/ipv6/exthdrs.c:1318 [<00000000a8c756d7>] ipv6_set_opt_hdr net/ipv6/ipv6_sockglue.c:354 [inline] [<00000000a8c756d7>] do_ipv6_setsockopt.constprop.0+0x28b7/0x4350 net/ipv6/ipv6_sockglue.c:668 [<000000002854d204>] ipv6_setsockopt+0xdf/0x190 net/ipv6/ipv6_sockglue.c:1021 [<00000000e69fdcf8>] tcp_setsockopt+0x13b/0x2620 net/ipv4/tcp.c:3789 [<0000000090da4b9b>] __sys_setsockopt+0x239/0x620 net/socket.c:2252 [<00000000b10d192f>] __do_sys_setsockopt net/socket.c:2263 [inline] [<00000000b10d192f>] __se_sys_setsockopt net/socket.c:2260 [inline] [<00000000b10d192f>] __x64_sys_setsockopt+0xbe/0x160 net/socket.c:2260 [<000000000a80d7aa>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<000000000a80d7aa>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<000000004562b5c6>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/ipv6/ipv6_sockglue.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 2d2f4dd9e5dfa..408345fc4c5cd 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -419,6 +419,12 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, rtnl_lock(); sockopt_lock_sock(sk); + /* Another thread has converted the socket into IPv4 with + * IPV6_ADDRFORM concurrently. 
+ */ + if (unlikely(sk->sk_family != AF_INET6)) + goto unlock; + switch (optname) { case IPV6_ADDRFORM: @@ -994,6 +1000,7 @@ done: break; } +unlock: sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); -- GitLab From 21985f43376cee092702d6cb963ff97a9d2ede68 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima <kuniyu@amazon.com> Date: Thu, 6 Oct 2022 11:53:46 -0700 Subject: [PATCH 1870/2223] udp: Call inet6_destroy_sock() in setsockopt(IPV6_ADDRFORM). Commit 4b340ae20d0e ("IPv6: Complete IPV6_DONTFRAG support") forgot to add a change to free inet6_sk(sk)->rxpmtu while converting an IPv6 socket into IPv4 with IPV6_ADDRFORM. After conversion, sk_prot is changed to udp_prot and ->destroy() never cleans it up, resulting in a memory leak. This is due to the discrepancy between inet6_destroy_sock() and IPV6_ADDRFORM, so let's call inet6_destroy_sock() from IPV6_ADDRFORM to remove the difference. However, this is not enough for now because rxpmtu can be changed without lock_sock() after commit 03485f2adcde ("udpv6: Add lockless sendmsg() support"). We will fix this case in the following patch. Note we will rename inet6_destroy_sock() to inet6_cleanup_sock() and remove unnecessary inet6_destroy_sock() calls in sk_prot->destroy() in the future. 
Fixes: 4b340ae20d0e ("IPv6: Complete IPV6_DONTFRAG support") Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- include/net/ipv6.h | 1 + net/ipv6/af_inet6.c | 6 ++++++ net/ipv6/ipv6_sockglue.c | 20 ++++++++------------ 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index d664ba5812d87..335a49ecd8a09 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1182,6 +1182,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); +void inet6_cleanup_sock(struct sock *sk); int inet6_release(struct socket *sock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d40b7d60e00ee..ded827944fa60 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -510,6 +510,12 @@ void inet6_destroy_sock(struct sock *sk) } EXPORT_SYMBOL_GPL(inet6_destroy_sock); +void inet6_cleanup_sock(struct sock *sk) +{ + inet6_destroy_sock(sk); +} +EXPORT_SYMBOL_GPL(inet6_cleanup_sock); + /* * This does both peername and sockname. 
*/ diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 408345fc4c5cd..a20edae868fd6 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -431,9 +431,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int)) goto e_inval; if (val == PF_INET) { - struct ipv6_txoptions *opt; - struct sk_buff *pktopt; - if (sk->sk_type == SOCK_RAW) break; @@ -464,7 +461,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, break; } - fl6_free_socklist(sk); __ipv6_sock_mc_close(sk); __ipv6_sock_ac_close(sk); @@ -501,14 +497,14 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sk->sk_socket->ops = &inet_dgram_ops; sk->sk_family = PF_INET; } - opt = xchg((__force struct ipv6_txoptions **)&np->opt, - NULL); - if (opt) { - atomic_sub(opt->tot_len, &sk->sk_omem_alloc); - txopt_put(opt); - } - pktopt = xchg(&np->pktoptions, NULL); - kfree_skb(pktopt); + + /* Disable all options not to allocate memory anymore, + * but there is still a race. See the lockless path + * in udpv6_sendmsg() and ipv6_local_rxpmtu(). + */ + np->rxopt.all = 0; + + inet6_cleanup_sock(sk); /* * ... and add it to the refcnt debug socks count -- GitLab From d38afeec26ed4739c640bf286c270559aab2ba5f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima <kuniyu@amazon.com> Date: Thu, 6 Oct 2022 11:53:47 -0700 Subject: [PATCH 1871/2223] tcp/udp: Call inet6_destroy_sock() in IPv6 sk->sk_destruct(). Originally, inet6_sk(sk)->XXX were changed under lock_sock(), so we were able to clean them up by calling inet6_destroy_sock() during the IPv6 -> IPv4 conversion by IPV6_ADDRFORM. However, commit 03485f2adcde ("udpv6: Add lockless sendmsg() support") added a lockless memory allocation path, which could cause a memory leak: setsockopt(IPV6_ADDRFORM) sendmsg() +-----------------------+ +-------+ - do_ipv6_setsockopt(sk, ...) - udpv6_sendmsg(sk, ...) 
- sockopt_lock_sock(sk) ^._ called via udpv6_prot - lock_sock(sk) before WRITE_ONCE() - WRITE_ONCE(sk->sk_prot, &tcp_prot) - inet6_destroy_sock() - if (!corkreq) - sockopt_release_sock(sk) - ip6_make_skb(sk, ...) - release_sock(sk) ^._ lockless fast path for the non-corking case - __ip6_append_data(sk, ...) - ipv6_local_rxpmtu(sk, ...) - xchg(&np->rxpmtu, skb) ^._ rxpmtu is never freed. - goto out_no_dst; - lock_sock(sk) For now, rxpmtu is only the case, but not to miss the future change and a similar bug fixed in commit e27326009a3d ("net: ping6: Fix memleak in ipv6_renew_options()."), let's set a new function to IPv6 sk->sk_destruct() and call inet6_cleanup_sock() there. Since the conversion does not change sk->sk_destruct(), we can guarantee that we can clean up IPv6 resources finally. We can now remove all inet6_destroy_sock() calls from IPv6 protocol specific ->destroy() functions, but such changes are invasive to backport. So they can be posted as a follow-up later for net-next. Fixes: 03485f2adcde ("udpv6: Add lockless sendmsg() support") Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- include/net/ipv6.h | 1 + include/net/udp.h | 2 +- include/net/udplite.h | 8 -------- net/ipv4/udp.c | 9 ++++++--- net/ipv4/udplite.c | 8 ++++++++ net/ipv6/af_inet6.c | 8 +++++++- net/ipv6/udp.c | 15 ++++++++++++++- net/ipv6/udp_impl.h | 1 + net/ipv6/udplite.c | 9 ++++++++- 9 files changed, 46 insertions(+), 15 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 335a49ecd8a09..37943ba3a73c5 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1183,6 +1183,7 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); void inet6_cleanup_sock(struct sock *sk); +void inet6_sock_destruct(struct sock *sk); int inet6_release(struct socket *sock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int 
addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, diff --git a/include/net/udp.h b/include/net/udp.h index 5ee88ddf79c3f..fee053bcd17c6 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -247,7 +247,7 @@ static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if, } /* net/ipv4/udp.c */ -void udp_destruct_sock(struct sock *sk); +void udp_destruct_common(struct sock *sk); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb); void udp_skb_destructor(struct sock *sk, struct sk_buff *skb); diff --git a/include/net/udplite.h b/include/net/udplite.h index 0143b373602ec..299c14ce2bb94 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -25,14 +25,6 @@ static __inline__ int udplite_getfrag(void *from, char *to, int offset, return copy_from_iter_full(to, len, &msg->msg_iter) ? 0 : -EFAULT; } -/* Designate sk as UDP-Lite socket */ -static inline int udplite_sk_init(struct sock *sk) -{ - udp_init_sock(sk); - udp_sk(sk)->pcflag = UDPLITE_BIT; - return 0; -} - /* * Checksumming routines */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d63118ce59006..8126f67d18b34 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1598,7 +1598,7 @@ drop: } EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb); -void udp_destruct_sock(struct sock *sk) +void udp_destruct_common(struct sock *sk) { /* reclaim completely the forward allocated memory */ struct udp_sock *up = udp_sk(sk); @@ -1611,10 +1611,14 @@ void udp_destruct_sock(struct sock *sk) kfree_skb(skb); } udp_rmem_release(sk, total, 0, true); +} +EXPORT_SYMBOL_GPL(udp_destruct_common); +static void udp_destruct_sock(struct sock *sk) +{ + udp_destruct_common(sk); inet_sock_destruct(sk); } -EXPORT_SYMBOL_GPL(udp_destruct_sock); int udp_init_sock(struct sock *sk) { @@ -1622,7 +1626,6 @@ int udp_init_sock(struct sock *sk) sk->sk_destruct = udp_destruct_sock; return 0; } -EXPORT_SYMBOL_GPL(udp_init_sock); 
void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) { diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 6e08a76ae1e7e..e0c9cc39b81e3 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -17,6 +17,14 @@ struct udp_table udplite_table __read_mostly; EXPORT_SYMBOL(udplite_table); +/* Designate sk as UDP-Lite socket */ +static int udplite_sk_init(struct sock *sk) +{ + udp_init_sock(sk); + udp_sk(sk)->pcflag = UDPLITE_BIT; + return 0; +} + static int udplite_rcv(struct sk_buff *skb) { return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index ded827944fa60..0241910049825 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -109,6 +109,12 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } +void inet6_sock_destruct(struct sock *sk) +{ + inet6_cleanup_sock(sk); + inet_sock_destruct(sk); +} + static int inet6_create(struct net *net, struct socket *sock, int protocol, int kern) { @@ -201,7 +207,7 @@ lookup_protocol: inet->hdrincl = 1; } - sk->sk_destruct = inet_sock_destruct; + sk->sk_destruct = inet6_sock_destruct; sk->sk_family = PF_INET6; sk->sk_protocol = protocol; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 91e795bb9ade6..8d09f0ea5b8c7 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -56,6 +56,19 @@ #include <trace/events/skb.h> #include "udp_impl.h" +static void udpv6_destruct_sock(struct sock *sk) +{ + udp_destruct_common(sk); + inet6_sock_destruct(sk); +} + +int udpv6_init_sock(struct sock *sk) +{ + skb_queue_head_init(&udp_sk(sk)->reader_queue); + sk->sk_destruct = udpv6_destruct_sock; + return 0; +} + static u32 udp6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -1733,7 +1746,7 @@ struct proto udpv6_prot = { .connect = ip6_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, - .init = udp_init_sock, + .init = udpv6_init_sock, 
.destroy = udpv6_destroy_sock, .setsockopt = udpv6_setsockopt, .getsockopt = udpv6_getsockopt, diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 4251e49d32a0d..0590f566379d7 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -12,6 +12,7 @@ int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int); int __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int, __be32, struct udp_table *); +int udpv6_init_sock(struct sock *sk); int udp_v6_get_port(struct sock *sk, unsigned short snum); void udp_v6_rehash(struct sock *sk); diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index b707258562597..67eaf3ca14cea 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -12,6 +12,13 @@ #include <linux/proc_fs.h> #include "udp_impl.h" +static int udplitev6_sk_init(struct sock *sk) +{ + udpv6_init_sock(sk); + udp_sk(sk)->pcflag = UDPLITE_BIT; + return 0; +} + static int udplitev6_rcv(struct sk_buff *skb) { return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); @@ -38,7 +45,7 @@ struct proto udplitev6_prot = { .connect = ip6_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, - .init = udplite_sk_init, + .init = udplitev6_sk_init, .destroy = udpv6_destroy_sock, .setsockopt = udpv6_setsockopt, .getsockopt = udpv6_getsockopt, -- GitLab From 364f997b5cfe1db0d63a390fe7c801fa2b3115f6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima <kuniyu@amazon.com> Date: Thu, 6 Oct 2022 11:53:48 -0700 Subject: [PATCH 1872/2223] ipv6: Fix data races around sk->sk_prot. Commit 086d49058cd8 ("ipv6: annotate some data-races around sk->sk_prot") fixed some data-races around sk->sk_prot but it was not enough. Some functions in inet6_(stream|dgram)_ops still access sk->sk_prot without lock_sock() or rtnl_lock(), so they need READ_ONCE() to avoid load tearing. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/core/sock.c | 6 ++++-- net/ipv4/af_inet.c | 23 ++++++++++++++++------- net/ipv6/ipv6_sockglue.c | 4 ++-- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index eeb6cbac6f499..a3ba0358c77c0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3610,7 +3610,8 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; - return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_getsockopt); @@ -3636,7 +3637,8 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; - return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_setsockopt); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e2c2193823455..3dd02396517df 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -558,22 +558,27 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; + const struct proto *prot; int err; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; + + /* IPV6_ADDRFORM can change sk->sk_prot under us. 
*/ + prot = READ_ONCE(sk->sk_prot); + if (uaddr->sa_family == AF_UNSPEC) - return sk->sk_prot->disconnect(sk, flags); + return prot->disconnect(sk, flags); if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { - err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + err = prot->pre_connect(sk, uaddr, addr_len); if (err) return err; } if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk)) return -EAGAIN; - return sk->sk_prot->connect(sk, uaddr, addr_len); + return prot->connect(sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_dgram_connect); @@ -734,10 +739,11 @@ EXPORT_SYMBOL(inet_stream_connect); int inet_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { - struct sock *sk1 = sock->sk; + struct sock *sk1 = sock->sk, *sk2; int err = -EINVAL; - struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, flags, &err, kern); if (!sk2) goto do_err; @@ -825,12 +831,15 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) { struct sock *sk = sock->sk; + const struct proto *prot; if (unlikely(inet_send_prepare(sk))) return -EAGAIN; - if (sk->sk_prot->sendpage) - return sk->sk_prot->sendpage(sk, page, offset, size, flags); + /* IPV6_ADDRFORM can change sk->sk_prot under us. 
*/ + prot = READ_ONCE(sk->sk_prot); + if (prot->sendpage) + return prot->sendpage(sk, page, offset, size, flags); return sock_no_sendpage(sock, page, offset, size, flags); } EXPORT_SYMBOL(inet_sendpage); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a20edae868fd6..d7207a546aecd 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -477,7 +477,7 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, &tcp_prot, 1); - /* Paired with READ_ONCE(sk->sk_prot) in net/ipv6/af_inet6.c */ + /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */ WRITE_ONCE(sk->sk_prot, &tcp_prot); icsk->icsk_af_ops = &ipv4_specific; sk->sk_socket->ops = &inet_stream_ops; @@ -492,7 +492,7 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, prot, 1); - /* Paired with READ_ONCE(sk->sk_prot) in net/ipv6/af_inet6.c */ + /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */ WRITE_ONCE(sk->sk_prot, prot); sk->sk_socket->ops = &inet_dgram_ops; sk->sk_family = PF_INET; -- GitLab From f49cd2f4d6170d27a2c61f1fecb03d8a70c91f57 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima <kuniyu@amazon.com> Date: Thu, 6 Oct 2022 11:53:49 -0700 Subject: [PATCH 1873/2223] tcp: Fix data races around icsk->icsk_af_ops. setsockopt(IPV6_ADDRFORM) and tcp_v6_connect() change icsk->icsk_af_ops under lock_sock(), but tcp_(get|set)sockopt() read it locklessly. To avoid load/store tearing, we need to add READ_ONCE() and WRITE_ONCE() for the reads and writes. 
Thanks to Eric Dumazet for providing the syzbot report: BUG: KCSAN: data-race in tcp_setsockopt / tcp_v6_connect write to 0xffff88813c624518 of 8 bytes by task 23936 on cpu 0: tcp_v6_connect+0x5b3/0xce0 net/ipv6/tcp_ipv6.c:240 __inet_stream_connect+0x159/0x6d0 net/ipv4/af_inet.c:660 inet_stream_connect+0x44/0x70 net/ipv4/af_inet.c:724 __sys_connect_file net/socket.c:1976 [inline] __sys_connect+0x197/0x1b0 net/socket.c:1993 __do_sys_connect net/socket.c:2003 [inline] __se_sys_connect net/socket.c:2000 [inline] __x64_sys_connect+0x3d/0x50 net/socket.c:2000 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff88813c624518 of 8 bytes by task 23937 on cpu 1: tcp_setsockopt+0x147/0x1c80 net/ipv4/tcp.c:3789 sock_common_setsockopt+0x5d/0x70 net/core/sock.c:3585 __sys_setsockopt+0x212/0x2b0 net/socket.c:2252 __do_sys_setsockopt net/socket.c:2263 [inline] __se_sys_setsockopt net/socket.c:2260 [inline] __x64_sys_setsockopt+0x62/0x70 net/socket.c:2260 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0xffffffff8539af68 -> 0xffffffff8539aff8 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 23937 Comm: syz-executor.5 Not tainted 6.0.0-rc4-syzkaller-00331-g4ed9c1e971b1-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/26/2022 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot <syzkaller@googlegroups.com> Reported-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/ipv4/tcp.c | 10 ++++++---- net/ipv6/ipv6_sockglue.c | 3 ++- net/ipv6/tcp_ipv6.c | 6 ++++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0c51abeee172c..f8232811a5be1 100644 --- a/net/ipv4/tcp.c +++ 
b/net/ipv4/tcp.c @@ -3796,8 +3796,9 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, const struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) - return icsk->icsk_af_ops->setsockopt(sk, level, optname, - optval, optlen); + /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ + return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname, + optval, optlen); return do_tcp_setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(tcp_setsockopt); @@ -4396,8 +4397,9 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) - return icsk->icsk_af_ops->getsockopt(sk, level, optname, - optval, optlen); + /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ + return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname, + optval, optlen); return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index d7207a546aecd..532f4478c8840 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -479,7 +479,8 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */ WRITE_ONCE(sk->sk_prot, &tcp_prot); - icsk->icsk_af_ops = &ipv4_specific; + /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ + WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); sk->sk_socket->ops = &inet_stream_ops; sk->sk_family = PF_INET; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a8adda623da15..2a3f9296df1e5 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -238,7 +238,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - icsk->icsk_af_ops = 
 &ipv6_mapped; + /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ + WRITE_ONCE(icsk->icsk_af_ops, &ipv6_mapped); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, true); sk->sk_backlog_rcv = tcp_v4_do_rcv; @@ -250,7 +251,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (err) { icsk->icsk_ext_hdr_len = exthdrlen; - icsk->icsk_af_ops = &ipv6_specific; + /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ + WRITE_ONCE(icsk->icsk_af_ops, &ipv6_specific); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, false); sk->sk_backlog_rcv = tcp_v6_do_rcv; -- GitLab From 3c1860543fccc1d0cfe3fd6b190e414a418fe60e Mon Sep 17 00:00:00 2001 From: Xin Long <lucien.xin@gmail.com> Date: Thu, 6 Oct 2022 15:45:02 -0400 Subject: [PATCH 1874/2223] openvswitch: add nf_ct_is_confirmed check before assigning the helper A WARN_ON call trace would be triggered when 'ct(commit, alg=helper)' applies on a confirmed connection: WARNING: CPU: 0 PID: 1251 at net/netfilter/nf_conntrack_extend.c:98 RIP: 0010:nf_ct_ext_add+0x12d/0x150 [nf_conntrack] Call Trace: <TASK> nf_ct_helper_ext_add+0x12/0x60 [nf_conntrack] __nf_ct_try_assign_helper+0xc4/0x160 [nf_conntrack] __ovs_ct_lookup+0x72e/0x780 [openvswitch] ovs_ct_execute+0x1d8/0x920 [openvswitch] do_execute_actions+0x4e6/0xb60 [openvswitch] ovs_execute_actions+0x60/0x140 [openvswitch] ovs_packet_cmd_execute+0x2ad/0x310 [openvswitch] genl_family_rcv_msg_doit.isra.15+0x113/0x150 genl_rcv_msg+0xef/0x1f0 which can be reproduced with these OVS flows: table=0, in_port=veth1,tcp,tcp_dst=2121,ct_state=-trk actions=ct(commit, table=1) table=1, in_port=veth1,tcp,tcp_dst=2121,ct_state=+trk+new actions=ct(commit, alg=ftp),normal The issue was introduced by commit 248d45f1e193 ("openvswitch: Allow attaching helper in later commit") where it somehow removed the check of nf_ct_is_confirmed before assigning the helper. This patch is to fix it by bringing it back.
Fixes: 248d45f1e193 ("openvswitch: Allow attaching helper in later commit") Reported-by: Pablo Neira Ayuso <pablo@netfilter.org> Signed-off-by: Xin Long <lucien.xin@gmail.com> Acked-by: Aaron Conole <aconole@redhat.com> Tested-by: Aaron Conole <aconole@redhat.com> Link: https://lore.kernel.org/r/c5c9092a22a2194650222bffaf786902613deb16.1665085502.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/openvswitch/conntrack.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index cb255d8ed99a9..c7b10234cf7c4 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1015,7 +1015,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, * connections which we will commit, we may need to attach * the helper here. */ - if (info->commit && info->helper && !nfct_help(ct)) { + if (!nf_ct_is_confirmed(ct) && info->commit && + info->helper && !nfct_help(ct)) { int err = __nf_ct_try_assign_helper(ct, info->ct, GFP_ATOMIC); if (err) -- GitLab From fa182ea26ff09cbadb28bbcd6196209b3555eb1d Mon Sep 17 00:00:00 2001 From: Divya Koppera <Divya.Koppera@microchip.com> Date: Tue, 11 Oct 2022 15:24:37 +0530 Subject: [PATCH 1875/2223] net: phy: micrel: Fixes FIELD_GET assertion FIELD_GET() must only be used with a mask that is a compile-time constant. Mark the functions as __always_inline to avoid the problem. 
Fixes: 21b688dabecb6a ("net: phy: micrel: Cable Diag feature for lan8814 phy") Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Divya Koppera <Divya.Koppera@microchip.com> Link: https://lore.kernel.org/r/20221011095437.12580-1-Divya.Koppera@microchip.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/phy/micrel.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 3757e069c486c..54a17b576eac0 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -1838,7 +1838,7 @@ static int ksz886x_cable_test_start(struct phy_device *phydev) return phy_clear_bits(phydev, MII_BMCR, BMCR_ANENABLE | BMCR_SPEED100); } -static int ksz886x_cable_test_result_trans(u16 status, u16 mask) +static __always_inline int ksz886x_cable_test_result_trans(u16 status, u16 mask) { switch (FIELD_GET(mask, status)) { case KSZ8081_LMD_STAT_NORMAL: @@ -1854,13 +1854,13 @@ static int ksz886x_cable_test_result_trans(u16 status, u16 mask) } } -static bool ksz886x_cable_test_failed(u16 status, u16 mask) +static __always_inline bool ksz886x_cable_test_failed(u16 status, u16 mask) { return FIELD_GET(mask, status) == KSZ8081_LMD_STAT_FAIL; } -static bool ksz886x_cable_test_fault_length_valid(u16 status, u16 mask) +static __always_inline bool ksz886x_cable_test_fault_length_valid(u16 status, u16 mask) { switch (FIELD_GET(mask, status)) { case KSZ8081_LMD_STAT_OPEN: @@ -1871,7 +1871,8 @@ static bool ksz886x_cable_test_fault_length_valid(u16 status, u16 mask) return false; } -static int ksz886x_cable_test_fault_length(struct phy_device *phydev, u16 status, u16 data_mask) +static __always_inline int ksz886x_cable_test_fault_length(struct phy_device *phydev, + u16 status, u16 data_mask) { int dt; -- GitLab From deb0f6562884b5b4beb883d73e66a7d3a1b96d99 Mon Sep 17 00:00:00 2001 From: Carlos Llamas <cmllamas@google.com> Date: Fri, 30 Sep 2022 00:38:43 +0000 Subject: [PATCH 1876/2223] mm/mmap: 
undo ->mmap() when arch_validate_flags() fails Commit c462ac288f2c ("mm: Introduce arch_validate_flags()") added a late check in mmap_region() to let architectures validate vm_flags. The check needs to happen after calling ->mmap() as the flags can potentially be modified during this callback. If the arch_validate_flags() check fails we unmap and free the vma. However, the error path fails to undo the ->mmap() call that previously succeeded and depending on the specific ->mmap() implementation this translates to reference increments, memory allocations and other operations that will not be cleaned up. There are several places (mainly device drivers) where this is an issue. However, one specific example is bpf_map_mmap() which keeps count of the mappings in map->writecnt. The count is incremented on ->mmap() and then decremented on vm_ops->close(). When arch_validate_flags() fails this count is off since bpf_map_mmap_close() is never called. One can reproduce this issue on arm64 devices with MTE support. Here the vm_flags are checked to only allow VM_MTE if VM_MTE_ALLOWED has been set previously. From userspace it is then enough to pass the PROT_MTE flag to the mmap() syscall to trigger the arch_validate_flags() failure.
The following program reproduces this issue: #include <stdio.h> #include <unistd.h> #include <linux/unistd.h> #include <linux/bpf.h> #include <sys/mman.h> int main(void) { union bpf_attr attr = { .map_type = BPF_MAP_TYPE_ARRAY, .key_size = sizeof(int), .value_size = sizeof(long long), .max_entries = 256, .map_flags = BPF_F_MMAPABLE, }; int fd; fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); mmap(NULL, 4096, PROT_WRITE | PROT_MTE, MAP_SHARED, fd, 0); return 0; } By manually adding some log statements to the vm_ops callbacks we can confirm that when passing PROT_MTE to mmap() the map->writecnt is off upon ->release(): With PROT_MTE flag: root@debian:~# ./bpf-test [ 111.263874] bpf_map_write_active_inc: map=9 writecnt=1 [ 111.288763] bpf_map_release: map=9 writecnt=1 Without PROT_MTE flag: root@debian:~# ./bpf-test [ 157.816912] bpf_map_write_active_inc: map=10 writecnt=1 [ 157.830442] bpf_map_write_active_dec: map=10 writecnt=0 [ 157.832396] bpf_map_release: map=10 writecnt=0 This patch fixes the above issue by calling vm_ops->close() when the arch_validate_flags() check fails, after this we can proceed to unmap and free the vma on the error path. 
Link: https://lkml.kernel.org/r/20220930003844.1210987-1-cmllamas@google.com Fixes: c462ac288f2c ("mm: Introduce arch_validate_flags()") Signed-off-by: Carlos Llamas <cmllamas@google.com> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com> Acked-by: Andrii Nakryiko <andrii@kernel.org> Reviewed-by: Liam Howlett <liam.howlett@oracle.com> Cc: Christian Brauner (Microsoft) <brauner@kernel.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: <stable@vger.kernel.org> [5.10+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/mmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 5855f26639f98..bf2122af94e7a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2673,7 +2673,7 @@ cannot_expand: if (!arch_validate_flags(vma->vm_flags)) { error = -EINVAL; if (file) - goto unmap_and_free_vma; + goto close_and_free_vma; else goto free_vma; } @@ -2742,6 +2742,9 @@ expanded: validate_mm(mm); return addr; +close_and_free_vma: + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); unmap_and_free_vma: fput(vma->vm_file); vma->vm_file = NULL; -- GitLab From 4fa0e3ff217f775cb58d2d6d51820ec519243fb9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" <willy@infradead.org> Date: Wed, 12 Oct 2022 20:34:19 +0100 Subject: [PATCH 1877/2223] ext4,f2fs: fix readahead of verity data The recent change of page_cache_ra_unbounded() arguments was buggy in the two callers, causing us to readahead the wrong pages. Move the definition of ractl down to after the index is set correctly. This affected performance on configurations that use fs-verity. 
Link: https://lkml.kernel.org/r/20221012193419.1453558-1-willy@infradead.org Fixes: 73bb49da50cd ("mm/readahead: make page_cache_ra_unbounded take a readahead_control") Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reported-by: Jintao Yin <nicememory@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/ext4/verity.c | 3 ++- fs/f2fs/verity.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index b051d19b5c8a0..94442c690ca7d 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -365,13 +365,14 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); struct page *page; index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); if (!page || !PageUptodate(page)) { + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); + if (page) put_page(page); else if (num_ra_pages > 1) diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 7b8f2b41c29b1..c0733f8670746 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -262,13 +262,14 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); struct page *page; index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); if (!page || !PageUptodate(page)) { + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); + if (page) put_page(page); else if (num_ra_pages > 1) -- GitLab From ac801e7e252c5588325e3c983c7d4167fc68c024 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko <glider@google.com> Date: Mon, 5 Sep 2022 14:24:27 +0200 Subject: [PATCH 1878/2223] kmsan: unpoison @tlb in arch_tlb_gather_mmu() This is an optimization to reduce stackdepot pressure. 
struct mmu_gather contains 7 1-bit fields packed into a 32-bit unsigned int value. The remaining 25 bits remain uninitialized and are never used, but KMSAN updates the origin for them in zap_pXX_range() in mm/memory.c, thus creating very long origin chains. This is technically correct, but consumes too much memory. Unpoisoning the whole structure will prevent creating such chains. Link: https://lkml.kernel.org/r/20220905122452.2258262-20-glider@google.com Signed-off-by: Alexander Potapenko <glider@google.com> Acked-by: Marco Elver <elver@google.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Alexei Starovoitov <ast@kernel.org> Cc: Andrey Konovalov <andreyknvl@gmail.com> Cc: Andrey Konovalov <andreyknvl@google.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Christoph Hellwig <hch@lst.de> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: Eric Biggers <ebiggers@google.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Herbert Xu <herbert@gondor.apana.org.au> Cc: Ilya Leoshkevich <iii@linux.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Kees Cook <keescook@chromium.org> Cc: Liu Shixin <liushixin2@huawei.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Michael S. 
Tsirkin <mst@redhat.com> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Petr Mladek <pmladek@suse.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Vegard Nossum <vegard.nossum@oracle.com> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/mmu_gather.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index a71924bd38c0d..add4244e5790d 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -1,6 +1,7 @@ #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/kernel.h> +#include <linux/kmsan-checks.h> #include <linux/mmdebug.h> #include <linux/mm_types.h> #include <linux/mm_inline.h> @@ -265,6 +266,15 @@ void tlb_flush_mmu(struct mmu_gather *tlb) static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) { + /* + * struct mmu_gather contains 7 1-bit fields packed into a 32-bit + * unsigned int value. The remaining 25 bits remain uninitialized + * and are never used, but KMSAN updates the origin for them in + * zap_pXX_range() in mm/memory.c, thus creating very long origin + * chains. This is technically correct, but consumes too much memory. + * Unpoisoning the whole structure will prevent creating such chains. + */ + kmsan_unpoison_memory(tlb, sizeof(*tlb)); tlb->mm = mm; tlb->fullmm = fullmm; -- GitLab From ea091fa53680030881b56520d731e36d3ff6cdd5 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang <xkernel.wang@foxmail.com> Date: Fri, 4 Mar 2022 17:12:15 +0800 Subject: [PATCH 1879/2223] lib/test_meminit: add checks for the allocation functions alloc_pages(), kmalloc() and vmalloc() are all memory allocation functions which can return NULL when some internal memory failures happen. So it is better to check the return of them to catch the failure in time for better test them. 
Link: https://lkml.kernel.org/r/tencent_D44A49FFB420EDCCBFB9221C8D14DFE12908@qq.com Signed-off-by: Xiaoke Wang <xkernel.wang@foxmail.com> Reviewed-by: Alexander Potapenko <glider@google.com> Cc: Andrey Konovalov <andreyknvl@gmail.com> Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: Marco Elver <elver@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- lib/test_meminit.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/lib/test_meminit.c b/lib/test_meminit.c index c95db11a69064..60e1984c060fa 100644 --- a/lib/test_meminit.c +++ b/lib/test_meminit.c @@ -67,17 +67,24 @@ static int __init do_alloc_pages_order(int order, int *total_failures) size_t size = PAGE_SIZE << order; page = alloc_pages(GFP_KERNEL, order); + if (!page) + goto err; buf = page_address(page); fill_with_garbage(buf, size); __free_pages(page, order); page = alloc_pages(GFP_KERNEL, order); + if (!page) + goto err; buf = page_address(page); if (count_nonzero_bytes(buf, size)) (*total_failures)++; fill_with_garbage(buf, size); __free_pages(page, order); return 1; +err: + (*total_failures)++; + return 1; } /* Test the page allocator by calling alloc_pages with different orders. */ @@ -100,15 +107,22 @@ static int __init do_kmalloc_size(size_t size, int *total_failures) void *buf; buf = kmalloc(size, GFP_KERNEL); + if (!buf) + goto err; fill_with_garbage(buf, size); kfree(buf); buf = kmalloc(size, GFP_KERNEL); + if (!buf) + goto err; if (count_nonzero_bytes(buf, size)) (*total_failures)++; fill_with_garbage(buf, size); kfree(buf); return 1; +err: + (*total_failures)++; + return 1; } /* Test vmalloc() with given parameters. 
*/ @@ -117,15 +131,22 @@ static int __init do_vmalloc_size(size_t size, int *total_failures) void *buf; buf = vmalloc(size); + if (!buf) + goto err; fill_with_garbage(buf, size); vfree(buf); buf = vmalloc(size); + if (!buf) + goto err; if (count_nonzero_bytes(buf, size)) (*total_failures)++; fill_with_garbage(buf, size); vfree(buf); return 1; +err: + (*total_failures)++; + return 1; } /* Test kmalloc()/vmalloc() by allocating objects of different sizes. */ -- GitLab From 652e04464d3944226052c827bdaaf5113b072870 Mon Sep 17 00:00:00 2001 From: Xin Hao <xhao@linux.alibaba.com> Date: Tue, 27 Sep 2022 08:19:45 +0800 Subject: [PATCH 1880/2223] mm/damon: move sz_damon_region to damon_sz_region Rename sz_damon_region() to damon_sz_region(), and move it to "include/linux/damon.h", because in many places, we can to use this func. Link: https://lkml.kernel.org/r/20220927001946.85375-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao <xhao@linux.alibaba.com> Suggested-by: SeongJae Park <sj@kernel.org> Reviewed-by: SeongJae Park <sj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- include/linux/damon.h | 6 ++++++ mm/damon/core.c | 9 ++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index ed5470f50babd..620ada094c3b2 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -484,6 +484,12 @@ static inline struct damon_region *damon_first_region(struct damon_target *t) return list_first_entry(&t->regions_list, struct damon_region, list); } +static inline unsigned long damon_sz_region(struct damon_region *r) +{ + return r->ar.end - r->ar.start; +} + + #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) diff --git a/mm/damon/core.c b/mm/damon/core.c index 4de8c7c529794..5b9e0d585aef2 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -864,18 +864,13 @@ static void kdamond_apply_schemes(struct damon_ctx *c) } } -static inline unsigned long 
sz_damon_region(struct damon_region *r) -{ - return r->ar.end - r->ar.start; -} - /* * Merge two adjacent regions into one region */ static void damon_merge_two_regions(struct damon_target *t, struct damon_region *l, struct damon_region *r) { - unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r); + unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r); l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / (sz_l + sz_r); @@ -904,7 +899,7 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, if (prev && prev->ar.end == r->ar.start && abs(prev->nr_accesses - r->nr_accesses) <= thres && - sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) + damon_sz_region(prev) + damon_sz_region(r) <= sz_limit) damon_merge_two_regions(t, prev, r); else prev = r; -- GitLab From ab63f63f3885d492e62da55304b0483a2a9e6a7d Mon Sep 17 00:00:00 2001 From: Xin Hao <xhao@linux.alibaba.com> Date: Tue, 27 Sep 2022 08:19:46 +0800 Subject: [PATCH 1881/2223] mm/damon: use damon_sz_region() in appropriate place In many places we can use damon_sz_region() instead of "r->ar.end - r->ar.start".
Link: https://lkml.kernel.org/r/20220927001946.85375-2-xhao@linux.alibaba.com Signed-off-by: Xin Hao <xhao@linux.alibaba.com> Suggested-by: SeongJae Park <sj@kernel.org> Reviewed-by: SeongJae Park <sj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/damon/core.c | 17 ++++++++--------- mm/damon/vaddr.c | 4 ++-- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 5b9e0d585aef2..515ac4e52a113 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -490,7 +490,7 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) - sz += r->ar.end - r->ar.start; + sz += damon_sz_region(r); } if (ctx->attrs.min_nr_regions) @@ -673,7 +673,7 @@ static bool __damos_valid_target(struct damon_region *r, struct damos *s) { unsigned long sz; - sz = r->ar.end - r->ar.start; + sz = damon_sz_region(r); return s->pattern.min_sz_region <= sz && sz <= s->pattern.max_sz_region && s->pattern.min_nr_accesses <= r->nr_accesses && @@ -701,7 +701,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - unsigned long sz = r->ar.end - r->ar.start; + unsigned long sz = damon_sz_region(r); struct timespec64 begin, end; unsigned long sz_applied = 0; @@ -730,14 +730,14 @@ static void damon_do_apply_schemes(struct damon_ctx *c, sz = ALIGN_DOWN(quota->charge_addr_from - r->ar.start, DAMON_MIN_REGION); if (!sz) { - if (r->ar.end - r->ar.start <= - DAMON_MIN_REGION) + if (damon_sz_region(r) <= + DAMON_MIN_REGION) continue; sz = DAMON_MIN_REGION; } damon_split_region_at(t, r, sz); r = damon_next_region(r); - sz = r->ar.end - r->ar.start; + sz = damon_sz_region(r); } quota->charge_target_from = NULL; quota->charge_addr_from = 0; @@ -842,8 +842,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) continue; score = c->ops.get_scheme_score( c, t, r, s); - quota->histogram[score] += - 
r->ar.end - r->ar.start; + quota->histogram[score] += damon_sz_region(r); if (score > max_score) max_score = score; } @@ -957,7 +956,7 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs) int i; damon_for_each_region_safe(r, next, t) { - sz_region = r->ar.end - r->ar.start; + sz_region = damon_sz_region(r); for (i = 0; i < nr_subs - 1 && sz_region > 2 * DAMON_MIN_REGION; i++) { diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index ea94e0b2c3113..15f03df66db60 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -72,7 +72,7 @@ static int damon_va_evenly_split_region(struct damon_target *t, return -EINVAL; orig_end = r->ar.end; - sz_orig = r->ar.end - r->ar.start; + sz_orig = damon_sz_region(r); sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); if (!sz_piece) @@ -618,7 +618,7 @@ static unsigned long damos_madvise(struct damon_target *target, { struct mm_struct *mm; unsigned long start = PAGE_ALIGN(r->ar.start); - unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start); + unsigned long len = PAGE_ALIGN(damon_sz_region(r)); unsigned long applied; mm = damon_get_mm(target); -- GitLab From 16ce101db85db694a91380aa4c89b25530871d33 Mon Sep 17 00:00:00 2001 From: Alistair Popple <apopple@nvidia.com> Date: Wed, 28 Sep 2022 22:01:15 +1000 Subject: [PATCH 1882/2223] mm/memory.c: fix race when faulting a device private page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Fix several device private page reference counting issues", v2 This series aims to fix a number of page reference counting issues in drivers dealing with device private ZONE_DEVICE pages. These result in use-after-free type bugs, either from accessing a struct page which no longer exists because it has been removed or accessing fields within the struct page which are no longer valid because the page has been freed. During normal usage it is unlikely these will cause any problems. 
However without these fixes it is possible to crash the kernel from userspace. These crashes can be triggered either by unloading the kernel module or unbinding the device from the driver prior to a userspace task exiting. In modules such as Nouveau it is also possible to trigger some of these issues by explicitly closing the device file-descriptor prior to the task exiting and then accessing device private memory. This involves some minor changes to both PowerPC and AMD GPU code. Unfortunately I lack hardware to test either of those so any help there would be appreciated. The changes mimic what is done for both Nouveau and hmm-tests though so I doubt they will cause problems. This patch (of 8): When the CPU tries to access a device private page the migrate_to_ram() callback associated with the pgmap for the page is called. However no reference is taken on the faulting page. Therefore a concurrent migration of the device private page can free the page and possibly the underlying pgmap. This results in a race which can crash the kernel due to the migrate_to_ram() function pointer becoming invalid. It also means drivers can't reliably read the zone_device_data field because the page may have been freed with memunmap_pages(). Close the race by getting a reference on the page while holding the ptl to ensure it has not been freed. Unfortunately the elevated reference count will cause the migration required to handle the fault to fail. To avoid this failure pass the faulting page into the migrate_vma functions so that if an elevated reference count is found it can be checked to see if it's expected or not. 
[mpe@ellerman.id.au: fix build] Link: https://lkml.kernel.org/r/87fsgbf3gh.fsf@mpe.ellerman.id.au Link: https://lkml.kernel.org/r/cover.60659b549d8509ddecafad4f498ee7f03bb23c69.1664366292.git-series.apopple@nvidia.com Link: https://lkml.kernel.org/r/d3e813178a59e565e8d78d9b9a4e2562f6494f90.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple <apopple@nvidia.com> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Ralph Campbell <rcampbell@nvidia.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Lyude Paul <lyude@redhat.com> Cc: Alex Deucher <alexander.deucher@amd.com> Cc: Alex Sierra <alex.sierra@amd.com> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Christian König <christian.koenig@amd.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: David Hildenbrand <david@redhat.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Yang Shi <shy828301@gmail.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- arch/powerpc/kvm/book3s_hv_uvmem.c | 19 +++++++------ drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 17 +++++++----- drivers/gpu/drm/amd/amdkfd/kfd_migrate.h | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 11 +++++--- include/linux/migrate.h | 8 ++++++ lib/test_hmm.c | 7 ++--- mm/memory.c | 16 ++++++++++- mm/migrate.c | 34 ++++++++++++++---------- mm/migrate_device.c | 18 +++++++++---- 9 files changed, 89 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 5980063016207..965c9e9e500bc 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -508,10 +508,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm) static int __kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long page_shift, - struct kvm *kvm, unsigned long gpa) + struct kvm *kvm, unsigned 
long gpa, struct page *fault_page) { unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma mig; + struct migrate_vma mig = { 0 }; struct page *dpage, *spage; struct kvmppc_uvmem_page_pvt *pvt; unsigned long pfn; @@ -525,6 +525,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma, mig.dst = &dst_pfn; mig.pgmap_owner = &kvmppc_uvmem_pgmap; mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; + mig.fault_page = fault_page; /* The requested page is already paged-out, nothing to do */ if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL)) @@ -580,12 +581,14 @@ out_finalize: static inline int kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long page_shift, - struct kvm *kvm, unsigned long gpa) + struct kvm *kvm, unsigned long gpa, + struct page *fault_page) { int ret; mutex_lock(&kvm->arch.uvmem_lock); - ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa); + ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, + fault_page); mutex_unlock(&kvm->arch.uvmem_lock); return ret; @@ -634,7 +637,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot, pvt->remove_gfn = true; if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE, - PAGE_SHIFT, kvm, pvt->gpa)) + PAGE_SHIFT, kvm, pvt->gpa, NULL)) pr_err("Can't page out gpa:0x%lx addr:0x%lx\n", pvt->gpa, addr); } else { @@ -736,7 +739,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, bool pagein) { unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma mig; + struct migrate_vma mig = { 0 }; struct page *spage; unsigned long pfn; struct page *dpage; @@ -994,7 +997,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf) if (kvmppc_svm_page_out(vmf->vma, vmf->address, vmf->address + PAGE_SIZE, PAGE_SHIFT, - pvt->kvm, pvt->gpa)) + pvt->kvm, pvt->gpa, vmf->page)) return VM_FAULT_SIGBUS; else return 0; @@ -1065,7 +1068,7 @@ kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa, if (!vma || 
vma->vm_start > start || vma->vm_end < end) goto out; - if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa)) + if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, NULL)) ret = H_SUCCESS; out: mmap_read_unlock(kvm->mm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index b059a77b6081d..776448bd9fe4a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -409,7 +409,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange, uint64_t npages = (end - start) >> PAGE_SHIFT; struct kfd_process_device *pdd; struct dma_fence *mfence = NULL; - struct migrate_vma migrate; + struct migrate_vma migrate = { 0 }; unsigned long cpages = 0; dma_addr_t *scratch; void *buf; @@ -668,7 +668,7 @@ out_oom: static long svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, struct vm_area_struct *vma, uint64_t start, uint64_t end, - uint32_t trigger) + uint32_t trigger, struct page *fault_page) { struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); uint64_t npages = (end - start) >> PAGE_SHIFT; @@ -676,7 +676,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, unsigned long cpages = 0; struct kfd_process_device *pdd; struct dma_fence *mfence = NULL; - struct migrate_vma migrate; + struct migrate_vma migrate = { 0 }; dma_addr_t *scratch; void *buf; int r = -ENOMEM; @@ -699,6 +699,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, migrate.src = buf; migrate.dst = migrate.src + npages; + migrate.fault_page = fault_page; scratch = (dma_addr_t *)(migrate.dst + npages); kfd_smi_event_migration_start(adev->kfd.dev, p->lead_thread->pid, @@ -766,7 +767,7 @@ out: * 0 - OK, otherwise error code */ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, - uint32_t trigger) + uint32_t trigger, struct page *fault_page) { struct 
amdgpu_device *adev; struct vm_area_struct *vma; @@ -807,7 +808,8 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, } next = min(vma->vm_end, end); - r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger); + r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger, + fault_page); if (r < 0) { pr_debug("failed %ld to migrate prange %p\n", r, prange); break; @@ -851,7 +853,7 @@ svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc, pr_debug("from gpu 0x%x to gpu 0x%x\n", prange->actual_loc, best_loc); do { - r = svm_migrate_vram_to_ram(prange, mm, trigger); + r = svm_migrate_vram_to_ram(prange, mm, trigger, NULL); if (r) return r; } while (prange->actual_loc && --retries); @@ -938,7 +940,8 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf) goto out_unlock_prange; } - r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU); + r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, + vmf->page); if (r) pr_debug("failed %d migrate 0x%p [0x%lx 0x%lx] to ram\n", r, prange, prange->start, prange->last); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h index b3f0754b32faa..a5d7e6d222646 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h @@ -43,7 +43,7 @@ enum MIGRATION_COPY_DIR { int svm_migrate_to_vram(struct svm_range *prange, uint32_t best_loc, struct mm_struct *mm, uint32_t trigger); int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, - uint32_t trigger); + uint32_t trigger, struct page *fault_page); unsigned long svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 11074cc8c333b..9139e5a0b2a07 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -2913,13 +2913,15 @@ 
retry_write_locked: */ if (prange->actual_loc) r = svm_migrate_vram_to_ram(prange, mm, - KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); + KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, + NULL); else r = 0; } } else { r = svm_migrate_vram_to_ram(prange, mm, - KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); + KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, + NULL); } if (r) { pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", @@ -3242,7 +3244,8 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, return 0; if (!best_loc) { - r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PREFETCH); + r = svm_migrate_vram_to_ram(prange, mm, + KFD_MIGRATE_TRIGGER_PREFETCH, NULL); *migrated = !r; return r; } @@ -3303,7 +3306,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work) mutex_lock(&prange->migrate_mutex); do { r = svm_migrate_vram_to_ram(prange, mm, - KFD_MIGRATE_TRIGGER_TTM_EVICTION); + KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL); } while (!r && prange->actual_loc && --retries); if (!r && prange->actual_loc) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 704a04f5a0746..52090d1f92307 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -62,6 +62,8 @@ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION extern void putback_movable_pages(struct list_head *l); +int migrate_folio_extra(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode, int extra_count); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, @@ -197,6 +199,12 @@ struct migrate_vma { */ void *pgmap_owner; unsigned long flags; + + /* + * Set to vmf->page if this is being called to migrate a page as part of + * a migrate_to_ram() callback. 
+ */ + struct page *fault_page; }; int migrate_vma_setup(struct migrate_vma *args); diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 6a33f6b1b4651..e566166b55712 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -907,7 +907,7 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror, struct vm_area_struct *vma; unsigned long src_pfns[64] = { 0 }; unsigned long dst_pfns[64] = { 0 }; - struct migrate_vma args; + struct migrate_vma args = { 0 }; unsigned long next; int ret; @@ -968,7 +968,7 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, unsigned long src_pfns[64] = { 0 }; unsigned long dst_pfns[64] = { 0 }; struct dmirror_bounce bounce; - struct migrate_vma args; + struct migrate_vma args = { 0 }; unsigned long next; int ret; @@ -1334,7 +1334,7 @@ static void dmirror_devmem_free(struct page *page) static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) { - struct migrate_vma args; + struct migrate_vma args = { 0 }; unsigned long src_pfns = 0; unsigned long dst_pfns = 0; struct page *rpage; @@ -1357,6 +1357,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) args.dst = &dst_pfns; args.pgmap_owner = dmirror->mdevice; args.flags = dmirror_select_device(dmirror); + args.fault_page = vmf->page; if (migrate_vma_setup(&args)) return VM_FAULT_SIGBUS; diff --git a/mm/memory.c b/mm/memory.c index 2c7723ea43714..4ad6077164cd2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3750,7 +3750,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = remove_device_exclusive_entry(vmf); } else if (is_device_private_entry(entry)) { vmf->page = pfn_swap_entry_to_page(entry); - ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + spin_unlock(vmf->ptl); + goto out; + } + + /* + * Get a page reference while we know the page can't be + * freed. 
+ */ + get_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + vmf->page->pgmap->ops->migrate_to_ram(vmf); + put_page(vmf->page); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else if (is_swapin_error_entry(entry)) { diff --git a/mm/migrate.c b/mm/migrate.c index c228afba0963d..1379e1912772e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -625,6 +625,25 @@ EXPORT_SYMBOL(folio_migrate_copy); * Migration functions ***********************************************************/ +int migrate_folio_extra(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode, int extra_count) +{ + int rc; + + BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ + + rc = folio_migrate_mapping(mapping, dst, src, extra_count); + + if (rc != MIGRATEPAGE_SUCCESS) + return rc; + + if (mode != MIGRATE_SYNC_NO_COPY) + folio_migrate_copy(dst, src); + else + folio_migrate_flags(dst, src); + return MIGRATEPAGE_SUCCESS; +} + /** * migrate_folio() - Simple folio migration. * @mapping: The address_space containing the folio. @@ -640,20 +659,7 @@ EXPORT_SYMBOL(folio_migrate_copy); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { - int rc; - - BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ - - rc = folio_migrate_mapping(mapping, dst, src, 0); - - if (rc != MIGRATEPAGE_SUCCESS) - return rc; - - if (mode != MIGRATE_SYNC_NO_COPY) - folio_migrate_copy(dst, src); - else - folio_migrate_flags(dst, src); - return MIGRATEPAGE_SUCCESS; + return migrate_folio_extra(mapping, dst, src, mode, 0); } EXPORT_SYMBOL(migrate_folio); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 5ab6ab9d2ed82..8dee38ffcda25 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -325,14 +325,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate) * folio_migrate_mapping(), except that here we allow migration of a * ZONE_DEVICE page. 
*/ -static bool migrate_vma_check_page(struct page *page) +static bool migrate_vma_check_page(struct page *page, struct page *fault_page) { /* * One extra ref because caller holds an extra reference, either from * isolate_lru_page() for a regular page, or migrate_vma_collect() for * a device page. */ - int extra = 1; + int extra = 1 + (page == fault_page); /* * FIXME support THP (transparent huge page), it is bit more complex to @@ -405,7 +405,8 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) if (folio_mapped(folio)) try_to_migrate(folio, 0); - if (page_mapped(page) || !migrate_vma_check_page(page)) { + if (page_mapped(page) || + !migrate_vma_check_page(page, migrate->fault_page)) { if (!is_zone_device_page(page)) { get_page(page); putback_lru_page(page); @@ -517,6 +518,8 @@ int migrate_vma_setup(struct migrate_vma *args) return -EINVAL; if (!args->src || !args->dst) return -EINVAL; + if (args->fault_page && !is_device_private_page(args->fault_page)) + return -EINVAL; memset(args->src, 0, sizeof(*args->src) * nr_pages); args->cpages = 0; @@ -747,8 +750,13 @@ void migrate_vma_pages(struct migrate_vma *migrate) continue; } - r = migrate_folio(mapping, page_folio(newpage), - page_folio(page), MIGRATE_SYNC_NO_COPY); + if (migrate->fault_page == page) + r = migrate_folio_extra(mapping, page_folio(newpage), + page_folio(page), + MIGRATE_SYNC_NO_COPY, 1); + else + r = migrate_folio(mapping, page_folio(newpage), + page_folio(page), MIGRATE_SYNC_NO_COPY); if (r != MIGRATEPAGE_SUCCESS) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; } -- GitLab From ef233450898f8893dafa193a9f3211fa077a3d05 Mon Sep 17 00:00:00 2001 From: Alistair Popple <apopple@nvidia.com> Date: Wed, 28 Sep 2022 22:01:16 +1000 Subject: [PATCH 1883/2223] mm: free device private pages have zero refcount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount") device private pages have no longer 
had an extra reference count when the page is in use. However before handing them back to the owning device driver we add an extra reference count such that free pages have a reference count of one. This makes it difficult to tell if a page is free or not because both free and in use pages will have a non-zero refcount. Instead we should return pages to the drivers page allocator with a zero reference count. Kernel code can then safely use kernel functions such as get_page_unless_zero(). Link: https://lkml.kernel.org/r/cf70cf6f8c0bdb8aaebdbfb0d790aea4c683c3c6.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple <apopple@nvidia.com> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Alex Deucher <alexander.deucher@amd.com> Cc: Christian König <christian.koenig@amd.com> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Lyude Paul <lyude@redhat.com> Cc: Ralph Campbell <rcampbell@nvidia.com> Cc: Alex Sierra <alex.sierra@amd.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: David Hildenbrand <david@redhat.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Yang Shi <shy828301@gmail.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- arch/powerpc/kvm/book3s_hv_uvmem.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +- drivers/gpu/drm/nouveau/nouveau_dmem.c | 2 +- include/linux/memremap.h | 1 + lib/test_hmm.c | 2 +- mm/memremap.c | 9 +++++++++ mm/page_alloc.c | 8 ++++++++ 7 files changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 965c9e9e500bc..e2f11f9c3f2aa 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -718,7 +718,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) dpage = pfn_to_page(uvmem_pfn); 
dpage->zone_device_data = pvt; - lock_page(dpage); + zone_device_page_init(dpage); return dpage; out_clear: spin_lock(&kvmppc_uvmem_bitmap_lock); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 776448bd9fe4a..97a684568ae01 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -223,7 +223,7 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn) page = pfn_to_page(pfn); svm_range_bo_ref(prange->svm_bo); page->zone_device_data = prange->svm_bo; - lock_page(page); + zone_device_page_init(page); } static void diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 16356611b5b95..b092988266a6a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -326,7 +326,7 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm) return NULL; } - lock_page(page); + zone_device_page_init(page); return page; } diff --git a/include/linux/memremap.h b/include/linux/memremap.h index c3b4cc84877b5..7fcaf3180a5b6 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -187,6 +187,7 @@ static inline bool folio_is_device_coherent(const struct folio *folio) } #ifdef CONFIG_ZONE_DEVICE +void zone_device_page_init(struct page *page); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e566166b55712..bc2b949911653 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -627,8 +627,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) goto error; } + zone_device_page_init(dpage); dpage->zone_device_data = rpage; - lock_page(dpage); return dpage; error: diff --git a/mm/memremap.c b/mm/memremap.c index 25029a474d30b..1c2c038f34109 100644 --- a/mm/memremap.c +++ 
b/mm/memremap.c @@ -505,8 +505,17 @@ void free_zone_device_page(struct page *page) /* * Reset the page count to 1 to prepare for handing out the page again. */ + if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && + page->pgmap->type != MEMORY_DEVICE_COHERENT) + set_page_count(page, 1); +} + +void zone_device_page_init(struct page *page) +{ set_page_count(page, 1); + lock_page(page); } +EXPORT_SYMBOL_GPL(zone_device_page_init); #ifdef CONFIG_FS_DAX bool __put_devmap_managed_page_refs(struct page *page, int refs) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 12b6184cbbed6..059f6946832fa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6819,6 +6819,14 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } + + /* + * ZONE_DEVICE pages are released directly to the driver page allocator + * which will set the page count to 1 when allocating the page. + */ + if (pgmap->type == MEMORY_DEVICE_PRIVATE || + pgmap->type == MEMORY_DEVICE_COHERENT) + set_page_count(page, 0); } /* -- GitLab From 0dc45ca1ce18900572282c4f054bbe78351cb6a7 Mon Sep 17 00:00:00 2001 From: Alistair Popple <apopple@nvidia.com> Date: Wed, 28 Sep 2022 22:01:17 +1000 Subject: [PATCH 1884/2223] mm/memremap.c: take a pgmap reference on page allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ZONE_DEVICE pages have a struct dev_pagemap which is allocated by a driver. When the struct page is first allocated by the kernel in memremap_pages() a reference is taken on the associated pagemap to ensure it is not freed prior to the pages being freed. Prior to 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount") pages were considered free and returned to the driver when the reference count dropped to one. However the pagemap reference was not dropped until the page reference count hit zero. 
This would occur as part of the final put_page() in memunmap_pages() which would wait for all pages to be freed prior to returning. When the extra refcount was removed the pagemap reference was no longer being dropped in put_page(). Instead memunmap_pages() was changed to explicitly drop the pagemap references. This means that memunmap_pages() can complete even though pages are still mapped by the kernel which can lead to kernel crashes, particularly if a driver frees the pagemap. To fix this drivers should take a pagemap reference when allocating the page. This reference can then be returned when the page is freed. Link: https://lkml.kernel.org/r/12d155ec727935ebfbb4d639a03ab374917ea51b.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple <apopple@nvidia.com> Fixes: 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount") Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: Felix Kuehling <Felix.Kuehling@amd.com> Cc: Alex Deucher <alexander.deucher@amd.com> Cc: Christian König <christian.koenig@amd.com> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Lyude Paul <lyude@redhat.com> Cc: Ralph Campbell <rcampbell@nvidia.com> Cc: Alex Sierra <alex.sierra@amd.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: David Hildenbrand <david@redhat.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Yang Shi <shy828301@gmail.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/memremap.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/mm/memremap.c b/mm/memremap.c index 1c2c038f34109..421bec3a29ee7 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -138,8 +138,11 @@ void memunmap_pages(struct dev_pagemap *pgmap) int i; percpu_ref_kill(&pgmap->ref); - for (i = 0; i < pgmap->nr_range; i++) - percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); + if (pgmap->type 
!= MEMORY_DEVICE_PRIVATE && + pgmap->type != MEMORY_DEVICE_COHERENT) + for (i = 0; i < pgmap->nr_range; i++) + percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); + wait_for_completion(&pgmap->done); for (i = 0; i < pgmap->nr_range; i++) @@ -264,7 +267,9 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], PHYS_PFN(range->start), PHYS_PFN(range_len(range)), pgmap); - percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id)); + if (pgmap->type != MEMORY_DEVICE_PRIVATE && + pgmap->type != MEMORY_DEVICE_COHERENT) + percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id)); return 0; err_add_memory: @@ -502,16 +507,24 @@ void free_zone_device_page(struct page *page) page->mapping = NULL; page->pgmap->ops->page_free(page); - /* - * Reset the page count to 1 to prepare for handing out the page again. - */ if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && page->pgmap->type != MEMORY_DEVICE_COHERENT) + /* + * Reset the page count to 1 to prepare for handing out the page + * again. + */ set_page_count(page, 1); + else + put_dev_pagemap(page->pgmap); } void zone_device_page_init(struct page *page) { + /* + * Drivers shouldn't be allocating pages after calling + * memunmap_pages(). + */ + WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref)); set_page_count(page, 1); lock_page(page); } -- GitLab From 241f68859656836ae3e85179cc224cc4c5e4e6a7 Mon Sep 17 00:00:00 2001 From: Alistair Popple <apopple@nvidia.com> Date: Wed, 28 Sep 2022 22:01:18 +1000 Subject: [PATCH 1885/2223] mm/migrate_device.c: refactor migrate_vma and migrate_deivce_coherent_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit migrate_device_coherent_page() reuses the existing migrate_vma family of functions to migrate a specific page without providing a valid mapping or vma. 
This looks a bit odd because it means we are calling migrate_vma_*() without setting a valid vma, however it was considered acceptable at the time because the details were internal to migrate_device.c and there was only a single user. One of the reasons the details could be kept internal was that this was strictly for migrating device coherent memory. Such memory can be copied directly by the CPU without intervention from a driver. However this isn't true for device private memory, and a future change requires similar functionality for device private memory. So refactor the code into something more sensible for migrating device memory without a vma. Link: https://lkml.kernel.org/r/c7b2ff84e9b33d022cf4a40f87d051f281a16d8f.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple <apopple@nvidia.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Zi Yan <ziy@nvidia.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Yang Shi <shy828301@gmail.com> Cc: David Hildenbrand <david@redhat.com> Cc: Ralph Campbell <rcampbell@nvidia.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Alex Deucher <alexander.deucher@amd.com> Cc: Alex Sierra <alex.sierra@amd.com> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Christian König <christian.koenig@amd.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Felix Kuehling <Felix.Kuehling@amd.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: Lyude Paul <lyude@redhat.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/migrate_device.c | 150 +++++++++++++++++++++++++------------------- 1 file changed, 85 insertions(+), 65 deletions(-) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 8dee38ffcda25..7707c1d898f55 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -357,26 +357,20 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page) } /* - * migrate_vma_unmap() - replace page mapping with special migration pte entry - * @migrate: migrate 
struct containing all migration information - * - * Isolate pages from the LRU and replace mappings (CPU page table pte) with a - * special migration pte entry and check if it has been pinned. Pinned pages are - * restored because we cannot migrate them. - * - * This is the last step before we call the device driver callback to allocate - * destination memory and copy contents of original page over to new page. + * Unmaps pages for migration. Returns number of unmapped pages. */ -static void migrate_vma_unmap(struct migrate_vma *migrate) +static unsigned long migrate_device_unmap(unsigned long *src_pfns, + unsigned long npages, + struct page *fault_page) { - const unsigned long npages = migrate->npages; unsigned long i, restore = 0; bool allow_drain = true; + unsigned long unmapped = 0; lru_add_drain(); for (i = 0; i < npages; i++) { - struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); struct folio *folio; if (!page) @@ -391,8 +385,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) } if (isolate_lru_page(page)) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - migrate->cpages--; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; continue; } @@ -406,34 +399,54 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) try_to_migrate(folio, 0); if (page_mapped(page) || - !migrate_vma_check_page(page, migrate->fault_page)) { + !migrate_vma_check_page(page, fault_page)) { if (!is_zone_device_page(page)) { get_page(page); putback_lru_page(page); } - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - migrate->cpages--; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; continue; } + + unmapped++; } for (i = 0; i < npages && restore; i++) { - struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); struct folio *folio; - if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) + if (!page || (src_pfns[i] & MIGRATE_PFN_MIGRATE)) continue; folio = 
page_folio(page); remove_migration_ptes(folio, folio, false); - migrate->src[i] = 0; + src_pfns[i] = 0; folio_unlock(folio); folio_put(folio); restore--; } + + return unmapped; +} + +/* + * migrate_vma_unmap() - replace page mapping with special migration pte entry + * @migrate: migrate struct containing all migration information + * + * Isolate pages from the LRU and replace mappings (CPU page table pte) with a + * special migration pte entry and check if it has been pinned. Pinned pages are + * restored because we cannot migrate them. + * + * This is the last step before we call the device driver callback to allocate + * destination memory and copy contents of original page over to new page. + */ +static void migrate_vma_unmap(struct migrate_vma *migrate) +{ + migrate->cpages = migrate_device_unmap(migrate->src, migrate->npages, + migrate->fault_page); } /** @@ -680,41 +693,36 @@ abort: *src &= ~MIGRATE_PFN_MIGRATE; } -/** - * migrate_vma_pages() - migrate meta-data from src page to dst page - * @migrate: migrate struct containing all migration information - * - * This migrates struct page meta-data from source struct page to destination - * struct page. This effectively finishes the migration from source page to the - * destination page. 
- */ -void migrate_vma_pages(struct migrate_vma *migrate) +static void migrate_device_pages(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages, + struct migrate_vma *migrate) { - const unsigned long npages = migrate->npages; - const unsigned long start = migrate->start; struct mmu_notifier_range range; - unsigned long addr, i; + unsigned long i; bool notified = false; - for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { - struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); - struct page *page = migrate_pfn_to_page(migrate->src[i]); + for (i = 0; i < npages; i++) { + struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); struct address_space *mapping; int r; if (!newpage) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; continue; } if (!page) { + unsigned long addr; + /* * The only time there is no vma is when called from * migrate_device_coherent_page(). However this isn't * called if the page could not be unmapped. */ - VM_BUG_ON(!migrate->vma); - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + VM_BUG_ON(!migrate); + addr = migrate->start + i*PAGE_SIZE; + if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) continue; if (!notified) { notified = true; @@ -726,7 +734,7 @@ void migrate_vma_pages(struct migrate_vma *migrate) mmu_notifier_invalidate_range_start(&range); } migrate_vma_insert_page(migrate, addr, newpage, - &migrate->src[i]); + &src_pfns[i]); continue; } @@ -739,18 +747,18 @@ void migrate_vma_pages(struct migrate_vma *migrate) * device private or coherent memory. */ if (mapping) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; continue; } } else if (is_zone_device_page(newpage)) { /* * Other types of ZONE_DEVICE page are not supported. 
*/ - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; continue; } - if (migrate->fault_page == page) + if (migrate && migrate->fault_page == page) r = migrate_folio_extra(mapping, page_folio(newpage), page_folio(page), MIGRATE_SYNC_NO_COPY, 1); @@ -758,7 +766,7 @@ void migrate_vma_pages(struct migrate_vma *migrate) r = migrate_folio(mapping, page_folio(newpage), page_folio(page), MIGRATE_SYNC_NO_COPY); if (r != MIGRATEPAGE_SUCCESS) - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; } /* @@ -769,28 +777,30 @@ void migrate_vma_pages(struct migrate_vma *migrate) if (notified) mmu_notifier_invalidate_range_only_end(&range); } -EXPORT_SYMBOL(migrate_vma_pages); /** - * migrate_vma_finalize() - restore CPU page table entry + * migrate_vma_pages() - migrate meta-data from src page to dst page * @migrate: migrate struct containing all migration information * - * This replaces the special migration pte entry with either a mapping to the - * new page if migration was successful for that page, or to the original page - * otherwise. - * - * This also unlocks the pages and puts them back on the lru, or drops the extra - * refcount, for device pages. + * This migrates struct page meta-data from source struct page to destination + * struct page. This effectively finishes the migration from source page to the + * destination page. 
*/ -void migrate_vma_finalize(struct migrate_vma *migrate) +void migrate_vma_pages(struct migrate_vma *migrate) +{ + migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate); +} +EXPORT_SYMBOL(migrate_vma_pages); + +static void migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages) { - const unsigned long npages = migrate->npages; unsigned long i; for (i = 0; i < npages; i++) { struct folio *dst, *src; - struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); - struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); if (!page) { if (newpage) { @@ -800,7 +810,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate) continue; } - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { + if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !newpage) { if (newpage) { unlock_page(newpage); put_page(newpage); @@ -827,6 +837,22 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } } } + +/** + * migrate_vma_finalize() - restore CPU page table entry + * @migrate: migrate struct containing all migration information + * + * This replaces the special migration pte entry with either a mapping to the + * new page if migration was successful for that page, or to the original page + * otherwise. + * + * This also unlocks the pages and puts them back on the lru, or drops the extra + * refcount, for device pages. 
+ */ +void migrate_vma_finalize(struct migrate_vma *migrate) +{ + migrate_device_finalize(migrate->src, migrate->dst, migrate->npages); +} EXPORT_SYMBOL(migrate_vma_finalize); /* @@ -837,25 +863,19 @@ EXPORT_SYMBOL(migrate_vma_finalize); int migrate_device_coherent_page(struct page *page) { unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma args; struct page *dpage; WARN_ON_ONCE(PageCompound(page)); lock_page(page); src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; - args.src = &src_pfn; - args.dst = &dst_pfn; - args.cpages = 1; - args.npages = 1; - args.vma = NULL; /* * We don't have a VMA and don't need to walk the page tables to find * the source page. So call migrate_vma_unmap() directly to unmap the * page as migrate_vma_setup() will fail if args.vma == NULL. */ - migrate_vma_unmap(&args); + migrate_device_unmap(&src_pfn, 1, NULL); if (!(src_pfn & MIGRATE_PFN_MIGRATE)) return -EBUSY; @@ -865,10 +885,10 @@ int migrate_device_coherent_page(struct page *page) dst_pfn = migrate_pfn(page_to_pfn(dpage)); } - migrate_vma_pages(&args); + migrate_device_pages(&src_pfn, &dst_pfn, 1, NULL); if (src_pfn & MIGRATE_PFN_MIGRATE) copy_highpage(dpage, page); - migrate_vma_finalize(&args); + migrate_device_finalize(&src_pfn, &dst_pfn, 1); if (src_pfn & MIGRATE_PFN_MIGRATE) return 0; -- GitLab From e778406b40dbb1342a1888cd751ca9d2982a12e2 Mon Sep 17 00:00:00 2001 From: Alistair Popple <apopple@nvidia.com> Date: Wed, 28 Sep 2022 22:01:19 +1000 Subject: [PATCH 1886/2223] mm/migrate_device.c: add migrate_device_range() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Device drivers can use the migrate_vma family of functions to migrate existing private anonymous mappings to device private pages. These pages are backed by memory on the device with drivers being responsible for copying data to and from device memory. 
Device private pages are freed via the pgmap->page_free() callback when they are unmapped and their refcount drops to zero. Alternatively they may be freed indirectly via migration back to CPU memory in response to a pgmap->migrate_to_ram() callback called whenever the CPU accesses an address mapped to a device private page. In other words drivers cannot control the lifetime of data allocated on the devices and must wait until these pages are freed from userspace. This causes issues when memory needs to be reclaimed on the device, either because the device is going away due to a ->release() callback or because another user needs to use the memory. Drivers could use the existing migrate_vma functions to migrate data off the device. However this would require them to track the mappings of each page which is both complicated and not always possible. Instead drivers need to be able to migrate device pages directly so they can free up device memory. To allow that this patch introduces the migrate_device family of functions which are functionally similar to migrate_vma but which skip the initial lookup based on mapping.
Link: https://lkml.kernel.org/r/868116aab70b0c8ee467d62498bb2cf0ef907295.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple <apopple@nvidia.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Zi Yan <ziy@nvidia.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Yang Shi <shy828301@gmail.com> Cc: David Hildenbrand <david@redhat.com> Cc: Ralph Campbell <rcampbell@nvidia.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Alex Deucher <alexander.deucher@amd.com> Cc: Alex Sierra <alex.sierra@amd.com> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Christian König <christian.koenig@amd.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Felix Kuehling <Felix.Kuehling@amd.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: Lyude Paul <lyude@redhat.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- include/linux/migrate.h | 7 ++++ mm/migrate_device.c | 89 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 89 insertions(+), 7 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 52090d1f92307..3ef77f52a4f04 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -210,6 +210,13 @@ struct migrate_vma { int migrate_vma_setup(struct migrate_vma *args); void migrate_vma_pages(struct migrate_vma *migrate); void migrate_vma_finalize(struct migrate_vma *migrate); +int migrate_device_range(unsigned long *src_pfns, unsigned long start, + unsigned long npages); +void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns, + unsigned long npages); +void migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages); + #endif /* CONFIG_MIGRATION */ #endif /* _LINUX_MIGRATE_H */ diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 7707c1d898f55..6fa682eef7a00 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -693,7 +693,7 @@ abort: *src &= ~MIGRATE_PFN_MIGRATE; } -static void migrate_device_pages(unsigned long 
*src_pfns, +static void __migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns, unsigned long npages, struct migrate_vma *migrate) { @@ -715,6 +715,9 @@ static void migrate_device_pages(unsigned long *src_pfns, if (!page) { unsigned long addr; + if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) + continue; + /* * The only time there is no vma is when called from * migrate_device_coherent_page(). However this isn't @@ -722,8 +725,6 @@ static void migrate_device_pages(unsigned long *src_pfns, */ VM_BUG_ON(!migrate); addr = migrate->start + i*PAGE_SIZE; - if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) - continue; if (!notified) { notified = true; @@ -778,6 +779,22 @@ static void migrate_device_pages(unsigned long *src_pfns, mmu_notifier_invalidate_range_only_end(&range); } +/** + * migrate_device_pages() - migrate meta-data from src page to dst page + * @src_pfns: src_pfns returned from migrate_device_range() + * @dst_pfns: array of pfns allocated by the driver to migrate memory to + * @npages: number of pages in the range + * + * Equivalent to migrate_vma_pages(). This is called to migrate struct page + * meta-data from source struct page to destination. 
+ */ +void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns, + unsigned long npages) +{ + __migrate_device_pages(src_pfns, dst_pfns, npages, NULL); +} +EXPORT_SYMBOL(migrate_device_pages); + /** * migrate_vma_pages() - migrate meta-data from src page to dst page * @migrate: migrate struct containing all migration information @@ -788,12 +805,22 @@ static void migrate_device_pages(unsigned long *src_pfns, */ void migrate_vma_pages(struct migrate_vma *migrate) { - migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate); + __migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate); } EXPORT_SYMBOL(migrate_vma_pages); -static void migrate_device_finalize(unsigned long *src_pfns, - unsigned long *dst_pfns, unsigned long npages) +/* + * migrate_device_finalize() - complete page migration + * @src_pfns: src_pfns returned from migrate_device_range() + * @dst_pfns: array of pfns allocated by the driver to migrate memory to + * @npages: number of pages in the range + * + * Completes migration of the page by removing special migration entries. + * Drivers must ensure copying of page data is complete and visible to the CPU + * before calling this. + */ +void migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages) { unsigned long i; @@ -837,6 +864,7 @@ static void migrate_device_finalize(unsigned long *src_pfns, } } } +EXPORT_SYMBOL(migrate_device_finalize); /** * migrate_vma_finalize() - restore CPU page table entry @@ -855,6 +883,53 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } EXPORT_SYMBOL(migrate_vma_finalize); +/** + * migrate_device_range() - migrate device private pfns to normal memory. + * @src_pfns: array large enough to hold migrating source device private pfns. + * @start: starting pfn in the range to migrate. + * @npages: number of pages to migrate. 
+ * + * migrate_device_range() is similar in concept to migrate_vma_setup() except that + * instead of looking up pages based on virtual address mappings a range of + * device pfns that should be migrated to system memory is used instead. + * + * This is useful when a driver needs to free device memory but doesn't know the + * virtual mappings of every page that may be in device memory. For example this + * is often the case when a driver is being unloaded or unbound from a device. + * + * Like migrate_vma_setup() this function will take a reference and lock any + * migrating pages that aren't free before unmapping them. Drivers may then + * allocate destination pages and start copying data from the device to CPU + * memory before calling migrate_device_pages(). + */ +int migrate_device_range(unsigned long *src_pfns, unsigned long start, + unsigned long npages) +{ + unsigned long i, pfn; + + for (pfn = start, i = 0; i < npages; pfn++, i++) { + struct page *page = pfn_to_page(pfn); + + if (!get_page_unless_zero(page)) { + src_pfns[i] = 0; + continue; + } + + if (!trylock_page(page)) { + src_pfns[i] = 0; + put_page(page); + continue; + } + + src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + } + + migrate_device_unmap(src_pfns, npages, NULL); + + return 0; +} +EXPORT_SYMBOL(migrate_device_range); + /* * Migrate a device coherent page back to normal memory.
The caller should have * a reference on page which will be copied to the new page if migration is @@ -885,7 +960,7 @@ int migrate_device_coherent_page(struct page *page) dst_pfn = migrate_pfn(page_to_pfn(dpage)); } - migrate_device_pages(&src_pfn, &dst_pfn, 1, NULL); + migrate_device_pages(&src_pfn, &dst_pfn, 1); if (src_pfn & MIGRATE_PFN_MIGRATE) copy_highpage(dpage, page); migrate_device_finalize(&src_pfn, &dst_pfn, 1); -- GitLab From d9b719394a1147614351961ac454589111c76e76 Mon Sep 17 00:00:00 2001 From: Alistair Popple <apopple@nvidia.com> Date: Wed, 28 Sep 2022 22:01:20 +1000 Subject: [PATCH 1887/2223] nouveau/dmem: refactor nouveau_dmem_fault_copy_one() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nouveau_dmem_fault_copy_one() is used during handling of CPU faults via the migrate_to_ram() callback and is used to copy data from GPU to CPU memory. It is currently specific to fault handling, however a future patch implementing eviction of data during teardown needs similar functionality. Refactor out the core functionality so that it is not specific to fault handling. 
Link: https://lkml.kernel.org/r/20573d7b4e641a78fde9935f948e64e71c9e709e.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple <apopple@nvidia.com> Reviewed-by: Lyude Paul <lyude@redhat.com> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Ralph Campbell <rcampbell@nvidia.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Alex Deucher <alexander.deucher@amd.com> Cc: Alex Sierra <alex.sierra@amd.com> Cc: Christian König <christian.koenig@amd.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: David Hildenbrand <david@redhat.com> Cc: Felix Kuehling <Felix.Kuehling@amd.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Yang Shi <shy828301@gmail.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 58 +++++++++++++------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index b092988266a6a..65f51fb6a70ce 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -139,44 +139,24 @@ static void nouveau_dmem_fence_done(struct nouveau_fence **fence) } } -static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm, - struct vm_fault *vmf, struct migrate_vma *args, - dma_addr_t *dma_addr) +static int nouveau_dmem_copy_one(struct nouveau_drm *drm, struct page *spage, + struct page *dpage, dma_addr_t *dma_addr) { struct device *dev = drm->dev->dev; - struct page *dpage, *spage; - struct nouveau_svmm *svmm; - - spage = migrate_pfn_to_page(args->src[0]); - if (!spage || !(args->src[0] & MIGRATE_PFN_MIGRATE)) - return 0; - dpage = alloc_page_vma(GFP_HIGHUSER, vmf->vma, vmf->address); - if (!dpage) - return VM_FAULT_SIGBUS; lock_page(dpage); *dma_addr = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL); if 
(dma_mapping_error(dev, *dma_addr)) - goto error_free_page; + return -EIO; - svmm = spage->zone_device_data; - mutex_lock(&svmm->mutex); - nouveau_svmm_invalidate(svmm, args->start, args->end); if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_HOST, *dma_addr, - NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage))) - goto error_dma_unmap; - mutex_unlock(&svmm->mutex); + NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage))) { + dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); + return -EIO; + } - args->dst[0] = migrate_pfn(page_to_pfn(dpage)); return 0; - -error_dma_unmap: - mutex_unlock(&svmm->mutex); - dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); -error_free_page: - __free_page(dpage); - return VM_FAULT_SIGBUS; } static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) @@ -184,9 +164,11 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) struct nouveau_drm *drm = page_to_drm(vmf->page); struct nouveau_dmem *dmem = drm->dmem; struct nouveau_fence *fence; + struct nouveau_svmm *svmm; + struct page *spage, *dpage; unsigned long src = 0, dst = 0; dma_addr_t dma_addr = 0; - vm_fault_t ret; + vm_fault_t ret = 0; struct migrate_vma args = { .vma = vmf->vma, .start = vmf->address, @@ -207,9 +189,25 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) if (!args.cpages) return 0; - ret = nouveau_dmem_fault_copy_one(drm, vmf, &args, &dma_addr); - if (ret || dst == 0) + spage = migrate_pfn_to_page(src); + if (!spage || !(src & MIGRATE_PFN_MIGRATE)) + goto done; + + dpage = alloc_page_vma(GFP_HIGHUSER, vmf->vma, vmf->address); + if (!dpage) + goto done; + + dst = migrate_pfn(page_to_pfn(dpage)); + + svmm = spage->zone_device_data; + mutex_lock(&svmm->mutex); + nouveau_svmm_invalidate(svmm, args.start, args.end); + ret = nouveau_dmem_copy_one(drm, spage, dpage, &dma_addr); + mutex_unlock(&svmm->mutex); + if (ret) { + ret = VM_FAULT_SIGBUS; goto done; + } nouveau_fence_new(dmem->migrate.chan, false, 
&fence); migrate_vma_pages(&args); -- GitLab From 249881232e1471d28b68f9a3829acc14d150cf5d Mon Sep 17 00:00:00 2001 From: Alistair Popple <apopple@nvidia.com> Date: Wed, 28 Sep 2022 22:01:21 +1000 Subject: [PATCH 1888/2223] nouveau/dmem: evict device private memory during release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the module is unloaded or a GPU is unbound from the module it is possible for device private pages to still be mapped in currently running processes. This can lead to hangs and RCU stall warnings when unbinding the device as memunmap_pages() will wait in an uninterruptible state until all device pages have been freed which may never happen. Fix this by migrating device mappings back to normal CPU memory prior to freeing the GPU memory chunks and associated device private pages. Link: https://lkml.kernel.org/r/66277601fb8fda9af408b33da9887192bf895bda.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple <apopple@nvidia.com> Cc: Lyude Paul <lyude@redhat.com> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Ralph Campbell <rcampbell@nvidia.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Alex Deucher <alexander.deucher@amd.com> Cc: Alex Sierra <alex.sierra@amd.com> Cc: Christian König <christian.koenig@amd.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: David Hildenbrand <david@redhat.com> Cc: Felix Kuehling <Felix.Kuehling@amd.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Yang Shi <shy828301@gmail.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 48 ++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 65f51fb6a70ce..5fe209107246f 100644 ---
a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -367,6 +367,52 @@ nouveau_dmem_suspend(struct nouveau_drm *drm) mutex_unlock(&drm->dmem->mutex); } +/* + * Evict all pages mapping a chunk. + */ +static void +nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk) +{ + unsigned long i, npages = range_len(&chunk->pagemap.range) >> PAGE_SHIFT; + unsigned long *src_pfns, *dst_pfns; + dma_addr_t *dma_addrs; + struct nouveau_fence *fence; + + src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL); + dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL); + dma_addrs = kcalloc(npages, sizeof(*dma_addrs), GFP_KERNEL); + + migrate_device_range(src_pfns, chunk->pagemap.range.start >> PAGE_SHIFT, + npages); + + for (i = 0; i < npages; i++) { + if (src_pfns[i] & MIGRATE_PFN_MIGRATE) { + struct page *dpage; + + /* + * __GFP_NOFAIL because the GPU is going away and there + * is nothing sensible we can do if we can't copy the + * data back. + */ + dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL); + dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)); + nouveau_dmem_copy_one(chunk->drm, + migrate_pfn_to_page(src_pfns[i]), dpage, + &dma_addrs[i]); + } + } + + nouveau_fence_new(chunk->drm->dmem->migrate.chan, false, &fence); + migrate_device_pages(src_pfns, dst_pfns, npages); + nouveau_dmem_fence_done(&fence); + migrate_device_finalize(src_pfns, dst_pfns, npages); + kfree(src_pfns); + kfree(dst_pfns); + for (i = 0; i < npages; i++) + dma_unmap_page(chunk->drm->dev->dev, dma_addrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL); + kfree(dma_addrs); +} + void nouveau_dmem_fini(struct nouveau_drm *drm) { @@ -378,8 +424,10 @@ nouveau_dmem_fini(struct nouveau_drm *drm) mutex_lock(&drm->dmem->mutex); list_for_each_entry_safe(chunk, tmp, &drm->dmem->chunks, list) { + nouveau_dmem_evict_chunk(chunk); nouveau_bo_unpin(chunk->bo); nouveau_bo_ref(NULL, &chunk->bo); + WARN_ON(chunk->callocated); list_del(&chunk->list); memunmap_pages(&chunk->pagemap);
release_mem_region(chunk->pagemap.range.start, -- GitLab From ad4c365221b0f92f9c24a203119f2bade30c970e Mon Sep 17 00:00:00 2001 From: Alistair Popple <apopple@nvidia.com> Date: Wed, 28 Sep 2022 22:01:22 +1000 Subject: [PATCH 1889/2223] hmm-tests: add test for migrate_device_range() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Link: https://lkml.kernel.org/r/a73cf109de0224cfd118d22be58ddebac3ae2897.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple <apopple@nvidia.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: Ralph Campbell <rcampbell@nvidia.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Alex Sierra <alex.sierra@amd.com> Cc: Felix Kuehling <Felix.Kuehling@amd.com> Cc: Alex Deucher <alexander.deucher@amd.com> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Christian König <christian.koenig@amd.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: David Hildenbrand <david@redhat.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Lyude Paul <lyude@redhat.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Yang Shi <shy828301@gmail.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- lib/test_hmm.c | 120 ++++++++++++++++++++----- lib/test_hmm_uapi.h | 1 + tools/testing/selftests/vm/hmm-tests.c | 49 ++++++++++ 3 files changed, 149 insertions(+), 21 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index bc2b949911653..67e6f83fe0f82 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -100,6 +100,7 @@ struct dmirror { struct dmirror_chunk { struct dev_pagemap pagemap; struct dmirror_device *mdevice; + bool remove; }; /* @@ -192,11 +193,15 @@ static int dmirror_fops_release(struct inode *inode, struct file *filp) return 0; } +static struct dmirror_chunk *dmirror_page_to_chunk(struct page *page) +{ + return container_of(page->pgmap, struct dmirror_chunk, pagemap); +} + static struct dmirror_device *dmirror_page_to_device(struct 
page *page) { - return container_of(page->pgmap, struct dmirror_chunk, - pagemap)->mdevice; + return dmirror_page_to_chunk(page)->mdevice; } static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range) @@ -1218,6 +1223,85 @@ static int dmirror_snapshot(struct dmirror *dmirror, return ret; } +static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk) +{ + unsigned long start_pfn = chunk->pagemap.range.start >> PAGE_SHIFT; + unsigned long end_pfn = chunk->pagemap.range.end >> PAGE_SHIFT; + unsigned long npages = end_pfn - start_pfn + 1; + unsigned long i; + unsigned long *src_pfns; + unsigned long *dst_pfns; + + src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL); + dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL); + + migrate_device_range(src_pfns, start_pfn, npages); + for (i = 0; i < npages; i++) { + struct page *dpage, *spage; + + spage = migrate_pfn_to_page(src_pfns[i]); + if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) + continue; + + if (WARN_ON(!is_device_private_page(spage) && + !is_device_coherent_page(spage))) + continue; + spage = BACKING_PAGE(spage); + dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL); + lock_page(dpage); + copy_highpage(dpage, spage); + dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)); + if (src_pfns[i] & MIGRATE_PFN_WRITE) + dst_pfns[i] |= MIGRATE_PFN_WRITE; + } + migrate_device_pages(src_pfns, dst_pfns, npages); + migrate_device_finalize(src_pfns, dst_pfns, npages); + kfree(src_pfns); + kfree(dst_pfns); +} + +/* Removes free pages from the free list so they can't be re-allocated */ +static void dmirror_remove_free_pages(struct dmirror_chunk *devmem) +{ + struct dmirror_device *mdevice = devmem->mdevice; + struct page *page; + + for (page = mdevice->free_pages; page; page = page->zone_device_data) + if (dmirror_page_to_chunk(page) == devmem) + mdevice->free_pages = page->zone_device_data; +} + +static void dmirror_device_remove_chunks(struct dmirror_device *mdevice) +{ + unsigned int i; + 
+ mutex_lock(&mdevice->devmem_lock); + if (mdevice->devmem_chunks) { + for (i = 0; i < mdevice->devmem_count; i++) { + struct dmirror_chunk *devmem = + mdevice->devmem_chunks[i]; + + spin_lock(&mdevice->lock); + devmem->remove = true; + dmirror_remove_free_pages(devmem); + spin_unlock(&mdevice->lock); + + dmirror_device_evict_chunk(devmem); + memunmap_pages(&devmem->pagemap); + if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE) + release_mem_region(devmem->pagemap.range.start, + range_len(&devmem->pagemap.range)); + kfree(devmem); + } + mdevice->devmem_count = 0; + mdevice->devmem_capacity = 0; + mdevice->free_pages = NULL; + kfree(mdevice->devmem_chunks); + mdevice->devmem_chunks = NULL; + } + mutex_unlock(&mdevice->devmem_lock); +} + static long dmirror_fops_unlocked_ioctl(struct file *filp, unsigned int command, unsigned long arg) @@ -1272,6 +1356,11 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp, ret = dmirror_snapshot(dmirror, &cmd); break; + case HMM_DMIRROR_RELEASE: + dmirror_device_remove_chunks(dmirror->mdevice); + ret = 0; + break; + default: return -EINVAL; } @@ -1326,9 +1415,13 @@ static void dmirror_devmem_free(struct page *page) mdevice = dmirror_page_to_device(page); spin_lock(&mdevice->lock); - mdevice->cfree++; - page->zone_device_data = mdevice->free_pages; - mdevice->free_pages = page; + + /* Return page to our allocator if not freeing the chunk */ + if (!dmirror_page_to_chunk(page)->remove) { + mdevice->cfree++; + page->zone_device_data = mdevice->free_pages; + mdevice->free_pages = page; + } spin_unlock(&mdevice->lock); } @@ -1408,22 +1501,7 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id) static void dmirror_device_remove(struct dmirror_device *mdevice) { - unsigned int i; - - if (mdevice->devmem_chunks) { - for (i = 0; i < mdevice->devmem_count; i++) { - struct dmirror_chunk *devmem = - mdevice->devmem_chunks[i]; - - memunmap_pages(&devmem->pagemap); - if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE) - 
release_mem_region(devmem->pagemap.range.start, - range_len(&devmem->pagemap.range)); - kfree(devmem); - } - kfree(mdevice->devmem_chunks); - } - + dmirror_device_remove_chunks(mdevice); cdev_device_del(&mdevice->cdevice, &mdevice->device); } diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h index e31d58c9034a7..8c818a2cf4f69 100644 --- a/lib/test_hmm_uapi.h +++ b/lib/test_hmm_uapi.h @@ -36,6 +36,7 @@ struct hmm_dmirror_cmd { #define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x04, struct hmm_dmirror_cmd) #define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd) #define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x06, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_RELEASE _IOWR('H', 0x07, struct hmm_dmirror_cmd) /* * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT. diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index f2c2c970eeb27..28232adec883b 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -1054,6 +1054,55 @@ TEST_F(hmm, migrate_fault) hmm_buffer_free(buffer); } +TEST_F(hmm, migrate_release) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. 
*/ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Release device memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages); + ASSERT_EQ(ret, 0); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + /* * Migrate anonymous shared memory to device private memory. */ -- GitLab From d6e5040bd8e53371fafd7e0c7c63b090b3a675db Mon Sep 17 00:00:00 2001 From: Andrey Konovalov <andreyknvl@google.com> Date: Mon, 26 Sep 2022 20:08:47 +0200 Subject: [PATCH 1890/2223] kasan: fix array-bounds warnings in tests GCC's -Warray-bounds option detects out-of-bounds accesses to statically-sized allocations in krealloc out-of-bounds tests. Use OPTIMIZER_HIDE_VAR to suppress the warning. Also change kmalloc_memmove_invalid_size to use OPTIMIZER_HIDE_VAR instead of a volatile variable. 
Link: https://lkml.kernel.org/r/e94399242d32e00bba6fd0d9ec4c897f188128e8.1664215688.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov <andreyknvl@google.com> Reported-by: kernel test robot <lkp@intel.com> Reviewed-by: Kees Cook <keescook@chromium.org> Cc: Alexander Potapenko <glider@google.com> Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: Marco Elver <elver@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/kasan/kasan_test.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index f25692def7813..57e4c72aa8bd2 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -295,6 +295,9 @@ static void krealloc_more_oob_helper(struct kunit *test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + /* Suppress -Warray-bounds warnings. */ + OPTIMIZER_HIDE_VAR(ptr2); + /* All offsets up to size2 must be accessible. */ ptr2[size1 - 1] = 'x'; ptr2[size1] = 'x'; @@ -327,6 +330,9 @@ static void krealloc_less_oob_helper(struct kunit *test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + /* Suppress -Warray-bounds warnings. */ + OPTIMIZER_HIDE_VAR(ptr2); + /* Must be accessible for all modes. 
*/ ptr2[size2 - 1] = 'x'; @@ -540,13 +546,14 @@ static void kmalloc_memmove_invalid_size(struct kunit *test) { char *ptr; size_t size = 64; - volatile size_t invalid_size = size; + size_t invalid_size = size; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset((char *)ptr, 0, 64); OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(invalid_size); KUNIT_EXPECT_KASAN_FAIL(test, memmove((char *)ptr, (char *)ptr + 4, invalid_size)); kfree(ptr); -- GitLab From bce8cb3c04dc01d21b6b17baf1cb6c277e7e6848 Mon Sep 17 00:00:00 2001 From: Qi Zheng <zhengqi.arch@bytedance.com> Date: Thu, 29 Sep 2022 19:23:17 +0800 Subject: [PATCH 1891/2223] mm: use update_mmu_tlb() on the second thread As message in commit 7df676974359 ("mm/memory.c: Update local TLB if PTE entry exists") said, we should update local TLB only on the second thread. So in the do_anonymous_page() here, we should use update_mmu_tlb() instead of update_mmu_cache() on the second thread. As David pointed out, this is a performance improvement, not a correctness fix. 
Link: https://lkml.kernel.org/r/20220929112318.32393-2-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> Reviewed-by: Muchun Song <songmuchun@bytedance.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Bibo Mao <maobibo@loongson.cn> Cc: Chris Zankel <chris@zankel.net> Cc: Huacai Chen <chenhuacai@loongson.cn> Cc: Max Filippov <jcmvbkbc@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 4ad6077164cd2..f88c351aecd41 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4134,7 +4134,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!pte_none(*vmf->pte)) { - update_mmu_cache(vma, vmf->address, vmf->pte); + update_mmu_tlb(vma, vmf->address, vmf->pte); goto release; } -- GitLab From 14c2ac36811b82479b1138383b2c9ff1ab6ba47d Mon Sep 17 00:00:00 2001 From: Qi Zheng <zhengqi.arch@bytedance.com> Date: Thu, 29 Sep 2022 19:23:18 +0800 Subject: [PATCH 1892/2223] LoongArch: update local TLB if PTE entry exists Currently, the implementation of update_mmu_tlb() is empty if __HAVE_ARCH_UPDATE_MMU_TLB is not defined. Then if two threads concurrently fault at the same page, the second thread that did not win the race will give up and do nothing. In the LoongArch architecture, this second thread will trigger another fault, and only updates its local TLB. Instead of triggering another fault, it's better to implement update_mmu_tlb() to directly update the local TLB of the second thread. Just do it. 
Link: https://lkml.kernel.org/r/20220929112318.32393-3-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> Suggested-by: Bibo Mao <maobibo@loongson.cn> Acked-by: Huacai Chen <chenhuacai@loongson.cn> Cc: Chris Zankel <chris@zankel.net> Cc: David Hildenbrand <david@redhat.com> Cc: Max Filippov <jcmvbkbc@gmail.com> Cc: Muchun Song <songmuchun@bytedance.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- arch/loongarch/include/asm/pgtable.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 8ea57e2f0e04c..946704bee599e 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -412,6 +412,9 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, __update_tlb(vma, address, ptep); } +#define __HAVE_ARCH_UPDATE_MMU_TLB +#define update_mmu_tlb update_mmu_cache + static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { -- GitLab From 94541bc3fbde45bbd40e7989995246b22732679a Mon Sep 17 00:00:00 2001 From: Brian Geffon <bgeffon@google.com> Date: Mon, 3 Oct 2022 10:48:32 -0400 Subject: [PATCH 1893/2223] zram: always expose rw_page Currently zram will adjust its fops to a version which does not contain rw_page when a backing device has been assigned. This is done to prevent upper layers from assuming a synchronous operation when a page may have been written back. This forces every operation through bio which has overhead associated with bio_alloc/frees. The code can be simplified to always expose an rw_page method and only in the rare event that a page is written back we instead will return -EOPNOTSUPP forcing the upper layer to fallback to bio. 
Link: https://lkml.kernel.org/r/20221003144832.2906610-1-bgeffon@google.com Signed-off-by: Brian Geffon <bgeffon@google.com> Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org> Cc: Minchan Kim <minchan@kernel.org> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Rom Lemarchand <romlem@google.com> Cc: Suleiman Souhlal <suleiman@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- drivers/block/zram/zram_drv.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 43eeef2b9fbe2..2ba5c98319e52 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -52,9 +52,6 @@ static unsigned int num_devices = 1; static size_t huge_class_size; static const struct block_device_operations zram_devops; -#ifdef CONFIG_ZRAM_WRITEBACK -static const struct block_device_operations zram_wb_devops; -#endif static void zram_free_page(struct zram *zram, size_t index); static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, @@ -546,17 +543,6 @@ static ssize_t backing_dev_store(struct device *dev, zram->backing_dev = backing_dev; zram->bitmap = bitmap; zram->nr_pages = nr_pages; - /* - * With writeback feature, zram does asynchronous IO so it's no longer - * synchronous device so let's remove synchronous io flag. Othewise, - * upper layer(e.g., swap) could wait IO completion rather than - * (submit and return), which will cause system sluggish. - * Furthermore, when the IO function returns(e.g., swap_readpage), - * upper layer expects IO was done so it could deallocate the page - * freely but in fact, IO is going on so finally could cause - * use-after-free when the IO is really done. 
- */ - zram->disk->fops = &zram_wb_devops; up_write(&zram->init_lock); pr_info("setup backing device %s\n", file_name); @@ -1270,6 +1256,9 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, struct bio_vec bvec; zram_slot_unlock(zram, index); + /* A null bio means rw_page was used, we must fallback to bio */ + if (!bio) + return -EOPNOTSUPP; bvec.bv_page = page; bvec.bv_len = PAGE_SIZE; @@ -1856,15 +1845,6 @@ static const struct block_device_operations zram_devops = { .owner = THIS_MODULE }; -#ifdef CONFIG_ZRAM_WRITEBACK -static const struct block_device_operations zram_wb_devops = { - .open = zram_open, - .submit_bio = zram_submit_bio, - .swap_slot_free_notify = zram_slot_free_notify, - .owner = THIS_MODULE -}; -#endif - static DEVICE_ATTR_WO(compact); static DEVICE_ATTR_RW(disksize); static DEVICE_ATTR_RO(initstate); -- GitLab From 2ea7ff1e39cbe3753d3c649beb70f2cf861dca75 Mon Sep 17 00:00:00 2001 From: Peter Xu <peterx@redhat.com> Date: Tue, 4 Oct 2022 15:33:58 -0400 Subject: [PATCH 1894/2223] mm/hugetlb: fix race condition of uffd missing/minor handling Patch series "mm/hugetlb: Fix selftest failures with write check", v3. Currently akpm mm-unstable fails with uffd hugetlb private mapping test randomly on a write check. The initial bisection of that points to the recent pmd unshare series, but it turns out there's no direct relationship with the series but only some timing change caused the race to start triggering. The race should be fixed in patch 1. Patch 2 is a trivial cleanup on the similar race with hugetlb migrations, patch 3 comment on the write check so when anyone read it again it'll be clear why it's there. This patch (of 3): After the recent rework patchset of hugetlb locking on pmd sharing, kselftest for userfaultfd sometimes fails on hugetlb private tests with unexpected write fault checks. 
It turns out there's nothing wrong within the locking series regarding this matter, but it could have changed the timing of threads so it can trigger an old bug. The real bug is when we call hugetlb_no_page() we're not with the pgtable lock. It means we're reading the pte values lockless. It's perfectly fine in most cases because before we do normal page allocations we'll take the lock and check pte_same() again. However before that, there are actually two paths on userfaultfd missing/minor handling that may directly move on with the fault process without checking the pte values. It means for these two paths we may be generating an uffd message based on an unstable pte, while an unstable pte can legally be anything as long as the modifier holds the pgtable lock. One example, which is also what happened in the failing kselftest and caused the test failure, is that for private mappings wr-protection changes can happen on one page. While hugetlb_change_protection() generally requires pte being cleared before being changed, then there can be a race condition like: thread 1 thread 2 -------- -------- UFFDIO_WRITEPROTECT hugetlb_fault hugetlb_change_protection pgtable_lock() huge_ptep_modify_prot_start pte==NULL hugetlb_no_page generate uffd missing event even if page existed!! huge_ptep_modify_prot_commit pgtable_unlock() Fix this by rechecking the pte after pgtable lock for both userfaultfd missing & minor fault paths. This bug should have been around starting from uffd hugetlb introduced, so attaching a Fixes to the commit. Also attach another Fixes to the minor support commit for easier tracking. Note that userfaultfd is actually fine with false positives (e.g. caused by pte changed), but not wrong logical events (e.g. caused by reading a pte during changing). The latter can confuse the userspace, so the strictness is very much preferred. E.g., MISSING event should never happen on the page after UFFDIO_COPY has correctly installed the page and returned. 
Link: https://lkml.kernel.org/r/20221004193400.110155-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20221004193400.110155-2-peterx@redhat.com Fixes: 1a1aad8a9b7b ("userfaultfd: hugetlbfs: add userfaultfd hugetlb hook") Fixes: 7677f7fd8be7 ("userfaultfd: add minor fault registration mode") Signed-off-by: Peter Xu <peterx@redhat.com> Co-developed-by: Mike Kravetz <mike.kravetz@oracle.com> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: Nadav Amit <nadav.amit@gmail.com> Cc: David Hildenbrand <david@redhat.com> Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/hugetlb.c | 59 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9a910612336da..bf9d8d04bf4f8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5535,6 +5535,23 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, return handle_userfault(&vmf, reason); } +/* + * Recheck pte with pgtable lock. Returns true if pte didn't change, or + * false if pte changed or is changing. 
+ */ +static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, + pte_t *ptep, pte_t old_pte) +{ + spinlock_t *ptl; + bool same; + + ptl = huge_pte_lock(h, mm, ptep); + same = pte_same(huge_ptep_get(ptep), old_pte); + spin_unlock(ptl); + + return same; +} + static vm_fault_t hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, @@ -5575,10 +5592,33 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (idx >= size) goto out; /* Check for page in userfault range */ - if (userfaultfd_missing(vma)) - return hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, address, - VM_UFFD_MISSING); + if (userfaultfd_missing(vma)) { + /* + * Since hugetlb_no_page() was examining pte + * without pgtable lock, we need to re-test under + * lock because the pte may not be stable and could + * have changed from under us. Try to detect + * either changed or during-changing ptes and retry + * properly when needed. + * + * Note that userfaultfd is actually fine with + * false positives (e.g. caused by pte changed), + * but not wrong logical events (e.g. caused by + * reading a pte during changing). The latter can + * confuse the userspace, so the strictness is very + * much preferred. E.g., MISSING event should + * never happen on the page after UFFDIO_COPY has + * correctly installed the page and returned. 
+ */ + if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + ret = 0; + goto out; + } + + return hugetlb_handle_userfault(vma, mapping, idx, flags, + haddr, address, + VM_UFFD_MISSING); + } page = alloc_huge_page(vma, haddr, 0); if (IS_ERR(page)) { @@ -5644,9 +5684,14 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (userfaultfd_minor(vma)) { unlock_page(page); put_page(page); - return hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, address, - VM_UFFD_MINOR); + /* See comment in userfaultfd_missing() block above */ + if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + ret = 0; + goto out; + } + return hugetlb_handle_userfault(vma, mapping, idx, flags, + haddr, address, + VM_UFFD_MINOR); } } -- GitLab From f9bf6c03eca1077cae8de0e6d86427656fa42a9b Mon Sep 17 00:00:00 2001 From: Peter Xu <peterx@redhat.com> Date: Tue, 4 Oct 2022 15:33:59 -0400 Subject: [PATCH 1895/2223] mm/hugetlb: use hugetlb_pte_stable in migration race check After hugetlb_pte_stable() introduced, we can also rewrite the migration race condition against page allocation to use the new helper too. Link: https://lkml.kernel.org/r/20221004193400.110155-3-peterx@redhat.com Signed-off-by: Peter Xu <peterx@redhat.com> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> Reviewed-by: David Hildenbrand <david@redhat.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> Cc: Nadav Amit <nadav.amit@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/hugetlb.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bf9d8d04bf4f8..9b26055f31197 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5634,11 +5634,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * here. Before returning error, get ptl and make * sure there really is no pte entry. 
*/ - ptl = huge_pte_lock(h, mm, ptep); - ret = 0; - if (huge_pte_none(huge_ptep_get(ptep))) + if (hugetlb_pte_stable(h, mm, ptep, old_pte)) ret = vmf_error(PTR_ERR(page)); - spin_unlock(ptl); + else + ret = 0; goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); -- GitLab From 26c92d37d3dc484157bdb4eb7d29991c017b168b Mon Sep 17 00:00:00 2001 From: Peter Xu <peterx@redhat.com> Date: Tue, 4 Oct 2022 15:34:00 -0400 Subject: [PATCH 1896/2223] mm/selftest: uffd: explain the write missing fault check It's not obvious why we had a write check for each of the missing messages, especially when it should be a locking op. Add a rich comment for that, and also try to explain its good side and limitations, so that if someone hit it again for either a bug or a different glibc impl there'll be some clue to start with. Link: https://lkml.kernel.org/r/20221004193400.110155-4-peterx@redhat.com Signed-off-by: Peter Xu <peterx@redhat.com> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> Reviewed-by: David Hildenbrand <david@redhat.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> Cc: Nadav Amit <nadav.amit@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- tools/testing/selftests/vm/userfaultfd.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 74babdbc02e56..297f250c1d956 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -774,7 +774,27 @@ static void uffd_handle_page_fault(struct uffd_msg *msg, continue_range(uffd, msg->arg.pagefault.address, page_size); stats->minor_faults++; } else { - /* Missing page faults */ + /* + * Missing page faults. + * + * Here we force a write check for each of the missing mode + * faults. 
It's guaranteed because the only threads that + * will trigger uffd faults are the locking threads, and + * their first instruction to touch the missing page will + * always be pthread_mutex_lock(). + * + * Note that here we relied on an NPTL glibc impl detail to + * always read the lock type at the entry of the lock op + * (pthread_mutex_t.__data.__type, offset 0x10) before + * doing any locking operations to guarantee that. It's + * actually not good to rely on this impl detail because + * logically a pthread-compatible lib can implement the + * locks without types and we can fail when linking with + * them. However since we used to find bugs with this + * strict check we still keep it around. Hopefully this + * could be a good hint when it fails again. If one day + * it'll break on some other impl of glibc we'll revisit. + */ if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) err("unexpected write fault"); -- GitLab From 15cd90049d595e592d8860ee15a3f23491d54d17 Mon Sep 17 00:00:00 2001 From: Yafang Shao <laoar.shao@gmail.com> Date: Thu, 6 Oct 2022 10:15:40 +0000 Subject: [PATCH 1897/2223] mm/page_alloc: fix incorrect PGFREE and PGALLOC for high-order page PGFREE and PGALLOC represent the number of freed and allocated pages. So the page order must be considered. 
Link: https://lkml.kernel.org/r/20221006101540.40686-1-laoar.shao@gmail.com Fixes: 44042b449872 ("mm/page_alloc: allow high-order pages to be stored on the per-cpu lists") Signed-off-by: Yafang Shao <laoar.shao@gmail.com> Acked-by: Mel Gorman <mgorman@techsingularity.net> Reviewed-by: Miaohe Lin <linmiaohe@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 059f6946832fa..8e9b7f08a32ce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3446,7 +3446,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, int pindex; bool free_high; - __count_vm_event(PGFREE); + __count_vm_events(PGFREE, 1 << order); pindex = order_to_pindex(migratetype, order); list_add(&page->pcp_list, &pcp->lists[pindex]); pcp->count += 1 << order; @@ -3803,7 +3803,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, pcp_spin_unlock_irqrestore(pcp, flags); pcp_trylock_finish(UP_flags); if (page) { - __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); } return page; -- GitLab From ef6e06b2ef87077104d1145a0fd452ff8dbbc4b7 Mon Sep 17 00:00:00 2001 From: Ira Weiny <ira.weiny@intel.com> Date: Wed, 5 Oct 2022 21:05:55 -0700 Subject: [PATCH 1898/2223] highmem: fix kmap_to_page() for kmap_local_page() addresses kmap_to_page() is used to get the page for a virtual address which may be kmap'ed. Unfortunately, kmap_local_page() stores mappings in a thread local array separate from kmap(). These mappings were not checked by the call. Check the kmap_local_page() mappings and return the page if found. Because it is intended to remove kmap_to_page() add a warn on once to the kmap checks to flag potential issues early. 
NOTE Due to 32bit x86 use of kmap local in iomap atomic, KMAP_LOCAL does not require HIGHMEM to be set. Therefore the support calls required a new KMAP_LOCAL section to fix 0day build errors. [akpm@linux-foundation.org: fix warning] Link: https://lkml.kernel.org/r/20221006040555.1502679-1-ira.weiny@intel.com Signed-off-by: Ira Weiny <ira.weiny@intel.com> Reported-by: Al Viro <viro@zeniv.linux.org.uk> Reported-by: kernel test robot <lkp@intel.com> Cc: "Fabio M. De Francesco" <fmdefrancesco@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Christoph Hellwig <hch@lst.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/highmem.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/mm/highmem.c b/mm/highmem.c index c707d7202d5f7..db251e77f98f8 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -30,6 +30,17 @@ #include <asm/tlbflush.h> #include <linux/vmalloc.h> +#ifdef CONFIG_KMAP_LOCAL +static inline int kmap_local_calc_idx(int idx) +{ + return idx + KM_MAX_IDX * smp_processor_id(); +} + +#ifndef arch_kmap_local_map_idx +#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx) +#endif +#endif /* CONFIG_KMAP_LOCAL */ + /* * Virtual_count is not a pure "count". 
* 0 means that it is not mapped, and has not been mapped @@ -142,12 +153,29 @@ pte_t *pkmap_page_table; struct page *__kmap_to_page(void *vaddr) { + unsigned long base = (unsigned long) vaddr & PAGE_MASK; + struct kmap_ctrl *kctrl = &current->kmap_ctrl; unsigned long addr = (unsigned long)vaddr; + int i; + + /* kmap() mappings */ + if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) && + addr < PKMAP_ADDR(LAST_PKMAP))) + return pte_page(pkmap_page_table[PKMAP_NR(addr)]); - if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { - int i = PKMAP_NR(addr); + /* kmap_local_page() mappings */ + if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) && + base < __fix_to_virt(FIX_KMAP_BEGIN))) { + for (i = 0; i < kctrl->idx; i++) { + unsigned long base_addr; + int idx; - return pte_page(pkmap_page_table[i]); + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + base_addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + + if (base_addr == base) + return pte_page(kctrl->pteval[i]); + } } return virt_to_page(vaddr); @@ -462,10 +490,6 @@ static inline void kmap_local_idx_pop(void) # define arch_kmap_local_post_unmap(vaddr) do { } while (0) #endif -#ifndef arch_kmap_local_map_idx -#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx) -#endif - #ifndef arch_kmap_local_unmap_idx #define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx) #endif @@ -494,11 +518,6 @@ static inline bool kmap_high_unmap_local(unsigned long vaddr) return false; } -static inline int kmap_local_calc_idx(int idx) -{ - return idx + KM_MAX_IDX * smp_processor_id(); -} - static pte_t *__kmap_pte; static pte_t *kmap_get_pte(unsigned long vaddr, int idx) -- GitLab From f2913d006fcdb61719635e093d1b5dd0dafecac7 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt <palmer@rivosinc.com> Date: Tue, 20 Sep 2022 13:00:37 -0700 Subject: [PATCH 1899/2223] RISC-V: Avoid dereferening NULL regs in die() I don't think we can actually die() without a regs pointer, but the compiler was warning about a NULL check after a 
dereference. It seems prudent to just avoid the possibly-NULL dereference, given that when die()ing the system is already toast so who knows how we got there. Reported-by: kernel test robot <lkp@intel.com> Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/20220920200037.6727-1-palmer@rivosinc.com Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- arch/riscv/kernel/traps.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 635e6ec269380..f3e96d60a2ff3 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -33,6 +33,7 @@ void die(struct pt_regs *regs, const char *str) { static int die_counter; int ret; + long cause; oops_enter(); @@ -42,11 +43,13 @@ void die(struct pt_regs *regs, const char *str) pr_emerg("%s [#%d]\n", str, ++die_counter); print_modules(); - show_regs(regs); + if (regs) + show_regs(regs); - ret = notify_die(DIE_OOPS, str, regs, 0, regs->cause, SIGSEGV); + cause = regs ? regs->cause : -1; + ret = notify_die(DIE_OOPS, str, regs, 0, cause, SIGSEGV); - if (regs && kexec_should_crash(current)) + if (kexec_should_crash(current)) crash_kexec(regs); bust_spinlocks(0); -- GitLab From 05c2224d4b049406b0545a10be05280ff4b8ba0a Mon Sep 17 00:00:00 2001 From: Gavin Shan <gshan@redhat.com> Date: Thu, 13 Oct 2022 14:30:20 +0800 Subject: [PATCH 1900/2223] KVM: selftests: Fix number of pages for memory slot in memslot_modification_stress_test It's required by vm_userspace_mem_region_add() that memory size should be aligned to host page size. However, one guest page is provided by memslot_modification_stress_test. It triggers failure in the scenario of 64KB-page-size-host and 4KB-page-size-guest, as the following messages indicate. 
# ./memslot_modification_stress_test Testing guest mode: PA-bits:40, VA-bits:48, 4K pages guest physical test memory: [0xffbfff0000, 0xffffff0000) Finished creating vCPUs Started all vCPUs ==== Test Assertion Failure ==== lib/kvm_util.c:824: vm_adjust_num_guest_pages(vm->mode, npages) == npages pid=5712 tid=5712 errno=0 - Success 1 0x0000000000404eeb: vm_userspace_mem_region_add at kvm_util.c:822 2 0x0000000000401a5b: add_remove_memslot at memslot_modification_stress_test.c:82 3 (inlined by) run_test at memslot_modification_stress_test.c:110 4 0x0000000000402417: for_each_guest_mode at guest_modes.c:100 5 0x00000000004016a7: main at memslot_modification_stress_test.c:187 6 0x0000ffffb8cd4383: ?? ??:0 7 0x0000000000401827: _start at :? Number of guest pages is not compatible with the host. Try npages=16 Fix the issue by providing 16 guest pages to the memory slot for this particular combination of 64KB-page-size-host and 4KB-page-size-guest on aarch64. Fixes: ef4c9f4f65462 ("KVM: selftests: Fix 32-bit truncation of vm_get_max_gfn()") Signed-off-by: Gavin Shan <gshan@redhat.com> Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20221013063020.201856-1-gshan@redhat.com --- tools/testing/selftests/kvm/memslot_modification_stress_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 6ee7e1dde4043..bb1d17a1171bc 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -67,7 +67,7 @@ struct memslot_antagonist_args { static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay, uint64_t nr_modifications) { - const uint64_t pages = 1; + uint64_t pages = max_t(int, vm->page_size, getpagesize()) / vm->page_size; uint64_t gpa; int i; -- GitLab From a4cb3651a174366cc85a677da9e3681fbe97fdae Mon Sep 17 
00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Thu, 13 Oct 2022 16:44:18 +1000 Subject: [PATCH 1901/2223] powerpc/64s/interrupt: Fix lost interrupts when returning to soft-masked context It's possible for an interrupt returning to an irqs-disabled context to lose a pending soft-masked irq because it branches to part of the exit code for irqs-enabled contexts, which is meant to clear only the PACA_IRQS_HARD_DIS flag from PACAIRQHAPPENED by zeroing the byte. This just looks like a simple thinko from a recent commit (if there was no hard mask pending, there would be no reason to clear it anyway). This also adds comment to the code that actually does need to clear the flag. Fixes: e485f6c751e0a ("powerpc/64/interrupt: Fix return to masked context after hard-mask irq becomes pending") Reported-by: Sachin Sant <sachinp@linux.ibm.com> Reported-by: Guenter Roeck <linux@roeck-us.net> Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221013064418.1311104-1-npiggin@gmail.com --- arch/powerpc/kernel/interrupt_64.S | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index 904a5608cbe30..978a173eb3396 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -538,7 +538,7 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_kernel) beq .Lfast_kernel_interrupt_return_\srr\() // EE already disabled lbz r11,PACAIRQHAPPENED(r13) andi. 
r10,r11,PACA_IRQ_MUST_HARD_MASK - beq 1f // No HARD_MASK pending + beq .Lfast_kernel_interrupt_return_\srr\() // No HARD_MASK pending /* Must clear MSR_EE from _MSR */ #ifdef CONFIG_PPC_BOOK3S @@ -555,12 +555,23 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_kernel) b .Lfast_kernel_interrupt_return_\srr\() .Linterrupt_return_\srr\()_soft_enabled: + /* + * In the soft-enabled case, need to double-check that we have no + * pending interrupts that might have come in before we reached the + * restart section of code, and restart the exit so those can be + * handled. + * + * If there are none, it is possible that the interrupt still + * has PACA_IRQ_HARD_DIS set, which needs to be cleared for the + * interrupted context. This clear will not clobber a new pending + * interrupt coming in, because we're in the restart section, so + * such would return to the restart location. + */ #ifdef CONFIG_PPC_BOOK3S lbz r11,PACAIRQHAPPENED(r13) andi. r11,r11,(~PACA_IRQ_HARD_DIS)@l bne- interrupt_return_\srr\()_kernel_restart #endif -1: li r11,0 stb r11,PACAIRQHAPPENED(r13) // clear the possible HARD_DIS -- GitLab From 90d5ce82e143b42b2fdfb95401a89f86b71cedb7 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Thu, 13 Oct 2022 17:31:31 +1000 Subject: [PATCH 1902/2223] powerpc/pseries: Fix CONFIG_DTL=n build The recently moved dtl code must be compiled-in if CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y even if CONFIG_DTL=n. 
Fixes: 6ba5aa541aaa0 ("powerpc/pseries: Move dtl scanning and steal time accounting to pseries platform") Reported-by: Guenter Roeck <linux@roeck-us.net> Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221013073131.1485742-1-npiggin@gmail.com --- arch/powerpc/platforms/pseries/Makefile | 3 +- arch/powerpc/platforms/pseries/dtl.c | 151 +++++++++++++----------- 2 files changed, 80 insertions(+), 74 deletions(-) diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 14e143b946a36..92310202bdd76 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -7,7 +7,7 @@ obj-y := lpar.o hvCall.o nvram.o reconfig.o \ setup.o iommu.o event_sources.o ras.o \ firmware.o power.o dlpar.o mobility.o rng.o \ pci.o pci_dlpar.o eeh_pseries.o msi.o \ - papr_platform_attributes.o + papr_platform_attributes.o dtl.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_KEXEC_CORE) += kexec.o obj-$(CONFIG_PSERIES_ENERGY) += pseries_energy.o @@ -19,7 +19,6 @@ obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o obj-$(CONFIG_HVCS) += hvcserver.o obj-$(CONFIG_HCALL_STATS) += hvCall_inst.o obj-$(CONFIG_CMM) += cmm.o -obj-$(CONFIG_DTL) += dtl.o obj-$(CONFIG_IO_EVENT_IRQ) += io_event_irq.o obj-$(CONFIG_LPARCFG) += lparcfg.o obj-$(CONFIG_IBMVIO) += vio.o diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 1b1977bc78e73..3f1cdccebc9c1 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -18,6 +18,7 @@ #include <asm/plpar_wrappers.h> #include <asm/machdep.h> +#ifdef CONFIG_DTL struct dtl { struct dtl_entry *buf; int cpu; @@ -57,78 +58,6 @@ static DEFINE_PER_CPU(struct dtl_ring, dtl_rings); static atomic_t dtl_count; -/* - * Scan the dispatch trace log and count up the stolen time. - * Should be called with interrupts disabled. 
- */ -static notrace u64 scan_dispatch_log(u64 stop_tb) -{ - u64 i = local_paca->dtl_ridx; - struct dtl_entry *dtl = local_paca->dtl_curr; - struct dtl_entry *dtl_end = local_paca->dispatch_log_end; - struct lppaca *vpa = local_paca->lppaca_ptr; - u64 tb_delta; - u64 stolen = 0; - u64 dtb; - - if (!dtl) - return 0; - - if (i == be64_to_cpu(vpa->dtl_idx)) - return 0; - while (i < be64_to_cpu(vpa->dtl_idx)) { - dtb = be64_to_cpu(dtl->timebase); - tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) + - be32_to_cpu(dtl->ready_to_enqueue_time); - barrier(); - if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) { - /* buffer has overflowed */ - i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG; - dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG); - continue; - } - if (dtb > stop_tb) - break; - if (dtl_consumer) - dtl_consumer(dtl, i); - stolen += tb_delta; - ++i; - ++dtl; - if (dtl == dtl_end) - dtl = local_paca->dispatch_log; - } - local_paca->dtl_ridx = i; - local_paca->dtl_curr = dtl; - return stolen; -} - -/* - * Accumulate stolen time by scanning the dispatch trace log. - * Called on entry from user mode. - */ -void notrace pseries_accumulate_stolen_time(void) -{ - u64 sst, ust; - struct cpu_accounting_data *acct = &local_paca->accounting; - - sst = scan_dispatch_log(acct->starttime_user); - ust = scan_dispatch_log(acct->starttime); - acct->stime -= sst; - acct->utime -= ust; - acct->steal_time += ust + sst; -} - -u64 pseries_calculate_stolen_time(u64 stop_tb) -{ - if (!firmware_has_feature(FW_FEATURE_SPLPAR)) - return 0; - - if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx)) - return scan_dispatch_log(stop_tb); - - return 0; -} - /* * The cpu accounting code controls the DTL ring buffer, and we get * given entries as they are processed. 
@@ -436,3 +365,81 @@ static int dtl_init(void) return 0; } machine_arch_initcall(pseries, dtl_init); +#endif /* CONFIG_DTL */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +/* + * Scan the dispatch trace log and count up the stolen time. + * Should be called with interrupts disabled. + */ +static notrace u64 scan_dispatch_log(u64 stop_tb) +{ + u64 i = local_paca->dtl_ridx; + struct dtl_entry *dtl = local_paca->dtl_curr; + struct dtl_entry *dtl_end = local_paca->dispatch_log_end; + struct lppaca *vpa = local_paca->lppaca_ptr; + u64 tb_delta; + u64 stolen = 0; + u64 dtb; + + if (!dtl) + return 0; + + if (i == be64_to_cpu(vpa->dtl_idx)) + return 0; + while (i < be64_to_cpu(vpa->dtl_idx)) { + dtb = be64_to_cpu(dtl->timebase); + tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) + + be32_to_cpu(dtl->ready_to_enqueue_time); + barrier(); + if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) { + /* buffer has overflowed */ + i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG; + dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG); + continue; + } + if (dtb > stop_tb) + break; +#ifdef CONFIG_DTL + if (dtl_consumer) + dtl_consumer(dtl, i); +#endif + stolen += tb_delta; + ++i; + ++dtl; + if (dtl == dtl_end) + dtl = local_paca->dispatch_log; + } + local_paca->dtl_ridx = i; + local_paca->dtl_curr = dtl; + return stolen; +} + +/* + * Accumulate stolen time by scanning the dispatch trace log. + * Called on entry from user mode. 
+ */ +void notrace pseries_accumulate_stolen_time(void) +{ + u64 sst, ust; + struct cpu_accounting_data *acct = &local_paca->accounting; + + sst = scan_dispatch_log(acct->starttime_user); + ust = scan_dispatch_log(acct->starttime); + acct->stime -= sst; + acct->utime -= ust; + acct->steal_time += ust + sst; +} + +u64 pseries_calculate_stolen_time(u64 stop_tb) +{ + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) + return 0; + + if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx)) + return scan_dispatch_log(stop_tb); + + return 0; +} + +#endif -- GitLab From d1c0b7de4dfa5505cf7a1d6220aa72aace4435d0 Mon Sep 17 00:00:00 2001 From: Maxime Ripard <maxime@cerno.tech> Date: Fri, 2 Sep 2022 16:41:11 +0200 Subject: [PATCH 1903/2223] drm/vc4: Add module dependency on hdmi-codec The VC4 HDMI controller driver relies on the HDMI codec ASoC driver. In order to set it up properly, in vc4_hdmi_audio_init(), our HDMI driver will register a device matching the HDMI codec driver, and then register an ASoC card using that codec. However, if vc4 is compiled as a module, chances are that the hdmi-codec driver will be too. In such a case, the module loader will have a very narrow window to load the module between the device registration and the card registration. If it fails to load the module in time, the card registration will fail with EPROBE_DEFER, and we'll abort the audio initialisation, unregistering the HDMI codec device in the process. The next time the bind callback will be run, it's likely that we end up missing that window again, effectively preventing vc4 to probe entirely. In order to prevent this, we can create a soft dependency of the vc4 driver on the HDMI codec one so that we're sure the HDMI codec will be loaded before the VC4 module is, and thus we'll never end up in the previous situation. 
Fixes: 91e99e113929 ("drm/vc4: hdmi: Register HDMI codec") Reviewed-by: Javier Martinez Canillas <javierm@redhat.com> Signed-off-by: Maxime Ripard <maxime@cerno.tech> Link: https://patchwork.freedesktop.org/patch/msgid/20220902144111.3424560-1-maxime@cerno.tech --- drivers/gpu/drm/vc4/vc4_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c index 292d1b6a01b6f..3dc01af0f90f2 100644 --- a/drivers/gpu/drm/vc4/vc4_drv.c +++ b/drivers/gpu/drm/vc4/vc4_drv.c @@ -480,6 +480,7 @@ module_init(vc4_drm_register); module_exit(vc4_drm_unregister); MODULE_ALIAS("platform:vc4-drm"); +MODULE_SOFTDEP("pre: snd-soc-hdmi-codec"); MODULE_DESCRIPTION("Broadcom VC4 DRM Driver"); MODULE_AUTHOR("Eric Anholt <eric@anholt.net>"); MODULE_LICENSE("GPL v2"); -- GitLab From ae71ab585c819f83aec84f91eb01157a90552ef2 Mon Sep 17 00:00:00 2001 From: Maxime Ripard <maxime@cerno.tech> Date: Thu, 29 Sep 2022 11:21:17 +0200 Subject: [PATCH 1904/2223] drm/vc4: hdmi: Enforce the minimum rate at runtime_resume This is a revert of commit fd5894fa2413 ("drm/vc4: hdmi: Remove clock rate initialization"), with the code slightly moved around. It turns out that we can't downright remove that code from the driver, since the Pi0-3 and Pi4 are in different cases, and it only works for the Pi4. Indeed, the commit mentioned above was relying on the RaspberryPi firmware clocks driver to initialize the rate if it wasn't done by the firmware. However, the Pi0-3 are using the clk-bcm2835 clock driver that wasn't doing this initialization. We therefore end up with the clock not being assigned a rate, and the CPU stalling when trying to access a register. We can't move that initialization in the clk-bcm2835 driver, since the HSM clock we depend on is actually part of the HDMI power domain, so any rate setup is only valid when the power domain is enabled. Thus, we reinstated the minimum rate setup at runtime_resume, which should address both issues. 
Link: https://lore.kernel.org/dri-devel/20220922145448.w3xfywkn5ecak2et@pengutronix.de/ Fixes: fd5894fa2413 ("drm/vc4: hdmi: Remove clock rate initialization") Reported-by: Marc Kleine-Budde <mkl@pengutronix.de> Reviewed-by: Javier Martinez Canillas <javierm@redhat.com> Tested-by: Stefan Wahren <stefan.wahren@i2se.com> Signed-off-by: Maxime Ripard <maxime@cerno.tech> Link: https://patchwork.freedesktop.org/patch/msgid/20220929-rpi-pi3-unplugged-fixes-v1-1-cd22e962296c@cerno.tech --- drivers/gpu/drm/vc4/vc4_hdmi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c index 1e5f68704d7d8..780a19a75c3f5 100644 --- a/drivers/gpu/drm/vc4/vc4_hdmi.c +++ b/drivers/gpu/drm/vc4/vc4_hdmi.c @@ -2871,6 +2871,15 @@ static int vc4_hdmi_runtime_resume(struct device *dev) u32 __maybe_unused value; int ret; + /* + * The HSM clock is in the HDMI power domain, so we need to set + * its frequency while the power domain is active so that it + * keeps its rate. + */ + ret = clk_set_min_rate(vc4_hdmi->hsm_clock, HSM_MIN_CLOCK_FREQ); + if (ret) + return ret; + ret = clk_prepare_enable(vc4_hdmi->hsm_clock); if (ret) return ret; -- GitLab From 4190e8bbcbc77a9c36724681801cedc5229e7fc2 Mon Sep 17 00:00:00 2001 From: Maxime Ripard <maxime@cerno.tech> Date: Thu, 29 Sep 2022 11:21:18 +0200 Subject: [PATCH 1905/2223] drm/vc4: hdmi: Check the HSM rate at runtime_resume If our HSM clock has not been properly initialized, any register access will silently lock up the system. Let's check that this can't happen by adding a check for the rate before any register access, and error out otherwise. 
Link: https://lore.kernel.org/dri-devel/20220922145448.w3xfywkn5ecak2et@pengutronix.de/ Reviewed-by: Javier Martinez Canillas <javierm@redhat.com> Tested-by: Stefan Wahren <stefan.wahren@i2se.com> Signed-off-by: Maxime Ripard <maxime@cerno.tech> Link: https://patchwork.freedesktop.org/patch/msgid/20220929-rpi-pi3-unplugged-fixes-v1-2-cd22e962296c@cerno.tech --- drivers/gpu/drm/vc4/vc4_hdmi.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c index 780a19a75c3f5..874c6bd787c56 100644 --- a/drivers/gpu/drm/vc4/vc4_hdmi.c +++ b/drivers/gpu/drm/vc4/vc4_hdmi.c @@ -2869,6 +2869,7 @@ static int vc4_hdmi_runtime_resume(struct device *dev) struct vc4_hdmi *vc4_hdmi = dev_get_drvdata(dev); unsigned long __maybe_unused flags; u32 __maybe_unused value; + unsigned long rate; int ret; /* @@ -2884,6 +2885,21 @@ static int vc4_hdmi_runtime_resume(struct device *dev) if (ret) return ret; + /* + * Whenever the RaspberryPi boots without an HDMI monitor + * plugged in, the firmware won't have initialized the HSM clock + * rate and it will be reported as 0. + * + * If we try to access a register of the controller in such a + * case, it will lead to a silent CPU stall. Let's make sure we + * prevent such a case. + */ + rate = clk_get_rate(vc4_hdmi->hsm_clock); + if (!rate) { + ret = -EINVAL; + goto err_disable_clk; + } + if (vc4_hdmi->variant->reset) vc4_hdmi->variant->reset(vc4_hdmi); @@ -2905,6 +2921,10 @@ static int vc4_hdmi_runtime_resume(struct device *dev) #endif return 0; + +err_disable_clk: + clk_disable_unprepare(vc4_hdmi->hsm_clock); + return ret; } static int vc4_hdmi_bind(struct device *dev, struct device *master, void *data) -- GitLab From 2145ab513e3b3f910bd4a93abbdfa74fc65dfea4 Mon Sep 17 00:00:00 2001 From: "Michael S. 
Tsirkin" <mst@redhat.com> Date: Wed, 12 Oct 2022 17:58:28 -0400 Subject: [PATCH 1906/2223] virtio_pci: use irq to detect interrupt support commit 71491c54eafa ("virtio_pci: don't try to use intxif pin is zero") breaks virtio_pci on powerpc, when running as a qemu guest. vp_find_vqs() bails out because pci_dev->pin == 0. But pci_dev->irq is populated correctly, so vp_find_vqs_intx() would succeed if we called it - which is what the code used to do. This seems to happen because pci_dev->pin is not populated in pci_assign_irq(). A PCI core bug? Maybe. However Linus said: I really think that that is basically the only time you should use that 'pci_dev->pin' thing: it basically exists not for "does this device have an IRQ", but for "what is the routing of this irq on this device". and The correct way to check for "no irq" doesn't use NO_IRQ at all, it just does if (dev->irq) ... so let's just check irq and be done with it. Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Reported-by: Michael Ellerman <mpe@ellerman.id.au> Fixes: 71491c54eafa ("virtio_pci: don't try to use intxif pin is zero") Cc: "Angus Chen" <angus.chen@jaguarmicro.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Tested-by: Michael Ellerman <mpe@ellerman.id.au> Acked-by: Jason Wang <jasowang@redhat.com> Message-Id: <20221012220312.308522-1-mst@redhat.com> --- drivers/virtio/virtio_pci_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 4df77eeb4d16e..a6c86f916dbdf 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -409,8 +409,8 @@ int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, false, ctx, desc); if (!err) return 0; - /* Is there an interrupt pin? If not give up. */ - if (!(to_vp_device(vdev)->pci_dev->pin)) + /* Is there an interrupt? If not give up. 
*/ + if (!(to_vp_device(vdev)->pci_dev->irq)) return err; /* Finally fall back to regular interrupts. */ return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names, ctx); -- GitLab From be8ddea9e75e65b05837f6d51dc5774b866d0bcf Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" <mst@redhat.com> Date: Wed, 12 Oct 2022 00:49:23 -0400 Subject: [PATCH 1907/2223] vdpa/ifcvf: add reviewer Zhu Lingshan has been writing and reviewing ifcvf patches for a while now, add as reviewer. Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Zhu Lingshan <lingshan.zhu@intel.com> Acked-by: Jason Wang <jasowang@redhat.com> --- MAINTAINERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8d960ca4e9696..531bbb0a507a7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21736,6 +21736,10 @@ F: include/linux/virtio*.h F: include/uapi/linux/virtio_*.h F: tools/virtio/ +IFCVF VIRTIO DATA PATH ACCELERATOR +R: Zhu Lingshan <lingshan.zhu@intel.com> +F: drivers/vdpa/ifcvf/ + VIRTIO BALLOON M: "Michael S. 
Tsirkin" <mst@redhat.com> M: David Hildenbrand <david@redhat.com> -- GitLab From 2f6f19c7aaad5005dc75298a413eb0243c5d312d Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg <lsahlber@redhat.com> Date: Wed, 12 Oct 2022 09:12:07 +1000 Subject: [PATCH 1908/2223] cifs: fix regression in very old smb1 mounts BZ: 215375 Fixes: 76a3c92ec9e0 ("cifs: remove support for NTLM and weaker authentication algorithms") Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/connect.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 40900aace416e..e158257da1cd7 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3922,12 +3922,11 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, pSMB->AndXCommand = 0xFF; pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO); bcc_ptr = &pSMB->Password[0]; - if (tcon->pipe || (ses->server->sec_mode & SECMODE_USER)) { - pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ - *bcc_ptr = 0; /* password is null byte */ - bcc_ptr++; /* skip password */ - /* already aligned so no need to do it below */ - } + + pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ + *bcc_ptr = 0; /* password is null byte */ + bcc_ptr++; /* skip password */ + /* already aligned so no need to do it below */ if (ses->server->sign) smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; -- GitLab From 977bb6530807a9a8e7f29d7dfba5737135b50df6 Mon Sep 17 00:00:00 2001 From: Steve French <stfrench@microsoft.com> Date: Tue, 11 Oct 2022 23:26:33 -0500 Subject: [PATCH 1909/2223] smb3: clarify multichannel warning When server does not return network interfaces, clarify the message to indicate that "multichannel not available" not just that "empty network interface returned by server ..." 
Suggested-by: Tom Talpey <tom@talpey.com> Reviewed-by: Bharath SM <bharathsm@microsoft.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/smb2ops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 5187250c5f662..32d5387a9ddec 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -550,7 +550,8 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, /* avoid spamming logs every 10 minutes, so log only in mount */ if ((ses->chan_max > 1) && in_mount) cifs_dbg(VFS, - "empty network interface list returned by server %s\n", + "multichannel not available\n" + "Empty network interface list returned by server %s\n", ses->server->hostname); rc = -EINVAL; goto out; -- GitLab From 76894f3e2f71177747b8b4763fb180e800279585 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara <pc@cjr.nz> Date: Mon, 3 Oct 2022 18:43:50 -0300 Subject: [PATCH 1910/2223] cifs: improve symlink handling for smb2+ When creating inode for symlink, the client used to send below requests to fill it in: * create+query_info+close (STATUS_STOPPED_ON_SYMLINK) * create(+reparse_flag)+query_info+close (set file attrs) * create+ioctl(get_reparse)+close (query reparse tag) and then for every access to the symlink dentry, the ->link() method would send another: * create+ioctl(get_reparse)+close (parse symlink) So, in order to improve: (i) Get rid of unnecessary roundtrips and then resolve symlinks as follows: * create+query_info+close (STATUS_STOPPED_ON_SYMLINK + parse symlink + get reparse tag) * create(+reparse_flag)+query_info+close (set file attrs) (ii) Set the resolved symlink target directly in inode->i_link and use simple_get_link() for ->link() to simply return it. 
Signed-off-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cifsfs.c | 9 ++- fs/cifs/cifsglob.h | 46 +++++++++--- fs/cifs/cifsproto.h | 13 ++-- fs/cifs/dir.c | 30 +++----- fs/cifs/file.c | 41 ++++++----- fs/cifs/inode.c | 170 ++++++++++++++++++++++++++------------------ fs/cifs/link.c | 107 +--------------------------- fs/cifs/readdir.c | 2 + fs/cifs/smb1ops.c | 56 +++++++++------ fs/cifs/smb2file.c | 127 +++++++++++++++++++++++++++------ fs/cifs/smb2inode.c | 169 ++++++++++++++++++++++--------------------- fs/cifs/smb2ops.c | 109 ++++++---------------------- fs/cifs/smb2pdu.h | 3 + fs/cifs/smb2proto.h | 22 +++--- 14 files changed, 451 insertions(+), 453 deletions(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8042d7280dec1..c6ac19223ddc0 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -396,6 +396,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->epoch = 0; spin_lock_init(&cifs_inode->open_file_lock); generate_random_uuid(cifs_inode->lease_key); + cifs_inode->symlink_target = NULL; /* * Can not set i_flags here - they get immediately overwritten to zero @@ -412,7 +413,11 @@ cifs_alloc_inode(struct super_block *sb) static void cifs_free_inode(struct inode *inode) { - kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); + struct cifsInodeInfo *cinode = CIFS_I(inode); + + if (S_ISLNK(inode->i_mode)) + kfree(cinode->symlink_target); + kmem_cache_free(cifs_inode_cachep, cinode); } static void @@ -1139,7 +1144,7 @@ const struct inode_operations cifs_file_inode_ops = { }; const struct inode_operations cifs_symlink_inode_ops = { - .get_link = cifs_get_link, + .get_link = simple_get_link, .permission = cifs_permission, .listxattr = cifs_listxattr, }; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 52ddf4163b981..9c0253835f1c7 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -185,6 +185,19 @@ struct cifs_cred { struct 
cifs_ace *aces; }; +struct cifs_open_info_data { + char *symlink_target; + union { + struct smb2_file_all_info fi; + struct smb311_posix_qinfo posix_fi; + }; +}; + +static inline void cifs_free_open_info(struct cifs_open_info_data *data) +{ + kfree(data->symlink_target); +} + /* ***************************************************************** * Except the CIFS PDUs themselves all the @@ -307,20 +320,20 @@ struct smb_version_operations { int (*is_path_accessible)(const unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const char *); /* query path data from the server */ - int (*query_path_info)(const unsigned int, struct cifs_tcon *, - struct cifs_sb_info *, const char *, - FILE_ALL_INFO *, bool *, bool *); + int (*query_path_info)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); /* query file data from the server */ - int (*query_file_info)(const unsigned int, struct cifs_tcon *, - struct cifs_fid *, FILE_ALL_INFO *); + int (*query_file_info)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, struct cifs_open_info_data *data); /* query reparse tag from srv to determine which type of special file */ int (*query_reparse_tag)(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *path, __u32 *reparse_tag); /* get server index number */ - int (*get_srv_inum)(const unsigned int, struct cifs_tcon *, - struct cifs_sb_info *, const char *, - u64 *uniqueid, FILE_ALL_INFO *); + int (*get_srv_inum)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, u64 *uniqueid, + struct cifs_open_info_data *data); /* set size by path */ int (*set_path_size)(const unsigned int, struct cifs_tcon *, const char *, __u64, struct cifs_sb_info *, bool); @@ -369,8 +382,8 @@ struct smb_version_operations { struct cifs_sb_info *, const char *, char **, bool); 
/* open a file for non-posix mounts */ - int (*open)(const unsigned int, struct cifs_open_parms *, - __u32 *, FILE_ALL_INFO *); + int (*open)(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, + void *buf); /* set fid protocol-specific info */ void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32); /* close a file */ @@ -1123,6 +1136,7 @@ struct cifs_fattr { struct timespec64 cf_mtime; struct timespec64 cf_ctime; u32 cf_cifstag; + char *cf_symlink_target; }; /* @@ -1385,6 +1399,7 @@ struct cifsFileInfo { struct work_struct put; /* work for the final part of _put */ struct delayed_work deferred; bool deferred_close_scheduled; /* Flag to indicate close is scheduled */ + char *symlink_target; }; struct cifs_io_parms { @@ -1543,6 +1558,7 @@ struct cifsInodeInfo { struct list_head deferred_closes; /* list of deferred closes */ spinlock_t deferred_lock; /* protection on deferred list */ bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */ + char *symlink_target; }; static inline struct cifsInodeInfo * @@ -2111,4 +2127,14 @@ static inline size_t ntlmssp_workstation_name_size(const struct cifs_ses *ses) return sizeof(ses->workstation_name); } +static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const FILE_ALL_INFO *src) +{ + memcpy(dst, src, (size_t)((u8 *)&src->AccessFlags - (u8 *)src)); + dst->AccessFlags = src->AccessFlags; + dst->CurrentByteOffset = src->CurrentByteOffset; + dst->Mode = src->Mode; + dst->AlignmentRequirement = src->AlignmentRequirement; + dst->FileNameLength = src->FileNameLength; +} + #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 84ec71bdfacdf..83e83d8beabba 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -182,10 +182,9 @@ extern int cifs_unlock_range(struct cifsFileInfo *cfile, extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); extern void cifs_down_write(struct rw_semaphore *sem); -extern struct 
cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, - struct file *file, - struct tcon_link *tlink, - __u32 oplock); +struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, + struct tcon_link *tlink, __u32 oplock, + const char *symlink_target); extern int cifs_posix_open(const char *full_path, struct inode **inode, struct super_block *sb, int mode, unsigned int f_flags, __u32 *oplock, __u16 *netfid, @@ -200,9 +199,9 @@ extern int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr); extern struct inode *cifs_iget(struct super_block *sb, struct cifs_fattr *fattr); -extern int cifs_get_inode_info(struct inode **inode, const char *full_path, - FILE_ALL_INFO *data, struct super_block *sb, - int xid, const struct cifs_fid *fid); +int cifs_get_inode_info(struct inode **inode, const char *full_path, + struct cifs_open_info_data *data, struct super_block *sb, int xid, + const struct cifs_fid *fid); extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path, struct super_block *sb, unsigned int xid); extern int cifs_get_inode_info_unix(struct inode **pinode, diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index f58869306309f..cbd46ac59cd2f 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -165,10 +165,9 @@ check_name(struct dentry *direntry, struct cifs_tcon *tcon) /* Inode operations in similar order to how they appear in Linux file fs.h */ -static int -cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, - struct tcon_link *tlink, unsigned oflags, umode_t mode, - __u32 *oplock, struct cifs_fid *fid) +static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, + struct tcon_link *tlink, unsigned int oflags, umode_t mode, __u32 *oplock, + struct cifs_fid *fid, struct cifs_open_info_data *buf) { int rc = -ENOENT; int create_options = CREATE_NOT_DIR; @@ -177,7 +176,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, struct 
cifs_tcon *tcon = tlink_tcon(tlink); const char *full_path; void *page = alloc_dentry_path(); - FILE_ALL_INFO *buf = NULL; struct inode *newinode = NULL; int disposition; struct TCP_Server_Info *server = tcon->ses->server; @@ -290,12 +288,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, goto out; } - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { - rc = -ENOMEM; - goto out; - } - /* * if we're not using unix extensions, see if we need to set * ATTR_READONLY on the create call @@ -364,8 +356,7 @@ cifs_create_get_file_info: { #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ /* TODO: Add support for calling POSIX query info here, but passing in fid */ - rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb, - xid, fid); + rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb, xid, fid); if (newinode) { if (server->ops->set_lease_key) server->ops->set_lease_key(newinode, fid); @@ -402,7 +393,6 @@ cifs_create_set_dentry: d_add(direntry, newinode); out: - kfree(buf); free_dentry_path(page); return rc; @@ -427,6 +417,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, struct cifs_pending_open open; __u32 oplock; struct cifsFileInfo *file_info; + struct cifs_open_info_data buf = {}; if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) return -EIO; @@ -484,8 +475,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, cifs_add_pending_open(&fid, tlink, &open); rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fid); - + &oplock, &fid, &buf); if (rc) { cifs_del_pending_open(&open); goto out; @@ -510,7 +500,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, file->f_op = &cifs_file_direct_ops; } - file_info = cifs_new_fileinfo(&fid, file, tlink, oplock); + file_info = cifs_new_fileinfo(&fid, file, tlink, oplock, buf.symlink_target); if (file_info == NULL) { if (server->ops->close) server->ops->close(xid, tcon, &fid); @@ -526,6 
+516,7 @@ out: cifs_put_tlink(tlink); out_free_xid: free_xid(xid); + cifs_free_open_info(&buf); return rc; } @@ -547,6 +538,7 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, struct TCP_Server_Info *server; struct cifs_fid fid; __u32 oplock; + struct cifs_open_info_data buf = {}; cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n", inode, direntry, direntry); @@ -565,11 +557,11 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, if (server->ops->new_lease_key) server->ops->new_lease_key(&fid); - rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fid); + rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, &oplock, &fid, &buf); if (!rc && server->ops->close) server->ops->close(xid, tcon, &fid); + cifs_free_open_info(&buf); cifs_put_tlink(tlink); out_free_xid: free_xid(xid); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7d756721e1a68..dcec1690312be 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -209,16 +209,14 @@ posix_open_ret: } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ -static int -cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, - struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock, - struct cifs_fid *fid, unsigned int xid) +static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock, + struct cifs_fid *fid, unsigned int xid, struct cifs_open_info_data *buf) { int rc; int desired_access; int disposition; int create_options = CREATE_NOT_DIR; - FILE_ALL_INFO *buf; struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; @@ -255,10 +253,6 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci /* BB pass O_SYNC flag through on file attributes .. 
BB */ - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (!buf) - return -ENOMEM; - /* O_SYNC also has bit for O_DSYNC so following check picks up either */ if (f_flags & O_SYNC) create_options |= CREATE_WRITE_THROUGH; @@ -276,9 +270,8 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci oparms.reconnect = false; rc = server->ops->open(xid, &oparms, oplock, buf); - if (rc) - goto out; + return rc; /* TODO: Add support for calling posix query info but with passing in fid */ if (tcon->unix_ext) @@ -294,8 +287,6 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci rc = -EOPENSTALE; } -out: - kfree(buf); return rc; } @@ -325,9 +316,9 @@ cifs_down_write(struct rw_semaphore *sem) static void cifsFileInfo_put_work(struct work_struct *work); -struct cifsFileInfo * -cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, - struct tcon_link *tlink, __u32 oplock) +struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, + struct tcon_link *tlink, __u32 oplock, + const char *symlink_target) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); @@ -347,6 +338,15 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, return NULL; } + if (symlink_target) { + cfile->symlink_target = kstrdup(symlink_target, GFP_KERNEL); + if (!cfile->symlink_target) { + kfree(fdlocks); + kfree(cfile); + return NULL; + } + } + INIT_LIST_HEAD(&fdlocks->locks); fdlocks->cfile = cfile; cfile->llist = fdlocks; @@ -440,6 +440,7 @@ static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file) cifs_put_tlink(cifs_file->tlink); dput(cifs_file->dentry); cifs_sb_deactive(sb); + kfree(cifs_file->symlink_target); kfree(cifs_file); } @@ -572,6 +573,7 @@ int cifs_open(struct inode *inode, struct file *file) bool posix_open_ok = false; struct cifs_fid fid; struct cifs_pending_open open; + struct cifs_open_info_data data = {}; xid = get_xid(); @@ -662,15 +664,15 @@ int 
cifs_open(struct inode *inode, struct file *file) if (server->ops->get_lease_key) server->ops->get_lease_key(inode, &fid); - rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, - file->f_flags, &oplock, &fid, xid); + rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, file->f_flags, &oplock, &fid, + xid, &data); if (rc) { cifs_del_pending_open(&open); goto out; } } - cfile = cifs_new_fileinfo(&fid, file, tlink, oplock); + cfile = cifs_new_fileinfo(&fid, file, tlink, oplock, data.symlink_target); if (cfile == NULL) { if (server->ops->close) server->ops->close(xid, tcon, &fid); @@ -712,6 +714,7 @@ out: free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); + cifs_free_open_info(&data); return rc; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index ad10c61ab5c9d..be6dafcb25e31 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -210,6 +210,17 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) */ inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9; } + + if (S_ISLNK(fattr->cf_mode)) { + kfree(cifs_i->symlink_target); + cifs_i->symlink_target = fattr->cf_symlink_target; + fattr->cf_symlink_target = NULL; + + if (unlikely(!cifs_i->symlink_target)) + inode->i_link = ERR_PTR(-EOPNOTSUPP); + else + inode->i_link = cifs_i->symlink_target; + } spin_unlock(&inode->i_lock); if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) @@ -347,13 +358,20 @@ cifs_get_file_info_unix(struct file *filp) int rc; unsigned int xid; FILE_UNIX_BASIC_INFO find_data; - struct cifs_fattr fattr; + struct cifs_fattr fattr = {}; struct inode *inode = file_inode(filp); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsFileInfo *cfile = filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); xid = get_xid(); + + if (cfile->symlink_target) { + fattr.cf_symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!fattr.cf_symlink_target) + return -ENOMEM; + } + rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->fid.netfid, &find_data); if (!rc) { 
cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); @@ -378,6 +396,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, FILE_UNIX_BASIC_INFO find_data; struct cifs_fattr fattr; struct cifs_tcon *tcon; + struct TCP_Server_Info *server; struct tcon_link *tlink; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -387,10 +406,12 @@ int cifs_get_inode_info_unix(struct inode **pinode, if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + server = tcon->ses->server; /* could have done a find first instead but this returns more info */ rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, cifs_sb->local_nls, cifs_remap(cifs_sb)); + cifs_dbg(FYI, "%s: query path info: rc = %d\n", __func__, rc); cifs_put_tlink(tlink); if (!rc) { @@ -410,6 +431,17 @@ int cifs_get_inode_info_unix(struct inode **pinode, cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } + if (S_ISLNK(fattr.cf_mode) && !fattr.cf_symlink_target) { + if (!server->ops->query_symlink) + return -EOPNOTSUPP; + rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, + &fattr.cf_symlink_target, false); + if (rc) { + cifs_dbg(FYI, "%s: query_symlink: %d\n", __func__, rc); + goto cgiiu_exit; + } + } + if (*pinode == NULL) { /* get new inode */ cifs_fill_uniqueid(sb, &fattr); @@ -432,6 +464,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, } cgiiu_exit: + kfree(fattr.cf_symlink_target); return rc; } #else @@ -601,10 +634,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, } /* Fill a cifs_fattr struct with info from POSIX info struct */ -static void -smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo *info, - struct super_block *sb, bool adjust_tz, bool symlink) +static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, + struct super_block *sb, bool adjust_tz, bool symlink) { + struct smb311_posix_qinfo *info = &data->posix_fi; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct 
cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); @@ -639,6 +672,8 @@ smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo * if (symlink) { fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; + fattr->cf_symlink_target = data->symlink_target; + data->symlink_target = NULL; } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode |= S_IFDIR; fattr->cf_dtype = DT_DIR; @@ -655,13 +690,11 @@ smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo * fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink); } - -/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ -static void -cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, - struct super_block *sb, bool adjust_tz, - bool symlink, u32 reparse_tag) +static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, + struct super_block *sb, bool adjust_tz, bool symlink, + u32 reparse_tag) { + struct smb2_file_all_info *info = &data->fi; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); @@ -703,7 +736,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, } else if (reparse_tag == IO_REPARSE_TAG_LX_BLK) { fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode; fattr->cf_dtype = DT_BLK; - } else if (symlink) { /* TODO add more reparse tag checks */ + } else if (symlink || reparse_tag == IO_REPARSE_TAG_SYMLINK || + reparse_tag == IO_REPARSE_TAG_NFS) { fattr->cf_mode = S_IFLNK; fattr->cf_dtype = DT_LNK; } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { @@ -735,6 +769,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, } } + if (S_ISLNK(fattr->cf_mode)) { + fattr->cf_symlink_target = data->symlink_target; + data->symlink_target = NULL; + } + fattr->cf_uid = cifs_sb->ctx->linux_uid; fattr->cf_gid = cifs_sb->ctx->linux_gid; } @@ -744,23 +783,28 @@ cifs_get_file_info(struct file *filp) { int rc; unsigned int xid; - FILE_ALL_INFO 
find_data; + struct cifs_open_info_data data = {}; struct cifs_fattr fattr; struct inode *inode = file_inode(filp); struct cifsFileInfo *cfile = filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; + bool symlink = false; + u32 tag = 0; if (!server->ops->query_file_info) return -ENOSYS; xid = get_xid(); - rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data); + rc = server->ops->query_file_info(xid, tcon, cfile, &data); switch (rc) { case 0: /* TODO: add support to query reparse tag */ - cifs_all_info_to_fattr(&fattr, &find_data, inode->i_sb, false, - false, 0 /* no reparse tag */); + if (data.symlink_target) { + symlink = true; + tag = IO_REPARSE_TAG_SYMLINK; + } + cifs_open_info_to_fattr(&fattr, &data, inode->i_sb, false, symlink, tag); break; case -EREMOTE: cifs_create_dfs_fattr(&fattr, inode->i_sb); @@ -789,6 +833,7 @@ cifs_get_file_info(struct file *filp) /* if filetype is different, return error */ rc = cifs_fattr_to_inode(inode, &fattr); cgfi_exit: + cifs_free_open_info(&data); free_xid(xid); return rc; } @@ -860,14 +905,9 @@ cifs_backup_query_path_info(int xid, } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ -static void -cifs_set_fattr_ino(int xid, - struct cifs_tcon *tcon, - struct super_block *sb, - struct inode **inode, - const char *full_path, - FILE_ALL_INFO *data, - struct cifs_fattr *fattr) +static void cifs_set_fattr_ino(int xid, struct cifs_tcon *tcon, struct super_block *sb, + struct inode **inode, const char *full_path, + struct cifs_open_info_data *data, struct cifs_fattr *fattr) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct TCP_Server_Info *server = tcon->ses->server; @@ -885,11 +925,8 @@ cifs_set_fattr_ino(int xid, * If we have an inode pass a NULL tcon to ensure we don't * make a round trip to the server. This only works for SMB2+. */ - rc = server->ops->get_srv_inum(xid, - *inode ? 
NULL : tcon, - cifs_sb, full_path, - &fattr->cf_uniqueid, - data); + rc = server->ops->get_srv_inum(xid, *inode ? NULL : tcon, cifs_sb, full_path, + &fattr->cf_uniqueid, data); if (rc) { /* * If that fails reuse existing ino or generate one @@ -923,14 +960,10 @@ static inline bool is_inode_cache_good(struct inode *ino) return ino && CIFS_CACHE_READ(CIFS_I(ino)) && CIFS_I(ino)->time != 0; } -int -cifs_get_inode_info(struct inode **inode, - const char *full_path, - FILE_ALL_INFO *in_data, - struct super_block *sb, int xid, - const struct cifs_fid *fid) +int cifs_get_inode_info(struct inode **inode, const char *full_path, + struct cifs_open_info_data *data, struct super_block *sb, int xid, + const struct cifs_fid *fid) { - struct cifs_tcon *tcon; struct TCP_Server_Info *server; struct tcon_link *tlink; @@ -938,8 +971,7 @@ cifs_get_inode_info(struct inode **inode, bool adjust_tz = false; struct cifs_fattr fattr = {0}; bool is_reparse_point = false; - FILE_ALL_INFO *data = in_data; - FILE_ALL_INFO *tmp_data = NULL; + struct cifs_open_info_data tmp_data = {}; void *smb1_backup_rsp_buf = NULL; int rc = 0; int tmprc = 0; @@ -960,21 +992,15 @@ cifs_get_inode_info(struct inode **inode, cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); goto out; } - tmp_data = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (!tmp_data) { - rc = -ENOMEM; - goto out; - } - rc = server->ops->query_path_info(xid, tcon, cifs_sb, - full_path, tmp_data, - &adjust_tz, &is_reparse_point); + rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, &tmp_data, + &adjust_tz, &is_reparse_point); #ifdef CONFIG_CIFS_DFS_UPCALL if (rc == -ENOENT && is_tcon_dfs(tcon)) rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, cifs_sb, full_path); #endif - data = tmp_data; + data = &tmp_data; } /* @@ -988,14 +1014,24 @@ cifs_get_inode_info(struct inode **inode, * since we have to check if its reparse tag matches a known * special file type e.g. symlink or fifo or char etc. 
*/ - if ((le32_to_cpu(data->Attributes) & ATTR_REPARSE) && - server->ops->query_reparse_tag) { - rc = server->ops->query_reparse_tag(xid, tcon, cifs_sb, - full_path, &reparse_tag); - cifs_dbg(FYI, "reparse tag 0x%x\n", reparse_tag); + if (is_reparse_point && data->symlink_target) { + reparse_tag = IO_REPARSE_TAG_SYMLINK; + } else if ((le32_to_cpu(data->fi.Attributes) & ATTR_REPARSE) && + server->ops->query_reparse_tag) { + tmprc = server->ops->query_reparse_tag(xid, tcon, cifs_sb, full_path, + &reparse_tag); + if (tmprc) + cifs_dbg(FYI, "%s: query_reparse_tag: rc = %d\n", __func__, tmprc); + if (server->ops->query_symlink) { + tmprc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, + &data->symlink_target, + is_reparse_point); + if (tmprc) + cifs_dbg(FYI, "%s: query_symlink: rc = %d\n", __func__, + tmprc); + } } - cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, - is_reparse_point, reparse_tag); + cifs_open_info_to_fattr(&fattr, data, sb, adjust_tz, is_reparse_point, reparse_tag); break; case -EREMOTE: /* DFS link, no metadata available on this server */ @@ -1014,18 +1050,20 @@ cifs_get_inode_info(struct inode **inode, */ if (backup_cred(cifs_sb) && is_smb1_server(server)) { /* for easier reading */ + FILE_ALL_INFO *fi; FILE_DIRECTORY_INFO *fdi; SEARCH_ID_FULL_DIR_INFO *si; rc = cifs_backup_query_path_info(xid, tcon, sb, full_path, &smb1_backup_rsp_buf, - &data); + &fi); if (rc) goto out; - fdi = (FILE_DIRECTORY_INFO *)data; - si = (SEARCH_ID_FULL_DIR_INFO *)data; + move_cifs_info_to_smb2(&data->fi, fi); + fdi = (FILE_DIRECTORY_INFO *)fi; + si = (SEARCH_ID_FULL_DIR_INFO *)fi; cifs_dir_info_to_fattr(&fattr, fdi, cifs_sb); fattr.cf_uniqueid = le64_to_cpu(si->UniqueId); @@ -1123,7 +1161,8 @@ handle_mnt_opt: out: cifs_buf_release(smb1_backup_rsp_buf); cifs_put_tlink(tlink); - kfree(tmp_data); + cifs_free_open_info(&tmp_data); + kfree(fattr.cf_symlink_target); return rc; } @@ -1138,7 +1177,7 @@ smb311_posix_get_inode_info(struct inode **inode, bool 
adjust_tz = false; struct cifs_fattr fattr = {0}; bool symlink = false; - struct smb311_posix_qinfo *data = NULL; + struct cifs_open_info_data data = {}; int rc = 0; int tmprc = 0; @@ -1155,15 +1194,9 @@ smb311_posix_get_inode_info(struct inode **inode, cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); goto out; } - data = kmalloc(sizeof(struct smb311_posix_qinfo), GFP_KERNEL); - if (!data) { - rc = -ENOMEM; - goto out; - } - rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, - full_path, data, - &adjust_tz, &symlink); + rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data, &adjust_tz, + &symlink); /* * 2. Convert it to internal cifs metadata (fattr) @@ -1171,7 +1204,7 @@ smb311_posix_get_inode_info(struct inode **inode, switch (rc) { case 0: - smb311_posix_info_to_fattr(&fattr, data, sb, adjust_tz, symlink); + smb311_posix_info_to_fattr(&fattr, &data, sb, adjust_tz, symlink); break; case -EREMOTE: /* DFS link, no metadata available on this server */ @@ -1228,7 +1261,8 @@ smb311_posix_get_inode_info(struct inode **inode, } out: cifs_put_tlink(tlink); - kfree(data); + cifs_free_open_info(&data); + kfree(fattr.cf_symlink_target); return rc; } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index cd29c296cec60..bd374feeccaa1 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -201,40 +201,6 @@ out: return rc; } -static int -query_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const unsigned char *path, - char **symlinkinfo) -{ - int rc; - u8 *buf = NULL; - unsigned int link_len = 0; - unsigned int bytes_read = 0; - - buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (tcon->ses->server->ops->query_mf_symlink) - rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon, - cifs_sb, path, buf, &bytes_read); - else - rc = -ENOSYS; - - if (rc) - goto out; - - if (bytes_read == 0) { /* not a symlink */ - rc = -EINVAL; - goto out; - } - - rc = parse_mf_symlink(buf, 
bytes_read, &link_len, symlinkinfo); -out: - kfree(buf); - return rc; -} - int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, @@ -244,6 +210,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, u8 *buf = NULL; unsigned int link_len = 0; unsigned int bytes_read = 0; + char *symlink = NULL; if (!couldbe_mf_symlink(fattr)) /* it's not a symlink */ @@ -265,7 +232,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, if (bytes_read == 0) /* not a symlink */ goto out; - rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL); + rc = parse_mf_symlink(buf, bytes_read, &link_len, &symlink); if (rc == -EINVAL) { /* it's not a symlink */ rc = 0; @@ -280,6 +247,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, fattr->cf_mode &= ~S_IFMT; fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO; fattr->cf_dtype = DT_LNK; + fattr->cf_symlink_target = symlink; out: kfree(buf); return rc; @@ -599,75 +567,6 @@ cifs_hl_exit: return rc; } -const char * -cifs_get_link(struct dentry *direntry, struct inode *inode, - struct delayed_call *done) -{ - int rc = -ENOMEM; - unsigned int xid; - const char *full_path; - void *page; - char *target_path = NULL; - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct tcon_link *tlink = NULL; - struct cifs_tcon *tcon; - struct TCP_Server_Info *server; - - if (!direntry) - return ERR_PTR(-ECHILD); - - xid = get_xid(); - - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) { - free_xid(xid); - return ERR_CAST(tlink); - } - tcon = tlink_tcon(tlink); - server = tcon->ses->server; - - page = alloc_dentry_path(); - full_path = build_path_from_dentry(direntry, page); - if (IS_ERR(full_path)) { - free_xid(xid); - cifs_put_tlink(tlink); - free_dentry_path(page); - return ERR_CAST(full_path); - } - - cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode); - - rc = -EACCES; - /* - * First try Minshall+French Symlinks, if configured - * and 
fallback to UNIX Extensions Symlinks. - */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) - rc = query_mf_symlink(xid, tcon, cifs_sb, full_path, - &target_path); - - if (rc != 0 && server->ops->query_symlink) { - struct cifsInodeInfo *cifsi = CIFS_I(inode); - bool reparse_point = false; - - if (cifsi->cifsAttrs & ATTR_REPARSE) - reparse_point = true; - - rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, - &target_path, reparse_point); - } - - free_dentry_path(page); - free_xid(xid); - cifs_put_tlink(tlink); - if (rc != 0) { - kfree(target_path); - return ERR_PTR(rc); - } - set_delayed_call(done, kfree_link, target_path); - return target_path; -} - int cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, struct dentry *direntry, const char *symname) diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 1bb4624e768bf..2d75ba5aaa8ad 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -1011,6 +1011,8 @@ static int cifs_filldir(char *find_entry, struct file *file, cifs_unix_basic_to_fattr(&fattr, &((FILE_UNIX_INFO *)find_entry)->basic, cifs_sb); + if (S_ISLNK(fattr.cf_mode)) + fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; break; case SMB_FIND_FILE_INFO_STANDARD: cifs_std_info_to_fattr(&fattr, diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index f36b2d2d40ca3..50480751e521c 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -542,31 +542,32 @@ cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static int -cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - FILE_ALL_INFO *data, bool *adjustTZ, bool *symlink) +static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjustTZ, bool *symlink) { int rc; + FILE_ALL_INFO fi = {}; *symlink = false; /* could do find first instead but this returns more 
info */ - rc = CIFSSMBQPathInfo(xid, tcon, full_path, data, 0 /* not legacy */, - cifs_sb->local_nls, cifs_remap(cifs_sb)); + rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, cifs_sb->local_nls, + cifs_remap(cifs_sb)); /* * BB optimize code so we do not make the above call when server claims * no NT SMB support and the above call failed at least once - set flag * in tcon or mount. */ if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) { - rc = SMBQueryInformation(xid, tcon, full_path, data, - cifs_sb->local_nls, + rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls, cifs_remap(cifs_sb)); + if (!rc) + move_cifs_info_to_smb2(&data->fi, &fi); *adjustTZ = true; } - if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) { + if (!rc && (le32_to_cpu(fi.Attributes) & ATTR_REPARSE)) { int tmprc; int oplock = 0; struct cifs_fid fid; @@ -592,10 +593,9 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static int -cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - u64 *uniqueid, FILE_ALL_INFO *data) +static int cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + u64 *uniqueid, struct cifs_open_info_data *unused) { /* * We can not use the IndexNumber field by default from Windows or @@ -613,11 +613,22 @@ cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, cifs_remap(cifs_sb)); } -static int -cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_fid *fid, FILE_ALL_INFO *data) +static int cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, struct cifs_open_info_data *data) { - return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data); + int rc; + FILE_ALL_INFO fi = {}; + + if (cfile->symlink_target) { + data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if 
(!data->symlink_target) + return -ENOMEM; + } + + rc = CIFSSMBQFileInfo(xid, tcon, cfile->fid.netfid, &fi); + if (!rc) + move_cifs_info_to_smb2(&data->fi, &fi); + return rc; } static void @@ -702,19 +713,20 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path, cifsInode->cifsAttrs = dosattrs; } -static int -cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, - __u32 *oplock, FILE_ALL_INFO *buf) +static int cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, + void *buf) { + FILE_ALL_INFO *fi = buf; + if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS)) return SMBLegacyOpen(xid, oparms->tcon, oparms->path, oparms->disposition, oparms->desired_access, oparms->create_options, - &oparms->fid->netfid, oplock, buf, + &oparms->fid->netfid, oplock, fi, oparms->cifs_sb->local_nls, cifs_remap(oparms->cifs_sb)); - return CIFS_open(xid, oparms, oplock, buf); + return CIFS_open(xid, oparms, oplock, fi); } static void diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 9dfd2dd612c25..4992b43616a7a 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -20,40 +20,125 @@ #include "cifs_unicode.h" #include "fscache.h" #include "smb2proto.h" +#include "smb2status.h" -int -smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, - __u32 *oplock, FILE_ALL_INFO *buf) +static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov) +{ + struct smb2_err_rsp *err = iov->iov_base; + struct smb2_symlink_err_rsp *sym = ERR_PTR(-EINVAL); + u32 len; + + if (err->ErrorContextCount) { + struct smb2_error_context_rsp *p, *end; + + len = (u32)err->ErrorContextCount * (offsetof(struct smb2_error_context_rsp, + ErrorContextData) + + sizeof(struct smb2_symlink_err_rsp)); + if (le32_to_cpu(err->ByteCount) < len || iov->iov_len < len + sizeof(*err)) + return ERR_PTR(-EINVAL); + + p = (struct smb2_error_context_rsp *)err->ErrorData; + end = (struct smb2_error_context_rsp *)((u8 *)err + iov->iov_len); + 
do { + if (le32_to_cpu(p->ErrorId) == SMB2_ERROR_ID_DEFAULT) { + sym = (struct smb2_symlink_err_rsp *)&p->ErrorContextData; + break; + } + cifs_dbg(FYI, "%s: skipping unhandled error context: 0x%x\n", + __func__, le32_to_cpu(p->ErrorId)); + + len = ALIGN(le32_to_cpu(p->ErrorDataLength), 8); + p = (struct smb2_error_context_rsp *)((u8 *)&p->ErrorContextData + len); + } while (p < end); + } else if (le32_to_cpu(err->ByteCount) >= sizeof(*sym) && + iov->iov_len >= SMB2_SYMLINK_STRUCT_SIZE) { + sym = (struct smb2_symlink_err_rsp *)err->ErrorData; + } + + if (!IS_ERR(sym) && (le32_to_cpu(sym->SymLinkErrorTag) != SYMLINK_ERROR_TAG || + le32_to_cpu(sym->ReparseTag) != IO_REPARSE_TAG_SYMLINK)) + sym = ERR_PTR(-EINVAL); + + return sym; +} + +int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, char **path) +{ + struct smb2_symlink_err_rsp *sym; + unsigned int sub_offs, sub_len; + unsigned int print_offs, print_len; + char *s; + + if (!cifs_sb || !iov || !iov->iov_base || !iov->iov_len || !path) + return -EINVAL; + + sym = symlink_data(iov); + if (IS_ERR(sym)) + return PTR_ERR(sym); + + sub_len = le16_to_cpu(sym->SubstituteNameLength); + sub_offs = le16_to_cpu(sym->SubstituteNameOffset); + print_len = le16_to_cpu(sym->PrintNameLength); + print_offs = le16_to_cpu(sym->PrintNameOffset); + + if (iov->iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offs + sub_len || + iov->iov_len < SMB2_SYMLINK_STRUCT_SIZE + print_offs + print_len) + return -EINVAL; + + s = cifs_strndup_from_utf16((char *)sym->PathBuffer + sub_offs, sub_len, true, + cifs_sb->local_nls); + if (!s) + return -ENOMEM; + convert_delimiter(s, '/'); + cifs_dbg(FYI, "%s: symlink target: %s\n", __func__, s); + + *path = s; + return 0; +} + +int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, void *buf) { int rc; __le16 *smb2_path; - struct smb2_file_all_info *smb2_data = NULL; __u8 smb2_oplock; + struct cifs_open_info_data *data = buf; + struct 
smb2_file_all_info file_info = {}; + struct smb2_file_all_info *smb2_data = data ? &file_info : NULL; + struct kvec err_iov = {}; + int err_buftype = CIFS_NO_BUFFER; struct cifs_fid *fid = oparms->fid; struct network_resiliency_req nr_ioctl_req; smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb); - if (smb2_path == NULL) { - rc = -ENOMEM; - goto out; - } - - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, - GFP_KERNEL); - if (smb2_data == NULL) { - rc = -ENOMEM; - goto out; - } + if (smb2_path == NULL) + return -ENOMEM; oparms->desired_access |= FILE_READ_ATTRIBUTES; smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH; - rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, - NULL, NULL); + rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov, + &err_buftype); + if (rc && data) { + struct smb2_hdr *hdr = err_iov.iov_base; + + if (unlikely(!err_iov.iov_base || err_buftype == CIFS_NO_BUFFER)) + rc = -ENOMEM; + else if (hdr->Status == STATUS_STOPPED_ON_SYMLINK && oparms->cifs_sb) { + rc = smb2_parse_symlink_response(oparms->cifs_sb, &err_iov, + &data->symlink_target); + if (!rc) { + memset(smb2_data, 0, sizeof(*smb2_data)); + oparms->create_options |= OPEN_REPARSE_POINT; + rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, + NULL, NULL, NULL); + oparms->create_options &= ~OPEN_REPARSE_POINT; + } + } + } + if (rc) goto out; - if (oparms->tcon->use_resilient) { /* default timeout is 0, servers pick default (120 seconds) */ nr_ioctl_req.Timeout = @@ -73,7 +158,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, rc = 0; } - if (buf) { + if (smb2_data) { /* if open response does not have IndexNumber field - get it */ if (smb2_data->IndexNumber == 0) { rc = SMB2_get_srv_num(xid, oparms->tcon, @@ -89,12 +174,12 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, rc = 0; } } - move_smb2_info_to_cifs(buf, smb2_data); + memcpy(&data->fi, smb2_data, 
sizeof(data->fi)); } *oplock = smb2_oplock; out: - kfree(smb2_data); + free_rsp_buf(err_buftype, err_iov.iov_base); kfree(smb2_path); return rc; } diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index bb3e3d5a0cdac..adf71b328f328 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -24,6 +24,7 @@ #include "smb2pdu.h" #include "smb2proto.h" #include "cached_dir.h" +#include "smb2status.h" static void free_set_inf_compound(struct smb_rqst *rqst) @@ -50,13 +51,15 @@ struct cop_vars { /* * note: If cfile is passed, the reference to it is dropped here. * So make sure that you do not reuse cfile after return from this func. + * + * If passing @err_iov and @err_buftype, ensure to make them both large enough (>= 3) to hold all + * error responses. Caller is also responsible for freeing them up. */ -static int -smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - __u32 desired_access, __u32 create_disposition, - __u32 create_options, umode_t mode, void *ptr, int command, - struct cifsFileInfo *cfile) +static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + __u32 desired_access, __u32 create_disposition, __u32 create_options, + umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile, + struct kvec *err_iov, int *err_buftype) { struct cop_vars *vars = NULL; struct kvec *rsp_iov; @@ -70,6 +73,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, int num_rqst = 0; int resp_buftype[3]; struct smb2_query_info_rsp *qi_rsp = NULL; + struct cifs_open_info_data *idata; int flags = 0; __u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0}; unsigned int size[2]; @@ -385,14 +389,19 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, switch (command) { case SMB2_OP_QUERY_INFO: + idata = ptr; + if (rc == 0 && cfile && cfile->symlink_target) { + idata->symlink_target = 
kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!idata->symlink_target) + rc = -ENOMEM; + } if (rc == 0) { qi_rsp = (struct smb2_query_info_rsp *) rsp_iov[1].iov_base; rc = smb2_validate_and_copy_iov( le16_to_cpu(qi_rsp->OutputBufferOffset), le32_to_cpu(qi_rsp->OutputBufferLength), - &rsp_iov[1], sizeof(struct smb2_file_all_info), - ptr); + &rsp_iov[1], sizeof(idata->fi), (char *)&idata->fi); } if (rqst[1].rq_iov) SMB2_query_info_free(&rqst[1]); @@ -406,13 +415,19 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, tcon->tid); break; case SMB2_OP_POSIX_QUERY_INFO: + if (rc == 0 && cfile && cfile->symlink_target) { + idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!idata->symlink_target) + rc = -ENOMEM; + } if (rc == 0) { qi_rsp = (struct smb2_query_info_rsp *) rsp_iov[1].iov_base; rc = smb2_validate_and_copy_iov( le16_to_cpu(qi_rsp->OutputBufferOffset), le32_to_cpu(qi_rsp->OutputBufferLength), - &rsp_iov[1], sizeof(struct smb311_posix_qinfo) /* add SIDs */, ptr); + &rsp_iov[1], sizeof(idata->posix_fi) /* add SIDs */, + (char *)&idata->posix_fi); } if (rqst[1].rq_iov) SMB2_query_info_free(&rqst[1]); @@ -477,42 +492,33 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, free_set_inf_compound(rqst); break; } - free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); - free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); - free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + + if (rc && err_iov && err_buftype) { + memcpy(err_iov, rsp_iov, 3 * sizeof(*err_iov)); + memcpy(err_buftype, resp_buftype, 3 * sizeof(*err_buftype)); + } else { + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + } kfree(vars); return rc; } -void -move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src) -{ - memcpy(dst, src, (size_t)(&src->CurrentByteOffset) - (size_t)src); - dst->CurrentByteOffset = 
src->CurrentByteOffset; - dst->Mode = src->Mode; - dst->AlignmentRequirement = src->AlignmentRequirement; - dst->IndexNumber1 = 0; /* we don't use it */ -} - -int -smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - FILE_ALL_INFO *data, bool *adjust_tz, bool *reparse) +int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse) { int rc; - struct smb2_file_all_info *smb2_data; __u32 create_options = 0; struct cifsFileInfo *cfile; struct cached_fid *cfid = NULL; + struct kvec err_iov[3] = {}; + int err_buftype[3] = {}; *adjust_tz = false; *reparse = false; - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, - GFP_KERNEL); - if (smb2_data == NULL) - return -ENOMEM; - if (strcmp(full_path, "")) rc = -ENOENT; else @@ -520,63 +526,58 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, /* If it is a root and its handle is cached then use it */ if (!rc) { if (cfid->file_all_info_is_valid) { - move_smb2_info_to_cifs(data, - &cfid->file_all_info); + memcpy(&data->fi, &cfid->file_all_info, sizeof(data->fi)); } else { - rc = SMB2_query_info(xid, tcon, - cfid->fid.persistent_fid, - cfid->fid.volatile_fid, smb2_data); - if (!rc) - move_smb2_info_to_cifs(data, smb2_data); + rc = SMB2_query_info(xid, tcon, cfid->fid.persistent_fid, + cfid->fid.volatile_fid, &data->fi); } close_cached_dir(cfid); - goto out; + return rc; } cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, - ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile, + err_iov, err_buftype); if (rc == -EOPNOTSUPP) { + if 
(err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && + ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE && + ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) { + rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target); + if (rc) + goto out; + } *reparse = true; create_options |= OPEN_REPARSE_POINT; /* Failed on a symbolic link - query a reparse point info */ cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, - smb2_data, SMB2_OP_QUERY_INFO, cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, + FILE_OPEN, create_options, ACL_NO_MODE, data, + SMB2_OP_QUERY_INFO, cfile, NULL, NULL); } - if (rc) - goto out; - move_smb2_info_to_cifs(data, smb2_data); out: - kfree(smb2_data); + free_rsp_buf(err_buftype[0], err_iov[0].iov_base); + free_rsp_buf(err_buftype[1], err_iov[1].iov_base); + free_rsp_buf(err_buftype[2], err_iov[2].iov_base); return rc; } -int -smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - struct smb311_posix_qinfo *data, bool *adjust_tz, bool *reparse) +int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse) { int rc; __u32 create_options = 0; struct cifsFileInfo *cfile; - struct smb311_posix_qinfo *smb2_data; + struct kvec err_iov[3] = {}; + int err_buftype[3] = {}; *adjust_tz = false; *reparse = false; - /* BB TODO: Make struct larger when add support for parsing owner SIDs */ - smb2_data = kzalloc(sizeof(struct smb311_posix_qinfo), - GFP_KERNEL); - if (smb2_data == NULL) - return -ENOMEM; - /* * BB TODO: Add support for using the cached root handle. 
* Create SMB2_query_posix_info worker function to do non-compounded query @@ -585,29 +586,32 @@ smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, */ cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, - ACL_NO_MODE, smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile, + err_iov, err_buftype); if (rc == -EOPNOTSUPP) { /* BB TODO: When support for special files added to Samba re-verify this path */ + if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && + ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE && + ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) { + rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target); + if (rc) + goto out; + } *reparse = true; create_options |= OPEN_REPARSE_POINT; /* Failed on a symbolic link - query a reparse point info */ cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, - smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, + FILE_OPEN, create_options, ACL_NO_MODE, data, + SMB2_OP_POSIX_QUERY_INFO, cfile, NULL, NULL); } - if (rc) - goto out; - - /* TODO: will need to allow for the 2 SIDs when add support for getting owner UID/GID */ - memcpy(data, smb2_data, sizeof(struct smb311_posix_qinfo)); out: - kfree(smb2_data); + free_rsp_buf(err_buftype[0], err_iov[0].iov_base); + free_rsp_buf(err_buftype[1], err_iov[1].iov_base); + free_rsp_buf(err_buftype[2], err_iov[2].iov_base); return rc; } @@ -619,7 +623,7 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, return 
smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR, - NULL); + NULL, NULL, NULL); } void @@ -641,7 +645,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, ACL_NO_MODE, - &data, SMB2_OP_SET_INFO, cfile); + &data, SMB2_OP_SET_INFO, cfile, NULL, NULL); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -652,7 +656,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE, - NULL, SMB2_OP_RMDIR, NULL); + NULL, SMB2_OP_RMDIR, NULL, NULL, NULL); } int @@ -661,7 +665,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL); + ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL); } static int @@ -680,7 +684,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, } rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name, - command, cfile); + command, cfile, NULL, NULL); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -720,7 +724,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); return smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE, - &eof, SMB2_OP_SET_EOF, cfile); + &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL); } int @@ -746,7 +750,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, - 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile); + 0, 
ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile, + NULL, NULL); cifs_put_tlink(tlink); return rc; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 32d5387a9ddec..73e951e9858c0 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -831,33 +831,25 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static int -smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - u64 *uniqueid, FILE_ALL_INFO *data) +static int smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + u64 *uniqueid, struct cifs_open_info_data *data) { - *uniqueid = le64_to_cpu(data->IndexNumber); + *uniqueid = le64_to_cpu(data->fi.IndexNumber); return 0; } -static int -smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_fid *fid, FILE_ALL_INFO *data) +static int smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, struct cifs_open_info_data *data) { - int rc; - struct smb2_file_all_info *smb2_data; + struct cifs_fid *fid = &cfile->fid; - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, - GFP_KERNEL); - if (smb2_data == NULL) - return -ENOMEM; - - rc = SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, - smb2_data); - if (!rc) - move_smb2_info_to_cifs(data, smb2_data); - kfree(smb2_data); - return rc; + if (cfile->symlink_target) { + data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!data->symlink_target) + return -ENOMEM; + } + return SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, &data->fi); } #ifdef CONFIG_CIFS_XATTR @@ -2828,9 +2820,6 @@ parse_reparse_point(struct reparse_data_buffer *buf, } } -#define SMB2_SYMLINK_STRUCT_SIZE \ - (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp)) - static int smb2_query_symlink(const unsigned int xid, struct 
cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, @@ -2842,13 +2831,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; struct kvec err_iov = {NULL, 0}; - struct smb2_err_rsp *err_buf = NULL; - struct smb2_symlink_err_rsp *symlink; struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); - unsigned int sub_len; - unsigned int sub_offset; - unsigned int print_len; - unsigned int print_offset; int flags = CIFS_CP_CREATE_CLOSE_OP; struct smb_rqst rqst[3]; int resp_buftype[3]; @@ -2965,47 +2948,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, goto querty_exit; } - err_buf = err_iov.iov_base; - if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) || - err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE) { - rc = -EINVAL; - goto querty_exit; - } - - symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData; - if (le32_to_cpu(symlink->SymLinkErrorTag) != SYMLINK_ERROR_TAG || - le32_to_cpu(symlink->ReparseTag) != IO_REPARSE_TAG_SYMLINK) { - rc = -EINVAL; - goto querty_exit; - } - - /* open must fail on symlink - reset rc */ - rc = 0; - sub_len = le16_to_cpu(symlink->SubstituteNameLength); - sub_offset = le16_to_cpu(symlink->SubstituteNameOffset); - print_len = le16_to_cpu(symlink->PrintNameLength); - print_offset = le16_to_cpu(symlink->PrintNameOffset); - - if (err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) { - rc = -EINVAL; - goto querty_exit; - } - - if (err_iov.iov_len < - SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) { - rc = -EINVAL; - goto querty_exit; - } - - *target_path = cifs_strndup_from_utf16( - (char *)symlink->PathBuffer + sub_offset, - sub_len, true, cifs_sb->local_nls); - if (!(*target_path)) { - rc = -ENOMEM; - goto querty_exit; - } - convert_delimiter(*target_path, '/'); - cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); + rc = smb2_parse_symlink_response(cifs_sb, &err_iov, 
target_path); querty_exit: cifs_dbg(FYI, "query symlink rc %d\n", rc); @@ -5115,7 +5058,7 @@ smb2_make_node(unsigned int xid, struct inode *inode, { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); int rc = -EPERM; - FILE_ALL_INFO *buf = NULL; + struct cifs_open_info_data buf = {}; struct cifs_io_parms io_parms = {0}; __u32 oplock = 0; struct cifs_fid fid; @@ -5131,7 +5074,7 @@ smb2_make_node(unsigned int xid, struct inode *inode, * and was used by default in earlier versions of Windows */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - goto out; + return rc; /* * TODO: Add ability to create instead via reparse point. Windows (e.g. @@ -5140,16 +5083,10 @@ smb2_make_node(unsigned int xid, struct inode *inode, */ if (!S_ISCHR(mode) && !S_ISBLK(mode)) - goto out; + return rc; cifs_dbg(FYI, "sfu compat create special file\n"); - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { - rc = -ENOMEM; - goto out; - } - oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = GENERIC_WRITE; @@ -5164,21 +5101,21 @@ smb2_make_node(unsigned int xid, struct inode *inode, oplock = REQ_OPLOCK; else oplock = 0; - rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf); if (rc) - goto out; + return rc; /* * BB Do not bother to decode buf since no local inode yet to put * timestamps in, but we can reuse it safely. 
*/ - pdev = (struct win_dev *)buf; + pdev = (struct win_dev *)&buf.fi; io_parms.pid = current->tgid; io_parms.tcon = tcon; io_parms.offset = 0; io_parms.length = sizeof(struct win_dev); - iov[1].iov_base = buf; + iov[1].iov_base = &buf.fi; iov[1].iov_len = sizeof(struct win_dev); if (S_ISCHR(mode)) { memcpy(pdev->type, "IntxCHR", 8); @@ -5197,8 +5134,8 @@ smb2_make_node(unsigned int xid, struct inode *inode, d_drop(dentry); /* FIXME: add code here to set EAs */ -out: - kfree(buf); + + cifs_free_open_info(&buf); return rc; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index f57881b8464fb..1237bb86e93a8 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -56,6 +56,9 @@ struct smb2_rdma_crypto_transform { #define COMPOUND_FID 0xFFFFFFFFFFFFFFFFULL +#define SMB2_SYMLINK_STRUCT_SIZE \ + (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp)) + #define SYMLINK_ERROR_TAG 0x4c4d5953 struct smb2_symlink_err_rsp { diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 3f740f24b96a7..7818d0b835672 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -53,16 +53,12 @@ extern bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv); extern int smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid); - -extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, - struct smb2_file_all_info *src); extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *path, __u32 *reparse_tag); -extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, - const char *full_path, FILE_ALL_INFO *data, - bool *adjust_tz, bool *symlink); +int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon 
*tcon, const char *full_path, __u64 size, struct cifs_sb_info *cifs_sb, bool set_alloc); @@ -95,9 +91,9 @@ extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, unsigned int *pbytes_read); -extern int smb2_open_file(const unsigned int xid, - struct cifs_open_parms *oparms, - __u32 *oplock, FILE_ALL_INFO *buf); +int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, char **path); +int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, + void *buf); extern int smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); @@ -278,9 +274,9 @@ extern int smb2_query_info_compound(const unsigned int xid, struct kvec *rsp, int *buftype, struct cifs_sb_info *cifs_sb); /* query path info from the server using SMB311 POSIX extensions*/ -extern int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *sb, const char *path, struct smb311_posix_qinfo *qinf, - bool *adjust_tx, bool *symlink); +int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); int posix_info_parse(const void *beg, const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); -- GitLab From 69ccafdd35cdffd72504bfed58dcaee5e73a88a7 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara <pc@cjr.nz> Date: Tue, 4 Oct 2022 15:10:09 -0300 Subject: [PATCH 1911/2223] cifs: fix uninitialised var in smb2_compound_op() Fix uninitialised variable @idata when calling smb2_compound_op() with SMB2_OP_POSIX_QUERY_INFO. 
Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/smb2inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index adf71b328f328..a6640e6ea58bc 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -415,6 +415,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, tcon->tid); break; case SMB2_OP_POSIX_QUERY_INFO: + idata = ptr; if (rc == 0 && cfile && cfile->symlink_target) { idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); if (!idata->symlink_target) -- GitLab From 9ee2afe5207b63b20426ee081f486d831bae871d Mon Sep 17 00:00:00 2001 From: Paulo Alcantara <pc@cjr.nz> Date: Thu, 6 Oct 2022 13:04:05 -0300 Subject: [PATCH 1912/2223] cifs: prevent copying past input buffer boundaries Prevent copying past @data buffer in smb2_validate_and_copy_iov() as the output buffer in @iov might be potentially bigger and thus copying more bytes than requested in @minbufsize. Signed-off-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/smb2pdu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index b3c4d2e54eaa3..a3b77df2848ca 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3485,7 +3485,7 @@ smb2_validate_and_copy_iov(unsigned int offset, unsigned int buffer_length, if (rc) return rc; - memcpy(data, begin_of_buf, buffer_length); + memcpy(data, begin_of_buf, minbufsize); return 0; } @@ -3609,7 +3609,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, rc = smb2_validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), - &rsp_iov, min_len, *data); + &rsp_iov, dlen ? 
*dlen : min_len, *data); if (rc && allocated) { kfree(*data); *data = NULL; -- GitLab From ebe98f1447bbccf8228335c62d86af02a0ed23f7 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg <lsahlber@redhat.com> Date: Thu, 6 Oct 2022 00:14:31 -0500 Subject: [PATCH 1913/2223] cifs: enable caching of directories for which a lease is held This expands the directory caching to now cache an open handle for all directories (up to a maximum) and not just the root directory. In this patch, locking and refcounting is intended to work as so: The main function to get a reference to a cached handle is find_or_create_cached_dir() called from open_cached_dir(). These functions are protected under the cfid_list_lock spin-lock to make sure we do not race creating new references for cached dirs with deletion of expired ones. A successful open_cached_dir() will take out 2 references to the cfid if this was the very first and successful call to open the directory and it acquired a lease from the server. One reference is for the lease and the other is for the cfid that we return. The lease reference is tracked by cfid->has_lease. If the directory already has a handle with an active lease, then we just take out one new reference for the cfid and return it. It can happen that we have a thread that tries to open a cached directory where we have a cfid already but we do not, yet, have a working lease. In this case we will just return NULL, and thus the caller will fall back to the case when no handle was available. In this model the total number of references we have on a cfid is 1 while the handle is open and we have a lease, and one additional reference for each open instance of a cfid. Once we get a lease break (cached_dir_lease_break()) we remove the cfid from the list under the spinlock. This prevents any new threads from using it, and we also call smb2_cached_lease_break() via the work_queue in order to drop the reference we got for the lease (we drop it outside of the spin-lock.) 
Anytime a thread calls close_cached_dir() we also drop a reference to the cfid. When the last reference to the cfid is released smb2_close_cached_fid() will be invoked which will drop the reference to the dentry we held for this cfid and it will also, if the handle is open/has a lease, call SMB2_close() to close the handle on the server. Two events require special handling: invalidate_all_cached_dirs(): this function is called from SMB2_tdis() and cifs_mark_open_files_invalid(). In both cases the tcon is either gone already or will be shortly so we do not need to actually close the handles. They will be dropped server side as part of the tcon dropping. But we have to be careful about a potential race with a concurrent lease break so we need to take out additional references to prevent the cfid from being freed while we are still referencing it. free_cached_dirs(), which is called from tconInfoFree(): this is called quite late in the umount process so there should no longer be any open handles or files and we can just free all the remaining data. 
Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cached_dir.c | 428 +++++++++++++++++++++++++------------------ fs/cifs/cached_dir.h | 20 +- fs/cifs/inode.c | 6 +- fs/cifs/smb2ops.c | 2 +- 4 files changed, 263 insertions(+), 193 deletions(-) diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c index b705dac383f9f..e5573d4e2d83e 100644 --- a/fs/cifs/cached_dir.c +++ b/fs/cifs/cached_dir.c @@ -11,7 +11,53 @@ #include "smb2proto.h" #include "cached_dir.h" -struct cached_fid *init_cached_dir(const char *path); +static struct cached_fid *init_cached_dir(const char *path); +static void free_cached_dir(struct cached_fid *cfid); + +static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, + const char *path, + bool lookup_only) +{ + struct cached_fid *cfid; + + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry(cfid, &cfids->entries, entry) { + if (!strcmp(cfid->path, path)) { + /* + * If it doesn't have a lease it is either not yet + * fully cached or it may be in the process of + * being deleted due to a lease break. + */ + if (!cfid->has_lease) { + spin_unlock(&cfids->cfid_list_lock); + return NULL; + } + kref_get(&cfid->refcount); + spin_unlock(&cfids->cfid_list_lock); + return cfid; + } + } + if (lookup_only) { + spin_unlock(&cfids->cfid_list_lock); + return NULL; + } + if (cfids->num_entries >= MAX_CACHED_FIDS) { + spin_unlock(&cfids->cfid_list_lock); + return NULL; + } + cfid = init_cached_dir(path); + if (cfid == NULL) { + spin_unlock(&cfids->cfid_list_lock); + return NULL; + } + cfid->cfids = cfids; + cfids->num_entries++; + list_add(&cfid->entry, &cfids->entries); + cfid->on_list = true; + kref_get(&cfid->refcount); + spin_unlock(&cfids->cfid_list_lock); + return cfid; +} /* * Open the and cache a directory handle. 
@@ -33,61 +79,65 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; struct kvec qi_iov[1]; int rc, flags = 0; - __le16 utf16_path = 0; /* Null - since an open of top of share */ + __le16 *utf16_path = NULL; u8 oplock = SMB2_OPLOCK_LEVEL_II; struct cifs_fid *pfid; - struct dentry *dentry; + struct dentry *dentry = NULL; struct cached_fid *cfid; + struct cached_fids *cfids; - if (tcon == NULL || tcon->nohandlecache || + + if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache || is_smb1_server(tcon->ses->server)) return -EOPNOTSUPP; ses = tcon->ses; server = ses->server; + cfids = tcon->cfids; + + if (!server->ops->new_lease_key) + return -EIO; if (cifs_sb->root == NULL) return -ENOENT; + /* + * TODO: for better caching we need to find and use the dentry also + * for non-root directories. + */ if (!path[0]) dentry = cifs_sb->root; - else - return -ENOENT; - cfid = tcon->cfids->cfid; - if (cfid == NULL) { - cfid = init_cached_dir(path); - tcon->cfids->cfid = cfid; - } - if (cfid == NULL) + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (!utf16_path) return -ENOMEM; - mutex_lock(&cfid->fid_mutex); - if (cfid->is_valid) { - cifs_dbg(FYI, "found a cached root file handle\n"); + cfid = find_or_create_cached_dir(cfids, path, lookup_only); + if (cfid == NULL) { + kfree(utf16_path); + return -ENOENT; + } + /* + * At this point we either have a lease already and we can just + * return it. If not we are guaranteed to be the only thread accessing + * this cfid. + */ + if (cfid->has_lease) { *ret_cfid = cfid; - kref_get(&cfid->refcount); - mutex_unlock(&cfid->fid_mutex); + kfree(utf16_path); return 0; } /* * We do not hold the lock for the open because in case - * SMB2_open needs to reconnect, it will end up calling - * cifs_mark_open_files_invalid() which takes the lock again - * thus causing a deadlock + * SMB2_open needs to reconnect. 
+ * This is safe because no other thread will be able to get a ref + * to the cfid until we have finished opening the file and (possibly) + * acquired a lease. */ - mutex_unlock(&cfid->fid_mutex); - - if (lookup_only) - return -ENOENT; - if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - if (!server->ops->new_lease_key) - return -EIO; - pfid = &cfid->fid; server->ops->new_lease_key(pfid); @@ -108,7 +158,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, oparms.reconnect = false; rc = SMB2_open_init(tcon, server, - &rqst[0], &oplock, &oparms, &utf16_path); + &rqst[0], &oplock, &oparms, utf16_path); if (rc) goto oshr_free; smb2_set_next_command(tcon, &rqst[0]); @@ -131,47 +181,13 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, rc = compound_send_recv(xid, ses, server, flags, 2, rqst, resp_buftype, rsp_iov); - mutex_lock(&cfid->fid_mutex); - - /* - * Now we need to check again as the cached root might have - * been successfully re-opened from a concurrent process - */ - - if (cfid->is_valid) { - /* work was already done */ - - /* stash fids for close() later */ - struct cifs_fid fid = { - .persistent_fid = pfid->persistent_fid, - .volatile_fid = pfid->volatile_fid, - }; - - /* - * caller expects this func to set the fid in cfid to valid - * cached root, so increment the refcount. 
- */ - kref_get(&cfid->refcount); - - mutex_unlock(&cfid->fid_mutex); - - if (rc == 0) { - /* close extra handle outside of crit sec */ - SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); - } - rc = 0; - goto oshr_free; - } - - /* Cached root is still invalid, continue normaly */ - if (rc) { if (rc == -EREMCHG) { tcon->need_reconnect = true; pr_warn_once("server share %s deleted\n", tcon->tree_name); } - goto oshr_exit; + goto oshr_free; } atomic_inc(&tcon->num_remote_opens); @@ -184,46 +200,54 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, #endif /* CIFS_DEBUG2 */ cfid->tcon = tcon; - cfid->is_valid = true; - cfid->dentry = dentry; - if (dentry) + if (dentry) { + cfid->dentry = dentry; dget(dentry); - kref_init(&cfid->refcount); - + } /* BB TBD check to see if oplock level check can be removed below */ - if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { - /* - * See commit 2f94a3125b87. Increment the refcount when we - * get a lease for root, release it if lease break occurs - */ - kref_get(&cfid->refcount); - cfid->has_lease = true; - smb2_parse_contexts(server, o_rsp, - &oparms.fid->epoch, - oparms.fid->lease_key, &oplock, - NULL, NULL); - } else - goto oshr_exit; + if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) + goto oshr_free; + + + smb2_parse_contexts(server, o_rsp, + &oparms.fid->epoch, + oparms.fid->lease_key, &oplock, + NULL, NULL); qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) - goto oshr_exit; + goto oshr_free; if (!smb2_validate_and_copy_iov( le16_to_cpu(qi_rsp->OutputBufferOffset), sizeof(struct smb2_file_all_info), &rsp_iov[1], sizeof(struct smb2_file_all_info), (char *)&cfid->file_all_info)) cfid->file_all_info_is_valid = true; - cfid->time = jiffies; + cfid->is_open = true; + cfid->has_lease = true; -oshr_exit: - mutex_unlock(&cfid->fid_mutex); oshr_free: + kfree(utf16_path); SMB2_open_free(&rqst[0]); 
SMB2_query_info_free(&rqst[1]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + spin_lock(&cfids->cfid_list_lock); + if (!cfid->has_lease) { + if (cfid->on_list) { + list_del(&cfid->entry); + cfid->on_list = false; + cfids->num_entries--; + } + rc = -ENOENT; + } + spin_unlock(&cfids->cfid_list_lock); + if (rc) { + free_cached_dir(cfid); + cfid = NULL; + } + if (rc == 0) *ret_cfid = cfid; @@ -235,20 +259,22 @@ int open_cached_dir_by_dentry(struct cifs_tcon *tcon, struct cached_fid **ret_cfid) { struct cached_fid *cfid; + struct cached_fids *cfids = tcon->cfids; - cfid = tcon->cfids->cfid; - if (cfid == NULL) + if (cfids == NULL) return -ENOENT; - mutex_lock(&cfid->fid_mutex); - if (cfid->dentry == dentry) { - cifs_dbg(FYI, "found a cached root file handle by dentry\n"); - *ret_cfid = cfid; - kref_get(&cfid->refcount); - mutex_unlock(&cfid->fid_mutex); - return 0; + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry(cfid, &cfids->entries, entry) { + if (dentry && cfid->dentry == dentry) { + cifs_dbg(FYI, "found a cached root file handle by dentry\n"); + kref_get(&cfid->refcount); + *ret_cfid = cfid; + spin_unlock(&cfids->cfid_list_lock); + return 0; + } } - mutex_unlock(&cfid->fid_mutex); + spin_unlock(&cfids->cfid_list_lock); return -ENOENT; } @@ -257,63 +283,29 @@ smb2_close_cached_fid(struct kref *ref) { struct cached_fid *cfid = container_of(ref, struct cached_fid, refcount); - struct cached_dirent *dirent, *q; - if (cfid->is_valid) { - cifs_dbg(FYI, "clear cached root file handle\n"); - SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, - cfid->fid.volatile_fid); + spin_lock(&cfid->cfids->cfid_list_lock); + if (cfid->on_list) { + list_del(&cfid->entry); + cfid->on_list = false; + cfid->cfids->num_entries--; } + spin_unlock(&cfid->cfids->cfid_list_lock); - /* - * We only check validity above to send SMB2_close, - * but we still need to invalidate these entries - * when this function is called - */ - 
cfid->is_valid = false; - cfid->file_all_info_is_valid = false; - cfid->has_lease = false; - if (cfid->dentry) { - dput(cfid->dentry); - cfid->dentry = NULL; - } - /* - * Delete all cached dirent names - */ - mutex_lock(&cfid->dirents.de_mutex); - list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) { - list_del(&dirent->entry); - kfree(dirent->name); - kfree(dirent); + dput(cfid->dentry); + cfid->dentry = NULL; + + if (cfid->is_open) { + SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, + cfid->fid.volatile_fid); } - cfid->dirents.is_valid = 0; - cfid->dirents.is_failed = 0; - cfid->dirents.ctx = NULL; - cfid->dirents.pos = 0; - mutex_unlock(&cfid->dirents.de_mutex); + free_cached_dir(cfid); } void close_cached_dir(struct cached_fid *cfid) { - mutex_lock(&cfid->fid_mutex); kref_put(&cfid->refcount, smb2_close_cached_fid); - mutex_unlock(&cfid->fid_mutex); -} - -void close_cached_dir_lease_locked(struct cached_fid *cfid) -{ - if (cfid->has_lease) { - cfid->has_lease = false; - kref_put(&cfid->refcount, smb2_close_cached_fid); - } -} - -void close_cached_dir_lease(struct cached_fid *cfid) -{ - mutex_lock(&cfid->fid_mutex); - close_cached_dir_lease_locked(cfid); - mutex_unlock(&cfid->fid_mutex); } /* @@ -326,41 +318,62 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) struct cached_fid *cfid; struct cifs_tcon *tcon; struct tcon_link *tlink; + struct cached_fids *cfids; for (node = rb_first(root); node; node = rb_next(node)) { tlink = rb_entry(node, struct tcon_link, tl_rbnode); tcon = tlink_tcon(tlink); if (IS_ERR(tcon)) continue; - cfid = tcon->cfids->cfid; - if (cfid == NULL) + cfids = tcon->cfids; + if (cfids == NULL) continue; - mutex_lock(&cfid->fid_mutex); - if (cfid->dentry) { + list_for_each_entry(cfid, &cfids->entries, entry) { dput(cfid->dentry); cfid->dentry = NULL; } - mutex_unlock(&cfid->fid_mutex); } } /* - * Invalidate and close all cached dirs when a TCON has been reset + * Invalidate all cached dirs when a TCON has been 
reset * due to a session loss. */ void invalidate_all_cached_dirs(struct cifs_tcon *tcon) { - struct cached_fid *cfid = tcon->cfids->cfid; - - if (cfid == NULL) - return; - - mutex_lock(&cfid->fid_mutex); - cfid->is_valid = false; - /* cached handle is not valid, so SMB2_CLOSE won't be sent below */ - close_cached_dir_lease_locked(cfid); - memset(&cfid->fid, 0, sizeof(struct cifs_fid)); - mutex_unlock(&cfid->fid_mutex); + struct cached_fids *cfids = tcon->cfids; + struct cached_fid *cfid, *q; + struct list_head entry; + + INIT_LIST_HEAD(&entry); + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { + list_del(&cfid->entry); + list_add(&cfid->entry, &entry); + cfids->num_entries--; + cfid->is_open = false; + /* To prevent race with smb2_cached_lease_break() */ + kref_get(&cfid->refcount); + } + spin_unlock(&cfids->cfid_list_lock); + + list_for_each_entry_safe(cfid, q, &entry, entry) { + cfid->on_list = false; + list_del(&cfid->entry); + cancel_work_sync(&cfid->lease_break); + if (cfid->has_lease) { + /* + * We lease was never cancelled from the server so we + * need to drop the reference. 
+ */ + spin_lock(&cfids->cfid_list_lock); + cfid->has_lease = false; + spin_unlock(&cfids->cfid_list_lock); + kref_put(&cfid->refcount, smb2_close_cached_fid); + } + /* Drop the extra reference opened above*/ + kref_put(&cfid->refcount, smb2_close_cached_fid); + } } static void @@ -369,51 +382,83 @@ smb2_cached_lease_break(struct work_struct *work) struct cached_fid *cfid = container_of(work, struct cached_fid, lease_break); - close_cached_dir_lease(cfid); + spin_lock(&cfid->cfids->cfid_list_lock); + cfid->has_lease = false; + spin_unlock(&cfid->cfids->cfid_list_lock); + kref_put(&cfid->refcount, smb2_close_cached_fid); } int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]) { - struct cached_fid *cfid = tcon->cfids->cfid; + struct cached_fids *cfids = tcon->cfids; + struct cached_fid *cfid; - if (cfid == NULL) + if (cfids == NULL) return false; - if (cfid->is_valid && - !memcmp(lease_key, - cfid->fid.lease_key, - SMB2_LEASE_KEY_SIZE)) { - cfid->time = 0; - INIT_WORK(&cfid->lease_break, - smb2_cached_lease_break); - queue_work(cifsiod_wq, - &cfid->lease_break); - return true; + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry(cfid, &cfids->entries, entry) { + if (cfid->has_lease && + !memcmp(lease_key, + cfid->fid.lease_key, + SMB2_LEASE_KEY_SIZE)) { + cfid->time = 0; + /* + * We found a lease remove it from the list + * so no threads can access it. 
+ */ + list_del(&cfid->entry); + cfid->on_list = false; + cfids->num_entries--; + + queue_work(cifsiod_wq, + &cfid->lease_break); + spin_unlock(&cfids->cfid_list_lock); + return true; + } } + spin_unlock(&cfids->cfid_list_lock); return false; } -struct cached_fid *init_cached_dir(const char *path) +static struct cached_fid *init_cached_dir(const char *path) { struct cached_fid *cfid; - cfid = kzalloc(sizeof(*cfid), GFP_KERNEL); + cfid = kzalloc(sizeof(*cfid), GFP_ATOMIC); if (!cfid) return NULL; - cfid->path = kstrdup(path, GFP_KERNEL); + cfid->path = kstrdup(path, GFP_ATOMIC); if (!cfid->path) { kfree(cfid); return NULL; } + INIT_WORK(&cfid->lease_break, smb2_cached_lease_break); + INIT_LIST_HEAD(&cfid->entry); INIT_LIST_HEAD(&cfid->dirents.entries); mutex_init(&cfid->dirents.de_mutex); - mutex_init(&cfid->fid_mutex); + spin_lock_init(&cfid->fid_lock); + kref_init(&cfid->refcount); return cfid; } -void free_cached_dir(struct cached_fid *cfid) +static void free_cached_dir(struct cached_fid *cfid) { + struct cached_dirent *dirent, *q; + + dput(cfid->dentry); + cfid->dentry = NULL; + + /* + * Delete all cached dirent names + */ + list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) { + list_del(&dirent->entry); + kfree(dirent->name); + kfree(dirent); + } + kfree(cfid->path); cfid->path = NULL; kfree(cfid); @@ -426,15 +471,34 @@ struct cached_fids *init_cached_dirs(void) cfids = kzalloc(sizeof(*cfids), GFP_KERNEL); if (!cfids) return NULL; - mutex_init(&cfids->cfid_list_mutex); + spin_lock_init(&cfids->cfid_list_lock); + INIT_LIST_HEAD(&cfids->entries); return cfids; } +/* + * Called from tconInfoFree when we are tearing down the tcon. + * There are no active users or open files/directories at this point. 
+ */ void free_cached_dirs(struct cached_fids *cfids) { - if (cfids->cfid) { - free_cached_dir(cfids->cfid); - cfids->cfid = NULL; + struct cached_fid *cfid, *q; + struct list_head entry; + + INIT_LIST_HEAD(&entry); + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { + cfid->on_list = false; + cfid->is_open = false; + list_del(&cfid->entry); + list_add(&cfid->entry, &entry); } + spin_unlock(&cfids->cfid_list_lock); + + list_for_each_entry_safe(cfid, q, &entry, entry) { + list_del(&cfid->entry); + free_cached_dir(cfid); + } + kfree(cfids); } diff --git a/fs/cifs/cached_dir.h b/fs/cifs/cached_dir.h index bdf6c3866653b..e536304ca2ce4 100644 --- a/fs/cifs/cached_dir.h +++ b/fs/cifs/cached_dir.h @@ -31,14 +31,17 @@ struct cached_dirents { }; struct cached_fid { + struct list_head entry; + struct cached_fids *cfids; const char *path; - bool is_valid:1; /* Do we have a useable root fid */ - bool file_all_info_is_valid:1; bool has_lease:1; + bool is_open:1; + bool on_list:1; + bool file_all_info_is_valid:1; unsigned long time; /* jiffies of when lease was taken */ struct kref refcount; struct cifs_fid fid; - struct mutex fid_mutex; + spinlock_t fid_lock; struct cifs_tcon *tcon; struct dentry *dentry; struct work_struct lease_break; @@ -46,9 +49,14 @@ struct cached_fid { struct cached_dirents dirents; }; +#define MAX_CACHED_FIDS 16 struct cached_fids { - struct mutex cfid_list_mutex; - struct cached_fid *cfid; + /* Must be held when: + * - accessing the cfids->entries list + */ + spinlock_t cfid_list_lock; + int num_entries; + struct list_head entries; }; extern struct cached_fids *init_cached_dirs(void); @@ -61,8 +69,6 @@ extern int open_cached_dir_by_dentry(struct cifs_tcon *tcon, struct dentry *dentry, struct cached_fid **cfid); extern void close_cached_dir(struct cached_fid *cfid); -extern void close_cached_dir_lease(struct cached_fid *cfid); -extern void close_cached_dir_lease_locked(struct cached_fid *cfid); extern void 
close_all_cached_dirs(struct cifs_sb_info *cifs_sb); extern void invalidate_all_cached_dirs(struct cifs_tcon *tcon); extern int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index be6dafcb25e31..7cf96e581d243 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2299,13 +2299,13 @@ cifs_dentry_needs_reval(struct dentry *dentry) return true; if (!open_cached_dir_by_dentry(tcon, dentry->d_parent, &cfid)) { - mutex_lock(&cfid->fid_mutex); + spin_lock(&cfid->fid_lock); if (cfid->time && cifs_i->time > cfid->time) { - mutex_unlock(&cfid->fid_mutex); + spin_unlock(&cfid->fid_lock); close_cached_dir(cfid); return false; } - mutex_unlock(&cfid->fid_mutex); + spin_unlock(&cfid->fid_lock); close_cached_dir(cfid); } /* diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 73e951e9858c0..b907d1fab8d98 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -801,7 +801,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, rc = open_cached_dir(xid, tcon, full_path, cifs_sb, true, &cfid); if (!rc) { - if (cfid->is_valid) { + if (cfid->has_lease) { close_cached_dir(cfid); return 0; } -- GitLab From e4029e072673d8a694f660f551609dd4f9265088 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg <lsahlber@redhat.com> Date: Wed, 12 Oct 2022 06:13:03 -0500 Subject: [PATCH 1914/2223] cifs: find and use the dentry for cached non-root directories also This allows us to use cached attributes for the entries in a cached directory for as long as a lease is held on the directory itself. Previously we have always allowed "used cached attributes for 1 second" but this extends this to the lifetime of the lease as well as making the caching safer. 
Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cached_dir.c | 63 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 14 deletions(-) diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c index e5573d4e2d83e..fe88b67c863fe 100644 --- a/fs/cifs/cached_dir.c +++ b/fs/cifs/cached_dir.c @@ -5,6 +5,7 @@ * Copyright (c) 2022, Ronnie Sahlberg <lsahlber@redhat.com> */ +#include <linux/namei.h> #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" @@ -59,6 +60,44 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, return cfid; } +static struct dentry * +path_to_dentry(struct cifs_sb_info *cifs_sb, const char *path) +{ + struct dentry *dentry; + const char *s, *p; + char sep; + + sep = CIFS_DIR_SEP(cifs_sb); + dentry = dget(cifs_sb->root); + s = path; + + do { + struct inode *dir = d_inode(dentry); + struct dentry *child; + + if (!S_ISDIR(dir->i_mode)) { + dput(dentry); + dentry = ERR_PTR(-ENOTDIR); + break; + } + + /* skip separators */ + while (*s == sep) + s++; + if (!*s) + break; + p = s++; + /* next separator */ + while (*s && *s != sep) + s++; + + child = lookup_positive_unlocked(p, dentry, s - p); + dput(dentry); + dentry = child; + } while (!IS_ERR(dentry)); + return dentry; +} + /* * Open the and cache a directory handle. * If error then *cfid is not initialized. @@ -86,7 +125,6 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, struct cached_fid *cfid; struct cached_fids *cfids; - if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache || is_smb1_server(tcon->ses->server)) return -EOPNOTSUPP; @@ -101,13 +139,6 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, if (cifs_sb->root == NULL) return -ENOENT; - /* - * TODO: for better caching we need to find and use the dentry also - * for non-root directories. 
- */ - if (!path[0]) - dentry = cifs_sb->root; - utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) return -ENOMEM; @@ -199,12 +230,6 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId); #endif /* CIFS_DEBUG2 */ - cfid->tcon = tcon; - if (dentry) { - cfid->dentry = dentry; - dget(dentry); - } - /* BB TBD check to see if oplock level check can be removed below */ if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) goto oshr_free; @@ -223,6 +248,16 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, &rsp_iov[1], sizeof(struct smb2_file_all_info), (char *)&cfid->file_all_info)) cfid->file_all_info_is_valid = true; + + if (!path[0]) + dentry = dget(cifs_sb->root); + else { + dentry = path_to_dentry(cifs_sb, path); + if (IS_ERR(dentry)) + goto oshr_free; + } + cfid->dentry = dentry; + cfid->tcon = tcon; cfid->time = jiffies; cfid->is_open = true; cfid->has_lease = true; -- GitLab From d7173623bf0b1503bc4e6f13cd0fccab5e98c6ce Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya <ematsumiya@suse.de> Date: Wed, 12 Oct 2022 22:53:09 -0500 Subject: [PATCH 1915/2223] cifs: use ALIGN() and round_up() macros Improve code readability by using existing macros: Replace hardcoded alignment computations (e.g. (len + 7) & ~0x7) by ALIGN()/IS_ALIGNED() macros. Also replace (DIV_ROUND_UP(len, 8) * 8) with ALIGN(len, 8), which, if not optimized by the compiler, has the overhead of a multiplication and a division. Do the same for roundup() by replacing it by round_up() (division-less version, but requires the multiple to be a power of 2, which is always the case for us). And remove some unnecessary checks where !IS_ALIGNED() would fit, but calling round_up() directly is fine as it's a no-op if the value is already aligned. 
Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de> Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cifssmb.c | 2 +- fs/cifs/connect.c | 11 +++++++++-- fs/cifs/sess.c | 18 ++++++------------ fs/cifs/smb2misc.c | 2 +- fs/cifs/smb2pdu.c | 38 ++++++++++++++++---------------------- 5 files changed, 33 insertions(+), 38 deletions(-) diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 7a808e41b1b89..1724066c15365 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2305,7 +2305,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon, remap); } rename_info->target_name_len = cpu_to_le32(2 * len_of_str); - count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str); + count = sizeof(struct set_file_rename) + (2 * len_of_str); byte_count += count; pSMB->DataCount = cpu_to_le16(count); pSMB->TotalDataCount = pSMB->DataCount; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e158257da1cd7..ffb291579bb9d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2832,9 +2832,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * sessinit is sent but no second negprot */ struct rfc1002_session_packet *ses_init_buf; + unsigned int req_noscope_len; struct smb_hdr *smb_buf; + ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), GFP_KERNEL); + if (ses_init_buf) { ses_init_buf->trailer.session_req.called_len = 32; @@ -2870,8 +2873,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) ses_init_buf->trailer.session_req.scope2 = 0; smb_buf = (struct smb_hdr *)ses_init_buf; - /* sizeof RFC1002_SESSION_REQUEST with no scope */ - smb_buf->smb_buf_length = cpu_to_be32(0x81000044); + /* sizeof RFC1002_SESSION_REQUEST with no scopes */ + req_noscope_len = sizeof(struct rfc1002_session_packet) - 2; + + /* == cpu_to_be32(0x81000044) */ + smb_buf->smb_buf_length = + cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | req_noscope_len); rc = smb_send(server, smb_buf, 0x44); 
kfree(ses_init_buf); /* diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index f1c3c6d9146c3..c9edec7081de7 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -601,11 +601,6 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, /* BB FIXME add check that strings total less than 335 or will need to send them as arrays */ - /* unicode strings, must be word aligned before the call */ -/* if ((long) bcc_ptr % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } */ /* copy user */ if (ses->user_name == NULL) { /* null user mount */ @@ -1324,7 +1319,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data) } if (ses->capabilities & CAP_UNICODE) { - if (sess_data->iov[0].iov_len % 2) { + if (!IS_ALIGNED(sess_data->iov[0].iov_len, 2)) { *bcc_ptr = 0; bcc_ptr++; } @@ -1364,7 +1359,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data) /* no string area to decode, do nothing */ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { /* unicode string area must be word-aligned */ - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) { ++bcc_ptr; --bytes_remaining; } @@ -1448,8 +1443,7 @@ sess_auth_kerberos(struct sess_data *sess_data) if (ses->capabilities & CAP_UNICODE) { /* unicode strings must be word aligned */ - if ((sess_data->iov[0].iov_len - + sess_data->iov[1].iov_len) % 2) { + if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { *bcc_ptr = 0; bcc_ptr++; } @@ -1500,7 +1494,7 @@ sess_auth_kerberos(struct sess_data *sess_data) /* no string area to decode, do nothing */ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { /* unicode string area must be word-aligned */ - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) { ++bcc_ptr; --bytes_remaining; } @@ -1552,7 +1546,7 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) bcc_ptr = sess_data->iov[2].iov_base; /* unicode strings must be 
word aligned */ - if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) { + if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { *bcc_ptr = 0; bcc_ptr++; } @@ -1753,7 +1747,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) /* no string area to decode, do nothing */ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { /* unicode string area must be word-aligned */ - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) { ++bcc_ptr; --bytes_remaining; } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 7db5c09ecceba..a387204779660 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -248,7 +248,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) * Some windows servers (win2016) will pad also the final * PDU in a compound to 8 bytes. */ - if (((calc_len + 7) & ~7) == len) + if (ALIGN(calc_len, 8) == len) return 0; /* diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index a3b77df2848ca..e1162217ad1a6 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -466,15 +466,14 @@ build_signing_ctxt(struct smb2_signing_capabilities *pneg_ctxt) /* * Context Data length must be rounded to multiple of 8 for some servers */ - pneg_ctxt->DataLength = cpu_to_le16(DIV_ROUND_UP( - sizeof(struct smb2_signing_capabilities) - - sizeof(struct smb2_neg_context) + - (num_algs * 2 /* sizeof u16 */), 8) * 8); + pneg_ctxt->DataLength = cpu_to_le16(ALIGN(sizeof(struct smb2_signing_capabilities) - + sizeof(struct smb2_neg_context) + + (num_algs * sizeof(u16)), 8)); pneg_ctxt->SigningAlgorithmCount = cpu_to_le16(num_algs); pneg_ctxt->SigningAlgorithms[0] = cpu_to_le16(SIGNING_ALG_AES_CMAC); - ctxt_len += 2 /* sizeof le16 */ * num_algs; - ctxt_len = DIV_ROUND_UP(ctxt_len, 8) * 8; + ctxt_len += sizeof(__le16) * num_algs; + ctxt_len = ALIGN(ctxt_len, 8); return ctxt_len; /* TBD add SIGNING_ALG_AES_GMAC and/or 
SIGNING_ALG_HMAC_SHA256 */ } @@ -511,8 +510,7 @@ build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname) /* copy up to max of first 100 bytes of server name to NetName field */ pneg_ctxt->DataLength = cpu_to_le16(2 * cifs_strtoUTF16(pneg_ctxt->NetName, hostname, 100, cp)); /* context size is DataLength + minimal smb2_neg_context */ - return DIV_ROUND_UP(le16_to_cpu(pneg_ctxt->DataLength) + - sizeof(struct smb2_neg_context), 8) * 8; + return ALIGN(le16_to_cpu(pneg_ctxt->DataLength) + sizeof(struct smb2_neg_context), 8); } static void @@ -557,18 +555,18 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, * round up total_len of fixed part of SMB3 negotiate request to 8 * byte boundary before adding negotiate contexts */ - *total_len = roundup(*total_len, 8); + *total_len = ALIGN(*total_len, 8); pneg_ctxt = (*total_len) + (char *)req; req->NegotiateContextOffset = cpu_to_le32(*total_len); build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt); - ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8; + ctxt_len = ALIGN(sizeof(struct smb2_preauth_neg_context), 8); *total_len += ctxt_len; pneg_ctxt += ctxt_len; build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); - ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_encryption_neg_context), 8) * 8; + ctxt_len = ALIGN(sizeof(struct smb2_encryption_neg_context), 8); *total_len += ctxt_len; pneg_ctxt += ctxt_len; @@ -595,9 +593,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, if (server->compress_algorithm) { build_compression_ctxt((struct smb2_compression_capabilities_context *) pneg_ctxt); - ctxt_len = DIV_ROUND_UP( - sizeof(struct smb2_compression_capabilities_context), - 8) * 8; + ctxt_len = ALIGN(sizeof(struct smb2_compression_capabilities_context), 8); *total_len += ctxt_len; pneg_ctxt += ctxt_len; neg_context_count++; @@ -780,7 +776,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, if (rc) break; /* offsets must be 8 byte 
aligned */ - clen = (clen + 7) & ~0x7; + clen = ALIGN(clen, 8); offset += clen + sizeof(struct smb2_neg_context); len_of_ctxts -= clen; } @@ -2426,7 +2422,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) unsigned int group_offset = 0; struct smb3_acl acl; - *len = roundup(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8); + *len = round_up(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8); if (set_owner) { /* sizeof(struct owner_group_sids) is already multiple of 8 so no need to round */ @@ -2500,7 +2496,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) memcpy(aclptr, &acl, sizeof(struct smb3_acl)); buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd); - *len = roundup(ptr - (__u8 *)buf, 8); + *len = round_up((unsigned int)(ptr - (__u8 *)buf), 8); return buf; } @@ -2594,7 +2590,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, * final path needs to be 8-byte aligned as specified in * MS-SMB2 2.2.13 SMB2 CREATE Request. 
*/ - *out_size = roundup(*out_len * sizeof(__le16), 8); + *out_size = round_up(*out_len * sizeof(__le16), 8); *out_path = kzalloc(*out_size + sizeof(__le16) /* null */, GFP_KERNEL); if (!*out_path) return -ENOMEM; @@ -2839,9 +2835,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2; /* MUST set path len (NameLength) to 0 opening root of share */ req->NameLength = cpu_to_le16(uni_path_len - 2); - copy_size = uni_path_len; - if (copy_size % 8 != 0) - copy_size = roundup(copy_size, 8); + copy_size = round_up(uni_path_len, 8); copy_path = kzalloc(copy_size, GFP_KERNEL); if (!copy_path) return -ENOMEM; @@ -4103,7 +4097,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, if (request_type & CHAINED_REQUEST) { if (!(request_type & END_OF_CHAIN)) { /* next 8-byte aligned request */ - *total_len = DIV_ROUND_UP(*total_len, 8) * 8; + *total_len = ALIGN(*total_len, 8); shdr->NextCommand = cpu_to_le32(*total_len); } else /* END_OF_CHAIN */ shdr->NextCommand = 0; -- GitLab From 3cebf80e9a0d3adcb174053be32c88a640b3344b Mon Sep 17 00:00:00 2001 From: Fangrui Song <maskray@google.com> Date: Sun, 18 Sep 2022 02:29:34 -0700 Subject: [PATCH 1916/2223] riscv: Pass -mno-relax only on lld < 15.0.0 lld since llvm:6611d58f5bbc ("[ELF] Relax R_RISCV_ALIGN"), which will be included in the 15.0.0 release, has implemented some RISC-V linker relaxation. 
-mno-relax is no longer needed in KBUILD_CFLAGS/KBUILD_AFLAGS to suppress R_RISCV_ALIGN which older lld can not handle: ld.lld: error: capability.c:(.fixup+0x0): relocation R_RISCV_ALIGN requires unimplemented linker relaxation; recompile with -mno-relax but the .o is already compiled with -mno-relax Signed-off-by: Fangrui Song <maskray@google.com> Link: https://lore.kernel.org/r/20220710071117.446112-1-maskray@google.com/ Link: https://lore.kernel.org/r/20220918092933.19943-1-palmer@rivosinc.com Reviewed-by: Nick Desaulniers <ndesaulniers@google.com> Tested-by: Nick Desaulniers <ndesaulniers@google.com> Tested-by: Nathan Chancellor <nathan@kernel.org> Tested-by: Conor Dooley <conor.dooley@microchip.com> Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- arch/riscv/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index d63295e213731..76364cf67a72e 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -37,6 +37,7 @@ else endif ifeq ($(CONFIG_LD_IS_LLD),y) +ifeq ($(shell test $(CONFIG_LLD_VERSION) -lt 150000; echo $$?),0) KBUILD_CFLAGS += -mno-relax KBUILD_AFLAGS += -mno-relax ifndef CONFIG_AS_IS_LLVM @@ -44,6 +45,7 @@ ifndef CONFIG_AS_IS_LLVM KBUILD_AFLAGS += -Wa,-mno-relax endif endif +endif # ISA string setting riscv-march-$(CONFIG_ARCH_RV32I) := rv32ima -- GitLab From e47bddcb2ec531022a915f896f13586470b593d0 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner <heiko@sntech.de> Date: Mon, 5 Sep 2022 13:10:23 +0200 Subject: [PATCH 1917/2223] riscv: cleanup svpbmt cpufeature probing For better readability (and compile time coverage) use IS_ENABLED instead of ifdef and drop the new unneeded switch statement. 
Signed-off-by: Heiko Stuebner <heiko@sntech.de> Reviewed-by: Guo Ren <guoren@kernel.org> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Reviewed-by: Andrew Jones <ajones@ventanamicro.com> Link: https://lore.kernel.org/r/20220905111027.2463297-2-heiko@sntech.de Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- arch/riscv/kernel/cpufeature.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 553d755483ed6..764ea220161f6 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -253,16 +253,13 @@ void __init riscv_fill_hwcap(void) #ifdef CONFIG_RISCV_ALTERNATIVE static bool __init_or_module cpufeature_probe_svpbmt(unsigned int stage) { -#ifdef CONFIG_RISCV_ISA_SVPBMT - switch (stage) { - case RISCV_ALTERNATIVES_EARLY_BOOT: + if (!IS_ENABLED(CONFIG_RISCV_ISA_SVPBMT)) return false; - default: - return riscv_isa_extension_available(NULL, SVPBMT); - } -#endif - return false; + if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) + return false; + + return riscv_isa_extension_available(NULL, SVPBMT); } static bool __init_or_module cpufeature_probe_zicbom(unsigned int stage) -- GitLab From f055268e3946555deb9bb80b2c8c9798c64dbc47 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner <heiko@sntech.de> Date: Mon, 5 Sep 2022 13:10:24 +0200 Subject: [PATCH 1918/2223] riscv: drop some ifdefs from CMO initialization Wrapping things in #ifdefs makes the code harder to read while we also have IS_ENABLED() macros to do this in regular code and the extension detection is not _that_ runtime critical. So define a stub for riscv_noncoherent_supported() in the non-CONFIG_RISCV_DMA_NONCOHERENT case and move the code to use IS_ENABLED.
Suggested-by: Conor Dooley <conor.dooley@microchip.com> Signed-off-by: Heiko Stuebner <heiko@sntech.de> Reviewed-by: Guo Ren <guoren@kernel.org> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Reviewed-by: Andrew Jones <ajones@ventanamicro.com> Link: https://lore.kernel.org/r/20220905111027.2463297-3-heiko@sntech.de Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- arch/riscv/errata/thead/errata.c | 7 +++---- arch/riscv/include/asm/cacheflush.h | 2 ++ arch/riscv/kernel/cpufeature.c | 22 +++++++++------------- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c index 202c83f677b2e..bffa711aaf647 100644 --- a/arch/riscv/errata/thead/errata.c +++ b/arch/riscv/errata/thead/errata.c @@ -30,7 +30,9 @@ static bool errata_probe_pbmt(unsigned int stage, static bool errata_probe_cmo(unsigned int stage, unsigned long arch_id, unsigned long impid) { -#ifdef CONFIG_ERRATA_THEAD_CMO + if (!IS_ENABLED(CONFIG_ERRATA_THEAD_CMO)) + return false; + if (arch_id != 0 || impid != 0) return false; @@ -39,9 +41,6 @@ static bool errata_probe_cmo(unsigned int stage, riscv_noncoherent_supported(); return true; -#else - return false; -#endif } static u32 thead_errata_probe(unsigned int stage, diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index a60acaecfedab..4363d0beb38a1 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -50,6 +50,8 @@ static inline void riscv_init_cbom_blocksize(void) { } #ifdef CONFIG_RISCV_DMA_NONCOHERENT void riscv_noncoherent_supported(void); +#else +static inline void riscv_noncoherent_supported(void) {} #endif /* diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 764ea220161f6..729f7a218093a 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -264,21 +264,17 @@ static bool __init_or_module cpufeature_probe_svpbmt(unsigned int 
stage) static bool __init_or_module cpufeature_probe_zicbom(unsigned int stage) { -#ifdef CONFIG_RISCV_ISA_ZICBOM - switch (stage) { - case RISCV_ALTERNATIVES_EARLY_BOOT: + if (!IS_ENABLED(CONFIG_RISCV_ISA_ZICBOM)) + return false; + + if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) + return false; + + if (!riscv_isa_extension_available(NULL, ZICBOM)) return false; - default: - if (riscv_isa_extension_available(NULL, ZICBOM)) { - riscv_noncoherent_supported(); - return true; - } else { - return false; - } - } -#endif - return false; + riscv_noncoherent_supported(); + return true; } /* -- GitLab From 499590c084f13b6aca225e5766edeebd48437ee8 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner <heiko@sntech.de> Date: Mon, 5 Sep 2022 13:10:25 +0200 Subject: [PATCH 1919/2223] riscv: use BIT() macros in t-head errata init Using the appropriate BIT macro makes the code better readable. Suggested-by: Conor Dooley <conor.dooley@microchip.com> Signed-off-by: Heiko Stuebner <heiko@sntech.de> Reviewed-by: Guo Ren <guoren@kernel.org> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Reviewed-by: Andrew Jones <ajones@ventanamicro.com> Link: https://lore.kernel.org/r/20220905111027.2463297-4-heiko@sntech.de Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- arch/riscv/errata/thead/errata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c index bffa711aaf647..a6f4bd8ccf3f8 100644 --- a/arch/riscv/errata/thead/errata.c +++ b/arch/riscv/errata/thead/errata.c @@ -49,10 +49,10 @@ static u32 thead_errata_probe(unsigned int stage, u32 cpu_req_errata = 0; if (errata_probe_pbmt(stage, archid, impid)) - cpu_req_errata |= (1U << ERRATA_THEAD_PBMT); + cpu_req_errata |= BIT(ERRATA_THEAD_PBMT); if (errata_probe_cmo(stage, archid, impid)) - cpu_req_errata |= (1U << ERRATA_THEAD_CMO); + cpu_req_errata |= BIT(ERRATA_THEAD_CMO); return cpu_req_errata; } -- GitLab From e283187c034cd80c1dd98ad732c73ce930a5efa4 
Mon Sep 17 00:00:00 2001 From: Heiko Stuebner <heiko@sntech.de> Date: Mon, 5 Sep 2022 13:10:26 +0200 Subject: [PATCH 1920/2223] riscv: use BIT() macro for cpufeature probing Using the appropriate BIT macro makes the code more readable. Suggested-by: Conor Dooley <conor.dooley@microchip.com> Signed-off-by: Heiko Stuebner <heiko@sntech.de> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/20220905111027.2463297-5-heiko@sntech.de Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- arch/riscv/kernel/cpufeature.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 729f7a218093a..08f7445985dc9 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -289,10 +289,10 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage) u32 cpu_req_feature = 0; if (cpufeature_probe_svpbmt(stage)) - cpu_req_feature |= (1U << CPUFEATURE_SVPBMT); + cpu_req_feature |= BIT(CPUFEATURE_SVPBMT); if (cpufeature_probe_zicbom(stage)) - cpu_req_feature |= (1U << CPUFEATURE_ZICBOM); + cpu_req_feature |= BIT(CPUFEATURE_ZICBOM); return cpu_req_feature; } -- GitLab From 14057733109dcc83c35a6730f3b7112aac4d2b82 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner <heiko@sntech.de> Date: Mon, 5 Sep 2022 13:10:27 +0200 Subject: [PATCH 1921/2223] riscv: check for kernel config option in t-head memory types errata The t-head variant of page-based memory types should also check first for the enabled kernel config option.
Fixes: a35707c3d850 ("riscv: add memory-type errata for T-Head") Signed-off-by: Heiko Stuebner <heiko@sntech.de> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Reviewed-by: Andrew Jones <ajones@ventanamicro.com> Reviewed-by: Guo Ren <guoren@kernel.org> Link: https://lore.kernel.org/r/20220905111027.2463297-6-heiko@sntech.de Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- arch/riscv/errata/thead/errata.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c index a6f4bd8ccf3f8..902e124528219 100644 --- a/arch/riscv/errata/thead/errata.c +++ b/arch/riscv/errata/thead/errata.c @@ -17,6 +17,9 @@ static bool errata_probe_pbmt(unsigned int stage, unsigned long arch_id, unsigned long impid) { + if (!IS_ENABLED(CONFIG_ERRATA_THEAD_PBMT)) + return false; + if (arch_id != 0 || impid != 0) return false; -- GitLab From 917c362b5f8a6e31ff35719b1bacfc1b76a1fd2f Mon Sep 17 00:00:00 2001 From: Frank Rowand <frank.rowand@sony.com> Date: Wed, 12 Oct 2022 17:05:48 -0500 Subject: [PATCH 1922/2223] MAINTAINERS: of: collapse overlay entry into main device tree entry Pantelis has not been active in recent years so no need to maintain a separate entry for device tree overlays. 
Signed-off-by: Frank Rowand <frank.rowand@sony.com> Link: https://lore.kernel.org/r/20221012220548.4163865-1-frowand.list@gmail.com Signed-off-by: Rob Herring <robh@kernel.org> --- MAINTAINERS | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index a198da9861460..a497bd2b0ea66 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15359,17 +15359,6 @@ L: linux-rdma@vger.kernel.org S: Supported F: drivers/infiniband/ulp/opa_vnic -OPEN FIRMWARE AND DEVICE TREE OVERLAYS -M: Pantelis Antoniou <pantelis.antoniou@konsulko.com> -M: Frank Rowand <frowand.list@gmail.com> -L: devicetree@vger.kernel.org -S: Maintained -F: Documentation/devicetree/dynamic-resolution-notes.rst -F: Documentation/devicetree/overlay-notes.rst -F: drivers/of/overlay.c -F: drivers/of/resolver.c -K: of_overlay_notifier_ - OPEN FIRMWARE AND FLATTENED DEVICE TREE M: Rob Herring <robh+dt@kernel.org> M: Frank Rowand <frowand.list@gmail.com> @@ -15382,6 +15371,9 @@ F: Documentation/ABI/testing/sysfs-firmware-ofw F: drivers/of/ F: include/linux/of*.h F: scripts/dtc/ +K: of_overlay_notifier_ +K: of_overlay_fdt_apply +K: of_overlay_remove OPEN FIRMWARE AND FLATTENED DEVICE TREE BINDINGS M: Rob Herring <robh+dt@kernel.org> -- GitLab From 93c128e709aec23b10f3a2f78a824080d4085318 Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@kernel.org> Date: Wed, 12 Oct 2022 14:42:54 -0400 Subject: [PATCH 1923/2223] nfsd: ensure we always call fh_verify_error tracepoint This is a conditional tracepoint. Call it every time, not just when nfsd_permission fails.
Signed-off-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: Chuck Lever <chuck.lever@oracle.com> --- fs/nfsd/nfsfh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index d73434200df98..8c52b6c9d31a2 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -392,8 +392,8 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) skip_pseudoflavor_check: /* Finally, check access permissions. */ error = nfsd_permission(rqstp, exp, dentry, access); - trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error); out: + trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error); if (error == nfserr_stale) nfsd_stats_fh_stale_inc(exp); return error; -- GitLab From 740ea3c4a0b2e326b23d7cdf05472a0e92aa39bc Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima <kuniyu@amazon.com> Date: Wed, 12 Oct 2022 07:50:36 -0700 Subject: [PATCH 1924/2223] tcp: Clean up kernel listener's reqsk in inet_twsk_purge() Eric Dumazet reported a use-after-free related to the per-netns ehash series. [0] When we create a TCP socket from userspace, the socket always holds a refcnt of the netns. This guarantees that a reqsk timer is always fired before netns dismantle. Each reqsk has a refcnt of its listener, so the listener is not freed before the reqsk, and the net is not freed before the listener as well. OTOH, when in-kernel users create a TCP socket, it might not hold a refcnt of its netns. Thus, a reqsk timer can be fired after the netns dismantle and access freed per-netns ehash. To avoid the use-after-free, we need to clean up TCP_NEW_SYN_RECV sockets in inet_twsk_purge() if the netns uses a per-netns ehash. 
[0]: https://lore.kernel.org/netdev/CANn89iLXMup0dRD_Ov79Xt8N9FM0XdhCHEN05sf3eLwxKweM6w@mail.gmail.com/ BUG: KASAN: use-after-free in tcp_or_dccp_get_hashinfo include/net/inet_hashtables.h:181 [inline] BUG: KASAN: use-after-free in reqsk_queue_unlink+0x320/0x350 net/ipv4/inet_connection_sock.c:913 Read of size 8 at addr ffff88807545bd80 by task syz-executor.2/8301 CPU: 1 PID: 8301 Comm: syz-executor.2 Not tainted 6.0.0-syzkaller-02757-gaf7d23f9d96a #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/22/2022 Call Trace: <IRQ> __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:317 [inline] print_report.cold+0x2ba/0x719 mm/kasan/report.c:433 kasan_report+0xb1/0x1e0 mm/kasan/report.c:495 tcp_or_dccp_get_hashinfo include/net/inet_hashtables.h:181 [inline] reqsk_queue_unlink+0x320/0x350 net/ipv4/inet_connection_sock.c:913 inet_csk_reqsk_queue_drop net/ipv4/inet_connection_sock.c:927 [inline] inet_csk_reqsk_queue_drop_and_put net/ipv4/inet_connection_sock.c:939 [inline] reqsk_timer_handler+0x724/0x1160 net/ipv4/inet_connection_sock.c:1053 call_timer_fn+0x1a0/0x6b0 kernel/time/timer.c:1474 expire_timers kernel/time/timer.c:1519 [inline] __run_timers.part.0+0x674/0xa80 kernel/time/timer.c:1790 __run_timers kernel/time/timer.c:1768 [inline] run_timer_softirq+0xb3/0x1d0 kernel/time/timer.c:1803 __do_softirq+0x1d0/0x9c8 kernel/softirq.c:571 invoke_softirq kernel/softirq.c:445 [inline] __irq_exit_rcu+0x123/0x180 kernel/softirq.c:650 irq_exit_rcu+0x5/0x20 kernel/softirq.c:662 sysvec_apic_timer_interrupt+0x93/0xc0 arch/x86/kernel/apic/apic.c:1107 </IRQ> Fixes: d1e5e6408b30 ("tcp: Introduce optional per-netns ehash.") Reported-by: syzbot <syzkaller@googlegroups.com> Reported-by: Eric Dumazet <edumazet@google.com> Suggested-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Link: 
https://lore.kernel.org/r/20221012145036.74960-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/ipv4/inet_timewait_sock.c | 15 ++++++++++++++- net/ipv4/tcp_minisocks.c | 9 +++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 71d3bb0abf6c5..66fc940f9521a 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -268,8 +268,21 @@ restart_rcu: rcu_read_lock(); restart: sk_nulls_for_each_rcu(sk, node, &head->chain) { - if (sk->sk_state != TCP_TIME_WAIT) + if (sk->sk_state != TCP_TIME_WAIT) { + /* A kernel listener socket might not hold refcnt for net, + * so reqsk_timer_handler() could be fired after net is + * freed. Userspace listener and reqsk never exist here. + */ + if (unlikely(sk->sk_state == TCP_NEW_SYN_RECV && + hashinfo->pernet)) { + struct request_sock *req = inet_reqsk(sk); + + inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); + } + continue; + } + tw = inet_twsk(sk); if ((tw->tw_family != family) || refcount_read(&twsk_net(tw)->ns.count)) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 79f30f026d895..c375f603a16cf 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -353,13 +353,14 @@ void tcp_twsk_purge(struct list_head *net_exit_list, int family) struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { - /* The last refcount is decremented in tcp_sk_exit_batch() */ - if (refcount_read(&net->ipv4.tcp_death_row.tw_refcount) == 1) - continue; - if (net->ipv4.tcp_death_row.hashinfo->pernet) { + /* Even if tw_refcount == 1, we must clean up kernel reqsk */ inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family); } else if (!purged_once) { + /* The last refcount is decremented in tcp_sk_exit_batch() */ + if (refcount_read(&net->ipv4.tcp_death_row.tw_refcount) == 1) + continue; + inet_twsk_purge(&tcp_hashinfo, family); purged_once = true; } -- GitLab From 
ec7eede369fe5b0d085ac51fdbb95184f87bfc6c Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Wed, 12 Oct 2022 13:34:12 +0000 Subject: [PATCH 1925/2223] kcm: avoid potential race in kcm_tx_work syzbot found that kcm_tx_work() could crash [1] in: /* Primarily for SOCK_SEQPACKET sockets */ if (likely(sk->sk_socket) && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { <<*>> clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_space(sk); } I think the reason is that another thread might concurrently run in kcm_release() and call sock_orphan(sk) while sk is not locked. kcm_tx_work() then finds sk->sk_socket to be NULL. [1] BUG: KASAN: null-ptr-deref in instrument_atomic_write include/linux/instrumented.h:86 [inline] BUG: KASAN: null-ptr-deref in clear_bit include/asm-generic/bitops/instrumented-atomic.h:41 [inline] BUG: KASAN: null-ptr-deref in kcm_tx_work+0xff/0x160 net/kcm/kcmsock.c:742 Write of size 8 at addr 0000000000000008 by task kworker/u4:3/53 CPU: 0 PID: 53 Comm: kworker/u4:3 Not tainted 5.19.0-rc3-next-20220621-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: kkcmd kcm_tx_work Call Trace: <TASK> __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 kasan_report+0xbe/0x1f0 mm/kasan/report.c:495 check_region_inline mm/kasan/generic.c:183 [inline] kasan_check_range+0x13d/0x180 mm/kasan/generic.c:189 instrument_atomic_write include/linux/instrumented.h:86 [inline] clear_bit include/asm-generic/bitops/instrumented-atomic.h:41 [inline] kcm_tx_work+0xff/0x160 net/kcm/kcmsock.c:742 process_one_work+0x996/0x1610 kernel/workqueue.c:2289 worker_thread+0x665/0x1080 kernel/workqueue.c:2436 kthread+0x2e9/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:302 </TASK> Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <tom@herbertland.com> Link: https://lore.kernel.org/r/20221012133412.519394-1-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/kcm/kcmsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 1215c863e1c41..27725464ec08f 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1838,10 +1838,10 @@ static int kcm_release(struct socket *sock) kcm = kcm_sk(sk); mux = kcm->mux; + lock_sock(sk); sock_orphan(sk); kfree_skb(kcm->seq_skb); - lock_sock(sk); /* Purge queue under lock to avoid race condition with tx_work trying * to act when queue is nonempty. If tx_work runs after this point * it will just return. -- GitLab From 30e9672ac37f7b8b9e1379d25882798d8e76a96f Mon Sep 17 00:00:00 2001 From: Dan Carpenter <dan.carpenter@oracle.com> Date: Wed, 12 Oct 2022 18:00:59 +0300 Subject: [PATCH 1926/2223] net: marvell: prestera: fix a couple NULL vs IS_ERR() checks The __prestera_nexthop_group_create() function returns NULL on error and the prestera_nexthop_group_get() returns error pointers. Fix these two checks. 
Fixes: 0a23ae237171 ("net: marvell: prestera: Add router nexthops ABI") Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com> Link: https://lore.kernel.org/r/Y0bWq+7DoKK465z8@kili Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/ethernet/marvell/prestera/prestera_router_hw.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c index 4f65df0ae5e87..aa080dc57ff00 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c @@ -498,8 +498,8 @@ prestera_nexthop_group_get(struct prestera_switch *sw, refcount_inc(&nh_grp->refcount); } else { nh_grp = __prestera_nexthop_group_create(sw, key); - if (IS_ERR(nh_grp)) - return ERR_CAST(nh_grp); + if (!nh_grp) + return ERR_PTR(-ENOMEM); refcount_set(&nh_grp->refcount, 1); } @@ -651,7 +651,7 @@ prestera_fib_node_create(struct prestera_switch *sw, case PRESTERA_FIB_TYPE_UC_NH: fib_node->info.nh_grp = prestera_nexthop_group_get(sw, nh_grp_key); - if (!fib_node->info.nh_grp) + if (IS_ERR(fib_node->info.nh_grp)) goto err_nh_grp_get; grp_id = fib_node->info.nh_grp->grp_id; -- GitLab From 99df45c9e0a43b1b88dab294265e2be4a040a441 Mon Sep 17 00:00:00 2001 From: Dan Carpenter <dan.carpenter@oracle.com> Date: Wed, 12 Oct 2022 18:01:32 +0300 Subject: [PATCH 1927/2223] sunhme: fix an IS_ERR() vs NULL check in probe The devm_request_region() function does not return error pointers, it returns NULL on error. 
Fixes: 914d9b2711dd ("sunhme: switch to devres") Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com> Reviewed-by: Sean Anderson <seanga2@gmail.com> Reviewed-by: Rolf Eike Beer <eike-kernel@sf-tec.de> Link: https://lore.kernel.org/r/Y0bWzJL8JknX8MUf@kili Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/ethernet/sun/sunhme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c index 62deed210a957..91f10f746dffd 100644 --- a/drivers/net/ethernet/sun/sunhme.c +++ b/drivers/net/ethernet/sun/sunhme.c @@ -2896,8 +2896,8 @@ static int happy_meal_pci_probe(struct pci_dev *pdev, hpreg_res = devm_request_region(&pdev->dev, pci_resource_start(pdev, 0), pci_resource_len(pdev, 0), DRV_NAME); - if (IS_ERR(hpreg_res)) { - err = PTR_ERR(hpreg_res); + if (!hpreg_res) { + err = -EBUSY; dev_err(&pdev->dev, "Cannot obtain PCI resources, aborting.\n"); goto err_out_clear_quattro; } -- GitLab From 877d95dcfd0a56102d4b97a9691115f5fb5e9ea3 Mon Sep 17 00:00:00 2001 From: Pierre Gondois <pierre.gondois@arm.com> Date: Thu, 6 Oct 2022 10:44:09 +0200 Subject: [PATCH 1928/2223] Documentation: rtla: Correct command line example The '-t/-T' parameters seem to have been swapped: -t/--trace[=file]: save the stopped trace to [file|timerlat_trace.txt] -T/--thread us: stop trace if the thread latency is higher than the argument in us Swap them back. 
Signed-off-by: Pierre Gondois <pierre.gondois@arm.com> Acked-by: Daniel Bristot de Oliveira <bristot@kernel.org> Link: https://lore.kernel.org/r/20221006084409.3882542-1-pierre.gondois@arm.com Signed-off-by: Jonathan Corbet <corbet@lwn.net> --- Documentation/tools/rtla/rtla-timerlat-top.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/tools/rtla/rtla-timerlat-top.rst b/Documentation/tools/rtla/rtla-timerlat-top.rst index 1c321de1c171e..7c4e4b1094933 100644 --- a/Documentation/tools/rtla/rtla-timerlat-top.rst +++ b/Documentation/tools/rtla/rtla-timerlat-top.rst @@ -39,7 +39,7 @@ higher than *30 us*. It is also set to stop the session if a *Thread* timer latency higher than *30 us* is hit. Finally, it is set to save the trace buffer if the stop condition is hit:: - [root@alien ~]# rtla timerlat top -s 30 -t 30 -T + [root@alien ~]# rtla timerlat top -s 30 -T 30 -t Timer Latency 0 00:00:59 | IRQ Timer Latency (us) | Thread Timer Latency (us) CPU COUNT | cur min avg max | cur min avg max -- GitLab From 44dce4b084f83f41922ed8c2a2c7d148254848bb Mon Sep 17 00:00:00 2001 From: Zong Li <zong.li@sifive.com> Date: Tue, 13 Sep 2022 06:18:11 +0000 Subject: [PATCH 1929/2223] dt-bindings: sifive-ccache: change Sifive L2 cache to Composable cache Since composable cache may be L3 cache if private L2 cache exists, we should use its original name Composable cache to prevent confusion. 
Signed-off-by: Zong Li <zong.li@sifive.com> Suggested-by: Conor Dooley <conor.dooley@microchip.com> Suggested-by: Ben Dooks <ben.dooks@sifive.com> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Reviewed-by: Rob Herring <robh@kernel.org> Link: https://lore.kernel.org/r/20220913061817.22564-2-zong.li@sifive.com Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- ...five-l2-cache.yaml => sifive,ccache0.yaml} | 28 +++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) rename Documentation/devicetree/bindings/riscv/{sifive-l2-cache.yaml => sifive,ccache0.yaml} (83%) diff --git a/Documentation/devicetree/bindings/riscv/sifive-l2-cache.yaml b/Documentation/devicetree/bindings/riscv/sifive,ccache0.yaml similarity index 83% rename from Documentation/devicetree/bindings/riscv/sifive-l2-cache.yaml rename to Documentation/devicetree/bindings/riscv/sifive,ccache0.yaml index ca3b9be580584..bf3f07421f7e5 100644 --- a/Documentation/devicetree/bindings/riscv/sifive-l2-cache.yaml +++ b/Documentation/devicetree/bindings/riscv/sifive,ccache0.yaml @@ -2,18 +2,18 @@ # Copyright (C) 2020 SiFive, Inc. %YAML 1.2 --- -$id: http://devicetree.org/schemas/riscv/sifive-l2-cache.yaml# +$id: http://devicetree.org/schemas/riscv/sifive,ccache0.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# -title: SiFive L2 Cache Controller +title: SiFive Composable Cache Controller maintainers: - Sagar Kadam <sagar.kadam@sifive.com> - Paul Walmsley <paul.walmsley@sifive.com> description: - The SiFive Level 2 Cache Controller is used to provide access to fast copies - of memory for masters in a Core Complex. The Level 2 Cache Controller also + The SiFive Composable Cache Controller is used to provide access to fast copies + of memory for masters in a Core Complex. The Composable Cache Controller also acts as directory-based coherency manager. All the properties in ePAPR/DeviceTree specification applies for this platform. 
@@ -22,6 +22,7 @@ select: compatible: contains: enum: + - sifive,ccache0 - sifive,fu540-c000-ccache - sifive,fu740-c000-ccache @@ -33,6 +34,7 @@ properties: oneOf: - items: - enum: + - sifive,ccache0 - sifive,fu540-c000-ccache - sifive,fu740-c000-ccache - const: cache @@ -45,7 +47,7 @@ properties: const: 64 cache-level: - const: 2 + enum: [2, 3] cache-sets: enum: [1024, 2048] @@ -115,6 +117,22 @@ allOf: cache-sets: const: 1024 + - if: + properties: + compatible: + contains: + const: sifive,ccache0 + + then: + properties: + cache-level: + enum: [2, 3] + + else: + properties: + cache-level: + const: 2 + additionalProperties: false required: -- GitLab From ca120a79cf5a3323172c82e77efd70ae10d120ef Mon Sep 17 00:00:00 2001 From: Greentime Hu <greentime.hu@sifive.com> Date: Tue, 13 Sep 2022 06:18:12 +0000 Subject: [PATCH 1930/2223] soc: sifive: ccache: Rename SiFive L2 cache to Composable cache. Since the composable cache may be an L3 cache if there is an L2 cache, we should use its original name, composable cache, to prevent confusion. Some new lines were generated by adding the compatible "sifive,ccache0" to the ID table and by indentation requirements. Since the SiFive L2 has been renamed to SiFive CCACHE, the EDAC driver needs to apply the change as well.
Signed-off-by: Greentime Hu <greentime.hu@sifive.com> Signed-off-by: Zong Li <zong.li@sifive.com> Co-developed-by: Zong Li <zong.li@sifive.com> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/20220913061817.22564-3-zong.li@sifive.com Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- drivers/edac/Kconfig | 2 +- drivers/edac/sifive_edac.c | 12 +- drivers/soc/sifive/Kconfig | 6 +- drivers/soc/sifive/Makefile | 2 +- drivers/soc/sifive/sifive_ccache.c | 245 +++++++++++++++++++++++++++ drivers/soc/sifive/sifive_l2_cache.c | 237 -------------------------- include/soc/sifive/sifive_ccache.h | 16 ++ include/soc/sifive/sifive_l2_cache.h | 16 -- 8 files changed, 272 insertions(+), 264 deletions(-) create mode 100644 drivers/soc/sifive/sifive_ccache.c delete mode 100644 drivers/soc/sifive/sifive_l2_cache.c create mode 100644 include/soc/sifive/sifive_ccache.h delete mode 100644 include/soc/sifive/sifive_l2_cache.h diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 17562cf1fe973..456602d373b7b 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -473,7 +473,7 @@ config EDAC_ALTERA_SDMMC config EDAC_SIFIVE bool "Sifive platform EDAC driver" - depends on EDAC=y && SIFIVE_L2 + depends on EDAC=y && SIFIVE_CCACHE help Support for error detection and correction on the SiFive SoCs. diff --git a/drivers/edac/sifive_edac.c b/drivers/edac/sifive_edac.c index ee800aec7d479..b844e2626fd50 100644 --- a/drivers/edac/sifive_edac.c +++ b/drivers/edac/sifive_edac.c @@ -2,7 +2,7 @@ /* * SiFive Platform EDAC Driver * - * Copyright (C) 2018-2019 SiFive, Inc. + * Copyright (C) 2018-2022 SiFive, Inc. 
* * This driver is partially based on octeon_edac-pc.c * @@ -10,7 +10,7 @@ #include <linux/edac.h> #include <linux/platform_device.h> #include "edac_module.h" -#include <soc/sifive/sifive_l2_cache.h> +#include <soc/sifive/sifive_ccache.h> #define DRVNAME "sifive_edac" @@ -32,9 +32,9 @@ int ecc_err_event(struct notifier_block *this, unsigned long event, void *ptr) p = container_of(this, struct sifive_edac_priv, notifier); - if (event == SIFIVE_L2_ERR_TYPE_UE) + if (event == SIFIVE_CCACHE_ERR_TYPE_UE) edac_device_handle_ue(p->dci, 0, 0, msg); - else if (event == SIFIVE_L2_ERR_TYPE_CE) + else if (event == SIFIVE_CCACHE_ERR_TYPE_CE) edac_device_handle_ce(p->dci, 0, 0, msg); return NOTIFY_OK; @@ -67,7 +67,7 @@ static int ecc_register(struct platform_device *pdev) goto err; } - register_sifive_l2_error_notifier(&p->notifier); + register_sifive_ccache_error_notifier(&p->notifier); return 0; @@ -81,7 +81,7 @@ static int ecc_unregister(struct platform_device *pdev) { struct sifive_edac_priv *p = platform_get_drvdata(pdev); - unregister_sifive_l2_error_notifier(&p->notifier); + unregister_sifive_ccache_error_notifier(&p->notifier); edac_device_del_device(&pdev->dev); edac_device_free_ctl_info(p->dci); diff --git a/drivers/soc/sifive/Kconfig b/drivers/soc/sifive/Kconfig index 58cf8c40d08d5..ed4c571f8771b 100644 --- a/drivers/soc/sifive/Kconfig +++ b/drivers/soc/sifive/Kconfig @@ -2,9 +2,9 @@ if SOC_SIFIVE -config SIFIVE_L2 - bool "Sifive L2 Cache controller" +config SIFIVE_CCACHE + bool "Sifive Composable Cache controller" help - Support for the L2 cache controller on SiFive platforms. + Support for the composable cache controller on SiFive platforms. 
endif diff --git a/drivers/soc/sifive/Makefile b/drivers/soc/sifive/Makefile index b5caff77938f6..1f5dc339bf827 100644 --- a/drivers/soc/sifive/Makefile +++ b/drivers/soc/sifive/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_SIFIVE_L2) += sifive_l2_cache.o +obj-$(CONFIG_SIFIVE_CCACHE) += sifive_ccache.o diff --git a/drivers/soc/sifive/sifive_ccache.c b/drivers/soc/sifive/sifive_ccache.c new file mode 100644 index 0000000000000..949b824e89adf --- /dev/null +++ b/drivers/soc/sifive/sifive_ccache.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * SiFive composable cache controller Driver + * + * Copyright (C) 2018-2022 SiFive, Inc. + * + */ +#include <linux/debugfs.h> +#include <linux/interrupt.h> +#include <linux/of_irq.h> +#include <linux/of_address.h> +#include <linux/device.h> +#include <asm/cacheinfo.h> +#include <soc/sifive/sifive_ccache.h> + +#define SIFIVE_CCACHE_DIRECCFIX_LOW 0x100 +#define SIFIVE_CCACHE_DIRECCFIX_HIGH 0x104 +#define SIFIVE_CCACHE_DIRECCFIX_COUNT 0x108 + +#define SIFIVE_CCACHE_DIRECCFAIL_LOW 0x120 +#define SIFIVE_CCACHE_DIRECCFAIL_HIGH 0x124 +#define SIFIVE_CCACHE_DIRECCFAIL_COUNT 0x128 + +#define SIFIVE_CCACHE_DATECCFIX_LOW 0x140 +#define SIFIVE_CCACHE_DATECCFIX_HIGH 0x144 +#define SIFIVE_CCACHE_DATECCFIX_COUNT 0x148 + +#define SIFIVE_CCACHE_DATECCFAIL_LOW 0x160 +#define SIFIVE_CCACHE_DATECCFAIL_HIGH 0x164 +#define SIFIVE_CCACHE_DATECCFAIL_COUNT 0x168 + +#define SIFIVE_CCACHE_CONFIG 0x00 +#define SIFIVE_CCACHE_WAYENABLE 0x08 +#define SIFIVE_CCACHE_ECCINJECTERR 0x40 + +#define SIFIVE_CCACHE_MAX_ECCINTR 4 + +static void __iomem *ccache_base; +static int g_irq[SIFIVE_CCACHE_MAX_ECCINTR]; +static struct riscv_cacheinfo_ops ccache_cache_ops; + +enum { + DIR_CORR = 0, + DATA_CORR, + DATA_UNCORR, + DIR_UNCORR, +}; + +#ifdef CONFIG_DEBUG_FS +static struct dentry *sifive_test; + +static ssize_t ccache_write(struct file *file, const char __user *data, + size_t count, loff_t *ppos) +{ + unsigned int val; + + if 
(kstrtouint_from_user(data, count, 0, &val)) + return -EINVAL; + if ((val < 0xFF) || (val >= 0x10000 && val < 0x100FF)) + writel(val, ccache_base + SIFIVE_CCACHE_ECCINJECTERR); + else + return -EINVAL; + return count; +} + +static const struct file_operations ccache_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = ccache_write +}; + +static void setup_sifive_debug(void) +{ + sifive_test = debugfs_create_dir("sifive_ccache_cache", NULL); + + debugfs_create_file("sifive_debug_inject_error", 0200, + sifive_test, NULL, &ccache_fops); +} +#endif + +static void ccache_config_read(void) +{ + u32 regval, val; + + regval = readl(ccache_base + SIFIVE_CCACHE_CONFIG); + val = regval & 0xFF; + pr_info("CCACHE: No. of Banks in the cache: %d\n", val); + val = (regval & 0xFF00) >> 8; + pr_info("CCACHE: No. of ways per bank: %d\n", val); + val = (regval & 0xFF0000) >> 16; + pr_info("CCACHE: Sets per bank: %llu\n", (uint64_t)1 << val); + val = (regval & 0xFF000000) >> 24; + pr_info("CCACHE: Bytes per cache block: %llu\n", (uint64_t)1 << val); + + regval = readl(ccache_base + SIFIVE_CCACHE_WAYENABLE); + pr_info("CCACHE: Index of the largest way enabled: %d\n", regval); +} + +static const struct of_device_id sifive_ccache_ids[] = { + { .compatible = "sifive,fu540-c000-ccache" }, + { .compatible = "sifive,fu740-c000-ccache" }, + { .compatible = "sifive,ccache0" }, + { /* end of table */ } +}; + +static ATOMIC_NOTIFIER_HEAD(ccache_err_chain); + +int register_sifive_ccache_error_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&ccache_err_chain, nb); +} +EXPORT_SYMBOL_GPL(register_sifive_ccache_error_notifier); + +int unregister_sifive_ccache_error_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&ccache_err_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_sifive_ccache_error_notifier); + +static int ccache_largest_wayenabled(void) +{ + return readl(ccache_base + SIFIVE_CCACHE_WAYENABLE) & 0xFF; +} + +static 
ssize_t number_of_ways_enabled_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", ccache_largest_wayenabled()); +} + +static DEVICE_ATTR_RO(number_of_ways_enabled); + +static struct attribute *priv_attrs[] = { + &dev_attr_number_of_ways_enabled.attr, + NULL, +}; + +static const struct attribute_group priv_attr_group = { + .attrs = priv_attrs, +}; + +static const struct attribute_group *ccache_get_priv_group(struct cacheinfo + *this_leaf) +{ + /* We want to use private group for composable cache only */ + if (this_leaf->level == 2) + return &priv_attr_group; + else + return NULL; +} + +static irqreturn_t ccache_int_handler(int irq, void *device) +{ + unsigned int add_h, add_l; + + if (irq == g_irq[DIR_CORR]) { + add_h = readl(ccache_base + SIFIVE_CCACHE_DIRECCFIX_HIGH); + add_l = readl(ccache_base + SIFIVE_CCACHE_DIRECCFIX_LOW); + pr_err("CCACHE: DirError @ 0x%08X.%08X\n", add_h, add_l); + /* Reading this register clears the DirError interrupt sig */ + readl(ccache_base + SIFIVE_CCACHE_DIRECCFIX_COUNT); + atomic_notifier_call_chain(&ccache_err_chain, + SIFIVE_CCACHE_ERR_TYPE_CE, + "DirECCFix"); + } + if (irq == g_irq[DIR_UNCORR]) { + add_h = readl(ccache_base + SIFIVE_CCACHE_DIRECCFAIL_HIGH); + add_l = readl(ccache_base + SIFIVE_CCACHE_DIRECCFAIL_LOW); + /* Reading this register clears the DirFail interrupt sig */ + readl(ccache_base + SIFIVE_CCACHE_DIRECCFAIL_COUNT); + atomic_notifier_call_chain(&ccache_err_chain, + SIFIVE_CCACHE_ERR_TYPE_UE, + "DirECCFail"); + panic("CCACHE: DirFail @ 0x%08X.%08X\n", add_h, add_l); + } + if (irq == g_irq[DATA_CORR]) { + add_h = readl(ccache_base + SIFIVE_CCACHE_DATECCFIX_HIGH); + add_l = readl(ccache_base + SIFIVE_CCACHE_DATECCFIX_LOW); + pr_err("CCACHE: DataError @ 0x%08X.%08X\n", add_h, add_l); + /* Reading this register clears the DataError interrupt sig */ + readl(ccache_base + SIFIVE_CCACHE_DATECCFIX_COUNT); + atomic_notifier_call_chain(&ccache_err_chain, + 
SIFIVE_CCACHE_ERR_TYPE_CE, + "DatECCFix"); + } + if (irq == g_irq[DATA_UNCORR]) { + add_h = readl(ccache_base + SIFIVE_CCACHE_DATECCFAIL_HIGH); + add_l = readl(ccache_base + SIFIVE_CCACHE_DATECCFAIL_LOW); + pr_err("CCACHE: DataFail @ 0x%08X.%08X\n", add_h, add_l); + /* Reading this register clears the DataFail interrupt sig */ + readl(ccache_base + SIFIVE_CCACHE_DATECCFAIL_COUNT); + atomic_notifier_call_chain(&ccache_err_chain, + SIFIVE_CCACHE_ERR_TYPE_UE, + "DatECCFail"); + } + + return IRQ_HANDLED; +} + +static int __init sifive_ccache_init(void) +{ + struct device_node *np; + struct resource res; + int i, rc, intr_num; + + np = of_find_matching_node(NULL, sifive_ccache_ids); + if (!np) + return -ENODEV; + + if (of_address_to_resource(np, 0, &res)) + return -ENODEV; + + ccache_base = ioremap(res.start, resource_size(&res)); + if (!ccache_base) + return -ENOMEM; + + intr_num = of_property_count_u32_elems(np, "interrupts"); + if (!intr_num) { + pr_err("CCACHE: no interrupts property\n"); + return -ENODEV; + } + + for (i = 0; i < intr_num; i++) { + g_irq[i] = irq_of_parse_and_map(np, i); + rc = request_irq(g_irq[i], ccache_int_handler, 0, "ccache_ecc", + NULL); + if (rc) { + pr_err("CCACHE: Could not request IRQ %d\n", g_irq[i]); + return rc; + } + } + + ccache_config_read(); + + ccache_cache_ops.get_priv_group = ccache_get_priv_group; + riscv_set_cacheinfo_ops(&ccache_cache_ops); + +#ifdef CONFIG_DEBUG_FS + setup_sifive_debug(); +#endif + return 0; +} + +device_initcall(sifive_ccache_init); diff --git a/drivers/soc/sifive/sifive_l2_cache.c b/drivers/soc/sifive/sifive_l2_cache.c deleted file mode 100644 index 59640a1d0b28a..0000000000000 --- a/drivers/soc/sifive/sifive_l2_cache.c +++ /dev/null @@ -1,237 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * SiFive L2 cache controller Driver - * - * Copyright (C) 2018-2019 SiFive, Inc. 
- * - */ -#include <linux/debugfs.h> -#include <linux/interrupt.h> -#include <linux/of_irq.h> -#include <linux/of_address.h> -#include <linux/device.h> -#include <asm/cacheinfo.h> -#include <soc/sifive/sifive_l2_cache.h> - -#define SIFIVE_L2_DIRECCFIX_LOW 0x100 -#define SIFIVE_L2_DIRECCFIX_HIGH 0x104 -#define SIFIVE_L2_DIRECCFIX_COUNT 0x108 - -#define SIFIVE_L2_DIRECCFAIL_LOW 0x120 -#define SIFIVE_L2_DIRECCFAIL_HIGH 0x124 -#define SIFIVE_L2_DIRECCFAIL_COUNT 0x128 - -#define SIFIVE_L2_DATECCFIX_LOW 0x140 -#define SIFIVE_L2_DATECCFIX_HIGH 0x144 -#define SIFIVE_L2_DATECCFIX_COUNT 0x148 - -#define SIFIVE_L2_DATECCFAIL_LOW 0x160 -#define SIFIVE_L2_DATECCFAIL_HIGH 0x164 -#define SIFIVE_L2_DATECCFAIL_COUNT 0x168 - -#define SIFIVE_L2_CONFIG 0x00 -#define SIFIVE_L2_WAYENABLE 0x08 -#define SIFIVE_L2_ECCINJECTERR 0x40 - -#define SIFIVE_L2_MAX_ECCINTR 4 - -static void __iomem *l2_base; -static int g_irq[SIFIVE_L2_MAX_ECCINTR]; -static struct riscv_cacheinfo_ops l2_cache_ops; - -enum { - DIR_CORR = 0, - DATA_CORR, - DATA_UNCORR, - DIR_UNCORR, -}; - -#ifdef CONFIG_DEBUG_FS -static struct dentry *sifive_test; - -static ssize_t l2_write(struct file *file, const char __user *data, - size_t count, loff_t *ppos) -{ - unsigned int val; - - if (kstrtouint_from_user(data, count, 0, &val)) - return -EINVAL; - if ((val < 0xFF) || (val >= 0x10000 && val < 0x100FF)) - writel(val, l2_base + SIFIVE_L2_ECCINJECTERR); - else - return -EINVAL; - return count; -} - -static const struct file_operations l2_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .write = l2_write -}; - -static void setup_sifive_debug(void) -{ - sifive_test = debugfs_create_dir("sifive_l2_cache", NULL); - - debugfs_create_file("sifive_debug_inject_error", 0200, - sifive_test, NULL, &l2_fops); -} -#endif - -static void l2_config_read(void) -{ - u32 regval, val; - - regval = readl(l2_base + SIFIVE_L2_CONFIG); - val = regval & 0xFF; - pr_info("L2CACHE: No. 
of Banks in the cache: %d\n", val); - val = (regval & 0xFF00) >> 8; - pr_info("L2CACHE: No. of ways per bank: %d\n", val); - val = (regval & 0xFF0000) >> 16; - pr_info("L2CACHE: Sets per bank: %llu\n", (uint64_t)1 << val); - val = (regval & 0xFF000000) >> 24; - pr_info("L2CACHE: Bytes per cache block: %llu\n", (uint64_t)1 << val); - - regval = readl(l2_base + SIFIVE_L2_WAYENABLE); - pr_info("L2CACHE: Index of the largest way enabled: %d\n", regval); -} - -static const struct of_device_id sifive_l2_ids[] = { - { .compatible = "sifive,fu540-c000-ccache" }, - { .compatible = "sifive,fu740-c000-ccache" }, - { /* end of table */ }, -}; - -static ATOMIC_NOTIFIER_HEAD(l2_err_chain); - -int register_sifive_l2_error_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_register(&l2_err_chain, nb); -} -EXPORT_SYMBOL_GPL(register_sifive_l2_error_notifier); - -int unregister_sifive_l2_error_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&l2_err_chain, nb); -} -EXPORT_SYMBOL_GPL(unregister_sifive_l2_error_notifier); - -static int l2_largest_wayenabled(void) -{ - return readl(l2_base + SIFIVE_L2_WAYENABLE) & 0xFF; -} - -static ssize_t number_of_ways_enabled_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", l2_largest_wayenabled()); -} - -static DEVICE_ATTR_RO(number_of_ways_enabled); - -static struct attribute *priv_attrs[] = { - &dev_attr_number_of_ways_enabled.attr, - NULL, -}; - -static const struct attribute_group priv_attr_group = { - .attrs = priv_attrs, -}; - -static const struct attribute_group *l2_get_priv_group(struct cacheinfo *this_leaf) -{ - /* We want to use private group for L2 cache only */ - if (this_leaf->level == 2) - return &priv_attr_group; - else - return NULL; -} - -static irqreturn_t l2_int_handler(int irq, void *device) -{ - unsigned int add_h, add_l; - - if (irq == g_irq[DIR_CORR]) { - add_h = readl(l2_base + SIFIVE_L2_DIRECCFIX_HIGH); - add_l = 
readl(l2_base + SIFIVE_L2_DIRECCFIX_LOW); - pr_err("L2CACHE: DirError @ 0x%08X.%08X\n", add_h, add_l); - /* Reading this register clears the DirError interrupt sig */ - readl(l2_base + SIFIVE_L2_DIRECCFIX_COUNT); - atomic_notifier_call_chain(&l2_err_chain, SIFIVE_L2_ERR_TYPE_CE, - "DirECCFix"); - } - if (irq == g_irq[DIR_UNCORR]) { - add_h = readl(l2_base + SIFIVE_L2_DIRECCFAIL_HIGH); - add_l = readl(l2_base + SIFIVE_L2_DIRECCFAIL_LOW); - /* Reading this register clears the DirFail interrupt sig */ - readl(l2_base + SIFIVE_L2_DIRECCFAIL_COUNT); - atomic_notifier_call_chain(&l2_err_chain, SIFIVE_L2_ERR_TYPE_UE, - "DirECCFail"); - panic("L2CACHE: DirFail @ 0x%08X.%08X\n", add_h, add_l); - } - if (irq == g_irq[DATA_CORR]) { - add_h = readl(l2_base + SIFIVE_L2_DATECCFIX_HIGH); - add_l = readl(l2_base + SIFIVE_L2_DATECCFIX_LOW); - pr_err("L2CACHE: DataError @ 0x%08X.%08X\n", add_h, add_l); - /* Reading this register clears the DataError interrupt sig */ - readl(l2_base + SIFIVE_L2_DATECCFIX_COUNT); - atomic_notifier_call_chain(&l2_err_chain, SIFIVE_L2_ERR_TYPE_CE, - "DatECCFix"); - } - if (irq == g_irq[DATA_UNCORR]) { - add_h = readl(l2_base + SIFIVE_L2_DATECCFAIL_HIGH); - add_l = readl(l2_base + SIFIVE_L2_DATECCFAIL_LOW); - pr_err("L2CACHE: DataFail @ 0x%08X.%08X\n", add_h, add_l); - /* Reading this register clears the DataFail interrupt sig */ - readl(l2_base + SIFIVE_L2_DATECCFAIL_COUNT); - atomic_notifier_call_chain(&l2_err_chain, SIFIVE_L2_ERR_TYPE_UE, - "DatECCFail"); - } - - return IRQ_HANDLED; -} - -static int __init sifive_l2_init(void) -{ - struct device_node *np; - struct resource res; - int i, rc, intr_num; - - np = of_find_matching_node(NULL, sifive_l2_ids); - if (!np) - return -ENODEV; - - if (of_address_to_resource(np, 0, &res)) - return -ENODEV; - - l2_base = ioremap(res.start, resource_size(&res)); - if (!l2_base) - return -ENOMEM; - - intr_num = of_property_count_u32_elems(np, "interrupts"); - if (!intr_num) { - pr_err("L2CACHE: no interrupts 
property\n"); - return -ENODEV; - } - - for (i = 0; i < intr_num; i++) { - g_irq[i] = irq_of_parse_and_map(np, i); - rc = request_irq(g_irq[i], l2_int_handler, 0, "l2_ecc", NULL); - if (rc) { - pr_err("L2CACHE: Could not request IRQ %d\n", g_irq[i]); - return rc; - } - } - - l2_config_read(); - - l2_cache_ops.get_priv_group = l2_get_priv_group; - riscv_set_cacheinfo_ops(&l2_cache_ops); - -#ifdef CONFIG_DEBUG_FS - setup_sifive_debug(); -#endif - return 0; -} -device_initcall(sifive_l2_init); diff --git a/include/soc/sifive/sifive_ccache.h b/include/soc/sifive/sifive_ccache.h new file mode 100644 index 0000000000000..4d4ed49388a0a --- /dev/null +++ b/include/soc/sifive/sifive_ccache.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * SiFive Composable Cache Controller header file + * + */ + +#ifndef __SOC_SIFIVE_CCACHE_H +#define __SOC_SIFIVE_CCACHE_H + +extern int register_sifive_ccache_error_notifier(struct notifier_block *nb); +extern int unregister_sifive_ccache_error_notifier(struct notifier_block *nb); + +#define SIFIVE_CCACHE_ERR_TYPE_CE 0 +#define SIFIVE_CCACHE_ERR_TYPE_UE 1 + +#endif /* __SOC_SIFIVE_CCACHE_H */ diff --git a/include/soc/sifive/sifive_l2_cache.h b/include/soc/sifive/sifive_l2_cache.h deleted file mode 100644 index 92ade10ed67e9..0000000000000 --- a/include/soc/sifive/sifive_l2_cache.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * SiFive L2 Cache Controller header file - * - */ - -#ifndef __SOC_SIFIVE_L2_CACHE_H -#define __SOC_SIFIVE_L2_CACHE_H - -extern int register_sifive_l2_error_notifier(struct notifier_block *nb); -extern int unregister_sifive_l2_error_notifier(struct notifier_block *nb); - -#define SIFIVE_L2_ERR_TYPE_CE 0 -#define SIFIVE_L2_ERR_TYPE_UE 1 - -#endif /* __SOC_SIFIVE_L2_CACHE_H */ -- GitLab From 95f196f3212bbc258611c22865aef12b98304e1d Mon Sep 17 00:00:00 2001 From: Zong Li <zong.li@sifive.com> Date: Tue, 13 Sep 2022 06:18:13 +0000 Subject: [PATCH 1931/2223] soc: sifive: ccache: 
determine the cache level from dts Composable cache could be L2 or L3 cache, use 'cache-level' property of device node to determine the level. Signed-off-by: Zong Li <zong.li@sifive.com> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/20220913061817.22564-4-zong.li@sifive.com Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- drivers/soc/sifive/sifive_ccache.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/soc/sifive/sifive_ccache.c b/drivers/soc/sifive/sifive_ccache.c index 949b824e89adf..b361b661ea09a 100644 --- a/drivers/soc/sifive/sifive_ccache.c +++ b/drivers/soc/sifive/sifive_ccache.c @@ -38,6 +38,7 @@ static void __iomem *ccache_base; static int g_irq[SIFIVE_CCACHE_MAX_ECCINTR]; static struct riscv_cacheinfo_ops ccache_cache_ops; +static int level; enum { DIR_CORR = 0, @@ -144,7 +145,7 @@ static const struct attribute_group *ccache_get_priv_group(struct cacheinfo *this_leaf) { /* We want to use private group for composable cache only */ - if (this_leaf->level == 2) + if (this_leaf->level == level) return &priv_attr_group; else return NULL; @@ -215,6 +216,9 @@ static int __init sifive_ccache_init(void) if (!ccache_base) return -ENOMEM; + if (of_property_read_u32(np, "cache-level", &level)) + return -ENOENT; + intr_num = of_property_count_u32_elems(np, "interrupts"); if (!intr_num) { pr_err("CCACHE: no interrupts property\n"); -- GitLab From 3fb787e5bad50687a65ded7f3bb805cab70dff59 Mon Sep 17 00:00:00 2001 From: Ben Dooks <ben.dooks@sifive.com> Date: Tue, 13 Sep 2022 06:18:14 +0000 Subject: [PATCH 1932/2223] soc: sifive: ccache: reduce printing on init The driver prints out 6 lines on startup, which can easily be reduced to two lines without losing any information. Note, to make the types work better, uint64_t has been replaced with ULL to make the unsigned long long match the format in the print statement.
Signed-off-by: Ben Dooks <ben.dooks@sifive.com> Signed-off-by: Zong Li <zong.li@sifive.com> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/20220913061817.22564-5-zong.li@sifive.com Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- drivers/soc/sifive/sifive_ccache.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/soc/sifive/sifive_ccache.c b/drivers/soc/sifive/sifive_ccache.c index b361b661ea09a..17080af7dfa00 100644 --- a/drivers/soc/sifive/sifive_ccache.c +++ b/drivers/soc/sifive/sifive_ccache.c @@ -81,20 +81,17 @@ static void setup_sifive_debug(void) static void ccache_config_read(void) { - u32 regval, val; - - regval = readl(ccache_base + SIFIVE_CCACHE_CONFIG); - val = regval & 0xFF; - pr_info("CCACHE: No. of Banks in the cache: %d\n", val); - val = (regval & 0xFF00) >> 8; - pr_info("CCACHE: No. of ways per bank: %d\n", val); - val = (regval & 0xFF0000) >> 16; - pr_info("CCACHE: Sets per bank: %llu\n", (uint64_t)1 << val); - val = (regval & 0xFF000000) >> 24; - pr_info("CCACHE: Bytes per cache block: %llu\n", (uint64_t)1 << val); - - regval = readl(ccache_base + SIFIVE_CCACHE_WAYENABLE); - pr_info("CCACHE: Index of the largest way enabled: %d\n", regval); + u32 cfg; + + cfg = readl(ccache_base + SIFIVE_CCACHE_CONFIG); + + pr_info("CCACHE: %u banks, %u ways, sets/bank=%llu, bytes/block=%llu\n", + (cfg & 0xff), (cfg >> 8) & 0xff, + BIT_ULL((cfg >> 16) & 0xff), + BIT_ULL((cfg >> 24) & 0xff)); + + cfg = readl(ccache_base + SIFIVE_CCACHE_WAYENABLE); + pr_info("CCACHE: Index of the largest way enabled: %u\n", cfg); } static const struct of_device_id sifive_ccache_ids[] = { -- GitLab From 696ab9bda22a770d079dc3a23bac9aaa553d98f4 Mon Sep 17 00:00:00 2001 From: Ben Dooks <ben.dooks@sifive.com> Date: Tue, 13 Sep 2022 06:18:15 +0000 Subject: [PATCH 1933/2223] soc: sifive: ccache: use pr_fmt() to remove CCACHE: prefixes Use the pr_fmt() macro to prefix all the output with 
"CCACHE:" to avoid having to write it out each time, or make a large diff when the next change comes along. Signed-off-by: Ben Dooks <ben.dooks@sifive.com> Signed-off-by: Zong Li <zong.li@sifive.com> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/20220913061817.22564-6-zong.li@sifive.com Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- drivers/soc/sifive/sifive_ccache.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/soc/sifive/sifive_ccache.c b/drivers/soc/sifive/sifive_ccache.c index 17080af7dfa00..91f0c2b32ea2b 100644 --- a/drivers/soc/sifive/sifive_ccache.c +++ b/drivers/soc/sifive/sifive_ccache.c @@ -5,6 +5,9 @@ * Copyright (C) 2018-2022 SiFive, Inc. * */ + +#define pr_fmt(fmt) "CCACHE: " fmt + #include <linux/debugfs.h> #include <linux/interrupt.h> #include <linux/of_irq.h> @@ -85,13 +88,13 @@ static void ccache_config_read(void) cfg = readl(ccache_base + SIFIVE_CCACHE_CONFIG); - pr_info("CCACHE: %u banks, %u ways, sets/bank=%llu, bytes/block=%llu\n", + pr_info("%u banks, %u ways, sets/bank=%llu, bytes/block=%llu\n", (cfg & 0xff), (cfg >> 8) & 0xff, BIT_ULL((cfg >> 16) & 0xff), BIT_ULL((cfg >> 24) & 0xff)); cfg = readl(ccache_base + SIFIVE_CCACHE_WAYENABLE); - pr_info("CCACHE: Index of the largest way enabled: %u\n", cfg); + pr_info("Index of the largest way enabled: %u\n", cfg); } static const struct of_device_id sifive_ccache_ids[] = { @@ -155,7 +158,7 @@ static irqreturn_t ccache_int_handler(int irq, void *device) if (irq == g_irq[DIR_CORR]) { add_h = readl(ccache_base + SIFIVE_CCACHE_DIRECCFIX_HIGH); add_l = readl(ccache_base + SIFIVE_CCACHE_DIRECCFIX_LOW); - pr_err("CCACHE: DirError @ 0x%08X.%08X\n", add_h, add_l); + pr_err("DirError @ 0x%08X.%08X\n", add_h, add_l); /* Reading this register clears the DirError interrupt sig */ readl(ccache_base + SIFIVE_CCACHE_DIRECCFIX_COUNT); atomic_notifier_call_chain(&ccache_err_chain, @@ -175,7 +178,7 @@ static irqreturn_t 
ccache_int_handler(int irq, void *device) if (irq == g_irq[DATA_CORR]) { add_h = readl(ccache_base + SIFIVE_CCACHE_DATECCFIX_HIGH); add_l = readl(ccache_base + SIFIVE_CCACHE_DATECCFIX_LOW); - pr_err("CCACHE: DataError @ 0x%08X.%08X\n", add_h, add_l); + pr_err("DataError @ 0x%08X.%08X\n", add_h, add_l); /* Reading this register clears the DataError interrupt sig */ readl(ccache_base + SIFIVE_CCACHE_DATECCFIX_COUNT); atomic_notifier_call_chain(&ccache_err_chain, @@ -185,7 +188,7 @@ static irqreturn_t ccache_int_handler(int irq, void *device) if (irq == g_irq[DATA_UNCORR]) { add_h = readl(ccache_base + SIFIVE_CCACHE_DATECCFAIL_HIGH); add_l = readl(ccache_base + SIFIVE_CCACHE_DATECCFAIL_LOW); - pr_err("CCACHE: DataFail @ 0x%08X.%08X\n", add_h, add_l); + pr_err("DataFail @ 0x%08X.%08X\n", add_h, add_l); /* Reading this register clears the DataFail interrupt sig */ readl(ccache_base + SIFIVE_CCACHE_DATECCFAIL_COUNT); atomic_notifier_call_chain(&ccache_err_chain, @@ -218,7 +221,7 @@ static int __init sifive_ccache_init(void) intr_num = of_property_count_u32_elems(np, "interrupts"); if (!intr_num) { - pr_err("CCACHE: no interrupts property\n"); + pr_err("No interrupts property\n"); return -ENODEV; } @@ -227,7 +230,7 @@ static int __init sifive_ccache_init(void) rc = request_irq(g_irq[i], ccache_int_handler, 0, "ccache_ecc", NULL); if (rc) { - pr_err("CCACHE: Could not request IRQ %d\n", g_irq[i]); + pr_err("Could not request IRQ %d\n", g_irq[i]); return rc; } } -- GitLab From afc7a5834f0de13aee46df62f09e479c1bbf7b9d Mon Sep 17 00:00:00 2001 From: Zong Li <zong.li@sifive.com> Date: Tue, 13 Sep 2022 06:18:16 +0000 Subject: [PATCH 1934/2223] soc: sifive: ccache: define the macro for the register shifts Define the macro for the register shifts, it could make the code be more readable Signed-off-by: Zong Li <zong.li@sifive.com> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/20220913061817.22564-7-zong.li@sifive.com Signed-off-by: Palmer 
Dabbelt <palmer@rivosinc.com> --- drivers/soc/sifive/sifive_ccache.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/soc/sifive/sifive_ccache.c b/drivers/soc/sifive/sifive_ccache.c index 91f0c2b32ea2b..1c171150e878d 100644 --- a/drivers/soc/sifive/sifive_ccache.c +++ b/drivers/soc/sifive/sifive_ccache.c @@ -13,6 +13,7 @@ #include <linux/of_irq.h> #include <linux/of_address.h> #include <linux/device.h> +#include <linux/bitfield.h> #include <asm/cacheinfo.h> #include <soc/sifive/sifive_ccache.h> @@ -33,6 +34,11 @@ #define SIFIVE_CCACHE_DATECCFAIL_COUNT 0x168 #define SIFIVE_CCACHE_CONFIG 0x00 +#define SIFIVE_CCACHE_CONFIG_BANK_MASK GENMASK_ULL(7, 0) +#define SIFIVE_CCACHE_CONFIG_WAYS_MASK GENMASK_ULL(15, 8) +#define SIFIVE_CCACHE_CONFIG_SETS_MASK GENMASK_ULL(23, 16) +#define SIFIVE_CCACHE_CONFIG_BLKS_MASK GENMASK_ULL(31, 24) + #define SIFIVE_CCACHE_WAYENABLE 0x08 #define SIFIVE_CCACHE_ECCINJECTERR 0x40 @@ -87,11 +93,11 @@ static void ccache_config_read(void) u32 cfg; cfg = readl(ccache_base + SIFIVE_CCACHE_CONFIG); - - pr_info("%u banks, %u ways, sets/bank=%llu, bytes/block=%llu\n", - (cfg & 0xff), (cfg >> 8) & 0xff, - BIT_ULL((cfg >> 16) & 0xff), - BIT_ULL((cfg >> 24) & 0xff)); + pr_info("%llu banks, %llu ways, sets/bank=%llu, bytes/block=%llu\n", + FIELD_GET(SIFIVE_CCACHE_CONFIG_BANK_MASK, cfg), + FIELD_GET(SIFIVE_CCACHE_CONFIG_WAYS_MASK, cfg), + BIT_ULL(FIELD_GET(SIFIVE_CCACHE_CONFIG_SETS_MASK, cfg)), + BIT_ULL(FIELD_GET(SIFIVE_CCACHE_CONFIG_BLKS_MASK, cfg))); cfg = readl(ccache_base + SIFIVE_CCACHE_WAYENABLE); pr_info("Index of the largest way enabled: %u\n", cfg); -- GitLab From da29dbcda49d60f34055df19bd4783b889fc7dfc Mon Sep 17 00:00:00 2001 From: Greentime Hu <greentime.hu@sifive.com> Date: Tue, 13 Sep 2022 06:18:17 +0000 Subject: [PATCH 1935/2223] riscv: Add cache information in AUX vector There are no standard CSR registers to provide cache information, the way for RISC-V is to get this information from DT. 
sysconf syscall could use them to get information of cache through AUX vector. The result of 'getconf -a|grep -i cache' as follows: LEVEL1_ICACHE_SIZE 32768 LEVEL1_ICACHE_ASSOC 2 LEVEL1_ICACHE_LINESIZE 64 LEVEL1_DCACHE_SIZE 32768 LEVEL1_DCACHE_ASSOC 4 LEVEL1_DCACHE_LINESIZE 64 LEVEL2_CACHE_SIZE 524288 LEVEL2_CACHE_ASSOC 8 LEVEL2_CACHE_LINESIZE 64 LEVEL3_CACHE_SIZE 4194304 LEVEL3_CACHE_ASSOC 16 LEVEL3_CACHE_LINESIZE 64 LEVEL4_CACHE_SIZE 0 LEVEL4_CACHE_ASSOC 0 LEVEL4_CACHE_LINESIZE 0 Signed-off-by: Greentime Hu <greentime.hu@sifive.com> Signed-off-by: Zong Li <zong.li@sifive.com> Suggested-by: Zong Li <zong.li@sifive.com> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/20220913061817.22564-8-zong.li@sifive.com Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- arch/riscv/include/asm/elf.h | 4 ++++ arch/riscv/include/uapi/asm/auxvec.h | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h index 14fc7342490bf..e7acffdf21d26 100644 --- a/arch/riscv/include/asm/elf.h +++ b/arch/riscv/include/asm/elf.h @@ -99,6 +99,10 @@ do { \ get_cache_size(2, CACHE_TYPE_UNIFIED)); \ NEW_AUX_ENT(AT_L2_CACHEGEOMETRY, \ get_cache_geometry(2, CACHE_TYPE_UNIFIED)); \ + NEW_AUX_ENT(AT_L3_CACHESIZE, \ + get_cache_size(3, CACHE_TYPE_UNIFIED)); \ + NEW_AUX_ENT(AT_L3_CACHEGEOMETRY, \ + get_cache_geometry(3, CACHE_TYPE_UNIFIED)); \ } while (0) #define ARCH_HAS_SETUP_ADDITIONAL_PAGES struct linux_binprm; diff --git a/arch/riscv/include/uapi/asm/auxvec.h b/arch/riscv/include/uapi/asm/auxvec.h index 32c73ba1d5313..fb187a33ce589 100644 --- a/arch/riscv/include/uapi/asm/auxvec.h +++ b/arch/riscv/include/uapi/asm/auxvec.h @@ -30,8 +30,10 @@ #define AT_L1D_CACHEGEOMETRY 43 #define AT_L2_CACHESIZE 44 #define AT_L2_CACHEGEOMETRY 45 +#define AT_L3_CACHESIZE 46 +#define AT_L3_CACHEGEOMETRY 47 /* entries in ARCH_DLINFO */ -#define AT_VECTOR_SIZE_ARCH 7 +#define AT_VECTOR_SIZE_ARCH 9 #endif 
/* _UAPI_ASM_RISCV_AUXVEC_H */ -- GitLab From a8616d2dc193b6becc36b5f3cfeaa9ac7a5762f9 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang <jszhang@kernel.org> Date: Sat, 24 Sep 2022 15:07:37 +0800 Subject: [PATCH 1936/2223] riscv: vdso: fix NULL deference in vdso_join_timens() when vfork Testing tools/testing/selftests/timens/vfork_exec.c got below kernel log: [ 6.838454] Unable to handle kernel access to user memory without uaccess routines at virtual address 0000000000000020 [ 6.842255] Oops [#1] [ 6.842871] Modules linked in: [ 6.844249] CPU: 1 PID: 64 Comm: vfork_exec Not tainted 6.0.0-rc3-rt15+ #8 [ 6.845861] Hardware name: riscv-virtio,qemu (DT) [ 6.848009] epc : vdso_join_timens+0xd2/0x110 [ 6.850097] ra : vdso_join_timens+0xd2/0x110 [ 6.851164] epc : ffffffff8000635c ra : ffffffff8000635c sp : ff6000000181fbf0 [ 6.852562] gp : ffffffff80cff648 tp : ff60000000fdb700 t0 : 3030303030303030 [ 6.853852] t1 : 0000000000000030 t2 : 3030303030303030 s0 : ff6000000181fc40 [ 6.854984] s1 : ff60000001e6c000 a0 : 0000000000000010 a1 : ffffffff8005654c [ 6.856221] a2 : 00000000ffffefff a3 : 0000000000000000 a4 : 0000000000000000 [ 6.858114] a5 : 0000000000000000 a6 : 0000000000000008 a7 : 0000000000000038 [ 6.859484] s2 : ff60000001e6c068 s3 : ff6000000108abb0 s4 : 0000000000000000 [ 6.860751] s5 : 0000000000001000 s6 : ffffffff8089dc40 s7 : ffffffff8089dc38 [ 6.862029] s8 : ffffffff8089dc30 s9 : ff60000000fdbe38 s10: 000000000000005e [ 6.863304] s11: ffffffff80cc3510 t3 : ffffffff80d1112f t4 : ffffffff80d1112f [ 6.864565] t5 : ffffffff80d11130 t6 : ff6000000181fa00 [ 6.865561] status: 0000000000000120 badaddr: 0000000000000020 cause: 000000000000000d [ 6.868046] [<ffffffff8008dc94>] timens_commit+0x38/0x11a [ 6.869089] [<ffffffff8008dde8>] timens_on_fork+0x72/0xb4 [ 6.870055] [<ffffffff80190096>] begin_new_exec+0x3c6/0x9f0 [ 6.871231] [<ffffffff801d826c>] load_elf_binary+0x628/0x1214 [ 6.872304] [<ffffffff8018ee7a>] bprm_execve+0x1f2/0x4e4 [ 6.873243] [<ffffffff8018f90c>] 
do_execveat_common+0x16e/0x1ee [ 6.874258] [<ffffffff8018f9c8>] sys_execve+0x3c/0x48 [ 6.875162] [<ffffffff80003556>] ret_from_syscall+0x0/0x2 [ 6.877484] ---[ end trace 0000000000000000 ]--- This is because the mm->context.vdso_info is NULL in vfork case. From another side, mm->context.vdso_info either points to vdso info for RV64 or vdso info for compat, there's no need to bloat riscv's mm_context_t, we can handle the difference when setup the additional page for vdso. Signed-off-by: Jisheng Zhang <jszhang@kernel.org> Suggested-by: Palmer Dabbelt <palmer@rivosinc.com> Fixes: 3092eb456375 ("riscv: compat: vdso: Add setup additional pages implementation") Link: https://lore.kernel.org/r/20220924070737.3048-1-jszhang@kernel.org Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- arch/riscv/include/asm/mmu.h | 1 - arch/riscv/kernel/vdso.c | 13 ++++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h index cedcf8ea3c766..0099dc1161683 100644 --- a/arch/riscv/include/asm/mmu.h +++ b/arch/riscv/include/asm/mmu.h @@ -16,7 +16,6 @@ typedef struct { atomic_long_t id; #endif void *vdso; - void *vdso_info; #ifdef CONFIG_SMP /* A local icache flush is needed before user execution can resume. 
*/ cpumask_t icache_stale_mask; diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index 69b05b6c181b6..4abc9aebdfae2 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -60,6 +60,11 @@ struct __vdso_info { struct vm_special_mapping *cm; }; +static struct __vdso_info vdso_info; +#ifdef CONFIG_COMPAT +static struct __vdso_info compat_vdso_info; +#endif + static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { @@ -114,15 +119,18 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; struct vm_area_struct *vma; - struct __vdso_info *vdso_info = mm->context.vdso_info; mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { unsigned long size = vma->vm_end - vma->vm_start; - if (vma_is_special_mapping(vma, vdso_info->dm)) + if (vma_is_special_mapping(vma, vdso_info.dm)) zap_page_range(vma, vma->vm_start, size); +#ifdef CONFIG_COMPAT + if (vma_is_special_mapping(vma, compat_vdso_info.dm)) + zap_page_range(vma, vma->vm_start, size); +#endif } mmap_read_unlock(mm); @@ -264,7 +272,6 @@ static int __setup_additional_pages(struct mm_struct *mm, vdso_base += VVAR_SIZE; mm->context.vdso = (void *)vdso_base; - mm->context.vdso_info = (void *)vdso_info; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, -- GitLab From 5a5294fbe0200d1327f0e089135dad77b45aa2ee Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt <palmer@rivosinc.com> Date: Wed, 28 Sep 2022 06:18:07 -0700 Subject: [PATCH 1937/2223] RISC-V: Re-enable counter access from userspace These counters were part of the ISA when we froze the uABI, removing them breaks userspace. 
Link: https://lore.kernel.org/all/YxEhC%2FmDW1lFt36J@aurel32.net/ Fixes: e9991434596f ("RISC-V: Add perf platform driver based on SBI PMU extension") Tested-by: Conor Dooley <conor.dooley@microchip.com> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/20220928131807.30386-1-palmer@rivosinc.com Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- drivers/perf/riscv_pmu_sbi.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 15e5a47be7d59..3852c18362f53 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -652,8 +652,11 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node) struct riscv_pmu *pmu = hlist_entry_safe(node, struct riscv_pmu, node); struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events); - /* Enable the access for TIME csr only from the user mode now */ - csr_write(CSR_SCOUNTEREN, 0x2); + /* + * Enable the access for CYCLE, TIME, and INSTRET CSRs from userspace, + * as is necessary to maintain uABI compatibility. + */ + csr_write(CSR_SCOUNTEREN, 0x7); /* Stop all the counters so that they can be enabled from perf */ pmu_sbi_stop_all(pmu); -- GitLab From c45fc916c2b2cc2a0587659c18d6ceef9b7299be Mon Sep 17 00:00:00 2001 From: Conor Dooley <conor.dooley@microchip.com> Date: Fri, 29 Jul 2022 12:11:17 +0100 Subject: [PATCH 1938/2223] riscv: enable software resend of irqs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PLIC specification does not describe the interrupt pending bits as read-write, only that they "can be read". To allow for retriggering of interrupts (and the use of the irq debugfs interface) enable HARDIRQS_SW_RESEND for RISC-V.
Link: https://github.com/riscv/riscv-plic-spec/blob/master/riscv-plic.adoc#interrupt-pending-bits Signed-off-by: Conor Dooley <conor.dooley@microchip.com> Acked-by: Marc Zyngier <maz@kernel.org> Acked-by: Palmer Dabbelt <palmer@rivosinc.com> Tested-by: Palmer Dabbelt <palmer@rivosinc.com> # on QEMU Reviewed-by: Björn Töpel <bjorn@kernel.org> Link: https://lore.kernel.org/r/20220729111116.259146-1-conor.dooley@microchip.com Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- arch/riscv/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index e84f2742b6bba..c56bc70158aca 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -70,6 +70,7 @@ config RISCV select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL if MMU && 64BIT select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO + select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_JUMP_LABEL_RELATIVE if !XIP_KERNEL -- GitLab From 83439a0f1ce6a592f95e41338320b5f01b98a356 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya <bagasdotme@gmail.com> Date: Tue, 11 Oct 2022 19:26:01 +0700 Subject: [PATCH 1939/2223] Documentation: ACPI: Prune DSDT override documentation from index Commit d206cef03c4827 ("ACPI: docs: Drop useless DSDT override documentation") removes useless DSDT override documentation. However, the commit forgets to prune the documentation entry from table of contents of ACPI admin guide documentation, hence triggers Sphinx warning: Documentation/admin-guide/acpi/index.rst:8: WARNING: toctree contains reference to nonexisting document 'admin-guide/acpi/dsdt-override' Prune the entry to fix the warning. Fixes: d206cef03c4827 ("ACPI: docs: Drop useless DSDT override documentation") Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com> Signed-off-by: Rafael J. 
Wysocki <rafael.j.wysocki@intel.com> --- Documentation/admin-guide/acpi/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/admin-guide/acpi/index.rst b/Documentation/admin-guide/acpi/index.rst index 71277689ad97f..b078fdb8f4c93 100644 --- a/Documentation/admin-guide/acpi/index.rst +++ b/Documentation/admin-guide/acpi/index.rst @@ -9,7 +9,6 @@ the Linux ACPI support. :maxdepth: 1 initrd_table_override - dsdt-override ssdt-overlays cppc_sysfs fan_performance_states -- GitLab From 43d2748394c3feb86c0c771466f5847e274fc043 Mon Sep 17 00:00:00 2001 From: Ashish Kalra <ashish.kalra@amd.com> Date: Wed, 5 Oct 2022 16:32:53 +0000 Subject: [PATCH 1940/2223] ACPI: APEI: Fix integer overflow in ghes_estatus_pool_init() Change num_ghes from int to unsigned int, preventing an overflow and causing subsequent vmalloc() to fail. The overflow happens in ghes_estatus_pool_init() when calculating len during execution of the statement below as both multiplication operands here are signed int: len += (num_ghes * GHES_ESOURCE_PREALLOC_MAX_SIZE); The following call trace is observed because of this bug: [ 9.317108] swapper/0: vmalloc error: size 18446744071562596352, exceeds total pages, mode:0xcc0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0-1 [ 9.317131] Call Trace: [ 9.317134] <TASK> [ 9.317137] dump_stack_lvl+0x49/0x5f [ 9.317145] dump_stack+0x10/0x12 [ 9.317146] warn_alloc.cold+0x7b/0xdf [ 9.317150] ? __device_attach+0x16a/0x1b0 [ 9.317155] __vmalloc_node_range+0x702/0x740 [ 9.317160] ? device_add+0x17f/0x920 [ 9.317164] ? dev_set_name+0x53/0x70 [ 9.317166] ? platform_device_add+0xf9/0x240 [ 9.317168] __vmalloc_node+0x49/0x50 [ 9.317170] ? ghes_estatus_pool_init+0x43/0xa0 [ 9.317176] vmalloc+0x21/0x30 [ 9.317177] ghes_estatus_pool_init+0x43/0xa0 [ 9.317179] acpi_hest_init+0x129/0x19c [ 9.317185] acpi_init+0x434/0x4a4 [ 9.317188] ? 
acpi_sleep_proc_init+0x2a/0x2a [ 9.317190] do_one_initcall+0x48/0x200 [ 9.317195] kernel_init_freeable+0x221/0x284 [ 9.317200] ? rest_init+0xe0/0xe0 [ 9.317204] kernel_init+0x1a/0x130 [ 9.317205] ret_from_fork+0x22/0x30 [ 9.317208] </TASK> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> --- drivers/acpi/apei/ghes.c | 2 +- include/acpi/ghes.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 80ad530583c9c..9952f3a792bad 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -163,7 +163,7 @@ static void ghes_unmap(void __iomem *vaddr, enum fixed_addresses fixmap_idx) clear_fixmap(fixmap_idx); } -int ghes_estatus_pool_init(int num_ghes) +int ghes_estatus_pool_init(unsigned int num_ghes) { unsigned long addr, len; int rc; diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index 34fb3431a8f36..292a5c40bd0c6 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -71,7 +71,7 @@ int ghes_register_vendor_record_notifier(struct notifier_block *nb); void ghes_unregister_vendor_record_notifier(struct notifier_block *nb); #endif -int ghes_estatus_pool_init(int num_ghes); +int ghes_estatus_pool_init(unsigned int num_ghes); /* From drivers/edac/ghes_edac.c */ -- GitLab From f6ec01da40e4139b41179f046044ee7c4f6370dc Mon Sep 17 00:00:00 2001 From: Tony Luck <tony.luck@intel.com> Date: Mon, 10 Oct 2022 13:34:23 -0700 Subject: [PATCH 1941/2223] ACPI: extlog: Handle multiple records If there is no user space consumer of extlog_mem trace records, then Linux properly handles multiple error records in an ELOG block extlog_print() print_extlog_rcd() __print_extlog_rcd() cper_estatus_print() apei_estatus_for_each_section() But the other code path hard codes looking for a single record to output a trace record. 
Fix by using the same apei_estatus_for_each_section() iterator to step over all records. Fixes: 2dfb7d51a61d ("trace, RAS: Add eMCA trace event interface") Signed-off-by: Tony Luck <tony.luck@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> --- drivers/acpi/acpi_extlog.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index 72f1fb77abcd0..e648158368a7d 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -12,6 +12,7 @@ #include <linux/ratelimit.h> #include <linux/edac.h> #include <linux/ras.h> +#include <acpi/ghes.h> #include <asm/cpu.h> #include <asm/mce.h> @@ -138,8 +139,8 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, int cpu = mce->extcpu; struct acpi_hest_generic_status *estatus, *tmp; struct acpi_hest_generic_data *gdata; - const guid_t *fru_id = &guid_null; - char *fru_text = ""; + const guid_t *fru_id; + char *fru_text; guid_t *sec_type; static u32 err_seq; @@ -160,17 +161,23 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, /* log event via trace */ err_seq++; - gdata = (struct acpi_hest_generic_data *)(tmp + 1); - if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID) - fru_id = (guid_t *)gdata->fru_id; - if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT) - fru_text = gdata->fru_text; - sec_type = (guid_t *)gdata->section_type; - if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) { - struct cper_sec_mem_err *mem = (void *)(gdata + 1); - if (gdata->error_data_length >= sizeof(*mem)) - trace_extlog_mem_event(mem, err_seq, fru_id, fru_text, - (u8)gdata->error_severity); + apei_estatus_for_each_section(tmp, gdata) { + if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID) + fru_id = (guid_t *)gdata->fru_id; + else + fru_id = &guid_null; + if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT) + fru_text = gdata->fru_text; + else + fru_text = ""; + sec_type = 
(guid_t *)gdata->section_type; + if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) { + struct cper_sec_mem_err *mem = (void *)(gdata + 1); + + if (gdata->error_data_length >= sizeof(*mem)) + trace_extlog_mem_event(mem, err_seq, fru_id, fru_text, + (u8)gdata->error_severity); + } } out: -- GitLab From bfcdf58380b1d9be564a78a9370da722ed1a9965 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org> Date: Tue, 4 Oct 2022 12:33:40 +0200 Subject: [PATCH 1942/2223] ACPI: resource: do IRQ override on LENOVO IdeaPad LENOVO IdeaPad Flex 5 is ryzen-5 based and the commit below removed IRQ overriding for those. This broke touchscreen and trackpad: i2c_designware AMDI0010:00: controller timed out i2c_designware AMDI0010:03: controller timed out i2c_hid_acpi i2c-MSFT0001:00: failed to reset device: -61 i2c_designware AMDI0010:03: controller timed out ... i2c_hid_acpi i2c-MSFT0001:00: can't add hid device: -61 i2c_hid_acpi: probe of i2c-MSFT0001:00 failed with error -61 White-list this specific model in the override_table. For this to work, the ZEN test needs to be put below the table walk. Fixes: 37c81d9f1d1b (ACPI: resource: skip IRQ override on AMD Zen platforms) Link: https://bugzilla.suse.com/show_bug.cgi?id=1203794 Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org> Signed-off-by: Rafael J. 
Wysocki <rafael.j.wysocki@intel.com> --- drivers/acpi/resource.c | 42 +++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 6f9489edfb4ee..efa92bc8c3036 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -428,17 +428,31 @@ static const struct dmi_system_id asus_laptop[] = { { } }; +static const struct dmi_system_id lenovo_82ra[] = { + { + .ident = "LENOVO IdeaPad Flex 5 16ALC7", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "82RA"), + }, + }, + { } +}; + struct irq_override_cmp { const struct dmi_system_id *system; unsigned char irq; unsigned char triggering; unsigned char polarity; unsigned char shareable; + bool override; }; -static const struct irq_override_cmp skip_override_table[] = { - { medion_laptop, 1, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0 }, - { asus_laptop, 1, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0 }, +static const struct irq_override_cmp override_table[] = { + { medion_laptop, 1, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, false }, + { asus_laptop, 1, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, false }, + { lenovo_82ra, 6, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, true }, + { lenovo_82ra, 10, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, true }, }; static bool acpi_dev_irq_override(u32 gsi, u8 triggering, u8 polarity, @@ -446,6 +460,17 @@ static bool acpi_dev_irq_override(u32 gsi, u8 triggering, u8 polarity, { int i; + for (i = 0; i < ARRAY_SIZE(override_table); i++) { + const struct irq_override_cmp *entry = &override_table[i]; + + if (dmi_check_system(entry->system) && + entry->irq == gsi && + entry->triggering == triggering && + entry->polarity == polarity && + entry->shareable == shareable) + return entry->override; + } + #ifdef CONFIG_X86 /* * IRQ override isn't needed on modern AMD Zen systems and @@ -456,17 +481,6 @@ static bool acpi_dev_irq_override(u32 gsi, u8 triggering, u8 polarity, 
return false; #endif - for (i = 0; i < ARRAY_SIZE(skip_override_table); i++) { - const struct irq_override_cmp *entry = &skip_override_table[i]; - - if (dmi_check_system(entry->system) && - entry->irq == gsi && - entry->triggering == triggering && - entry->polarity == polarity && - entry->shareable == shareable) - return false; - } - return true; } -- GitLab From 4ef96d4dc8e1c418260abf817a90a3adb2d386ac Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org> Date: Tue, 4 Oct 2022 12:33:41 +0200 Subject: [PATCH 1943/2223] ACPI: resource: note more about IRQ override Use an exclamation mark to note which of the properties was overridden. Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> --- drivers/acpi/resource.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index efa92bc8c3036..78c2804164c6f 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -512,8 +512,11 @@ static void acpi_dev_get_irqresource(struct resource *res, u32 gsi, u8 pol = p ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; if (triggering != trig || polarity != pol) { - pr_warn("ACPI: IRQ %d override to %s, %s\n", gsi, - t ? "level" : "edge", p ? "low" : "high"); + pr_warn("ACPI: IRQ %d override to %s%s, %s%s\n", gsi, + t ? "level" : "edge", + trig == triggering ? "" : "(!)", + p ? "low" : "high", + pol == polarity ? "" : "(!)"); triggering = trig; polarity = pol; } -- GitLab From 9cc205e3c17d5716da7ebb7fa0c985555e95d009 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@orcam.me.uk> Date: Thu, 22 Sep 2022 22:56:06 +0100 Subject: [PATCH 1944/2223] RISC-V: Make port I/O string accessors actually work Fix port I/O string accessors such as `insb', `outsb', etc. 
which use the physical PCI port I/O address rather than the corresponding memory mapping to get at the requested location, which in turn breaks at least accesses made by our parport driver to a PCIe parallel port such as: PCI parallel port detected: 1415:c118, I/O at 0x1000(0x1008), IRQ 20 parport0: PC-style at 0x1000 (0x1008), irq 20, using FIFO [PCSPP,TRISTATE,COMPAT,EPP,ECP] causing a memory access fault: Unable to handle kernel access to user memory without uaccess routines at virtual address 0000000000001008 Oops [#1] Modules linked in: CPU: 1 PID: 350 Comm: cat Not tainted 6.0.0-rc2-00283-g10d4879f9ef0-dirty #23 Hardware name: SiFive HiFive Unmatched A00 (DT) epc : parport_pc_fifo_write_block_pio+0x266/0x416 ra : parport_pc_fifo_write_block_pio+0xb4/0x416 epc : ffffffff80542c3e ra : ffffffff80542a8c sp : ffffffd88899fc60 gp : ffffffff80fa2700 tp : ffffffd882b1e900 t0 : ffffffd883d0b000 t1 : ffffffffff000002 t2 : 4646393043330a38 s0 : ffffffd88899fcf0 s1 : 0000000000001000 a0 : 0000000000000010 a1 : 0000000000000000 a2 : ffffffd883d0a010 a3 : 0000000000000023 a4 : 00000000ffff8fbb a5 : ffffffd883d0a001 a6 : 0000000100000000 a7 : ffffffc800000000 s2 : ffffffffff000002 s3 : ffffffff80d28880 s4 : ffffffff80fa1f50 s5 : 0000000000001008 s6 : 0000000000000008 s7 : ffffffd883d0a000 s8 : 0004000000000000 s9 : ffffffff80dc1d80 s10: ffffffd8807e4000 s11: 0000000000000000 t3 : 00000000000000ff t4 : 393044410a303930 t5 : 0000000000001000 t6 : 0000000000040000 status: 0000000200000120 badaddr: 0000000000001008 cause: 000000000000000f [<ffffffff80543212>] parport_pc_compat_write_block_pio+0xfe/0x200 [<ffffffff8053bbc0>] parport_write+0x46/0xf8 [<ffffffff8050530e>] lp_write+0x158/0x2d2 [<ffffffff80185716>] vfs_write+0x8e/0x2c2 [<ffffffff80185a74>] ksys_write+0x52/0xc2 [<ffffffff80185af2>] sys_write+0xe/0x16 [<ffffffff80003770>] ret_from_syscall+0x0/0x2 ---[ end trace 0000000000000000 ]--- For simplicity address the problem by adding PCI_IOBASE to the physical address 
requested in the respective wrapper macros only, observing that the raw accessors such as `__insb', `__outsb', etc. are not supposed to be used other than by said macros. Remove the cast to `long' that is no longer needed on `addr' now that it is used as an offset from PCI_IOBASE and add parentheses around `addr' needed for predictable evaluation in macro expansion. No need to make said adjustments in separate changes given that current code is gravely broken and does not ever work. Signed-off-by: Maciej W. Rozycki <macro@orcam.me.uk> Fixes: fab957c11efe2 ("RISC-V: Atomic and Locking Code") Cc: stable@vger.kernel.org # v4.15+ Reviewed-by: Arnd Bergmann <arnd@arndb.de> Link: https://lore.kernel.org/r/alpine.DEB.2.21.2209220223080.29493@angie.orcam.me.uk Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- arch/riscv/include/asm/io.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index 69605a4742706..92080a2279372 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -101,9 +101,9 @@ __io_reads_ins(reads, u32, l, __io_br(), __io_ar(addr)) __io_reads_ins(ins, u8, b, __io_pbr(), __io_par(addr)) __io_reads_ins(ins, u16, w, __io_pbr(), __io_par(addr)) __io_reads_ins(ins, u32, l, __io_pbr(), __io_par(addr)) -#define insb(addr, buffer, count) __insb((void __iomem *)(long)addr, buffer, count) -#define insw(addr, buffer, count) __insw((void __iomem *)(long)addr, buffer, count) -#define insl(addr, buffer, count) __insl((void __iomem *)(long)addr, buffer, count) +#define insb(addr, buffer, count) __insb(PCI_IOBASE + (addr), buffer, count) +#define insw(addr, buffer, count) __insw(PCI_IOBASE + (addr), buffer, count) +#define insl(addr, buffer, count) __insl(PCI_IOBASE + (addr), buffer, count) __io_writes_outs(writes, u8, b, __io_bw(), __io_aw()) __io_writes_outs(writes, u16, w, __io_bw(), __io_aw()) @@ -115,22 +115,22 @@ __io_writes_outs(writes, u32, l, 
__io_bw(), __io_aw()) __io_writes_outs(outs, u8, b, __io_pbw(), __io_paw()) __io_writes_outs(outs, u16, w, __io_pbw(), __io_paw()) __io_writes_outs(outs, u32, l, __io_pbw(), __io_paw()) -#define outsb(addr, buffer, count) __outsb((void __iomem *)(long)addr, buffer, count) -#define outsw(addr, buffer, count) __outsw((void __iomem *)(long)addr, buffer, count) -#define outsl(addr, buffer, count) __outsl((void __iomem *)(long)addr, buffer, count) +#define outsb(addr, buffer, count) __outsb(PCI_IOBASE + (addr), buffer, count) +#define outsw(addr, buffer, count) __outsw(PCI_IOBASE + (addr), buffer, count) +#define outsl(addr, buffer, count) __outsl(PCI_IOBASE + (addr), buffer, count) #ifdef CONFIG_64BIT __io_reads_ins(reads, u64, q, __io_br(), __io_ar(addr)) #define readsq(addr, buffer, count) __readsq(addr, buffer, count) __io_reads_ins(ins, u64, q, __io_pbr(), __io_par(addr)) -#define insq(addr, buffer, count) __insq((void __iomem *)addr, buffer, count) +#define insq(addr, buffer, count) __insq(PCI_IOBASE + (addr), buffer, count) __io_writes_outs(writes, u64, q, __io_bw(), __io_aw()) #define writesq(addr, buffer, count) __writesq(addr, buffer, count) __io_writes_outs(outs, u64, q, __io_pbr(), __io_paw()) -#define outsq(addr, buffer, count) __outsq((void __iomem *)addr, buffer, count) +#define outsq(addr, buffer, count) __outsq(PCI_IOBASE + (addr), buffer, count) #endif #include <asm-generic/io.h> -- GitLab From 4919d3eb2ec0ee364f7e3cf2d99646c1b224fae8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com> Date: Wed, 12 Oct 2022 20:07:01 +0200 Subject: [PATCH 1945/2223] rtc: cmos: Fix event handler registration ordering issue Because acpi_install_fixed_event_handler() enables the event automatically on success, it is incorrect to call it before the handler routine passed to it is ready to handle events. 
Unfortunately, the rtc-cmos driver does exactly the incorrect thing by calling cmos_wake_setup(), which passes rtc_handler() to acpi_install_fixed_event_handler(), before cmos_do_probe(), because rtc_handler() uses dev_get_drvdata() to get to the cmos object pointer and the driver data pointer is only populated in cmos_do_probe(). This leads to a NULL pointer dereference in rtc_handler() on boot if the RTC fixed event happens to be active at the init time. To address this issue, change the initialization ordering of the driver so that cmos_wake_setup() is always called after a successful cmos_do_probe() call. While at it, change cmos_pnp_probe() to call cmos_do_probe() after the initial if () statement used for computing the IRQ argument to be passed to cmos_do_probe() which is cleaner than calling it in each branch of that if () (local variable "irq" can be of type int, because it is passed to that function as an argument of type int). Note that commit 6492fed7d8c9 ("rtc: rtc-cmos: Do not check ACPI_FADT_LOW_POWER_S0") caused this issue to affect a larger number of systems, because previously it only affected systems with ACPI_FADT_LOW_POWER_S0 set, but it is present regardless of that commit. Fixes: 6492fed7d8c9 ("rtc: rtc-cmos: Do not check ACPI_FADT_LOW_POWER_S0") Fixes: a474aaedac99 ("rtc-cmos: move wake setup from ACPI glue into RTC driver") Link: https://lore.kernel.org/linux-acpi/20221010141630.zfzi7mk7zvnmclzy@techsingularity.net/ Reported-by: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Rafael J. 
Wysocki <rafael.j.wysocki@intel.com> Reviewed-by: Bjorn Helgaas <bhelgaas@google.com> Tested-by: Mel Gorman <mgorman@techsingularity.net> Link: https://lore.kernel.org/r/5629262.DvuYhMxLoT@kreacher Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/rtc/rtc-cmos.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index bdb1df843c78d..610413b4e9ca7 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -1352,10 +1352,10 @@ static void cmos_check_acpi_rtc_status(struct device *dev, static int cmos_pnp_probe(struct pnp_dev *pnp, const struct pnp_device_id *id) { - cmos_wake_setup(&pnp->dev); + int irq, ret; if (pnp_port_start(pnp, 0) == 0x70 && !pnp_irq_valid(pnp, 0)) { - unsigned int irq = 0; + irq = 0; #ifdef CONFIG_X86 /* Some machines contain a PNP entry for the RTC, but * don't define the IRQ. It should always be safe to @@ -1364,13 +1364,17 @@ static int cmos_pnp_probe(struct pnp_dev *pnp, const struct pnp_device_id *id) if (nr_legacy_irqs()) irq = RTC_IRQ; #endif - return cmos_do_probe(&pnp->dev, - pnp_get_resource(pnp, IORESOURCE_IO, 0), irq); } else { - return cmos_do_probe(&pnp->dev, - pnp_get_resource(pnp, IORESOURCE_IO, 0), - pnp_irq(pnp, 0)); + irq = pnp_irq(pnp, 0); } + + ret = cmos_do_probe(&pnp->dev, pnp_get_resource(pnp, IORESOURCE_IO, 0), irq); + if (ret) + return ret; + + cmos_wake_setup(&pnp->dev); + + return 0; } static void cmos_pnp_remove(struct pnp_dev *pnp) @@ -1454,10 +1458,9 @@ static inline void cmos_of_init(struct platform_device *pdev) {} static int __init cmos_platform_probe(struct platform_device *pdev) { struct resource *resource; - int irq; + int irq, ret; cmos_of_init(pdev); - cmos_wake_setup(&pdev->dev); if (RTC_IOMAPPED) resource = platform_get_resource(pdev, IORESOURCE_IO, 0); @@ -1467,7 +1470,13 @@ static int __init cmos_platform_probe(struct platform_device *pdev) if (irq < 0) irq = -1; - return 
cmos_do_probe(&pdev->dev, resource, irq); + ret = cmos_do_probe(&pdev->dev, resource, irq); + if (ret) + return ret; + + cmos_wake_setup(&pdev->dev); + + return 0; } static int cmos_platform_remove(struct platform_device *pdev) -- GitLab From e5f12a398371280649ccc9d6eb0b97fd42a5df98 Mon Sep 17 00:00:00 2001 From: Ke Sun <sunke@kylinos.cn> Date: Sat, 8 Oct 2022 15:13:21 +0800 Subject: [PATCH 1946/2223] rtc: rv3028: Fix codestyle errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiler warnings: drivers/rtc/rtc-rv3028.c: In function 'rv3028_param_set': drivers/rtc/rtc-rv3028.c:559:20: warning: statement will never be executed [-Wswitch-unreachable] 559 | u8 mode; | ^~~~ drivers/rtc/rtc-rv3028.c: In function 'rv3028_param_get': drivers/rtc/rtc-rv3028.c:526:21: warning: statement will never be executed [-Wswitch-unreachable] 526 | u32 value; | ^~~~~ Fix it by moving the variable declaration to the beginning of the function. Cc: Alessandro Zummo <a.zummo@towertech.it> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: linux-rtc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reported-by: k2ci <kernel-bot@kylinos.cn> Signed-off-by: Ke Sun <sunke@kylinos.cn> Link: https://lore.kernel.org/r/20221008071321.1799971-1-sunke@kylinos.cn Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> --- drivers/rtc/rtc-rv3028.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-rv3028.c b/drivers/rtc/rtc-rv3028.c index cdc623b3e365b..dd170e3efd83e 100644 --- a/drivers/rtc/rtc-rv3028.c +++ b/drivers/rtc/rtc-rv3028.c @@ -521,10 +521,9 @@ static int rv3028_param_get(struct device *dev, struct rtc_param *param) { struct rv3028_data *rv3028 = dev_get_drvdata(dev); int ret; + u32 value; switch(param->param) { - u32 value; - case RTC_PARAM_BACKUP_SWITCH_MODE: ret = regmap_read(rv3028->regmap, RV3028_BACKUP, &value); if (ret < 0) @@ -554,9 +553,9 @@ static int rv3028_param_get(struct device 
*dev, struct rtc_param *param) static int rv3028_param_set(struct device *dev, struct rtc_param *param) { struct rv3028_data *rv3028 = dev_get_drvdata(dev); + u8 mode; switch(param->param) { - u8 mode; case RTC_PARAM_BACKUP_SWITCH_MODE: switch (param->uvalue) { case RTC_BSM_DISABLED: -- GitLab From ab0c23b535f3f9d8345d8ad4c18c0a8594459d55 Mon Sep 17 00:00:00 2001 From: Conor Dooley <conor.dooley@microchip.com> Date: Tue, 11 Oct 2022 17:07:45 +0100 Subject: [PATCH 1947/2223] MAINTAINERS: add RISC-V's patchwork The RISC-V patchwork instance on kernel.org has had some necromancy performed on it & will be used going forward. The statuses that are intended to be used are: - New: No action has been taken yet - Under Review: The maintainer is waiting for review comments from others - Changes Requested: Either the maintainer or a reviewer requested changes in the patch. The patch author is expected to submit a new version - Superseded: There's a new version of the patch available - Not Applicable: The patch is not intended for the RISC-V tree - Accepted: The patch has been applied - Rejected: The patch has been rejected, with reasons stated in an email Signed-off-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/20221011160744.2167025-1-conor@kernel.org/ Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index b3415857a812c..557da4a327179 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17634,6 +17634,7 @@ M: Palmer Dabbelt <palmer@dabbelt.com> M: Albert Ou <aou@eecs.berkeley.edu> L: linux-riscv@lists.infradead.org S: Supported +Q: https://patchwork.kernel.org/project/linux-riscv/list/ P: Documentation/riscv/patch-acceptance.rst T: git git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git F: arch/riscv/ -- GitLab From 28be7ca4fcfd69a2d52aaa331adbf9dbe91f9e6e Mon Sep 17 00:00:00 2001 From: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz> Date: 
Mon, 10 Oct 2022 15:46:13 +1300 Subject: [PATCH 1948/2223] tipc: Fix recognition of trial period The trial period exists until jiffies is after addr_trial_end. But as jiffies will eventually overflow, just using time_after will eventually give incorrect results. As the node address is set once the trial period ends, this can be used to know that we are not in the trial period. Fixes: e415577f57f4 ("tipc: correct discovery message handling during address trial period") Signed-off-by: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/discover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/discover.c b/net/tipc/discover.c index da69e1abf68ff..e8630707901e3 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -148,8 +148,8 @@ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, { struct net *net = d->net; struct tipc_net *tn = tipc_net(net); - bool trial = time_before(jiffies, tn->addr_trial_end); u32 self = tipc_own_addr(net); + bool trial = time_before(jiffies, tn->addr_trial_end) && !self; if (mtyp == DSC_TRIAL_FAIL_MSG) { if (!trial) -- GitLab From 777ecaabd614d47c482a5c9031579e66da13989a Mon Sep 17 00:00:00 2001 From: Alexander Potapenko <glider@google.com> Date: Wed, 12 Oct 2022 17:25:14 +0200 Subject: [PATCH 1949/2223] tipc: fix an information leak in tipc_topsrv_kern_subscr Use an 8-byte write to initialize sub.usr_handle in tipc_topsrv_kern_subscr(), otherwise four bytes remain uninitialized when issuing setsockopt(..., SOL_TIPC, ...).
This resulted in an infoleak reported by KMSAN when the packet was received: ===================================================== BUG: KMSAN: kernel-infoleak in copyout+0xbc/0x100 lib/iov_iter.c:169 instrument_copy_to_user ./include/linux/instrumented.h:121 copyout+0xbc/0x100 lib/iov_iter.c:169 _copy_to_iter+0x5c0/0x20a0 lib/iov_iter.c:527 copy_to_iter ./include/linux/uio.h:176 simple_copy_to_iter+0x64/0xa0 net/core/datagram.c:513 __skb_datagram_iter+0x123/0xdc0 net/core/datagram.c:419 skb_copy_datagram_iter+0x58/0x200 net/core/datagram.c:527 skb_copy_datagram_msg ./include/linux/skbuff.h:3903 packet_recvmsg+0x521/0x1e70 net/packet/af_packet.c:3469 ____sys_recvmsg+0x2c4/0x810 net/socket.c:? ___sys_recvmsg+0x217/0x840 net/socket.c:2743 __sys_recvmsg net/socket.c:2773 __do_sys_recvmsg net/socket.c:2783 __se_sys_recvmsg net/socket.c:2780 __x64_sys_recvmsg+0x364/0x540 net/socket.c:2780 do_syscall_x64 arch/x86/entry/common.c:50 do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd arch/x86/entry/entry_64.S:120 ... 
Uninit was stored to memory at: tipc_sub_subscribe+0x42d/0xb50 net/tipc/subscr.c:156 tipc_conn_rcv_sub+0x246/0x620 net/tipc/topsrv.c:375 tipc_topsrv_kern_subscr+0x2e8/0x400 net/tipc/topsrv.c:579 tipc_group_create+0x4e7/0x7d0 net/tipc/group.c:190 tipc_sk_join+0x2a8/0x770 net/tipc/socket.c:3084 tipc_setsockopt+0xae5/0xe40 net/tipc/socket.c:3201 __sys_setsockopt+0x87f/0xdc0 net/socket.c:2252 __do_sys_setsockopt net/socket.c:2263 __se_sys_setsockopt net/socket.c:2260 __x64_sys_setsockopt+0xe0/0x160 net/socket.c:2260 do_syscall_x64 arch/x86/entry/common.c:50 do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd arch/x86/entry/entry_64.S:120 Local variable sub created at: tipc_topsrv_kern_subscr+0x57/0x400 net/tipc/topsrv.c:562 tipc_group_create+0x4e7/0x7d0 net/tipc/group.c:190 Bytes 84-87 of 88 are uninitialized Memory access of size 88 starts at ffff88801ed57cd0 Data copied to user address 0000000020000400 ... ===================================================== Signed-off-by: Alexander Potapenko <glider@google.com> Fixes: 026321c6d056a5 ("tipc: rename tipc_server to tipc_topsrv") Signed-off-by: David S. 
Miller <davem@davemloft.net> --- net/tipc/topsrv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 5522865deae95..14fd05fd6107d 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -568,7 +568,7 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, sub.seq.upper = upper; sub.timeout = TIPC_WAIT_FOREVER; sub.filter = filter; - *(u32 *)&sub.usr_handle = port; + *(u64 *)&sub.usr_handle = (u64)port; con = tipc_conn_alloc(tipc_topsrv(net)); if (IS_ERR(con)) -- GitLab From a2550d3ce53c68f54042bc5e468c4d07491ffe0e Mon Sep 17 00:00:00 2001 From: Christian Marangi <ansuelsmth@gmail.com> Date: Wed, 12 Oct 2022 19:18:36 +0200 Subject: [PATCH 1950/2223] net: dsa: qca8k: fix inband mgmt for big-endian systems The header and the data of the skb for the inband mgmt requires to be in little-endian. This is problematic for big-endian system as the mgmt header is written in the cpu byte order. Fix this by converting each value for the mgmt header and data to little-endian, and convert to cpu byte order the mgmt header and data sent by the switch. Fixes: 5950c7c0a68c ("net: dsa: qca8k: add support for mgmt read/write in Ethernet packet") Tested-by: Pawel Dembicki <paweldembicki@gmail.com> Tested-by: Lech Perczak <lech.perczak@gmail.com> Signed-off-by: Christian Marangi <ansuelsmth@gmail.com> Reviewed-by: Lech Perczak <lech.perczak@gmail.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- drivers/net/dsa/qca/qca8k-8xxx.c | 63 ++++++++++++++++++++++++-------- include/linux/dsa/tag_qca.h | 6 +-- 2 files changed, 50 insertions(+), 19 deletions(-) diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c index 5669c92c93f7a..644338ca0510a 100644 --- a/drivers/net/dsa/qca/qca8k-8xxx.c +++ b/drivers/net/dsa/qca/qca8k-8xxx.c @@ -137,27 +137,42 @@ static void qca8k_rw_reg_ack_handler(struct dsa_switch *ds, struct sk_buff *skb) struct qca8k_mgmt_eth_data *mgmt_eth_data; struct qca8k_priv *priv = ds->priv; struct qca_mgmt_ethhdr *mgmt_ethhdr; + u32 command; u8 len, cmd; + int i; mgmt_ethhdr = (struct qca_mgmt_ethhdr *)skb_mac_header(skb); mgmt_eth_data = &priv->mgmt_eth_data; - cmd = FIELD_GET(QCA_HDR_MGMT_CMD, mgmt_ethhdr->command); - len = FIELD_GET(QCA_HDR_MGMT_LENGTH, mgmt_ethhdr->command); + command = get_unaligned_le32(&mgmt_ethhdr->command); + cmd = FIELD_GET(QCA_HDR_MGMT_CMD, command); + len = FIELD_GET(QCA_HDR_MGMT_LENGTH, command); /* Make sure the seq match the requested packet */ - if (mgmt_ethhdr->seq == mgmt_eth_data->seq) + if (get_unaligned_le32(&mgmt_ethhdr->seq) == mgmt_eth_data->seq) mgmt_eth_data->ack = true; if (cmd == MDIO_READ) { - mgmt_eth_data->data[0] = mgmt_ethhdr->mdio_data; + u32 *val = mgmt_eth_data->data; + + *val = get_unaligned_le32(&mgmt_ethhdr->mdio_data); /* Get the rest of the 12 byte of data. * The read/write function will extract the requested data. 
*/ - if (len > QCA_HDR_MGMT_DATA1_LEN) - memcpy(mgmt_eth_data->data + 1, skb->data, - QCA_HDR_MGMT_DATA2_LEN); + if (len > QCA_HDR_MGMT_DATA1_LEN) { + __le32 *data2 = (__le32 *)skb->data; + int data_len = min_t(int, QCA_HDR_MGMT_DATA2_LEN, + len - QCA_HDR_MGMT_DATA1_LEN); + + val++; + + for (i = sizeof(u32); i <= data_len; i += sizeof(u32)) { + *val = get_unaligned_le32(data2); + val++; + data2++; + } + } } complete(&mgmt_eth_data->rw_done); @@ -169,8 +184,10 @@ static struct sk_buff *qca8k_alloc_mdio_header(enum mdio_cmd cmd, u32 reg, u32 * struct qca_mgmt_ethhdr *mgmt_ethhdr; unsigned int real_len; struct sk_buff *skb; - u32 *data2; + __le32 *data2; + u32 command; u16 hdr; + int i; skb = dev_alloc_skb(QCA_HDR_MGMT_PKT_LEN); if (!skb) @@ -199,20 +216,32 @@ static struct sk_buff *qca8k_alloc_mdio_header(enum mdio_cmd cmd, u32 reg, u32 * hdr |= FIELD_PREP(QCA_HDR_XMIT_DP_BIT, BIT(0)); hdr |= FIELD_PREP(QCA_HDR_XMIT_CONTROL, QCA_HDR_XMIT_TYPE_RW_REG); - mgmt_ethhdr->command = FIELD_PREP(QCA_HDR_MGMT_ADDR, reg); - mgmt_ethhdr->command |= FIELD_PREP(QCA_HDR_MGMT_LENGTH, real_len); - mgmt_ethhdr->command |= FIELD_PREP(QCA_HDR_MGMT_CMD, cmd); - mgmt_ethhdr->command |= FIELD_PREP(QCA_HDR_MGMT_CHECK_CODE, + command = FIELD_PREP(QCA_HDR_MGMT_ADDR, reg); + command |= FIELD_PREP(QCA_HDR_MGMT_LENGTH, real_len); + command |= FIELD_PREP(QCA_HDR_MGMT_CMD, cmd); + command |= FIELD_PREP(QCA_HDR_MGMT_CHECK_CODE, QCA_HDR_MGMT_CHECK_CODE_VAL); + put_unaligned_le32(command, &mgmt_ethhdr->command); + if (cmd == MDIO_WRITE) - mgmt_ethhdr->mdio_data = *val; + put_unaligned_le32(*val, &mgmt_ethhdr->mdio_data); mgmt_ethhdr->hdr = htons(hdr); data2 = skb_put_zero(skb, QCA_HDR_MGMT_DATA2_LEN + QCA_HDR_MGMT_PADDING_LEN); - if (cmd == MDIO_WRITE && len > QCA_HDR_MGMT_DATA1_LEN) - memcpy(data2, val + 1, len - QCA_HDR_MGMT_DATA1_LEN); + if (cmd == MDIO_WRITE && len > QCA_HDR_MGMT_DATA1_LEN) { + int data_len = min_t(int, QCA_HDR_MGMT_DATA2_LEN, + len - QCA_HDR_MGMT_DATA1_LEN); + + val++; + + for 
(i = sizeof(u32); i <= data_len; i += sizeof(u32)) { + put_unaligned_le32(*val, data2); + data2++; + val++; + } + } return skb; } @@ -220,9 +249,11 @@ static struct sk_buff *qca8k_alloc_mdio_header(enum mdio_cmd cmd, u32 reg, u32 * static void qca8k_mdio_header_fill_seq_num(struct sk_buff *skb, u32 seq_num) { struct qca_mgmt_ethhdr *mgmt_ethhdr; + u32 seq; + seq = FIELD_PREP(QCA_HDR_MGMT_SEQ_NUM, seq_num); mgmt_ethhdr = (struct qca_mgmt_ethhdr *)skb->data; - mgmt_ethhdr->seq = FIELD_PREP(QCA_HDR_MGMT_SEQ_NUM, seq_num); + put_unaligned_le32(seq, &mgmt_ethhdr->seq); } static int qca8k_read_eth(struct qca8k_priv *priv, u32 reg, u32 *val, int len) diff --git a/include/linux/dsa/tag_qca.h b/include/linux/dsa/tag_qca.h index 50be7cbd93a5b..0e176da1e43f4 100644 --- a/include/linux/dsa/tag_qca.h +++ b/include/linux/dsa/tag_qca.h @@ -61,9 +61,9 @@ struct sk_buff; /* Special struct emulating a Ethernet header */ struct qca_mgmt_ethhdr { - u32 command; /* command bit 31:0 */ - u32 seq; /* seq 63:32 */ - u32 mdio_data; /* first 4byte mdio */ + __le32 command; /* command bit 31:0 */ + __le32 seq; /* seq 63:32 */ + __le32 mdio_data; /* first 4byte mdio */ __be16 hdr; /* qca hdr */ } __packed; -- GitLab From 0d4636f7d72df3179b20a2d32b647881917a5e2a Mon Sep 17 00:00:00 2001 From: Christian Marangi <ansuelsmth@gmail.com> Date: Wed, 12 Oct 2022 19:18:37 +0200 Subject: [PATCH 1951/2223] net: dsa: qca8k: fix ethtool autocast mib for big-endian systems The switch sends autocast mib in little-endian. This is problematic for big-endian system as the values needs to be converted. Fix this by converting each mib value to cpu byte order. Fixes: 5c957c7ca78c ("net: dsa: qca8k: add support for mib autocast in Ethernet packet") Tested-by: Pawel Dembicki <paweldembicki@gmail.com> Tested-by: Lech Perczak <lech.perczak@gmail.com> Signed-off-by: Christian Marangi <ansuelsmth@gmail.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- drivers/net/dsa/qca/qca8k-8xxx.c | 20 ++++++++------------ include/linux/dsa/tag_qca.h | 2 +- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c index 644338ca0510a..c5c3b4e92f28b 100644 --- a/drivers/net/dsa/qca/qca8k-8xxx.c +++ b/drivers/net/dsa/qca/qca8k-8xxx.c @@ -1518,9 +1518,9 @@ static void qca8k_mib_autocast_handler(struct dsa_switch *ds, struct sk_buff *sk struct qca8k_priv *priv = ds->priv; const struct qca8k_mib_desc *mib; struct mib_ethhdr *mib_ethhdr; - int i, mib_len, offset = 0; - u64 *data; + __le32 *data2; u8 port; + int i; mib_ethhdr = (struct mib_ethhdr *)skb_mac_header(skb); mib_eth_data = &priv->mib_eth_data; @@ -1532,28 +1532,24 @@ static void qca8k_mib_autocast_handler(struct dsa_switch *ds, struct sk_buff *sk if (port != mib_eth_data->req_port) goto exit; - data = mib_eth_data->data; + data2 = (__le32 *)skb->data; for (i = 0; i < priv->info->mib_count; i++) { mib = &ar8327_mib[i]; /* First 3 mib are present in the skb head */ if (i < 3) { - data[i] = mib_ethhdr->data[i]; + mib_eth_data->data[i] = get_unaligned_le32(mib_ethhdr->data + i); continue; } - mib_len = sizeof(uint32_t); - /* Some mib are 64 bit wide */ if (mib->size == 2) - mib_len = sizeof(uint64_t); - - /* Copy the mib value from packet to the */ - memcpy(data + i, skb->data + offset, mib_len); + mib_eth_data->data[i] = get_unaligned_le64((__le64 *)data2); + else + mib_eth_data->data[i] = get_unaligned_le32(data2); - /* Set the offset for the next mib */ - offset += mib_len; + data2 += mib->size; } exit: diff --git a/include/linux/dsa/tag_qca.h b/include/linux/dsa/tag_qca.h index 0e176da1e43f4..b1b5720d89a59 100644 --- a/include/linux/dsa/tag_qca.h +++ b/include/linux/dsa/tag_qca.h @@ -73,7 +73,7 @@ enum mdio_cmd { }; struct mib_ethhdr { - u32 data[3]; /* first 3 mib counter */ + __le32 data[3]; /* first 3 mib counter */ __be16 hdr; /* qca hdr */ } __packed; -- GitLab 
From aae425efdfd1b1d8452260a3cb49344ebf20b1f5 Mon Sep 17 00:00:00 2001 From: Jan Sokolowski <jan.sokolowski@intel.com> Date: Wed, 12 Oct 2022 13:54:40 -0700 Subject: [PATCH 1952/2223] i40e: Fix DMA mappings leak During reallocation of RX buffers, new DMA mappings are created for those buffers. steps for reproduction: while : do for ((i=0; i<=8160; i=i+32)) do ethtool -G enp130s0f0 rx $i tx $i sleep 0.5 ethtool -g enp130s0f0 done done This resulted in crash: i40e 0000:01:00.1: Unable to allocate memory for the Rx descriptor ring, size=65536 Driver BUG WARNING: CPU: 0 PID: 4300 at net/core/xdp.c:141 xdp_rxq_info_unreg+0x43/0x50 Call Trace: i40e_free_rx_resources+0x70/0x80 [i40e] i40e_set_ringparam+0x27c/0x800 [i40e] ethnl_set_rings+0x1b2/0x290 genl_family_rcv_msg_doit.isra.15+0x10f/0x150 genl_family_rcv_msg+0xb3/0x160 ? rings_fill_reply+0x1a0/0x1a0 genl_rcv_msg+0x47/0x90 ? genl_family_rcv_msg+0x160/0x160 netlink_rcv_skb+0x4c/0x120 genl_rcv+0x24/0x40 netlink_unicast+0x196/0x230 netlink_sendmsg+0x204/0x3d0 sock_sendmsg+0x4c/0x50 __sys_sendto+0xee/0x160 ? handle_mm_fault+0xbe/0x1e0 ? syscall_trace_enter+0x1d3/0x2c0 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x5b/0x1a0 entry_SYSCALL_64_after_hwframe+0x65/0xca RIP: 0033:0x7f5eac8b035b Missing register, driver bug WARNING: CPU: 0 PID: 4300 at net/core/xdp.c:119 xdp_rxq_info_unreg_mem_model+0x69/0x140 Call Trace: xdp_rxq_info_unreg+0x1e/0x50 i40e_free_rx_resources+0x70/0x80 [i40e] i40e_set_ringparam+0x27c/0x800 [i40e] ethnl_set_rings+0x1b2/0x290 genl_family_rcv_msg_doit.isra.15+0x10f/0x150 genl_family_rcv_msg+0xb3/0x160 ? rings_fill_reply+0x1a0/0x1a0 genl_rcv_msg+0x47/0x90 ? genl_family_rcv_msg+0x160/0x160 netlink_rcv_skb+0x4c/0x120 genl_rcv+0x24/0x40 netlink_unicast+0x196/0x230 netlink_sendmsg+0x204/0x3d0 sock_sendmsg+0x4c/0x50 __sys_sendto+0xee/0x160 ? handle_mm_fault+0xbe/0x1e0 ? 
syscall_trace_enter+0x1d3/0x2c0 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x5b/0x1a0 entry_SYSCALL_64_after_hwframe+0x65/0xca RIP: 0033:0x7f5eac8b035b This happened because new buffers with a different RX ring count should replace the older ones, but those buffers were freed in i40e_configure_rx_ring and reallocated again with i40e_alloc_rx_bi, so the kfree on rx_bi leaked the already mapped DMA. Fix this by reallocating the ZC ring with the rx_bi_zc struct when a BPF program loads. Additionally, reallocate back to rx_bi when the BPF program unloads. If a BPF program is loaded/unloaded and XSK pools are created, reallocate RX queues accordingly in the XDP_SETUP_XSK_POOL handler. Fixes: be1222b585fd ("i40e: Separate kernel allocated rx_bi rings from AF_XDP rings") Signed-off-by: Jan Sokolowski <jan.sokolowski@intel.com> Signed-off-by: Mateusz Palczewski <mateusz.palczewski@intel.com> Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Chandan <chandanx.rout@intel.com> (A Contingent Worker at Intel) Tested-by: Gurucharan <gurucharanx.g@intel.com> (A Contingent worker at Intel) Signed-off-by: David S.
Miller <davem@davemloft.net> --- .../net/ethernet/intel/i40e/i40e_ethtool.c | 3 - drivers/net/ethernet/intel/i40e/i40e_main.c | 16 +++-- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 13 ++-- drivers/net/ethernet/intel/i40e/i40e_txrx.h | 1 - drivers/net/ethernet/intel/i40e/i40e_xsk.c | 67 ++++++++++++++++--- drivers/net/ethernet/intel/i40e/i40e_xsk.h | 2 +- 6 files changed, 74 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 7e75706f76db2..87f36d1ce8008 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -2181,9 +2181,6 @@ static int i40e_set_ringparam(struct net_device *netdev, */ rx_rings[i].tail = hw->hw_addr + I40E_PRTGEN_STATUS; err = i40e_setup_rx_descriptors(&rx_rings[i]); - if (err) - goto rx_unwind; - err = i40e_alloc_rx_bi(&rx_rings[i]); if (err) goto rx_unwind; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 2c07fa8ecfc80..b5dcd15ced364 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -3566,12 +3566,8 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) if (ring->vsi->type == I40E_VSI_MAIN) xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); - kfree(ring->rx_bi); ring->xsk_pool = i40e_xsk_pool(ring); if (ring->xsk_pool) { - ret = i40e_alloc_rx_bi_zc(ring); - if (ret) - return ret; ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool); /* For AF_XDP ZC, we disallow packets to span on @@ -3589,9 +3585,6 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) ring->queue_index); } else { - ret = i40e_alloc_rx_bi(ring); - if (ret) - return ret; ring->rx_buf_len = vsi->rx_buf_len; if (ring->vsi->type == I40E_VSI_MAIN) { ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, @@ -13296,6 +13289,14 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, struct bpf_prog *prog, 
i40e_reset_and_rebuild(pf, true, true); } + if (!i40e_enabled_xdp_vsi(vsi) && prog) { + if (i40e_realloc_rx_bi_zc(vsi, true)) + return -ENOMEM; + } else if (i40e_enabled_xdp_vsi(vsi) && !prog) { + if (i40e_realloc_rx_bi_zc(vsi, false)) + return -ENOMEM; + } + for (i = 0; i < vsi->num_queue_pairs; i++) WRITE_ONCE(vsi->rx_rings[i]->xdp_prog, vsi->xdp_prog); @@ -13528,6 +13529,7 @@ int i40e_queue_pair_disable(struct i40e_vsi *vsi, int queue_pair) i40e_queue_pair_disable_irq(vsi, queue_pair); err = i40e_queue_pair_toggle_rings(vsi, queue_pair, false /* off */); + i40e_clean_rx_ring(vsi->rx_rings[queue_pair]); i40e_queue_pair_toggle_napi(vsi, queue_pair, false /* off */); i40e_queue_pair_clean_rings(vsi, queue_pair); i40e_queue_pair_reset_stats(vsi, queue_pair); diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 69e67eb6aea72..b97c95f89fa02 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1457,14 +1457,6 @@ err: return -ENOMEM; } -int i40e_alloc_rx_bi(struct i40e_ring *rx_ring) -{ - unsigned long sz = sizeof(*rx_ring->rx_bi) * rx_ring->count; - - rx_ring->rx_bi = kzalloc(sz, GFP_KERNEL); - return rx_ring->rx_bi ? 
0 : -ENOMEM; -} - static void i40e_clear_rx_bi(struct i40e_ring *rx_ring) { memset(rx_ring->rx_bi, 0, sizeof(*rx_ring->rx_bi) * rx_ring->count); @@ -1593,6 +1585,11 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) rx_ring->xdp_prog = rx_ring->vsi->xdp_prog; + rx_ring->rx_bi = + kcalloc(rx_ring->count, sizeof(*rx_ring->rx_bi), GFP_KERNEL); + if (!rx_ring->rx_bi) + return -ENOMEM; + return 0; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index 41f86e9535a00..768290dc6f48b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -469,7 +469,6 @@ int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size); bool __i40e_chk_linearize(struct sk_buff *skb); int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); -int i40e_alloc_rx_bi(struct i40e_ring *rx_ring); /** * i40e_get_head - Retrieve head from head writeback diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 6d4009e0cbd62..cd7b52fb6b46c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -10,14 +10,6 @@ #include "i40e_txrx_common.h" #include "i40e_xsk.h" -int i40e_alloc_rx_bi_zc(struct i40e_ring *rx_ring) -{ - unsigned long sz = sizeof(*rx_ring->rx_bi_zc) * rx_ring->count; - - rx_ring->rx_bi_zc = kzalloc(sz, GFP_KERNEL); - return rx_ring->rx_bi_zc ? 0 : -ENOMEM; -} - void i40e_clear_rx_bi_zc(struct i40e_ring *rx_ring) { memset(rx_ring->rx_bi_zc, 0, @@ -29,6 +21,58 @@ static struct xdp_buff **i40e_rx_bi(struct i40e_ring *rx_ring, u32 idx) return &rx_ring->rx_bi_zc[idx]; } +/** + * i40e_realloc_rx_xdp_bi - reallocate SW ring for either XSK or normal buffer + * @rx_ring: Current rx ring + * @pool_present: is pool for XSK present + * + * Try allocating memory and return ENOMEM, if failed to allocate. 
+ * If allocation was successful, substitute buffer with allocated one. + * Returns 0 on success, negative on failure + */ +static int i40e_realloc_rx_xdp_bi(struct i40e_ring *rx_ring, bool pool_present) +{ + size_t elem_size = pool_present ? sizeof(*rx_ring->rx_bi_zc) : + sizeof(*rx_ring->rx_bi); + void *sw_ring = kcalloc(rx_ring->count, elem_size, GFP_KERNEL); + + if (!sw_ring) + return -ENOMEM; + + if (pool_present) { + kfree(rx_ring->rx_bi); + rx_ring->rx_bi = NULL; + rx_ring->rx_bi_zc = sw_ring; + } else { + kfree(rx_ring->rx_bi_zc); + rx_ring->rx_bi_zc = NULL; + rx_ring->rx_bi = sw_ring; + } + return 0; +} + +/** + * i40e_realloc_rx_bi_zc - reallocate rx SW rings + * @vsi: Current VSI + * @zc: is zero copy set + * + * Reallocate buffer for rx_rings that might be used by XSK. + * XDP requires more memory, than rx_buf provides. + * Returns 0 on success, negative on failure + */ +int i40e_realloc_rx_bi_zc(struct i40e_vsi *vsi, bool zc) +{ + struct i40e_ring *rx_ring; + unsigned long q; + + for_each_set_bit(q, vsi->af_xdp_zc_qps, vsi->alloc_queue_pairs) { + rx_ring = vsi->rx_rings[q]; + if (i40e_realloc_rx_xdp_bi(rx_ring, zc)) + return -ENOMEM; + } + return 0; +} + /** * i40e_xsk_pool_enable - Enable/associate an AF_XDP buffer pool to a * certain ring/qid @@ -69,6 +113,10 @@ static int i40e_xsk_pool_enable(struct i40e_vsi *vsi, if (err) return err; + err = i40e_realloc_rx_xdp_bi(vsi->rx_rings[qid], true); + if (err) + return err; + err = i40e_queue_pair_enable(vsi, qid); if (err) return err; @@ -113,6 +161,9 @@ static int i40e_xsk_pool_disable(struct i40e_vsi *vsi, u16 qid) xsk_pool_dma_unmap(pool, I40E_RX_DMA_ATTR); if (if_running) { + err = i40e_realloc_rx_xdp_bi(vsi->rx_rings[qid], false); + if (err) + return err; err = i40e_queue_pair_enable(vsi, qid); if (err) return err; diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h index bb962987f300a..821df248f8bee 100644 --- 
a/drivers/net/ethernet/intel/i40e/i40e_xsk.h +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h @@ -32,7 +32,7 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget); bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi, struct i40e_ring *tx_ring); int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags); -int i40e_alloc_rx_bi_zc(struct i40e_ring *rx_ring); +int i40e_realloc_rx_bi_zc(struct i40e_vsi *vsi, bool zc); void i40e_clear_rx_bi_zc(struct i40e_ring *rx_ring); #endif /* _I40E_XSK_H_ */ -- GitLab From 0d87bbd39d7fd1135ab9eca672d760470f6508e8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski <kuba@kernel.org> Date: Wed, 12 Oct 2022 15:55:20 -0700 Subject: [PATCH 1953/2223] tls: strp: make sure the TCP skbs do not have overlapping data TLS tries to get away with using the TCP input queue directly. This does not work if there is duplicated data (multiple skbs holding bytes for the same seq number range due to retransmits). Check for this condition and fall back to copy mode, it should be rare. Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Signed-off-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tls/tls_strp.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 9b79e334dbd9e..955ac3e0bf4d3 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -273,7 +273,7 @@ static int tls_strp_read_copyin(struct tls_strparser *strp) return desc.error; } -static int tls_strp_read_short(struct tls_strparser *strp) +static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort) { struct skb_shared_info *shinfo; struct page *page; @@ -283,7 +283,7 @@ static int tls_strp_read_short(struct tls_strparser *strp) * to read the data out. Otherwise the connection will stall. * Without pressure threshold of INT_MAX will never be ready. 
*/ - if (likely(!tcp_epollin_ready(strp->sk, INT_MAX))) + if (likely(qshort && !tcp_epollin_ready(strp->sk, INT_MAX))) return 0; shinfo = skb_shinfo(strp->anchor); @@ -315,6 +315,27 @@ static int tls_strp_read_short(struct tls_strparser *strp) return 0; } +static bool tls_strp_check_no_dup(struct tls_strparser *strp) +{ + unsigned int len = strp->stm.offset + strp->stm.full_len; + struct sk_buff *skb; + u32 seq; + + skb = skb_shinfo(strp->anchor)->frag_list; + seq = TCP_SKB_CB(skb)->seq; + + while (skb->len < len) { + seq += skb->len; + len -= skb->len; + skb = skb->next; + + if (TCP_SKB_CB(skb)->seq != seq) + return false; + } + + return true; +} + static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len) { struct tcp_sock *tp = tcp_sk(strp->sk); @@ -373,7 +394,7 @@ static int tls_strp_read_sock(struct tls_strparser *strp) return tls_strp_read_copyin(strp); if (inq < strp->stm.full_len) - return tls_strp_read_short(strp); + return tls_strp_read_copy(strp, true); if (!strp->stm.full_len) { tls_strp_load_anchor_with_queue(strp, inq); @@ -387,9 +408,12 @@ static int tls_strp_read_sock(struct tls_strparser *strp) strp->stm.full_len = sz; if (!strp->stm.full_len || inq < strp->stm.full_len) - return tls_strp_read_short(strp); + return tls_strp_read_copy(strp, true); } + if (!tls_strp_check_no_dup(strp)) + return tls_strp_read_copy(strp, false); + strp->msg_ready = 1; tls_rx_msg_ready(strp); -- GitLab From 3d6642eac74d9442fde232181aa52d26d47991df Mon Sep 17 00:00:00 2001 From: zhangxiangqian <zhangxiangqian@kylinos.cn> Date: Thu, 13 Oct 2022 15:41:12 +0800 Subject: [PATCH 1954/2223] net: macvlan: change schedule system_wq to system_unbound_wq For FT2000+/64 devices, when four virtual machines share the same physical network interface, DROP will occur due to the single core CPU performance problem. ip_check_defrag and macvlan_process_broadcast is on the same CPU. When the MACVLAN PORT increases, the CPU usage reaches more than 90%. 
bc_queue > bc_queue_len_used (default 1000), causing DROP. Signed-off-by: zhangxiangqian <zhangxiangqian@kylinos.cn> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/macvlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 8f8f73099de8d..c5cfe85551992 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -361,7 +361,7 @@ static void macvlan_broadcast_enqueue(struct macvlan_port *port, } spin_unlock(&port->bc_queue.lock); - schedule_work(&port->bc_work); + queue_work(system_unbound_wq, &port->bc_work); if (err) goto free_nskb; -- GitLab From 9a9a5d80ec9887814042c69768c2fee7961db7f4 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt <palmer@rivosinc.com> Date: Thu, 13 Oct 2022 14:46:36 -0700 Subject: [PATCH 1955/2223] MAINTAINERS: git://github -> https://github.com for petkan Github deprecated the git:// links about a year ago, so let's move to the https:// URLs instead. Reported-by: Conor Dooley <conor.dooley@microchip.com> Link: https://github.blog/2021-09-01-improving-git-protocol-security-github/ Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index a96c60c787af8..fb514facd1696 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21300,7 +21300,7 @@ L: linux-usb@vger.kernel.org L: netdev@vger.kernel.org S: Maintained W: https://github.com/petkan/pegasus -T: git git://github.com/petkan/pegasus.git +T: git https://github.com/petkan/pegasus.git F: drivers/net/usb/pegasus.* USB PHY LAYER @@ -21337,7 +21337,7 @@ L: linux-usb@vger.kernel.org L: netdev@vger.kernel.org S: Maintained W: https://github.com/petkan/rtl8150 -T: git git://github.com/petkan/rtl8150.git +T: git https://github.com/petkan/rtl8150.git F: drivers/net/usb/rtl8150.c USB SERIAL SUBSYSTEM -- GitLab From 0c93411795513a0e8dfcb5bcc7bab756b98bfc73 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org> Date: Thu, 13 Oct 2022 19:42:05 -0400 Subject: [PATCH 1956/2223] MAINTAINERS: nfc: s3fwrn5: Drop Krzysztof Opasiak Emails to Krzysztof Opasiak bounce ("Recipient address rejected: User unknown") so drop his email from maintainers of s3fwrn5 NFC bindings and driver. Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- Documentation/devicetree/bindings/net/nfc/samsung,s3fwrn5.yaml | 1 - MAINTAINERS | 1 - 2 files changed, 2 deletions(-) diff --git a/Documentation/devicetree/bindings/net/nfc/samsung,s3fwrn5.yaml b/Documentation/devicetree/bindings/net/nfc/samsung,s3fwrn5.yaml index 64995cbb0f978..41c9760227cd6 100644 --- a/Documentation/devicetree/bindings/net/nfc/samsung,s3fwrn5.yaml +++ b/Documentation/devicetree/bindings/net/nfc/samsung,s3fwrn5.yaml @@ -8,7 +8,6 @@ title: Samsung S3FWRN5 NCI NFC Controller maintainers: - Krzysztof Kozlowski <krzk@kernel.org> - - Krzysztof Opasiak <k.opasiak@samsung.com> properties: compatible: diff --git a/MAINTAINERS b/MAINTAINERS index fb514facd1696..abbe88e1c50b7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18217,7 +18217,6 @@ F: include/media/drv-intf/s3c_camif.h SAMSUNG S3FWRN5 NFC DRIVER M: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org> -M: Krzysztof Opasiak <k.opasiak@samsung.com> L: linux-nfc@lists.01.org (subscribers-only) S: Maintained F: Documentation/devicetree/bindings/net/nfc/samsung,s3fwrn5.yaml -- GitLab From aca7c13d3bee81a968337a5515411409ae9d095d Mon Sep 17 00:00:00 2001 From: Helge Deller <deller@gmx.de> Date: Fri, 14 Oct 2022 10:13:55 +0200 Subject: [PATCH 1957/2223] parisc: fbdev/stifb: Align graphics memory size to 4MB Independent of the current graphics resolution, adjust the reported graphics card memory size to the next 4MB boundary. This fixes the fbtest program which expects a naturally aligned size. 
Signed-off-by: Helge Deller <deller@gmx.de> Cc: <stable@vger.kernel.org> --- drivers/video/fbdev/stifb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/stifb.c b/drivers/video/fbdev/stifb.c index 38a861e22c339..7753e586e65a0 100644 --- a/drivers/video/fbdev/stifb.c +++ b/drivers/video/fbdev/stifb.c @@ -1298,7 +1298,7 @@ static int __init stifb_init_fb(struct sti_struct *sti, int bpp_pref) /* limit fbsize to max visible screen size */ if (fix->smem_len > yres*fix->line_length) - fix->smem_len = yres*fix->line_length; + fix->smem_len = ALIGN(yres*fix->line_length, 4*1024*1024); fix->accel = FB_ACCEL_NONE; -- GitLab From 70be49f2f6223ddd2fcddb0089a40864c37e1494 Mon Sep 17 00:00:00 2001 From: Helge Deller <deller@gmx.de> Date: Fri, 14 Oct 2022 10:18:53 +0200 Subject: [PATCH 1958/2223] parisc: Fix userspace graphics card breakage due to pgtable special bit Commit df24e1783e6e ("parisc: Add vDSO support") introduced the vDSO support, for which a _PAGE_SPECIAL page table flag was needed. Since we wanted to keep every page table entry in 32-bits, this patch re-used the existing - but yet unused - _PAGE_DMB flag (which triggers a hardware break if a page is accessed) to store the special bit. But when graphics card memory is mmapped into userspace, the kernel uses vm_iomap_memory() which sets the special flag. So, with the DMB bit set, every access to the graphics memory now triggered a hardware exception and segfaulted the userspace program. Fix this breakage by dropping the DMB bit when writing the page protection bits to the CPU TLB. In addition this patch adds a small optimization: if huge pages aren't configured (which is at least the case for 32-bit kernels), then the special bit is stored in the hpage (HUGE PAGE) bit instead. That way we can skip resetting the DMB bit. 
Fixes: df24e1783e6e ("parisc: Add vDSO support") Cc: <stable@vger.kernel.org> # 5.18+ Signed-off-by: Helge Deller <deller@gmx.de> --- arch/parisc/include/asm/pgtable.h | 7 ++++++- arch/parisc/kernel/entry.S | 8 ++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index df7b931865d22..ecd0288544698 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -192,6 +192,11 @@ extern void __update_cache(pte_t pte); #define _PAGE_PRESENT_BIT 22 /* (0x200) Software: translation valid */ #define _PAGE_HPAGE_BIT 21 /* (0x400) Software: Huge Page */ #define _PAGE_USER_BIT 20 /* (0x800) Software: User accessible page */ +#ifdef CONFIG_HUGETLB_PAGE +#define _PAGE_SPECIAL_BIT _PAGE_DMB_BIT /* DMB feature is currently unused */ +#else +#define _PAGE_SPECIAL_BIT _PAGE_HPAGE_BIT /* use unused HUGE PAGE bit */ +#endif /* N.B. The bits are defined in terms of a 32 bit word above, so the */ /* following macro is ok for both 32 and 64 bit. 
*/ @@ -219,7 +224,7 @@ extern void __update_cache(pte_t pte); #define _PAGE_PRESENT (1 << xlate_pabit(_PAGE_PRESENT_BIT)) #define _PAGE_HUGE (1 << xlate_pabit(_PAGE_HPAGE_BIT)) #define _PAGE_USER (1 << xlate_pabit(_PAGE_USER_BIT)) -#define _PAGE_SPECIAL (_PAGE_DMB) +#define _PAGE_SPECIAL (1 << xlate_pabit(_PAGE_SPECIAL_BIT)) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED) #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_SPECIAL) diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index df8102fb435fc..0e5ebfe8d9d29 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -499,6 +499,10 @@ * Finally, _PAGE_READ goes in the top bit of PL1 (so we * trigger an access rights trap in user space if the user * tries to read an unreadable page */ +#if _PAGE_SPECIAL_BIT == _PAGE_DMB_BIT + /* need to drop DMB bit, as it's used as SPECIAL flag */ + depi 0,_PAGE_SPECIAL_BIT,1,\pte +#endif depd \pte,8,7,\prot /* PAGE_USER indicates the page can be read with user privileges, @@ -529,6 +533,10 @@ * makes the tlb entry for the differently formatted pa11 * insertion instructions */ .macro make_insert_tlb_11 spc,pte,prot +#if _PAGE_SPECIAL_BIT == _PAGE_DMB_BIT + /* need to drop DMB bit, as it's used as SPECIAL flag */ + depi 0,_PAGE_SPECIAL_BIT,1,\pte +#endif zdep \spc,30,15,\prot dep \pte,8,7,\prot extru,= \pte,_PAGE_NO_CACHE_BIT,1,%r0 -- GitLab From bb5f0c855dcfc893ae5ed90e4c646bde9e4498bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Exp=C3=B3sito?= <jose.exposito89@gmail.com> Date: Sun, 9 Oct 2022 20:27:47 +0200 Subject: [PATCH 1959/2223] HID: magicmouse: Do not set BTN_MOUSE on double report MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Under certain conditions the Magic Trackpad can group 2 reports in a single packet. The packet is split and the raw event function is invoked recursively for each part. 
However, after processing each part, the BTN_MOUSE status is updated, sending multiple click events. [1] Return after processing double reports to avoid this issue. Link: https://gitlab.freedesktop.org/libinput/libinput/-/issues/811 # [1] Fixes: a462230e16ac ("HID: magicmouse: enable Magic Trackpad support") Reported-by: Nulo <git@nulo.in> Signed-off-by: José Expósito <jose.exposito89@gmail.com> Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com> Link: https://lore.kernel.org/r/20221009182747.90730-1-jose.exposito89@gmail.com --- drivers/hid/hid-magicmouse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-magicmouse.c b/drivers/hid/hid-magicmouse.c index 664a624a363d0..c9c968d4b36a3 100644 --- a/drivers/hid/hid-magicmouse.c +++ b/drivers/hid/hid-magicmouse.c @@ -480,7 +480,7 @@ static int magicmouse_raw_event(struct hid_device *hdev, magicmouse_raw_event(hdev, report, data + 2, data[1]); magicmouse_raw_event(hdev, report, data + 2 + data[1], size - 2 - data[1]); - break; + return 0; default: return 0; } -- GitLab From 182934a1e93b17f4edf71f4fcc8d19b19a6fe67a Mon Sep 17 00:00:00 2001 From: Roderick Colenbrander <roderick@gaikai.com> Date: Mon, 10 Oct 2022 14:23:11 -0700 Subject: [PATCH 1960/2223] HID: playstation: stop DualSense output work on remove. Ensure we don't schedule any new output work on removal and wait for any existing work to complete. If we don't do this e.g. rumble work can get queued during deletion and we trigger a kernel crash. 
Signed-off-by: Roderick Colenbrander <roderick.colenbrander@sony.com> CC: stable@vger.kernel.org Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com> Link: https://lore.kernel.org/r/20221010212313.78275-2-roderick.colenbrander@sony.com --- drivers/hid/hid-playstation.c | 41 ++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/drivers/hid/hid-playstation.c b/drivers/hid/hid-playstation.c index 40050eb85c0a5..d727cd2bf44e2 100644 --- a/drivers/hid/hid-playstation.c +++ b/drivers/hid/hid-playstation.c @@ -46,6 +46,7 @@ struct ps_device { uint32_t fw_version; int (*parse_report)(struct ps_device *dev, struct hid_report *report, u8 *data, int size); + void (*remove)(struct ps_device *dev); }; /* Calibration data for playstation motion sensors. */ @@ -174,6 +175,7 @@ struct dualsense { struct led_classdev player_leds[5]; struct work_struct output_worker; + bool output_worker_initialized; void *output_report_dmabuf; uint8_t output_seq; /* Sequence number for output report. 
*/ }; @@ -299,6 +301,7 @@ static const struct {int x; int y; } ps_gamepad_hat_mapping[] = { {0, 0}, }; +static inline void dualsense_schedule_work(struct dualsense *ds); static void dualsense_set_lightbar(struct dualsense *ds, uint8_t red, uint8_t green, uint8_t blue); /* @@ -789,6 +792,7 @@ err_free: return ret; } + static int dualsense_get_firmware_info(struct dualsense *ds) { uint8_t *buf; @@ -878,7 +882,7 @@ static int dualsense_player_led_set_brightness(struct led_classdev *led, enum le ds->update_player_leds = true; spin_unlock_irqrestore(&ds->base.lock, flags); - schedule_work(&ds->output_worker); + dualsense_schedule_work(ds); return 0; } @@ -922,6 +926,16 @@ static void dualsense_init_output_report(struct dualsense *ds, struct dualsense_ } } +static inline void dualsense_schedule_work(struct dualsense *ds) +{ + unsigned long flags; + + spin_lock_irqsave(&ds->base.lock, flags); + if (ds->output_worker_initialized) + schedule_work(&ds->output_worker); + spin_unlock_irqrestore(&ds->base.lock, flags); +} + /* * Helper function to send DualSense output reports. Applies a CRC at the end of a report * for Bluetooth reports. @@ -1082,7 +1096,7 @@ static int dualsense_parse_report(struct ps_device *ps_dev, struct hid_report *r spin_unlock_irqrestore(&ps_dev->lock, flags); /* Schedule updating of microphone state at hardware level. 
*/ - schedule_work(&ds->output_worker); + dualsense_schedule_work(ds); } ds->last_btn_mic_state = btn_mic_state; @@ -1197,10 +1211,22 @@ static int dualsense_play_effect(struct input_dev *dev, void *data, struct ff_ef ds->motor_right = effect->u.rumble.weak_magnitude / 256; spin_unlock_irqrestore(&ds->base.lock, flags); - schedule_work(&ds->output_worker); + dualsense_schedule_work(ds); return 0; } +static void dualsense_remove(struct ps_device *ps_dev) +{ + struct dualsense *ds = container_of(ps_dev, struct dualsense, base); + unsigned long flags; + + spin_lock_irqsave(&ds->base.lock, flags); + ds->output_worker_initialized = false; + spin_unlock_irqrestore(&ds->base.lock, flags); + + cancel_work_sync(&ds->output_worker); +} + static int dualsense_reset_leds(struct dualsense *ds) { struct dualsense_output_report report; @@ -1237,7 +1263,7 @@ static void dualsense_set_lightbar(struct dualsense *ds, uint8_t red, uint8_t gr ds->lightbar_blue = blue; spin_unlock_irqrestore(&ds->base.lock, flags); - schedule_work(&ds->output_worker); + dualsense_schedule_work(ds); } static void dualsense_set_player_leds(struct dualsense *ds) @@ -1260,7 +1286,7 @@ static void dualsense_set_player_leds(struct dualsense *ds) ds->update_player_leds = true; ds->player_leds_state = player_ids[player_id]; - schedule_work(&ds->output_worker); + dualsense_schedule_work(ds); } static struct ps_device *dualsense_create(struct hid_device *hdev) @@ -1299,7 +1325,9 @@ static struct ps_device *dualsense_create(struct hid_device *hdev) ps_dev->battery_capacity = 100; /* initial value until parse_report. 
*/ ps_dev->battery_status = POWER_SUPPLY_STATUS_UNKNOWN; ps_dev->parse_report = dualsense_parse_report; + ps_dev->remove = dualsense_remove; INIT_WORK(&ds->output_worker, dualsense_output_worker); + ds->output_worker_initialized = true; hid_set_drvdata(hdev, ds); max_output_report_size = sizeof(struct dualsense_output_report_bt); @@ -1461,6 +1489,9 @@ static void ps_remove(struct hid_device *hdev) ps_devices_list_remove(dev); ps_device_release_player_id(dev); + if (dev->remove) + dev->remove(dev); + hid_hw_close(hdev); hid_hw_stop(hdev); } -- GitLab From b8a968efab301743fd659b5649c5d7d3e30e63a6 Mon Sep 17 00:00:00 2001 From: Roderick Colenbrander <roderick@gaikai.com> Date: Mon, 10 Oct 2022 14:23:12 -0700 Subject: [PATCH 1961/2223] HID: playstation: add initial DualSense Edge controller support Provide initial support for the DualSense Edge controller. This brings support up to the level of the original DualSense, but won't yet provide support for new features (e.g. reprogrammable buttons). 
Signed-off-by: Roderick Colenbrander <roderick.colenbrander@sony.com> CC: stable@vger.kernel.org Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com> Link: https://lore.kernel.org/r/20221010212313.78275-3-roderick.colenbrander@sony.com --- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-playstation.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index da86565f04d4e..7cc23be4975c7 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -1142,6 +1142,7 @@ #define USB_DEVICE_ID_SONY_PS4_CONTROLLER_2 0x09cc #define USB_DEVICE_ID_SONY_PS4_CONTROLLER_DONGLE 0x0ba0 #define USB_DEVICE_ID_SONY_PS5_CONTROLLER 0x0ce6 +#define USB_DEVICE_ID_SONY_PS5_CONTROLLER_2 0x0df2 #define USB_DEVICE_ID_SONY_MOTION_CONTROLLER 0x03d5 #define USB_DEVICE_ID_SONY_NAVIGATION_CONTROLLER 0x042f #define USB_DEVICE_ID_SONY_BUZZ_CONTROLLER 0x0002 diff --git a/drivers/hid/hid-playstation.c b/drivers/hid/hid-playstation.c index d727cd2bf44e2..396356b6760a7 100644 --- a/drivers/hid/hid-playstation.c +++ b/drivers/hid/hid-playstation.c @@ -1464,7 +1464,8 @@ static int ps_probe(struct hid_device *hdev, const struct hid_device_id *id) goto err_stop; } - if (hdev->product == USB_DEVICE_ID_SONY_PS5_CONTROLLER) { + if (hdev->product == USB_DEVICE_ID_SONY_PS5_CONTROLLER || + hdev->product == USB_DEVICE_ID_SONY_PS5_CONTROLLER_2) { dev = dualsense_create(hdev); if (IS_ERR(dev)) { hid_err(hdev, "Failed to create dualsense.\n"); @@ -1499,6 +1500,8 @@ static void ps_remove(struct hid_device *hdev) static const struct hid_device_id ps_devices[] = { { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS5_CONTROLLER) }, { HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS5_CONTROLLER) }, + { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS5_CONTROLLER_2) }, + { HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS5_CONTROLLER_2) }, { } }; MODULE_DEVICE_TABLE(hid, ps_devices); -- GitLab From 
9fecab247ed15e6145c126fc56ee1e89860741a7 Mon Sep 17 00:00:00 2001 From: Roderick Colenbrander <roderick@gaikai.com> Date: Mon, 10 Oct 2022 14:23:13 -0700 Subject: [PATCH 1962/2223] HID: playstation: support updated DualSense rumble mode. Newer DualSense firmware supports a revised classic rumble mode, which feels more similar to rumble as supported on previous PlayStation controllers. It has been made the default on PlayStation and non-PlayStation devices now (e.g. iOS and Windows). Default to this new mode when supported. Signed-off-by: Roderick Colenbrander <roderick.colenbrander@sony.com> Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com> Link: https://lore.kernel.org/r/20221010212313.78275-4-roderick.colenbrander@sony.com --- drivers/hid/hid-playstation.c | 37 ++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/drivers/hid/hid-playstation.c b/drivers/hid/hid-playstation.c index 396356b6760a7..0b58763bfd301 100644 --- a/drivers/hid/hid-playstation.c +++ b/drivers/hid/hid-playstation.c @@ -108,6 +108,9 @@ struct ps_led_info { #define DS_STATUS_CHARGING GENMASK(7, 4) #define DS_STATUS_CHARGING_SHIFT 4 +/* Feature version from DualSense Firmware Info report. */ +#define DS_FEATURE_VERSION(major, minor) ((major & 0xff) << 8 | (minor & 0xff)) + /* * Status of a DualSense touch point contact. * Contact IDs, with highest bit set are 'inactive' @@ -126,6 +129,7 @@ struct ps_led_info { #define DS_OUTPUT_VALID_FLAG1_RELEASE_LEDS BIT(3) #define DS_OUTPUT_VALID_FLAG1_PLAYER_INDICATOR_CONTROL_ENABLE BIT(4) #define DS_OUTPUT_VALID_FLAG2_LIGHTBAR_SETUP_CONTROL_ENABLE BIT(1) +#define DS_OUTPUT_VALID_FLAG2_COMPATIBLE_VIBRATION2 BIT(2) #define DS_OUTPUT_POWER_SAVE_CONTROL_MIC_MUTE BIT(4) #define DS_OUTPUT_LIGHTBAR_SETUP_LIGHT_OUT BIT(1) @@ -143,6 +147,9 @@ struct dualsense { struct input_dev *sensors; struct input_dev *touchpad; + /* Update version is used as a feature/capability version. 
*/ + uint16_t update_version; + /* Calibration data for accelerometer and gyroscope. */ struct ps_calibration_data accel_calib_data[3]; struct ps_calibration_data gyro_calib_data[3]; @@ -153,6 +160,7 @@ struct dualsense { uint32_t sensor_timestamp_us; /* Compatible rumble state */ + bool use_vibration_v2; bool update_rumble; uint8_t motor_left; uint8_t motor_right; @@ -812,6 +820,15 @@ static int dualsense_get_firmware_info(struct dualsense *ds) ds->base.hw_version = get_unaligned_le32(&buf[24]); ds->base.fw_version = get_unaligned_le32(&buf[28]); + /* Update version is some kind of feature version. It is distinct from + * the firmware version as there can be many different variations of a + * controller over time with the same physical shell, but with different + * PCBs and other internal changes. The update version (internal name) is + * used as a means to detect what features are available and change behavior. + * Note: the version is different between DualSense and DualSense Edge. + */ + ds->update_version = get_unaligned_le16(&buf[44]); + err_free: kfree(buf); return ret; @@ -974,7 +991,10 @@ static void dualsense_output_worker(struct work_struct *work) if (ds->update_rumble) { /* Select classic rumble style haptics and enable it. */ common->valid_flag0 |= DS_OUTPUT_VALID_FLAG0_HAPTICS_SELECT; - common->valid_flag0 |= DS_OUTPUT_VALID_FLAG0_COMPATIBLE_VIBRATION; + if (ds->use_vibration_v2) + common->valid_flag2 |= DS_OUTPUT_VALID_FLAG2_COMPATIBLE_VIBRATION2; + else + common->valid_flag0 |= DS_OUTPUT_VALID_FLAG0_COMPATIBLE_VIBRATION; common->motor_left = ds->motor_left; common->motor_right = ds->motor_right; ds->update_rumble = false; @@ -1348,6 +1368,21 @@ static struct ps_device *dualsense_create(struct hid_device *hdev) return ERR_PTR(ret); } + /* Original DualSense firmware simulated classic controller rumble through + * its new haptics hardware. It felt different from classic rumble users + * were used to. 
Since then new firmwares were introduced to change behavior + * and make this new 'v2' behavior default on PlayStation and other platforms. + * The original DualSense requires a new enough firmware as bundled with PS5 + * software released in 2021. DualSense edge supports it out of the box. + * Both devices also support the old mode, but it is not really used. + */ + if (hdev->product == USB_DEVICE_ID_SONY_PS5_CONTROLLER) { + /* Feature version 2.21 introduced new vibration method. */ + ds->use_vibration_v2 = ds->update_version >= DS_FEATURE_VERSION(2, 21); + } else if (hdev->product == USB_DEVICE_ID_SONY_PS5_CONTROLLER_2) { + ds->use_vibration_v2 = true; + } + ret = ps_devices_list_add(ps_dev); if (ret) return ERR_PTR(ret); -- GitLab From 96cb9d0554457086664d3bd10630b11193d863f1 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" <Jason@zx2c4.com> Date: Mon, 10 Oct 2022 09:06:07 -0600 Subject: [PATCH 1963/2223] hwrng: bcm2835 - use hwrng_msleep() instead of cpu_relax() Rather than busy looping, yield back to the scheduler and sleep for a bit in the event that there's no data. 
This should hopefully prevent the stalls that Mark reported: <6>[ 3.362859] Freeing initrd memory: 16196K <3>[ 23.160131] rcu: INFO: rcu_sched self-detected stall on CPU <3>[ 23.166057] rcu: 0-....: (2099 ticks this GP) idle=03b4/1/0x40000002 softirq=28/28 fqs=1050 <4>[ 23.174895] (t=2101 jiffies g=-1147 q=2353 ncpus=4) <4>[ 23.180203] CPU: 0 PID: 49 Comm: hwrng Not tainted 6.0.0 #1 <4>[ 23.186125] Hardware name: BCM2835 <4>[ 23.189837] PC is at bcm2835_rng_read+0x30/0x6c <4>[ 23.194709] LR is at hwrng_fillfn+0x71/0xf4 <4>[ 23.199218] pc : [<c07ccdc8>] lr : [<c07cb841>] psr: 40000033 <4>[ 23.205840] sp : f093df70 ip : 00000000 fp : 00000000 <4>[ 23.211404] r10: c3c7e800 r9 : 00000000 r8 : c17e6b20 <4>[ 23.216968] r7 : c17e6b64 r6 : c18b0a74 r5 : c07ccd99 r4 : c3f171c0 <4>[ 23.223855] r3 : 000fffff r2 : 00000040 r1 : c3c7e800 r0 : c3f171c0 <4>[ 23.230743] Flags: nZcv IRQs on FIQs on Mode SVC_32 ISA Thumb Segment none <4>[ 23.238426] Control: 50c5387d Table: 0020406a DAC: 00000051 <4>[ 23.244519] CPU: 0 PID: 49 Comm: hwrng Not tainted 6.0.0 #1 Link: https://lore.kernel.org/all/Y0QJLauamRnCDUef@sirena.org.uk/ Signed-off-by: Jason A. 
Donenfeld <Jason@zx2c4.com> Acked-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- drivers/char/hw_random/bcm2835-rng.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/hw_random/bcm2835-rng.c b/drivers/char/hw_random/bcm2835-rng.c index e7dd457e9b22b..e98fcac578d66 100644 --- a/drivers/char/hw_random/bcm2835-rng.c +++ b/drivers/char/hw_random/bcm2835-rng.c @@ -71,7 +71,7 @@ static int bcm2835_rng_read(struct hwrng *rng, void *buf, size_t max, while ((rng_readl(priv, RNG_STATUS) >> 24) == 0) { if (!wait) return 0; - cpu_relax(); + hwrng_msleep(rng, 1000); } num_words = rng_readl(priv, RNG_STATUS) >> 24; -- GitLab From 4efb365a3f04d0bee7833f168b0b00a15edefeac Mon Sep 17 00:00:00 2001 From: David Sterba <dsterba@suse.com> Date: Tue, 11 Oct 2022 15:08:32 +0200 Subject: [PATCH 1964/2223] MAINTAINERS: update btrfs website links and files We have the new documentation hosted on Read The Docs and content is migrated there from the wiki. Also update http to https and add the tracepoint definition header. 
Signed-off-by: David Sterba <dsterba@suse.com> --- MAINTAINERS | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index f5ca4aefd184c..5ec615e817be9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4428,13 +4428,15 @@ M: Josef Bacik <josef@toxicpanda.com> M: David Sterba <dsterba@suse.com> L: linux-btrfs@vger.kernel.org S: Maintained -W: http://btrfs.wiki.kernel.org/ -Q: http://patchwork.kernel.org/project/linux-btrfs/list/ +W: https://btrfs.readthedocs.io +W: https://btrfs.wiki.kernel.org/ +Q: https://patchwork.kernel.org/project/linux-btrfs/list/ C: irc://irc.libera.chat/btrfs T: git git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git F: Documentation/filesystems/btrfs.rst F: fs/btrfs/ F: include/linux/btrfs* +F: include/trace/events/btrfs.h F: include/uapi/linux/btrfs* BTTV VIDEO4LINUX DRIVER -- GitLab From 875553e317b28e66824fec73ad4459372576ec68 Mon Sep 17 00:00:00 2001 From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Date: Sat, 8 Oct 2022 18:10:12 +0300 Subject: [PATCH 1965/2223] xen/virtio: Handle cases when page offset > PAGE_SIZE properly The offset in the page passed to xen_grant_dma_map_page() can be > PAGE_SIZE even if the guest uses the same page granularity as Xen (4KB). Before the current patch, if such a case happened, we ended up providing grants for the whole region in xen_grant_dma_map_page(), which was really unnecessary. Moreover, we ended up not releasing all grants which represented that region in xen_grant_dma_unmap_page(). The current patch updates the code to be able to deal with such cases. 
Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> Reviewed-by: Xenia Ragiadakou <burzalodowa@gmail.com> Link: https://lore.kernel.org/r/20221008151013.2537826-2-olekstysh@gmail.com Signed-off-by: Juergen Gross <jgross@suse.com> --- drivers/xen/grant-dma-ops.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c index 860f37c93af41..3089940436be5 100644 --- a/drivers/xen/grant-dma-ops.c +++ b/drivers/xen/grant-dma-ops.c @@ -168,7 +168,9 @@ static dma_addr_t xen_grant_dma_map_page(struct device *dev, struct page *page, unsigned long attrs) { struct xen_grant_dma_data *data; - unsigned int i, n_pages = PFN_UP(offset + size); + unsigned long dma_offset = offset_in_page(offset), + pfn_offset = PFN_DOWN(offset); + unsigned int i, n_pages = PFN_UP(dma_offset + size); grant_ref_t grant; dma_addr_t dma_handle; @@ -187,10 +189,11 @@ static dma_addr_t xen_grant_dma_map_page(struct device *dev, struct page *page, for (i = 0; i < n_pages; i++) { gnttab_grant_foreign_access_ref(grant + i, data->backend_domid, - xen_page_to_gfn(page) + i, dir == DMA_TO_DEVICE); + pfn_to_gfn(page_to_xen_pfn(page) + i + pfn_offset), + dir == DMA_TO_DEVICE); } - dma_handle = grant_to_dma(grant) + offset; + dma_handle = grant_to_dma(grant) + dma_offset; return dma_handle; } -- GitLab From a383dcb1cca8305497877119fba0a320f41fe853 Mon Sep 17 00:00:00 2001 From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Date: Sat, 8 Oct 2022 18:10:13 +0300 Subject: [PATCH 1966/2223] xen/virtio: Convert PAGE_SIZE/PAGE_SHIFT/PFN_UP to Xen counterparts Currently, a grant ref is always based on the Xen page granularity (4KB), and the guest commonly uses the same page granularity. But the guest may use a different page granularity (e.g. 64KB). So adapt the code to be able to deal with it. 
Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Acked-by: Stefano Stabellini <sstabellini@kernel.org> Reviewed-by: Xenia Ragiadakou <burzalodowa@gmail.com> Link: https://lore.kernel.org/r/20221008151013.2537826-3-olekstysh@gmail.com Signed-off-by: Juergen Gross <jgross@suse.com> --- drivers/xen/grant-dma-ops.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c index 3089940436be5..daa525df7bdc5 100644 --- a/drivers/xen/grant-dma-ops.c +++ b/drivers/xen/grant-dma-ops.c @@ -31,12 +31,12 @@ static DEFINE_XARRAY_FLAGS(xen_grant_dma_devices, XA_FLAGS_LOCK_IRQ); static inline dma_addr_t grant_to_dma(grant_ref_t grant) { - return XEN_GRANT_DMA_ADDR_OFF | ((dma_addr_t)grant << PAGE_SHIFT); + return XEN_GRANT_DMA_ADDR_OFF | ((dma_addr_t)grant << XEN_PAGE_SHIFT); } static inline grant_ref_t dma_to_grant(dma_addr_t dma) { - return (grant_ref_t)((dma & ~XEN_GRANT_DMA_ADDR_OFF) >> PAGE_SHIFT); + return (grant_ref_t)((dma & ~XEN_GRANT_DMA_ADDR_OFF) >> XEN_PAGE_SHIFT); } static struct xen_grant_dma_data *find_xen_grant_dma_data(struct device *dev) @@ -79,7 +79,7 @@ static void *xen_grant_dma_alloc(struct device *dev, size_t size, unsigned long attrs) { struct xen_grant_dma_data *data; - unsigned int i, n_pages = PFN_UP(size); + unsigned int i, n_pages = XEN_PFN_UP(size); unsigned long pfn; grant_ref_t grant; void *ret; @@ -91,14 +91,14 @@ static void *xen_grant_dma_alloc(struct device *dev, size_t size, if (unlikely(data->broken)) return NULL; - ret = alloc_pages_exact(n_pages * PAGE_SIZE, gfp); + ret = alloc_pages_exact(n_pages * XEN_PAGE_SIZE, gfp); if (!ret) return NULL; pfn = virt_to_pfn(ret); if (gnttab_alloc_grant_reference_seq(n_pages, &grant)) { - free_pages_exact(ret, n_pages * PAGE_SIZE); + free_pages_exact(ret, n_pages * XEN_PAGE_SIZE); return NULL; } @@ -116,7 +116,7 @@ static void xen_grant_dma_free(struct device *dev, size_t size, void *vaddr, 
dma_addr_t dma_handle, unsigned long attrs) { struct xen_grant_dma_data *data; - unsigned int i, n_pages = PFN_UP(size); + unsigned int i, n_pages = XEN_PFN_UP(size); grant_ref_t grant; data = find_xen_grant_dma_data(dev); @@ -138,7 +138,7 @@ static void xen_grant_dma_free(struct device *dev, size_t size, void *vaddr, gnttab_free_grant_reference_seq(grant, n_pages); - free_pages_exact(vaddr, n_pages * PAGE_SIZE); + free_pages_exact(vaddr, n_pages * XEN_PAGE_SIZE); } static struct page *xen_grant_dma_alloc_pages(struct device *dev, size_t size, @@ -168,9 +168,9 @@ static dma_addr_t xen_grant_dma_map_page(struct device *dev, struct page *page, unsigned long attrs) { struct xen_grant_dma_data *data; - unsigned long dma_offset = offset_in_page(offset), - pfn_offset = PFN_DOWN(offset); - unsigned int i, n_pages = PFN_UP(dma_offset + size); + unsigned long dma_offset = xen_offset_in_page(offset), + pfn_offset = XEN_PFN_DOWN(offset); + unsigned int i, n_pages = XEN_PFN_UP(dma_offset + size); grant_ref_t grant; dma_addr_t dma_handle; @@ -203,8 +203,8 @@ static void xen_grant_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, unsigned long attrs) { struct xen_grant_dma_data *data; - unsigned long offset = dma_handle & (PAGE_SIZE - 1); - unsigned int i, n_pages = PFN_UP(offset + size); + unsigned long dma_offset = xen_offset_in_page(dma_handle); + unsigned int i, n_pages = XEN_PFN_UP(dma_offset + size); grant_ref_t grant; if (WARN_ON(dir == DMA_NONE)) -- GitLab From f21cb52036373a108acde5853931facfea727a7b Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Mon, 10 Oct 2022 22:28:08 -0700 Subject: [PATCH 1967/2223] perf stat: Support old kernels for bperf cgroup counting The recent change in the cgroup will break the backward compatibility in the BPF program. It should support both old and new kernels using BPF CO-RE technique. Like the task_struct->__state handling in the offcpu analysis, we can check the field name in the cgroup struct. 
Signed-off-by: Namhyung Kim <namhyung@kernel.org> Acked-by: Andrii Nakryiko <andrii@kernel.org> Acked-by: Jiri Olsa <jolsa@kernel.org> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Song Liu <songliubraving@fb.com> Cc: Tejun Heo <tj@kernel.org> Cc: bpf@vger.kernel.org Cc: cgroups@vger.kernel.org Cc: zefan li <lizefan.x@bytedance.com> Link: http://lore.kernel.org/lkml/20221011052808.282394-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 29 ++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c index 435a875566881..6a438e0102c5a 100644 --- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c @@ -43,6 +43,18 @@ struct { __uint(value_size, sizeof(struct bpf_perf_event_value)); } cgrp_readings SEC(".maps"); +/* new kernel cgroup definition */ +struct cgroup___new { + int level; + struct cgroup *ancestors[]; +} __attribute__((preserve_access_index)); + +/* old kernel cgroup definition */ +struct cgroup___old { + int level; + u64 ancestor_ids[]; +} __attribute__((preserve_access_index)); + const volatile __u32 num_events = 1; const volatile __u32 num_cpus = 1; @@ -50,6 +62,21 @@ int enabled = 0; int use_cgroup_v2 = 0; int perf_subsys_id = -1; +static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level) +{ + /* recast pointer to capture new type for compiler */ + struct cgroup___new *cgrp_new = (void *)cgrp; + + if (bpf_core_field_exists(cgrp_new->ancestors)) { + return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id); + } else { + /* recast pointer to capture old type for compiler */ + struct cgroup___old *cgrp_old = (void *)cgrp; + + return BPF_CORE_READ(cgrp_old, ancestor_ids[level]); + } +} + static inline int get_cgroup_v1_idx(__u32 *cgrps, int size) { struct task_struct *p = (void *)bpf_get_current_task(); @@ 
-77,7 +104,7 @@ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size) break; // convert cgroup-id to a map index - cgrp_id = BPF_CORE_READ(cgrp, ancestors[i], kn, id); + cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i); elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id); if (!elem) continue; -- GitLab From fe180a52014fd4a768345fc7ff11a7ced45765e6 Mon Sep 17 00:00:00 2001 From: James Clark <james.clark@arm.com> Date: Wed, 5 Oct 2022 15:05:08 +0100 Subject: [PATCH 1968/2223] perf test: Fix test_arm_coresight.sh failures on Juno This test commonly fails on Arm Juno because the instruction interval is large enough to miss generating any samples for Perf in system-wide mode. Fix this by lowering the interval until a comfortable number of Perf instructions are generated. The test is still quick to run because only a small amount of trace is gathered. Before: sudo ./perf test coresight -vvv ... Recording trace with system wide mode Looking at perf.data file for dumping branch samples: Looking at perf.data file for reporting branch samples: Looking at perf.data file for instruction samples: CoreSight system wide testing: FAIL ... After: sudo ./perf test coresight -vvv ... Recording trace with system wide mode Looking at perf.data file for dumping branch samples: Looking at perf.data file for reporting branch samples: Looking at perf.data file for instruction samples: CoreSight system wide testing: PASS ... 
Reviewed-by: Leo Yan <leo.yan@linaro.org> Signed-off-by: James Clark <james.clark@arm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: coresight@lists.linaro.org Link: https://lore.kernel.org/r/20221005140508.1537277-1-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_arm_coresight.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/test_arm_coresight.sh b/tools/perf/tests/shell/test_arm_coresight.sh index e4cb4f1806ffa..daad786cf48d9 100755 --- a/tools/perf/tests/shell/test_arm_coresight.sh +++ b/tools/perf/tests/shell/test_arm_coresight.sh @@ -70,7 +70,7 @@ perf_report_instruction_samples() { # 68.12% touch libc-2.27.so [.] _dl_addr # 5.80% touch libc-2.27.so [.] getenv # 4.35% touch ld-2.27.so [.] _dl_fixup - perf report --itrace=i1000i --stdio -i ${perfdata} 2>&1 | \ + perf report --itrace=i20i --stdio -i ${perfdata} 2>&1 | \ egrep " +[0-9]+\.[0-9]+% +$1" > /dev/null 2>&1 } -- GitLab From 11df33c36c4b7a04d2674531f2c6178ad8d61572 Mon Sep 17 00:00:00 2001 From: Richard Acayan <mailingradian@gmail.com> Date: Mon, 10 Oct 2022 21:38:28 -0400 Subject: [PATCH 1969/2223] modpost: put modpost options before argument The musl implementation of getopt stops looking for options after the first non-option argument. Put the options before the non-option argument so environments using musl can still build the kernel and modules. 
Fixes: f73edc8951b2 ("kbuild: unify two modpost invocations") Link: https://git.musl-libc.org/cgit/musl/tree/src/misc/getopt.c?h=dc9285ad1dc19349c407072cc48ba70dab86de45#n44 Signed-off-by: Richard Acayan <mailingradian@gmail.com> Signed-off-by: Masahiro Yamada <masahiroy@kernel.org> --- scripts/Makefile.modpost | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 7740ce3b29e80..8489a3402eb8c 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -119,7 +119,7 @@ quiet_cmd_modpost = MODPOST $@ echo >&2 "WARNING: $(missing-input) is missing."; \ echo >&2 " Modules may not have dependencies or modversions."; \ echo >&2 " You may get many unresolved symbol warnings.";) \ - sed 's/ko$$/o/' $(or $(modorder-if-needed), /dev/null) | $(MODPOST) $(modpost-args) $(vmlinux.o-if-present) -T - + sed 's/ko$$/o/' $(or $(modorder-if-needed), /dev/null) | $(MODPOST) $(modpost-args) -T - $(vmlinux.o-if-present) targets += $(output-symdump) $(output-symdump): $(modorder-if-needed) $(vmlinux.o-if-present) $(moudle.symvers-if-present) $(MODPOST) FORCE -- GitLab From 04518e4c2edc78bc90b4651d50c4aad48d09ac23 Mon Sep 17 00:00:00 2001 From: Guru Das Srinagesh <quic_gurus@quicinc.com> Date: Tue, 11 Oct 2022 12:06:00 -0700 Subject: [PATCH 1970/2223] scripts/clang-tools: Convert clang-tidy args to list Convert list of clang-tidy arguments to a list for ease of adding to them and extending them as required. 
Signed-off-by: Guru Das Srinagesh <quic_gurus@quicinc.com> Suggested-by: Nick Desaulniers <ndesaulniers@google.com> Reviewed-by: Nick Desaulniers <ndesaulniers@google.com> Signed-off-by: Masahiro Yamada <masahiroy@kernel.org> --- scripts/clang-tools/run-clang-tools.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/clang-tools/run-clang-tools.py b/scripts/clang-tools/run-clang-tools.py index bb78c9bde55c9..56f2ec8f0f40a 100755 --- a/scripts/clang-tools/run-clang-tools.py +++ b/scripts/clang-tools/run-clang-tools.py @@ -45,13 +45,14 @@ def init(l, a): def run_analysis(entry): # Disable all checks, then re-enable the ones we want - checks = "-checks=-*," + checks = [] + checks.append("-checks=-*") if args.type == "clang-tidy": - checks += "linuxkernel-*" + checks.append("linuxkernel-*") else: - checks += "clang-analyzer-*" - checks += ",-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling" - p = subprocess.run(["clang-tidy", "-p", args.path, checks, entry["file"]], + checks.append("clang-analyzer-*") + checks.append("-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling") + p = subprocess.run(["clang-tidy", "-p", args.path, ",".join(checks), entry["file"]], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, cwd=entry["directory"]) -- GitLab From d5e57375a562f021b455e3f958cc28d54d0ff54b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo <acme@redhat.com> Date: Fri, 14 Oct 2022 10:39:21 -0300 Subject: [PATCH 1971/2223] libperf: Do not include non-UAPI linux/compiler.h header It's just for that __packed define, so use it expanded as __attribute__((packed)), like the other files in /usr/include do. This problem was preventing building the libperf examples on ALT Linux and Fedora 35, fix it. 
Reported-by: Vitaly Chikunov <vt@altlinux.org> Acked-by: Ian Rogers <irogers@google.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Dmitry Levin <ldv@altlinux.org Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: http://lore.kernel.org/lkml/Y0lnpl2Ix7VljVDc@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/lib/perf/include/perf/event.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index e282faf8fd75b..ad47d7b31046c 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -6,7 +6,6 @@ #include <linux/types.h> #include <linux/limits.h> #include <linux/bpf.h> -#include <linux/compiler.h> #include <sys/types.h> /* pid_t */ #define event_contains(obj, mem) ((obj).header.size > offsetof(typeof(obj), mem)) @@ -207,7 +206,7 @@ struct perf_record_range_cpu_map { __u16 end_cpu; }; -struct __packed perf_record_cpu_map_data { +struct perf_record_cpu_map_data { __u16 type; union { /* Used when type == PERF_CPU_MAP__CPUS. */ @@ -219,7 +218,7 @@ struct __packed perf_record_cpu_map_data { /* Used when type == PERF_CPU_MAP__RANGE_CPUS. */ struct perf_record_range_cpu_map range_cpu_data; }; -}; +} __attribute__((packed)); #pragma GCC diagnostic pop -- GitLab From 531778b129937ea3a6923bc67a45d024712a12f7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Thu, 6 Oct 2022 15:22:32 -0700 Subject: [PATCH 1972/2223] perf annotate: Add missing condition flags for arm64 According to the document [1], it can also have 'hs', 'lo', 'vc', 'vs' as a condition code. Let's add them too. 
[1] https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/condition-codes-1-condition-flags-and-codes Reported-by: Kevin Nomura <nomurak@google.com> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.garry@huawei.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20221006222232.266416-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/arch/arm64/annotate/instructions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/arch/arm64/annotate/instructions.c index 037e292ecd8eb..4af0c3a0f86ee 100644 --- a/tools/perf/arch/arm64/annotate/instructions.c +++ b/tools/perf/arch/arm64/annotate/instructions.c @@ -102,7 +102,7 @@ static int arm64__annotate_init(struct arch *arch, char *cpuid __maybe_unused) if (err) goto out_free_arm; /* b, b.cond, br, cbz/cbnz, tbz/tbnz */ - err = regcomp(&arm->jump_insn, "^[ct]?br?\\.?(cc|cs|eq|ge|gt|hi|le|ls|lt|mi|ne|pl)?n?z?$", + err = regcomp(&arm->jump_insn, "^[ct]?br?\\.?(cc|cs|eq|ge|gt|hi|hs|le|lo|ls|lt|mi|ne|pl|vc|vs)?n?z?$", REG_EXTENDED); if (err) goto out_free_call; -- GitLab From 7d60fa2cde0d3d80c55492f86a2a944da7510a67 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung@kernel.org> Date: Tue, 4 Oct 2022 13:02:11 -0700 Subject: [PATCH 1973/2223] perf mem: Fix -C option behavior for perf mem record The -C/--cpu option was mainly for report but it also affected record as it ate the option. So users needed to use "--" after perf mem record to pass the info to the perf record properly. 
Check if this option is set for record, and pass it to the actual perf record. Before) $ sudo perf --debug perf-event-open mem record -C 0 2>&1 | grep -a sys_perf_event_open ... sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 = 4 sys_perf_event_open: pid -1 cpu 1 group_fd -1 flags 0x8 = 5 sys_perf_event_open: pid -1 cpu 2 group_fd -1 flags 0x8 = 6 sys_perf_event_open: pid -1 cpu 3 group_fd -1 flags 0x8 = 7 sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 = 8 sys_perf_event_open: pid -1 cpu 1 group_fd -1 flags 0x8 = 9 sys_perf_event_open: pid -1 cpu 2 group_fd -1 flags 0x8 = 10 sys_perf_event_open: pid -1 cpu 3 group_fd -1 flags 0x8 = 11 ... After) $ sudo perf --debug perf-event-open mem record -C 0 2>&1 | grep -a sys_perf_event_open ... sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 = 4 sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 = 5 sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 = 6 sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 = 7 Reported-by: Ravi Bangoria <ravi.bangoria@amd.com> Reviewed-by: Leo Yan <leo.yan@linaro.org> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Tested-by: Leo Yan <leo.yan@linaro.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20221004200211.1444521-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-mem.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index f7dd8216de72e..923fb8316fdae 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -97,6 +97,9 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) else rec_argc = argc + 9 * perf_pmu__hybrid_pmu_num(); + if (mem->cpu_list) + 
rec_argc += 2; + rec_argv = calloc(rec_argc + 1, sizeof(char *)); if (!rec_argv) return -1; @@ -159,6 +162,11 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) if (all_kernel) rec_argv[i++] = "--all-kernel"; + if (mem->cpu_list) { + rec_argv[i++] = "-C"; + rec_argv[i++] = mem->cpu_list; + } + for (j = 0; j < argc; j++, i++) rec_argv[i] = argv[j]; -- GitLab From 0cef141e8630c0b08bd1c4309be2ba74480c69a3 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@linux.intel.com> Date: Tue, 4 Oct 2022 12:26:34 -0700 Subject: [PATCH 1974/2223] perf list: Fix metricgroups title message $ perf list metricgroups gives List of pre-defined events (to be used in -e): Metric Groups: Backend Bad BadSpec But that's incorrect of course because metric groups or metrics can only be specified with -M. So fix the message to say -e or -M Signed-off-by: Andi Kleen <ak@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20221004192634.998984-1-ak@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/builtin-list.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index 744dd35205847..58e1ec1654ef4 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -60,7 +60,7 @@ int cmd_list(int argc, const char **argv) setup_pager(); if (!raw_dump && pager_in_use()) - printf("\nList of pre-defined events (to be used in -e):\n\n"); + printf("\nList of pre-defined events (to be used in -e or -M):\n\n"); if (hybrid_type) { pmu_name = perf_pmu__hybrid_type_to_pmu(hybrid_type); -- GitLab From e552b7be12ed62357df84392efa525ecb01910fb Mon Sep 17 00:00:00 2001 From: Rob Herring <robh@kernel.org> Date: Tue, 4 Oct 2022 14:12:35 -0500 Subject: [PATCH 1975/2223] perf: Skip and warn on unknown format 'configN' attrs If the kernel exposes a new perf_event_attr field in a format attr, perf will return an error stating the specified PMU can't be 
found. For example, a format attr with 'config3:0-63' causes an error as config3 is unknown to perf. This causes a compatibility issue between a newer kernel with older perf tool. Before this change with a kernel adding 'config3' I get: $ perf record -e arm_spe// -- true event syntax error: 'arm_spe//' \___ Cannot find PMU `arm_spe'. Missing kernel support? Run 'perf list' for a list of valid events Usage: perf record [<options>] [<command>] or: perf record [<options>] -- <command> [<options>] -e, --event <event> event selector. use 'perf list' to list available events After this change, I get: $ perf record -e arm_spe// -- true WARNING: 'arm_spe_0' format 'inv_event_filter' requires 'perf_event_attr::config3' which is not supported by this version of perf! [ perf record: Woken up 2 times to write data ] [ perf record: Captured and wrote 0.091 MB perf.data ] To support unknown configN formats, rework the YACC implementation to pass any config[0-9]+ format to perf_pmu__new_format() to handle with a warning. 
Reviewed-by: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Rob Herring <robh@kernel.org> Tested-by: Leo Yan <leo.yan@linaro.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220914-arm-perf-tool-spe1-2-v2-v4-1-83c098e6212e@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/parse-events.c | 3 +++ tools/perf/util/pmu.c | 17 +++++++++++++++++ tools/perf/util/pmu.h | 2 ++ tools/perf/util/pmu.l | 2 -- tools/perf/util/pmu.y | 15 ++++----------- 5 files changed, 26 insertions(+), 13 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 437389dacf483..5973f46c23755 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -246,6 +246,9 @@ __add_event(struct list_head *list, int *idx, struct perf_cpu_map *cpus = pmu ? perf_cpu_map__get(pmu->cpus) : cpu_list ? 
perf_cpu_map__new(cpu_list) : NULL; + if (pmu) + perf_pmu__warn_invalid_formats(pmu); + if (pmu && attr->type == PERF_TYPE_RAW) perf_pmu__warn_invalid_config(pmu, attr->config, name); diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 74a2cafb4e8de..03284059175f7 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1005,6 +1005,23 @@ err: return NULL; } +void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu) +{ + struct perf_pmu_format *format; + + /* fake pmu doesn't have format list */ + if (pmu == &perf_pmu__fake) + return; + + list_for_each_entry(format, &pmu->format, list) + if (format->value >= PERF_PMU_FORMAT_VALUE_CONFIG_END) { + pr_warning("WARNING: '%s' format '%s' requires 'perf_event_attr::config%d'" + "which is not supported by this version of perf!\n", + pmu->name, format->name, format->value); + return; + } +} + static struct perf_pmu *pmu_find(const char *name) { struct perf_pmu *pmu; diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index a7b0f9507510b..68e15c38ae710 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -17,6 +17,7 @@ enum { PERF_PMU_FORMAT_VALUE_CONFIG, PERF_PMU_FORMAT_VALUE_CONFIG1, PERF_PMU_FORMAT_VALUE_CONFIG2, + PERF_PMU_FORMAT_VALUE_CONFIG_END, }; #define PERF_PMU_FORMAT_BITS 64 @@ -139,6 +140,7 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu); void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, const char *name); +void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu); bool perf_pmu__has_hybrid(void); int perf_pmu__match(char *pattern, char *name, char *tok); diff --git a/tools/perf/util/pmu.l b/tools/perf/util/pmu.l index a15d9fbd7c0ed..58b4926cfaca9 100644 --- a/tools/perf/util/pmu.l +++ b/tools/perf/util/pmu.l @@ -27,8 +27,6 @@ num_dec [0-9]+ {num_dec} { return value(10); } config { return PP_CONFIG; } -config1 { return PP_CONFIG1; } -config2 { return PP_CONFIG2; } - { return '-'; } : { return ':'; } , { return ','; } diff --git 
a/tools/perf/util/pmu.y b/tools/perf/util/pmu.y index 0dab0ec2eff7c..e675d79a0274f 100644 --- a/tools/perf/util/pmu.y +++ b/tools/perf/util/pmu.y @@ -18,7 +18,7 @@ do { \ %} -%token PP_CONFIG PP_CONFIG1 PP_CONFIG2 +%token PP_CONFIG %token PP_VALUE PP_ERROR %type <num> PP_VALUE %type <bits> bit_term @@ -45,18 +45,11 @@ PP_CONFIG ':' bits $3)); } | -PP_CONFIG1 ':' bits +PP_CONFIG PP_VALUE ':' bits { ABORT_ON(perf_pmu__new_format(format, name, - PERF_PMU_FORMAT_VALUE_CONFIG1, - $3)); -} -| -PP_CONFIG2 ':' bits -{ - ABORT_ON(perf_pmu__new_format(format, name, - PERF_PMU_FORMAT_VALUE_CONFIG2, - $3)); + $2, + $4)); } bits: -- GitLab From a9e17d3d74d14e5fd10d54f0a07e0fce4e5f80dd Mon Sep 17 00:00:00 2001 From: Paulo Alcantara <pc@cjr.nz> Date: Fri, 14 Oct 2022 13:40:42 -0300 Subject: [PATCH 1976/2223] cifs: fix static checker warning Remove unnecessary NULL check of oparam->cifs_sb when parsing symlink error response as it's already set by all smb2_open_file() callers and deferenced earlier. This fixes below report: fs/cifs/smb2file.c:126 smb2_open_file() warn: variable dereferenced before check 'oparms->cifs_sb' (see line 112) Link: https://lore.kernel.org/r/Y0kt42j2tdpYakRu@kili Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/smb2file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 4992b43616a7a..ffbd9a99fc128 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -123,7 +123,7 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 if (unlikely(!err_iov.iov_base || err_buftype == CIFS_NO_BUFFER)) rc = -ENOMEM; - else if (hdr->Status == STATUS_STOPPED_ON_SYMLINK && oparms->cifs_sb) { + else if (hdr->Status == STATUS_STOPPED_ON_SYMLINK) { rc = smb2_parse_symlink_response(oparms->cifs_sb, &err_iov, &data->symlink_target); if (!rc) { -- GitLab From 
34314cd615af5036e582fad14f2bb13e4383bfe1 Mon Sep 17 00:00:00 2001 From: Colin Ian King <colin.i.king@gmail.com> Date: Thu, 13 Oct 2022 23:19:15 +0100 Subject: [PATCH 1977/2223] parisc: Fix spelling mistake "mis-match" -> "mismatch" in eisa driver There are several spelling mistakes in kernel error messages. Fix them. Signed-off-by: Colin Ian King <colin.i.king@gmail.com> Signed-off-by: Helge Deller <deller@gmx.de> --- drivers/parisc/eisa_enumerator.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/parisc/eisa_enumerator.c b/drivers/parisc/eisa_enumerator.c index f54a6f4503915..f0cb31198a8f0 100644 --- a/drivers/parisc/eisa_enumerator.c +++ b/drivers/parisc/eisa_enumerator.c @@ -393,7 +393,7 @@ static int parse_slot_config(int slot, } if (p0 + function_len < pos) { - printk(KERN_ERR "eisa_enumerator: function %d length mis-match " + printk(KERN_ERR "eisa_enumerator: function %d length mismatch " "got %d, expected %d\n", num_func, pos-p0, function_len); res=-1; @@ -407,13 +407,13 @@ static int parse_slot_config(int slot, } if (pos != es->config_data_length) { - printk(KERN_ERR "eisa_enumerator: config data length mis-match got %d, expected %d\n", + printk(KERN_ERR "eisa_enumerator: config data length mismatch got %d, expected %d\n", pos, es->config_data_length); res=-1; } if (num_func != es->num_functions) { - printk(KERN_ERR "eisa_enumerator: number of functions mis-match got %d, expected %d\n", + printk(KERN_ERR "eisa_enumerator: number of functions mismatch got %d, expected %d\n", num_func, es->num_functions); res=-2; } @@ -451,7 +451,7 @@ static int init_slot(int slot, struct eeprom_eisa_slot_info *es) } if (es->eisa_slot_id != id) { print_eisa_id(id_string, id); - printk(KERN_ERR "EISA slot %d id mis-match: got %s", + printk(KERN_ERR "EISA slot %d id mismatch: got %s", slot, id_string); print_eisa_id(id_string, es->eisa_slot_id); -- GitLab From 2130b87b2273389cafe6765bf09ef564cda01407 Mon Sep 17 00:00:00 2001 From: Nathan 
Chancellor <nathan@kernel.org> Date: Fri, 14 Oct 2022 08:21:03 -0700 Subject: [PATCH 1978/2223] drm/amd/display: Fix build breakage with CONFIG_DEBUG_FS=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 8799c0be89eb ("drm/amd/display: Fix vblank refcount in vrr transition"), a build with CONFIG_DEBUG_FS=n is broken due to a misplaced brace, along the lines of: In file included from drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm_trace.h:39, from drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c:41: drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c: At top level: ./include/drm/drm_atomic.h:864:9: error: expected identifier or ‘(’ before ‘for’ 864 | for ((__i) = 0; \ | ^~~ drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c:8317:9: note: in expansion of macro ‘for_each_new_crtc_in_state’ 8317 | for_each_new_crtc_in_state(state, crtc, new_crtc_state, j) | ^~~~~~~~~~~~~~~~~~~~~~~~~~ Move the brace within the #ifdef so that the file can be built with or without CONFIG_DEBUG_FS. 
Fixes: 8799c0be89eb ("drm/amd/display: Fix vblank refcount in vrr transition") Signed-off-by: Nathan Chancellor <nathan@kernel.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index f6a9e8fdd87d6..c053cb79cd063 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -8310,8 +8310,8 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) crtc, dm_new_crtc_state, cur_crc_src)) DRM_DEBUG_DRIVER("Failed to configure crc source"); } -#endif } +#endif } for_each_new_crtc_in_state(state, crtc, new_crtc_state, j) -- GitLab From 5632e2beaf9d5dda694c0572684dea783d8a9492 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas <bhelgaas@google.com> Date: Fri, 14 Oct 2022 13:45:45 -0500 Subject: [PATCH 1979/2223] Revert "PCI: Distribute available resources for root buses, too" This reverts commit e96e27fc6f7971380283768e9a734af16b1716ee. Jonathan reported that this commit broke this topology, where all the space available on bus 02 was assigned to the 02:00.0 bridge window, leaving none for the e1000 device at 02:00.1: pci 0000:00:04.0: bridge window [mem 0x10200000-0x103fffff] to [bus 02-04] pci 0000:02:00.0: bridge window [mem 0x10200000-0x103fffff] to [bus 03-04] pci 0000:02:00.1: BAR 0: failed to assign [mem size 0x00020000] e1000 0000:02:00.1: can't ioremap BAR 0: [??? 
0x00000000 flags 0x0] Link: https://lore.kernel.org/r/20221014124553.0000696f@huawei.com Reported-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> --- drivers/pci/setup-bus.c | 62 +---------------------------------------- 1 file changed, 1 insertion(+), 61 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index dc6a30ee6edfb..b4096598dbcbb 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1768,10 +1768,7 @@ static void adjust_bridge_window(struct pci_dev *bridge, struct resource *res, } res->end = res->start + new_size - 1; - - /* If the resource is part of the add_list remove it now */ - if (add_list) - remove_from_list(add_list, res); + remove_from_list(add_list, res); } static void pci_bus_distribute_available_resources(struct pci_bus *bus, @@ -1926,8 +1923,6 @@ static void pci_bridge_distribute_available_resources(struct pci_dev *bridge, if (!bridge->is_hotplug_bridge) return; - pci_dbg(bridge, "distributing available resources\n"); - /* Take the initial extra resources from the hotplug port */ available_io = bridge->resource[PCI_BRIDGE_IO_WINDOW]; available_mmio = bridge->resource[PCI_BRIDGE_MEM_WINDOW]; @@ -1939,59 +1934,6 @@ static void pci_bridge_distribute_available_resources(struct pci_dev *bridge, available_mmio_pref); } -static bool pci_bridge_resources_not_assigned(struct pci_dev *dev) -{ - const struct resource *r; - - /* - * Check the child device's resources and if they are not yet - * assigned it means we are configuring them (not the boot - * firmware) so we should be able to extend the upstream - * bridge's (that's the hotplug downstream PCIe port) resources - * in the same way we do with the normal hotplug case. 
- */ - r = &dev->resource[PCI_BRIDGE_IO_WINDOW]; - if (!r->flags || !(r->flags & IORESOURCE_STARTALIGN)) - return false; - r = &dev->resource[PCI_BRIDGE_MEM_WINDOW]; - if (!r->flags || !(r->flags & IORESOURCE_STARTALIGN)) - return false; - r = &dev->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; - if (!r->flags || !(r->flags & IORESOURCE_STARTALIGN)) - return false; - - return true; -} - -static void pci_root_bus_distribute_available_resources(struct pci_bus *bus, - struct list_head *add_list) -{ - struct pci_dev *dev, *bridge = bus->self; - - for_each_pci_bridge(dev, bus) { - struct pci_bus *b; - - b = dev->subordinate; - if (!b) - continue; - - /* - * Need to check "bridge" here too because it is NULL - * in case of root bus. - */ - if (bridge && pci_bridge_resources_not_assigned(dev)) { - pci_bridge_distribute_available_resources(bridge, add_list); - /* - * There is only PCIe upstream port on the bus - * so we don't need to go futher. - */ - return; - } - - pci_root_bus_distribute_available_resources(b, add_list); - } -} - /* * First try will not touch PCI bridge res. * Second and later try will clear small leaf bridge res. @@ -2031,8 +1973,6 @@ again: */ __pci_bus_size_bridges(bus, add_list); - pci_root_bus_distribute_available_resources(bus, add_list); - /* Depth last, allocate resources and update the hardware. */ __pci_bus_assign_resources(bus, add_list, &fail_head); if (add_list) -- GitLab From c67a85bee78db74c6889a5ca645c3763ad23d863 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers <ndesaulniers@google.com> Date: Fri, 14 Oct 2022 09:53:02 -0700 Subject: [PATCH 1980/2223] kbuild: add -fno-discard-value-names to cmd_cc_ll_c When debugging LLVM IR, it can be handy for clang to not discard value names used for local variables and parameters. Compare the generated IR. 
-fdiscard-value-names: define i32 @core_sys_select(i32 %0, ptr %1, ptr %2, ptr %3, ptr %4) { %6 = alloca i64 %7 = alloca %struct.poll_wqueues %8 = alloca [64 x i32] -fno-discard-value-names: define i32 @core_sys_select(i32 %n, ptr %inp, ptr %outp, ptr %exp, ptr %end_time) { %expire.i = alloca i64 %table.i = alloca %struct.poll_wqueues %stack_fds = alloca [64 x i32] The rule for generating human readable LLVM IR (.ll) is only useful as a debugging feature: $ make LLVM=1 fs/select.ll As Fangrui notes: A LLVM_ENABLE_ASSERTIONS=off build of Clang defaults to -fdiscard-value-names. A LLVM_ENABLE_ASSERTIONS=on build of Clang defaults to -fno-discard-value-names. Explicitly enable -fno-discard-value-names so that the IR always contains value names regardless of whether assertions were enabled or not. Assertions generally are not enabled in releases of clang packaged by distributions. Link: https://github.com/ClangBuiltLinux/linux/issues/1467 Reviewed-by: Nathan Chancellor <nathan@kernel.org> Reviewed-by: Fangrui Song <maskray@google.com> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com> Signed-off-by: Masahiro Yamada <masahiroy@kernel.org> --- scripts/Makefile.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 22adbf89cb310..41f3602fc8de7 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -140,7 +140,7 @@ $(obj)/%.symtypes : $(src)/%.c FORCE # LLVM assembly # Generate .ll files from .c quiet_cmd_cc_ll_c = CC $(quiet_modtag) $@ - cmd_cc_ll_c = $(CC) $(c_flags) -emit-llvm -S -o $@ $< + cmd_cc_ll_c = $(CC) $(c_flags) -emit-llvm -S -fno-discard-value-names -o $@ $< $(obj)/%.ll: $(src)/%.c FORCE $(call if_changed_dep,cc_ll_c) -- GitLab From b05ea3314390e9cb3c27cf2928d48e38fef97050 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com> Date: Tue, 11 Oct 2022 15:55:48 +0200 Subject: [PATCH 1981/2223] clk: mediatek: clk-mux: Add 
.determine_rate() callback Since commit 262ca38f4b6e ("clk: Stop forwarding clk_rate_requests to the parent"), the clk_rate_request is, as the title says, not forwarded anymore to the parent: this produces an issue with the MediaTek clock MUX driver during GPU DVFS on MT8195, but not on MT8192 or others. This is because, unlike others such as MT8192, where all of the clocks in the MFG parent tree are of mtk_mux type, the parent tree of MT8195's MFG clock has one mtk_mux clock and one (clk framework generic) mux clock, like so: names: mfg_bg3d -> mfg_ck_fast_ref -> top_mfg_core_tmp (or) mfgpll types: mtk_gate -> mux -> mtk_mux (or) mtk_pll To solve this issue and also keep the GPU DVFS clocks code working as expected, wire up a .determine_rate() callback for the mtk_mux ops; for that, the standard clk_mux_determine_rate_flags() was used, since that was possible. This commit was successfully tested on MT6795 Xperia M5, MT8173 Elm, MT8192 Spherion and MT8195 Tomato; no regressions were seen.
For the sake of some more documentation about this issue here's the trace of it: [ 12.211587] ------------[ cut here ]------------ [ 12.211589] WARNING: CPU: 6 PID: 78 at drivers/clk/clk.c:1462 clk_core_init_rate_req+0x84/0x90 [ 12.211593] Modules linked in: stp crct10dif_ce mtk_adsp_common llc rfkill snd_sof_xtensa_dsp panfrost(+) sbs_battery cros_ec_lid_angle cros_ec_sensors snd_sof_of cros_ec_sensors_core hid_multitouch cros_usbpd_logger snd_sof gpu_sched snd_sof_utils fuse ipv6 [ 12.211614] CPU: 6 PID: 78 Comm: kworker/u16:2 Tainted: G W 6.0.0-next-20221011+ #58 [ 12.211616] Hardware name: Acer Tomato (rev2) board (DT) [ 12.211617] Workqueue: devfreq_wq devfreq_monitor [ 12.211620] pstate: 40400009 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 12.211622] pc : clk_core_init_rate_req+0x84/0x90 [ 12.211625] lr : clk_core_forward_rate_req+0xa4/0xe4 [ 12.211627] sp : ffff80000893b8e0 [ 12.211628] x29: ffff80000893b8e0 x28: ffffdddf92f9b000 x27: ffff46a2c0e8bc05 [ 12.211632] x26: ffff46a2c1041200 x25: 0000000000000000 x24: 00000000173eed80 [ 12.211636] x23: ffff80000893b9c0 x22: ffff80000893b940 x21: 0000000000000000 [ 12.211641] x20: ffff46a2c1039f00 x19: ffff46a2c1039f00 x18: 0000000000000000 [ 12.211645] x17: 0000000000000038 x16: 000000000000d904 x15: 0000000000000003 [ 12.211649] x14: ffffdddf9357ce48 x13: ffffdddf935e71c8 x12: 000000000004803c [ 12.211653] x11: 00000000a867d7ad x10: 00000000a867d7ad x9 : ffffdddf90c28df4 [ 12.211657] x8 : ffffdddf9357a980 x7 : 0000000000000000 x6 : 0000000000000004 [ 12.211661] x5 : ffffffffffffffc8 x4 : 00000000173eed80 x3 : ffff80000893b940 [ 12.211665] x2 : 00000000173eed80 x1 : ffff80000893b940 x0 : 0000000000000000 [ 12.211669] Call trace: [ 12.211670] clk_core_init_rate_req+0x84/0x90 [ 12.211673] clk_core_round_rate_nolock+0xe8/0x10c [ 12.211675] clk_mux_determine_rate_flags+0x174/0x1f0 [ 12.211677] clk_mux_determine_rate+0x1c/0x30 [ 12.211680] clk_core_determine_round_nolock+0x74/0x130 [ 12.211682] 
clk_core_round_rate_nolock+0x58/0x10c [ 12.211684] clk_core_round_rate_nolock+0xf4/0x10c [ 12.211686] clk_core_set_rate_nolock+0x194/0x2ac [ 12.211688] clk_set_rate+0x40/0x94 [ 12.211691] _opp_config_clk_single+0x38/0xa0 [ 12.211693] _set_opp+0x1b0/0x500 [ 12.211695] dev_pm_opp_set_rate+0x120/0x290 [ 12.211697] panfrost_devfreq_target+0x3c/0x50 [panfrost] [ 12.211705] devfreq_set_target+0x8c/0x2d0 [ 12.211707] devfreq_update_target+0xcc/0xf4 [ 12.211708] devfreq_monitor+0x40/0x1d0 [ 12.211710] process_one_work+0x294/0x664 [ 12.211712] worker_thread+0x7c/0x45c [ 12.211713] kthread+0x104/0x110 [ 12.211716] ret_from_fork+0x10/0x20 [ 12.211718] irq event stamp: 7102 [ 12.211719] hardirqs last enabled at (7101): [<ffffdddf904ea5a0>] finish_task_switch.isra.0+0xec/0x2f0 [ 12.211723] hardirqs last disabled at (7102): [<ffffdddf91794b74>] el1_dbg+0x24/0x90 [ 12.211726] softirqs last enabled at (6716): [<ffffdddf90410be4>] __do_softirq+0x414/0x588 [ 12.211728] softirqs last disabled at (6507): [<ffffdddf904171d8>] ____do_softirq+0x18/0x24 [ 12.211730] ---[ end trace 0000000000000000 ]--- Fixes: 262ca38f4b6e ("clk: Stop forwarding clk_rate_requests to the parent") Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com> Link: https://lore.kernel.org/r/20221011135548.318323-1-angelogioacchino.delregno@collabora.com Signed-off-by: Stephen Boyd <sboyd@kernel.org> --- drivers/clk/mediatek/clk-mux.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/clk/mediatek/clk-mux.c b/drivers/clk/mediatek/clk-mux.c index cd5f9fd8cb98e..5d217f7377ee1 100644 --- a/drivers/clk/mediatek/clk-mux.c +++ b/drivers/clk/mediatek/clk-mux.c @@ -128,9 +128,18 @@ static int mtk_clk_mux_set_parent_setclr_lock(struct clk_hw *hw, u8 index) return 0; } +static int mtk_clk_mux_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) +{ + struct mtk_clk_mux *mux = to_mtk_clk_mux(hw); + + return clk_mux_determine_rate_flags(hw, req, mux->data->flags); +} + 
const struct clk_ops mtk_mux_clr_set_upd_ops = { .get_parent = mtk_clk_mux_get_parent, .set_parent = mtk_clk_mux_set_parent_setclr_lock, + .determine_rate = mtk_clk_mux_determine_rate, }; EXPORT_SYMBOL_GPL(mtk_mux_clr_set_upd_ops); @@ -140,6 +149,7 @@ const struct clk_ops mtk_mux_gate_clr_set_upd_ops = { .is_enabled = mtk_clk_mux_is_enabled, .get_parent = mtk_clk_mux_get_parent, .set_parent = mtk_clk_mux_set_parent_setclr_lock, + .determine_rate = mtk_clk_mux_determine_rate, }; EXPORT_SYMBOL_GPL(mtk_mux_gate_clr_set_upd_ops); -- GitLab From 8c7bc6ca3740959edc6abe5d8214e5c84aa8a853 Mon Sep 17 00:00:00 2001 From: Linus Walleij <linus.walleij@linaro.org> Date: Thu, 13 Oct 2022 16:07:45 +0200 Subject: [PATCH 1982/2223] clk: qcom: gcc-msm8660: Drop hardcoded fixed board clocks These two clocks are now registered in the device tree as fixed clocks, causing a regression in the driver as the clock already exists with e.g. the name "pxo_board" as the MSM8660 GCC driver probes. Fix this by just not hard-coding this anymore and everything works like a charm. 
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org> Fixes: baecbda52933 ("ARM: dts: qcom: msm8660: fix node names for fixed clocks") Signed-off-by: Linus Walleij <linus.walleij@linaro.org> Link: https://lore.kernel.org/r/20221013140745.7801-1-linus.walleij@linaro.org Signed-off-by: Stephen Boyd <sboyd@kernel.org> --- drivers/clk/qcom/gcc-msm8660.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/clk/qcom/gcc-msm8660.c b/drivers/clk/qcom/gcc-msm8660.c index 657e1154bb9b7..a9eb6a9ac4454 100644 --- a/drivers/clk/qcom/gcc-msm8660.c +++ b/drivers/clk/qcom/gcc-msm8660.c @@ -2767,17 +2767,6 @@ MODULE_DEVICE_TABLE(of, gcc_msm8660_match_table); static int gcc_msm8660_probe(struct platform_device *pdev) { - int ret; - struct device *dev = &pdev->dev; - - ret = qcom_cc_register_board_clk(dev, "cxo_board", "cxo", 19200000); - if (ret) - return ret; - - ret = qcom_cc_register_board_clk(dev, "pxo_board", "pxo", 27000000); - if (ret) - return ret; - return qcom_cc_probe(pdev, &gcc_msm8660_desc); } -- GitLab From 57d849636a04a12713dd3a10a97cb9658ec7edf6 Mon Sep 17 00:00:00 2001 From: Kefeng Wang <wangkefeng.wang@huawei.com> Date: Wed, 12 Oct 2022 11:06:35 +0800 Subject: [PATCH 1983/2223] clk: at91: fix the build with binutils 2.27 There is an issue when building with older versions of binutils such as 2.27.0, arch/arm/mach-at91/pm_suspend.S: Assembler messages: arch/arm/mach-at91/pm_suspend.S:1086: Error: garbage following instruction -- `ldr tmp1,=0x00020010UL' Use UL() macro to fix the issue in assembly file.
Fixes: 4fd36e458392 ("ARM: at91: pm: add plla disable/enable support for sam9x60") Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com> Link: https://lore.kernel.org/r/20221012030635.13140-1-wangkefeng.wang@huawei.com Signed-off-by: Stephen Boyd <sboyd@kernel.org> --- include/linux/clk/at91_pmc.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/clk/at91_pmc.h b/include/linux/clk/at91_pmc.h index 3484309b59bf1..7af499bdbecb9 100644 --- a/include/linux/clk/at91_pmc.h +++ b/include/linux/clk/at91_pmc.h @@ -12,6 +12,8 @@ #ifndef AT91_PMC_H #define AT91_PMC_H +#include <linux/bits.h> + #define AT91_PMC_V1 (1) /* PMC version 1 */ #define AT91_PMC_V2 (2) /* PMC version 2 [SAM9X60] */ @@ -45,8 +47,8 @@ #define AT91_PMC_PCSR 0x18 /* Peripheral Clock Status Register */ #define AT91_PMC_PLL_ACR 0x18 /* PLL Analog Control Register [for SAM9X60] */ -#define AT91_PMC_PLL_ACR_DEFAULT_UPLL 0x12020010UL /* Default PLL ACR value for UPLL */ -#define AT91_PMC_PLL_ACR_DEFAULT_PLLA 0x00020010UL /* Default PLL ACR value for PLLA */ +#define AT91_PMC_PLL_ACR_DEFAULT_UPLL UL(0x12020010) /* Default PLL ACR value for UPLL */ +#define AT91_PMC_PLL_ACR_DEFAULT_PLLA UL(0x00020010) /* Default PLL ACR value for PLLA */ #define AT91_PMC_PLL_ACR_UTMIVR (1 << 12) /* UPLL Voltage regulator Control */ #define AT91_PMC_PLL_ACR_UTMIBG (1 << 13) /* UPLL Bandgap Control */ -- GitLab From c461c677a8cb19026fd06741a23ff32d0759342b Mon Sep 17 00:00:00 2001 From: Jon Hunter <jonathanh@nvidia.com> Date: Mon, 10 Oct 2022 11:00:46 +0100 Subject: [PATCH 1984/2223] clk: tegra: Fix Tegra PWM parent clock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 8c193f4714df ("pwm: tegra: Optimize period calculation") updated the period calculation in the Tegra PWM driver and now returns an error if the period requested is less than minimum period supported. This is breaking PWM support on various Tegra platforms. 
For example, on the Tegra210 Jetson Nano platform this is breaking the PWM fan support and probing the PWM fan driver now fails ... pwm-fan pwm-fan: Failed to configure PWM: -22 pwm-fan: probe of pwm-fan failed with error -22 The problem is that the default parent clock for the PWM on Tegra210 is a 32kHz clock and is unable to support the requested PWM period. Fix PWM support on Tegra20, Tegra30, Tegra114, Tegra124 and Tegra210 by updating the parent clock for the PWM to be the PLL_P. Fixes: 8c193f4714df ("pwm: tegra: Optimize period calculation") Signed-off-by: Jon Hunter <jonathanh@nvidia.com> Tested-by: Robert Eckelmann <longnoserob@gmail.com> # TF101 T20 Tested-by: Antoni Aloy Torrens <aaloytorrens@gmail.com> # TF101 T20 Tested-by: Svyatoslav Ryhel <clamor95@gmail.com> # TF201 T30 Tested-by: Andreas Westman Dorcsak <hedmoo@yahoo.com> # TF700T T3 Link: https://lore.kernel.org/r/20221010100046.6477-1-jonathanh@nvidia.com Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> Signed-off-by: Stephen Boyd <sboyd@kernel.org> --- drivers/clk/tegra/clk-tegra114.c | 1 + drivers/clk/tegra/clk-tegra124.c | 1 + drivers/clk/tegra/clk-tegra20.c | 1 + drivers/clk/tegra/clk-tegra210.c | 1 + drivers/clk/tegra/clk-tegra30.c | 1 + 5 files changed, 5 insertions(+) diff --git a/drivers/clk/tegra/clk-tegra114.c b/drivers/clk/tegra/clk-tegra114.c index f7405a58877e2..73303458e8866 100644 --- a/drivers/clk/tegra/clk-tegra114.c +++ b/drivers/clk/tegra/clk-tegra114.c @@ -1166,6 +1166,7 @@ static struct tegra_clk_init_table init_table[] __initdata = { { TEGRA114_CLK_I2S3_SYNC, TEGRA114_CLK_CLK_MAX, 24000000, 0 }, { TEGRA114_CLK_I2S4_SYNC, TEGRA114_CLK_CLK_MAX, 24000000, 0 }, { TEGRA114_CLK_VIMCLK_SYNC, TEGRA114_CLK_CLK_MAX, 24000000, 0 }, + { TEGRA114_CLK_PWM, TEGRA114_CLK_PLL_P, 408000000, 0 }, /* must be the last entry */ { TEGRA114_CLK_CLK_MAX, TEGRA114_CLK_CLK_MAX, 0, 0 }, }; diff --git a/drivers/clk/tegra/clk-tegra124.c b/drivers/clk/tegra/clk-tegra124.c index 
a9d4efcef2d4d..6c46592d794ec 100644 --- a/drivers/clk/tegra/clk-tegra124.c +++ b/drivers/clk/tegra/clk-tegra124.c @@ -1330,6 +1330,7 @@ static struct tegra_clk_init_table common_init_table[] __initdata = { { TEGRA124_CLK_I2S3_SYNC, TEGRA124_CLK_CLK_MAX, 24576000, 0 }, { TEGRA124_CLK_I2S4_SYNC, TEGRA124_CLK_CLK_MAX, 24576000, 0 }, { TEGRA124_CLK_VIMCLK_SYNC, TEGRA124_CLK_CLK_MAX, 24576000, 0 }, + { TEGRA124_CLK_PWM, TEGRA124_CLK_PLL_P, 408000000, 0 }, /* must be the last entry */ { TEGRA124_CLK_CLK_MAX, TEGRA124_CLK_CLK_MAX, 0, 0 }, }; diff --git a/drivers/clk/tegra/clk-tegra20.c b/drivers/clk/tegra/clk-tegra20.c index 8a4514f6d5033..422d782475532 100644 --- a/drivers/clk/tegra/clk-tegra20.c +++ b/drivers/clk/tegra/clk-tegra20.c @@ -1044,6 +1044,7 @@ static struct tegra_clk_init_table init_table[] = { { TEGRA20_CLK_GR2D, TEGRA20_CLK_PLL_C, 300000000, 0 }, { TEGRA20_CLK_GR3D, TEGRA20_CLK_PLL_C, 300000000, 0 }, { TEGRA20_CLK_VDE, TEGRA20_CLK_PLL_C, 300000000, 0 }, + { TEGRA20_CLK_PWM, TEGRA20_CLK_PLL_P, 48000000, 0 }, /* must be the last entry */ { TEGRA20_CLK_CLK_MAX, TEGRA20_CLK_CLK_MAX, 0, 0 }, }; diff --git a/drivers/clk/tegra/clk-tegra210.c b/drivers/clk/tegra/clk-tegra210.c index 499f999e91e13..a3488aaac3f78 100644 --- a/drivers/clk/tegra/clk-tegra210.c +++ b/drivers/clk/tegra/clk-tegra210.c @@ -3597,6 +3597,7 @@ static struct tegra_clk_init_table init_table[] __initdata = { { TEGRA210_CLK_VIMCLK_SYNC, TEGRA210_CLK_CLK_MAX, 24576000, 0 }, { TEGRA210_CLK_HDA, TEGRA210_CLK_PLL_P, 51000000, 0 }, { TEGRA210_CLK_HDA2CODEC_2X, TEGRA210_CLK_PLL_P, 48000000, 0 }, + { TEGRA210_CLK_PWM, TEGRA210_CLK_PLL_P, 48000000, 0 }, /* This MUST be the last entry. 
*/ { TEGRA210_CLK_CLK_MAX, TEGRA210_CLK_CLK_MAX, 0, 0 }, }; diff --git a/drivers/clk/tegra/clk-tegra30.c b/drivers/clk/tegra/clk-tegra30.c index 168c07d5a5f24..60f1534711f1c 100644 --- a/drivers/clk/tegra/clk-tegra30.c +++ b/drivers/clk/tegra/clk-tegra30.c @@ -1237,6 +1237,7 @@ static struct tegra_clk_init_table init_table[] = { { TEGRA30_CLK_VIMCLK_SYNC, TEGRA30_CLK_CLK_MAX, 24000000, 0 }, { TEGRA30_CLK_HDA, TEGRA30_CLK_PLL_P, 102000000, 0 }, { TEGRA30_CLK_HDA2CODEC_2X, TEGRA30_CLK_PLL_P, 48000000, 0 }, + { TEGRA30_CLK_PWM, TEGRA30_CLK_PLL_P, 48000000, 0 }, /* must be the last entry */ { TEGRA30_CLK_CLK_MAX, TEGRA30_CLK_CLK_MAX, 0, 0 }, }; -- GitLab From a8aed7b35becfd21f22a77c7014029ea837b018f Mon Sep 17 00:00:00 2001 From: Jonathan Cooper <jonathan.s.cooper@amd.com> Date: Thu, 13 Oct 2022 10:55:53 +0100 Subject: [PATCH 1985/2223] sfc: Change VF mac via PF as first preference if available. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changing a VF's mac address through the VF (rather than via the PF) fails with EPERM because the latter part of efx_ef10_set_mac_address attempts to change the vport mac address list as the VF. Even with this fixed it still fails with EBUSY because the vadaptor is still assigned on the VF - the vadaptor reassignment must be within a section where the VF has torn down its state. A major reason this has broken is because we have two functions that ostensibly do the same thing - have a PF and VF cooperate to change a VF mac address. Rather than do this, if we are changing the mac of a VF that has a link to the PF in the same VM then simply call sriov_set_vf_mac instead, which is a proven working function that does that. If there is no PF available, or that fails non-fatally, then attempt to change the VF's mac address as we would a PF, without updating the PF's data. 
Test case: Create a VF: echo 1 > /sys/class/net/<if>/device/sriov_numvfs Set the mac address of the VF directly: ip link set <vf> addr 00:11:22:33:44:55 Set the MAC address of the VF via the PF: ip link set <pf> vf 0 mac 00:11:22:33:44:66 Without this patch the last command will fail with ENOENT. Signed-off-by: Jonathan Cooper <jonathan.s.cooper@amd.com> Reported-by: Íñigo Huguet <ihuguet@redhat.com> Fixes: 910c8789a777 ("set the MAC address using MC_CMD_VADAPTOR_SET_MAC") Acked-by: Edward Cree <ecree.xilinx@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/sfc/ef10.c | 58 ++++++++++++++------------------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index d1e1aa19a68ed..7022fb2005a2f 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -3277,6 +3277,30 @@ static int efx_ef10_set_mac_address(struct efx_nic *efx) bool was_enabled = efx->port_enabled; int rc; +#ifdef CONFIG_SFC_SRIOV + /* If this function is a VF and we have access to the parent PF, + * then use the PF control path to attempt to change the VF MAC address. + */ + if (efx->pci_dev->is_virtfn && efx->pci_dev->physfn) { + struct efx_nic *efx_pf = pci_get_drvdata(efx->pci_dev->physfn); + struct efx_ef10_nic_data *nic_data = efx->nic_data; + u8 mac[ETH_ALEN]; + + /* net_dev->dev_addr can be zeroed by efx_net_stop in + * efx_ef10_sriov_set_vf_mac, so pass in a copy. 
+ */ + ether_addr_copy(mac, efx->net_dev->dev_addr); + + rc = efx_ef10_sriov_set_vf_mac(efx_pf, nic_data->vf_index, mac); + if (!rc) + return 0; + + netif_dbg(efx, drv, efx->net_dev, + "Updating VF mac via PF failed (%d), setting directly\n", + rc); + } +#endif + efx_device_detach_sync(efx); efx_net_stop(efx->net_dev); @@ -3297,40 +3321,6 @@ static int efx_ef10_set_mac_address(struct efx_nic *efx) efx_net_open(efx->net_dev); efx_device_attach_if_not_resetting(efx); -#ifdef CONFIG_SFC_SRIOV - if (efx->pci_dev->is_virtfn && efx->pci_dev->physfn) { - struct efx_ef10_nic_data *nic_data = efx->nic_data; - struct pci_dev *pci_dev_pf = efx->pci_dev->physfn; - - if (rc == -EPERM) { - struct efx_nic *efx_pf; - - /* Switch to PF and change MAC address on vport */ - efx_pf = pci_get_drvdata(pci_dev_pf); - - rc = efx_ef10_sriov_set_vf_mac(efx_pf, - nic_data->vf_index, - efx->net_dev->dev_addr); - } else if (!rc) { - struct efx_nic *efx_pf = pci_get_drvdata(pci_dev_pf); - struct efx_ef10_nic_data *nic_data = efx_pf->nic_data; - unsigned int i; - - /* MAC address successfully changed by VF (with MAC - * spoofing) so update the parent PF if possible. - */ - for (i = 0; i < efx_pf->vf_count; ++i) { - struct ef10_vf *vf = nic_data->vf + i; - - if (vf->efx == efx) { - ether_addr_copy(vf->mac, - efx->net_dev->dev_addr); - return 0; - } - } - } - } else -#endif if (rc == -EPERM) { netif_err(efx, drv, efx->net_dev, "Cannot change MAC address; use sfboot to enable" -- GitLab From d8bde3bf7f82dac5fc68a62c2816793a12cafa2a Mon Sep 17 00:00:00 2001 From: Xiaobo Liu <cppcoffee@gmail.com> Date: Fri, 14 Oct 2022 10:05:40 +0800 Subject: [PATCH 1986/2223] net/atm: fix proc_mpc_write incorrect return value When the input contains '\0' or '\n', proc_mpc_write has read them, so the return value needs +1. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Xiaobo Liu <cppcoffee@gmail.com> Signed-off-by: David S.
Miller <davem@davemloft.net> --- net/atm/mpoa_proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c index 829db9eba0cb9..aaf64b9539150 100644 --- a/net/atm/mpoa_proc.c +++ b/net/atm/mpoa_proc.c @@ -219,11 +219,12 @@ static ssize_t proc_mpc_write(struct file *file, const char __user *buff, if (!page) return -ENOMEM; - for (p = page, len = 0; len < nbytes; p++, len++) { + for (p = page, len = 0; len < nbytes; p++) { if (get_user(*p, buff++)) { free_page((unsigned long)page); return -EFAULT; } + len += 1; if (*p == '\0' || *p == '\n') break; } -- GitLab From 017e42540639a46fdf7c7f5ee647e0b7806c9013 Mon Sep 17 00:00:00 2001 From: Cezar Bulinaru <cbulinaru@gmail.com> Date: Thu, 13 Oct 2022 22:45:03 -0400 Subject: [PATCH 1987/2223] net: hv_netvsc: Fix a warning triggered by memcpy in rndis_filter memcpy: detected field-spanning write (size 168) of single field "(void *)&request->response_msg + (sizeof(struct rndis_message) - sizeof(union rndis_message_container)) + sizeof(*req_id)" at drivers/net/hyperv/rndis_filter.c:338 (size 40) RSP: 0018:ffffc90000144de0 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff8881766b4000 RCX: 0000000000000000 RDX: 0000000000000102 RSI: 0000000000009ffb RDI: 00000000ffffffff RBP: ffffc90000144e38 R08: 0000000000000000 R09: 00000000ffffdfff R10: ffffc90000144c48 R11: ffffffff82f56ac8 R12: ffff8881766b403c R13: 00000000000000a8 R14: ffff888100b75000 R15: ffff888179301d00 FS: 0000000000000000(0000) GS:ffff8884d6280000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055f8b024c418 CR3: 0000000176548001 CR4: 00000000003706e0 Call Trace: <IRQ> ? 
_raw_spin_unlock_irqrestore+0x27/0x50 netvsc_poll+0x556/0x940 [hv_netvsc] __napi_poll+0x2e/0x170 net_rx_action+0x299/0x2f0 __do_softirq+0xed/0x2ef __irq_exit_rcu+0x9f/0x110 irq_exit_rcu+0xe/0x20 sysvec_hyperv_callback+0xb0/0xd0 </IRQ> <TASK> asm_sysvec_hyperv_callback+0x1b/0x20 RIP: 0010:native_safe_halt+0xb/0x10 Fixes: A warning triggered when the response message len exceeds the size of rndis_message. Inside the rndis_request structure these fields are however followed by a RNDIS_EXT_LEN padding so it is safe to use unsafe_memcpy. Reviewed-by: Michael Kelley <mikelley@microsoft.com> Signed-off-by: Cezar Bulinaru <cbulinaru@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/hyperv/rndis_filter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 11f767a204443..eea777ec2541b 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -20,6 +20,7 @@ #include <linux/vmalloc.h> #include <linux/rtnetlink.h> #include <linux/ucs2_string.h> +#include <linux/string.h> #include "hyperv_net.h" #include "netvsc_trace.h" @@ -335,9 +336,10 @@ static void rndis_filter_receive_response(struct net_device *ndev, if (resp->msg_len <= sizeof(struct rndis_message) + RNDIS_EXT_LEN) { memcpy(&request->response_msg, resp, RNDIS_HEADER_SIZE + sizeof(*req_id)); - memcpy((void *)&request->response_msg + RNDIS_HEADER_SIZE + sizeof(*req_id), + unsafe_memcpy((void *)&request->response_msg + RNDIS_HEADER_SIZE + sizeof(*req_id), data + RNDIS_HEADER_SIZE + sizeof(*req_id), - resp->msg_len - RNDIS_HEADER_SIZE - sizeof(*req_id)); + resp->msg_len - RNDIS_HEADER_SIZE - sizeof(*req_id), + "request->response_msg is followed by a padding of RNDIS_EXT_LEN inside rndis_request"); if (request->request_msg.ndis_msg_type == RNDIS_MSG_QUERY && request->request_msg.msg. 
query_req.oid == RNDIS_OID_GEN_MEDIA_CONNECT_STATUS) -- GitLab From 0c9efbd5c50c64ead434960a404c9c9a097b0403 Mon Sep 17 00:00:00 2001 From: Harini Katakam <harini.katakam@amd.com> Date: Fri, 14 Oct 2022 12:17:35 +0530 Subject: [PATCH 1988/2223] net: phy: dp83867: Extend RX strap quirk for SGMII mode When RX strap in HW is not set to MODE 3 or 4, bit 7 and 8 in CF4 register should be set. The former is already handled in dp83867_config_init; add the latter in SGMII specific initialization. Fixes: 2a10154abcb7 ("net: phy: dp83867: Add TI dp83867 phy") Signed-off-by: Harini Katakam <harini.katakam@amd.com> Reviewed-by: Andrew Lunn <andrew@lunn.ch> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/phy/dp83867.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index 6939563d3b7c5..417527f8bbf55 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -853,6 +853,14 @@ static int dp83867_config_init(struct phy_device *phydev) else val &= ~DP83867_SGMII_TYPE; phy_write_mmd(phydev, DP83867_DEVADDR, DP83867_SGMIICTL, val); + + /* This is a SW workaround for link instability if RX_CTRL is + * not strapped to mode 3 or 4 in HW. This is required for SGMII + * in addition to clearing bit 7, handled above. + */ + if (dp83867->rxctrl_strap_quirk) + phy_set_bits_mmd(phydev, DP83867_DEVADDR, DP83867_CFG4, + BIT(8)); } val = phy_read(phydev, DP83867_CFG3); -- GitLab From bdee15e8c58b450ad736a2b62ef8c7a12548b704 Mon Sep 17 00:00:00 2001 From: Dan Carpenter <dan.carpenter@oracle.com> Date: Fri, 14 Oct 2022 12:34:36 +0300 Subject: [PATCH 1989/2223] net/smc: Fix an error code in smc_lgr_create() If smc_wr_alloc_lgr_mem() fails then return an error code. Don't return success. Fixes: 8799e310fb3f ("net/smc: add v2 support to the work request layer") Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com> Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- net/smc/smc_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index e6ee797640b45..c305d8dd23f80 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -896,7 +896,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) } memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], SMC_MAX_PNETID_LEN); - if (smc_wr_alloc_lgr_mem(lgr)) + rc = smc_wr_alloc_lgr_mem(lgr); + if (rc) goto free_wq; smc_llc_lgr_init(lgr, smc); -- GitLab From 9408f3d321ed2286b9722bceff08ca28b741c026 Mon Sep 17 00:00:00 2001 From: Dan Carpenter <dan.carpenter@oracle.com> Date: Fri, 14 Oct 2022 17:33:02 +0300 Subject: [PATCH 1990/2223] sunhme: Uninitialized variable in happy_meal_init() The "burst" string is only initialized for CONFIG_SPARC. It should be set to "64" because that's what is used by PCI. Fixes: 24cddbc3ef11 ("sunhme: Combine continued messages") Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com> Reviewed-by: Sean Anderson <seanga2@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/sun/sunhme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c index 91f10f746dffd..1c16548415cdd 100644 --- a/drivers/net/ethernet/sun/sunhme.c +++ b/drivers/net/ethernet/sun/sunhme.c @@ -1328,7 +1328,7 @@ static int happy_meal_init(struct happy_meal *hp) void __iomem *erxregs = hp->erxregs; void __iomem *bregs = hp->bigmacregs; void __iomem *tregs = hp->tcvregs; - const char *bursts; + const char *bursts = "64"; u32 regtmp, rxcfg; /* If auto-negotiation timer is running, kill it. 
*/ -- GitLab From 0a6d58a70a39d9a74882af6d00ec6df0737503ff Mon Sep 17 00:00:00 2001 From: Dan Carpenter <dan.carpenter@oracle.com> Date: Fri, 14 Oct 2022 18:08:39 +0300 Subject: [PATCH 1991/2223] net: dsa: uninitialized variable in dsa_slave_netdevice_event() Return zero if both dsa_slave_dev_check() and netdev_uses_dsa() are false. Fixes: acc43b7bf52a ("net: dsa: allow masters to join a LAG") Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/dsa/slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 1a59918d3b305..a9fde48cffd43 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -3145,7 +3145,7 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, case NETDEV_CHANGELOWERSTATE: { struct netdev_notifier_changelowerstate_info *info = ptr; struct dsa_port *dp; - int err; + int err = 0; if (dsa_slave_dev_check(dev)) { dp = dsa_slave_to_port(dev); -- GitLab From fc8695eb11f07d936a4a9dbd15d7797986bc8b89 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski <kuba@kernel.org> Date: Fri, 14 Oct 2022 09:07:46 -0700 Subject: [PATCH 1992/2223] Revert "net: fix cpu_max_bits_warn() usage in netif_attrmask_next{,_and}" This reverts commit 854701ba4c39afae2362ba19a580c461cb183e4f. We have more violations around, which leads to: WARNING: CPU: 2 PID: 1 at include/linux/cpumask.h:110 __netif_set_xps_queue+0x14e/0x770 Let's back this out and retry with a larger clean up in -next. Fixes: 854701ba4c39 ("net: fix cpu_max_bits_warn() usage in netif_attrmask_next{,_and}") Link: https://lore.kernel.org/all/20221014030459.3272206-2-guoren@kernel.org/ Signed-off-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- include/linux/netdevice.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a36edb0ec1993..eddf8ee270e74 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3663,8 +3663,9 @@ static inline bool netif_attr_test_online(unsigned long j, static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, unsigned int nr_bits) { - /* n is a prior cpu */ - cpu_max_bits_warn(n + 1, nr_bits); + /* -1 is a legal arg here. */ + if (n != -1) + cpu_max_bits_warn(n, nr_bits); if (srcp) return find_next_bit(srcp, nr_bits, n + 1); @@ -3685,8 +3686,9 @@ static inline int netif_attrmask_next_and(int n, const unsigned long *src1p, const unsigned long *src2p, unsigned int nr_bits) { - /* n is a prior cpu */ - cpu_max_bits_warn(n + 1, nr_bits); + /* -1 is a legal arg here. */ + if (n != -1) + cpu_max_bits_warn(n, nr_bits); if (src1p && src2p) return find_next_and_bit(src1p, src2p, nr_bits, n + 1); -- GitLab From 96de900ae78e7dbedc937fd91bafe2934579c65a Mon Sep 17 00:00:00 2001 From: Shenwei Wang <shenwei.wang@nxp.com> Date: Fri, 14 Oct 2022 09:47:28 -0500 Subject: [PATCH 1993/2223] net: phylink: add mac_managed_pm in phylink_config structure The recent commit 'commit 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state")' requires the MAC driver explicitly tell the phy driver who is managing the PM, otherwise you will see warning during resume stage. Add a boolean property in the phylink_config structure so that the MAC driver can use it to tell the PHY driver if it wants to manage the PM. Fixes: 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state") Signed-off-by: Shenwei Wang <shenwei.wang@nxp.com> Acked-by: Florian Fainelli <f.fainelli@gmail.com> Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- drivers/net/phy/phylink.c | 3 +++ include/linux/phylink.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 75464df191ef7..6547b6cc6cbe7 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1661,6 +1661,9 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy, if (phy_interrupt_is_valid(phy)) phy_request_interrupt(phy); + if (pl->config->mac_managed_pm) + phy->mac_managed_pm = true; + return 0; } diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 664dd409feb93..3f01ac8017e06 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -122,6 +122,7 @@ enum phylink_op_type { * (See commit 7cceb599d15d ("net: phylink: avoid mac_config calls") * @poll_fixed_state: if true, starts link_poll, * if MAC link is at %MLO_AN_FIXED mode. + * @mac_managed_pm: if true, indicate the MAC driver is responsible for PHY PM. * @ovr_an_inband: if true, override PCS to MLO_AN_INBAND * @get_fixed_state: callback to execute to determine the fixed link state, * if MAC link is at %MLO_AN_FIXED mode. @@ -134,6 +135,7 @@ struct phylink_config { enum phylink_op_type type; bool legacy_pre_march2020; bool poll_fixed_state; + bool mac_managed_pm; bool ovr_an_inband; void (*get_fixed_state)(struct phylink_config *config, struct phylink_link_state *state); -- GitLab From f151c147b3afcf92dedff53f5f0e965414e4fd2c Mon Sep 17 00:00:00 2001 From: Shenwei Wang <shenwei.wang@nxp.com> Date: Fri, 14 Oct 2022 09:47:29 -0500 Subject: [PATCH 1994/2223] net: stmmac: Enable mac_managed_pm phylink config Enable the mac_managed_pm configuration in the phylink_config structure to avoid the kernel warning during system resume. Fixes: 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state") Signed-off-by: Shenwei Wang <shenwei.wang@nxp.com> Acked-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 65c96773c6d2b..8273e6a175c84 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1214,6 +1214,7 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) if (priv->plat->tx_queues_to_use > 1) priv->phylink_config.mac_capabilities &= ~(MAC_10HD | MAC_100HD | MAC_1000HD); + priv->phylink_config.mac_managed_pm = true; phylink = phylink_create(&priv->phylink_config, fwnode, mode, &stmmac_phylink_mac_ops); -- GitLab From bde971a83bbff78561458ded236605a365411b87 Mon Sep 17 00:00:00 2001 From: Denis Nikitin <denik@chromium.org> Date: Fri, 14 Oct 2022 11:45:32 -0700 Subject: [PATCH 1995/2223] KVM: arm64: nvhe: Fix build with profile optimization Kernel build with clang and KCFLAGS=-fprofile-sample-use=<profile> fails with: error: arch/arm64/kvm/hyp/nvhe/kvm_nvhe.tmp.o: Unexpected SHT_REL section ".rel.llvm.call-graph-profile" Starting from 13.0.0 llvm can generate SHT_REL section, see https://reviews.llvm.org/rGca3bdb57fa1ac98b711a735de048c12b5fdd8086. gen-hyprel does not support SHT_REL relocation section. Filter out profile use flags to fix the build with profile optimization. Signed-off-by: Denis Nikitin <denik@chromium.org> Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20221014184532.3153551-1-denik@chromium.org --- arch/arm64/kvm/hyp/nvhe/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 48f6ae7cc6e64..be0a2bc3e20d0 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -92,6 +92,10 @@ quiet_cmd_hypcopy = HYPCOPY $@ # Remove ftrace, Shadow Call Stack, and CFI CFLAGS. 
# This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations. KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS)) +# Starting from 13.0.0 llvm emits SHT_REL section '.llvm.call-graph-profile' +# when profile optimization is applied. gen-hyprel does not support SHT_REL and +# causes a build failure. Remove profile optimization flags. +KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS)) # KVM nVHE code is run at a different exception code with a different map, so # compiler instrumentation that inserts callbacks or checks into the code may -- GitLab From c000a2607145d28b06c697f968491372ea56c23a Mon Sep 17 00:00:00 2001 From: Eric Ren <renzhengeek@gmail.com> Date: Sat, 15 Oct 2022 11:19:28 +0800 Subject: [PATCH 1996/2223] KVM: arm64: vgic: Fix exit condition in scan_its_table() With some PCIe topologies, restoring a guest fails while parsing the ITS device tables. Reproducer hints: 1. Create ARM virt VM with pxb-pcie bus which adds extra host bridges, with qemu command like: ``` -device pxb-pcie,bus_nr=8,id=pci.x,numa_node=0,bus=pcie.0 \ -device pcie-root-port,..,bus=pci.x \ ... -device pxb-pcie,bus_nr=37,id=pci.y,numa_node=1,bus=pcie.0 \ -device pcie-root-port,..,bus=pci.y \ ... ``` 2. Ensure the guest uses 2-level device table 3. Perform VM migration which calls save/restore device tables In that setup, we get a big "offset" between 2 device_ids, which makes unsigned "len" round up a big positive number, causing the scan loop to continue with a bad GPA. For example: 1. L1 table has 2 entries; 2. and we are now scanning at L2 table entry index 2075 (pointed to by L1 first entry) 3. if next device id is 9472, we will get a big offset: 7397; 4. with unsigned 'len', 'len -= offset * esz', len will underflow to a positive number, mistakenly into next iteration with a bad GPA; (It should break out of the current L2 table scanning, and jump into the next L1 table entry) 5. 
that bad GPA fails the guest read. Fix it by stopping the L2 table scan when the next device id is outside of the current table, allowing the scan to continue from the next L1 table entry. Thanks to Eric Auger for the fix suggestion. Fixes: 920a7a8fa92a ("KVM: arm64: vgic-its: Add infrastructure for table lookup") Suggested-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Eric Ren <renzhengeek@gmail.com> [maz: commit message tidy-up] Signed-off-by: Marc Zyngier <maz@kernel.org> Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/d9c3a564af9e2c5bf63f48a7dcbf08cd593c5c0b.1665802985.git.renzhengeek@gmail.com --- arch/arm64/kvm/vgic/vgic-its.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 24d7778d1ce63..733b53055f976 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -2149,7 +2149,7 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, memset(entry, 0, esz); - while (len > 0) { + while (true) { int next_offset; size_t byte_offset; @@ -2162,6 +2162,9 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, return next_offset; byte_offset = next_offset * esz; + if (byte_offset >= len) + break; + id += next_offset; gpa += byte_offset; len -= byte_offset; -- GitLab From e4080492877d3125ffd0c6dd3e3c997fbe0ebe6d Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Fri, 14 Oct 2022 20:08:59 +0300 Subject: [PATCH 1997/2223] perf test: test_intel_pt.sh: Fix return checking again count_result() does not always reset ret=0 which means the value can spill into the next test result. Fix by explicitly setting it to zero between tests.
Committer testing: # perf test "Miscellaneous Intel PT testing" 110: Miscellaneous Intel PT testing : Ok # Tested as well with: # perf test -v "Miscellaneous Intel PT testing" Fixes: fd9b45e39cfaf885 ("perf test: test_intel_pt.sh: Fix return checking") Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20221014170905.64069-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_intel_pt.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index efaad9566c347..4609a24c93400 100755 --- a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -265,13 +265,12 @@ count_result() return fi err_cnt=$((err_cnt + 1)) - ret=0 } ret=0 -test_system_wide_side_band || ret=$? ; count_result $ret -test_per_thread "" "" || ret=$? ; count_result $ret -test_per_thread "k" "(incl. kernel) " || ret=$? ; count_result $ret +test_system_wide_side_band || ret=$? ; count_result $ret ; ret=0 +test_per_thread "" "" || ret=$? ; count_result $ret ; ret=0 +test_per_thread "k" "(incl. kernel) " || ret=$? ; count_result $ret ; ret=0 cleanup -- GitLab From 5021d82bca4f5335b29de71f0533b93c6c15007e Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Fri, 14 Oct 2022 20:09:00 +0300 Subject: [PATCH 1998/2223] perf test: test_intel_pt.sh: Tidy some perf record options When not decoding, the options "-B -N --no-bpf-event" speed up perf record. Make a common function for them. 
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20221014170905.64069-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_intel_pt.sh | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index 4609a24c93400..334836f92bdc6 100755 --- a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -42,6 +42,14 @@ trap_cleanup() trap trap_cleanup EXIT TERM INT +# perf record for testing without decoding +perf_record_no_decode() +{ + # Options to speed up recording: no post-processing, no build-id cache update, + # and no BPF events. + perf record -B -N --no-bpf-event "$@" +} + have_workload=false cat << _end_of_file_ | /usr/bin/cc -o "${workload}" -xc - -pthread && have_workload=true #include <time.h> @@ -76,7 +84,7 @@ _end_of_file_ can_cpu_wide() { echo "Checking for CPU-wide recording on CPU $1" - if ! perf record -o "${tmpfile}" -B -N --no-bpf-event -e dummy:u -C "$1" true >/dev/null 2>&1 ; then + if ! perf_record_no_decode -o "${tmpfile}" -e dummy:u -C "$1" true >/dev/null 2>&1 ; then echo "No so skipping" return 2 fi @@ -93,7 +101,7 @@ test_system_wide_side_band() can_cpu_wide 1 || return $? 
# Record on CPU 0 a task running on CPU 1 - perf record -B -N --no-bpf-event -o "${perfdatafile}" -e intel_pt//u -C 0 -- taskset --cpu-list 1 uname + perf_record_no_decode -o "${perfdatafile}" -e intel_pt//u -C 0 -- taskset --cpu-list 1 uname # Should get MMAP events from CPU 1 because they can be needed to decode mmap_cnt=$(perf script -i "${perfdatafile}" --no-itrace --show-mmap-events -C 1 2>/dev/null | grep -c MMAP) @@ -109,7 +117,7 @@ test_system_wide_side_band() can_kernel() { - perf record -o "${tmpfile}" -B -N --no-bpf-event -e dummy:k true >/dev/null 2>&1 || return 2 + perf_record_no_decode -o "${tmpfile}" -e dummy:k true >/dev/null 2>&1 || return 2 return 0 } @@ -235,7 +243,7 @@ test_per_thread() wait_for_threads ${w1} 2 wait_for_threads ${w2} 2 - perf record -B -N --no-bpf-event -o "${perfdatafile}" -e intel_pt//u"${k}" -vvv --per-thread -p "${w1},${w2}" 2>"${errfile}" >"${outfile}" & + perf_record_no_decode -o "${perfdatafile}" -e intel_pt//u"${k}" -vvv --per-thread -p "${w1},${w2}" 2>"${errfile}" >"${outfile}" & ppid=$! echo "perf PID is $ppid" wait_for_perf_to_start ${ppid} "${errfile}" || return 1 -- GitLab From 9637bf8ff0f050cfb9fe84f5734af633e7902796 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Fri, 14 Oct 2022 20:09:01 +0300 Subject: [PATCH 1999/2223] perf test: test_intel_pt.sh: Print a message when skipping kernel tracing Messages display with the perf test -v option. Add a message to show when skipping a test because the user cannot do kernel tracing. 
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20221014170905.64069-4-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_intel_pt.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index 334836f92bdc6..9c746ff1c4d24 100755 --- a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -117,7 +117,14 @@ test_system_wide_side_band() can_kernel() { - perf_record_no_decode -o "${tmpfile}" -e dummy:k true >/dev/null 2>&1 || return 2 + if [ -z "${can_kernel_trace}" ] ; then + can_kernel_trace=0 + perf_record_no_decode -o "${tmpfile}" -e dummy:k true >/dev/null 2>&1 && can_kernel_trace=1 + fi + if [ ${can_kernel_trace} -eq 0 ] ; then + echo "SKIP: no kernel tracing" + return 2 + fi return 0 } -- GitLab From 40053a4b7ebd227e923eb996f5e3e328a647db93 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Fri, 14 Oct 2022 20:09:02 +0300 Subject: [PATCH 2000/2223] perf test: test_intel_pt.sh: Tidy some alignment Tidy alignment of test function lines to make them more readable. 
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20221014170905.64069-5-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_intel_pt.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index 9c746ff1c4d24..79dde57b561dd 100755 --- a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -283,9 +283,9 @@ count_result() } ret=0 -test_system_wide_side_band || ret=$? ; count_result $ret ; ret=0 -test_per_thread "" "" || ret=$? ; count_result $ret ; ret=0 -test_per_thread "k" "(incl. kernel) " || ret=$? ; count_result $ret ; ret=0 +test_system_wide_side_band || ret=$? ; count_result $ret ; ret=0 +test_per_thread "" "" || ret=$? ; count_result $ret ; ret=0 +test_per_thread "k" "(incl. kernel) " || ret=$? ; count_result $ret ; ret=0 cleanup -- GitLab From 973db24079fc6b292e896b3b9c057a0a6c0d8e93 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Fri, 14 Oct 2022 20:09:03 +0300 Subject: [PATCH 2001/2223] perf test: test_intel_pt.sh: Add jitdump test Add a test for decoding self-modifying code using a jitdump file. The test creates a workload that uses self-modifying code and generates its own jitdump file. The result is processed with perf inject --jit and checked for decoding errors. Note the test will fail without patch "perf inject: Fix GEN_ELF_TEXT_OFFSET for jit" applied. 
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20221014170905.64069-6-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_intel_pt.sh | 162 ++++++++++++++++++++++++ 1 file changed, 162 insertions(+) diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index 79dde57b561dd..e0bf75981b9ca 100755 --- a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -22,6 +22,7 @@ outfile="${temp_dir}/test-out.txt" errfile="${temp_dir}/test-err.txt" workload="${temp_dir}/workload" awkscript="${temp_dir}/awkscript" +jitdump_workload="${temp_dir}/jitdump_workload" cleanup() { @@ -50,6 +51,13 @@ perf_record_no_decode() perf record -B -N --no-bpf-event "$@" } +# perf record for testing should not need BPF events +perf_record_no_bpf() +{ + # Options for no BPF events + perf record --no-bpf-event "$@" +} + have_workload=false cat << _end_of_file_ | /usr/bin/cc -o "${workload}" -xc - -pthread && have_workload=true #include <time.h> @@ -269,6 +277,159 @@ test_per_thread() return 0 } +test_jitdump() +{ + echo "--- Test tracing self-modifying code that uses jitdump ---" + + script_path=$(realpath "$0") + script_dir=$(dirname "$script_path") + jitdump_incl_dir="${script_dir}/../../util" + jitdump_h="${jitdump_incl_dir}/jitdump.h" + + if [ ! 
-e "${jitdump_h}" ] ; then + echo "SKIP: Include file jitdump.h not found" + return 2 + fi + + if [ -z "${have_jitdump_workload}" ] ; then + have_jitdump_workload=false + # Create a workload that uses self-modifying code and generates its own jitdump file + cat <<- "_end_of_file_" | /usr/bin/cc -o "${jitdump_workload}" -I "${jitdump_incl_dir}" -xc - -pthread && have_jitdump_workload=true + #define _GNU_SOURCE + #include <sys/mman.h> + #include <sys/types.h> + #include <stddef.h> + #include <stdio.h> + #include <stdint.h> + #include <unistd.h> + #include <string.h> + + #include "jitdump.h" + + #define CHK_BYTE 0x5a + + static inline uint64_t rdtsc(void) + { + unsigned int low, high; + + asm volatile("rdtsc" : "=a" (low), "=d" (high)); + + return low | ((uint64_t)high) << 32; + } + + static FILE *open_jitdump(void) + { + struct jitheader header = { + .magic = JITHEADER_MAGIC, + .version = JITHEADER_VERSION, + .total_size = sizeof(header), + .pid = getpid(), + .timestamp = rdtsc(), + .flags = JITDUMP_FLAGS_ARCH_TIMESTAMP, + }; + char filename[256]; + FILE *f; + void *m; + + snprintf(filename, sizeof(filename), "jit-%d.dump", getpid()); + f = fopen(filename, "w+"); + if (!f) + goto err; + /* Create an MMAP event for the jitdump file. That is how perf tool finds it. 
*/ + m = mmap(0, 4096, PROT_READ | PROT_EXEC, MAP_PRIVATE, fileno(f), 0); + if (m == MAP_FAILED) + goto err_close; + munmap(m, 4096); + if (fwrite(&header,sizeof(header),1,f) != 1) + goto err_close; + return f; + + err_close: + fclose(f); + err: + return NULL; + } + + static int write_jitdump(FILE *f, void *addr, const uint8_t *dat, size_t sz, uint64_t *idx) + { + struct jr_code_load rec = { + .p.id = JIT_CODE_LOAD, + .p.total_size = sizeof(rec) + sz, + .p.timestamp = rdtsc(), + .pid = getpid(), + .tid = gettid(), + .vma = (unsigned long)addr, + .code_addr = (unsigned long)addr, + .code_size = sz, + .code_index = ++*idx, + }; + + if (fwrite(&rec,sizeof(rec),1,f) != 1 || + fwrite(dat, sz, 1, f) != 1) + return -1; + return 0; + } + + static void close_jitdump(FILE *f) + { + fclose(f); + } + + int main() + { + /* Get a memory page to store executable code */ + void *addr = mmap(0, 4096, PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + /* Code to execute: mov CHK_BYTE, %eax ; ret */ + uint8_t dat[] = {0xb8, CHK_BYTE, 0x00, 0x00, 0x00, 0xc3}; + FILE *f = open_jitdump(); + uint64_t idx = 0; + int ret = 1; + + if (!f) + return 1; + /* Copy executable code to executable memory page */ + memcpy(addr, dat, sizeof(dat)); + /* Record it in the jitdump file */ + if (write_jitdump(f, addr, dat, sizeof(dat), &idx)) + goto out_close; + /* Call it */ + ret = ((int (*)(void))addr)() - CHK_BYTE; + out_close: + close_jitdump(f); + return ret; + } + _end_of_file_ + fi + + if ! 
$have_jitdump_workload ; then + echo "SKIP: No jitdump workload" + return 2 + fi + + # Change to temp_dir so jitdump collateral files go there + cd "${temp_dir}" + perf_record_no_bpf -o "${tmpfile}" -e intel_pt//u "${jitdump_workload}" + perf inject -i "${tmpfile}" -o "${perfdatafile}" --jit + decode_br_cnt=$(perf script -i "${perfdatafile}" --itrace=b | wc -l) + # Note that overflow and lost errors are suppressed for the error count + decode_err_cnt=$(perf script -i "${perfdatafile}" --itrace=e-o-l | grep -ci error) + cd - + # Should be thousands of branches + if [ "${decode_br_cnt}" -lt 1000 ] ; then + echo "Decode failed, only ${decode_br_cnt} branches" + return 1 + fi + # Should be no errors + if [ "${decode_err_cnt}" -ne 0 ] ; then + echo "Decode failed, ${decode_err_cnt} errors" + perf script -i "${perfdatafile}" --itrace=e-o-l + return 1 + fi + + echo OK + return 0 +} + count_result() { if [ "$1" -eq 2 ] ; then @@ -286,6 +447,7 @@ ret=0 test_system_wide_side_band || ret=$? ; count_result $ret ; ret=0 test_per_thread "" "" || ret=$? ; count_result $ret ; ret=0 test_per_thread "k" "(incl. kernel) " || ret=$? ; count_result $ret ; ret=0 +test_jitdump || ret=$? ; count_result $ret ; ret=0 cleanup -- GitLab From 89b15d00527b7825ff19130ed83478e80e3fae99 Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Fri, 14 Oct 2022 20:09:04 +0300 Subject: [PATCH 2002/2223] perf inject: Fix GEN_ELF_TEXT_OFFSET for jit When a program header was added, it moved the text section but GEN_ELF_TEXT_OFFSET was not updated. Fix by adding the program header size and aligning. 
Fixes: babd04386b1df8c3 ("perf jit: Include program header in ELF files") Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Lieven Hey <lieven.hey@kdab.com> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20221014170905.64069-7-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/genelf.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/genelf.h b/tools/perf/util/genelf.h index b5c909546e3f2..6af062d1c4522 100644 --- a/tools/perf/util/genelf.h +++ b/tools/perf/util/genelf.h @@ -2,6 +2,8 @@ #ifndef __GENELF_H__ #define __GENELF_H__ +#include <linux/math.h> + /* genelf.c */ int jit_write_elf(int fd, uint64_t code_addr, const char *sym, const void *code, int csize, void *debug, int nr_debug_entries, @@ -76,6 +78,6 @@ int jit_add_debug_info(Elf *e, uint64_t code_addr, void *debug, int nr_debug_ent #endif /* The .text section is directly after the ELF header */ -#define GEN_ELF_TEXT_OFFSET sizeof(Elf_Ehdr) +#define GEN_ELF_TEXT_OFFSET round_up(sizeof(Elf_Ehdr) + sizeof(Elf_Phdr), 16) #endif -- GitLab From f77811a0f62577d2d51e57c5740a4fbd53dd3331 Mon Sep 17 00:00:00 2001 From: Ammy Yi <ammy.yi@intel.com> Date: Fri, 14 Oct 2022 20:09:05 +0300 Subject: [PATCH 2003/2223] perf test: test_intel_pt.sh: Add 9 tests Add tests: Test with MTC and TSC disabled Test with branches disabled Test with/without CYC Test recording with sample mode Test with kernel trace Test virtual LBR Test power events Test with TNT packets disabled Test with event_trace These tests mostly check that perf record works with the corresponding Intel PT config terms, sometimes also checking that certain packets do or do not appear in the resulting trace as appropriate. 
The "Test virtual LBR" is slightly trickier, using a Python script to check that branch stacks are actually synthesized. Signed-off-by: Ammy Yi <ammy.yi@intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20221014170905.64069-8-adrian.hunter@intel.com Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/test_intel_pt.sh | 195 +++++++++++++++++++++++- 1 file changed, 194 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh index e0bf75981b9ca..4c0aabbe33bdf 100755 --- a/tools/perf/tests/shell/test_intel_pt.sh +++ b/tools/perf/tests/shell/test_intel_pt.sh @@ -23,6 +23,7 @@ errfile="${temp_dir}/test-err.txt" workload="${temp_dir}/workload" awkscript="${temp_dir}/awkscript" jitdump_workload="${temp_dir}/jitdump_workload" +maxbrstack="${temp_dir}/maxbrstack.py" cleanup() { @@ -422,7 +423,7 @@ test_jitdump() # Should be no errors if [ "${decode_err_cnt}" -ne 0 ] ; then echo "Decode failed, ${decode_err_cnt} errors" - perf script -i "${perfdatafile}" --itrace=e-o-l + perf script -i "${perfdatafile}" --itrace=e-o-l --show-mmap-events | cat return 1 fi @@ -430,6 +431,189 @@ test_jitdump() return 0 } +test_packet_filter() +{ + echo "--- Test with MTC and TSC disabled ---" + # Disable MTC and TSC + perf_record_no_decode -o "${perfdatafile}" -e intel_pt/mtc=0,tsc=0/u uname + # Should not get MTC packet + mtc_cnt=$(perf script -i "${perfdatafile}" -D 2>/dev/null | grep -c "MTC 0x") + if [ "${mtc_cnt}" -ne 0 ] ; then + echo "Failed to filter with mtc=0" + return 1 + fi + # Should not get TSC package + tsc_cnt=$(perf script -i "${perfdatafile}" -D 2>/dev/null | grep -c "TSC 0x") + if [ "${tsc_cnt}" -ne 0 ] ; then + echo "Failed to filter with tsc=0" + return 1 + fi + echo OK 
+ return 0 +} + +test_disable_branch() +{ + echo "--- Test with branches disabled ---" + # Disable branch + perf_record_no_decode -o "${perfdatafile}" -e intel_pt/branch=0/u uname + # Should not get branch related packets + tnt_cnt=$(perf script -i "${perfdatafile}" -D 2>/dev/null | grep -c "TNT 0x") + tip_cnt=$(perf script -i "${perfdatafile}" -D 2>/dev/null | grep -c "TIP 0x") + fup_cnt=$(perf script -i "${perfdatafile}" -D 2>/dev/null | grep -c "FUP 0x") + if [ "${tnt_cnt}" -ne 0 ] || [ "${tip_cnt}" -ne 0 ] || [ "${fup_cnt}" -ne 0 ] ; then + echo "Failed to disable branches" + return 1 + fi + echo OK + return 0 +} + +test_time_cyc() +{ + echo "--- Test with/without CYC ---" + # Check if CYC is supported + cyc=$(cat /sys/bus/event_source/devices/intel_pt/caps/psb_cyc) + if [ "${cyc}" != "1" ] ; then + echo "SKIP: CYC is not supported" + return 2 + fi + # Enable CYC + perf_record_no_decode -o "${perfdatafile}" -e intel_pt/cyc/u uname + # should get CYC packets + cyc_cnt=$(perf script -i "${perfdatafile}" -D 2>/dev/null | grep -c "CYC 0x") + if [ "${cyc_cnt}" = "0" ] ; then + echo "Failed to get CYC packet" + return 1 + fi + # Without CYC + perf_record_no_decode -o "${perfdatafile}" -e intel_pt//u uname + # Should not get CYC packets + cyc_cnt=$(perf script -i "${perfdatafile}" -D 2>/dev/null | grep -c "CYC 0x") + if [ "${cyc_cnt}" -gt 0 ] ; then + echo "Still get CYC packet without cyc" + return 1 + fi + echo OK + return 0 +} + +test_sample() +{ + echo "--- Test recording with sample mode ---" + # Check if recording with sample mode is working + if ! perf_record_no_decode -o "${perfdatafile}" --aux-sample=8192 -e '{intel_pt//u,branch-misses:u}' uname ; then + echo "perf record failed with --aux-sample" + return 1 + fi + echo OK + return 0 +} + +test_kernel_trace() +{ + echo "--- Test with kernel trace ---" + # Check if recording with kernel trace is working + can_kernel || return 2 + if ! 
perf_record_no_decode -o "${perfdatafile}" -e intel_pt//k -m1,128 uname ; then + echo "perf record failed with intel_pt//k" + return 1 + fi + echo OK + return 0 +} + +test_virtual_lbr() +{ + echo "--- Test virtual LBR ---" + + # Python script to determine the maximum size of branch stacks + cat << "_end_of_file_" > "${maxbrstack}" +from __future__ import print_function + +bmax = 0 + +def process_event(param_dict): + if "brstack" in param_dict: + brstack = param_dict["brstack"] + n = len(brstack) + global bmax + if n > bmax: + bmax = n + +def trace_end(): + print("max brstack", bmax) +_end_of_file_ + + # Check if virtual lbr is working + perf_record_no_bpf -o "${perfdatafile}" --aux-sample -e '{intel_pt//,cycles}:u' uname + times_val=$(perf script -i "${perfdatafile}" --itrace=L -s "${maxbrstack}" 2>/dev/null | grep "max brstack " | cut -d " " -f 3) + case "${times_val}" in + [0-9]*) ;; + *) times_val=0;; + esac + if [ "${times_val}" -lt 2 ] ; then + echo "Failed with virtual lbr" + return 1 + fi + echo OK + return 0 +} + +test_power_event() +{ + echo "--- Test power events ---" + # Check if power events are supported + power_event=$(cat /sys/bus/event_source/devices/intel_pt/caps/power_event_trace) + if [ "${power_event}" != "1" ] ; then + echo "SKIP: power_event_trace is not supported" + return 2 + fi + if ! 
perf_record_no_decode -o "${perfdatafile}" -a -e intel_pt/pwr_evt/u uname ; then + echo "perf record failed with pwr_evt" + return 1 + fi + echo OK + return 0 +} + +test_no_tnt() +{ + echo "--- Test with TNT packets disabled ---" + # Check if TNT disable is supported + notnt=$(cat /sys/bus/event_source/devices/intel_pt/caps/tnt_disable) + if [ "${notnt}" != "1" ] ; then + echo "SKIP: tnt_disable is not supported" + return 2 + fi + perf_record_no_decode -o "${perfdatafile}" -e intel_pt/notnt/u uname + # Should be no TNT packets + tnt_cnt=$(perf script -i "${perfdatafile}" -D | grep -c TNT) + if [ "${tnt_cnt}" -ne 0 ] ; then + echo "TNT packets still there after notnt" + return 1 + fi + echo OK + return 0 +} + +test_event_trace() +{ + echo "--- Test with event_trace ---" + # Check if event_trace is supported + event_trace=$(cat /sys/bus/event_source/devices/intel_pt/caps/event_trace) + if [ "${event_trace}" != 1 ] ; then + echo "SKIP: event_trace is not supported" + return 2 + fi + if ! perf_record_no_decode -o "${perfdatafile}" -e intel_pt/event/u uname ; then + echo "perf record failed with event trace" + return 1 + fi + echo OK + return 0 +} + count_result() { if [ "$1" -eq 2 ] ; then @@ -448,6 +632,15 @@ test_system_wide_side_band || ret=$? ; count_result $ret ; ret=0 test_per_thread "" "" || ret=$? ; count_result $ret ; ret=0 test_per_thread "k" "(incl. kernel) " || ret=$? ; count_result $ret ; ret=0 test_jitdump || ret=$? ; count_result $ret ; ret=0 +test_packet_filter || ret=$? ; count_result $ret ; ret=0 +test_disable_branch || ret=$? ; count_result $ret ; ret=0 +test_time_cyc || ret=$? ; count_result $ret ; ret=0 +test_sample || ret=$? ; count_result $ret ; ret=0 +test_kernel_trace || ret=$? ; count_result $ret ; ret=0 +test_virtual_lbr || ret=$? ; count_result $ret ; ret=0 +test_power_event || ret=$? ; count_result $ret ; ret=0 +test_no_tnt || ret=$? ; count_result $ret ; ret=0 +test_event_trace || ret=$? 
; count_result $ret ; ret=0 cleanup -- GitLab From e28039667cea2cbea72aeb19665a1c57c6756253 Mon Sep 17 00:00:00 2001 From: James Clark <james.clark@arm.com> Date: Wed, 12 Oct 2022 10:46:32 +0100 Subject: [PATCH 2004/2223] perf test: Fix attr tests for PERF_FORMAT_LOST Since PERF_FORMAT_LOST was added, the default read format has that bit set, so add it to the tests. Keep the old value as well so that the test still passes on older kernels. This fixes the following failure: expected read_format=0|4, got 20 FAILED './tests/attr/test-record-C0' - match failure Fixes: 85b425f31c8866e0 ("perf record: Set PERF_FORMAT_LOST by default") Signed-off-by: James Clark <james.clark@arm.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: https://lore.kernel.org/r/20221012094633.21669-2-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/attr/base-record | 2 +- tools/perf/tests/attr/system-wide-dummy | 2 +- tools/perf/tests/attr/test-record-group | 4 ++-- tools/perf/tests/attr/test-record-group-sampling | 6 +++--- tools/perf/tests/attr/test-record-group1 | 4 ++-- tools/perf/tests/attr/test-record-group2 | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/perf/tests/attr/base-record b/tools/perf/tests/attr/base-record index 8c10955eff939..3ef07a12aa142 100644 --- a/tools/perf/tests/attr/base-record +++ b/tools/perf/tests/attr/base-record @@ -9,7 +9,7 @@ size=128 config=0 sample_period=* sample_type=263 -read_format=0|4 +read_format=0|4|20 disabled=1 inherit=1 pinned=0 diff --git a/tools/perf/tests/attr/system-wide-dummy b/tools/perf/tests/attr/system-wide-dummy index 86a15dd359d93..8fec06eda5f90 100644 --- 
a/tools/perf/tests/attr/system-wide-dummy +++ b/tools/perf/tests/attr/system-wide-dummy @@ -11,7 +11,7 @@ size=128 config=9 sample_period=4000 sample_type=455 -read_format=4 +read_format=4|20 # Event will be enabled right away. disabled=0 inherit=1 diff --git a/tools/perf/tests/attr/test-record-group b/tools/perf/tests/attr/test-record-group index 14ee60fd3f410..6c1cff8aae8b8 100644 --- a/tools/perf/tests/attr/test-record-group +++ b/tools/perf/tests/attr/test-record-group @@ -7,14 +7,14 @@ ret = 1 fd=1 group_fd=-1 sample_type=327 -read_format=4 +read_format=4|20 [event-2:base-record] fd=2 group_fd=1 config=1 sample_type=327 -read_format=4 +read_format=4|20 mmap=0 comm=0 task=0 diff --git a/tools/perf/tests/attr/test-record-group-sampling b/tools/perf/tests/attr/test-record-group-sampling index 300b9f7e6d693..97e7e64a38f07 100644 --- a/tools/perf/tests/attr/test-record-group-sampling +++ b/tools/perf/tests/attr/test-record-group-sampling @@ -7,7 +7,7 @@ ret = 1 fd=1 group_fd=-1 sample_type=343 -read_format=12 +read_format=12|28 inherit=0 [event-2:base-record] @@ -21,8 +21,8 @@ config=3 # default | PERF_SAMPLE_READ sample_type=343 -# PERF_FORMAT_ID | PERF_FORMAT_GROUP -read_format=12 +# PERF_FORMAT_ID | PERF_FORMAT_GROUP | PERF_FORMAT_LOST +read_format=12|28 task=0 mmap=0 comm=0 diff --git a/tools/perf/tests/attr/test-record-group1 b/tools/perf/tests/attr/test-record-group1 index 3ffe246e02283..eeb1db392bc9c 100644 --- a/tools/perf/tests/attr/test-record-group1 +++ b/tools/perf/tests/attr/test-record-group1 @@ -7,7 +7,7 @@ ret = 1 fd=1 group_fd=-1 sample_type=327 -read_format=4 +read_format=4|20 [event-2:base-record] fd=2 @@ -15,7 +15,7 @@ group_fd=1 type=0 config=1 sample_type=327 -read_format=4 +read_format=4|20 mmap=0 comm=0 task=0 diff --git a/tools/perf/tests/attr/test-record-group2 b/tools/perf/tests/attr/test-record-group2 index 6b9f8d182ce10..cebdaa8e64e47 100644 --- a/tools/perf/tests/attr/test-record-group2 +++ b/tools/perf/tests/attr/test-record-group2 @@ 
-9,7 +9,7 @@ group_fd=-1 config=0|1 sample_period=1234000 sample_type=87 -read_format=12 +read_format=12|28 inherit=0 freq=0 @@ -19,7 +19,7 @@ group_fd=1 config=0|1 sample_period=6789000 sample_type=87 -read_format=12 +read_format=12|28 disabled=0 inherit=0 mmap=0 -- GitLab From 5a3d47071f0ced0431ef82a5fb6bd077ed9493db Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Wed, 12 Oct 2022 11:22:58 +0300 Subject: [PATCH 2005/2223] perf intel-pt: Fix segfault in intel_pt_print_info() with uClibc uClibc segfaulted because NULL was passed as the format to fprintf(). That happened because one of the format strings was missing and intel_pt_print_info() didn't check that before calling fprintf(). Add the missing format string, and check format is not NULL before calling fprintf(). Fixes: 11fa7cb86b56d361 ("perf tools: Pass Intel PT information for decoding MTC and CYC") Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20221012082259.22394-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/intel-pt.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index b34cb3dec1aac..e3548ddef2545 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -4046,6 +4046,7 @@ static const char * const intel_pt_info_fmts[] = { [INTEL_PT_SNAPSHOT_MODE] = " Snapshot mode %"PRId64"\n", [INTEL_PT_PER_CPU_MMAPS] = " Per-cpu maps %"PRId64"\n", [INTEL_PT_MTC_BIT] = " MTC bit %#"PRIx64"\n", + [INTEL_PT_MTC_FREQ_BITS] = " MTC freq bits %#"PRIx64"\n", [INTEL_PT_TSC_CTC_N] = " TSC:CTC numerator %"PRIu64"\n", [INTEL_PT_TSC_CTC_D] = " TSC:CTC denominator %"PRIu64"\n", [INTEL_PT_CYC_BIT] = " CYC bit %#"PRIx64"\n", 
@@ -4060,8 +4061,12 @@ static void intel_pt_print_info(__u64 *arr, int start, int finish) if (!dump_trace) return; - for (i = start; i <= finish; i++) - fprintf(stdout, intel_pt_info_fmts[i], arr[i]); + for (i = start; i <= finish; i++) { + const char *fmt = intel_pt_info_fmts[i]; + + if (fmt) + fprintf(stdout, fmt, arr[i]); + } } static void intel_pt_print_info_str(const char *name, const char *str) -- GitLab From 6cef7dab3e2e5cb23a13569c3880c0532326748c Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@intel.com> Date: Wed, 12 Oct 2022 11:22:59 +0300 Subject: [PATCH 2006/2223] perf intel-pt: Fix system_wide dummy event for hybrid User space tasks can migrate between CPUs, so when tracing selected CPUs, system-wide sideband is still needed, however evlist->core.has_user_cpus is not set in the hybrid case, so check the target cpu_list instead. Fixes: 7d189cadbeebc778 ("perf intel-pt: Track sideband system-wide when needed") Signed-off-by: Adrian Hunter <adrian.hunter@intel.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20221012082259.22394-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/arch/x86/util/intel-pt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 793b35f2221aa..af102f471e9f4 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -866,7 +866,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, * User space tasks can migrate between CPUs, so when tracing * selected CPUs, sideband for all CPUs is still needed. 
*/ - need_system_wide_tracking = evlist->core.has_user_cpus && + need_system_wide_tracking = opts->target.cpu_list && !intel_pt_evsel->core.attr.exclude_user; tracking_evsel = evlist__add_aux_dummy(evlist, need_system_wide_tracking); -- GitLab From cd400f6f18421b75e64e4aa7bc359d2606033412 Mon Sep 17 00:00:00 2001 From: Athira Rajeev <atrajeev@linux.vnet.ibm.com> Date: Thu, 6 Oct 2022 21:21:48 +0530 Subject: [PATCH 2007/2223] perf tests stat+csv_output: Include sanity check for topology Testcase stat+csv_output.sh fails on powerpc: 84: perf stat CSV output linter: FAILED! The testcase "stat+csv_output.sh" verifies perf stat CSV output. The test covers aggregation modes like per-socket, per-core, per-die and -A (no_aggr mode), along with a few other tests. It counts the expected fields for various commands. For example, -A (i.e., AGGR_NONE mode) expects 7 fields in the output, with "CPU" as the first field. In the same way, for per-socket, it expects the first field in the result to point to the socket id. The testcase compares the result with the expected count. The values for socket, die, core and cpu are fetched from the topology directory: /sys/devices/system/cpu/cpu*/topology. For example, the socket value is fetched from the "physical_package_id" file of the topology directory. (cpu__get_topology_int() in util/cpumap.c) If a platform fails to fetch the topology information, the values will be set to -1. For example, in the case of the pSeries platform of powerpc, the value of "physical_package_id" is restricted and not exposed. So, -1 will be assigned. The perf code has a check for a valid cpu id in "aggr_printout" (stat-display.c), which displays the fields. So, in cases where the topology values are not exposed, the first field of the output will be empty. This causes the testcase to fail, as it counts the number of fields in the output. In the case of -A (AGGR_NONE mode), the testcase expects 7 fields in the output, but because of the -1 value obtained from some topology files, only 6 fields are printed.
Hence a testcase failure is reported due to the mismatch in the number of fields in the output. This patch adds a sanity check for topology to the testcase. The check skips the affected tests if a -1 value is found. Fixes: 7473ee56dbc91c98 ("perf test: Add checking for perf stat CSV output.") Reported-by: Disha Goel <disgoel@linux.vnet.ibm.com> Suggested-by: Ian Rogers <irogers@google.com> Suggested-by: James Clark <james.clark@arm.com> Signed-off-by: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Claire Jensen <cjense@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: linuxppc-dev@lists.ozlabs.org Cc: Madhavan Srinivasan <maddy@linux.vnet.ibm.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Nageswara R Sastry <rnsastry@linux.ibm.com> Link: https://lore.kernel.org/r/20221006155149.67205-1-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/stat+csv_output.sh | 43 ++++++++++++++++++++--- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/tools/perf/tests/shell/stat+csv_output.sh b/tools/perf/tests/shell/stat+csv_output.sh index eb5196f58190e..b7f050aa6210c 100755 --- a/tools/perf/tests/shell/stat+csv_output.sh +++ b/tools/perf/tests/shell/stat+csv_output.sh @@ -6,6 +6,8 @@ set -e +skip_test=0 + function commachecker() { local -i cnt=0 @@ -156,14 +158,47 @@ check_per_socket() echo "[Success]" } +# The perf stat options for per-socket, per-core, per-die +# and -A ( no_aggr mode ) uses the info fetched from this +# directory: "/sys/devices/system/cpu/cpu*/topology". For +# example, socket value is fetched from "physical_package_id" +# file in topology directory. +# Reference: cpu__get_topology_int in util/cpumap.c +# If the platform doesn't expose topology information, values +# will be set to -1. For example, incase of pSeries platform +# of powerpc, value for "physical_package_id" is restricted +# and set to -1.
Check here validates the socket-id read from +# topology file before proceeding further + +FILE_LOC="/sys/devices/system/cpu/cpu*/topology/" +FILE_NAME="physical_package_id" + +check_for_topology() +{ + if ! ParanoidAndNotRoot 0 + then + socket_file=`ls $FILE_LOC/$FILE_NAME | head -n 1` + [ -z $socket_file ] && return 0 + socket_id=`cat $socket_file` + [ $socket_id == -1 ] && skip_test=1 + return 0 + fi +} + +check_for_topology check_no_args check_system_wide -check_system_wide_no_aggr check_interval check_event -check_per_core check_per_thread -check_per_die check_per_node -check_per_socket +if [ $skip_test -ne 1 ] +then + check_system_wide_no_aggr + check_per_core + check_per_die + check_per_socket +else + echo "[Skip] Skipping tests for system_wide_no_aggr, per_core, per_die and per_socket since socket id exposed via topology is invalid" +fi exit 0 -- GitLab From 58d4802a5eaab55e174f4d31262daada6665aa22 Mon Sep 17 00:00:00 2001 From: Athira Rajeev <atrajeev@linux.vnet.ibm.com> Date: Thu, 6 Oct 2022 21:21:49 +0530 Subject: [PATCH 2008/2223] perf tests stat+json_output: Include sanity check for topology Testcase stat+json_output.sh fails on powerpc: 86: perf stat JSON output linter : FAILED! The testcase "stat+json_output.sh" verifies perf stat JSON output. The test covers aggregation modes like per-socket, per-core, per-die and -A (no_aggr mode), along with a few other tests. It counts the expected fields for various commands. For example, -A (i.e., AGGR_NONE mode) expects 7 fields in the output, with "CPU" as the first field. In the same way, for per-socket, it expects the first field in the result to point to the socket id. The testcase compares the result with the expected count. The values for socket, die, core and cpu are fetched from the topology directory: /sys/devices/system/cpu/cpu*/topology. For example, the socket value is fetched from the "physical_package_id" file of the topology directory.
(cpu__get_topology_int() in util/cpumap.c) If a platform fails to fetch the topology information, the values will be set to -1. For example, in the case of the pSeries platform of powerpc, the value of "physical_package_id" is restricted and not exposed. So, -1 will be assigned. The perf code has a check for a valid cpu id in "aggr_printout" (stat-display.c), which displays the fields. So, in cases where the topology values are not exposed, the first field of the output will be empty. This causes the testcase to fail, as it counts the number of fields in the output. In the case of -A (AGGR_NONE mode), the testcase expects 7 fields in the output, but because of the -1 value obtained from some topology files, only 6 fields are printed. Hence a testcase failure is reported due to the mismatch in the number of fields in the output. This patch adds a sanity check for topology to the testcase. The check skips the affected tests if a -1 value is found. Fixes: 0c343af2a2f82844 ("perf test: JSON format checking") Reported-by: Disha Goel <disgoel@linux.vnet.ibm.com> Suggested-by: Ian Rogers <irogers@google.com> Suggested-by: James Clark <james.clark@arm.com> Signed-off-by: Athira Jajeev <atrajeev@linux.vnet.ibm.com> Cc: Claire Jensen <cjense@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kajol Jain <kjain@linux.ibm.com> Cc: linuxppc-dev@lists.ozlabs.org Cc: Madhavan Srinivasan <maddy@linux.vnet.ibm.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Nageswara R Sastry <rnsastry@linux.ibm.com> Link: https://lore.kernel.org/r/20221006155149.67205-2-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/tests/shell/stat+json_output.sh | 43 ++++++++++++++++++++-- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/tools/perf/tests/shell/stat+json_output.sh b/tools/perf/tests/shell/stat+json_output.sh index ea8714a360512..2c4212c641ede 100755 --- a/tools/perf/tests/shell/stat+json_output.sh +++ b/tools/perf/tests/shell/stat+json_output.sh @@ -6,6 +6,8 @@ set -e +skip_test=0 +
pythonchecker=$(dirname $0)/lib/perf_json_output_lint.py if [ "x$PYTHON" == "x" ] then @@ -134,14 +136,47 @@ check_per_socket() echo "[Success]" } +# The perf stat options for per-socket, per-core, per-die +# and -A ( no_aggr mode ) uses the info fetched from this +# directory: "/sys/devices/system/cpu/cpu*/topology". For +# example, socket value is fetched from "physical_package_id" +# file in topology directory. +# Reference: cpu__get_topology_int in util/cpumap.c +# If the platform doesn't expose topology information, values +# will be set to -1. For example, incase of pSeries platform +# of powerpc, value for "physical_package_id" is restricted +# and set to -1. Check here validates the socket-id read from +# topology file before proceeding further + +FILE_LOC="/sys/devices/system/cpu/cpu*/topology/" +FILE_NAME="physical_package_id" + +check_for_topology() +{ + if ! ParanoidAndNotRoot 0 + then + socket_file=`ls $FILE_LOC/$FILE_NAME | head -n 1` + [ -z $socket_file ] && return 0 + socket_id=`cat $socket_file` + [ $socket_id == -1 ] && skip_test=1 + return 0 + fi +} + +check_for_topology check_no_args check_system_wide -check_system_wide_no_aggr check_interval check_event -check_per_core check_per_thread -check_per_die check_per_node -check_per_socket +if [ $skip_test -ne 1 ] +then + check_system_wide_no_aggr + check_per_core + check_per_die + check_per_socket +else + echo "[Skip] Skipping tests for system_wide_no_aggr, per_core, per_die and per_socket since socket id exposed via topology is invalid" +fi exit 0 -- GitLab From 45a3975f8e4c56829ada20f7a6a29095ca05e375 Mon Sep 17 00:00:00 2001 From: Qi Liu <liuqi115@huawei.com> Date: Tue, 27 Sep 2022 16:13:58 +0800 Subject: [PATCH 2009/2223] perf auxtrace arm: Refactor event list iteration in auxtrace_record__init() Add find_pmu_for_event() and use it to simplify the logic in auxtrace_record__init(). find_pmu_for_event() will be reused in subsequent patches.
Reviewed-by: John Garry <john.garry@huawei.com> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Reviewed-by: Leo Yan <leo.yan@linaro.org> Signed-off-by: Qi Liu <liuqi115@huawei.com> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Bjorn Helgaas <helgaas@kernel.org> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Qi Liu <liuqi6124@gmail.com> Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com> Cc: Shaokun Zhang <zhangshaokun@hisilicon.com> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: Will Deacon <will@kernel.org> Cc: Zeng Prime <prime.zeng@huawei.com> Cc: linux-arm-kernel@lists.infradead.org Cc: linux-pci@vger.kernel.org Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/20220927081400.14364-2-yangyicong@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/arch/arm/util/auxtrace.c | 53 ++++++++++++++++++----------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/tools/perf/arch/arm/util/auxtrace.c b/tools/perf/arch/arm/util/auxtrace.c index 5fc6a2a3dbc5f..384c7cfda0fde 100644 --- a/tools/perf/arch/arm/util/auxtrace.c +++ b/tools/perf/arch/arm/util/auxtrace.c @@ -50,16 +50,32 @@ static struct perf_pmu **find_all_arm_spe_pmus(int *nr_spes, int *err) return arm_spe_pmus; } +static struct perf_pmu *find_pmu_for_event(struct perf_pmu **pmus, + int pmu_nr, struct evsel *evsel) +{ + int i; + + if (!pmus) + return NULL; + + for (i = 0; i < pmu_nr; i++) { + if (evsel->core.attr.type == pmus[i]->type) + return pmus[i]; + } + + return NULL; +} + struct auxtrace_record *auxtrace_record__init(struct evlist *evlist, int *err) { - 
struct perf_pmu *cs_etm_pmu; + struct perf_pmu *cs_etm_pmu = NULL; + struct perf_pmu **arm_spe_pmus = NULL; struct evsel *evsel; - bool found_etm = false; + struct perf_pmu *found_etm = NULL; struct perf_pmu *found_spe = NULL; - struct perf_pmu **arm_spe_pmus = NULL; + int auxtrace_event_cnt = 0; int nr_spes = 0; - int i = 0; if (!evlist) return NULL; @@ -68,24 +84,23 @@ struct auxtrace_record arm_spe_pmus = find_all_arm_spe_pmus(&nr_spes, err); evlist__for_each_entry(evlist, evsel) { - if (cs_etm_pmu && - evsel->core.attr.type == cs_etm_pmu->type) - found_etm = true; - - if (!nr_spes || found_spe) - continue; - - for (i = 0; i < nr_spes; i++) { - if (evsel->core.attr.type == arm_spe_pmus[i]->type) { - found_spe = arm_spe_pmus[i]; - break; - } - } + if (cs_etm_pmu && !found_etm) + found_etm = find_pmu_for_event(&cs_etm_pmu, 1, evsel); + + if (arm_spe_pmus && !found_spe) + found_spe = find_pmu_for_event(arm_spe_pmus, nr_spes, evsel); } + free(arm_spe_pmus); - if (found_etm && found_spe) { - pr_err("Concurrent ARM Coresight ETM and SPE operation not currently supported\n"); + if (found_etm) + auxtrace_event_cnt++; + + if (found_spe) + auxtrace_event_cnt++; + + if (auxtrace_event_cnt > 1) { + pr_err("Concurrent AUX trace operation not currently supported\n"); *err = -EOPNOTSUPP; return NULL; } -- GitLab From 057381a7ece1b2726509ce47cdb9c1a111acfce9 Mon Sep 17 00:00:00 2001 From: Qi Liu <liuqi115@huawei.com> Date: Tue, 27 Sep 2022 16:13:59 +0800 Subject: [PATCH 2010/2223] perf auxtrace arm64: Add support for HiSilicon PCIe Tune and Trace device driver The HiSilicon PCIe Tune and Trace device (PTT) can dynamically tune the PCIe link's events and trace the TLP headers. This patch adds support for the PTT device to the perf tool, so users can use 'perf record' to get TLP header trace data.
Reviewed-by: Leo Yan <leo.yan@linaro.org> Signed-off-by: Qi Liu <liuqi115@huawei.com> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com> Acked-by: John Garry <john.garry@huawei.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Bjorn Helgaas <helgaas@kernel.org> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: Jonathan Cameron <jonathan.cameron@huawei.com> Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Qi Liu <liuqi6124@gmail.com> Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com> Cc: Shaokun Zhang <zhangshaokun@hisilicon.com> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: Will Deacon <will@kernel.org> Cc: Zeng Prime <prime.zeng@huawei.com> Cc: linux-arm-kernel@lists.infradead.org Cc: linux-pci@vger.kernel.org Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/20220927081400.14364-3-yangyicong@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/arch/arm/util/auxtrace.c | 63 +++++++++ tools/perf/arch/arm/util/pmu.c | 3 + tools/perf/arch/arm64/util/Build | 2 +- tools/perf/arch/arm64/util/hisi-ptt.c | 188 ++++++++++++++++++++++++++ tools/perf/util/auxtrace.c | 1 + tools/perf/util/auxtrace.h | 1 + tools/perf/util/hisi-ptt.h | 16 +++ 7 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 tools/perf/arch/arm64/util/hisi-ptt.c create mode 100644 tools/perf/util/hisi-ptt.h diff --git a/tools/perf/arch/arm/util/auxtrace.c b/tools/perf/arch/arm/util/auxtrace.c index 384c7cfda0fde..deeb163999ceb 100644 --- a/tools/perf/arch/arm/util/auxtrace.c +++ b/tools/perf/arch/arm/util/auxtrace.c @@ -4,9 +4,11 @@ * Author: Mathieu Poirier <mathieu.poirier@linaro.org> */ +#include <dirent.h> #include <stdbool.h> #include <linux/coresight-pmu.h> 
#include <linux/zalloc.h> +#include <api/fs/fs.h> #include "../../../util/auxtrace.h" #include "../../../util/debug.h" @@ -14,6 +16,7 @@ #include "../../../util/pmu.h" #include "cs-etm.h" #include "arm-spe.h" +#include "hisi-ptt.h" static struct perf_pmu **find_all_arm_spe_pmus(int *nr_spes, int *err) { @@ -50,6 +53,52 @@ static struct perf_pmu **find_all_arm_spe_pmus(int *nr_spes, int *err) return arm_spe_pmus; } +static struct perf_pmu **find_all_hisi_ptt_pmus(int *nr_ptts, int *err) +{ + const char *sysfs = sysfs__mountpoint(); + struct perf_pmu **hisi_ptt_pmus = NULL; + struct dirent *dent; + char path[PATH_MAX]; + DIR *dir = NULL; + int idx = 0; + + snprintf(path, PATH_MAX, "%s" EVENT_SOURCE_DEVICE_PATH, sysfs); + dir = opendir(path); + if (!dir) { + pr_err("can't read directory '%s'\n", EVENT_SOURCE_DEVICE_PATH); + *err = -EINVAL; + return NULL; + } + + while ((dent = readdir(dir))) { + if (strstr(dent->d_name, HISI_PTT_PMU_NAME)) + (*nr_ptts)++; + } + + if (!(*nr_ptts)) + goto out; + + hisi_ptt_pmus = zalloc(sizeof(struct perf_pmu *) * (*nr_ptts)); + if (!hisi_ptt_pmus) { + pr_err("hisi_ptt alloc failed\n"); + *err = -ENOMEM; + goto out; + } + + rewinddir(dir); + while ((dent = readdir(dir))) { + if (strstr(dent->d_name, HISI_PTT_PMU_NAME) && idx < *nr_ptts) { + hisi_ptt_pmus[idx] = perf_pmu__find(dent->d_name); + if (hisi_ptt_pmus[idx]) + idx++; + } + } + +out: + closedir(dir); + return hisi_ptt_pmus; +} + static struct perf_pmu *find_pmu_for_event(struct perf_pmu **pmus, int pmu_nr, struct evsel *evsel) { @@ -71,17 +120,21 @@ struct auxtrace_record { struct perf_pmu *cs_etm_pmu = NULL; struct perf_pmu **arm_spe_pmus = NULL; + struct perf_pmu **hisi_ptt_pmus = NULL; struct evsel *evsel; struct perf_pmu *found_etm = NULL; struct perf_pmu *found_spe = NULL; + struct perf_pmu *found_ptt = NULL; int auxtrace_event_cnt = 0; int nr_spes = 0; + int nr_ptts = 0; if (!evlist) return NULL; cs_etm_pmu = perf_pmu__find(CORESIGHT_ETM_PMU_NAME); arm_spe_pmus = 
find_all_arm_spe_pmus(&nr_spes, err); + hisi_ptt_pmus = find_all_hisi_ptt_pmus(&nr_ptts, err); evlist__for_each_entry(evlist, evsel) { if (cs_etm_pmu && !found_etm) @@ -89,9 +142,13 @@ struct auxtrace_record if (arm_spe_pmus && !found_spe) found_spe = find_pmu_for_event(arm_spe_pmus, nr_spes, evsel); + + if (hisi_ptt_pmus && !found_ptt) + found_ptt = find_pmu_for_event(hisi_ptt_pmus, nr_ptts, evsel); } free(arm_spe_pmus); + free(hisi_ptt_pmus); if (found_etm) auxtrace_event_cnt++; @@ -99,6 +156,9 @@ struct auxtrace_record if (found_spe) auxtrace_event_cnt++; + if (found_ptt) + auxtrace_event_cnt++; + if (auxtrace_event_cnt > 1) { pr_err("Concurrent AUX trace operation not currently supported\n"); *err = -EOPNOTSUPP; @@ -111,6 +171,9 @@ struct auxtrace_record #if defined(__aarch64__) if (found_spe) return arm_spe_recording_init(err, found_spe); + + if (found_ptt) + return hisi_ptt_recording_init(err, found_ptt); #endif /* diff --git a/tools/perf/arch/arm/util/pmu.c b/tools/perf/arch/arm/util/pmu.c index b8b23b9dc5987..887c8addc4916 100644 --- a/tools/perf/arch/arm/util/pmu.c +++ b/tools/perf/arch/arm/util/pmu.c @@ -10,6 +10,7 @@ #include <linux/string.h> #include "arm-spe.h" +#include "hisi-ptt.h" #include "../../../util/pmu.h" struct perf_event_attr @@ -22,6 +23,8 @@ struct perf_event_attr #if defined(__aarch64__) } else if (strstarts(pmu->name, ARM_SPE_PMU_NAME)) { return arm_spe_pmu_default_config(pmu); + } else if (strstarts(pmu->name, HISI_PTT_PMU_NAME)) { + pmu->selectable = true; #endif } diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build index 9fcb4e68add93..337aa9bdf905d 100644 --- a/tools/perf/arch/arm64/util/Build +++ b/tools/perf/arch/arm64/util/Build @@ -11,4 +11,4 @@ perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o perf-$(CONFIG_AUXTRACE) += ../../arm/util/pmu.o \ ../../arm/util/auxtrace.o \ ../../arm/util/cs-etm.o \ - arm-spe.o mem-events.o + arm-spe.o mem-events.o hisi-ptt.o diff --git 
a/tools/perf/arch/arm64/util/hisi-ptt.c b/tools/perf/arch/arm64/util/hisi-ptt.c new file mode 100644 index 0000000000000..ba97c8a562a02 --- /dev/null +++ b/tools/perf/arch/arm64/util/hisi-ptt.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HiSilicon PCIe Trace and Tuning (PTT) support + * Copyright (c) 2022 HiSilicon Technologies Co., Ltd. + */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/bitops.h> +#include <linux/log2.h> +#include <linux/zalloc.h> +#include <time.h> + +#include <internal/lib.h> // page_size +#include "../../../util/auxtrace.h" +#include "../../../util/cpumap.h" +#include "../../../util/debug.h" +#include "../../../util/event.h" +#include "../../../util/evlist.h" +#include "../../../util/evsel.h" +#include "../../../util/hisi-ptt.h" +#include "../../../util/pmu.h" +#include "../../../util/record.h" +#include "../../../util/session.h" +#include "../../../util/tsc.h" + +#define KiB(x) ((x) * 1024) +#define MiB(x) ((x) * 1024 * 1024) + +struct hisi_ptt_recording { + struct auxtrace_record itr; + struct perf_pmu *hisi_ptt_pmu; + struct evlist *evlist; +}; + +static size_t +hisi_ptt_info_priv_size(struct auxtrace_record *itr __maybe_unused, + struct evlist *evlist __maybe_unused) +{ + return HISI_PTT_AUXTRACE_PRIV_SIZE; +} + +static int hisi_ptt_info_fill(struct auxtrace_record *itr, + struct perf_session *session, + struct perf_record_auxtrace_info *auxtrace_info, + size_t priv_size) +{ + struct hisi_ptt_recording *pttr = + container_of(itr, struct hisi_ptt_recording, itr); + struct perf_pmu *hisi_ptt_pmu = pttr->hisi_ptt_pmu; + + if (priv_size != HISI_PTT_AUXTRACE_PRIV_SIZE) + return -EINVAL; + + if (!session->evlist->core.nr_mmaps) + return -EINVAL; + + auxtrace_info->type = PERF_AUXTRACE_HISI_PTT; + auxtrace_info->priv[0] = hisi_ptt_pmu->type; + + return 0; +} + +static int hisi_ptt_set_auxtrace_mmap_page(struct record_opts *opts) +{ + bool privileged = perf_event_paranoid_check(-1); + + if 
(!opts->full_auxtrace) + return 0; + + if (opts->full_auxtrace && !opts->auxtrace_mmap_pages) { + if (privileged) { + opts->auxtrace_mmap_pages = MiB(16) / page_size; + } else { + opts->auxtrace_mmap_pages = KiB(128) / page_size; + if (opts->mmap_pages == UINT_MAX) + opts->mmap_pages = KiB(256) / page_size; + } + } + + /* Validate auxtrace_mmap_pages */ + if (opts->auxtrace_mmap_pages) { + size_t sz = opts->auxtrace_mmap_pages * (size_t)page_size; + size_t min_sz = KiB(8); + + if (sz < min_sz || !is_power_of_2(sz)) { + pr_err("Invalid mmap size for HISI PTT: must be at least %zuKiB and a power of 2\n", + min_sz / 1024); + return -EINVAL; + } + } + + return 0; +} + +static int hisi_ptt_recording_options(struct auxtrace_record *itr, + struct evlist *evlist, + struct record_opts *opts) +{ + struct hisi_ptt_recording *pttr = + container_of(itr, struct hisi_ptt_recording, itr); + struct perf_pmu *hisi_ptt_pmu = pttr->hisi_ptt_pmu; + struct evsel *evsel, *hisi_ptt_evsel = NULL; + struct evsel *tracking_evsel; + int err; + + pttr->evlist = evlist; + evlist__for_each_entry(evlist, evsel) { + if (evsel->core.attr.type == hisi_ptt_pmu->type) { + if (hisi_ptt_evsel) { + pr_err("There may be only one " HISI_PTT_PMU_NAME "x event\n"); + return -EINVAL; + } + evsel->core.attr.freq = 0; + evsel->core.attr.sample_period = 1; + evsel->needs_auxtrace_mmap = true; + hisi_ptt_evsel = evsel; + opts->full_auxtrace = true; + } + } + + err = hisi_ptt_set_auxtrace_mmap_page(opts); + if (err) + return err; + /* + * To obtain the auxtrace buffer file descriptor, the auxtrace event + * must come first. 
+ */ + evlist__to_front(evlist, hisi_ptt_evsel); + evsel__set_sample_bit(hisi_ptt_evsel, TIME); + + /* Add dummy event to keep tracking */ + err = parse_event(evlist, "dummy:u"); + if (err) + return err; + + tracking_evsel = evlist__last(evlist); + evlist__set_tracking_event(evlist, tracking_evsel); + + tracking_evsel->core.attr.freq = 0; + tracking_evsel->core.attr.sample_period = 1; + evsel__set_sample_bit(tracking_evsel, TIME); + + return 0; +} + +static u64 hisi_ptt_reference(struct auxtrace_record *itr __maybe_unused) +{ + return rdtsc(); +} + +static void hisi_ptt_recording_free(struct auxtrace_record *itr) +{ + struct hisi_ptt_recording *pttr = + container_of(itr, struct hisi_ptt_recording, itr); + + free(pttr); +} + +struct auxtrace_record *hisi_ptt_recording_init(int *err, + struct perf_pmu *hisi_ptt_pmu) +{ + struct hisi_ptt_recording *pttr; + + if (!hisi_ptt_pmu) { + *err = -ENODEV; + return NULL; + } + + pttr = zalloc(sizeof(*pttr)); + if (!pttr) { + *err = -ENOMEM; + return NULL; + } + + pttr->hisi_ptt_pmu = hisi_ptt_pmu; + pttr->itr.pmu = hisi_ptt_pmu; + pttr->itr.recording_options = hisi_ptt_recording_options; + pttr->itr.info_priv_size = hisi_ptt_info_priv_size; + pttr->itr.info_fill = hisi_ptt_info_fill; + pttr->itr.free = hisi_ptt_recording_free; + pttr->itr.reference = hisi_ptt_reference; + pttr->itr.read_finish = auxtrace_record__read_finish; + pttr->itr.alignment = 0; + + *err = 0; + return &pttr->itr; +} diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index b59c278fe9ede..0e53b796c5d5d 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -1320,6 +1320,7 @@ int perf_event__process_auxtrace_info(struct perf_session *session, case PERF_AUXTRACE_S390_CPUMSF: err = s390_cpumsf_process_auxtrace_info(event, session); break; + case PERF_AUXTRACE_HISI_PTT: case PERF_AUXTRACE_UNKNOWN: default: return -EINVAL; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 
cb8e0a01abb6e..6a0f9b98f059b 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -48,6 +48,7 @@ enum auxtrace_type { PERF_AUXTRACE_CS_ETM, PERF_AUXTRACE_ARM_SPE, PERF_AUXTRACE_S390_CPUMSF, + PERF_AUXTRACE_HISI_PTT, }; enum itrace_period_type { diff --git a/tools/perf/util/hisi-ptt.h b/tools/perf/util/hisi-ptt.h new file mode 100644 index 0000000000000..82283c81b4c11 --- /dev/null +++ b/tools/perf/util/hisi-ptt.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * HiSilicon PCIe Trace and Tuning (PTT) support + * Copyright (c) 2022 HiSilicon Technologies Co., Ltd. + */ + +#ifndef INCLUDE__PERF_HISI_PTT_H__ +#define INCLUDE__PERF_HISI_PTT_H__ + +#define HISI_PTT_PMU_NAME "hisi_ptt" +#define HISI_PTT_AUXTRACE_PRIV_SIZE sizeof(u64) + +struct auxtrace_record *hisi_ptt_recording_init(int *err, + struct perf_pmu *hisi_ptt_pmu); + +#endif -- GitLab From 5e91e57e68090c0e8ab0acecdbb309af8417d415 Mon Sep 17 00:00:00 2001 From: Qi Liu <liuqi115@huawei.com> Date: Tue, 27 Sep 2022 16:14:00 +0800 Subject: [PATCH 2011/2223] perf auxtrace arm64: Add support for parsing HiSilicon PCIe Trace packet Add support for using 'perf report --dump-raw-trace' to parse PTT packet. Example usage: Output will contain raw PTT data and its textual representation, such as (8DW format): 0 0 0x5810 [0x30]: PERF_RECORD_AUXTRACE size: 0x400000 offset: 0 ref: 0xa5d50c725 idx: 0 tid: -1 cpu: 0 . . ... HISI PTT data: size 4194304 bytes . 00000000: 00 00 00 00 Prefix . 00000004: 08 20 00 60 Header DW0 . 00000008: ff 02 00 01 Header DW1 . 0000000c: 20 08 00 00 Header DW2 . 00000010: 10 e7 44 ab Header DW3 . 00000014: 2a a8 1e 01 Time . 00000020: 00 00 00 00 Prefix . 00000024: 01 00 00 60 Header DW0 . 00000028: 0f 1e 00 01 Header DW1 . 0000002c: 04 00 00 00 Header DW2 . 00000030: 40 00 81 02 Header DW3 . 00000034: ee 02 00 00 Time .... This patch only add basic parsing support according to the definition of the PTT packet described in Documentation/trace/hisi-ptt.rst. 
And the fields of each packet can be further decoded following the PCIe Spec's definition of TLP packet. Signed-off-by: Qi Liu <liuqi115@huawei.com> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Bjorn Helgaas <helgaas@kernel.org> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@arm.com> Cc: John Garry <john.garry@huawei.com> Cc: Jonathan Cameron <jonathan.cameron@huawei.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Qi Liu <liuqi6124@gmail.com> Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com> Cc: Shaokun Zhang <zhangshaokun@hisilicon.com> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: Will Deacon <will@kernel.org> Cc: Zeng Prime <prime.zeng@huawei.com> Cc: linux-arm-kernel@lists.infradead.org Cc: linux-pci@vger.kernel.org Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/20220927081400.14364-4-yangyicong@huawei.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/perf/util/Build | 2 + tools/perf/util/auxtrace.c | 3 + tools/perf/util/hisi-ptt-decoder/Build | 1 + .../hisi-ptt-decoder/hisi-ptt-pkt-decoder.c | 164 +++++++++++++++ .../hisi-ptt-decoder/hisi-ptt-pkt-decoder.h | 31 +++ tools/perf/util/hisi-ptt.c | 192 ++++++++++++++++++ tools/perf/util/hisi-ptt.h | 3 + 7 files changed, 396 insertions(+) create mode 100644 tools/perf/util/hisi-ptt-decoder/Build create mode 100644 tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.c create mode 100644 tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.h create mode 100644 tools/perf/util/hisi-ptt.c diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 815d235466d01..e315ecaec3233 100644 --- a/tools/perf/util/Build 
+++ b/tools/perf/util/Build @@ -118,6 +118,8 @@ perf-$(CONFIG_AUXTRACE) += intel-pt.o perf-$(CONFIG_AUXTRACE) += intel-bts.o perf-$(CONFIG_AUXTRACE) += arm-spe.o perf-$(CONFIG_AUXTRACE) += arm-spe-decoder/ +perf-$(CONFIG_AUXTRACE) += hisi-ptt.o +perf-$(CONFIG_AUXTRACE) += hisi-ptt-decoder/ perf-$(CONFIG_AUXTRACE) += s390-cpumsf.o ifdef CONFIG_LIBOPENCSD diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 0e53b796c5d5d..60d8beb662aa3 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -52,6 +52,7 @@ #include "intel-pt.h" #include "intel-bts.h" #include "arm-spe.h" +#include "hisi-ptt.h" #include "s390-cpumsf.h" #include "util/mmap.h" @@ -1321,6 +1322,8 @@ int perf_event__process_auxtrace_info(struct perf_session *session, err = s390_cpumsf_process_auxtrace_info(event, session); break; case PERF_AUXTRACE_HISI_PTT: + err = hisi_ptt_process_auxtrace_info(event, session); + break; case PERF_AUXTRACE_UNKNOWN: default: return -EINVAL; diff --git a/tools/perf/util/hisi-ptt-decoder/Build b/tools/perf/util/hisi-ptt-decoder/Build new file mode 100644 index 0000000000000..db3db8b750332 --- /dev/null +++ b/tools/perf/util/hisi-ptt-decoder/Build @@ -0,0 +1 @@ +perf-$(CONFIG_AUXTRACE) += hisi-ptt-pkt-decoder.o diff --git a/tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.c b/tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.c new file mode 100644 index 0000000000000..a17c423a526dd --- /dev/null +++ b/tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HiSilicon PCIe Trace and Tuning (PTT) support + * Copyright (c) 2022 HiSilicon Technologies Co., Ltd. 
+ */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <endian.h> +#include <byteswap.h> +#include <linux/bitops.h> +#include <stdarg.h> + +#include "../color.h" +#include "hisi-ptt-pkt-decoder.h" + +/* + * For 8DW format, the bit[31:11] of DW0 is always 0x1fffff, which can be + * used to distinguish the data format. + * 8DW format is like: + * bits [ 31:11 ][ 10:0 ] + * |---------------------------------------|-------------------| + * DW0 [ 0x1fffff ][ Reserved (0x7ff) ] + * DW1 [ Prefix ] + * DW2 [ Header DW0 ] + * DW3 [ Header DW1 ] + * DW4 [ Header DW2 ] + * DW5 [ Header DW3 ] + * DW6 [ Reserved (0x0) ] + * DW7 [ Time ] + * + * 4DW format is like: + * bits [31:30] [ 29:25 ][24][23][22][21][ 20:11 ][ 10:0 ] + * |-----|---------|---|---|---|---|-------------|-------------| + * DW0 [ Fmt ][ Type ][T9][T8][TH][SO][ Length ][ Time ] + * DW1 [ Header DW1 ] + * DW2 [ Header DW2 ] + * DW3 [ Header DW3 ] + */ + +enum hisi_ptt_8dw_pkt_field_type { + HISI_PTT_8DW_CHK_AND_RSV0, + HISI_PTT_8DW_PREFIX, + HISI_PTT_8DW_HEAD0, + HISI_PTT_8DW_HEAD1, + HISI_PTT_8DW_HEAD2, + HISI_PTT_8DW_HEAD3, + HISI_PTT_8DW_RSV1, + HISI_PTT_8DW_TIME, + HISI_PTT_8DW_TYPE_MAX +}; + +enum hisi_ptt_4dw_pkt_field_type { + HISI_PTT_4DW_HEAD1, + HISI_PTT_4DW_HEAD2, + HISI_PTT_4DW_HEAD3, + HISI_PTT_4DW_TYPE_MAX +}; + +static const char * const hisi_ptt_8dw_pkt_field_name[] = { + [HISI_PTT_8DW_PREFIX] = "Prefix", + [HISI_PTT_8DW_HEAD0] = "Header DW0", + [HISI_PTT_8DW_HEAD1] = "Header DW1", + [HISI_PTT_8DW_HEAD2] = "Header DW2", + [HISI_PTT_8DW_HEAD3] = "Header DW3", + [HISI_PTT_8DW_TIME] = "Time" +}; + +static const char * const hisi_ptt_4dw_pkt_field_name[] = { + [HISI_PTT_4DW_HEAD1] = "Header DW1", + [HISI_PTT_4DW_HEAD2] = "Header DW2", + [HISI_PTT_4DW_HEAD3] = "Header DW3", +}; + +union hisi_ptt_4dw { + struct { + uint32_t format : 2; + uint32_t type : 5; + uint32_t t9 : 1; + uint32_t t8 : 1; + uint32_t th : 1; + uint32_t so : 1; + uint32_t len : 10; + uint32_t time : 11; + }; 
+ uint32_t value; +}; + +static void hisi_ptt_print_pkt(const unsigned char *buf, int pos, const char *desc) +{ + const char *color = PERF_COLOR_BLUE; + int i; + + printf("."); + color_fprintf(stdout, color, " %08x: ", pos); + for (i = 0; i < HISI_PTT_FIELD_LENTH; i++) + color_fprintf(stdout, color, "%02x ", buf[pos + i]); + for (i = 0; i < HISI_PTT_MAX_SPACE_LEN; i++) + color_fprintf(stdout, color, " "); + color_fprintf(stdout, color, " %s\n", desc); +} + +static int hisi_ptt_8dw_kpt_desc(const unsigned char *buf, int pos) +{ + int i; + + for (i = 0; i < HISI_PTT_8DW_TYPE_MAX; i++) { + /* Do not show 8DW check field and reserved fields */ + if (i == HISI_PTT_8DW_CHK_AND_RSV0 || i == HISI_PTT_8DW_RSV1) { + pos += HISI_PTT_FIELD_LENTH; + continue; + } + + hisi_ptt_print_pkt(buf, pos, hisi_ptt_8dw_pkt_field_name[i]); + pos += HISI_PTT_FIELD_LENTH; + } + + return hisi_ptt_pkt_size[HISI_PTT_8DW_PKT]; +} + +static void hisi_ptt_4dw_print_dw0(const unsigned char *buf, int pos) +{ + const char *color = PERF_COLOR_BLUE; + union hisi_ptt_4dw dw0; + int i; + + dw0.value = *(uint32_t *)(buf + pos); + printf("."); + color_fprintf(stdout, color, " %08x: ", pos); + for (i = 0; i < HISI_PTT_FIELD_LENTH; i++) + color_fprintf(stdout, color, "%02x ", buf[pos + i]); + for (i = 0; i < HISI_PTT_MAX_SPACE_LEN; i++) + color_fprintf(stdout, color, " "); + + color_fprintf(stdout, color, + " %s %x %s %x %s %x %s %x %s %x %s %x %s %x %s %x\n", + "Format", dw0.format, "Type", dw0.type, "T9", dw0.t9, + "T8", dw0.t8, "TH", dw0.th, "SO", dw0.so, "Length", + dw0.len, "Time", dw0.time); +} + +static int hisi_ptt_4dw_kpt_desc(const unsigned char *buf, int pos) +{ + int i; + + hisi_ptt_4dw_print_dw0(buf, pos); + pos += HISI_PTT_FIELD_LENTH; + + for (i = 0; i < HISI_PTT_4DW_TYPE_MAX; i++) { + hisi_ptt_print_pkt(buf, pos, hisi_ptt_4dw_pkt_field_name[i]); + pos += HISI_PTT_FIELD_LENTH; + } + + return hisi_ptt_pkt_size[HISI_PTT_4DW_PKT]; +} + +int hisi_ptt_pkt_desc(const unsigned char *buf, int pos, 
enum hisi_ptt_pkt_type type) +{ + if (type == HISI_PTT_8DW_PKT) + return hisi_ptt_8dw_kpt_desc(buf, pos); + + return hisi_ptt_4dw_kpt_desc(buf, pos); +} diff --git a/tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.h b/tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.h new file mode 100644 index 0000000000000..e78f1b5bc836e --- /dev/null +++ b/tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * HiSilicon PCIe Trace and Tuning (PTT) support + * Copyright (c) 2022 HiSilicon Technologies Co., Ltd. + */ + +#ifndef INCLUDE__HISI_PTT_PKT_DECODER_H__ +#define INCLUDE__HISI_PTT_PKT_DECODER_H__ + +#include <stddef.h> +#include <stdint.h> + +#define HISI_PTT_8DW_CHECK_MASK GENMASK(31, 11) +#define HISI_PTT_IS_8DW_PKT GENMASK(31, 11) +#define HISI_PTT_MAX_SPACE_LEN 10 +#define HISI_PTT_FIELD_LENTH 4 + +enum hisi_ptt_pkt_type { + HISI_PTT_4DW_PKT, + HISI_PTT_8DW_PKT, + HISI_PTT_PKT_MAX +}; + +static int hisi_ptt_pkt_size[] = { + [HISI_PTT_4DW_PKT] = 16, + [HISI_PTT_8DW_PKT] = 32, +}; + +int hisi_ptt_pkt_desc(const unsigned char *buf, int pos, enum hisi_ptt_pkt_type type); + +#endif diff --git a/tools/perf/util/hisi-ptt.c b/tools/perf/util/hisi-ptt.c new file mode 100644 index 0000000000000..45b614bb73bfa --- /dev/null +++ b/tools/perf/util/hisi-ptt.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HiSilicon PCIe Trace and Tuning (PTT) support + * Copyright (c) 2022 HiSilicon Technologies Co., Ltd. 
+ */ + +#include <byteswap.h> +#include <endian.h> +#include <errno.h> +#include <inttypes.h> +#include <linux/bitops.h> +#include <linux/kernel.h> +#include <linux/log2.h> +#include <linux/types.h> +#include <linux/zalloc.h> +#include <stdlib.h> +#include <unistd.h> + +#include "auxtrace.h" +#include "color.h" +#include "debug.h" +#include "evsel.h" +#include "hisi-ptt.h" +#include "hisi-ptt-decoder/hisi-ptt-pkt-decoder.h" +#include "machine.h" +#include "session.h" +#include "tool.h" +#include <internal/lib.h> + +struct hisi_ptt { + struct auxtrace auxtrace; + u32 auxtrace_type; + struct perf_session *session; + struct machine *machine; + u32 pmu_type; +}; + +struct hisi_ptt_queue { + struct hisi_ptt *ptt; + struct auxtrace_buffer *buffer; +}; + +static enum hisi_ptt_pkt_type hisi_ptt_check_packet_type(unsigned char *buf) +{ + uint32_t head = *(uint32_t *)buf; + + if ((HISI_PTT_8DW_CHECK_MASK & head) == HISI_PTT_IS_8DW_PKT) + return HISI_PTT_8DW_PKT; + + return HISI_PTT_4DW_PKT; +} + +static void hisi_ptt_dump(struct hisi_ptt *ptt __maybe_unused, + unsigned char *buf, size_t len) +{ + const char *color = PERF_COLOR_BLUE; + enum hisi_ptt_pkt_type type; + size_t pos = 0; + int pkt_len; + + type = hisi_ptt_check_packet_type(buf); + len = round_down(len, hisi_ptt_pkt_size[type]); + color_fprintf(stdout, color, ". ... 
HISI PTT data: size %zu bytes\n", + len); + + while (len > 0) { + pkt_len = hisi_ptt_pkt_desc(buf, pos, type); + if (!pkt_len) + color_fprintf(stdout, color, " Bad packet!\n"); + + pos += pkt_len; + len -= pkt_len; + } +} + +static void hisi_ptt_dump_event(struct hisi_ptt *ptt, unsigned char *buf, + size_t len) +{ + printf(".\n"); + + hisi_ptt_dump(ptt, buf, len); +} + +static int hisi_ptt_process_event(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_sample *sample __maybe_unused, + struct perf_tool *tool __maybe_unused) +{ + return 0; +} + +static int hisi_ptt_process_auxtrace_event(struct perf_session *session, + union perf_event *event, + struct perf_tool *tool __maybe_unused) +{ + struct hisi_ptt *ptt = container_of(session->auxtrace, struct hisi_ptt, + auxtrace); + int fd = perf_data__fd(session->data); + int size = event->auxtrace.size; + void *data = malloc(size); + off_t data_offset; + int err; + + if (!data) + return -errno; + + if (perf_data__is_pipe(session->data)) { + data_offset = 0; + } else { + data_offset = lseek(fd, 0, SEEK_CUR); + if (data_offset == -1) + return -errno; + } + + err = readn(fd, data, size); + if (err != (ssize_t)size) { + free(data); + return -errno; + } + + if (dump_trace) + hisi_ptt_dump_event(ptt, data, size); + + return 0; +} + +static int hisi_ptt_flush(struct perf_session *session __maybe_unused, + struct perf_tool *tool __maybe_unused) +{ + return 0; +} + +static void hisi_ptt_free_events(struct perf_session *session __maybe_unused) +{ +} + +static void hisi_ptt_free(struct perf_session *session) +{ + struct hisi_ptt *ptt = container_of(session->auxtrace, struct hisi_ptt, + auxtrace); + + session->auxtrace = NULL; + free(ptt); +} + +static bool hisi_ptt_evsel_is_auxtrace(struct perf_session *session, + struct evsel *evsel) +{ + struct hisi_ptt *ptt = container_of(session->auxtrace, struct hisi_ptt, auxtrace); + + return evsel->core.attr.type == ptt->pmu_type; +} + +static 
void hisi_ptt_print_info(__u64 type) +{ + if (!dump_trace) + return; + + fprintf(stdout, " PMU Type %" PRId64 "\n", (s64) type); +} + +int hisi_ptt_process_auxtrace_info(union perf_event *event, + struct perf_session *session) +{ + struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info; + struct hisi_ptt *ptt; + + if (auxtrace_info->header.size < HISI_PTT_AUXTRACE_PRIV_SIZE + + sizeof(struct perf_record_auxtrace_info)) + return -EINVAL; + + ptt = zalloc(sizeof(*ptt)); + if (!ptt) + return -ENOMEM; + + ptt->session = session; + ptt->machine = &session->machines.host; /* No kvm support */ + ptt->auxtrace_type = auxtrace_info->type; + ptt->pmu_type = auxtrace_info->priv[0]; + + ptt->auxtrace.process_event = hisi_ptt_process_event; + ptt->auxtrace.process_auxtrace_event = hisi_ptt_process_auxtrace_event; + ptt->auxtrace.flush_events = hisi_ptt_flush; + ptt->auxtrace.free_events = hisi_ptt_free_events; + ptt->auxtrace.free = hisi_ptt_free; + ptt->auxtrace.evsel_is_auxtrace = hisi_ptt_evsel_is_auxtrace; + session->auxtrace = &ptt->auxtrace; + + hisi_ptt_print_info(auxtrace_info->priv[0]); + + return 0; +} diff --git a/tools/perf/util/hisi-ptt.h b/tools/perf/util/hisi-ptt.h index 82283c81b4c11..2db9b40562148 100644 --- a/tools/perf/util/hisi-ptt.h +++ b/tools/perf/util/hisi-ptt.h @@ -13,4 +13,7 @@ struct auxtrace_record *hisi_ptt_recording_init(int *err, struct perf_pmu *hisi_ptt_pmu); +int hisi_ptt_process_auxtrace_info(union perf_event *event, + struct perf_session *session); + #endif -- GitLab From a3a365655a28f12f07eddf4f3fd596987b175e1d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo <acme@redhat.com> Date: Fri, 7 Aug 2020 08:45:47 -0300 Subject: [PATCH 2012/2223] tools arch x86: Sync the msr-index.h copy with the kernel sources To pick up the changes in: b8d1d163604bd1e6 ("x86/apic: Don't disable x2APIC if locked") ca5b7c0d9621702e ("perf/x86/amd/lbr: Add LbrExtV2 branch record support") Addressing these tools/perf build warnings: diff -u 
tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Warning: Kernel ABI header at 'tools/arch/x86/include/asm/msr-index.h' differs from latest version at 'arch/x86/include/asm/msr-index.h' That makes the beautification scripts to pick some new entries: $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > before $ cp arch/x86/include/asm/msr-index.h tools/arch/x86/include/asm/msr-index.h $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > after $ diff -u before after --- before 2022-10-14 18:06:34.294561729 -0300 +++ after 2022-10-14 18:06:41.285744044 -0300 @@ -264,6 +264,7 @@ [0xc0000102 - x86_64_specific_MSRs_offset] = "KERNEL_GS_BASE", [0xc0000103 - x86_64_specific_MSRs_offset] = "TSC_AUX", [0xc0000104 - x86_64_specific_MSRs_offset] = "AMD64_TSC_RATIO", + [0xc000010e - x86_64_specific_MSRs_offset] = "AMD64_LBR_SELECT", [0xc000010f - x86_64_specific_MSRs_offset] = "AMD_DBG_EXTN_CFG", [0xc0000300 - x86_64_specific_MSRs_offset] = "AMD64_PERF_CNTR_GLOBAL_STATUS", [0xc0000301 - x86_64_specific_MSRs_offset] = "AMD64_PERF_CNTR_GLOBAL_CTL", $ Now one can trace systemwide asking to see backtraces to where that MSR is being read/written, see this example with a previous update: # perf trace -e msr:*_msr/max-stack=32/ --filter="msr>=IA32_U_CET && msr<=IA32_INT_SSP_TAB" ^C# If we use -v (verbose mode) we can see what it does behind the scenes: # perf trace -v -e msr:*_msr/max-stack=32/ --filter="msr>=IA32_U_CET && msr<=IA32_INT_SSP_TAB" Using CPUID AuthenticAMD-25-21-0 0x6a0 0x6a8 New filter for msr:read_msr: (msr>=0x6a0 && msr<=0x6a8) && (common_pid != 597499 && common_pid != 3313) 0x6a0 0x6a8 New filter for msr:write_msr: (msr>=0x6a0 && msr<=0x6a8) && (common_pid != 597499 && common_pid != 3313) mmap size 528384B ^C# Example with a frequent msr: # perf trace -v -e msr:*_msr/max-stack=32/ --filter="msr==IA32_SPEC_CTRL" --max-events 2 Using CPUID AuthenticAMD-25-21-0 0x48 New filter for msr:read_msr: (msr==0x48) && (common_pid != 2612129 && common_pid != 
3841) 0x48 New filter for msr:write_msr: (msr==0x48) && (common_pid != 2612129 && common_pid != 3841) mmap size 528384B Looking at the vmlinux_path (8 entries long) symsrc__init: build id mismatch for vmlinux. Using /proc/kcore for kernel data Using /proc/kallsyms for symbols 0.000 Timer/2525383 msr:write_msr(msr: IA32_SPEC_CTRL, val: 6) do_trace_write_msr ([kernel.kallsyms]) do_trace_write_msr ([kernel.kallsyms]) __switch_to_xtra ([kernel.kallsyms]) __switch_to ([kernel.kallsyms]) __schedule ([kernel.kallsyms]) schedule ([kernel.kallsyms]) futex_wait_queue_me ([kernel.kallsyms]) futex_wait ([kernel.kallsyms]) do_futex ([kernel.kallsyms]) __x64_sys_futex ([kernel.kallsyms]) do_syscall_64 ([kernel.kallsyms]) entry_SYSCALL_64_after_hwframe ([kernel.kallsyms]) __futex_abstimed_wait_common64 (/usr/lib64/libpthread-2.33.so) 0.030 :0/0 msr:write_msr(msr: IA32_SPEC_CTRL, val: 2) do_trace_write_msr ([kernel.kallsyms]) do_trace_write_msr ([kernel.kallsyms]) __switch_to_xtra ([kernel.kallsyms]) __switch_to ([kernel.kallsyms]) __schedule ([kernel.kallsyms]) schedule_idle ([kernel.kallsyms]) do_idle ([kernel.kallsyms]) cpu_startup_entry ([kernel.kallsyms]) secondary_startup_64_no_verify ([kernel.kallsyms]) # Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Daniel Sneddon <daniel.sneddon@linux.intel.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Ian Rogers <irogers@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sandipan Das <sandipan.das@amd.com> Link: https://lore.kernel.org/lkml/Y0nQkz2TUJxwfXJd@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- tools/arch/x86/include/asm/msr-index.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 6674bdb096f34..10ac52705892a 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ 
-155,6 +155,11 @@ * Return Stack Buffer Predictions. */ +#define ARCH_CAP_XAPIC_DISABLE BIT(21) /* + * IA32_XAPIC_DISABLE_STATUS MSR + * supported + */ + #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* * Writeback and invalidate the @@ -585,6 +590,9 @@ #define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 +/* AMD Last Branch Record MSRs */ +#define MSR_AMD64_LBR_SELECT 0xc000010e + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 @@ -756,6 +764,8 @@ #define MSR_AMD_DBG_EXTN_CFG 0xc000010f #define MSR_AMD_SAMP_BR_FROM 0xc0010300 +#define DBG_EXTN_CFG_LBRV2EN BIT_ULL(6) + #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 @@ -1054,4 +1064,12 @@ #define MSR_IA32_HW_FEEDBACK_PTR 0x17d0 #define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1 +/* x2APIC locked status */ +#define MSR_IA32_XAPIC_DISABLE_STATUS 0xBD +#define LEGACY_XAPIC_DISABLED BIT(0) /* + * x2APIC mode is locked and + * disabling x2APIC will cause + * a #GP + */ + #endif /* _ASM_X86_MSR_INDEX_H */ -- GitLab From b854b4ee66437e6e1622fda90529c814978cb4ca Mon Sep 17 00:00:00 2001 From: Paulo Alcantara <pc@cjr.nz> Date: Fri, 14 Oct 2022 17:14:54 -0300 Subject: [PATCH 2013/2223] cifs: fix double-fault crash during ntlmssp The crash occurred because we were calling memzero_explicit() on an already freed sess_data::iov[1] (ntlmsspblob) in sess_free_buffer(). Fix this by not calling memzero_explicit() on sess_data::iov[1] as it's already handled by callers.
Fixes: a4e430c8c8ba ("cifs: replace kfree() with kfree_sensitive() for sensitive data") Reviewed-by: Enzo Matsumiya <ematsumiya@suse.de> Signed-off-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/sess.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index c9edec7081de7..0435d1dfa9e11 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -1208,16 +1208,18 @@ out_free_smb_buf: static void sess_free_buffer(struct sess_data *sess_data) { - int i; + struct kvec *iov = sess_data->iov; - /* zero the session data before freeing, as it might contain sensitive info (keys, etc) */ - for (i = 0; i < 3; i++) - if (sess_data->iov[i].iov_base) - memzero_explicit(sess_data->iov[i].iov_base, sess_data->iov[i].iov_len); + /* + * Zero the session data before freeing, as it might contain sensitive info (keys, etc). + * Note that iov[1] is already freed by caller. + */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && iov[0].iov_base) + memzero_explicit(iov[0].iov_base, iov[0].iov_len); - free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + free_rsp_buf(sess_data->buf0_type, iov[0].iov_base); sess_data->buf0_type = CIFS_NO_BUFFER; - kfree(sess_data->iov[2].iov_base); + kfree_sensitive(iov[2].iov_base); } static int -- GitLab From f09bd695af3b8ab46fc24e5d6954a24104c38387 Mon Sep 17 00:00:00 2001 From: Steve French <stfrench@microsoft.com> Date: Fri, 14 Oct 2022 18:50:20 -0500 Subject: [PATCH 2014/2223] smb3: must initialize two ACL struct fields to zero Coverity spotted that we were not initializing Sbz1 and Sbz2 to zero in create_sd_buf.
Addresses-Coverity: 1513848 ("Uninitialized scalar variable") Cc: <stable@vger.kernel.org> Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/smb2pdu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index e1162217ad1a6..f8f89ff96c5d1 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2420,7 +2420,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) unsigned int acelen, acl_size, ace_count; unsigned int owner_offset = 0; unsigned int group_offset = 0; - struct smb3_acl acl; + struct smb3_acl acl = {}; *len = round_up(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8); @@ -2493,6 +2493,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) acl.AclRevision = ACL_REVISION; /* See 2.4.4.1 of MS-DTYP */ acl.AclSize = cpu_to_le16(acl_size); acl.AceCount = cpu_to_le16(ace_count); + /* acl.Sbz1 and Sbz2 MBZ so are not set here, but initialized above */ memcpy(aclptr, &acl, sizeof(struct smb3_acl)); buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd); -- GitLab From 625b60d4f9517903ad499633776825e67fdb0c16 Mon Sep 17 00:00:00 2001 From: Steve French <stfrench@microsoft.com> Date: Fri, 14 Oct 2022 19:18:32 -0500 Subject: [PATCH 2015/2223] cifs: lease key is uninitialized in smb1 paths It is cleaner to set lease key to zero in the places where leases are not supported (smb1 can not return lease keys so the field was uninitialized). 
Addresses-Coverity: 1513994 ("Uninitialized scalar variable") Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index cbd46ac59cd2f..a5c73c2af3a26 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -413,7 +413,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, struct tcon_link *tlink; struct cifs_tcon *tcon; struct TCP_Server_Info *server; - struct cifs_fid fid; + struct cifs_fid fid = {}; struct cifs_pending_open open; __u32 oplock; struct cifsFileInfo *file_info; -- GitLab From 2bff0659338e58a3a24698a35e7dcb2b62199ba4 Mon Sep 17 00:00:00 2001 From: Steve French <stfrench@microsoft.com> Date: Fri, 14 Oct 2022 20:00:32 -0500 Subject: [PATCH 2016/2223] cifs: lease key is uninitialized in two additional functions when smb1 cifs_open and _cifsFileInfo_put also end up with lease_key uninitialized in smb1 mounts. It is cleaner to set lease key to zero in these places where leases are not supported (smb1 can not return lease keys so the field was uninitialized). 
Addresses-Coverity: 1514207 ("Uninitialized scalar variable") Addresses-Coverity: 1514331 ("Uninitialized scalar variable") Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index dcec1690312be..f6ffee514c345 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -489,7 +489,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, struct cifsInodeInfo *cifsi = CIFS_I(inode); struct super_block *sb = inode->i_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - struct cifs_fid fid; + struct cifs_fid fid = {}; struct cifs_pending_open open; bool oplock_break_cancelled; @@ -571,7 +571,7 @@ int cifs_open(struct inode *inode, struct file *file) void *page; const char *full_path; bool posix_open_ok = false; - struct cifs_fid fid; + struct cifs_fid fid = {}; struct cifs_pending_open open; struct cifs_open_info_data data = {}; -- GitLab From e3e9463414f610e91528f2b920b8cb655f4bae33 Mon Sep 17 00:00:00 2001 From: Steve French <stfrench@microsoft.com> Date: Sat, 15 Oct 2022 00:43:22 -0500 Subject: [PATCH 2017/2223] smb3: improve SMB3 change notification support Change notification is a commonly supported feature by most servers, but the current ioctl to request notification when a directory is changed does not return the information about what changed (even though it is returned by the server in the SMB3 change notify response), it simply returns when there is a change. This ioctl improves upon CIFS_IOC_NOTIFY by returning the notify information structure which includes the name of the file(s) that changed and why. See MS-SMB2 2.2.35 for details on the individual filter flags and the file_notify_information structure returned. 
To use this, simply pass in the following (with enough space to fit at least one file_notify_information structure) struct __attribute__((__packed__)) smb3_notify_info { uint32_t completion_filter; bool watch_tree; uint32_t data_len; uint8_t notify_data[]; } __packed; using CIFS_IOC_NOTIFY_INFO 0xc009cf0b or equivalently _IOWR(CIFS_IOCTL_MAGIC, 11, struct smb3_notify_info) The ioctl will block until the server detects a change to that directory or its subdirectories (if watch_tree is set). Acked-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Acked-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cifs_ioctl.h | 8 ++++++++ fs/cifs/cifsglob.h | 2 +- fs/cifs/ioctl.c | 25 ++++++++++++++++++++++++- fs/cifs/smb2ops.c | 35 ++++++++++++++++++++++++++++------- fs/cifs/smb2pdu.c | 30 +++++++++++++++++++++++++++--- fs/cifs/smb2proto.h | 3 ++- 6 files changed, 90 insertions(+), 13 deletions(-) diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index b87cbbe6d2d4b..d86d78d5bfdc1 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -91,6 +91,13 @@ struct smb3_notify { bool watch_tree; } __packed; +struct smb3_notify_info { + __u32 completion_filter; + bool watch_tree; + __u32 data_len; /* size of notify data below */ + __u8 notify_data[]; +} __packed; + #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) #define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) @@ -100,6 +107,7 @@ struct smb3_notify { #define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info) #define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify) #define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info) +#define CIFS_IOC_NOTIFY_INFO _IOWR(CIFS_IOCTL_MAGIC, 11, struct smb3_notify_info) #define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32) /* diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 9c0253835f1c7..1420acf987f03 100644 --- a/fs/cifs/cifsglob.h +++
b/fs/cifs/cifsglob.h @@ -454,7 +454,7 @@ struct smb_version_operations { int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *src_file, void __user *); int (*notify)(const unsigned int xid, struct file *pfile, - void __user *pbuf); + void __user *pbuf, bool return_changes); int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const unsigned char *, char *, unsigned int *); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index b6e6e5d6c8dd6..89d5fa8873649 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -484,12 +484,35 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) tcon = tlink_tcon(tlink); if (tcon && tcon->ses->server->ops->notify) { rc = tcon->ses->server->ops->notify(xid, - filep, (void __user *)arg); + filep, (void __user *)arg, + false /* no ret data */); cifs_dbg(FYI, "ioctl notify rc %d\n", rc); } else rc = -EOPNOTSUPP; cifs_put_tlink(tlink); break; + case CIFS_IOC_NOTIFY_INFO: + if (!S_ISDIR(inode->i_mode)) { + /* Notify can only be done on directories */ + rc = -EOPNOTSUPP; + break; + } + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + break; + } + tcon = tlink_tcon(tlink); + if (tcon && tcon->ses->server->ops->notify) { + rc = tcon->ses->server->ops->notify(xid, + filep, (void __user *)arg, + true /* return details */); + cifs_dbg(FYI, "ioctl notify info rc %d\n", rc); + } else + rc = -EOPNOTSUPP; + cifs_put_tlink(tlink); + break; case CIFS_IOC_SHUTDOWN: rc = cifs_shutdown(inode->i_sb, arg); break; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index b907d1fab8d98..17b25153cb689 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2018,9 +2018,10 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, static int smb3_notify(const unsigned int xid, struct file *pfile, - void __user *ioc_buf) + void __user *ioc_buf, bool return_changes) { - struct smb3_notify notify; 
+ struct smb3_notify_info notify; + struct smb3_notify_info __user *pnotify_buf; struct dentry *dentry = pfile->f_path.dentry; struct inode *inode = file_inode(pfile); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -2028,10 +2029,12 @@ smb3_notify(const unsigned int xid, struct file *pfile, struct cifs_fid fid; struct cifs_tcon *tcon; const unsigned char *path; + char *returned_ioctl_info = NULL; void *page = alloc_dentry_path(); __le16 *utf16_path = NULL; u8 oplock = SMB2_OPLOCK_LEVEL_NONE; int rc = 0; + __u32 ret_len = 0; path = build_path_from_dentry(dentry, page); if (IS_ERR(path)) { @@ -2045,9 +2048,17 @@ smb3_notify(const unsigned int xid, struct file *pfile, goto notify_exit; } - if (copy_from_user(&notify, ioc_buf, sizeof(struct smb3_notify))) { - rc = -EFAULT; - goto notify_exit; + if (return_changes) { + if (copy_from_user(&notify, ioc_buf, sizeof(struct smb3_notify_info))) { + rc = -EFAULT; + goto notify_exit; + } + } else { + if (copy_from_user(&notify, ioc_buf, sizeof(struct smb3_notify))) { + rc = -EFAULT; + goto notify_exit; + } + notify.data_len = 0; } tcon = cifs_sb_master_tcon(cifs_sb); @@ -2064,12 +2075,22 @@ smb3_notify(const unsigned int xid, struct file *pfile, goto notify_exit; rc = SMB2_change_notify(xid, tcon, fid.persistent_fid, fid.volatile_fid, - notify.watch_tree, notify.completion_filter); + notify.watch_tree, notify.completion_filter, + notify.data_len, &returned_ioctl_info, &ret_len); SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); cifs_dbg(FYI, "change notify for path %s rc %d\n", path, rc); - + if (return_changes && (ret_len > 0) && (notify.data_len > 0)) { + if (ret_len > notify.data_len) + ret_len = notify.data_len; + pnotify_buf = (struct smb3_notify_info __user *)ioc_buf; + if (copy_to_user(pnotify_buf->notify_data, returned_ioctl_info, ret_len)) + rc = -EFAULT; + else if (copy_to_user(&pnotify_buf->data_len, &ret_len, sizeof(ret_len))) + rc = -EFAULT; + } + kfree(returned_ioctl_info); notify_exit: 
free_dentry_path(page); kfree(utf16_path); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index f8f89ff96c5d1..a2384509ea84b 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3710,11 +3710,13 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, bool watch_tree, - u32 completion_filter) + u32 completion_filter, u32 max_out_data_len, char **out_data, + u32 *plen /* returned data len */) { struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = cifs_pick_channel(ses); struct smb_rqst rqst; + struct smb2_change_notify_rsp *smb_rsp; struct kvec iov[1]; struct kvec rsp_iov = {NULL, 0}; int resp_buftype = CIFS_NO_BUFFER; @@ -3730,6 +3732,9 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, memset(&rqst, 0, sizeof(struct smb_rqst)); memset(&iov, 0, sizeof(iov)); + if (plen) + *plen = 0; + rqst.rq_iov = iov; rqst.rq_nvec = 1; @@ -3748,9 +3753,28 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, cifs_stats_fail_inc(tcon, SMB2_CHANGE_NOTIFY_HE); trace_smb3_notify_err(xid, persistent_fid, tcon->tid, ses->Suid, (u8)watch_tree, completion_filter, rc); - } else + } else { trace_smb3_notify_done(xid, persistent_fid, tcon->tid, - ses->Suid, (u8)watch_tree, completion_filter); + ses->Suid, (u8)watch_tree, completion_filter); + /* validate that notify information is plausible */ + if ((rsp_iov.iov_base == NULL) || + (rsp_iov.iov_len < sizeof(struct smb2_change_notify_rsp))) + goto cnotify_exit; + + smb_rsp = (struct smb2_change_notify_rsp *)rsp_iov.iov_base; + + smb2_validate_iov(le16_to_cpu(smb_rsp->OutputBufferOffset), + le32_to_cpu(smb_rsp->OutputBufferLength), &rsp_iov, + sizeof(struct file_notify_information)); + + *out_data = kmemdup((char *)smb_rsp + le16_to_cpu(smb_rsp->OutputBufferOffset), + le32_to_cpu(smb_rsp->OutputBufferLength), GFP_KERNEL); + if (*out_data == NULL) { + rc = -ENOMEM; + 
goto cnotify_exit; + } else + *plen = le32_to_cpu(smb_rsp->OutputBufferLength); + } cnotify_exit: if (rqst.rq_iov) diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 7818d0b835672..be21b5d26f67e 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -144,7 +144,8 @@ extern int SMB2_ioctl_init(struct cifs_tcon *tcon, extern void SMB2_ioctl_free(struct smb_rqst *rqst); extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, bool watch_tree, - u32 completion_filter); + u32 completion_filter, u32 max_out_data_len, + char **out_data, u32 *plen /* returned data len */); extern int __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, -- GitLab From 34a0bac084e49324c29e6d0984d24096e02c6314 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt <palmer@rivosinc.com> Date: Thu, 13 Oct 2022 14:46:37 -0700 Subject: [PATCH 2018/2223] MAINTAINERS: git://github -> https://github.com for openrisc Github deprecated the git:// links about a year ago, so let's move to the https:// URLs instead. Reported-by: Conor Dooley <conor.dooley@microchip.com> Link: https://github.blog/2021-09-01-improving-git-protocol-security-github/ Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> Signed-off-by: Stafford Horne <shorne@gmail.com> --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f5ca4aefd184c..4c66999091661 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15240,7 +15240,7 @@ M: Stafford Horne <shorne@gmail.com> L: openrisc@lists.librecores.org S: Maintained W: http://openrisc.io -T: git git://github.com/openrisc/linux.git +T: git https://github.com/openrisc/linux.git F: Documentation/devicetree/bindings/openrisc/ F: Documentation/openrisc/ F: arch/openrisc/ -- GitLab From 4bb7f6c2781e46fc5bd00475a66df2ea30ef330d Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" <rafael.j.wysocki@intel.com> Date: Thu, 13 Oct 2022 14:50:28 +0200 Subject: [PATCH 2019/2223] thermal: intel_powerclamp: Use first online CPU as control_cpu Commit 68b99e94a4a2 ("thermal: intel_powerclamp: Use get_cpu() instead of smp_processor_id() to avoid crash") fixed an issue related to using smp_processor_id() in preemptible context by replacing it with a pair of get_cpu()/put_cpu(), but what is needed there really is any online CPU and not necessarily the one currently running the code. Arguably, getting the one that's running the code in there is confusing. For this reason, simply give the control CPU role to the first online one which automatically will be CPU0 if it is online, so one check can be dropped from the code for an added benefit. Link: https://lore.kernel.org/linux-pm/20221011113646.GA12080@duo.ucw.cz/ Fixes: 68b99e94a4a2 ("thermal: intel_powerclamp: Use get_cpu() instead of smp_processor_id() to avoid crash") Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Reviewed-by: Chen Yu <yu.c.chen@intel.com> --- drivers/thermal/intel/intel_powerclamp.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c index 2a5570b9799a9..b80e25ec12615 100644 --- a/drivers/thermal/intel/intel_powerclamp.c +++ b/drivers/thermal/intel/intel_powerclamp.c @@ -516,11 +516,7 @@ static int start_power_clamp(void) cpus_read_lock(); /* prefer BSP */ - control_cpu = 0; - if (!cpu_online(control_cpu)) { - control_cpu = get_cpu(); - put_cpu(); - } + control_cpu = cpumask_first(cpu_online_mask); clamping = true; schedule_delayed_work(&poll_pkg_cstate_work, 0); -- GitLab From e36ce448a08d43de69e7449eb225805a7a8addf8 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Sat, 15 Oct 2022 13:34:29 +0900 Subject: [PATCH 2020/2223] mm/slab: use kmalloc_node() for off slab freelist_idx_t array allocation After commit d6a71648dbc0 ("mm/slab: 
kmalloc: pass requests larger than order-1 page to page allocator"), SLAB passes large ( > PAGE_SIZE * 2) requests to buddy like SLUB does. SLAB has been using kmalloc caches to allocate freelist_idx_t array for off slab caches. But after the commit, freelist_size can be bigger than KMALLOC_MAX_CACHE_SIZE. Instead of using pointer to kmalloc cache, use kmalloc_node() and only check if the kmalloc cache is off slab during calculate_slab_order(). If freelist_size > KMALLOC_MAX_CACHE_SIZE, no looping condition happens as it allocates freelist_idx_t array directly from buddy. Link: https://lore.kernel.org/all/20221014205818.GA1428667@roeck-us.net/ Reported-and-tested-by: Guenter Roeck <linux@roeck-us.net> Fixes: d6a71648dbc0 ("mm/slab: kmalloc: pass requests larger than order-1 page to page allocator") Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka <vbabka@suse.cz> --- include/linux/slab_def.h | 1 - mm/slab.c | 37 +++++++++++++++++++------------------ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index e24c9aff6fed0..f0ffad6a33653 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -33,7 +33,6 @@ struct kmem_cache { size_t colour; /* cache colouring range */ unsigned int colour_off; /* colour offset */ - struct kmem_cache *freelist_cache; unsigned int freelist_size; /* constructor func */ diff --git a/mm/slab.c b/mm/slab.c index a5486ff8362a1..d1f6e2c64c2ec 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1619,7 +1619,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slab) * although actual page can be freed in rcu context */ if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->freelist_cache, freelist); + kfree(freelist); } /* @@ -1671,21 +1671,27 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, if (flags & CFLGS_OFF_SLAB) { struct kmem_cache *freelist_cache; size_t freelist_size; + size_t freelist_cache_size; 
freelist_size = num * sizeof(freelist_idx_t); - freelist_cache = kmalloc_slab(freelist_size, 0u); - if (!freelist_cache) - continue; - - /* - * Needed to avoid possible looping condition - * in cache_grow_begin() - */ - if (OFF_SLAB(freelist_cache)) - continue; + if (freelist_size > KMALLOC_MAX_CACHE_SIZE) { + freelist_cache_size = PAGE_SIZE << get_order(freelist_size); + } else { + freelist_cache = kmalloc_slab(freelist_size, 0u); + if (!freelist_cache) + continue; + freelist_cache_size = freelist_cache->size; + + /* + * Needed to avoid possible looping condition + * in cache_grow_begin() + */ + if (OFF_SLAB(freelist_cache)) + continue; + } /* check if off slab has enough benefit */ - if (freelist_cache->size > cachep->size / 2) + if (freelist_cache_size > cachep->size / 2) continue; } @@ -2061,11 +2067,6 @@ done: cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); #endif - if (OFF_SLAB(cachep)) { - cachep->freelist_cache = - kmalloc_slab(cachep->freelist_size, 0u); - } - err = setup_cpu_cache(cachep, gfp); if (err) { __kmem_cache_release(cachep); @@ -2292,7 +2293,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, freelist = NULL; else if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ - freelist = kmem_cache_alloc_node(cachep->freelist_cache, + freelist = kmalloc_node(cachep->freelist_size, local_flags, nodeid); } else { /* We will use last bytes at the slab for freelist */ -- GitLab From 3753af778dd9d0d5199d6a7d01b0ead33135d095 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada <masahiroy@kernel.org> Date: Sat, 15 Oct 2022 05:18:11 +0900 Subject: [PATCH 2021/2223] kbuild: fix single directory build Commit f110e5a250e3 ("kbuild: refactor single builds of *.ko") was wrong. KBUILD_MODULES _is_ needed for single builds. Otherwise, "make foo/bar/baz/" does not build module objects at all. 
Fixes: f110e5a250e3 ("kbuild: refactor single builds of *.ko") Reported-by: David Sterba <dsterba@suse.cz> Signed-off-by: Masahiro Yamada <masahiroy@kernel.org> Tested-by: David Sterba <dsterba@suse.com> --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 85a63a1d29b37..48a005fc69ca0 100644 --- a/Makefile +++ b/Makefile @@ -1978,6 +1978,8 @@ endif single-goals := $(addprefix $(build-dir)/, $(single-no-ko)) +KBUILD_MODULES := 1 + endif # Preset locale variables to speed up the build process. Limit locale -- GitLab From 0a6de78cff600cb991f2a1b7ed376935871796a0 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor <nathan@kernel.org> Date: Fri, 14 Oct 2022 13:42:11 -0700 Subject: [PATCH 2022/2223] lib/Kconfig.debug: Add check for non-constant .{s,u}leb128 support to DWARF5 When building with a RISC-V kernel with DWARF5 debug info using clang and the GNU assembler, several instances of the following error appear: /tmp/vgettimeofday-48aa35.s:2963: Error: non-constant .uleb128 is not supported Dumping the .s file reveals these .uleb128 directives come from .debug_loc and .debug_ranges: .Ldebug_loc0: .byte 4 # DW_LLE_offset_pair .uleb128 .Lfunc_begin0-.Lfunc_begin0 # starting offset .uleb128 .Ltmp1-.Lfunc_begin0 # ending offset .byte 1 # Loc expr size .byte 90 # DW_OP_reg10 .byte 0 # DW_LLE_end_of_list .Ldebug_ranges0: .byte 4 # DW_RLE_offset_pair .uleb128 .Ltmp6-.Lfunc_begin0 # starting offset .uleb128 .Ltmp27-.Lfunc_begin0 # ending offset .byte 4 # DW_RLE_offset_pair .uleb128 .Ltmp28-.Lfunc_begin0 # starting offset .uleb128 .Ltmp30-.Lfunc_begin0 # ending offset .byte 0 # DW_RLE_end_of_list There is an outstanding binutils issue to support a non-constant operand to .sleb128 and .uleb128 in GAS for RISC-V but there does not appear to be any movement on it, due to concerns over how it would work with linker relaxation. 
To avoid these build errors, prevent DWARF5 from being selected when using clang and an assembler that does not have support for these symbol deltas, which can be easily checked in Kconfig with as-instr plus the small test program from the dwz test suite from the binutils issue. Link: https://sourceware.org/bugzilla/show_bug.cgi?id=27215 Link: https://github.com/ClangBuiltLinux/linux/issues/1719 Signed-off-by: Nathan Chancellor <nathan@kernel.org> Reviewed-by: Nick Desaulniers <ndesaulniers@google.com> Signed-off-by: Masahiro Yamada <masahiroy@kernel.org> --- lib/Kconfig.debug | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index db8d9271cabf5..5c1c635758951 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -231,6 +231,11 @@ config DEBUG_INFO in the "Debug information" choice below, indicating that debug information will be generated for build targets. +# Clang is known to generate .{s,u}leb128 with symbol deltas with DWARF5, which +# some targets may not support: https://sourceware.org/bugzilla/show_bug.cgi?id=27215 +config AS_HAS_NON_CONST_LEB128 + def_bool $(as-instr,.uleb128 .Lexpr_end4 - .Lexpr_start3\n.Lexpr_start3:\n.Lexpr_end4:) + choice prompt "Debug information" depends on DEBUG_KERNEL @@ -253,7 +258,7 @@ config DEBUG_INFO_NONE config DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT bool "Rely on the toolchain's implicit default DWARF version" select DEBUG_INFO - depends on !CC_IS_CLANG || AS_IS_LLVM || CLANG_VERSION < 140000 || (AS_IS_GNU && AS_VERSION >= 23502) + depends on !CC_IS_CLANG || AS_IS_LLVM || CLANG_VERSION < 140000 || (AS_IS_GNU && AS_VERSION >= 23502 && AS_HAS_NON_CONST_LEB128) help The implicit default version of DWARF debug info produced by a toolchain changes over time. 
@@ -277,7 +282,7 @@ config DEBUG_INFO_DWARF4 config DEBUG_INFO_DWARF5 bool "Generate DWARF Version 5 debuginfo" select DEBUG_INFO - depends on !CC_IS_CLANG || AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502) + depends on !CC_IS_CLANG || AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502 && AS_HAS_NON_CONST_LEB128) help Generate DWARF v5 debug info. Requires binutils 2.35.2, gcc 5.0+ (gcc 5.0+ accepts the -gdwarf-5 flag but only had partial support for some -- GitLab From 80493877d7d0ae0cbe62921d748682811c58026f Mon Sep 17 00:00:00 2001 From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Date: Sun, 16 Oct 2022 00:53:51 +0900 Subject: [PATCH 2023/2223] Revert "cpumask: fix checking valid cpu range". This reverts commit 78e5a3399421 ("cpumask: fix checking valid cpu range"). syzbot is hitting WARN_ON_ONCE(cpu >= nr_cpumask_bits) warning at cpu_max_bits_warn() [1], for commit 78e5a3399421 ("cpumask: fix checking valid cpu range") is broken. Obviously that patch hits WARN_ON_ONCE() when e.g. reading /proc/cpuinfo because passing "cpu + 1" instead of "cpu" will trivially hit cpu == nr_cpumask_bits condition. Although syzbot found this problem in linux-next.git on 2022/09/27 [2], this problem was not fixed immediately. As a result, that patch was sent to linux.git before the patch author recognizes this problem, and syzbot started failing to test changes in linux.git since 2022/10/10 [3]. Andrew Jones proposed a fix for x86 and riscv architectures [4]. But [2] and [5] indicate that affected locations are not limited to arch code. More delay before we find and fix affected locations, less tested kernel (and more difficult to bisect and fix) before release. We should have inspected and fixed basically all cpumask users before applying that patch. We should not crash kernels in order to ask existing cpumask users to update their code, even if limited to CONFIG_DEBUG_PER_CPU_MAPS=y case. 
Link: https://syzkaller.appspot.com/bug?extid=d0fd2bf0dd6da72496dd [1] Link: https://syzkaller.appspot.com/bug?extid=21da700f3c9f0bc40150 [2] Link: https://syzkaller.appspot.com/bug?extid=51a652e2d24d53e75734 [3] Link: https://lkml.kernel.org/r/20221014155845.1986223-1-ajones@ventanamicro.com [4] Link: https://syzkaller.appspot.com/bug?extid=4d46c43d81c3bd155060 [5] Reported-by: Andrew Jones <ajones@ventanamicro.com> Reported-by: syzbot+d0fd2bf0dd6da72496dd@syzkaller.appspotmail.com Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Cc: Yury Norov <yury.norov@gmail.com> Cc: Borislav Petkov <bp@alien8.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/cpumask.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 2f065ad97541f..c2aa0aa26b457 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -174,8 +174,9 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp) static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { - /* n is a prior cpu */ - cpumask_check(n + 1); + /* -1 is a legal arg here. */ + if (n != -1) + cpumask_check(n); return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1); } @@ -188,8 +189,9 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp) */ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { - /* n is a prior cpu */ - cpumask_check(n + 1); + /* -1 is a legal arg here. */ + if (n != -1) + cpumask_check(n); return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1); } @@ -229,8 +231,9 @@ static inline unsigned int cpumask_next_and(int n, const struct cpumask *src1p, const struct cpumask *src2p) { - /* n is a prior cpu */ - cpumask_check(n + 1); + /* -1 is a legal arg here. 
*/ + if (n != -1) + cpumask_check(n); return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits, n + 1); } @@ -260,8 +263,8 @@ static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) { cpumask_check(start); - /* n is a prior cpu */ - cpumask_check(n + 1); + if (n != -1) + cpumask_check(n); /* * Return the first available CPU when wrapping, or when starting before cpu0, -- GitLab From 2d1f274b95c6e4ba6a813b3b8e7a1a38d54a0a08 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Sat, 15 Oct 2022 21:24:41 +0000 Subject: [PATCH 2024/2223] skmsg: pass gfp argument to alloc_sk_msg() syzbot found that alloc_sk_msg() could be called from a non sleepable context. sk_psock_verdict_recv() uses rcu_read_lock() protection. We need the callers to pass a gfp_t argument to avoid issues. syzbot report was: BUG: sleeping function called from invalid context at include/linux/sched/mm.h:274 in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 3613, name: syz-executor414 preempt_count: 0, expected: 0 RCU nest depth: 1, expected: 0 INFO: lockdep is turned off. 
CPU: 0 PID: 3613 Comm: syz-executor414 Not tainted 6.0.0-syzkaller-09589-g55be6084c8e0 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/22/2022 Call Trace: <TASK> __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e3/0x2cb lib/dump_stack.c:106 __might_resched+0x538/0x6a0 kernel/sched/core.c:9877 might_alloc include/linux/sched/mm.h:274 [inline] slab_pre_alloc_hook mm/slab.h:700 [inline] slab_alloc_node mm/slub.c:3162 [inline] slab_alloc mm/slub.c:3256 [inline] kmem_cache_alloc_trace+0x59/0x310 mm/slub.c:3287 kmalloc include/linux/slab.h:600 [inline] kzalloc include/linux/slab.h:733 [inline] alloc_sk_msg net/core/skmsg.c:507 [inline] sk_psock_skb_ingress_self+0x5c/0x330 net/core/skmsg.c:600 sk_psock_verdict_apply+0x395/0x440 net/core/skmsg.c:1014 sk_psock_verdict_recv+0x34d/0x560 net/core/skmsg.c:1201 tcp_read_skb+0x4a1/0x790 net/ipv4/tcp.c:1770 tcp_rcv_established+0x129d/0x1a10 net/ipv4/tcp_input.c:5971 tcp_v4_do_rcv+0x479/0xac0 net/ipv4/tcp_ipv4.c:1681 sk_backlog_rcv include/net/sock.h:1109 [inline] __release_sock+0x1d8/0x4c0 net/core/sock.c:2906 release_sock+0x5d/0x1c0 net/core/sock.c:3462 tcp_sendmsg+0x36/0x40 net/ipv4/tcp.c:1483 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg net/socket.c:734 [inline] __sys_sendto+0x46d/0x5f0 net/socket.c:2117 __do_sys_sendto net/socket.c:2129 [inline] __se_sys_sendto net/socket.c:2125 [inline] __x64_sys_sendto+0xda/0xf0 net/socket.c:2125 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 43312915b5ba ("skmsg: Get rid of unncessary memset()") Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Cong Wang <cong.wang@bytedance.com> Cc: Daniel Borkmann <daniel@iogearbox.net> Cc: John Fastabend <john.fastabend@gmail.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- net/core/skmsg.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index ca70525621c71..1efdc47a999b4 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -500,11 +500,11 @@ bool sk_msg_is_readable(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_msg_is_readable); -static struct sk_msg *alloc_sk_msg(void) +static struct sk_msg *alloc_sk_msg(gfp_t gfp) { struct sk_msg *msg; - msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL); + msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN); if (unlikely(!msg)) return NULL; sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); @@ -520,7 +520,7 @@ static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, if (!sk_rmem_schedule(sk, skb, skb->truesize)) return NULL; - return alloc_sk_msg(); + return alloc_sk_msg(GFP_KERNEL); } static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, @@ -597,7 +597,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len) { - struct sk_msg *msg = alloc_sk_msg(); + struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC); struct sock *sk = psock->sk; int err; -- GitLab From 9abf2313adc1ca1b6180c508c25f22f9395cc780 Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@linux-foundation.org> Date: Sun, 16 Oct 2022 15:36:24 -0700 Subject: [PATCH 2025/2223] Linux 6.1-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c690361b393ff..f41ec8c8426ba 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 -PATCHLEVEL = 0 +PATCHLEVEL = 1 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* -- GitLab From b6291023f659482fdb25f8ee5ab00c74682e658c Mon Sep 17 00:00:00 2001 From: Wilken Gottwalt <wilken.gottwalt@posteo.net> Date: Mon, 3 Oct 2022 09:05:27 
+0000 Subject: [PATCH 2026/2223] hwmon: (corsair-psu) fix typo in USB id description Fix spelling mistake (Corsaur -> Corsair). Fixes: 0cf46a653bda ("hwmon: (corsair-psu) add USB id of new revision of the HX1000i psu") Signed-off-by: Wilken Gottwalt <wilken.gottwalt@posteo.net> Link: https://lore.kernel.org/r/Yzql13NOvQLlrye1@monster.localdomain Signed-off-by: Guenter Roeck <linux@roeck-us.net> --- drivers/hwmon/corsair-psu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/corsair-psu.c b/drivers/hwmon/corsair-psu.c index 345d883ab0442..c1c27e475f6d6 100644 --- a/drivers/hwmon/corsair-psu.c +++ b/drivers/hwmon/corsair-psu.c @@ -820,7 +820,7 @@ static const struct hid_device_id corsairpsu_idtable[] = { { HID_USB_DEVICE(0x1b1c, 0x1c0b) }, /* Corsair RM750i */ { HID_USB_DEVICE(0x1b1c, 0x1c0c) }, /* Corsair RM850i */ { HID_USB_DEVICE(0x1b1c, 0x1c0d) }, /* Corsair RM1000i */ - { HID_USB_DEVICE(0x1b1c, 0x1c1e) }, /* Corsaur HX1000i revision 2 */ + { HID_USB_DEVICE(0x1b1c, 0x1c1e) }, /* Corsair HX1000i revision 2 */ { }, }; MODULE_DEVICE_TABLE(hid, corsairpsu_idtable); -- GitLab From 3008d20f5445ee6f214e3b2d42114c8c923d9625 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer <matthias.schiffer@ew.tq-group.com> Date: Thu, 13 Oct 2022 15:59:51 +0200 Subject: [PATCH 2027/2223] hwmon: (pwm-fan) Explicitly switch off fan power when setting pwm1_enable to 0 When pwm1_enable is changed from 1 to 0 while pwm1 == 0, the regulator is not switched off as expected. The reason is that when the fan is already off, ctx->enabled is false, so pwm_fan_power_off() will be a no-op. Handle this case explicitly in pwm_fan_update_enable() by calling pwm_fan_switch_power() directly. 
Fixes: b99152d4f04b ("hwmon: (pwm-fan) Switch regulator dynamically") Signed-off-by: Matthias Schiffer <matthias.schiffer@ew.tq-group.com> Link: https://lore.kernel.org/r/20221013135951.4902-1-matthias.schiffer@ew.tq-group.com Signed-off-by: Guenter Roeck <linux@roeck-us.net> --- drivers/hwmon/pwm-fan.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/pwm-fan.c b/drivers/hwmon/pwm-fan.c index dc3d9a22d9176..83a347ca35da5 100644 --- a/drivers/hwmon/pwm-fan.c +++ b/drivers/hwmon/pwm-fan.c @@ -257,7 +257,10 @@ static int pwm_fan_update_enable(struct pwm_fan_ctx *ctx, long val) if (val == 0) { /* Disable pwm-fan unconditionally */ - ret = __set_pwm(ctx, 0); + if (ctx->enabled) + ret = __set_pwm(ctx, 0); + else + ret = pwm_fan_switch_power(ctx, false); if (ret) ctx->enable_mode = old_val; pwm_fan_update_state(ctx, 0); -- GitLab From 664609e49f1c84fc97987b2bf64544e586b8849c Mon Sep 17 00:00:00 2001 From: Yue Hu <huyue2@coolpad.com> Date: Wed, 5 Oct 2022 09:35:28 +0800 Subject: [PATCH 2028/2223] erofs: fix illegal unmapped accesses in z_erofs_fill_inode_lazy() Note that we are still accessing 'h_idata_size' and 'h_fragmentoff' after calling erofs_put_metabuf(), that is not correct. Fix it. 
Fixes: ab92184ff8f1 ("erofs: add on-disk compressed tail-packing inline support") Fixes: b15b2e307c3a ("erofs: support on-disk compressed fragments data") Signed-off-by: Yue Hu <huyue2@coolpad.com> Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com> Reviewed-by: Chao Yu <chao@kernel.org> Link: https://lore.kernel.org/r/20221005013528.62977-1-zbestahu@163.com Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com> --- fs/erofs/zmap.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 44c27ef39c436..0bb66927e3d06 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -57,8 +57,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + vi->xattr_isize, 8); - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), - EROFS_KMAP_ATOMIC); + kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); if (IS_ERR(kaddr)) { err = PTR_ERR(kaddr); goto out_unlock; @@ -73,7 +72,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63); vi->z_tailextent_headlcn = 0; - goto unmap_done; + goto done; } vi->z_advise = le16_to_cpu(h->h_advise); vi->z_algorithmtype[0] = h->h_algorithmtype & 15; @@ -85,7 +84,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", headnr + 1, vi->z_algorithmtype[headnr], vi->nid); err = -EOPNOTSUPP; - goto unmap_done; + goto out_put_metabuf; } vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); @@ -95,7 +94,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", vi->nid); err = -EFSCORRUPTED; - goto unmap_done; + goto out_put_metabuf; } if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION && !(vi->z_advise & 
Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ @@ -103,12 +102,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", vi->nid); err = -EFSCORRUPTED; - goto unmap_done; + goto out_put_metabuf; } -unmap_done: - erofs_put_metabuf(&buf); - if (err) - goto out_unlock; if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { struct erofs_map_blocks map = { @@ -127,7 +122,7 @@ unmap_done: err = -EFSCORRUPTED; } if (err < 0) - goto out_unlock; + goto out_put_metabuf; } if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && @@ -141,11 +136,14 @@ unmap_done: EROFS_GET_BLOCKS_FINDTAIL); erofs_put_metabuf(&map.buf); if (err < 0) - goto out_unlock; + goto out_put_metabuf; } +done: /* paired with smp_mb() at the beginning of the function */ smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); +out_put_metabuf: + erofs_put_metabuf(&buf); out_unlock: clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); return err; -- GitLab From 63bbb85658ea43dd35dbfde6d4150b47c407fc87 Mon Sep 17 00:00:00 2001 From: Gao Xiang <hsiangkao@linux.alibaba.com> Date: Wed, 12 Oct 2022 12:50:56 +0800 Subject: [PATCH 2029/2223] erofs: shouldn't churn the mapping page for duplicated copies If other duplicated copies exist in one decompression shot, should leave the old page as is rather than replace it with the new duplicated one. Otherwise, the following cold path to deal with duplicated copies will use the invalid bvec. It impacts compressed data deduplication. Also, shift the onlinepage EIO bit to avoid touching the signed bit. 
Fixes: 267f2492c8f7 ("erofs: introduce multi-reference pclusters (fully-referenced)") Reviewed-by: Chao Yu <chao@kernel.org> Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com> Link: https://lore.kernel.org/r/20221012045056.13421-1-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 8 +++----- fs/erofs/zdata.h | 6 +++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 559380a535aff..4553be650968b 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -888,15 +888,13 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) { unsigned int pgnr; - struct page *oldpage; pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; DBG_BUGON(pgnr >= be->nr_pages); - oldpage = be->decompressed_pages[pgnr]; - be->decompressed_pages[pgnr] = bvec->page; - - if (!oldpage) + if (!be->decompressed_pages[pgnr]) { + be->decompressed_pages[pgnr] = bvec->page; return; + } } /* (cold path) one pcluster is requested multiple times */ diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index e7f04c4fbb81c..d98c952129852 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -126,10 +126,10 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) } /* - * bit 31: I/O error occurred on this page - * bit 0 - 30: remaining parts to complete this page + * bit 30: I/O error occurred on this page + * bit 0 - 29: remaining parts to complete this page */ -#define Z_EROFS_PAGE_EIO (1 << 31) +#define Z_EROFS_PAGE_EIO (1 << 30) static inline void z_erofs_onlinepage_init(struct page *page) { -- GitLab From e7933278b442f97809b1ea84264586302bd08a03 Mon Sep 17 00:00:00 2001 From: Gao Xiang <hsiangkao@linux.alibaba.com> Date: Fri, 14 Oct 2022 14:49:15 +0800 Subject: [PATCH 2030/2223] erofs: fix up inplace decompression success rate Partial decompression should be checked after updating length. 
It's a new regression when introducing multi-reference pclusters. Fixes: 2bfab9c0edac ("erofs: record the longest decompressed size in this round") Reviewed-by: Chao Yu <chao@kernel.org> Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com> Link: https://lore.kernel.org/r/20221014064915.8103-1-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 4553be650968b..c7f24fc7efd59 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -813,15 +813,14 @@ retry: ++spiltted; if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) fe->pcl->multibases = true; - - if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && - !(map->m_flags & EROFS_MAP_PARTIAL_REF) && - fe->pcl->length == map->m_llen) - fe->pcl->partial = false; if (fe->pcl->length < offset + end - map->m_la) { fe->pcl->length = offset + end - map->m_la; fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; } + if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && + !(map->m_flags & EROFS_MAP_PARTIAL_REF) && + fe->pcl->length == map->m_llen) + fe->pcl->partial = false; next_part: /* shorten the remaining extent to update progress */ map->m_llen = offset + cur - map->m_la; -- GitLab From 38eddb2c75fb99b9cd78445094ca0e1bda08d102 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov <asml.silence@gmail.com> Date: Sun, 16 Oct 2022 21:30:48 +0100 Subject: [PATCH 2031/2223] io_uring: remove FFS_SCM The lifetime of SCM'ed files is bound to ring_sock, which is destroyed strictly after we're done with registered file tables. This means there is no need for the FFS_SCM hack, which was not available on 32-bit builds anyway.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/984226a1045adf42dc35d8bd7fb5a8bbfa472ce1.1665891182.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/filetable.h | 15 +-------------- io_uring/io_uring.c | 2 -- io_uring/rsrc.c | 7 ++----- io_uring/rsrc.h | 4 ---- 4 files changed, 3 insertions(+), 25 deletions(-) diff --git a/io_uring/filetable.h b/io_uring/filetable.h index ff3a712e11bf3..19d2aed66c72e 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -5,22 +5,9 @@ #include <linux/file.h> #include <linux/io_uring_types.h> -/* - * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 - * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we - * can't safely always dereference the file when the task has exited and ring - * cleanup is done. If a file is tracked and part of SCM, then unix gc on - * process exit may reap it before __io_sqe_files_unregister() is run. 
- */ #define FFS_NOWAIT 0x1UL #define FFS_ISREG 0x2UL -#if defined(CONFIG_64BIT) -#define FFS_SCM 0x4UL -#else -#define IO_URING_SCM_ALL -#define FFS_SCM 0x0UL -#endif -#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index de08d9902b30b..18aa39709faec 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1587,8 +1587,6 @@ unsigned int io_file_get_flags(struct file *file) res |= FFS_ISREG; if (__io_file_supports_nowait(file, mode)) res |= FFS_NOWAIT; - if (io_file_need_scm(file)) - res |= FFS_SCM; return res; } diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 012fdb04ec238..55d4ab96fb925 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -757,20 +757,17 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { -#if !defined(IO_URING_SCM_ALL) int i; for (i = 0; i < ctx->nr_user_files; i++) { struct file *file = io_file_from_index(&ctx->file_table, i); - if (!file) - continue; - if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) + /* skip scm accounted files, they'll be freed by ->ring_sock */ + if (!file || io_file_need_scm(file)) continue; io_file_bitmap_clear(&ctx->file_table, i); fput(file); } -#endif #if defined(CONFIG_UNIX) if (ctx->ring_sock) { diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 9bce15665444e..81445a477622b 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -82,11 +82,7 @@ int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file); #if defined(CONFIG_UNIX) static inline bool io_file_need_scm(struct file *filp) { -#if defined(IO_URING_SCM_ALL) - return true; -#else return !!unix_get_socket(filp); -#endif } #else static inline bool io_file_need_scm(struct file *filp) -- GitLab From 
4d5059512d283dab7372d282c2fbd43c7f5a2456 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov <asml.silence@gmail.com> Date: Sun, 16 Oct 2022 21:30:49 +0100 Subject: [PATCH 2032/2223] io_uring: kill hot path fixed file bitmap debug checks We test file_table.bitmap in io_file_get_fixed() to check invariants, don't do it, it's expensive and was showing up in profiles. No reports of this triggering have come in. Move the check to the file clear instead, which will still catch any wrong usage. Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/cf77f2ded68d2e5b2bc7355784d969837d48e023.1665891182.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/filetable.h | 1 + io_uring/io_uring.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 19d2aed66c72e..351111ff88827 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -25,6 +25,7 @@ unsigned int io_file_get_flags(struct file *file); static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) { + WARN_ON_ONCE(!test_bit(bit, table->bitmap)); __clear_bit(bit, table->bitmap); table->alloc_hint = bit; } diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 18aa39709faec..6e50f548de1a7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1858,7 +1858,6 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, /* mask in overlapping REQ_F and FFS bits */ req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); io_req_set_rsrc_node(req, ctx, 0); - WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap)); out: io_ring_submit_unlock(ctx, issue_flags); return file; -- GitLab From 34f0bc427e94065e7f828e70690f8fe1e01b3a9d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov <asml.silence@gmail.com> Date: Sun, 16 Oct 2022 21:30:50 +0100 Subject: [PATCH 2033/2223] io_uring: reuse io_alloc_req() Don't duplicate io_alloc_req() in io_req_caches_free() but reuse the
helper. Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/6005fc88274864a49fc3096c22d8bdd605cf8576.1665891182.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/io_uring.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6e50f548de1a7..62be51fbf39c4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2560,18 +2560,14 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) static void io_req_caches_free(struct io_ring_ctx *ctx) { - struct io_submit_state *state = &ctx->submit_state; int nr = 0; mutex_lock(&ctx->uring_lock); - io_flush_cached_locked_reqs(ctx, state); + io_flush_cached_locked_reqs(ctx, &ctx->submit_state); while (!io_req_cache_empty(ctx)) { - struct io_wq_work_node *node; - struct io_kiocb *req; + struct io_kiocb *req = io_alloc_req(ctx); - node = wq_stack_extract(&state->free_list); - req = container_of(node, struct io_kiocb, comp_list); kmem_cache_free(req_cachep, req); nr++; } -- GitLab From 02bac94bd8efd75f615ac7515dd2def75b43e5b9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov <asml.silence@gmail.com> Date: Sun, 16 Oct 2022 21:30:51 +0100 Subject: [PATCH 2034/2223] io_uring: don't iopoll from io_ring_ctx_wait_and_kill() We should not be completing requests from a task context that has already undergone io_uring cancellations, i.e. __io_uring_cancel(), as there are some assumptions, e.g. around cached task refs draining. Remove iopolling from io_ring_ctx_wait_and_kill() as it can be called later after PF_EXITING is set with the last task_work run. 
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/7c03cc91455c4a1af49c6b9cbda4e57ea467aa11.1665891182.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/io_uring.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 62be51fbf39c4..6cc16e39b27f0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2804,15 +2804,12 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_poll_remove_all(ctx, NULL, true); mutex_unlock(&ctx->uring_lock); - /* failed during ring init, it couldn't have issued any requests */ - if (ctx->rings) { + /* + * If we failed setting up the ctx, we might not have any rings + * and therefore did not submit any requests + */ + if (ctx->rings) io_kill_timeouts(ctx, NULL, true); - /* if we failed setting up the ctx, we might not have any rings */ - io_iopoll_try_reap_events(ctx); - /* drop cached put refs after potentially doing completions */ - if (current->io_uring) - io_uring_drop_tctx_refs(current); - } INIT_WORK(&ctx->exit_work, io_ring_exit_work); /* -- GitLab From 76dd298094f484c6250ebd076fa53287477b2328 Mon Sep 17 00:00:00 2001 From: Yu Kuai <yukuai3@huawei.com> Date: Tue, 11 Oct 2022 22:22:53 +0800 Subject: [PATCH 2035/2223] blk-mq: fix null pointer dereference in blk_mq_clear_rq_mapping() Our syzkaller report a null pointer dereference, root cause is following: __blk_mq_alloc_map_and_rqs set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs blk_mq_alloc_map_and_rqs blk_mq_alloc_rqs // failed due to oom alloc_pages_node // set->tags[hctx_idx] is still NULL blk_mq_free_rqs drv_tags = set->tags[hctx_idx]; // null pointer dereference is triggered blk_mq_clear_rq_mapping(drv_tags, ...) 
This is because commit 63064be150e4 ("blk-mq: Add blk_mq_alloc_map_and_rqs()") merged the two steps: 1) set->tags[hctx_idx] = blk_mq_alloc_rq_map() 2) blk_mq_alloc_rqs(..., set->tags[hctx_idx]) into one step: set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs() Since tags is not initialized yet in this case, fix the problem by checking if tags is NULL pointer in blk_mq_clear_rq_mapping(). Fixes: 63064be150e4 ("blk-mq: Add blk_mq_alloc_map_and_rqs()") Signed-off-by: Yu Kuai <yukuai3@huawei.com> Reviewed-by: John Garry <john.garry@huawei.com> Link: https://lore.kernel.org/r/20221011142253.4015966-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 8070b6c10e8d5..33292c01875d5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3112,8 +3112,11 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, struct page *page; unsigned long flags; - /* There is no need to clear a driver tags own mapping */ - if (drv_tags == tags) + /* + * There is no need to clear mapping if driver tags is not initialized + * or the mapping belongs to the driver tags. + */ + if (!drv_tags || drv_tags == tags) return; list_for_each_entry(page, &tags->page_list, lru) { -- GitLab From 5c61795ea97c170347c5c4af0c159bd877b8af71 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Sun, 16 Oct 2022 17:24:10 -0600 Subject: [PATCH 2036/2223] io_uring/rw: remove leftover debug statement This debug statement was never meant to go into the upstream release, kill it off before it ends up in a release. It was just part of the testing for the initial version of the patch. 
Fixes: 2ec33a6c3cca ("io_uring/rw: ensure kiocb_end_write() is always called") Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/rw.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 100de2626e478..bb47cc4da713c 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -242,8 +242,6 @@ static void io_req_io_end(struct io_kiocb *req) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - WARN_ON(!in_task()); - if (rw->kiocb.ki_flags & IOCB_WRITE) { kiocb_end_write(req); fsnotify_modify(req->file); -- GitLab From 979556f1521a835a059de3b117b9c6c6642c7d58 Mon Sep 17 00:00:00 2001 From: Alexander Stein <alexander.stein@ew.tq-group.com> Date: Wed, 12 Oct 2022 15:11:05 +0200 Subject: [PATCH 2037/2223] ata: ahci-imx: Fix MODULE_ALIAS 'ahci:' is an invalid prefix, preventing the module from autoloading. Fix this by using the 'platform:' prefix and DRV_NAME. Fixes: 9e54eae23bc9 ("ahci_imx: add ahci sata support on imx platforms") Cc: stable@vger.kernel.org Signed-off-by: Alexander Stein <alexander.stein@ew.tq-group.com> Reviewed-by: Fabio Estevam <festevam@gmail.com> Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com> --- drivers/ata/ahci_imx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/ahci_imx.c b/drivers/ata/ahci_imx.c index b734e069034d2..632caa3014587 100644 --- a/drivers/ata/ahci_imx.c +++ b/drivers/ata/ahci_imx.c @@ -1235,4 +1235,4 @@ module_platform_driver(imx_ahci_driver); MODULE_DESCRIPTION("Freescale i.MX AHCI SATA platform driver"); MODULE_AUTHOR("Richard Zhu <Hong-Xing.Zhu@freescale.com>"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("ahci:imx"); +MODULE_ALIAS("platform:" DRV_NAME); -- GitLab From 1e41e693f458eef2d5728207dbd327cd3b16580a Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng <kai.heng.feng@canonical.com> Date: Tue, 11 Oct 2022 10:46:17 +0800 Subject: [PATCH 2038/2223] ata: ahci: Match EM_MAX_SLOTS with SATA_PMP_MAX_PORTS UBSAN complains about array-index-out-of-bounds: [ 
1.980703] kernel: UBSAN: array-index-out-of-bounds in /build/linux-9H675w/linux-5.15.0/drivers/ata/libahci.c:968:41 [ 1.980709] kernel: index 15 is out of range for type 'ahci_em_priv [8]' [ 1.980713] kernel: CPU: 0 PID: 209 Comm: scsi_eh_8 Not tainted 5.15.0-25-generic #25-Ubuntu [ 1.980716] kernel: Hardware name: System manufacturer System Product Name/P5Q3, BIOS 1102 06/11/2010 [ 1.980718] kernel: Call Trace: [ 1.980721] kernel: <TASK> [ 1.980723] kernel: show_stack+0x52/0x58 [ 1.980729] kernel: dump_stack_lvl+0x4a/0x5f [ 1.980734] kernel: dump_stack+0x10/0x12 [ 1.980736] kernel: ubsan_epilogue+0x9/0x45 [ 1.980739] kernel: __ubsan_handle_out_of_bounds.cold+0x44/0x49 [ 1.980742] kernel: ahci_qc_issue+0x166/0x170 [libahci] [ 1.980748] kernel: ata_qc_issue+0x135/0x240 [ 1.980752] kernel: ata_exec_internal_sg+0x2c4/0x580 [ 1.980754] kernel: ? vprintk_default+0x1d/0x20 [ 1.980759] kernel: ata_exec_internal+0x67/0xa0 [ 1.980762] kernel: sata_pmp_read+0x8d/0xc0 [ 1.980765] kernel: sata_pmp_read_gscr+0x3c/0x90 [ 1.980768] kernel: sata_pmp_attach+0x8b/0x310 [ 1.980771] kernel: ata_eh_revalidate_and_attach+0x28c/0x4b0 [ 1.980775] kernel: ata_eh_recover+0x6b6/0xb30 [ 1.980778] kernel: ? ahci_do_hardreset+0x180/0x180 [libahci] [ 1.980783] kernel: ? ahci_stop_engine+0xb0/0xb0 [libahci] [ 1.980787] kernel: ? ahci_do_softreset+0x290/0x290 [libahci] [ 1.980792] kernel: ? trace_event_raw_event_ata_eh_link_autopsy_qc+0xe0/0xe0 [ 1.980795] kernel: sata_pmp_eh_recover.isra.0+0x214/0x560 [ 1.980799] kernel: sata_pmp_error_handler+0x23/0x40 [ 1.980802] kernel: ahci_error_handler+0x43/0x80 [libahci] [ 1.980806] kernel: ata_scsi_port_error_handler+0x2b1/0x600 [ 1.980810] kernel: ata_scsi_error+0x9c/0xd0 [ 1.980813] kernel: scsi_error_handler+0xa1/0x180 [ 1.980817] kernel: ? scsi_unjam_host+0x1c0/0x1c0 [ 1.980820] kernel: kthread+0x12a/0x150 [ 1.980823] kernel: ? 
set_kthread_struct+0x50/0x50 [ 1.980826] kernel: ret_from_fork+0x22/0x30 [ 1.980831] kernel: </TASK> This happens because sata_pmp_init_links() initialize link->pmp up to SATA_PMP_MAX_PORTS while em_priv is declared as 8 elements array. I can't find the maximum Enclosure Management ports specified in AHCI spec v1.3.1, but "12.2.1 LED message type" states that "Port Multiplier Information" can utilize 4 bits, which implies it can support up to 16 ports. Hence, use SATA_PMP_MAX_PORTS as EM_MAX_SLOTS to resolve the issue. BugLink: https://bugs.launchpad.net/bugs/1970074 Cc: stable@vger.kernel.org Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com> Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com> --- drivers/ata/ahci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index da7ee8bec165a..7add8e79912b1 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -257,7 +257,7 @@ enum { PCS_7 = 0x94, /* 7+ port PCS (Denverton) */ /* em constants */ - EM_MAX_SLOTS = 8, + EM_MAX_SLOTS = SATA_PMP_MAX_PORTS, EM_MAX_RETRY = 5, /* em_ctl bits */ -- GitLab From ce4b815686573bef82d5ee53bf6f509bf20904dc Mon Sep 17 00:00:00 2001 From: Dawei Li <set_pte_at@outlook.com> Date: Mon, 17 Oct 2022 09:55:53 +0800 Subject: [PATCH 2039/2223] erofs: protect s_inodes with s_inode_list_lock for fscache s_inodes is superblock-specific resource, which should be protected by sb's specific lock s_inode_list_lock. 
Link: https://lore.kernel.org/r/TYCP286MB23238380DE3B74874E8D78ABCA299@TYCP286MB2323.JPNP286.PROD.OUTLOOK.COM Fixes: 7d41963759fe ("erofs: Support sharing cookies in the same domain") Reviewed-by: Yue Hu <huyue2@coolpad.com> Reviewed-by: Jia Zhu <zhujia.zj@bytedance.com> Reviewed-by: Jingbo Xu <jefflexu@linux.alibaba.com> Signed-off-by: Dawei Li <set_pte_at@outlook.com> Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com> --- fs/erofs/fscache.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 998cd26a1b3b1..fe05bc51f9f2f 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -590,14 +590,17 @@ struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, struct super_block *psb = erofs_pseudo_mnt->mnt_sb; mutex_lock(&erofs_domain_cookies_lock); + spin_lock(&psb->s_inode_list_lock); list_for_each_entry(inode, &psb->s_inodes, i_sb_list) { ctx = inode->i_private; if (!ctx || ctx->domain != domain || strcmp(ctx->name, name)) continue; igrab(inode); + spin_unlock(&psb->s_inode_list_lock); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } + spin_unlock(&psb->s_inode_list_lock); ctx = erofs_fscache_domain_init_cookie(sb, name, need_inode); mutex_unlock(&erofs_domain_cookies_lock); return ctx; -- GitLab From b3d0d98179d62f9d55635a600679c4fa362baf8d Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Mon, 17 Oct 2022 11:51:54 +0800 Subject: [PATCH 2040/2223] net: ethernet: mtk_eth_soc: fix possible memory leak in mtk_probe() If mtk_wed_add_hw() has been called, mtk_wed_exit() needs be called in error path or removing module to free the memory allocated in mtk_wed_add_hw(). Fixes: 804775dfc288 ("net: ethernet: mtk_eth_soc: add support for Wireless Ethernet Dispatch (WED)") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 4fba7cb0144ba..7cd381530aa4a 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -4060,19 +4060,23 @@ static int mtk_probe(struct platform_device *pdev) eth->irq[i] = platform_get_irq(pdev, i); if (eth->irq[i] < 0) { dev_err(&pdev->dev, "no IRQ%d resource found\n", i); - return -ENXIO; + err = -ENXIO; + goto err_wed_exit; } } for (i = 0; i < ARRAY_SIZE(eth->clks); i++) { eth->clks[i] = devm_clk_get(eth->dev, mtk_clks_source_name[i]); if (IS_ERR(eth->clks[i])) { - if (PTR_ERR(eth->clks[i]) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(eth->clks[i]) == -EPROBE_DEFER) { + err = -EPROBE_DEFER; + goto err_wed_exit; + } if (eth->soc->required_clks & BIT(i)) { dev_err(&pdev->dev, "clock %s not found\n", mtk_clks_source_name[i]); - return -EINVAL; + err = -EINVAL; + goto err_wed_exit; } eth->clks[i] = NULL; } @@ -4083,7 +4087,7 @@ static int mtk_probe(struct platform_device *pdev) err = mtk_hw_init(eth); if (err) - return err; + goto err_wed_exit; eth->hwlro = MTK_HAS_CAPS(eth->soc->caps, MTK_HWLRO); @@ -4179,6 +4183,8 @@ err_free_dev: mtk_free_dev(eth); err_deinit_hw: mtk_hw_deinit(eth); +err_wed_exit: + mtk_wed_exit(); return err; } @@ -4198,6 +4204,7 @@ static int mtk_remove(struct platform_device *pdev) phylink_disconnect_phy(mac->phylink); } + mtk_wed_exit(); mtk_hw_deinit(eth); netif_napi_del(&eth->tx_napi); -- GitLab From 9d4f20a476ca57e4c9246eb1fa2a61bea2354720 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Mon, 17 Oct 2022 11:51:55 +0800 Subject: [PATCH 2041/2223] net: ethernet: mtk_eth_wed: add missing put_device() in mtk_wed_add_hw() After calling get_device() in mtk_wed_add_hw(), in error path, put_device() needs
to be called. Fixes: 804775dfc288 ("net: ethernet: mtk_eth_soc: add support for Wireless Ethernet Dispatch (WED)") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/mediatek/mtk_wed.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index 099b6e0df619a..09bbd05bd83c1 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -1077,11 +1077,11 @@ void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth, get_device(&pdev->dev); irq = platform_get_irq(pdev, 0); if (irq < 0) - return; + goto err_put_device; regs = syscon_regmap_lookup_by_phandle(np, NULL); if (IS_ERR(regs)) - return; + goto err_put_device; rcu_assign_pointer(mtk_soc_wed_ops, &wed_ops); @@ -1124,8 +1124,14 @@ void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth, hw_list[index] = hw; + mutex_unlock(&hw_lock); + + return; + unlock: mutex_unlock(&hw_lock); +err_put_device: + put_device(&pdev->dev); } void mtk_wed_exit(void) -- GitLab From e0bb4659e235770e6f53b3692e958591f49448f5 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Mon, 17 Oct 2022 11:51:56 +0800 Subject: [PATCH 2042/2223] net: ethernet: mtk_eth_wed: add missing of_node_put() The device_node pointer returned by of_parse_phandle() has its refcount incremented; when we finish using it, the refcount needs to be decreased. Fixes: 804775dfc288 ("net: ethernet: mtk_eth_soc: add support for Wireless Ethernet Dispatch (WED)") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: David S.
Miller <davem@davemloft.net> --- drivers/net/ethernet/mediatek/mtk_wed.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index 09bbd05bd83c1..65e01bf4b4d22 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -1072,7 +1072,7 @@ void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth, pdev = of_find_device_by_node(np); if (!pdev) - return; + goto err_of_node_put; get_device(&pdev->dev); irq = platform_get_irq(pdev, 0); @@ -1132,6 +1132,8 @@ unlock: mutex_unlock(&hw_lock); err_put_device: put_device(&pdev->dev); +err_of_node_put: + of_node_put(np); } void mtk_wed_exit(void) @@ -1152,6 +1154,7 @@ void mtk_wed_exit(void) hw_list[i] = NULL; debugfs_remove(hw->debugfs_dir); put_device(hw->dev); + of_node_put(hw->node); kfree(hw); } } -- GitLab From 402fe7a5728789f3a3998d2823b7a110f4cd924e Mon Sep 17 00:00:00 2001 From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com> Date: Mon, 17 Oct 2022 14:49:20 +0800 Subject: [PATCH 2043/2223] net: ethernet: mediatek: ppe: Remove the unused function mtk_foe_entry_usable() The function mtk_foe_entry_usable() is defined in the mtk_ppe.c file, but not called elsewhere, so delete this unused function. drivers/net/ethernet/mediatek/mtk_ppe.c:400:20: warning: unused function 'mtk_foe_entry_usable'. Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2409 Reported-by: Abaci Robot <abaci@linux.alibaba.com> Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- drivers/net/ethernet/mediatek/mtk_ppe.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.c b/drivers/net/ethernet/mediatek/mtk_ppe.c index ae00e572390d7..2d8ca99f2467f 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe.c @@ -397,12 +397,6 @@ int mtk_foe_entry_set_wdma(struct mtk_eth *eth, struct mtk_foe_entry *entry, return 0; } -static inline bool mtk_foe_entry_usable(struct mtk_foe_entry *entry) -{ - return !(entry->ib1 & MTK_FOE_IB1_STATIC) && - FIELD_GET(MTK_FOE_IB1_STATE, entry->ib1) != MTK_FOE_STATE_BIND; -} - static bool mtk_flow_entry_match(struct mtk_eth *eth, struct mtk_flow_entry *entry, struct mtk_foe_entry *data) -- GitLab From 17cc1ee6e83b16989118237294327bd0dd12b1a4 Mon Sep 17 00:00:00 2001 From: Damien Le Moal <damien.lemoal@opensource.wdc.com> Date: Thu, 13 Oct 2022 17:16:10 +0900 Subject: [PATCH 2044/2223] ata: ahci_st: Fix compilation warning If CONFIG_OF is disabled and the ahci_st driver is builtin (or CONFIG_MODULES is disabled), then using the macro of_match_ptr() results in the st_ahci_match variable being unused, which generates a compilation warning and a compilation error if CONFIG_WERROR is enabled. Fix this by directly assigning st_ahci_match to .of_match_table in the st_ahci_driver platform driver definition. 
Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com> Acked-by: Arnd Bergmann <arnd@arndb.de> --- drivers/ata/ahci_st.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/ahci_st.c b/drivers/ata/ahci_st.c index 5a2cac60a29ad..8607b68eee532 100644 --- a/drivers/ata/ahci_st.c +++ b/drivers/ata/ahci_st.c @@ -236,7 +236,7 @@ static struct platform_driver st_ahci_driver = { .driver = { .name = DRV_NAME, .pm = &st_ahci_pm_ops, - .of_match_table = of_match_ptr(st_ahci_match), + .of_match_table = st_ahci_match, }, .probe = st_ahci_probe, .remove = ata_platform_remove_one, -- GitLab From c32d7cab57e3a77af8ecc17cde7a5761a26483b8 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" <chang.seok.bae@intel.com> Date: Wed, 24 Aug 2022 12:12:21 -0700 Subject: [PATCH 2045/2223] x86/fpu: Configure init_fpstate attributes orderly The init_fpstate setup code is spread out and out of order. The init image is recorded before its scoped features and the buffer size are determined. Determine the scope of init_fpstate components and its size before recording the init state. Also move the relevant code together. Signed-off-by: Chang S. 
Bae <chang.seok.bae@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: neelnatu@google.com Link: https://lore.kernel.org/r/20220824191223.1248-2-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/init.c | 8 -------- arch/x86/kernel/fpu/xstate.c | 6 +++++- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 621f4b6cac4a3..8946f89761cc3 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -210,13 +210,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) fpstate_reset(&current->thread.fpu); } -static void __init fpu__init_init_fpstate(void) -{ - /* Bring init_fpstate size and features up to date */ - init_fpstate.size = fpu_kernel_cfg.max_size; - init_fpstate.xfeatures = fpu_kernel_cfg.max_features; -} - /* * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: @@ -236,5 +229,4 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(fpu_kernel_cfg.max_size); fpu__init_task_struct_size(); - fpu__init_init_fpstate(); } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c8340156bfd2a..f0ce10620ab04 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -360,7 +360,7 @@ static void __init setup_init_fpu_buf(void) print_xstate_features(); - xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features); + xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures); /* * Init all the features state with header.xfeatures being 0x0 @@ -875,6 +875,10 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) update_regset_xstate_info(fpu_user_cfg.max_size, fpu_user_cfg.max_features); + /* Bring init_fpstate size and features up to date */ + init_fpstate.size = fpu_kernel_cfg.max_size; + init_fpstate.xfeatures = fpu_kernel_cfg.max_features; +
setup_init_fpu_buf(); /* -- GitLab From d3e021adac7c51a26d9ede167c789fcc1b878467 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" <chang.seok.bae@intel.com> Date: Wed, 24 Aug 2022 12:12:22 -0700 Subject: [PATCH 2046/2223] x86/fpu: Fix the init_fpstate size check with the actual size The init_fpstate buffer is statically allocated. Thus, the sanity test was established to check whether the pre-allocated buffer is enough for the calculated size or not. The currently measured size is not strictly relevant. Fix to validate the calculated init_fpstate size with the pre-allocated area. Also, replace the sanity check function with open code for clarity. The abstraction itself and the function naming do not tend to represent simply what it does. Fixes: 2ae996e0c1a3 ("x86/fpu: Calculate the default sizes independently") Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Link: https://lore.kernel.org/r/20220824191223.1248-3-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f0ce10620ab04..f5ef78633b4c8 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -678,20 +678,6 @@ static unsigned int __init get_xsave_size_user(void) return ebx; } -/* - * Will the runtime-enumerated 'xstate_size' fit in the init - * task's statically-allocated buffer? 
- */ -static bool __init is_supported_xstate_size(unsigned int test_xstate_size) -{ - if (test_xstate_size <= sizeof(init_fpstate.regs)) - return true; - - pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", - sizeof(init_fpstate.regs), test_xstate_size); - return false; -} - static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ @@ -717,10 +703,6 @@ static int __init init_xstate_size(void) kernel_default_size = xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); - /* Ensure we have the space to store all default enabled features. */ - if (!is_supported_xstate_size(kernel_default_size)) - return -EINVAL; - if (!paranoid_xstate_size_valid(kernel_size)) return -EINVAL; @@ -879,6 +861,12 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) init_fpstate.size = fpu_kernel_cfg.max_size; init_fpstate.xfeatures = fpu_kernel_cfg.max_features; + if (init_fpstate.size > sizeof(init_fpstate.regs)) { + pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", + sizeof(init_fpstate.regs), init_fpstate.size); + goto out_disable; + } + setup_init_fpu_buf(); /* -- GitLab From a401f45e38754953c9d402f8b3bc965707eecc91 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" <chang.seok.bae@intel.com> Date: Wed, 24 Aug 2022 12:12:23 -0700 Subject: [PATCH 2047/2223] x86/fpu: Exclude dynamic states from init_fpstate == Background == The XSTATE init code initializes all enabled and supported components. Then, the init states are saved in the init_fpstate buffer that is statically allocated in about one page. The AMX TILE_DATA state is large (8KB) but its init state is zero. And the feature comes only with the compacted format with these established dependencies: AMX->XFD->XSAVES. So this state is excludable from init_fpstate. == Problem == But the buffer is formatted to include that large state. Then, this can be the cause of a noisy splat like the below. 
This came from XRSTORS for the task with init_fpstate in its XSAVE buffer. It is reproducible on AMX systems when the running kernel is built with CONFIG_DEBUG_PAGEALLOC=y and CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT=y: Bad FPU state detected at restore_fpregs_from_fpstate+0x57/0xd0, reinitializing FPU registers. ... RIP: 0010:restore_fpregs_from_fpstate+0x57/0xd0 ? restore_fpregs_from_fpstate+0x45/0xd0 switch_fpu_return+0x4e/0xe0 exit_to_user_mode_prepare+0x17b/0x1b0 syscall_exit_to_user_mode+0x29/0x40 do_syscall_64+0x67/0x80 ? do_syscall_64+0x67/0x80 ? exc_page_fault+0x86/0x180 entry_SYSCALL_64_after_hwframe+0x63/0xcd == Solution == Adjust init_fpstate to exclude dynamic states. XRSTORS from init_fpstate still initializes those states when their bits are set in the requested-feature bitmap. Fixes: 2308ee57d93d ("x86/fpu/amx: Enable the AMX feature in 64-bit mode") Reported-by: Lin X Wang <lin.x.wang@intel.com> Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Tested-by: Lin X Wang <lin.x.wang@intel.com> Link: https://lore.kernel.org/r/20220824191223.1248-4-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f5ef78633b4c8..e77cabfa802ff 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -857,9 +857,12 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) update_regset_xstate_info(fpu_user_cfg.max_size, fpu_user_cfg.max_features); - /* Bring init_fpstate size and features up to date */ - init_fpstate.size = fpu_kernel_cfg.max_size; - init_fpstate.xfeatures = fpu_kernel_cfg.max_features; + /* + * init_fpstate excludes dynamic states as they are large but init + * state is zero. 
+ */ + init_fpstate.size = fpu_kernel_cfg.default_size; + init_fpstate.xfeatures = fpu_kernel_cfg.default_features; if (init_fpstate.size > sizeof(init_fpstate.regs)) { pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", -- GitLab From ca6c21327c6af02b7eec31ce4b9a740a18c6c13f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <peterz@infradead.org> Date: Thu, 6 Oct 2022 15:00:39 +0200 Subject: [PATCH 2048/2223] perf: Fix missing SIGTRAPs Marco reported: Due to the implementation of how SIGTRAP are delivered if perf_event_attr::sigtrap is set, we've noticed 3 issues: 1. Missing SIGTRAP due to a race with event_sched_out() (more details below). 2. Hardware PMU events being disabled due to returning 1 from perf_event_overflow(). The only way to re-enable the event is for user space to first "properly" disable the event and then re-enable it. 3. The inability to automatically disable an event after a specified number of overflows via PERF_EVENT_IOC_REFRESH. The worst of the 3 issues is problem (1), which occurs when a pending_disable is "consumed" by a racing event_sched_out(), observed as follows: CPU0 | CPU1 --------------------------------+--------------------------- __perf_event_overflow() | perf_event_disable_inatomic() | pending_disable = CPU0 | ... | _perf_event_enable() | event_function_call() | task_function_call() | /* sends IPI to CPU0 */ <IPI> | ... __perf_event_enable() +--------------------------- ctx_resched() task_ctx_sched_out() ctx_sched_out() group_sched_out() event_sched_out() pending_disable = -1 </IPI> <IRQ-work> perf_pending_event() perf_pending_event_disable() /* Fails to send SIGTRAP because no pending_disable! */ </IRQ-work> In the above case, not only is that particular SIGTRAP missed, but also all future SIGTRAPs because 'event_limit' is not reset back to 1. 
To fix, rework pending delivery of SIGTRAP via IRQ-work by introduction of a separate 'pending_sigtrap', no longer using 'event_limit' and 'pending_disable' for its delivery. Additionally; and different to Marco's proposed patch: - recognise that pending_disable effectively duplicates oncpu for the case where it is set. As such, change the irq_work handler to use ->oncpu to target the event and use pending_* as boolean toggles. - observe that SIGTRAP targets the ctx->task, so the context switch optimization that carries contexts between tasks is invalid. If the irq_work were delayed enough to hit after a context switch the SIGTRAP would be delivered to the wrong task. - observe that if the event gets scheduled out (rotation/migration/context-switch/...) the irq-work would be insufficient to deliver the SIGTRAP when the event gets scheduled back in (the irq-work might still be pending on the old CPU). Therefore have event_sched_out() convert the pending sigtrap into a task_work which will deliver the signal at return_to_user. 
Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events") Reported-by: Dmitry Vyukov <dvyukov@google.com> Debugged-by: Dmitry Vyukov <dvyukov@google.com> Reported-by: Marco Elver <elver@google.com> Debugged-by: Marco Elver <elver@google.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Marco Elver <elver@google.com> Tested-by: Marco Elver <elver@google.com> --- include/linux/perf_event.h | 19 ++++- kernel/events/core.c | 151 +++++++++++++++++++++++++++--------- kernel/events/ring_buffer.c | 2 +- 3 files changed, 129 insertions(+), 43 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 853f64b6c8c2c..0031f7b4d9aba 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -756,11 +756,14 @@ struct perf_event { struct fasync_struct *fasync; /* delayed work for NMIs and such */ - int pending_wakeup; - int pending_kill; - int pending_disable; + unsigned int pending_wakeup; + unsigned int pending_kill; + unsigned int pending_disable; + unsigned int pending_sigtrap; unsigned long pending_addr; /* SIGTRAP */ - struct irq_work pending; + struct irq_work pending_irq; + struct callback_head pending_task; + unsigned int pending_work; atomic_t event_limit; @@ -877,6 +880,14 @@ struct perf_event_context { #endif void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; + + /* + * Sum (event->pending_sigtrap + event->pending_work) + * + * The SIGTRAP is targeted at ctx->task, as such it won't do changing + * that until the signal is delivered. 
+ */ + local_t nr_pending; }; /* diff --git a/kernel/events/core.c b/kernel/events/core.c index aefc1e08e015e..01933db7629c1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -54,6 +54,7 @@ #include <linux/highmem.h> #include <linux/pgtable.h> #include <linux/buildid.h> +#include <linux/task_work.h> #include "internal.h" @@ -2276,11 +2277,26 @@ event_sched_out(struct perf_event *event, event->pmu->del(event, 0); event->oncpu = -1; - if (READ_ONCE(event->pending_disable) >= 0) { - WRITE_ONCE(event->pending_disable, -1); + if (event->pending_disable) { + event->pending_disable = 0; perf_cgroup_event_disable(event, ctx); state = PERF_EVENT_STATE_OFF; } + + if (event->pending_sigtrap) { + bool dec = true; + + event->pending_sigtrap = 0; + if (state != PERF_EVENT_STATE_OFF && + !event->pending_work) { + event->pending_work = 1; + dec = false; + task_work_add(current, &event->pending_task, TWA_RESUME); + } + if (dec) + local_dec(&event->ctx->nr_pending); + } + perf_event_set_state(event, state); if (!is_software_event(event)) @@ -2432,7 +2448,7 @@ static void __perf_event_disable(struct perf_event *event, * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * - * When called from perf_pending_event it's OK because event->ctx + * When called from perf_pending_irq it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. 
*/ @@ -2471,9 +2487,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { - WRITE_ONCE(event->pending_disable, smp_processor_id()); - /* can fail, see perf_pending_event_disable() */ - irq_work_queue(&event->pending); + event->pending_disable = 1; + irq_work_queue(&event->pending_irq); } #define MAX_INTERRUPTS (~0ULL) @@ -3428,11 +3443,23 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { + perf_pmu_disable(pmu); + + /* PMIs are disabled; ctx->nr_pending is stable. */ + if (local_read(&ctx->nr_pending) || + local_read(&next_ctx->nr_pending)) { + /* + * Must not swap out ctx when there's pending + * events that rely on the ctx->task relation. + */ + raw_spin_unlock(&next_ctx->lock); + rcu_read_unlock(); + goto inside_switch; + } + WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - perf_pmu_disable(pmu); - if (cpuctx->sched_cb_usage && pmu->sched_task) pmu->sched_task(ctx, false); @@ -3473,6 +3500,7 @@ unlock: raw_spin_lock(&ctx->lock); perf_pmu_disable(pmu); +inside_switch: if (cpuctx->sched_cb_usage && pmu->sched_task) pmu->sched_task(ctx, false); task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); @@ -4939,7 +4967,7 @@ static void perf_addr_filters_splice(struct perf_event *event, static void _free_event(struct perf_event *event) { - irq_work_sync(&event->pending); + irq_work_sync(&event->pending_irq); unaccount_event(event); @@ -6439,7 +6467,8 @@ static void perf_sigtrap(struct perf_event *event) return; /* - * perf_pending_event() can race with the task exiting. + * Both perf_pending_task() and perf_pending_irq() can race with the + * task exiting. 
*/ if (current->flags & PF_EXITING) return; @@ -6448,23 +6477,33 @@ static void perf_sigtrap(struct perf_event *event) event->attr.type, event->attr.sig_data); } -static void perf_pending_event_disable(struct perf_event *event) +/* + * Deliver the pending work in-event-context or follow the context. + */ +static void __perf_pending_irq(struct perf_event *event) { - int cpu = READ_ONCE(event->pending_disable); + int cpu = READ_ONCE(event->oncpu); + /* + * If the event isn't running; we done. event_sched_out() will have + * taken care of things. + */ if (cpu < 0) return; + /* + * Yay, we hit home and are in the context of the event. + */ if (cpu == smp_processor_id()) { - WRITE_ONCE(event->pending_disable, -1); - - if (event->attr.sigtrap) { + if (event->pending_sigtrap) { + event->pending_sigtrap = 0; perf_sigtrap(event); - atomic_set_release(&event->event_limit, 1); /* rearm event */ - return; + local_dec(&event->ctx->nr_pending); + } + if (event->pending_disable) { + event->pending_disable = 0; + perf_event_disable_local(event); } - - perf_event_disable_local(event); return; } @@ -6484,35 +6523,62 @@ static void perf_pending_event_disable(struct perf_event *event) * irq_work_queue(); // FAILS * * irq_work_run() - * perf_pending_event() + * perf_pending_irq() * * But the event runs on CPU-B and wants disabling there. */ - irq_work_queue_on(&event->pending, cpu); + irq_work_queue_on(&event->pending_irq, cpu); } -static void perf_pending_event(struct irq_work *entry) +static void perf_pending_irq(struct irq_work *entry) { - struct perf_event *event = container_of(entry, struct perf_event, pending); + struct perf_event *event = container_of(entry, struct perf_event, pending_irq); int rctx; - rctx = perf_swevent_get_recursion_context(); /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. 
*/ + rctx = perf_swevent_get_recursion_context(); - perf_pending_event_disable(event); - + /* + * The wakeup isn't bound to the context of the event -- it can happen + * irrespective of where the event is. + */ if (event->pending_wakeup) { event->pending_wakeup = 0; perf_event_wakeup(event); } + __perf_pending_irq(event); + if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } +static void perf_pending_task(struct callback_head *head) +{ + struct perf_event *event = container_of(head, struct perf_event, pending_task); + int rctx; + + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. + */ + preempt_disable_notrace(); + rctx = perf_swevent_get_recursion_context(); + + if (event->pending_work) { + event->pending_work = 0; + perf_sigtrap(event); + local_dec(&event->ctx->nr_pending); + } + + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); + preempt_enable_notrace(); +} + #ifdef CONFIG_GUEST_PERF_EVENTS struct perf_guest_info_callbacks __rcu *perf_guest_cbs; @@ -9212,8 +9278,8 @@ int perf_event_account_interrupt(struct perf_event *event) */ static int __perf_event_overflow(struct perf_event *event, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) { int events = atomic_read(&event->event_limit); int ret = 0; @@ -9236,24 +9302,36 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - event->pending_addr = data->addr; - perf_event_disable_inatomic(event); } + if (event->attr.sigtrap) { + /* + * Should not be able to return to user space without processing + * pending_sigtrap (kernel events can overflow multiple times). 
+ */ + WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel); + if (!event->pending_sigtrap) { + event->pending_sigtrap = 1; + local_inc(&event->ctx->nr_pending); + } + event->pending_addr = data->addr; + irq_work_queue(&event->pending_irq); + } + READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; - irq_work_queue(&event->pending); + irq_work_queue(&event->pending_irq); } return ret; } int perf_event_overflow(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) + struct perf_sample_data *data, + struct pt_regs *regs) { return __perf_event_overflow(event, 1, data, regs); } @@ -11570,8 +11648,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_waitqueue_head(&event->waitq); - event->pending_disable = -1; - init_irq_work(&event->pending, perf_pending_event); + init_irq_work(&event->pending_irq, perf_pending_irq); + init_task_work(&event->pending_task, perf_pending_task); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); @@ -11593,9 +11671,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (parent_event) event->event_caps = parent_event->event_caps; - if (event->attr.sigtrap) - atomic_set(&event->event_limit, 1); - if (task) { event->attach_state = PERF_ATTACH_TASK; /* diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 726132039c388..273a0fe7910a5 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -22,7 +22,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) atomic_set(&handle->rb->poll, EPOLLIN); handle->event->pending_wakeup = 1; - irq_work_queue(&handle->event->pending); + irq_work_queue(&handle->event->pending_irq); } /* -- GitLab From 23488ec66867f7e673b694623a951fb583e464a7 Mon Sep 17 00:00:00 2001 From: Marco Elver <elver@google.com> Date: Tue, 11 Oct 2022 14:45:35 +0200 Subject: [PATCH 2049/2223] 
selftests/perf_events: Add a SIGTRAP stress test with disables Add a SIGTRAP stress test that exercises repeatedly enabling/disabling an event while it concurrently keeps firing. Signed-off-by: Marco Elver <elver@google.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/all/Y0E3uG7jOywn7vy3@elver.google.com/ --- .../selftests/perf_events/sigtrap_threads.c | 35 +++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/perf_events/sigtrap_threads.c b/tools/testing/selftests/perf_events/sigtrap_threads.c index 6d849dc2bee0b..d1d8483ac628d 100644 --- a/tools/testing/selftests/perf_events/sigtrap_threads.c +++ b/tools/testing/selftests/perf_events/sigtrap_threads.c @@ -62,6 +62,8 @@ static struct perf_event_attr make_event_attr(bool enabled, volatile void *addr, .remove_on_exec = 1, /* Required by sigtrap. */ .sigtrap = 1, /* Request synchronous SIGTRAP on event. */ .sig_data = TEST_SIG_DATA(addr, id), + .exclude_kernel = 1, /* To allow */ + .exclude_hv = 1, /* running as !root */ }; return attr; } @@ -93,9 +95,13 @@ static void *test_thread(void *arg) __atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED); iter = ctx.iterate_on; /* read */ - for (i = 0; i < iter - 1; i++) { - __atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED); - ctx.iterate_on = iter; /* idempotent write */ + if (iter >= 0) { + for (i = 0; i < iter - 1; i++) { + __atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED); + ctx.iterate_on = iter; /* idempotent write */ + } + } else { + while (ctx.iterate_on); } return NULL; @@ -208,4 +214,27 @@ TEST_F(sigtrap_threads, signal_stress) EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on, 0)); } +TEST_F(sigtrap_threads, signal_stress_with_disable) +{ + const int target_count = NUM_THREADS * 3000; + int i; + + ctx.iterate_on = -1; + + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0); + 
pthread_barrier_wait(&self->barrier); + while (__atomic_load_n(&ctx.signal_count, __ATOMIC_RELAXED) < target_count) { + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_DISABLE, 0), 0); + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0); + } + ctx.iterate_on = 0; + for (i = 0; i < NUM_THREADS; i++) + ASSERT_EQ(pthread_join(self->threads[i], NULL), 0); + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_DISABLE, 0), 0); + + EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); + EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT); + EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on, 0)); +} + TEST_HARNESS_MAIN -- GitLab From 21da7472a040420f2dc624ffec70291a72c5d6a6 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar <sumanthk@linux.ibm.com> Date: Fri, 7 Oct 2022 10:13:27 +0200 Subject: [PATCH 2050/2223] bpf: Fix sample_flags for bpf_perf_event_output * Raw data is also filled by bpf_perf_event_output. * Add sample_flags to indicate raw data. * This eliminates the segfaults as shown below: Run ./samples/bpf/trace_output BUG pid 9 cookie 1001000000004 sized 4 BUG pid 9 cookie 1001000000004 sized 4 BUG pid 9 cookie 1001000000004 sized 4 Segmentation fault (core dumped) Fixes: 838d9bb62d13 ("perf: Use sample_flags for raw_data") Signed-off-by: Sumanth Korikkar <sumanthk@linux.ibm.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Namhyung Kim <namhyung@kernel.org> Link: https://lkml.kernel.org/r/20221007081327.1047552-1-sumanthk@linux.ibm.com --- kernel/trace/bpf_trace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 49fb9ec8366de..1ed08967fb979 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -687,6 +687,7 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, perf_sample_data_init(sd, 0, 0); sd->raw = &raw; + sd->sample_flags |= PERF_SAMPLE_RAW; err = __bpf_perf_event_output(regs, map, flags, sd); @@ -745,6 
+746,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); sd->raw = &raw; + sd->sample_flags |= PERF_SAMPLE_RAW; ret = __bpf_perf_event_output(regs, map, flags, sd); out: -- GitLab From e705968dd687574b6ca3ebe772683d5642759132 Mon Sep 17 00:00:00 2001 From: Lin Shengwang <linshengwang1@huawei.com> Date: Sat, 8 Oct 2022 10:27:09 +0800 Subject: [PATCH 2051/2223] sched/core: Fix comparison in sched_group_cookie_match() In commit 97886d9dcd86 ("sched: Migration changes for core scheduling"), sched_group_cookie_match() was added to help determine if a cookie matches the core state. However, while it iterates the SMT group, it fails to actually use the RQ for each of the CPUs iterated, use cpu_rq(cpu) instead of rq to fix things. Fixes: 97886d9dcd86 ("sched: Migration changes for core scheduling") Signed-off-by: Lin Shengwang <linshengwang1@huawei.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lkml.kernel.org/r/20221008022709.642-1-linshengwang1@huawei.com --- kernel/sched/sched.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1644242ecd11a..0d08511273695 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1182,6 +1182,14 @@ static inline bool is_migration_disabled(struct task_struct *p) #endif } +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() this_cpu_ptr(&runqueues) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define raw_rq() raw_cpu_ptr(&runqueues) + struct sched_group; #ifdef CONFIG_SCHED_CORE static inline struct cpumask *sched_group_span(struct sched_group *sg); @@ -1269,7 +1277,7 @@ static inline bool sched_group_cookie_match(struct rq *rq, return true; for_each_cpu_and(cpu, sched_group_span(group), p->cpus_ptr) { - if 
(sched_core_cookie_match(rq, p)) + if (sched_core_cookie_match(cpu_rq(cpu), p)) return true; } return false; @@ -1384,14 +1392,6 @@ static inline void update_idle_core(struct rq *rq) static inline void update_idle_core(struct rq *rq) { } #endif -DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); - -#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() this_cpu_ptr(&runqueues) -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#define raw_rq() raw_cpu_ptr(&runqueues) - #ifdef CONFIG_FAIR_GROUP_SCHED static inline struct task_struct *task_of(struct sched_entity *se) { -- GitLab From 8e5bad7dccec2014f24497b57d8a8ee0b752c290 Mon Sep 17 00:00:00 2001 From: Kees Cook <keescook@chromium.org> Date: Fri, 7 Oct 2022 17:07:58 -0700 Subject: [PATCH 2052/2223] sched: Introduce struct balance_callback to avoid CFI mismatches Introduce distinct struct balance_callback instead of performing function pointer casting which will trip CFI. Avoids warnings as found by Clang's future -Wcast-function-type-strict option: In file included from kernel/sched/core.c:84: kernel/sched/sched.h:1755:15: warning: cast from 'void (*)(struct rq *)' to 'void (*)(struct callback_head *)' converts to incompatible function type [-Wcast-function-type-strict] head->func = (void (*)(struct callback_head *))func; ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ No binary differences result from this change. This patch is a cleanup based on Brad Spengler/PaX Team's modifications to sched code in their last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. 
Reported-by: Sami Tolvanen <samitolvanen@google.com> Signed-off-by: Kees Cook <keescook@chromium.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Nathan Chancellor <nathan@kernel.org> Link: https://github.com/ClangBuiltLinux/linux/issues/1724 Link: https://lkml.kernel.org/r/20221008000758.2957718-1-keescook@chromium.org --- kernel/sched/core.c | 24 ++++++++++++------------ kernel/sched/deadline.c | 4 ++-- kernel/sched/rt.c | 4 ++-- kernel/sched/sched.h | 14 ++++++++++---- 4 files changed, 26 insertions(+), 20 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5800b0623ff30..cb2aa2b54c7a4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4823,10 +4823,10 @@ static inline void finish_task(struct task_struct *prev) #ifdef CONFIG_SMP -static void do_balance_callbacks(struct rq *rq, struct callback_head *head) +static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) { void (*func)(struct rq *rq); - struct callback_head *next; + struct balance_callback *next; lockdep_assert_rq_held(rq); @@ -4853,15 +4853,15 @@ static void balance_push(struct rq *rq); * This abuse is tolerated because it places all the unlikely/odd cases behind * a single test, namely: rq->balance_callback == NULL. 
*/ -struct callback_head balance_push_callback = { +struct balance_callback balance_push_callback = { .next = NULL, - .func = (void (*)(struct callback_head *))balance_push, + .func = balance_push, }; -static inline struct callback_head * +static inline struct balance_callback * __splice_balance_callbacks(struct rq *rq, bool split) { - struct callback_head *head = rq->balance_callback; + struct balance_callback *head = rq->balance_callback; if (likely(!head)) return NULL; @@ -4883,7 +4883,7 @@ __splice_balance_callbacks(struct rq *rq, bool split) return head; } -static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) { return __splice_balance_callbacks(rq, true); } @@ -4893,7 +4893,7 @@ static void __balance_callbacks(struct rq *rq) do_balance_callbacks(rq, __splice_balance_callbacks(rq, false)); } -static inline void balance_callbacks(struct rq *rq, struct callback_head *head) +static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) { unsigned long flags; @@ -4910,12 +4910,12 @@ static inline void __balance_callbacks(struct rq *rq) { } -static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) { return NULL; } -static inline void balance_callbacks(struct rq *rq, struct callback_head *head) +static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) { } @@ -6188,7 +6188,7 @@ static void sched_core_balance(struct rq *rq) preempt_enable(); } -static DEFINE_PER_CPU(struct callback_head, core_balance_head); +static DEFINE_PER_CPU(struct balance_callback, core_balance_head); static void queue_core_balance(struct rq *rq) { @@ -7419,7 +7419,7 @@ static int __sched_setscheduler(struct task_struct *p, int oldpolicy = -1, policy = attr->sched_policy; int retval, oldprio, newprio, queued, running; const struct sched_class 
*prev_class; - struct callback_head *head; + struct balance_callback *head; struct rq_flags rf; int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 86dea6a05267d..9ae8f41e3372f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -644,8 +644,8 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) return rq->online && dl_task(prev); } -static DEFINE_PER_CPU(struct callback_head, dl_push_head); -static DEFINE_PER_CPU(struct callback_head, dl_pull_head); +static DEFINE_PER_CPU(struct balance_callback, dl_push_head); +static DEFINE_PER_CPU(struct balance_callback, dl_pull_head); static void push_dl_tasks(struct rq *); static void pull_dl_task(struct rq *); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d869bcf898ccb..ed2a47e4ddaec 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -410,8 +410,8 @@ static inline int has_pushable_tasks(struct rq *rq) return !plist_head_empty(&rq->rt.pushable_tasks); } -static DEFINE_PER_CPU(struct callback_head, rt_push_head); -static DEFINE_PER_CPU(struct callback_head, rt_pull_head); +static DEFINE_PER_CPU(struct balance_callback, rt_push_head); +static DEFINE_PER_CPU(struct balance_callback, rt_pull_head); static void push_rt_tasks(struct rq *); static void pull_rt_task(struct rq *); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0d08511273695..a4a20046e586e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -938,6 +938,12 @@ struct uclamp_rq { DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ +struct rq; +struct balance_callback { + struct balance_callback *next; + void (*func)(struct rq *rq); +}; + /* * This is the main, per-CPU runqueue data structure. 
* @@ -1036,7 +1042,7 @@ struct rq { unsigned long cpu_capacity; unsigned long cpu_capacity_orig; - struct callback_head *balance_callback; + struct balance_callback *balance_callback; unsigned char nohz_idle_balance; unsigned char idle_balance; @@ -1544,7 +1550,7 @@ struct rq_flags { #endif }; -extern struct callback_head balance_push_callback; +extern struct balance_callback balance_push_callback; /* * Lockdep annotation that avoids accidental unlocks; it's like a @@ -1724,7 +1730,7 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) static inline void queue_balance_callback(struct rq *rq, - struct callback_head *head, + struct balance_callback *head, void (*func)(struct rq *rq)) { lockdep_assert_rq_held(rq); @@ -1737,7 +1743,7 @@ queue_balance_callback(struct rq *rq, if (unlikely(head->next || rq->balance_callback == &balance_push_callback)) return; - head->func = (void (*)(struct callback_head *))func; + head->func = func; head->next = rq->balance_callback; rq->balance_callback = head; } -- GitLab From 897a66d281983c4fe2b805f26b315309b35fb028 Mon Sep 17 00:00:00 2001 From: Jon Hunter <jonathanh@nvidia.com> Date: Mon, 17 Oct 2022 09:40:06 +0100 Subject: [PATCH 2053/2223] Revert "PCI: tegra: Use PCI_CONF1_EXT_ADDRESS() macro" This reverts commit 8bb7ff12a91429eb76e093b517ae810b146448fe. Commit 8bb7ff12a914 ("PCI: tegra: Use PCI_CONF1_EXT_ADDRESS() macro") updated the Tegra PCI driver to use the macro PCI_CONF1_EXT_ADDRESS() instead of a local function in the Tegra PCI driver. This broke PCI for some Tegra platforms because, when calculating the offset value, the mask applied to the lower 8-bits changed from 0xff to 0xfc. For now, fix this by reverting this commit. 
Fixes: 8bb7ff12a914 ("PCI: tegra: Use PCI_CONF1_EXT_ADDRESS() macro") Link: https://lore.kernel.org/r/20221017084006.11770-1-jonathanh@nvidia.com Signed-off-by: Jon Hunter <jonathanh@nvidia.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Acked-by: Thierry Reding <treding@nvidia.com> Acked-by: Lorenzo Pieralisi <lpieralisi@kernel.org> --- drivers/pci/controller/pci-tegra.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/pci-tegra.c b/drivers/pci/controller/pci-tegra.c index 24478ae5a345d..8e323e93be915 100644 --- a/drivers/pci/controller/pci-tegra.c +++ b/drivers/pci/controller/pci-tegra.c @@ -415,6 +415,13 @@ static inline u32 pads_readl(struct tegra_pcie *pcie, unsigned long offset) * address (access to which generates correct config transaction) falls in * this 4 KiB region. */ +static unsigned int tegra_pcie_conf_offset(u8 bus, unsigned int devfn, + unsigned int where) +{ + return ((where & 0xf00) << 16) | (bus << 16) | (PCI_SLOT(devfn) << 11) | + (PCI_FUNC(devfn) << 8) | (where & 0xff); +} + static void __iomem *tegra_pcie_map_bus(struct pci_bus *bus, unsigned int devfn, int where) @@ -436,9 +443,7 @@ static void __iomem *tegra_pcie_map_bus(struct pci_bus *bus, unsigned int offset; u32 base; - offset = PCI_CONF1_EXT_ADDRESS(bus->number, PCI_SLOT(devfn), - PCI_FUNC(devfn), where) & - ~PCI_CONF1_ENABLE; + offset = tegra_pcie_conf_offset(bus->number, devfn, where); /* move 4 KiB window to offset within the FPCI region */ base = 0xfe100000 + ((offset & ~(SZ_4K - 1)) >> 8); -- GitLab From 33806e7cb8d50379f55c3e8f335e91e1b359dc7b Mon Sep 17 00:00:00 2001 From: Nathan Chancellor <nathan@kernel.org> Date: Thu, 29 Sep 2022 08:20:10 -0700 Subject: [PATCH 2054/2223] x86/Kconfig: Drop check for -mabi=ms for CONFIG_EFI_STUB A recent change in LLVM made CONFIG_EFI_STUB unselectable because it no longer pretends to support -mabi=ms, breaking the dependency in Kconfig. 
Lack of CONFIG_EFI_STUB can prevent kernels from booting via EFI in certain circumstances. This check was added by 8f24f8c2fc82 ("efi/libstub: Annotate firmware routines as __efiapi") to ensure that __attribute__((ms_abi)) was available, as -mabi=ms is not actually used in any cflags. According to the GCC documentation, this attribute has been supported since GCC 4.4.7. The kernel currently requires GCC 5.1 so this check is not necessary; even when that change landed in 5.6, the kernel required GCC 4.9 so it was unnecessary then as well. Clang supports __attribute__((ms_abi)) for all versions that are supported for building the kernel so no additional check is needed. Remove the 'depends on' line altogether to allow CONFIG_EFI_STUB to be selected when CONFIG_EFI is enabled, regardless of compiler. Fixes: 8f24f8c2fc82 ("efi/libstub: Annotate firmware routines as __efiapi") Signed-off-by: Nathan Chancellor <nathan@kernel.org> Signed-off-by: Borislav Petkov <bp@suse.de> Reviewed-by: Nick Desaulniers <ndesaulniers@google.com> Acked-by: Ard Biesheuvel <ardb@kernel.org> Cc: stable@vger.kernel.org Link: https://github.com/llvm/llvm-project/commit/d1ad006a8f64bdc17f618deffa9e7c91d82c444d --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6d1879ef933a2..67745ceab0dbc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1973,7 +1973,6 @@ config EFI config EFI_STUB bool "EFI stub support" depends on EFI - depends on $(cc-option,-mabi=ms) || X86_32 select RELOCATABLE help This kernel feature allows a bzImage to be loaded directly -- GitLab From 7108b80a542b9d65e44b36d64a700a83658c0b73 Mon Sep 17 00:00:00 2001 From: Zhang Rui <rui.zhang@intel.com> Date: Fri, 14 Oct 2022 17:01:45 +0800 Subject: [PATCH 2055/2223] hwmon/coretemp: Handle large core ID value The coretemp driver supports up to a hard-coded limit of 128 cores. Today, the driver can not support a core with an ID above that limit. 
Yet, the encoding of core ID's is arbitrary (BIOS APIC-ID) and so they may be sparse and they may be large. Update the driver to map arbitrary core ID numbers into appropriate array indexes so that 128 cores can be supported, no matter the encoding of core ID's. Signed-off-by: Zhang Rui <rui.zhang@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Acked-by: Len Brown <len.brown@intel.com> Acked-by: Guenter Roeck <linux@roeck-us.net> Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20221014090147.1836-3-rui.zhang@intel.com --- drivers/hwmon/coretemp.c | 56 +++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index ccf0af5b988a7..8bf32c6c85d95 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -46,9 +46,6 @@ MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius"); #define TOTAL_ATTRS (MAX_CORE_ATTRS + 1) #define MAX_CORE_DATA (NUM_REAL_CORES + BASE_SYSFS_ATTR_NO) -#define TO_CORE_ID(cpu) (cpu_data(cpu).cpu_core_id) -#define TO_ATTR_NO(cpu) (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO) - #ifdef CONFIG_SMP #define for_each_sibling(i, cpu) \ for_each_cpu(i, topology_sibling_cpumask(cpu)) @@ -91,6 +88,8 @@ struct temp_data { struct platform_data { struct device *hwmon_dev; u16 pkg_id; + u16 cpu_map[NUM_REAL_CORES]; + struct ida ida; struct cpumask cpumask; struct temp_data *core_data[MAX_CORE_DATA]; struct device_attribute name_attr; @@ -441,7 +440,7 @@ static struct temp_data *init_temp_data(unsigned int cpu, int pkg_flag) MSR_IA32_THERM_STATUS; tdata->is_pkg_data = pkg_flag; tdata->cpu = cpu; - tdata->cpu_core_id = TO_CORE_ID(cpu); + tdata->cpu_core_id = topology_core_id(cpu); tdata->attr_size = MAX_CORE_ATTRS; mutex_init(&tdata->update_lock); return tdata; @@ -454,7 +453,7 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu, struct platform_data *pdata = platform_get_drvdata(pdev); struct 
cpuinfo_x86 *c = &cpu_data(cpu); u32 eax, edx; - int err, attr_no; + int err, index, attr_no; /* * Find attr number for sysfs: @@ -462,14 +461,26 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu, * The attr number is always core id + 2 * The Pkgtemp will always show up as temp1_*, if available */ - attr_no = pkg_flag ? PKG_SYSFS_ATTR_NO : TO_ATTR_NO(cpu); + if (pkg_flag) { + attr_no = PKG_SYSFS_ATTR_NO; + } else { + index = ida_alloc(&pdata->ida, GFP_KERNEL); + if (index < 0) + return index; + pdata->cpu_map[index] = topology_core_id(cpu); + attr_no = index + BASE_SYSFS_ATTR_NO; + } - if (attr_no > MAX_CORE_DATA - 1) - return -ERANGE; + if (attr_no > MAX_CORE_DATA - 1) { + err = -ERANGE; + goto ida_free; + } tdata = init_temp_data(cpu, pkg_flag); - if (!tdata) - return -ENOMEM; + if (!tdata) { + err = -ENOMEM; + goto ida_free; + } /* Test if we can access the status register */ err = rdmsr_safe_on_cpu(cpu, tdata->status_reg, &eax, &edx); @@ -505,6 +516,9 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu, exit_free: pdata->core_data[attr_no] = NULL; kfree(tdata); +ida_free: + if (!pkg_flag) + ida_free(&pdata->ida, index); return err; } @@ -524,6 +538,9 @@ static void coretemp_remove_core(struct platform_data *pdata, int indx) kfree(pdata->core_data[indx]); pdata->core_data[indx] = NULL; + + if (indx >= BASE_SYSFS_ATTR_NO) + ida_free(&pdata->ida, indx - BASE_SYSFS_ATTR_NO); } static int coretemp_probe(struct platform_device *pdev) @@ -537,6 +554,7 @@ static int coretemp_probe(struct platform_device *pdev) return -ENOMEM; pdata->pkg_id = pdev->id; + ida_init(&pdata->ida); platform_set_drvdata(pdev, pdata); pdata->hwmon_dev = devm_hwmon_device_register_with_groups(dev, DRVNAME, @@ -553,6 +571,7 @@ static int coretemp_remove(struct platform_device *pdev) if (pdata->core_data[i]) coretemp_remove_core(pdata, i); + ida_destroy(&pdata->ida); return 0; } @@ -647,7 +666,7 @@ static int coretemp_cpu_offline(unsigned int 
cpu) struct platform_device *pdev = coretemp_get_pdev(cpu); struct platform_data *pd; struct temp_data *tdata; - int indx, target; + int i, indx = -1, target; /* * Don't execute this on suspend as the device remove locks @@ -660,12 +679,19 @@ static int coretemp_cpu_offline(unsigned int cpu) if (!pdev) return 0; - /* The core id is too big, just return */ - indx = TO_ATTR_NO(cpu); - if (indx > MAX_CORE_DATA - 1) + pd = platform_get_drvdata(pdev); + + for (i = 0; i < NUM_REAL_CORES; i++) { + if (pd->cpu_map[i] == topology_core_id(cpu)) { + indx = i + BASE_SYSFS_ATTR_NO; + break; + } + } + + /* Too many cores and this core is not populated, just return */ + if (indx < 0) return 0; - pd = platform_get_drvdata(pdev); tdata = pd->core_data[indx]; cpumask_clear_cpu(cpu, &pd->cpumask); -- GitLab From 2b12a7a126d62bdbd81f4923c21bf6e9a7fbd069 Mon Sep 17 00:00:00 2001 From: Zhang Rui <rui.zhang@intel.com> Date: Fri, 14 Oct 2022 17:01:46 +0800 Subject: [PATCH 2056/2223] x86/topology: Fix multiple packages shown on a single-package system CPUID.1F/B does not enumerate Package level explicitly, instead, all the APIC-ID bits above the enumerated levels are assumed to be package ID bits. Current code gets package ID by shifting out all the APIC-ID bits that Linux supports, rather than shifting out all the APIC-ID bits that CPUID.1F enumerates. This introduces problems when CPUID.1F enumerates a level that Linux does not support. For example, on a single package AlderLake-N, there are 2 Ecore Modules with 4 atom cores in each module. Linux does not support the Module level and interprets the Module ID bits as package ID and erroneously reports a multi module system as a multi-package system. Fix this by using APIC-ID bits above all the CPUID.1F enumerated levels as package ID. 
[ dhansen: spelling fix ] Fixes: 7745f03eb395 ("x86/topology: Add CPUID.1F multi-die/package support") Suggested-by: Len Brown <len.brown@intel.com> Signed-off-by: Zhang Rui <rui.zhang@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Len Brown <len.brown@intel.com> Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20221014090147.1836-4-rui.zhang@intel.com --- arch/x86/kernel/cpu/topology.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 132a2de44d2fe..f7592814e5d59 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -96,6 +96,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width; unsigned int core_select_mask, core_level_siblings; unsigned int die_select_mask, die_level_siblings; + unsigned int pkg_mask_width; bool die_level_present = false; int leaf; @@ -111,10 +112,10 @@ int detect_extended_topology(struct cpuinfo_x86 *c) core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); sub_index = 1; - do { + while (true) { cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx); /* @@ -132,8 +133,13 @@ int detect_extended_topology(struct cpuinfo_x86 *c) die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); } + if (LEAFB_SUBTYPE(ecx) != INVALID_TYPE) + pkg_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + else + break; + sub_index++; - } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + } core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; die_select_mask = (~(-1 << die_plus_mask_width)) >> @@ -148,7 +154,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) } c->phys_proc_id = 
apic->phys_pkg_id(c->initial_apicid, - die_plus_mask_width); + pkg_mask_width); /* * Reinit the apicid, now that we have extended initial_apicid. */ -- GitLab From 71eac7063698b7d7b8fafb1683ac24a034541141 Mon Sep 17 00:00:00 2001 From: Zhang Rui <rui.zhang@intel.com> Date: Fri, 14 Oct 2022 17:01:47 +0800 Subject: [PATCH 2057/2223] x86/topology: Fix duplicated core ID within a package Today, core ID is assumed to be unique within each package. But an AlderLake-N platform adds a Module level between core and package, Linux excludes the unknown modules bits from the core ID, resulting in duplicate core ID's. To keep core ID unique within a package, Linux must include all APIC-ID bits for known or unknown levels above the core and below the package in the core ID. It is important to understand that core ID's have always come directly from the APIC-ID encoding, which comes from the BIOS. Thus there is no guarantee that they start at 0, or that they are contiguous. As such, naively using them for array indexes can be problematic. 
[ dhansen: un-known -> unknown ] Fixes: 7745f03eb395 ("x86/topology: Add CPUID.1F multi-die/package support") Suggested-by: Len Brown <len.brown@intel.com> Signed-off-by: Zhang Rui <rui.zhang@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Len Brown <len.brown@intel.com> Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20221014090147.1836-5-rui.zhang@intel.com --- arch/x86/kernel/cpu/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index f7592814e5d59..5e868b62a7c4e 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -141,7 +141,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) sub_index++; } - core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; + core_select_mask = (~(-1 << pkg_mask_width)) >> ht_mask_width; die_select_mask = (~(-1 << die_plus_mask_width)) >> core_plus_mask_width; -- GitLab From 79a818b5087393d5a4cb356d4545d02f55bf1a2f Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 17 Oct 2022 08:08:05 -1000 Subject: [PATCH 2058/2223] blkcg: Update MAINTAINERS entry Josef wrote iolatency and iocost is missing from the files list. Let's add Josef as a maintainer and add blk-iocost.c to the files list. 
Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Jens Axboe <axboe@kernel.dk> --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0dc4a769216be..4a5ce3863deb7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5256,6 +5256,7 @@ F: tools/testing/selftests/cgroup/ CONTROL GROUP - BLOCK IO CONTROLLER (BLKIO) M: Tejun Heo <tj@kernel.org> +M: Josef Bacik <josef@toxicpanda.com> M: Jens Axboe <axboe@kernel.dk> L: cgroups@vger.kernel.org L: linux-block@vger.kernel.org @@ -5263,6 +5264,7 @@ T: git git://git.kernel.dk/linux-block F: Documentation/admin-guide/cgroup-v1/blkio-controller.rst F: block/bfq-cgroup.c F: block/blk-cgroup.c +F: block/blk-iocost.c F: block/blk-iolatency.c F: block/blk-throttle.c F: include/linux/blk-cgroup.h -- GitLab From 0ffac4727eec1879305c1bda07c0195197937bb2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal <damien.lemoal@opensource.wdc.com> Date: Fri, 14 Oct 2022 10:38:15 +0900 Subject: [PATCH 2059/2223] ata: sata_rcar: Fix compilation warning When compiling with clang and W=1, the following warning is generated: drivers/ata/sata_rcar.c:878:15: error: cast to smaller integer type 'enum sata_rcar_type' from 'const void *' [-Werror,-Wvoid-pointer-to-enum-cast] priv->type = (enum sata_rcar_type)of_device_get_match_data(dev); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by using a cast to unsigned long to match the "void *" type size returned by of_device_get_match_data(). 
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com> Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be> Acked-by: Arnd Bergmann <arnd@arndb.de> Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru> --- drivers/ata/sata_rcar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/sata_rcar.c b/drivers/ata/sata_rcar.c index 590ebea996017..0195eb29f6c2d 100644 --- a/drivers/ata/sata_rcar.c +++ b/drivers/ata/sata_rcar.c @@ -875,7 +875,7 @@ static int sata_rcar_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - priv->type = (enum sata_rcar_type)of_device_get_match_data(dev); + priv->type = (unsigned long)of_device_get_match_data(dev); pm_runtime_enable(dev); ret = pm_runtime_get_sync(dev); -- GitLab From 7d7b0c85127cbac45e6c4e0ae0647ace17cadfaf Mon Sep 17 00:00:00 2001 From: Damien Le Moal <damien.lemoal@opensource.wdc.com> Date: Fri, 14 Oct 2022 10:42:57 +0900 Subject: [PATCH 2060/2223] ata: ahci_brcm: Fix compilation warning When compiling with clang and W=1, the following warning is generated: drivers/ata/ahci_brcm.c:451:18: error: cast to smaller integer type 'enum brcm_ahci_version' from 'const void *' [-Werror,-Wvoid-pointer-to-enum-cast] priv->version = (enum brcm_ahci_version)of_id->data; ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by using a cast to unsigned long to match the "void *" type size of of_id->data. 
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com> Acked-by: Arnd Bergmann <arnd@arndb.de> Acked-by: Florian Fainelli <f.fainelli@gmail.com> --- drivers/ata/ahci_brcm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/ahci_brcm.c b/drivers/ata/ahci_brcm.c index f61795c546cf1..6f216eb256100 100644 --- a/drivers/ata/ahci_brcm.c +++ b/drivers/ata/ahci_brcm.c @@ -448,7 +448,7 @@ static int brcm_ahci_probe(struct platform_device *pdev) if (!of_id) return -ENODEV; - priv->version = (enum brcm_ahci_version)of_id->data; + priv->version = (unsigned long)of_id->data; priv->dev = dev; res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "top-ctrl"); -- GitLab From e8fbdf1855f7f31a8f37df60d7be44d8aabe6288 Mon Sep 17 00:00:00 2001 From: Damien Le Moal <damien.lemoal@opensource.wdc.com> Date: Fri, 14 Oct 2022 10:45:58 +0900 Subject: [PATCH 2061/2223] ata: ahci_xgene: Fix compilation warning When compiling with clang and W=1, the following warning is generated: drivers/ata/ahci_xgene.c:788:14: error: cast to smaller integer type 'enum xgene_ahci_version' from 'const void *' [-Werror,-Wvoid-pointer-to-enum-cast] version = (enum xgene_ahci_version) of_devid->data; ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by using a cast to unsigned long to match the "void *" type size of of_devid->data. 
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com> Acked-by: Arnd Bergmann <arnd@arndb.de> --- drivers/ata/ahci_xgene.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/ahci_xgene.c b/drivers/ata/ahci_xgene.c index 7bb5db17f8642..1e08704d51173 100644 --- a/drivers/ata/ahci_xgene.c +++ b/drivers/ata/ahci_xgene.c @@ -785,7 +785,7 @@ static int xgene_ahci_probe(struct platform_device *pdev) of_devid = of_match_device(xgene_ahci_of_match, dev); if (of_devid) { if (of_devid->data) - version = (enum xgene_ahci_version) of_devid->data; + version = (unsigned long) of_devid->data; } #ifdef CONFIG_ACPI else { -- GitLab From 26d9f48d9981205a7e229e21e183dbf1f13de83e Mon Sep 17 00:00:00 2001 From: Damien Le Moal <damien.lemoal@opensource.wdc.com> Date: Fri, 14 Oct 2022 10:48:16 +0900 Subject: [PATCH 2062/2223] ata: ahci_imx: Fix compilation warning When compiling with clang and W=1, the following warning is generated: drivers/ata/ahci_imx.c:1070:18: error: cast to smaller integer type 'enum ahci_imx_type' from 'const void *' [-Werror,-Wvoid-pointer-to-enum-cast] imxpriv->type = (enum ahci_imx_type)of_id->data; ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by using a cast to unsigned long to match the "void *" type size of of_id->data. 
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com> Acked-by: Arnd Bergmann <arnd@arndb.de> --- drivers/ata/ahci_imx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/ahci_imx.c b/drivers/ata/ahci_imx.c index 632caa3014587..a950767f79483 100644 --- a/drivers/ata/ahci_imx.c +++ b/drivers/ata/ahci_imx.c @@ -1067,7 +1067,7 @@ static int imx_ahci_probe(struct platform_device *pdev) imxpriv->ahci_pdev = pdev; imxpriv->no_device = false; imxpriv->first_time = true; - imxpriv->type = (enum ahci_imx_type)of_id->data; + imxpriv->type = (unsigned long)of_id->data; imxpriv->sata_clk = devm_clk_get(dev, "sata"); if (IS_ERR(imxpriv->sata_clk)) { -- GitLab From 2ce3a0bf2010b16c78b78cc35a97fa913f1be0ca Mon Sep 17 00:00:00 2001 From: Damien Le Moal <damien.lemoal@opensource.wdc.com> Date: Fri, 14 Oct 2022 11:03:49 +0900 Subject: [PATCH 2063/2223] ata: ahci_qoriq: Fix compilation warning When compiling with clang and W=1, the following warning is generated: drivers/ata/ahci_qoriq.c:283:22: error: cast to smaller integer type 'enum ahci_qoriq_type' from 'const void *' [-Werror,-Wvoid-pointer-to-enum-cast] qoriq_priv->type = (enum ahci_qoriq_type)of_id->data; ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by using a cast to unsigned long to match the "void *" type size of of_id->data. 
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com> Acked-by: Arnd Bergmann <arnd@arndb.de> --- drivers/ata/ahci_qoriq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/ahci_qoriq.c b/drivers/ata/ahci_qoriq.c index 6cd61842ad48b..9cf9bf36a8740 100644 --- a/drivers/ata/ahci_qoriq.c +++ b/drivers/ata/ahci_qoriq.c @@ -280,7 +280,7 @@ static int ahci_qoriq_probe(struct platform_device *pdev) return -ENOMEM; if (of_id) - qoriq_priv->type = (enum ahci_qoriq_type)of_id->data; + qoriq_priv->type = (unsigned long)of_id->data; else qoriq_priv->type = (enum ahci_qoriq_type)acpi_id->driver_data; -- GitLab From 2331ce6126be8864b39490e705286b66e2344aac Mon Sep 17 00:00:00 2001 From: Uday Shankar <ushankar@purestorage.com> Date: Fri, 23 Sep 2022 18:02:42 -0600 Subject: [PATCH 2064/2223] scsi: core: Restrict legal sdev_state transitions via sysfs Userspace can currently write to sysfs to transition sdev_state to RUNNING or OFFLINE from any source state. This causes issues because proper transitioning out of some states involves steps besides just changing sdev_state, so allowing userspace to change sdev_state regardless of the source state can result in inconsistencies; e.g. with ISCSI we can end up with sdev_state == SDEV_RUNNING while the device queue is quiesced. Any task attempting I/O on the device will then hang, and in more recent kernels, iscsid will hang as well. More detail about this bug is provided in my first attempt: https://groups.google.com/g/open-iscsi/c/PNKca4HgPDs/m/CXaDkntOAQAJ Link: https://lore.kernel.org/r/20220924000241.2967323-1-ushankar@purestorage.com Signed-off-by: Uday Shankar <ushankar@purestorage.com> Suggested-by: Mike Christie <michael.christie@oracle.com> Reviewed-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Martin K. 
Petersen <martin.petersen@oracle.com> --- drivers/scsi/scsi_sysfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index c95177ca6ed26..cac7c902cf70a 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -828,6 +828,14 @@ store_state_field(struct device *dev, struct device_attribute *attr, } mutex_lock(&sdev->state_mutex); + switch (sdev->sdev_state) { + case SDEV_RUNNING: + case SDEV_OFFLINE: + break; + default: + mutex_unlock(&sdev->state_mutex); + return -EINVAL; + } if (sdev->sdev_state == SDEV_RUNNING && state == SDEV_RUNNING) { ret = 0; } else { -- GitLab From dc8e483f684a24cc06e1d5fa958b54db58855093 Mon Sep 17 00:00:00 2001 From: Rafael Mendonca <rafaelmendsr@gmail.com> Date: Fri, 16 Sep 2022 00:59:07 -0300 Subject: [PATCH 2065/2223] scsi: lpfc: Fix memory leak in lpfc_create_port() Commit 5e633302ace1 ("scsi: lpfc: vmid: Add support for VMID in mailbox command") introduced allocations for the VMID resources in lpfc_create_port() after the call to scsi_host_alloc(). Upon failure on the VMID allocations, the new code would branch to the 'out' label, which returns NULL without unwinding anything, thus skipping the call to scsi_host_put(). Fix the problem by creating a separate label 'out_free_vmid' to unwind the VMID resources and make the 'out_put_shost' label call only scsi_host_put(), as was done before the introduction of allocations for VMID. Fixes: 5e633302ace1 ("scsi: lpfc: vmid: Add support for VMID in mailbox command") Signed-off-by: Rafael Mendonca <rafaelmendsr@gmail.com> Link: https://lore.kernel.org/r/20220916035908.712799-1-rafaelmendsr@gmail.com Reviewed-by: James Smart <jsmart2021@gmail.com> Signed-off-by: Martin K. 
Petersen <martin.petersen@oracle.com> --- drivers/scsi/lpfc/lpfc_init.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index b49c39569386a..b535f1fd30100 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -4812,7 +4812,7 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev) rc = lpfc_vmid_res_alloc(phba, vport); if (rc) - goto out; + goto out_put_shost; /* Initialize all internally managed lists. */ INIT_LIST_HEAD(&vport->fc_nodes); @@ -4830,16 +4830,17 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev) error = scsi_add_host_with_dma(shost, dev, &phba->pcidev->dev); if (error) - goto out_put_shost; + goto out_free_vmid; spin_lock_irq(&phba->port_list_lock); list_add_tail(&vport->listentry, &phba->port_list); spin_unlock_irq(&phba->port_list_lock); return vport; -out_put_shost: +out_free_vmid: kfree(vport->vmid); bitmap_free(vport->vmid_priority_range); +out_put_shost: scsi_host_put(shost); out: return NULL; -- GitLab From 69421bf98482d089e50799f45e48b25ce4a8d154 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima <kuniyu@amazon.com> Date: Fri, 14 Oct 2022 11:26:25 -0700 Subject: [PATCH 2066/2223] udp: Update reuse->has_conns under reuseport_lock. When we call connect() for a UDP socket in a reuseport group, we have to update sk->sk_reuseport_cb->has_conns to 1. Otherwise, the kernel could select an unconnected socket wrongly for packets sent to the connected socket. However, the current way to set has_conns is illegal and can trigger that problem. reuseport_has_conns() changes has_conns under rcu_read_lock(), which upgrades the RCU reader to the updater. Then, it must do the update under the updater's lock, reuseport_lock, but it doesn't for now. For this reason, there is a race below where we fail to set has_conns resulting in the wrong socket selection.
To avoid the race, let's split the reader and updater with proper locking. cpu1 cpu2 +----+ +----+ __ip[46]_datagram_connect() reuseport_grow() . . |- reuseport_has_conns(sk, true) |- more_reuse = __reuseport_alloc(more_socks_size) | . | | |- rcu_read_lock() | |- reuse = rcu_dereference(sk->sk_reuseport_cb) | | | | | /* reuse->has_conns == 0 here */ | | |- more_reuse->has_conns = reuse->has_conns | |- reuse->has_conns = 1 | /* more_reuse->has_conns SHOULD BE 1 HERE */ | | | | | |- rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, | | | more_reuse) | `- rcu_read_unlock() `- kfree_rcu(reuse, rcu) | |- sk->sk_state = TCP_ESTABLISHED Note the likely(reuse) in reuseport_has_conns_set() is always true, but we put the test there for ease of review. [0] For the record, usually, sk_reuseport_cb is changed under lock_sock(). The only exception is reuseport_grow() & TCP reqsk migration case. 1) shutdown() TCP listener, which is moved into the latter part of reuse->socks[] to migrate reqsk. 2) New listen() overflows reuse->socks[] and call reuseport_grow(). 3) reuse->max_socks overflows u16 with the new listener. 4) reuseport_grow() pops the old shutdown()ed listener from the array and update its sk->sk_reuseport_cb as NULL without lock_sock(). shutdown()ed TCP sk->sk_reuseport_cb can be changed without lock_sock(), but, reuseport_has_conns_set() is called only for UDP under lock_sock(), so likely(reuse) never be false in reuseport_has_conns_set(). 
[0]: https://lore.kernel.org/netdev/CANn89iLja=eQHbsM_Ta2sQF0tOGU8vAGrh_izRuuHjuO1ouUag@mail.gmail.com/ Fixes: acdcecc61285 ("udp: correct reuseport selection with connected sockets") Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> Link: https://lore.kernel.org/r/20221014182625.89913-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni <pabeni@redhat.com> --- include/net/sock_reuseport.h | 11 +++++------ net/core/sock_reuseport.c | 16 ++++++++++++++++ net/ipv4/datagram.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv6/datagram.c | 2 +- net/ipv6/udp.c | 2 +- 6 files changed, 25 insertions(+), 10 deletions(-) diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 473b0b0fa4abc..efc9085c68927 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -43,21 +43,20 @@ struct sock *reuseport_migrate_sock(struct sock *sk, extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); extern int reuseport_detach_prog(struct sock *sk); -static inline bool reuseport_has_conns(struct sock *sk, bool set) +static inline bool reuseport_has_conns(struct sock *sk) { struct sock_reuseport *reuse; bool ret = false; rcu_read_lock(); reuse = rcu_dereference(sk->sk_reuseport_cb); - if (reuse) { - if (set) - reuse->has_conns = 1; - ret = reuse->has_conns; - } + if (reuse && reuse->has_conns) + ret = true; rcu_read_unlock(); return ret; } +void reuseport_has_conns_set(struct sock *sk); + #endif /* _SOCK_REUSEPORT_H */ diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 5daa1fa542490..fb90e1e00773b 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -21,6 +21,22 @@ static DEFINE_IDA(reuseport_ida); static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, struct sock_reuseport *reuse, bool bind_inany); +void reuseport_has_conns_set(struct sock *sk) +{ + struct sock_reuseport *reuse; + + if (!rcu_access_pointer(sk->sk_reuseport_cb)) + return; + + spin_lock_bh(&reuseport_lock); + 
reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (likely(reuse)) + reuse->has_conns = 1; + spin_unlock_bh(&reuseport_lock); +} +EXPORT_SYMBOL(reuseport_has_conns_set); + static int reuseport_sock_index(struct sock *sk, const struct sock_reuseport *reuse, bool closed) diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 405a8c2aea641..5e66add7befac 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -70,7 +70,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len } inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; - reuseport_has_conns(sk, true); + reuseport_has_conns_set(sk); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); inet->inet_id = prandom_u32(); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8126f67d18b34..752b72892a443 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -448,7 +448,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, result = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); /* Fall back to scoring if group has connections */ - if (result && !reuseport_has_conns(sk, false)) + if (result && !reuseport_has_conns(sk)) return result; result = result ? : sk; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index df665d4e8f0f1..5ecb56522f9d6 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -256,7 +256,7 @@ ipv4_connected: goto out; } - reuseport_has_conns(sk, true); + reuseport_has_conns_set(sk); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); out: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 8d09f0ea5b8c7..129ec5a9b0eb7 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -195,7 +195,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, result = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); /* Fall back to scoring if group has connections */ - if (result && !reuseport_has_conns(sk, false)) + if (result && !reuseport_has_conns(sk)) return result; result = result ? 
: sk; -- GitLab From e7ad18d1169c62e6c78c01ff693fd362d9d65278 Mon Sep 17 00:00:00 2001 From: Borislav Petkov <bp@suse.de> Date: Wed, 5 Oct 2022 12:00:08 +0200 Subject: [PATCH 2067/2223] x86/microcode/AMD: Apply the patch early on every logical thread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the patch application logic checks whether the revision needs to be applied on each logical CPU (SMT thread). Therefore, on SMT designs where the microcode engine is shared between the two threads, the application happens only on one of them as that is enough to update the shared microcode engine. However, there are microcode patches which do per-thread modification, see Link tag below. Therefore, drop the revision check and try applying on each thread. This is what the BIOS does too so this method is very much tested. Btw, change only the early paths. On the late loading paths, there's no point in doing per-thread modification because if it is some case like in the bugzilla below - removing a CPUID flag - the kernel cannot go and un-use features it has detected are there early. For that, one should use early loading anyway. [ bp: Fixes does not contain the oldest commit which did check for equality but that is good enough.
] Fixes: 8801b3fcb574 ("x86/microcode/AMD: Rework container parsing") Reported-by: Ștefan Talpalaru <stefantalpalaru@yahoo.com> Signed-off-by: Borislav Petkov <bp@suse.de> Tested-by: Ștefan Talpalaru <stefantalpalaru@yahoo.com> Cc: <stable@vger.kernel.org> Link: https://bugzilla.kernel.org/show_bug.cgi?id=216211 --- arch/x86/kernel/cpu/microcode/amd.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index e7410e98fc1f9..3a35dec3ec550 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -440,7 +440,13 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p return ret; native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev >= mc->hdr.patch_id) + + /* + * Allow application of the same revision to pick up SMT-specific + * changes even if the revision of the other SMT thread is already + * up-to-date. + */ + if (rev > mc->hdr.patch_id) return ret; if (!__apply_microcode_amd(mc)) { @@ -528,8 +534,12 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* Check whether we have saved a new patch already: */ - if (*new_rev && rev < mc->hdr.patch_id) { + /* + * Check whether a new patch has been saved already. Also, allow application of + * the same revision in order to pick up SMT-thread-specific configuration even + * if the sibling SMT thread already has an up-to-date revision. 
+ */ + if (*new_rev && rev <= mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { *new_rev = mc->hdr.patch_id; return; -- GitLab From 1ca695207ed2271ecbf8ee6c641970f621c157cc Mon Sep 17 00:00:00 2001 From: Zhengchao Shao <shaozhengchao@huawei.com> Date: Mon, 17 Oct 2022 16:03:31 +0800 Subject: [PATCH 2068/2223] ip6mr: fix UAF issue in ip6mr_sk_done() when addrconf_init_net() failed If the initialization fails in calling addrconf_init_net(), devconf_all is the pointer that has been released. Then ip6mr_sk_done() is called to release the net, accessing devconf->mc_forwarding directly causes invalid pointer access. The process is as follows: setup_net() ops_init() addrconf_init_net() all = kmemdup(...) ---> alloc "all" ... net->ipv6.devconf_all = all; __addrconf_sysctl_register() ---> failed ... kfree(all); ---> ipv6.devconf_all invalid ... ops_exit_list() ... ip6mr_sk_done() devconf = net->ipv6.devconf_all; //devconf is invalid pointer if (!devconf || !atomic_read(&devconf->mc_forwarding)) The following is the Call Trace information: BUG: KASAN: use-after-free in ip6mr_sk_done+0x112/0x3a0 Read of size 4 at addr ffff888075508e88 by task ip/14554 Call Trace: <TASK> dump_stack_lvl+0x8e/0xd1 print_report+0x155/0x454 kasan_report+0xba/0x1f0 kasan_check_range+0x35/0x1b0 ip6mr_sk_done+0x112/0x3a0 rawv6_close+0x48/0x70 inet_release+0x109/0x230 inet6_release+0x4c/0x70 sock_release+0x87/0x1b0 igmp6_net_exit+0x6b/0x170 ops_exit_list+0xb0/0x170 setup_net+0x7ac/0xbd0 copy_net_ns+0x2e6/0x6b0 create_new_namespaces+0x382/0xa50 unshare_nsproxy_namespaces+0xa6/0x1c0 ksys_unshare+0x3a4/0x7e0 __x64_sys_unshare+0x2d/0x40 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7f7963322547 </TASK> Allocated by task 14554: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 __kasan_kmalloc+0xa1/0xb0 __kmalloc_node_track_caller+0x4a/0xb0 kmemdup+0x28/0x60 addrconf_init_net+0x1be/0x840 ops_init+0xa5/0x410 setup_net+0x5aa/0xbd0 copy_net_ns+0x2e6/0x6b0 
create_new_namespaces+0x382/0xa50 unshare_nsproxy_namespaces+0xa6/0x1c0 ksys_unshare+0x3a4/0x7e0 __x64_sys_unshare+0x2d/0x40 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Freed by task 14554: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_save_free_info+0x2a/0x40 ____kasan_slab_free+0x155/0x1b0 slab_free_freelist_hook+0x11b/0x220 __kmem_cache_free+0xa4/0x360 addrconf_init_net+0x623/0x840 ops_init+0xa5/0x410 setup_net+0x5aa/0xbd0 copy_net_ns+0x2e6/0x6b0 create_new_namespaces+0x382/0xa50 unshare_nsproxy_namespaces+0xa6/0x1c0 ksys_unshare+0x3a4/0x7e0 __x64_sys_unshare+0x2d/0x40 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Fixes: 7d9b1b578d67 ("ip6mr: fix use-after-free in ip6mr_sk_done()") Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Link: https://lore.kernel.org/r/20221017080331.16878-1-shaozhengchao@huawei.com Signed-off-by: Paolo Abeni <pabeni@redhat.com> --- net/ipv6/addrconf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 10ce86bf228e1..d5967cba5b568 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -7214,9 +7214,11 @@ err_reg_dflt: __addrconf_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL); err_reg_all: kfree(dflt); + net->ipv6.devconf_dflt = NULL; #endif err_alloc_dflt: kfree(all); + net->ipv6.devconf_all = NULL; err_alloc_all: kfree(net->ipv6.inet6_addr_lst); err_alloc_addr: -- GitLab From 1dcaf30725c32b26daa70d22083999972ab99c29 Mon Sep 17 00:00:00 2001 From: Jon Hunter <jonathanh@nvidia.com> Date: Tue, 11 Oct 2022 16:32:43 +0100 Subject: [PATCH 2069/2223] cpufreq: tegra194: Fix module loading When the Tegra194 CPUFREQ driver is built as a module it is not automatically loaded as expected on Tegra194 devices. Populate the MODULE_DEVICE_TABLE to fix this. 
Cc: v5.9+ <stable@vger.kernel.org> # v5.9+ Fixes: df320f89359c ("cpufreq: Add Tegra194 cpufreq driver") Signed-off-by: Jon Hunter <jonathanh@nvidia.com> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> --- drivers/cpufreq/tegra194-cpufreq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/tegra194-cpufreq.c b/drivers/cpufreq/tegra194-cpufreq.c index c2004cae3f021..4596c3e323aa4 100644 --- a/drivers/cpufreq/tegra194-cpufreq.c +++ b/drivers/cpufreq/tegra194-cpufreq.c @@ -589,6 +589,7 @@ static const struct of_device_id tegra194_cpufreq_of_match[] = { { .compatible = "nvidia,tegra239-ccplex-cluster", .data = &tegra239_cpufreq_soc }, { /* sentinel */ } }; +MODULE_DEVICE_TABLE(of, tegra194_cpufreq_of_match); static struct platform_driver tegra194_ccplex_driver = { .driver = { -- GitLab From 9f42cf54403a42cb092636804d2628d8ecf71e75 Mon Sep 17 00:00:00 2001 From: Fabien Parent <fabien.parent@linaro.org> Date: Sat, 15 Oct 2022 15:04:22 +0200 Subject: [PATCH 2070/2223] cpufreq: qcom: fix memory leak in error path If for some reason the speedbin length is incorrect, then there is a memory leak in the error path because we never free the speedbin buffer. This commit fixes the error path to always free the speedbin buffer. 
Cc: v5.7+ <stable@vger.kernel.org> # v5.7+ Fixes: a8811ec764f9 ("cpufreq: qcom: Add support for krait based socs") Signed-off-by: Fabien Parent <fabien.parent@linaro.org> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> --- drivers/cpufreq/qcom-cpufreq-nvmem.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/qcom-cpufreq-nvmem.c b/drivers/cpufreq/qcom-cpufreq-nvmem.c index 863548f59c3e5..3bd38acde4b95 100644 --- a/drivers/cpufreq/qcom-cpufreq-nvmem.c +++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c @@ -213,6 +213,7 @@ static int qcom_cpufreq_krait_name_version(struct device *cpu_dev, int speed = 0, pvs = 0, pvs_ver = 0; u8 *speedbin; size_t len; + int ret = 0; speedbin = nvmem_cell_read(speedbin_nvmem, &len); @@ -230,7 +231,8 @@ static int qcom_cpufreq_krait_name_version(struct device *cpu_dev, break; default: dev_err(cpu_dev, "Unable to read nvmem data. Defaulting to 0!\n"); - return -ENODEV; + ret = -ENODEV; + goto len_error; } snprintf(*pvs_name, sizeof("speedXX-pvsXX-vXX"), "speed%d-pvs%d-v%d", @@ -238,8 +240,9 @@ static int qcom_cpufreq_krait_name_version(struct device *cpu_dev, drv->versions = (1 << speed); +len_error: kfree(speedbin); - return 0; + return ret; } static const struct qcom_cpufreq_match_data match_data_kryo = { -- GitLab From 01039fb8e90c9cb684430414bff70cea9eb168c5 Mon Sep 17 00:00:00 2001 From: Fabien Parent <fabien.parent@linaro.org> Date: Sat, 15 Oct 2022 15:04:23 +0200 Subject: [PATCH 2071/2223] cpufreq: qcom: fix writes in read-only memory region This commit fixes a kernel oops because of a write in some read-only memory: [ 9.068287] Unable to handle kernel write to read-only memory at virtual address ffff800009240ad8 ..snip.. [ 9.138790] Internal error: Oops: 9600004f [#1] PREEMPT SMP ..snip.. [ 9.269161] Call trace: [ 9.276271] __memcpy+0x5c/0x230 [ 9.278531] snprintf+0x58/0x80 [ 9.282002] qcom_cpufreq_msm8939_name_version+0xb4/0x190 [ 9.284869] qcom_cpufreq_probe+0xc8/0x39c ..snip.. 
The following line defines a pointer that point to a char buffer stored in read-only memory: char *pvs_name = "speedXX-pvsXX-vXX"; This pointer is meant to hold a template "speedXX-pvsXX-vXX" where the XX values get overridden by the qcom_cpufreq_krait_name_version function. Since the template is actually stored in read-only memory, when the function executes the following call we get an oops: snprintf(*pvs_name, sizeof("speedXX-pvsXX-vXX"), "speed%d-pvs%d-v%d", speed, pvs, pvs_ver); To fix this issue, we instead store the template name onto the stack by using the following syntax: char pvs_name_buffer[] = "speedXX-pvsXX-vXX"; Because the `pvs_name` needs to be able to be assigned to NULL, the template buffer is stored in the pvs_name_buffer and not under the pvs_name variable. Cc: v5.7+ <stable@vger.kernel.org> # v5.7+ Fixes: a8811ec764f9 ("cpufreq: qcom: Add support for krait based socs") Signed-off-by: Fabien Parent <fabien.parent@linaro.org> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> --- drivers/cpufreq/qcom-cpufreq-nvmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/qcom-cpufreq-nvmem.c b/drivers/cpufreq/qcom-cpufreq-nvmem.c index 3bd38acde4b95..82e0339d7722b 100644 --- a/drivers/cpufreq/qcom-cpufreq-nvmem.c +++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c @@ -265,7 +265,8 @@ static int qcom_cpufreq_probe(struct platform_device *pdev) struct nvmem_cell *speedbin_nvmem; struct device_node *np; struct device *cpu_dev; - char *pvs_name = "speedXX-pvsXX-vXX"; + char pvs_name_buffer[] = "speedXX-pvsXX-vXX"; + char *pvs_name = pvs_name_buffer; unsigned cpu; const struct of_device_id *match; int ret; -- GitLab From a05887f005d374ff10aeaffe9f203e49fde22d17 Mon Sep 17 00:00:00 2001 From: Fabien Parent <fabien.parent@linaro.org> Date: Sat, 15 Oct 2022 15:04:24 +0200 Subject: [PATCH 2072/2223] cpufreq: qcom: remove unused parameter in function definition The speedbin_nvmem parameter is not used for get_krait_bin_format_{a,b}. 
Let's remove the parameter to make the code cleaner. Signed-off-by: Fabien Parent <fabien.parent@linaro.org> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> --- drivers/cpufreq/qcom-cpufreq-nvmem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/qcom-cpufreq-nvmem.c b/drivers/cpufreq/qcom-cpufreq-nvmem.c index 82e0339d7722b..a154f03666fdb 100644 --- a/drivers/cpufreq/qcom-cpufreq-nvmem.c +++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c @@ -64,7 +64,7 @@ static struct platform_device *cpufreq_dt_pdev, *cpufreq_pdev; static void get_krait_bin_format_a(struct device *cpu_dev, int *speed, int *pvs, int *pvs_ver, - struct nvmem_cell *pvs_nvmem, u8 *buf) + u8 *buf) { u32 pte_efuse; @@ -95,7 +95,7 @@ static void get_krait_bin_format_a(struct device *cpu_dev, static void get_krait_bin_format_b(struct device *cpu_dev, int *speed, int *pvs, int *pvs_ver, - struct nvmem_cell *pvs_nvmem, u8 *buf) + u8 *buf) { u32 pte_efuse, redundant_sel; @@ -223,11 +223,11 @@ static int qcom_cpufreq_krait_name_version(struct device *cpu_dev, switch (len) { case 4: get_krait_bin_format_a(cpu_dev, &speed, &pvs, &pvs_ver, - speedbin_nvmem, speedbin); + speedbin); break; case 8: get_krait_bin_format_b(cpu_dev, &speed, &pvs, &pvs_ver, - speedbin_nvmem, speedbin); + speedbin); break; default: dev_err(cpu_dev, "Unable to read nvmem data. Defaulting to 0!\n"); -- GitLab From 2a808b9f701ba935a67be58a3afa2e3b230cee85 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Tue, 27 Sep 2022 23:40:18 +0800 Subject: [PATCH 2073/2223] cpufreq: dt: Switch to use dev_err_probe() helper In the probe path, dev_err() can be replaced with dev_err_probe() which will check if error code is -EPROBE_DEFER and prints the error name. It also sets the defer probe reason which can be checked later through debugfs. It's more simple in error path. 
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> --- drivers/cpufreq/cpufreq-dt.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index d69d13a264146..4aec4b2a52259 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -222,10 +222,8 @@ static int dt_cpufreq_early_init(struct device *dev, int cpu) if (reg_name[0]) { priv->opp_token = dev_pm_opp_set_regulators(cpu_dev, reg_name); if (priv->opp_token < 0) { - ret = priv->opp_token; - if (ret != -EPROBE_DEFER) - dev_err(cpu_dev, "failed to set regulators: %d\n", - ret); + ret = dev_err_probe(cpu_dev, priv->opp_token, + "failed to set regulators\n"); goto free_cpumask; } } -- GitLab From ab4fdc735daf483c70fc7e4b6c49fa8c1999f741 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Tue, 27 Sep 2022 23:40:19 +0800 Subject: [PATCH 2074/2223] cpufreq: imx6q: Switch to use dev_err_probe() helper In the probe path, dev_err() can be replaced with dev_err_probe() which will check if error code is -EPROBE_DEFER and prints the error name. It also sets the defer probe reason which can be checked later through debugfs. It's more simple in error path. 
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> --- drivers/cpufreq/imx6q-cpufreq.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c index 90beb26ed34e9..ad4ce84931446 100644 --- a/drivers/cpufreq/imx6q-cpufreq.c +++ b/drivers/cpufreq/imx6q-cpufreq.c @@ -396,9 +396,7 @@ static int imx6q_cpufreq_probe(struct platform_device *pdev) ret = imx6q_opp_check_speed_grading(cpu_dev); } if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(cpu_dev, "failed to read ocotp: %d\n", - ret); + dev_err_probe(cpu_dev, ret, "failed to read ocotp\n"); goto out_free_opp; } -- GitLab From d78be404f97fadacd6a0d0928e6933e89e1869f6 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Tue, 27 Sep 2022 23:40:20 +0800 Subject: [PATCH 2075/2223] cpufreq: qcom-nvmem: Switch to use dev_err_probe() helper In the probe path, dev_err() can be replaced with dev_err_probe() which will check if error code is -EPROBE_DEFER and prints the error name. It also sets the defer probe reason which can be checked later through debugfs. It's more simple in error path. 
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> --- drivers/cpufreq/qcom-cpufreq-nvmem.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/qcom-cpufreq-nvmem.c b/drivers/cpufreq/qcom-cpufreq-nvmem.c index a154f03666fdb..a577586b23be2 100644 --- a/drivers/cpufreq/qcom-cpufreq-nvmem.c +++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c @@ -299,11 +299,8 @@ static int qcom_cpufreq_probe(struct platform_device *pdev) if (drv->data->get_version) { speedbin_nvmem = of_nvmem_cell_get(np, NULL); if (IS_ERR(speedbin_nvmem)) { - if (PTR_ERR(speedbin_nvmem) != -EPROBE_DEFER) - dev_err(cpu_dev, - "Could not get nvmem cell: %ld\n", - PTR_ERR(speedbin_nvmem)); - ret = PTR_ERR(speedbin_nvmem); + ret = dev_err_probe(cpu_dev, PTR_ERR(speedbin_nvmem), + "Could not get nvmem cell\n"); goto free_drv; } -- GitLab From 889a50aedcd216cc5f2b98bb2412f0498d417721 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Tue, 27 Sep 2022 23:40:21 +0800 Subject: [PATCH 2076/2223] cpufreq: sun50i: Switch to use dev_err_probe() helper In the probe path, convert pr_err() to dev_err_probe() which will check if error code is -EPROBE_DEFER and prints the error name. It also sets the defer probe reason which can be checked later through debugfs. It's more simple in error path. 
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> --- drivers/cpufreq/sun50i-cpufreq-nvmem.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/sun50i-cpufreq-nvmem.c b/drivers/cpufreq/sun50i-cpufreq-nvmem.c index a4922580ce065..1583a370da396 100644 --- a/drivers/cpufreq/sun50i-cpufreq-nvmem.c +++ b/drivers/cpufreq/sun50i-cpufreq-nvmem.c @@ -56,12 +56,9 @@ static int sun50i_cpufreq_get_efuse(u32 *versions) speedbin_nvmem = of_nvmem_cell_get(np, NULL); of_node_put(np); - if (IS_ERR(speedbin_nvmem)) { - if (PTR_ERR(speedbin_nvmem) != -EPROBE_DEFER) - pr_err("Could not get nvmem cell: %ld\n", - PTR_ERR(speedbin_nvmem)); - return PTR_ERR(speedbin_nvmem); - } + if (IS_ERR(speedbin_nvmem)) + return dev_err_probe(cpu_dev, PTR_ERR(speedbin_nvmem), + "Could not get nvmem cell\n"); speedbin = nvmem_cell_read(speedbin_nvmem, &len); nvmem_cell_put(speedbin_nvmem); -- GitLab From e0539ae012ba5d618eb19665ff990b87b960c643 Mon Sep 17 00:00:00 2001 From: ZiyangZhang <ZiyangZhang@linux.alibaba.com> Date: Tue, 18 Oct 2022 12:53:46 +0800 Subject: [PATCH 2077/2223] Documentation: document ublk user recovery feature Add documentation for user recovery feature of ublk subsystem. Signed-off-by: ZiyangZhang <ZiyangZhang@linux.alibaba.com> Reviewed-by: Ming Lei <ming.lei@redhat.com> Link: https://lore.kernel.org/r/20221018045346.99706-2-ZiyangZhang@linux.alibaba.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- Documentation/block/ublk.rst | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index 2122d1a4a5419..ba45c46cc0dac 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -144,6 +144,42 @@ managing and controlling ublk devices with help of several control commands: For retrieving device info via ``ublksrv_ctrl_dev_info``. 
It is the server's responsibility to save IO target specific info in userspace. +- ``UBLK_CMD_START_USER_RECOVERY`` + + This command is valid if ``UBLK_F_USER_RECOVERY`` feature is enabled. This + command is accepted after the old process has exited, ublk device is quiesced + and ``/dev/ublkc*`` is released. User should send this command before he starts + a new process which re-opens ``/dev/ublkc*``. When this command returns, the + ublk device is ready for the new process. + +- ``UBLK_CMD_END_USER_RECOVERY`` + + This command is valid if ``UBLK_F_USER_RECOVERY`` feature is enabled. This + command is accepted after ublk device is quiesced and a new process has + opened ``/dev/ublkc*`` and get all ublk queues be ready. When this command + returns, ublk device is unquiesced and new I/O requests are passed to the + new process. + +- user recovery feature description + + Two new features are added for user recovery: ``UBLK_F_USER_RECOVERY`` and + ``UBLK_F_USER_RECOVERY_REISSUE``. + + With ``UBLK_F_USER_RECOVERY`` set, after one ubq_daemon(ublk server's io + handler) is dying, ublk does not delete ``/dev/ublkb*`` during the whole + recovery stage and ublk device ID is kept. It is ublk server's + responsibility to recover the device context by its own knowledge. + Requests which have not been issued to userspace are requeued. Requests + which have been issued to userspace are aborted. + + With ``UBLK_F_USER_RECOVERY_REISSUE`` set, after one ubq_daemon(ublk + server's io handler) is dying, contrary to ``UBLK_F_USER_RECOVERY``, + requests which have been issued to userspace are requeued and will be + re-issued to the new process after handling ``UBLK_CMD_END_USER_RECOVERY``. + ``UBLK_F_USER_RECOVERY_REISSUE`` is designed for backends who tolerate + double-write since the driver may issue the same I/O request twice. It + might be useful to a read-only FS or a VM backend. 
+ Data plane ---------- -- GitLab From 79425b297f56bd481c6e97700a9a4e44c7bcfa35 Mon Sep 17 00:00:00 2001 From: Samuel Bailey <samuel.bailey1@gmail.com> Date: Wed, 5 Oct 2022 19:51:23 +0100 Subject: [PATCH 2078/2223] HID: saitek: add madcatz variant of MMO7 mouse device ID The MadCatz variant of the MMO7 mouse has the ID 0738:1713 and the same quirks as the Saitek variant. Signed-off-by: Samuel Bailey <samuel.bailey1@gmail.com> Signed-off-by: Jiri Kosina <jkosina@suse.cz> --- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-quirks.c | 1 + drivers/hid/hid-saitek.c | 2 ++ 3 files changed, 4 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 7cc23be4975c7..dad953f66996a 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -867,6 +867,7 @@ #define USB_DEVICE_ID_MADCATZ_BEATPAD 0x4540 #define USB_DEVICE_ID_MADCATZ_RAT5 0x1705 #define USB_DEVICE_ID_MADCATZ_RAT9 0x1709 +#define USB_DEVICE_ID_MADCATZ_MMO7 0x1713 #define USB_VENDOR_ID_MCC 0x09db #define USB_DEVICE_ID_MCC_PMD1024LS 0x0076 diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index 70f602c64fd13..50e1c717fc0a3 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -620,6 +620,7 @@ static const struct hid_device_id hid_have_special_driver[] = { { HID_USB_DEVICE(USB_VENDOR_ID_SAITEK, USB_DEVICE_ID_SAITEK_MMO7) }, { HID_USB_DEVICE(USB_VENDOR_ID_MADCATZ, USB_DEVICE_ID_MADCATZ_RAT5) }, { HID_USB_DEVICE(USB_VENDOR_ID_MADCATZ, USB_DEVICE_ID_MADCATZ_RAT9) }, + { HID_USB_DEVICE(USB_VENDOR_ID_MADCATZ, USB_DEVICE_ID_MADCATZ_MMO7) }, #endif #if IS_ENABLED(CONFIG_HID_SAMSUNG) { HID_USB_DEVICE(USB_VENDOR_ID_SAMSUNG, USB_DEVICE_ID_SAMSUNG_IR_REMOTE) }, diff --git a/drivers/hid/hid-saitek.c b/drivers/hid/hid-saitek.c index c7bf14c019605..b84e975977c42 100644 --- a/drivers/hid/hid-saitek.c +++ b/drivers/hid/hid-saitek.c @@ -187,6 +187,8 @@ static const struct hid_device_id saitek_devices[] = { .driver_data = SAITEK_RELEASE_MODE_RAT7 }, { 
HID_USB_DEVICE(USB_VENDOR_ID_SAITEK, USB_DEVICE_ID_SAITEK_MMO7), .driver_data = SAITEK_RELEASE_MODE_MMO7 }, + { HID_USB_DEVICE(USB_VENDOR_ID_MADCATZ, USB_DEVICE_ID_MADCATZ_MMO7), + .driver_data = SAITEK_RELEASE_MODE_MMO7 }, { } }; -- GitLab From e66928af3667a9d844a674976ba7765757ab68e2 Mon Sep 17 00:00:00 2001 From: Colin Ian King <colin.i.king@gmail.com> Date: Wed, 5 Oct 2022 16:42:16 +0100 Subject: [PATCH 2079/2223] HID: lenovo: Make array tp10ubkbd_led static const Don't populate the read-only array tp10ubkbd_led on the stack but instead make it static const. Also makes the object code a little smaller. Signed-off-by: Colin Ian King <colin.i.king@gmail.com> Signed-off-by: Jiri Kosina <jkosina@suse.cz> --- drivers/hid/hid-lenovo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-lenovo.c b/drivers/hid/hid-lenovo.c index 9dabd63232343..44763c0da4441 100644 --- a/drivers/hid/hid-lenovo.c +++ b/drivers/hid/hid-lenovo.c @@ -985,7 +985,7 @@ static int lenovo_led_brightness_set(struct led_classdev *led_cdev, struct device *dev = led_cdev->dev->parent; struct hid_device *hdev = to_hid_device(dev); struct lenovo_drvdata *data_pointer = hid_get_drvdata(hdev); - u8 tp10ubkbd_led[] = { TP10UBKBD_MUTE_LED, TP10UBKBD_MICMUTE_LED }; + static const u8 tp10ubkbd_led[] = { TP10UBKBD_MUTE_LED, TP10UBKBD_MICMUTE_LED }; int led_nr = 0; int ret = 0; -- GitLab From fee0fb1f15054bb6a0ede452acb42da5bef4d587 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu <zhangxiaoxu5@huawei.com> Date: Mon, 17 Oct 2022 22:45:21 +0800 Subject: [PATCH 2080/2223] cifs: Fix xid leak in cifs_create() If the cifs already shutdown, we should free the xid before return, otherwise, the xid will be leaked. 
Fixes: 087f757b0129 ("cifs: add shutdown support") Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Zhang Xiaoxu <zhangxiaoxu5@huawei.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/dir.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index a5c73c2af3a26..8b1c371585564 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -543,8 +543,10 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n", inode, direntry, direntry); - if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) - return -EIO; + if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) { + rc = -EIO; + goto out_free_xid; + } tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); rc = PTR_ERR(tlink); -- GitLab From 9a97df404a402fe1174d2d1119f87ff2a0ca2fe9 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu <zhangxiaoxu5@huawei.com> Date: Mon, 17 Oct 2022 22:45:22 +0800 Subject: [PATCH 2081/2223] cifs: Fix xid leak in cifs_copy_file_range() If the file is used by swap, before return -EOPNOTSUPP, should free the xid, otherwise, the xid will be leaked. 
Fixes: 4e8aea30f775 ("smb3: enable swap on SMB3 mounts") Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Zhang Xiaoxu <zhangxiaoxu5@huawei.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cifsfs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c6ac19223ddc0..d0b9fec111aac 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1302,8 +1302,11 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off, ssize_t rc; struct cifsFileInfo *cfile = dst_file->private_data; - if (cfile->swapfile) - return -EOPNOTSUPP; + if (cfile->swapfile) { + rc = -EOPNOTSUPP; + free_xid(xid); + return rc; + } rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff, len, flags); -- GitLab From 575e079c782b9862ec2626403922d041a42e6ed6 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu <zhangxiaoxu5@huawei.com> Date: Mon, 17 Oct 2022 22:45:23 +0800 Subject: [PATCH 2082/2223] cifs: Fix xid leak in cifs_flock() If not flock, before return -ENOLCK, should free the xid, otherwise, the xid will be leaked. 
Fixes: d0677992d2af ("cifs: add support for flock") Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Zhang Xiaoxu <zhangxiaoxu5@huawei.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/file.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index f6ffee514c345..5b3b308e115c8 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1885,11 +1885,13 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl) struct cifsFileInfo *cfile; __u32 type; - rc = -EACCES; xid = get_xid(); - if (!(fl->fl_flags & FL_FLOCK)) - return -ENOLCK; + if (!(fl->fl_flags & FL_FLOCK)) { + rc = -ENOLCK; + free_xid(xid); + return rc; + } cfile = (struct cifsFileInfo *)file->private_data; tcon = tlink_tcon(cfile->tlink); @@ -1908,8 +1910,9 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl) * if no lock or unlock then nothing to do since we do not * know what it is */ + rc = -EOPNOTSUPP; free_xid(xid); - return -EOPNOTSUPP; + return rc; } rc = cifs_setlk(file, fl, type, wait_flag, posix_lck, lock, unlock, -- GitLab From e909d054bdea75ef1ec48c18c5936affdaecbb2c Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu <zhangxiaoxu5@huawei.com> Date: Mon, 17 Oct 2022 22:45:24 +0800 Subject: [PATCH 2083/2223] cifs: Fix xid leak in cifs_ses_add_channel() Before return, should free the xid, otherwise, the xid will be leaked. 
Fixes: d70e9fa55884 ("cifs: try opening channels after mounting") Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Zhang Xiaoxu <zhangxiaoxu5@huawei.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/sess.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 0435d1dfa9e11..92e4278ec35d5 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -496,6 +496,7 @@ out: cifs_put_tcp_session(chan->server, 0); } + free_xid(xid); return rc; } -- GitLab From 10269f13257d4eb6061d09ccce61666316df9838 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu <zhangxiaoxu5@huawei.com> Date: Mon, 17 Oct 2022 22:45:25 +0800 Subject: [PATCH 2084/2223] cifs: Fix xid leak in cifs_get_file_info_unix() If kstrdup() of the symlink target fails, the xid should be freed; otherwise the xid will be leaked. Fixes: 76894f3e2f71 ("cifs: improve symlink handling for smb2+") Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Zhang Xiaoxu <zhangxiaoxu5@huawei.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 7cf96e581d243..9bde08d44617f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -368,8 +368,10 @@ cifs_get_file_info_unix(struct file *filp) if (cfile->symlink_target) { fattr.cf_symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); - if (!fattr.cf_symlink_target) - return -ENOMEM; + if (!fattr.cf_symlink_target) { + rc = -ENOMEM; + goto cifs_gfiunix_out; + } } rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->fid.netfid, &find_data); -- GitLab From d32f211adb6aa179c00ee1c251315d1ef1433a38 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Mon, 17 Oct 2022 16:55:08 +0800 Subject: [PATCH 2085/2223] cifs: use LIST_HEAD() and list_move() to simplify code list_head can be initialized automatically with LIST_HEAD() instead of calling INIT_LIST_HEAD().
Using list_move() instead of list_del() and list_add(). Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cached_dir.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c index fe88b67c863fe..8cad528a87228 100644 --- a/fs/cifs/cached_dir.c +++ b/fs/cifs/cached_dir.c @@ -378,13 +378,11 @@ void invalidate_all_cached_dirs(struct cifs_tcon *tcon) { struct cached_fids *cfids = tcon->cfids; struct cached_fid *cfid, *q; - struct list_head entry; + LIST_HEAD(entry); - INIT_LIST_HEAD(&entry); spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { - list_del(&cfid->entry); - list_add(&cfid->entry, &entry); + list_move(&cfid->entry, &entry); cfids->num_entries--; cfid->is_open = false; /* To prevent race with smb2_cached_lease_break() */ @@ -518,15 +516,13 @@ struct cached_fids *init_cached_dirs(void) void free_cached_dirs(struct cached_fids *cfids) { struct cached_fid *cfid, *q; - struct list_head entry; + LIST_HEAD(entry); - INIT_LIST_HEAD(&entry); spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { cfid->on_list = false; cfid->is_open = false; - list_del(&cfid->entry); - list_add(&cfid->entry, &entry); + list_move(&cfid->entry, &entry); } spin_unlock(&cfids->cfid_list_lock); -- GitLab From 053569ccde2a41abcc592781451cd16eaa6e8bab Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg <lsahlber@redhat.com> Date: Mon, 17 Oct 2022 18:48:26 -0500 Subject: [PATCH 2086/2223] cifs: set rc to -ENOENT if we can not get a dentry for the cached dir We already set rc to this return code further down in the function but we can set it earlier in order to suppress a smatch warning. Also fix a false positive for Coverity.
The reason this is a false positive is that this happens during umount after all files and directories have been closed, but move the setting of ->on_list under the lock anyway to suppress the warning. Reported-by: Dan carpenter <dan.carpenter@oracle.com> Reported-by: coverity-bot <keescook+coverity-bot@chromium.org> Addresses-Coverity-ID: 1525256 ("Concurrent data access violations") Fixes: a350d6e73f5e ("cifs: enable caching of directories for which a lease is held") Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cached_dir.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c index 8cad528a87228..20efc9e22761d 100644 --- a/fs/cifs/cached_dir.c +++ b/fs/cifs/cached_dir.c @@ -253,8 +253,10 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, dentry = dget(cifs_sb->root); else { dentry = path_to_dentry(cifs_sb, path); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + rc = -ENOENT; goto oshr_free; + } } cfid->dentry = dentry; cfid->tcon = tcon; @@ -385,13 +387,13 @@ void invalidate_all_cached_dirs(struct cifs_tcon *tcon) list_move(&cfid->entry, &entry); cfids->num_entries--; cfid->is_open = false; + cfid->on_list = false; /* To prevent race with smb2_cached_lease_break() */ kref_get(&cfid->refcount); } spin_unlock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &entry, entry) { - cfid->on_list = false; list_del(&cfid->entry); cancel_work_sync(&cfid->lease_break); if (cfid->has_lease) { -- GitLab From 30b2d7f8f13664655480d6af45f60270b3eb6736 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu <zhangxiaoxu5@huawei.com> Date: Tue, 18 Oct 2022 11:49:16 +0800 Subject: [PATCH 2087/2223] cifs: Fix memory leak when build ntlmssp negotiate blob failed There is a memory leak when mount cifs: unreferenced object 0xffff888166059600 (size 448): comm "mount.cifs", pid 51391, jiffies 4295596373 (age 330.596s) hex dump (first 32 bytes): fe 53 4d 42 40 00 00 00 00 00
00 00 01 00 82 00 .SMB@........... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000060609a61>] mempool_alloc+0xe1/0x260 [<00000000adfa6c63>] cifs_small_buf_get+0x24/0x60 [<00000000ebb404c7>] __smb2_plain_req_init+0x32/0x460 [<00000000bcf875b4>] SMB2_sess_alloc_buffer+0xa4/0x3f0 [<00000000753a2987>] SMB2_sess_auth_rawntlmssp_negotiate+0xf5/0x480 [<00000000f0c1f4f9>] SMB2_sess_setup+0x253/0x410 [<00000000a8b83303>] cifs_setup_session+0x18f/0x4c0 [<00000000854bd16d>] cifs_get_smb_ses+0xae7/0x13c0 [<000000006cbc43d9>] mount_get_conns+0x7a/0x730 [<000000005922d816>] cifs_mount+0x103/0xd10 [<00000000e33def3b>] cifs_smb3_do_mount+0x1dd/0xc90 [<0000000078034979>] smb3_get_tree+0x1d5/0x300 [<000000004371f980>] vfs_get_tree+0x41/0xf0 [<00000000b670d8a7>] path_mount+0x9b3/0xdd0 [<000000005e839a7d>] __x64_sys_mount+0x190/0x1d0 [<000000009404c3b9>] do_syscall_64+0x35/0x80 When build ntlmssp negotiate blob failed, the session setup request should be freed. Fixes: 49bd49f983b5 ("cifs: send workstation name during ntlmssp session setup") Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Reviewed-by: Shyam Prasad N <sprasad@microsoft.com> Signed-off-by: Zhang Xiaoxu <zhangxiaoxu5@huawei.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index a2384509ea84b..c930b63bc422f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1531,7 +1531,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) &blob_length, ses, server, sess_data->nls_cp); if (rc) - goto out_err; + goto out; if (use_spnego) { /* BB eventually need to add this */ -- GitLab From 141b3523e9be6f15577acf4bbc3bc1f82d81d6d1 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka <mpatocka@redhat.com> Date: Tue, 18 Oct 2022 10:06:45 -0400 Subject: [PATCH 2088/2223] dm bufio: use the acquire memory barrier when testing for B_READING The function 
test_bit doesn't provide any memory barrier. It may be possible that the read requests that follow test_bit(B_READING, &b->state) are reordered before the test, reading invalid data that existed before B_READING was cleared. Fix this bug by changing test_bit to test_bit_acquire. This is particularly important on arches with weak(er) memory ordering (e.g. arm64). Depends-On: 8238b4579866 ("wait_on_bit: add an acquire memory barrier") Depends-On: d6ffe6067a54 ("provide arch_test_bit_acquire for architectures that define test_bit") Cc: stable@vger.kernel.org Signed-off-by: Mikulas Patocka <mpatocka@redhat.com> Signed-off-by: Mike Snitzer <snitzer@kernel.org> --- drivers/md/dm-bufio.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 09c7ed2650ca4..9c5ef818ca365 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -795,7 +795,8 @@ static void __make_buffer_clean(struct dm_buffer *b) { BUG_ON(b->hold_count); - if (!b->state) /* fast case */ + /* smp_load_acquire() pairs with read_endio()'s smp_mb__before_atomic() */ + if (!smp_load_acquire(&b->state)) /* fast case */ return; wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); @@ -816,7 +817,7 @@ static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c) BUG_ON(test_bit(B_DIRTY, &b->state)); if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep && - unlikely(test_bit(B_READING, &b->state))) + unlikely(test_bit_acquire(B_READING, &b->state))) continue; if (!b->hold_count) { @@ -1058,7 +1059,7 @@ found_buffer: * If the user called both dm_bufio_prefetch and dm_bufio_get on * the same buffer, it would deadlock if we waited. */ - if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state))) + if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) return NULL; b->hold_count++; @@ -1218,7 +1219,7 @@ void dm_bufio_release(struct dm_buffer *b) * invalid buffer. 
*/ if ((b->read_error || b->write_error) && - !test_bit(B_READING, &b->state) && + !test_bit_acquire(B_READING, &b->state) && !test_bit(B_WRITING, &b->state) && !test_bit(B_DIRTY, &b->state)) { __unlink_buffer(b); @@ -1479,7 +1480,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_release_move); static void forget_buffer_locked(struct dm_buffer *b) { - if (likely(!b->hold_count) && likely(!b->state)) { + if (likely(!b->hold_count) && likely(!smp_load_acquire(&b->state))) { __unlink_buffer(b); __free_buffer_wake(b); } @@ -1639,7 +1640,7 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) { if (!(gfp & __GFP_FS) || (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) { - if (test_bit(B_READING, &b->state) || + if (test_bit_acquire(B_READING, &b->state) || test_bit(B_WRITING, &b->state) || test_bit(B_DIRTY, &b->state)) return false; -- GitLab From 67bf6493449b09590f9f71d7df29efb392b12d25 Mon Sep 17 00:00:00 2001 From: Babu Moger <babu.moger@amd.com> Date: Tue, 27 Sep 2022 15:16:29 -0500 Subject: [PATCH 2089/2223] x86/resctrl: Fix min_cbm_bits for AMD AMD systems support zero CBM (capacity bit mask) for cache allocation. That is reflected in rdt_init_res_defs_amd() by: r->cache.arch_has_empty_bitmaps = true; However given the unified code in cbm_validate(), checking for: val == 0 && !arch_has_empty_bitmaps is not enough because of another check in cbm_validate(): if ((zero_bit - first_bit) < r->cache.min_cbm_bits) The default value of r->cache.min_cbm_bits = 1. Leading to: $ cd /sys/fs/resctrl $ mkdir foo $ cd foo $ echo L3:0=0 > schemata -bash: echo: write error: Invalid argument $ cat /sys/fs/resctrl/info/last_cmd_status Need at least 1 bits in the mask Initialize the min_cbm_bits to 0 for AMD. Also, remove the default setting of min_cbm_bits and initialize it separately. 
After the fix: $ cd /sys/fs/resctrl $ mkdir foo $ cd foo $ echo L3:0=0 > schemata $ cat /sys/fs/resctrl/info/last_cmd_status ok Fixes: 316e7f901f5a ("x86/resctrl: Add struct rdt_cache::arch_has_{sparse, empty}_bitmaps") Co-developed-by: Stephane Eranian <eranian@google.com> Signed-off-by: Stephane Eranian <eranian@google.com> Signed-off-by: Babu Moger <babu.moger@amd.com> Signed-off-by: Borislav Petkov <bp@suse.de> Reviewed-by: Ingo Molnar <mingo@kernel.org> Reviewed-by: James Morse <james.morse@arm.com> Reviewed-by: Reinette Chatre <reinette.chatre@intel.com> Reviewed-by: Fenghua Yu <fenghua.yu@intel.com> Cc: <stable@vger.kernel.org> Link: https://lore.kernel.org/lkml/20220517001234.3137157-1-eranian@google.com --- arch/x86/kernel/cpu/resctrl/core.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index de62b0b87cedf..3266ea36667c3 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -66,9 +66,6 @@ struct rdt_hw_resource rdt_resources_all[] = { .rid = RDT_RESOURCE_L3, .name = "L3", .cache_level = 3, - .cache = { - .min_cbm_bits = 1, - }, .domains = domain_init(RDT_RESOURCE_L3), .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", @@ -83,9 +80,6 @@ struct rdt_hw_resource rdt_resources_all[] = { .rid = RDT_RESOURCE_L2, .name = "L2", .cache_level = 2, - .cache = { - .min_cbm_bits = 1, - }, .domains = domain_init(RDT_RESOURCE_L2), .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", @@ -836,6 +830,7 @@ static __init void rdt_init_res_defs_intel(void) r->cache.arch_has_sparse_bitmaps = false; r->cache.arch_has_empty_bitmaps = false; r->cache.arch_has_per_cpu_cfg = false; + r->cache.min_cbm_bits = 1; } else if (r->rid == RDT_RESOURCE_MBA) { hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE; hw_res->msr_update = mba_wrmsr_intel; @@ -856,6 +851,7 @@ static __init void rdt_init_res_defs_amd(void) r->cache.arch_has_sparse_bitmaps = true; 
r->cache.arch_has_empty_bitmaps = true; r->cache.arch_has_per_cpu_cfg = true; + r->cache.min_cbm_bits = 0; } else if (r->rid == RDT_RESOURCE_MBA) { hw_res->msr_base = MSR_IA32_MBA_BW_BASE; hw_res->msr_update = mba_wrmsr_amd; -- GitLab From 43e6c111824c75940a586cd7d3fe6a5ff1d5104f Mon Sep 17 00:00:00 2001 From: Mikulas Patocka <mpatocka@redhat.com> Date: Wed, 24 Aug 2022 07:25:57 -0400 Subject: [PATCH 2090/2223] dm: change from DMWARN to DMERR or DMCRIT for fatal errors Change DMWARN to DMERR in cases when there is an unrecoverable error. Change DMWARN to DMCRIT when handling of a case is unimplemented. Signed-off-by: Mikulas Patocka <mpatocka@redhat.com> Signed-off-by: Mike Snitzer <snitzer@kernel.org> --- drivers/md/dm-ioctl.c | 78 +++++++++++++++++++++---------------------- drivers/md/dm-rq.c | 4 +-- drivers/md/dm-stats.c | 2 +- drivers/md/dm-table.c | 78 +++++++++++++++++++++---------------------- drivers/md/dm.c | 8 ++--- 5 files changed, 85 insertions(+), 85 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 98976aaa9db9a..6b3f867d0b707 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -434,10 +434,10 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, hc = __get_name_cell(new); if (hc) { - DMWARN("Unable to change %s on mapped device %s to one that " - "already exists: %s", - change_uuid ? "uuid" : "name", - param->name, new); + DMERR("Unable to change %s on mapped device %s to one that " + "already exists: %s", + change_uuid ? "uuid" : "name", + param->name, new); dm_put(hc->md); up_write(&_hash_lock); kfree(new_data); @@ -449,8 +449,8 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, */ hc = __get_name_cell(param->name); if (!hc) { - DMWARN("Unable to rename non-existent device, %s to %s%s", - param->name, change_uuid ? "uuid " : "", new); + DMERR("Unable to rename non-existent device, %s to %s%s", + param->name, change_uuid ? 
"uuid " : "", new); up_write(&_hash_lock); kfree(new_data); return ERR_PTR(-ENXIO); @@ -460,9 +460,9 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, * Does this device already have a uuid? */ if (change_uuid && hc->uuid) { - DMWARN("Unable to change uuid of mapped device %s to %s " - "because uuid is already set to %s", - param->name, new, hc->uuid); + DMERR("Unable to change uuid of mapped device %s to %s " + "because uuid is already set to %s", + param->name, new, hc->uuid); dm_put(hc->md); up_write(&_hash_lock); kfree(new_data); @@ -750,7 +750,7 @@ static int get_target_version(struct file *filp, struct dm_ioctl *param, size_t static int check_name(const char *name) { if (strchr(name, '/')) { - DMWARN("invalid device name"); + DMERR("invalid device name"); return -EINVAL; } @@ -773,7 +773,7 @@ static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *src down_read(&_hash_lock); hc = dm_get_mdptr(md); if (!hc || hc->md != md) { - DMWARN("device has been removed from the dev hash table."); + DMERR("device has been removed from the dev hash table."); goto out; } @@ -1026,7 +1026,7 @@ static int dev_rename(struct file *filp, struct dm_ioctl *param, size_t param_si if (new_data < param->data || invalid_str(new_data, (void *) param + param_size) || !*new_data || strlen(new_data) > (change_uuid ? 
DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) { - DMWARN("Invalid new mapped device name or uuid string supplied."); + DMERR("Invalid new mapped device name or uuid string supplied."); return -EINVAL; } @@ -1061,7 +1061,7 @@ static int dev_set_geometry(struct file *filp, struct dm_ioctl *param, size_t pa if (geostr < param->data || invalid_str(geostr, (void *) param + param_size)) { - DMWARN("Invalid geometry supplied."); + DMERR("Invalid geometry supplied."); goto out; } @@ -1069,13 +1069,13 @@ static int dev_set_geometry(struct file *filp, struct dm_ioctl *param, size_t pa indata + 1, indata + 2, indata + 3, &dummy); if (x != 4) { - DMWARN("Unable to interpret geometry settings."); + DMERR("Unable to interpret geometry settings."); goto out; } if (indata[0] > 65535 || indata[1] > 255 || indata[2] > 255 || indata[3] > ULONG_MAX) { - DMWARN("Geometry exceeds range limits."); + DMERR("Geometry exceeds range limits."); goto out; } @@ -1387,7 +1387,7 @@ static int populate_table(struct dm_table *table, char *target_params; if (!param->target_count) { - DMWARN("populate_table: no targets specified"); + DMERR("populate_table: no targets specified"); return -EINVAL; } @@ -1395,7 +1395,7 @@ static int populate_table(struct dm_table *table, r = next_target(spec, next, end, &spec, &target_params); if (r) { - DMWARN("unable to find target"); + DMERR("unable to find target"); return r; } @@ -1404,7 +1404,7 @@ static int populate_table(struct dm_table *table, (sector_t) spec->length, target_params); if (r) { - DMWARN("error adding target to table"); + DMERR("error adding target to table"); return r; } @@ -1451,8 +1451,8 @@ static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_si if (immutable_target_type && (immutable_target_type != dm_table_get_immutable_target_type(t)) && !dm_table_get_wildcard_target(t)) { - DMWARN("can't replace immutable target type %s", - immutable_target_type->name); + DMERR("can't replace immutable target type %s", + 
immutable_target_type->name); r = -EINVAL; goto err_unlock_md_type; } @@ -1461,12 +1461,12 @@ static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_si /* setup md->queue to reflect md's type (may block) */ r = dm_setup_md_queue(md, t); if (r) { - DMWARN("unable to set up device queue for new table."); + DMERR("unable to set up device queue for new table."); goto err_unlock_md_type; } } else if (!is_valid_type(dm_get_md_type(md), dm_table_get_type(t))) { - DMWARN("can't change device type (old=%u vs new=%u) after initial table load.", - dm_get_md_type(md), dm_table_get_type(t)); + DMERR("can't change device type (old=%u vs new=%u) after initial table load.", + dm_get_md_type(md), dm_table_get_type(t)); r = -EINVAL; goto err_unlock_md_type; } @@ -1477,7 +1477,7 @@ static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_si down_write(&_hash_lock); hc = dm_get_mdptr(md); if (!hc || hc->md != md) { - DMWARN("device has been removed from the dev hash table."); + DMERR("device has been removed from the dev hash table."); up_write(&_hash_lock); r = -ENXIO; goto err_destroy_table; @@ -1686,19 +1686,19 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para if (tmsg < (struct dm_target_msg *) param->data || invalid_str(tmsg->message, (void *) param + param_size)) { - DMWARN("Invalid target message parameters."); + DMERR("Invalid target message parameters."); r = -EINVAL; goto out; } r = dm_split_args(&argc, &argv, tmsg->message); if (r) { - DMWARN("Failed to split target message parameters"); + DMERR("Failed to split target message parameters"); goto out; } if (!argc) { - DMWARN("Empty message received."); + DMERR("Empty message received."); r = -EINVAL; goto out_argv; } @@ -1718,12 +1718,12 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para ti = dm_table_find_target(table, tmsg->sector); if (!ti) { - DMWARN("Target message sector outside device."); + DMERR("Target 
message sector outside device."); r = -EINVAL; } else if (ti->type->message) r = ti->type->message(ti, argc, argv, result, maxlen); else { - DMWARN("Target type does not support messages"); + DMERR("Target type does not support messages"); r = -EINVAL; } @@ -1814,11 +1814,11 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user) if ((DM_VERSION_MAJOR != version[0]) || (DM_VERSION_MINOR < version[1])) { - DMWARN("ioctl interface mismatch: " - "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)", - DM_VERSION_MAJOR, DM_VERSION_MINOR, - DM_VERSION_PATCHLEVEL, - version[0], version[1], version[2], cmd); + DMERR("ioctl interface mismatch: " + "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)", + DM_VERSION_MAJOR, DM_VERSION_MINOR, + DM_VERSION_PATCHLEVEL, + version[0], version[1], version[2], cmd); r = -EINVAL; } @@ -1927,11 +1927,11 @@ static int validate_params(uint cmd, struct dm_ioctl *param) if (cmd == DM_DEV_CREATE_CMD) { if (!*param->name) { - DMWARN("name not supplied when creating device"); + DMERR("name not supplied when creating device"); return -EINVAL; } } else if (*param->uuid && *param->name) { - DMWARN("only supply one of name or uuid, cmd(%u)", cmd); + DMERR("only supply one of name or uuid, cmd(%u)", cmd); return -EINVAL; } @@ -1978,7 +1978,7 @@ static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *us fn = lookup_ioctl(cmd, &ioctl_flags); if (!fn) { - DMWARN("dm_ctl_ioctl: unknown command 0x%x", command); + DMERR("dm_ctl_ioctl: unknown command 0x%x", command); return -ENOTTY; } @@ -2203,7 +2203,7 @@ int __init dm_early_create(struct dm_ioctl *dmi, (sector_t) spec_array[i]->length, target_params_array[i]); if (r) { - DMWARN("error adding target to table"); + DMERR("error adding target to table"); goto err_destroy_table; } } @@ -2216,7 +2216,7 @@ int __init dm_early_create(struct dm_ioctl *dmi, /* setup md->queue to reflect md's type (may block) */ r = dm_setup_md_queue(md, t); if (r) { - DMWARN("unable to set up device queue 
for new table."); + DMERR("unable to set up device queue for new table."); goto err_destroy_table; } diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 3001b10a3fbfb..a41209a43506c 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -238,7 +238,7 @@ static void dm_done(struct request *clone, blk_status_t error, bool mapped) dm_requeue_original_request(tio, true); break; default: - DMWARN("unimplemented target endio return value: %d", r); + DMCRIT("unimplemented target endio return value: %d", r); BUG(); } } @@ -409,7 +409,7 @@ static int map_request(struct dm_rq_target_io *tio) dm_kill_unmapped_request(rq, BLK_STS_IOERR); break; default: - DMWARN("unimplemented target map return value: %d", r); + DMCRIT("unimplemented target map return value: %d", r); BUG(); } diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 8326f9fe0e912..f105a71915ab6 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -1220,7 +1220,7 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv, return 2; /* this wasn't a stats message */ if (r == -EINVAL) - DMWARN("Invalid parameters for message %s", argv[0]); + DMCRIT("Invalid parameters for message %s", argv[0]); return r; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index d8034ff0cb241..078da18bb86d8 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -234,12 +234,12 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, return 0; if ((start >= dev_size) || (start + len > dev_size)) { - DMWARN("%s: %pg too small for target: " - "start=%llu, len=%llu, dev_size=%llu", - dm_device_name(ti->table->md), bdev, - (unsigned long long)start, - (unsigned long long)len, - (unsigned long long)dev_size); + DMERR("%s: %pg too small for target: " + "start=%llu, len=%llu, dev_size=%llu", + dm_device_name(ti->table->md), bdev, + (unsigned long long)start, + (unsigned long long)len, + (unsigned long long)dev_size); return 1; } @@ -251,10 +251,10 
@@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, unsigned int zone_sectors = bdev_zone_sectors(bdev); if (start & (zone_sectors - 1)) { - DMWARN("%s: start=%llu not aligned to h/w zone size %u of %pg", - dm_device_name(ti->table->md), - (unsigned long long)start, - zone_sectors, bdev); + DMERR("%s: start=%llu not aligned to h/w zone size %u of %pg", + dm_device_name(ti->table->md), + (unsigned long long)start, + zone_sectors, bdev); return 1; } @@ -268,10 +268,10 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, * the sector range. */ if (len & (zone_sectors - 1)) { - DMWARN("%s: len=%llu not aligned to h/w zone size %u of %pg", - dm_device_name(ti->table->md), - (unsigned long long)len, - zone_sectors, bdev); + DMERR("%s: len=%llu not aligned to h/w zone size %u of %pg", + dm_device_name(ti->table->md), + (unsigned long long)len, + zone_sectors, bdev); return 1; } } @@ -280,20 +280,20 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, return 0; if (start & (logical_block_size_sectors - 1)) { - DMWARN("%s: start=%llu not aligned to h/w " - "logical block size %u of %pg", - dm_device_name(ti->table->md), - (unsigned long long)start, - limits->logical_block_size, bdev); + DMERR("%s: start=%llu not aligned to h/w " + "logical block size %u of %pg", + dm_device_name(ti->table->md), + (unsigned long long)start, + limits->logical_block_size, bdev); return 1; } if (len & (logical_block_size_sectors - 1)) { - DMWARN("%s: len=%llu not aligned to h/w " - "logical block size %u of %pg", - dm_device_name(ti->table->md), - (unsigned long long)len, - limits->logical_block_size, bdev); + DMERR("%s: len=%llu not aligned to h/w " + "logical block size %u of %pg", + dm_device_name(ti->table->md), + (unsigned long long)len, + limits->logical_block_size, bdev); return 1; } @@ -434,8 +434,8 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d) } } if (!found) { - DMWARN("%s: device %s not in 
table devices list", - dm_device_name(ti->table->md), d->name); + DMERR("%s: device %s not in table devices list", + dm_device_name(ti->table->md), d->name); return; } if (refcount_dec_and_test(&dd->count)) { @@ -618,12 +618,12 @@ static int validate_hardware_logical_block_alignment(struct dm_table *t, } if (remaining) { - DMWARN("%s: table line %u (start sect %llu len %llu) " - "not aligned to h/w logical block size %u", - dm_device_name(t->md), i, - (unsigned long long) ti->begin, - (unsigned long long) ti->len, - limits->logical_block_size); + DMERR("%s: table line %u (start sect %llu len %llu) " + "not aligned to h/w logical block size %u", + dm_device_name(t->md), i, + (unsigned long long) ti->begin, + (unsigned long long) ti->len, + limits->logical_block_size); return -EINVAL; } @@ -1008,7 +1008,7 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * struct dm_md_mempools *pools; if (unlikely(type == DM_TYPE_NONE)) { - DMWARN("no table type is set, can't allocate mempools"); + DMERR("no table type is set, can't allocate mempools"); return -EINVAL; } @@ -1112,7 +1112,7 @@ static bool integrity_profile_exists(struct gendisk *disk) * Get a disk whose integrity profile reflects the table's profile. * Returns NULL if integrity support was inconsistent or unavailable. */ -static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t) +static struct gendisk *dm_table_get_integrity_disk(struct dm_table *t) { struct list_head *devices = dm_table_get_devices(t); struct dm_dev_internal *dd = NULL; @@ -1185,10 +1185,10 @@ static int dm_table_register_integrity(struct dm_table *t) * profile the new profile should not conflict. 
*/ if (blk_integrity_compare(dm_disk(md), template_disk) < 0) { - DMWARN("%s: conflict with existing integrity profile: " - "%s profile mismatch", - dm_device_name(t->md), - template_disk->disk_name); + DMERR("%s: conflict with existing integrity profile: " + "%s profile mismatch", + dm_device_name(t->md), + template_disk->disk_name); return 1; } @@ -1327,7 +1327,7 @@ static int dm_table_construct_crypto_profile(struct dm_table *t) if (t->md->queue && !blk_crypto_has_capabilities(profile, t->md->queue->crypto_profile)) { - DMWARN("Inline encryption capabilities of new DM table were more restrictive than the old table's. This is not supported!"); + DMERR("Inline encryption capabilities of new DM table were more restrictive than the old table's. This is not supported!"); dm_destroy_crypto_profile(profile); return -EINVAL; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 60549b65c799c..7c35dea88ed1d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -864,7 +864,7 @@ int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; if (geo->start > sz) { - DMWARN("Start sector is beyond the geometry limits."); + DMERR("Start sector is beyond the geometry limits."); return -EINVAL; } @@ -1149,7 +1149,7 @@ static void clone_endio(struct bio *bio) /* The target will handle the io */ return; default: - DMWARN("unimplemented target endio return value: %d", r); + DMCRIT("unimplemented target endio return value: %d", r); BUG(); } } @@ -1455,7 +1455,7 @@ static void __map_bio(struct bio *clone) dm_io_dec_pending(io, BLK_STS_DM_REQUEUE); break; default: - DMWARN("unimplemented target map return value: %d", r); + DMCRIT("unimplemented target map return value: %d", r); BUG(); } } @@ -2005,7 +2005,7 @@ static struct mapped_device *alloc_dev(int minor) md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); if (!md) { - DMWARN("unable to allocate device, out of memory."); + DMERR("unable to 
allocate device, out of memory."); return NULL; } -- GitLab From cea446630feab57f49d47abccf206e9725019cce Mon Sep 17 00:00:00 2001 From: Jilin Yuan <yuanjilin@cdjrlc.com> Date: Tue, 30 Aug 2022 23:33:45 +0800 Subject: [PATCH 2091/2223] dm raid: delete the redundant word 'that' in comment Signed-off-by: Jilin Yuan <yuanjilin@cdjrlc.com> Signed-off-by: Mike Snitzer <snitzer@kernel.org> --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c640be453313e..e448fd45a9147 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2529,7 +2529,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) * of the "sync" directive. * * With reshaping capability added, we must ensure that - * that the "sync" directive is disallowed during the reshape. + * the "sync" directive is disallowed during the reshape. */ if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) continue; -- GitLab From afd41fff9c73ccc3757e94ad727d2a9ac4d7f6cb Mon Sep 17 00:00:00 2001 From: Nathan Huckleberry <nhuck@google.com> Date: Tue, 30 Aug 2022 18:44:44 +0000 Subject: [PATCH 2092/2223] dm verity: enable WQ_HIGHPRI on verify_wq WQ_HIGHPRI increases throughput and decreases disk latency when using dm-verity. This is important in Android for camera startup speed. The following tests were run by doing 60 seconds of random reads using a dm-verity device backed by two ramdisks. Without WQ_HIGHPRI lat (usec): min=13, max=3947, avg=69.53, stdev=50.55 READ: bw=51.1MiB/s (53.6MB/s), 51.1MiB/s-51.1MiB/s (53.6MB/s-53.6MB/s) With WQ_HIGHPRI: lat (usec): min=13, max=7854, avg=31.15, stdev=30.42 READ: bw=116MiB/s (121MB/s), 116MiB/s-116MiB/s (121MB/s-121MB/s) Further testing was done by measuring how long it takes to open a camera on an Android device. 
Without WQ_HIGHPRI Total verity work queue wait times (ms): 880.960, 789.517, 898.852 With WQ_HIGHPRI: Total verity work queue wait times (ms): 528.824, 439.191, 433.300 The average time to open the camera is reduced by 350ms (or 40-50%). Signed-off-by: Nathan Huckleberry <nhuck@google.com> Signed-off-by: Mike Snitzer <snitzer@kernel.org> --- drivers/md/dm-verity-target.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 8a00cc42e4985..ccf5b852fbf7a 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1401,14 +1401,16 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) /* WQ_UNBOUND greatly improves performance when running on ramdisk */ wq_flags = WQ_MEM_RECLAIM | WQ_UNBOUND; - if (v->use_tasklet) { - /* - * Allow verify_wq to preempt softirq since verification in - * tasklet will fall-back to using it for error handling - * (or if the bufio cache doesn't have required hashes). - */ - wq_flags |= WQ_HIGHPRI; - } + /* + * Using WQ_HIGHPRI improves throughput and completion latency by + * reducing wait times when reading from a dm-verity device. + * + * Also as required for the "try_verify_in_tasklet" feature: WQ_HIGHPRI + * allows verify_wq to preempt softirq since verification in tasklet + * will fall-back to using it for error handling (or if the bufio cache + * doesn't have required hashes). 
+ */ + wq_flags |= WQ_HIGHPRI; v->verify_wq = alloc_workqueue("kverityd", wq_flags, num_online_cpus()); if (!v->verify_wq) { ti->error = "Cannot allocate workqueue"; -- GitLab From 96fccdce97ce647d5c7bf1db0d3159cc90774054 Mon Sep 17 00:00:00 2001 From: Jiangshan Yi <yijiangshan@kylinos.cn> Date: Mon, 5 Sep 2022 10:45:52 +0800 Subject: [PATCH 2093/2223] dm raid: fix typo in analyse_superblocks code comment Reported-by: k2ci <kernel-bot@kylinos.cn> Signed-off-by: Jiangshan Yi <yijiangshan@kylinos.cn> Signed-off-by: Mike Snitzer <snitzer@kernel.org> --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index e448fd45a9147..54263679a7b14 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2590,7 +2590,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) /* * Adjust data_offset and new_data_offset on all disk members of @rs - * for out of place reshaping if requested by contructor + * for out of place reshaping if requested by constructor * * We need free space at the beginning of each raid disk for forward * and at the end for backward reshapes which userspace has to provide -- GitLab From 48d1a964dca532698bc67ac71c04df7908815de1 Mon Sep 17 00:00:00 2001 From: Shaomin Deng <dengshaomin@cdjrlc.com> Date: Sun, 4 Sep 2022 12:04:27 -0400 Subject: [PATCH 2094/2223] dm cache: delete the redundant word 'each' in comment Signed-off-by: Shaomin Deng <dengshaomin@cdjrlc.com> Signed-off-by: Mike Snitzer <snitzer@kernel.org> --- drivers/md/dm-cache-policy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h index c05fc3436cef7..06eb31af626f1 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h @@ -166,7 +166,7 @@ struct dm_cache_policy_type { struct dm_cache_policy_type *real; /* - * Policies may store a hint for each each cache block. 
+ * Policies may store a hint for each cache block. * Currently the size of this hint must be 0 or 4 bytes but we * expect to relax this in future. */ -- GitLab From dc3efedf9f7b802d0817183020ed01cb0c120fe8 Mon Sep 17 00:00:00 2001 From: Milan Broz <gmazyland@gmail.com> Date: Tue, 27 Sep 2022 20:42:26 +0200 Subject: [PATCH 2095/2223] dm verity: Add documentation for try_verify_in_tasklet option Add documentation that was missing from commit 5721d4e5a9cd ("dm verity: Add optional "try_verify_in_tasklet" feature"). Signed-off-by: Milan Broz <gmazyland@gmail.com> Signed-off-by: Mike Snitzer <snitzer@kernel.org> --- Documentation/admin-guide/device-mapper/verity.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/admin-guide/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst index 1a6b91368e594..a65c1602cb239 100644 --- a/Documentation/admin-guide/device-mapper/verity.rst +++ b/Documentation/admin-guide/device-mapper/verity.rst @@ -141,6 +141,10 @@ root_hash_sig_key_desc <key_description> also gain new certificates at run time if they are signed by a certificate already in the secondary trusted keyring. +try_verify_in_tasklet + If verity hashes are in cache, verify data blocks in kernel tasklet instead + of workqueue. This option can reduce IO latency. 
+ Theory of operation =================== -- GitLab From 99f4f5bcb975527508eb7a5e3e34bdb91d576746 Mon Sep 17 00:00:00 2001 From: Genjian Zhang <zhanggenjian@kylinos.cn> Date: Thu, 29 Sep 2022 16:20:36 +0800 Subject: [PATCH 2096/2223] dm: remove unnecessary assignment statement in alloc_dev() Fixes: 74fe6ba923949 ("dm: convert to blk_alloc_disk/blk_cleanup_disk") Signed-off-by: Genjian Zhang <zhanggenjian@kylinos.cn> Signed-off-by: Mike Snitzer <snitzer@kernel.org> --- drivers/md/dm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7c35dea88ed1d..95a1ee3d314eb 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2065,7 +2065,6 @@ static struct mapped_device *alloc_dev(int minor) md->disk->minors = 1; md->disk->flags |= GENHD_FL_NO_PART; md->disk->fops = &dm_blk_dops; - md->disk->queue = md->queue; md->disk->private_data = md; sprintf(md->disk->disk_name, "dm-%d", minor); -- GitLab From 5434ee8d28575b2e784bd5b4dbfc912e5da90759 Mon Sep 17 00:00:00 2001 From: Nikos Tsironis <ntsironis@arrikto.com> Date: Thu, 29 Sep 2022 17:11:48 +0300 Subject: [PATCH 2097/2223] dm clone: Fix typo in block_device format specifier Use %pg for printing the block device name, instead of %pd. 
Fixes: 385411ffba0c ("dm: stop using bdevname") Cc: stable@vger.kernel.org # v5.18+ Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com> Signed-off-by: Mike Snitzer <snitzer@kernel.org> --- drivers/md/dm-clone-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index 811b0a5379d03..2f1cc66d26412 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -2035,7 +2035,7 @@ static void disable_passdown_if_not_supported(struct clone *clone) reason = "max discard sectors smaller than a region"; if (reason) { - DMWARN("Destination device (%pd) %s: Disabling discard passdown.", + DMWARN("Destination device (%pg) %s: Disabling discard passdown.", dest_dev, reason); clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); } -- GitLab From 65f8682b9aaae20c2cdee993e6fe52374ad513c9 Mon Sep 17 00:00:00 2001 From: Danijel Slivka <danijel.slivka@amd.com> Date: Tue, 4 Oct 2022 15:39:44 +0200 Subject: [PATCH 2098/2223] drm/amdgpu: set vm_update_mode=0 as default for Sienna Cichlid in SRIOV case For asic with VF MMIO access protection avoid using CPU for VM table updates. CPU pagetable updates have issues with HDP flush as VF MMIO access protection blocks write to mmBIF_BX_DEV0_EPF0_VF0_HDP_MEM_COHERENCY_FLUSH_CNTL register during sriov runtime. 
v3: introduce virtualization capability flag AMDGPU_VF_MMIO_ACCESS_PROTECT which indicates that VF MMIO write access is not allowed in sriov runtime Signed-off-by: Danijel Slivka <danijel.slivka@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 6 ++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 4 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 6 +++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index e4af40b9a8aac..9c765b04aae3b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -726,6 +726,12 @@ void amdgpu_detect_virtualization(struct amdgpu_device *adev) adev->virt.caps |= AMDGPU_PASSTHROUGH_MODE; } + if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) + /* VF MMIO access (except mailbox range) from CPU + * will be blocked during sriov runtime + */ + adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; + /* we have the ability to check now */ if (amdgpu_sriov_vf(adev)) { switch (adev->asic_type) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index d94c31e68a147..49c4347d154ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -31,6 +31,7 @@ #define AMDGPU_SRIOV_CAPS_IS_VF (1 << 2) /* this GPU is a virtual function */ #define AMDGPU_PASSTHROUGH_MODE (1 << 3) /* thw whole GPU is pass through for VM */ #define AMDGPU_SRIOV_CAPS_RUNTIME (1 << 4) /* is out of full access mode */ +#define AMDGPU_VF_MMIO_ACCESS_PROTECT (1 << 5) /* MMIO write access is not allowed in sriov runtime */ /* flags for indirect register access path supported by rlcg for sriov */ #define AMDGPU_RLCG_GC_WRITE_LEGACY (0x8 << 28) @@ -297,6 +298,9 @@ struct amdgpu_video_codec_info; #define amdgpu_passthrough(adev) \ 
((adev)->virt.caps & AMDGPU_PASSTHROUGH_MODE) +#define amdgpu_sriov_vf_mmio_access_protection(adev) \ +((adev)->virt.caps & AMDGPU_VF_MMIO_ACCESS_PROTECT) + static inline bool is_virtual_machine(void) { #if defined(CONFIG_X86) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 83b0c5d86e480..2291aa14d888c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2338,7 +2338,11 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev) */ #ifdef CONFIG_X86_64 if (amdgpu_vm_update_mode == -1) { - if (amdgpu_gmc_vram_full_visible(&adev->gmc)) + /* For asic with VF MMIO access protection + * avoid using CPU for VM table updates + */ + if (amdgpu_gmc_vram_full_visible(&adev->gmc) && + !amdgpu_sriov_vf_mmio_access_protection(adev)) adev->vm_manager.vm_update_mode = AMDGPU_VM_USE_CPU_FOR_COMPUTE; else -- GitLab From afbaa15501125ae0b7de9dd16c6f00c85de14218 Mon Sep 17 00:00:00 2001 From: Victor Zhao <Victor.Zhao@amd.com> Date: Thu, 13 Oct 2022 10:42:04 +0800 Subject: [PATCH 2099/2223] Revert "drm/amdgpu: add debugfs amdgpu_reset_level" This reverts commit 5bd8d53f6fa53eab5433698d1362dae2aa53c1cc. This commit breaks the reset logic for aldebaran, revert it for now. Will move the mask inside the reset handler. 
Fixes: 5bd8d53f6fa53e ("drm/amdgpu: add debugfs amdgpu_reset_level") Signed-off-by: Victor Zhao <Victor.Zhao@amd.com> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 ---- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 2 -- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 8 -------- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 3 --- 4 files changed, 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index ae9371b172e3a..8639a4f9c6e8c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -274,9 +274,6 @@ extern int amdgpu_vcnfw_log; #define AMDGPU_RESET_VCE (1 << 13) #define AMDGPU_RESET_VCE1 (1 << 14) -#define AMDGPU_RESET_LEVEL_SOFT_RECOVERY (1 << 0) -#define AMDGPU_RESET_LEVEL_MODE2 (1 << 1) - /* max cursor sizes (in pixels) */ #define CIK_CURSOR_WIDTH 128 #define CIK_CURSOR_HEIGHT 128 @@ -1065,7 +1062,6 @@ struct amdgpu_device { struct work_struct reset_work; - uint32_t amdgpu_reset_level_mask; bool job_hang; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 6066aebf491cf..de61a85c4b022 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1954,8 +1954,6 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev) return PTR_ERR(ent); } - debugfs_create_u32("amdgpu_reset_level", 0600, root, &adev->amdgpu_reset_level_mask); - /* Register debugfs entries for amdgpu_ttm */ amdgpu_ttm_debugfs_init(adev); amdgpu_debugfs_pm_init(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index 9da5ead50c900..831fb222139c6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -37,8 +37,6 @@ int amdgpu_reset_init(struct amdgpu_device *adev) { int ret = 0; - adev->amdgpu_reset_level_mask 
= 0x1; - switch (adev->ip_versions[MP1_HWIP][0]) { case IP_VERSION(13, 0, 2): ret = aldebaran_reset_init(adev); @@ -76,9 +74,6 @@ int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev, { struct amdgpu_reset_handler *reset_handler = NULL; - if (!(adev->amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_MODE2)) - return -ENOSYS; - if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags)) return -ENOSYS; @@ -98,9 +93,6 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev, int ret; struct amdgpu_reset_handler *reset_handler = NULL; - if (!(adev->amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_MODE2)) - return -ENOSYS; - if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags)) return -ENOSYS; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 3e316b013fd95..d3558c34d406c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -405,9 +405,6 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid, { ktime_t deadline = ktime_add_us(ktime_get(), 10000); - if (!(ring->adev->amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_SOFT_RECOVERY)) - return false; - if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence) return false; -- GitLab From a340847b0214aa9b8fd9839f7b2822ccc607edab Mon Sep 17 00:00:00 2001 From: Victor Zhao <Victor.Zhao@amd.com> Date: Thu, 13 Oct 2022 11:06:33 +0800 Subject: [PATCH 2100/2223] Revert "drm/amdgpu: let mode2 reset fallback to default when failure" This reverts commit dac6b80818ac2353631c5a33d140d8d5508e2957. This commit reverted the AMDGPU_SKIP_MODE2_RESET as it conflicts with the original design of reset handler. Will redesign it. 
Fixes: dac6b80818ac23 ("drm/amdgpu: let mode2 reset fallback to default when failure") Signed-off-by: Victor Zhao <Victor.Zhao@amd.com> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +------ drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 6 ------ drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 3 +-- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 1 - drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 1 - drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 1 - 9 files changed, 2 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 03bbfaa51cbcb..0561812aa0a43 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -134,7 +134,6 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work) reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags); amdgpu_device_gpu_recover(adev, NULL, &reset_context); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ab8f970b28491..bb73fb420ffcd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5210,7 +5210,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, reset_context->job = job; reset_context->hive = hive; - /* * Build list of devices to reset. * In case we are in XGMI hive mode, resort the device list @@ -5337,11 +5336,8 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. 
*/ amdgpu_ras_resume(adev); } else { r = amdgpu_do_asic_reset(device_list_handle, reset_context); - if (r && r == -EAGAIN) { - set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags); - adev->asic_reset_res = 0; + if (r && r == -EAGAIN) goto retry; - } if (!r && gpu_reset_for_dev_remove) goto recover_end; @@ -5777,7 +5773,6 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) reset_context.reset_req_dev = adev; set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); - set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags); adev->no_hw_access = true; r = amdgpu_device_pre_asic_reset(adev, &reset_context); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 46c99331d7f12..cd968e781077e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -72,7 +72,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags); r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context); if (r) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 2dad7aa9a03b9..75f1402101f4c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1950,7 +1950,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags); amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index 831fb222139c6..f778466bb9dbd 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -74,9 +74,6 @@ int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev, { struct amdgpu_reset_handler *reset_handler = NULL; - if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags)) - return -ENOSYS; - if (adev->reset_cntl && adev->reset_cntl->get_reset_handler) reset_handler = adev->reset_cntl->get_reset_handler( adev->reset_cntl, reset_context); @@ -93,9 +90,6 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev, int ret; struct amdgpu_reset_handler *reset_handler = NULL; - if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags)) - return -ENOSYS; - if (adev->reset_cntl) reset_handler = adev->reset_cntl->get_reset_handler( adev->reset_cntl, reset_context); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index f5318fedf2f04..f4a501ff87d90 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -30,8 +30,7 @@ enum AMDGPU_RESET_FLAGS { AMDGPU_NEED_FULL_RESET = 0, AMDGPU_SKIP_HW_RESET = 1, - AMDGPU_SKIP_MODE2_RESET = 2, - AMDGPU_RESET_FOR_DEVICE_REMOVE = 3, + AMDGPU_RESET_FOR_DEVICE_REMOVE = 2, }; struct amdgpu_reset_context { diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index a2f04b2491329..12906ba74462f 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -290,7 +290,6 @@ flr_done: reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags); amdgpu_device_gpu_recover(adev, NULL, &reset_context); } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index a977f0027928d..e07757eea7adf 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ 
-317,7 +317,6 @@ flr_done: reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags); amdgpu_device_gpu_recover(adev, NULL, &reset_context); } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index fd14fa9b9cd7c..288c414babdfa 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c @@ -529,7 +529,6 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work) reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags); amdgpu_device_gpu_recover(adev, NULL, &reset_context); } -- GitLab From a31e62873f11dff12cbeb8e6f864d0c8e5be0869 Mon Sep 17 00:00:00 2001 From: Victor Zhao <Victor.Zhao@amd.com> Date: Thu, 13 Oct 2022 15:53:19 +0800 Subject: [PATCH 2101/2223] drm/amdgpu: Refactor mode2 reset logic for v11.0.7 - refactor mode2 on v11.0.7 to align with aldebaran - comment out using mode2 reset as default for now, will introduce another controller to replace previous reset_level_mask v2: squash in unused variable removal (Alex) Signed-off-by: Victor Zhao <Victor.Zhao@amd.com> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c | 25 ++++++++++++++------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c index 7aa570c1ce4a9..81a6d5b94987f 100644 --- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c @@ -31,12 +31,23 @@ #include "amdgpu_psp.h" #include "amdgpu_xgmi.h" +static bool sienna_cichlid_is_mode2_default(struct amdgpu_reset_control *reset_ctl) +{ +#if 0 + struct 
amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle; + + if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(11, 0, 7) && + adev->pm.fw_version >= 0x3a5500 && !amdgpu_sriov_vf(adev)) + return true; +#endif + return false; +} + static struct amdgpu_reset_handler * sienna_cichlid_get_reset_handler(struct amdgpu_reset_control *reset_ctl, struct amdgpu_reset_context *reset_context) { struct amdgpu_reset_handler *handler; - struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle; if (reset_context->method != AMD_RESET_METHOD_NONE) { list_for_each_entry(handler, &reset_ctl->reset_handlers, @@ -44,15 +55,13 @@ sienna_cichlid_get_reset_handler(struct amdgpu_reset_control *reset_ctl, if (handler->reset_method == reset_context->method) return handler; } - } else { - list_for_each_entry(handler, &reset_ctl->reset_handlers, + } + + if (sienna_cichlid_is_mode2_default(reset_ctl)) { + list_for_each_entry (handler, &reset_ctl->reset_handlers, handler_list) { - if (handler->reset_method == AMD_RESET_METHOD_MODE2 && - adev->pm.fw_version >= 0x3a5500 && - !amdgpu_sriov_vf(adev)) { - reset_context->method = AMD_RESET_METHOD_MODE2; + if (handler->reset_method == AMD_RESET_METHOD_MODE2) return handler; - } } } -- GitLab From 4545ae2ed3f2f7c3f615a53399c9c8460ee5bca7 Mon Sep 17 00:00:00 2001 From: Asher Song <Asher.Song@amd.com> Date: Fri, 14 Oct 2022 11:36:33 +0800 Subject: [PATCH 2102/2223] drm/amdgpu: Revert "drm/amdgpu: getting fan speed pwm for vega10 properly" This reverts commit 16fb4dca95daa9d8e037201166a58de8284f4268. Unfortunately, that commit causes fan monitors can't be read and written properly. 
Fixes: 16fb4dca95daa9 ("drm/amdgpu: getting fan speed pwm for vega10 properly") Signed-off-by: Asher Song <Asher.Song@amd.com> Reviewed-by: Guchun Chen <guchun.chen@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../amd/pm/powerplay/hwmgr/vega10_thermal.c | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_thermal.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_thermal.c index 190af79f3236f..dad3e3741a4e8 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_thermal.c +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_thermal.c @@ -67,21 +67,22 @@ int vega10_fan_ctrl_get_fan_speed_info(struct pp_hwmgr *hwmgr, int vega10_fan_ctrl_get_fan_speed_pwm(struct pp_hwmgr *hwmgr, uint32_t *speed) { - struct amdgpu_device *adev = hwmgr->adev; - uint32_t duty100, duty; - uint64_t tmp64; + uint32_t current_rpm; + uint32_t percent = 0; - duty100 = REG_GET_FIELD(RREG32_SOC15(THM, 0, mmCG_FDO_CTRL1), - CG_FDO_CTRL1, FMAX_DUTY100); - duty = REG_GET_FIELD(RREG32_SOC15(THM, 0, mmCG_THERMAL_STATUS), - CG_THERMAL_STATUS, FDO_PWM_DUTY); + if (hwmgr->thermal_controller.fanInfo.bNoFan) + return 0; - if (!duty100) - return -EINVAL; + if (vega10_get_current_rpm(hwmgr, &current_rpm)) + return -1; + + if (hwmgr->thermal_controller. + advanceFanControlParameters.usMaxFanRPM != 0) + percent = current_rpm * 255 / + hwmgr->thermal_controller. + advanceFanControlParameters.usMaxFanRPM; - tmp64 = (uint64_t)duty * 255; - do_div(tmp64, duty100); - *speed = MIN((uint32_t)tmp64, 255); + *speed = MIN(percent, 255); return 0; } -- GitLab From 4d72a4e4fb5d870be52ce38e5672e4b71ee1162f Mon Sep 17 00:00:00 2001 From: Kenneth Feng <kenneth.feng@amd.com> Date: Sat, 30 Jul 2022 10:58:37 +0800 Subject: [PATCH 2103/2223] drm/amd/pm: temporarily disable thermal alert on smu_v13_0_10 temporarily disable thermal alert on smu_v13_0_10 due to kfd test fail.
will enable it again after confirming the thermal hardware setting. Signed-off-by: Kenneth Feng <kenneth.feng@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 13c5c7f1ecb9f..3d436e7f6e952 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -1312,10 +1312,12 @@ static int smu_smc_hw_setup(struct smu_context *smu) return ret; } - ret = smu_enable_thermal_alert(smu); - if (ret) { - dev_err(adev->dev, "Failed to enable thermal alert!\n"); - return ret; + if (adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 10)) { + ret = smu_enable_thermal_alert(smu); + if (ret) { + dev_err(adev->dev, "Failed to enable thermal alert!\n"); + return ret; + } } ret = smu_notify_display_change(smu); -- GitLab From 4c7f9a3c15344ccc682c77495fddea7dcb64027c Mon Sep 17 00:00:00 2001 From: Kenneth Feng <kenneth.feng@amd.com> Date: Wed, 7 Sep 2022 15:40:34 +0800 Subject: [PATCH 2104/2223] drm/amd/pm: remove the pptable id override on smu_v13_0_10 remove the pptable id override on smu_v13_0_10, and the id is fetched from vbios now. 
Signed-off-by: Kenneth Feng <kenneth.feng@amd.com> Reviewed-by: Likun Gao <Likun.Gao@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index 93fffdbab4f07..d9323293179a0 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -454,9 +454,6 @@ int smu_v13_0_setup_pptable(struct smu_context *smu) dev_info(adev->dev, "override pptable id %d\n", pptable_id); } else { pptable_id = smu->smu_table.boot_values.pp_table_id; - - if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 10)) - pptable_id = 6666; } /* force using vbios pptable in sriov mode */ -- GitLab From 657e07221ce046132dd78f6e19c04b32a78b1d25 Mon Sep 17 00:00:00 2001 From: Kenneth Feng <kenneth.feng@amd.com> Date: Wed, 7 Sep 2022 17:05:34 +0800 Subject: [PATCH 2105/2223] drm/amd/amdgpu: enable gfx clock gating features on smu_v13_0_10 enable gfx clock gating features on smu_v13_0_10 Signed-off-by: Kenneth Feng <kenneth.feng@amd.com> Reviewed-by: Jack Gui <Jack.Gui@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 1 + drivers/gpu/drm/amd/amdgpu/soc21.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 251109723ab63..c56d61793ccd6 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -5076,6 +5076,7 @@ static int gfx_v11_0_set_clockgating_state(void *handle, case IP_VERSION(11, 0, 0): case IP_VERSION(11, 0, 1): case IP_VERSION(11, 0, 2): + case IP_VERSION(11, 0, 3): gfx_v11_0_update_gfx_clock_gating(adev, state == AMD_CG_STATE_GATE); break; diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 
795706b3b092f..fdd842a3fcb6d 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -636,7 +636,11 @@ static int soc21_common_early_init(void *handle) break; case IP_VERSION(11, 0, 3): adev->cg_flags = AMD_CG_SUPPORT_VCN_MGCG | - AMD_CG_SUPPORT_JPEG_MGCG; + AMD_CG_SUPPORT_JPEG_MGCG | + AMD_CG_SUPPORT_GFX_CGCG | + AMD_CG_SUPPORT_GFX_CGLS | + AMD_CG_SUPPORT_REPEATER_FGCG | + AMD_CG_SUPPORT_GFX_MGCG; adev->pg_flags = AMD_PG_SUPPORT_VCN | AMD_PG_SUPPORT_VCN_DPG | AMD_PG_SUPPORT_JPEG; -- GitLab From f700486cd1f2bf381671d1c2c7dc9000db10c50e Mon Sep 17 00:00:00 2001 From: Kenneth Feng <kenneth.feng@amd.com> Date: Mon, 26 Sep 2022 17:15:04 +0800 Subject: [PATCH 2106/2223] drm/amd/pm: skip loading pptable from driver on secure board for smu_v13_0_10 skip loading pptable from driver on secure board since it's loaded from psp. Signed-off-by: Kenneth Feng <kenneth.feng@amd.com> Reviewed-by: Guan Yu <Guan.Yu@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index d9323293179a0..c4552ade8d441 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -211,7 +211,8 @@ int smu_v13_0_init_pptable_microcode(struct smu_context *smu) return 0; if ((adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 7)) || - (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 0))) + (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 0)) || + (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 10))) return 0; /* override pptable_id from driver parameter */ -- GitLab From b7a76a29140810807fd85d15470d91b7992b6acf Mon Sep 17 00:00:00 2001 From: Likun Gao <Likun.Gao@amd.com> Date: Mon, 25 Jul 2022 20:02:40 +0800 Subject: [PATCH 2107/2223] drm/amdgpu: skip mes self test for gc 11.0.3 
Temporarily disable mes self test for gc 11.0.3. Signed-off-by: Likun Gao <Likun.Gao@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 5cec6b259b7f7..133804e6018a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -1296,7 +1296,8 @@ static int mes_v11_0_late_init(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (!amdgpu_in_reset(adev)) + if (!amdgpu_in_reset(adev) && + (adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))) amdgpu_mes_self_test(adev); return 0; -- GitLab From 7cd3f6c3ace44ae9a9950a8c02ebcb8069278aab Mon Sep 17 00:00:00 2001 From: YiPeng Chai <YiPeng.Chai@amd.com> Date: Tue, 27 Sep 2022 13:16:27 +0800 Subject: [PATCH 2108/2223] drm/amdgpu: Enable gmc soft reset on gmc_v11_0_3 Enable gmc soft reset on gmc_v11_0_3.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/soc21.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index fdd842a3fcb6d..e08044008186e 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -423,6 +423,7 @@ static bool soc21_need_full_reset(struct amdgpu_device *adev) case IP_VERSION(11, 0, 0): return amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC); case IP_VERSION(11, 0, 2): + case IP_VERSION(11, 0, 3): return false; default: return true; -- GitLab From 001ebcf5b903646b40697d9b1dc9b24daae82b4f Mon Sep 17 00:00:00 2001 From: YiPeng Chai <YiPeng.Chai@amd.com> Date: Tue, 27 Sep 2022 14:06:42 +0800 Subject: [PATCH 2109/2223] drm/amdgpu: Enable ras support for mp0 v13_0_0 and v13_0_10 V1: Enable ras support for CHIP_IP_DISCOVERY asic type. V2: 1. Change commit comment. 2. Enable ras support for mp0 v13_0_0 and v13_0_10. 
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 75f1402101f4c..4a8f73cc4cb5c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2267,6 +2267,16 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) { + if (adev->asic_type == CHIP_IP_DISCOVERY) { + switch (adev->ip_versions[MP0_HWIP][0]) { + case IP_VERSION(13, 0, 0): + case IP_VERSION(13, 0, 10): + return true; + default: + return false; + } + } + return adev->asic_type == CHIP_VEGA10 || adev->asic_type == CHIP_VEGA20 || adev->asic_type == CHIP_ARCTURUS || -- GitLab From 3bd026c3e3317e4490595848261fe74d76e74126 Mon Sep 17 00:00:00 2001 From: YiPeng Chai <YiPeng.Chai@amd.com> Date: Wed, 28 Sep 2022 15:52:02 +0800 Subject: [PATCH 2110/2223] drm/amdgpu: Add sriov vf ras support in amdgpu_ras_asic_supported V2: Add sriov vf ras support in amdgpu_ras_asic_supported. 
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 4a8f73cc4cb5c..a4b47e1bd111d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2267,6 +2267,15 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) { + if (amdgpu_sriov_vf(adev)) { + switch (adev->ip_versions[MP0_HWIP][0]) { + case IP_VERSION(13, 0, 2): + return true; + default: + return false; + } + } + if (adev->asic_type == CHIP_IP_DISCOVERY) { switch (adev->ip_versions[MP0_HWIP][0]) { case IP_VERSION(13, 0, 0): @@ -2320,11 +2329,6 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) !amdgpu_ras_asic_supported(adev)) return; - /* If driver run on sriov guest side, only enable ras for aldebaran */ - if (amdgpu_sriov_vf(adev) && - adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 2)) - return; - if (!adev->gmc.xgmi.connected_to_cpu) { if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { dev_info(adev->dev, "MEM ECC is active.\n"); -- GitLab From 528c0e66e0c01a8c078d2d94431db80f9c75d2a0 Mon Sep 17 00:00:00 2001 From: Evan Quan <evan.quan@amd.com> Date: Thu, 29 Sep 2022 10:24:51 +0800 Subject: [PATCH 2111/2223] drm/amd/pm: fulfill SMU13.0.0 cstate control interface Fulfill the functionality for cstate control. 
Signed-off-by: Evan Quan <evan.quan@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org # 6.0.x Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 1d454485e0d91..29529328152d0 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -119,6 +119,7 @@ static struct cmn2asic_msg_mapping smu_v13_0_0_message_map[SMU_MSG_MAX_COUNT] = MSG_MAP(NotifyPowerSource, PPSMC_MSG_NotifyPowerSource, 0), MSG_MAP(Mode1Reset, PPSMC_MSG_Mode1Reset, 0), MSG_MAP(PrepareMp1ForUnload, PPSMC_MSG_PrepareMp1ForUnload, 0), + MSG_MAP(DFCstateControl, PPSMC_MSG_SetExternalClientDfCstateAllow, 0), }; static struct cmn2asic_mapping smu_v13_0_0_clk_map[SMU_CLK_COUNT] = { @@ -1753,6 +1754,15 @@ static int smu_v13_0_0_set_mp1_state(struct smu_context *smu, return ret; } +static int smu_v13_0_0_set_df_cstate(struct smu_context *smu, + enum pp_df_cstate state) +{ + return smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_DFCstateControl, + state, + NULL); +} + static const struct pptable_funcs smu_v13_0_0_ppt_funcs = { .get_allowed_feature_mask = smu_v13_0_0_get_allowed_feature_mask, .set_default_dpm_table = smu_v13_0_0_set_default_dpm_table, @@ -1822,6 +1832,7 @@ static const struct pptable_funcs smu_v13_0_0_ppt_funcs = { .mode1_reset_is_support = smu_v13_0_0_is_mode1_reset_supported, .mode1_reset = smu_v13_0_mode1_reset, .set_mp1_state = smu_v13_0_0_set_mp1_state, + .set_df_cstate = smu_v13_0_0_set_df_cstate, }; void smu_v13_0_0_set_ppt_funcs(struct smu_context *smu) -- GitLab From ba2f09960e75accf757ed12b4ef61409dcc97df8 Mon Sep 17 00:00:00 2001 From: Evan Quan 
<evan.quan@amd.com> Date: Thu, 29 Sep 2022 10:30:01 +0800 Subject: [PATCH 2112/2223] drm/amd/pm: fulfill SMU13.0.7 cstate control interface Fulfill the functionality for cstate control. Signed-off-by: Evan Quan <evan.quan@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org # 6.0.x Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c index c422bf8a09b1d..c4102cfb734c2 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c @@ -121,6 +121,7 @@ static struct cmn2asic_msg_mapping smu_v13_0_7_message_map[SMU_MSG_MAX_COUNT] = MSG_MAP(Mode1Reset, PPSMC_MSG_Mode1Reset, 0), MSG_MAP(PrepareMp1ForUnload, PPSMC_MSG_PrepareMp1ForUnload, 0), MSG_MAP(SetMGpuFanBoostLimitRpm, PPSMC_MSG_SetMGpuFanBoostLimitRpm, 0), + MSG_MAP(DFCstateControl, PPSMC_MSG_SetExternalClientDfCstateAllow, 0), }; static struct cmn2asic_mapping smu_v13_0_7_clk_map[SMU_CLK_COUNT] = { @@ -1587,6 +1588,16 @@ static bool smu_v13_0_7_is_mode1_reset_supported(struct smu_context *smu) return true; } + +static int smu_v13_0_7_set_df_cstate(struct smu_context *smu, + enum pp_df_cstate state) +{ + return smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_DFCstateControl, + state, + NULL); +} + static const struct pptable_funcs smu_v13_0_7_ppt_funcs = { .get_allowed_feature_mask = smu_v13_0_7_get_allowed_feature_mask, .set_default_dpm_table = smu_v13_0_7_set_default_dpm_table, @@ -1649,6 +1660,7 @@ static const struct pptable_funcs smu_v13_0_7_ppt_funcs = { .mode1_reset_is_support = smu_v13_0_7_is_mode1_reset_supported, .mode1_reset = smu_v13_0_mode1_reset, .set_mp1_state = 
smu_v13_0_7_set_mp1_state, + .set_df_cstate = smu_v13_0_7_set_df_cstate, }; void smu_v13_0_7_set_ppt_funcs(struct smu_context *smu) -- GitLab From 3059cd8c5f797ad83d2b194ae66339f5c007ca43 Mon Sep 17 00:00:00 2001 From: Evan Quan <evan.quan@amd.com> Date: Thu, 29 Sep 2022 10:50:44 +0800 Subject: [PATCH 2113/2223] drm/amd/pm: disable cstate feature for gpu reset scenario Suggested by PMFW team and same as what did for gfxoff feature. This can address some Mode1Reset failures observed on SMU13.0.0. Signed-off-by: Evan Quan <evan.quan@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org # 6.0.x Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ++++++++ drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c | 8 ++++++++ drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 9 +++++++++ 3 files changed, 25 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index bb73fb420ffcd..e0445e8cc3424 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2928,6 +2928,14 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); + /* + * Per PMFW team's suggestion, driver needs to handle gfxoff + * and df cstate features disablement for gpu reset(e.g. Mode1Reset) + * scenario. Add the missing df cstate disablement here. 
+ */ + if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) + dev_warn(adev->dev, "Failed to disallow df cstate"); + for (i = adev->num_ip_blocks - 1; i >= 0; i--) { if (!adev->ip_blocks[i].status.valid) continue; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c index 445005571f76f..9cd005131f566 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c @@ -2242,9 +2242,17 @@ static void arcturus_get_unique_id(struct smu_context *smu) static int arcturus_set_df_cstate(struct smu_context *smu, enum pp_df_cstate state) { + struct amdgpu_device *adev = smu->adev; uint32_t smu_version; int ret; + /* + * Arcturus does not need the cstate disablement + * prerequisite for gpu reset. + */ + if (amdgpu_in_reset(adev) || adev->in_suspend) + return 0; + ret = smu_cmn_get_smc_version(smu, NULL, &smu_version); if (ret) { dev_err(smu->adev->dev, "Failed to get smu version!\n"); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c index 619aee51b1238..d30ec3005ea19 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c @@ -1640,6 +1640,15 @@ static bool aldebaran_is_baco_supported(struct smu_context *smu) static int aldebaran_set_df_cstate(struct smu_context *smu, enum pp_df_cstate state) { + struct amdgpu_device *adev = smu->adev; + + /* + * Aldebaran does not need the cstate disablement + * prerequisite for gpu reset. 
+ */ + if (amdgpu_in_reset(adev) || adev->in_suspend) + return 0; + return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_DFCstateControl, state, NULL); } -- GitLab From 5fa993737b29bffe931cc5d0feb87ebc34cd5bb3 Mon Sep 17 00:00:00 2001 From: ZhenGuo Yin <zhenguo.yin@amd.com> Date: Wed, 12 Oct 2022 16:54:38 +0800 Subject: [PATCH 2114/2223] drm/amd/pm: Init pm_attr_list when dpm is disabled [Why] In SRIOV multi-vf, dpm is always disabled, and pm_attr_list won't be initialized. There will be a NULL pointer call trace after removing the dpm check condition in amdgpu_pm_sysfs_fini. BUG: kernel NULL pointer dereference, address: 0000000000000000 RIP: 0010:amdgpu_device_attr_remove_groups+0x20/0x90 [amdgpu] Call Trace: <TASK> amdgpu_pm_sysfs_fini+0x2f/0x40 [amdgpu] amdgpu_device_fini_hw+0xdf/0x290 [amdgpu] [How] List pm_attr_list should be initialized when dpm is disabled. Fixes: a6ad27cec585fe ("drm/amd/pm: Remove redundant check condition") Signed-off-by: ZhenGuo Yin <zhenguo.yin@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index 948cc75376f8b..236657eece477 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -3362,11 +3362,11 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev) if (adev->pm.sysfs_initialized) return 0; + INIT_LIST_HEAD(&adev->pm.pm_attr_list); + if (adev->pm.dpm_enabled == 0) return 0; - INIT_LIST_HEAD(&adev->pm.pm_attr_list); - adev->pm.int_hwmon_dev = hwmon_device_register_with_groups(adev->dev, DRIVER_NAME, adev, hwmon_groups); -- GitLab From 853fdb49160e9c30674fd8e4a2eabc06bf70b13a Mon Sep 17 00:00:00 2001 From: Tim Huang <tim.huang@amd.com> Date: Thu, 29 Sep 2022 14:39:21 +0800 Subject: [PATCH 2115/2223] drm/amd/pm: update SMU IP v13.0.4 driver interface 
version Update the SMU driver interface version to V7. Signed-off-by: Tim Huang <tim.huang@amd.com> Reviewed-by: Mario Limonciello <mario.limonciello@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org # 6.0.x Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../swsmu/inc/pmfw_if/smu13_driver_if_v13_0_4.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_4.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_4.h index ae2d337158f3b..f77401709d83c 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_4.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_4.h @@ -27,7 +27,7 @@ // *** IMPORTANT *** // SMU TEAM: Always increment the interface version if // any structure is changed in this file -#define PMFW_DRIVER_IF_VERSION 5 +#define PMFW_DRIVER_IF_VERSION 7 typedef struct { int32_t value; @@ -163,8 +163,8 @@ typedef struct { uint16_t DclkFrequency; //[MHz] uint16_t MemclkFrequency; //[MHz] uint16_t spare; //[centi] - uint16_t UvdActivity; //[centi] uint16_t GfxActivity; //[centi] + uint16_t UvdActivity; //[centi] uint16_t Voltage[2]; //[mV] indices: VDDCR_VDD, VDDCR_SOC uint16_t Current[2]; //[mA] indices: VDDCR_VDD, VDDCR_SOC @@ -199,6 +199,19 @@ typedef struct { uint16_t DeviceState; uint16_t CurTemp; //[centi-Celsius] uint16_t spare2; + + uint16_t AverageGfxclkFrequency; + uint16_t AverageFclkFrequency; + uint16_t AverageGfxActivity; + uint16_t AverageSocclkFrequency; + uint16_t AverageVclkFrequency; + uint16_t AverageVcnActivity; + uint16_t AverageDRAMReads; //Filtered DF Bandwidth::DRAM Reads + uint16_t AverageDRAMWrites; //Filtered DF Bandwidth::DRAM Writes + uint16_t AverageSocketPower; //Filtered value of CurrentSocketPower + uint16_t AverageCorePower; //Filtered of [sum of CorePower[8]]) + uint16_t AverageCoreC0Residency[8]; //Filtered of [average C0 
residency % per core] + uint32_t MetricsCounter; //Counts the # of metrics table parameter reads per update to the metrics table, i.e. if the metrics table update happens every 1 second, this value could be up to 1000 if the smu collected metrics data every cycle, or as low as 0 if the smu was asleep the whole time. Reset to 0 after writing. } SmuMetrics_t; typedef struct { -- GitLab From 31c261a7ffb8d5bba8144e2d43db304f2bc7e81a Mon Sep 17 00:00:00 2001 From: Tim Huang <tim.huang@amd.com> Date: Thu, 29 Sep 2022 15:06:47 +0800 Subject: [PATCH 2116/2223] drm/amd/pm: add SMU IP v13.0.4 IF version define to V7 The pmfw has changed the driver interface version, so keep same with the fw. Signed-off-by: Tim Huang <tim.huang@amd.com> Reviewed-by: Mario Limonciello <mario.limonciello@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org # 6.0.x Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h index 9d62ea2af132c..8f72202aea8e9 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h @@ -28,7 +28,7 @@ #define SMU13_DRIVER_IF_VERSION_INV 0xFFFFFFFF #define SMU13_DRIVER_IF_VERSION_YELLOW_CARP 0x04 #define SMU13_DRIVER_IF_VERSION_ALDE 0x08 -#define SMU13_DRIVER_IF_VERSION_SMU_V13_0_4 0x05 +#define SMU13_DRIVER_IF_VERSION_SMU_V13_0_4 0x07 #define SMU13_DRIVER_IF_VERSION_SMU_V13_0_5 0x04 #define SMU13_DRIVER_IF_VERSION_SMU_V13_0_0 0x30 #define SMU13_DRIVER_IF_VERSION_SMU_V13_0_7 0x2C -- GitLab From 8a70b2d89ea3f2dc1449f0634ca6befb41472f24 Mon Sep 17 00:00:00 2001 From: Guenter Roeck <linux@roeck-us.net> Date: Thu, 13 Oct 2022 11:25:23 -0700 Subject: [PATCH 2117/2223] drm/amd/display: Increase frame size limit for display_mode_vba_util_32.o MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building 32-bit images may fail with the following error. drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/display_mode_vba_util_32.c: In function ‘dml32_UseMinimumDCFCLK’: drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/display_mode_vba_util_32.c:3142:1: error: the frame size of 1096 bytes is larger than 1024 bytes This is seen when building i386:allmodconfig with any of the following compilers. gcc (Debian 12.2.0-3) 12.2.0 gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0 The problem is not seen if the compiler supports GCC_PLUGIN_LATENT_ENTROPY because in that case CONFIG_FRAME_WARN is already set to 2048 even for 32-bit builds. dml32_UseMinimumDCFCLK() was introduced with commit dda4fb85e433 ("drm/amd/display: DML changes for DCN32/321"). It declares a large number of local variables. Increase the frame size for the affected file to 2048, similar to other files in the same directory, to enable 32-bit build tests with affected compilers. 
Fixes: dda4fb85e433 ("drm/amd/display: DML changes for DCN32/321") Cc: Aurabindo Pillai <aurabindo.pillai@amd.com> Reported-by: Łukasz Bartosik <ukaszb@google.com> Signed-off-by: Guenter Roeck <linux@roeck-us.net> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/display/dc/dml/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile b/drivers/gpu/drm/amd/display/dc/dml/Makefile index d70838edba801..ca7d240006213 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile @@ -77,7 +77,7 @@ CFLAGS_$(AMDDALPATH)/dc/dml/dcn30/dcn30_fpu.o := $(dml_ccflags) CFLAGS_$(AMDDALPATH)/dc/dml/dcn32/dcn32_fpu.o := $(dml_ccflags) CFLAGS_$(AMDDALPATH)/dc/dml/dcn32/display_mode_vba_32.o := $(dml_ccflags) $(frame_warn_flag) CFLAGS_$(AMDDALPATH)/dc/dml/dcn32/display_rq_dlg_calc_32.o := $(dml_ccflags) -CFLAGS_$(AMDDALPATH)/dc/dml/dcn32/display_mode_vba_util_32.o := $(dml_ccflags) +CFLAGS_$(AMDDALPATH)/dc/dml/dcn32/display_mode_vba_util_32.o := $(dml_ccflags) $(frame_warn_flag) CFLAGS_$(AMDDALPATH)/dc/dml/dcn321/dcn321_fpu.o := $(dml_ccflags) CFLAGS_$(AMDDALPATH)/dc/dml/dcn31/dcn31_fpu.o := $(dml_ccflags) CFLAGS_$(AMDDALPATH)/dc/dml/dcn301/dcn301_fpu.o := $(dml_ccflags) -- GitLab From e688ba3e276422aa88eae7a54186a95320836081 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor <nathan@kernel.org> Date: Mon, 17 Oct 2022 09:28:38 -0700 Subject: [PATCH 2118/2223] drm/amdkfd: Fix type of reset_type parameter in hqd_destroy() callback When booting a kernel compiled with CONFIG_CFI_CLANG on a machine with an RX 6700 XT, there is a CFI failure in kfd_destroy_mqd_cp(): [ 12.894543] CFI failure at kfd_destroy_mqd_cp+0x2a/0x40 [amdgpu] (target: hqd_destroy_v10_3+0x0/0x260 [amdgpu]; expected type: 0x8594d794) Clang's kernel Control Flow Integrity (kCFI) makes sure that all indirect call targets have a type that exactly matches the function pointer prototype. 
In this case, hqd_destroy()'s third parameter, reset_type, should have a type of 'uint32_t' but every implementation of this callback has a third parameter type of 'enum kfd_preempt_type'. Update the function pointer prototype to match reality so that there is no more CFI violation. Link: https://github.com/ClangBuiltLinux/linux/issues/1738 Signed-off-by: Nathan Chancellor <nathan@kernel.org> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h index e85364dff4e04..5cb3e8634739d 100644 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -262,8 +262,9 @@ struct kfd2kgd_calls { uint32_t queue_id); int (*hqd_destroy)(struct amdgpu_device *adev, void *mqd, - uint32_t reset_type, unsigned int timeout, - uint32_t pipe_id, uint32_t queue_id); + enum kfd_preempt_type reset_type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id); bool (*hqd_sdma_is_occupied)(struct amdgpu_device *adev, void *mqd); -- GitLab From 97a3d6090f5c2a165dc88bda05c1dcf9f08bf886 Mon Sep 17 00:00:00 2001 From: Yifan Zha <Yifan.Zha@amd.com> Date: Wed, 7 Sep 2022 14:13:02 +0800 Subject: [PATCH 2119/2223] drm/amdgpu: Program GC registers through RLCG interface in gfx_v11/gmc_v11 [Why] L1 blocks most of GC registers accessing by MMIO. [How] Use RLCG interface to program GC registers under SRIOV VF in full access time. 
Signed-off-by: Yifan Zha <Yifan.Zha@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Acked-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c | 2 +- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 18 +++++++++++------- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c index 0b0a72ca56956..7e80caa05060b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c @@ -111,7 +111,7 @@ static int init_interrupts_v11(struct amdgpu_device *adev, uint32_t pipe_id) lock_srbm(adev, mec, pipe, 0, 0); - WREG32(SOC15_REG_OFFSET(GC, 0, regCPC_INT_CNTL), + WREG32_SOC15(GC, 0, regCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index c56d61793ccd6..671ca5a0f208a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1571,7 +1571,7 @@ static void gfx_v11_0_init_compute_vmid(struct amdgpu_device *adev) WREG32_SOC15(GC, 0, regSH_MEM_BASES, sh_mem_bases); /* Enable trap for each kfd vmid. 
*/ - data = RREG32(SOC15_REG_OFFSET(GC, 0, regSPI_GDBG_PER_VMID_CNTL)); + data = RREG32_SOC15(GC, 0, regSPI_GDBG_PER_VMID_CNTL); data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1); } soc21_grbm_select(adev, 0, 0, 0, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index 846ccb6cf07d9..66dfb574cc7d1 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -186,6 +186,10 @@ static void gmc_v11_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid, /* Use register 17 for GART */ const unsigned eng = 17; unsigned int i; + unsigned char hub_ip = 0; + + hub_ip = (vmhub == AMDGPU_GFXHUB_0) ? + GC_HWIP : MMHUB_HWIP; spin_lock(&adev->gmc.invalidate_lock); /* @@ -199,8 +203,8 @@ static void gmc_v11_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid, if (use_semaphore) { for (i = 0; i < adev->usec_timeout; i++) { /* a read return value of 1 means semaphore acuqire */ - tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + - hub->eng_distance * eng); + tmp = RREG32_RLC_NO_KIQ(hub->vm_inv_eng0_sem + + hub->eng_distance * eng, hub_ip); if (tmp & 0x1) break; udelay(1); @@ -210,12 +214,12 @@ static void gmc_v11_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid, DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n"); } - WREG32_NO_KIQ(hub->vm_inv_eng0_req + hub->eng_distance * eng, inv_req); + WREG32_RLC_NO_KIQ(hub->vm_inv_eng0_req + hub->eng_distance * eng, inv_req, hub_ip); /* Wait for ACK with a delay.*/ for (i = 0; i < adev->usec_timeout; i++) { - tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_ack + - hub->eng_distance * eng); + tmp = RREG32_RLC_NO_KIQ(hub->vm_inv_eng0_ack + + hub->eng_distance * eng, hub_ip); tmp &= 1 << vmid; if (tmp) break; @@ -229,8 +233,8 @@ static void gmc_v11_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid, * add semaphore release after invalidation, * write with 0 means semaphore release */ - WREG32_NO_KIQ(hub->vm_inv_eng0_sem + - 
hub->eng_distance * eng, 0); + WREG32_RLC_NO_KIQ(hub->vm_inv_eng0_sem + + hub->eng_distance * eng, 0, hub_ip); /* Issue additional private vm invalidation to MMHUB */ if ((vmhub != AMDGPU_GFXHUB_0) && -- GitLab From 5ce4726a1376bd0673d7b8edd243e76fbb4476d1 Mon Sep 17 00:00:00 2001 From: Kenneth Feng <kenneth.feng@amd.com> Date: Fri, 14 Oct 2022 15:19:51 +0800 Subject: [PATCH 2120/2223] drm/amd/pm: enable thermal alert on smu_v13_0_10 enable thermal alert on smu_v13_0_10 Signed-off-by: Kenneth Feng <kenneth.feng@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 3d436e7f6e952..4fe75dd2b329d 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -1312,12 +1312,10 @@ static int smu_smc_hw_setup(struct smu_context *smu) return ret; } - if (adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 10)) { - ret = smu_enable_thermal_alert(smu); - if (ret) { - dev_err(adev->dev, "Failed to enable thermal alert!\n"); - return ret; - } + ret = smu_enable_thermal_alert(smu); + if (ret) { + dev_err(adev->dev, "Failed to enable thermal alert!\n"); + return ret; } ret = smu_notify_display_change(smu); -- GitLab From 2abe92c7adc9c0397ba51bf74909b85bc0fff84b Mon Sep 17 00:00:00 2001 From: YuBiao Wang <YuBiao.Wang@amd.com> Date: Thu, 13 Oct 2022 11:31:55 +0800 Subject: [PATCH 2121/2223] drm/amdgpu: dequeue mes scheduler during fini [Why] If mes is not dequeued during fini, mes will be in an uncleaned state during reload, then mes couldn't receive some commands which leads to reload failure. [How] Perform MES dequeue via MMIO after all the unmap jobs are done by mes and before kiq fini. v2: Move the dequeue operation inside kiq_hw_fini. 
Signed-off-by: YuBiao Wang <YuBiao.Wang@amd.com> Reviewed-by: Jack Xiao <Jack.Xiao@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 42 ++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 133804e6018a7..fef7d020bc5f3 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -1156,6 +1156,42 @@ static int mes_v11_0_sw_fini(void *handle) return 0; } +static void mes_v11_0_kiq_dequeue_sched(struct amdgpu_device *adev) +{ + uint32_t data; + int i; + + mutex_lock(&adev->srbm_mutex); + soc21_grbm_select(adev, 3, AMDGPU_MES_SCHED_PIPE, 0, 0); + + /* disable the queue if it's active */ + if (RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1) { + WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 1); + for (i = 0; i < adev->usec_timeout; i++) { + if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1)) + break; + udelay(1); + } + } + data = RREG32_SOC15(GC, 0, regCP_HQD_PQ_DOORBELL_CONTROL); + data = REG_SET_FIELD(data, CP_HQD_PQ_DOORBELL_CONTROL, + DOORBELL_EN, 0); + data = REG_SET_FIELD(data, CP_HQD_PQ_DOORBELL_CONTROL, + DOORBELL_HIT, 1); + WREG32_SOC15(GC, 0, regCP_HQD_PQ_DOORBELL_CONTROL, data); + + WREG32_SOC15(GC, 0, regCP_HQD_PQ_DOORBELL_CONTROL, 0); + + WREG32_SOC15(GC, 0, regCP_HQD_PQ_WPTR_LO, 0); + WREG32_SOC15(GC, 0, regCP_HQD_PQ_WPTR_HI, 0); + WREG32_SOC15(GC, 0, regCP_HQD_PQ_RPTR, 0); + + soc21_grbm_select(adev, 0, 0, 0, 0); + mutex_unlock(&adev->srbm_mutex); + + adev->mes.ring.sched.ready = false; +} + static void mes_v11_0_kiq_setting(struct amdgpu_ring *ring) { uint32_t tmp; @@ -1207,6 +1243,9 @@ failure: static int mes_v11_0_kiq_hw_fini(struct amdgpu_device *adev) { + if (adev->mes.ring.sched.ready) + mes_v11_0_kiq_dequeue_sched(adev); + mes_v11_0_enable(adev, false); return 0; } @@ -1262,9 +1301,6 @@ failure: static int mes_v11_0_hw_fini(void *handle) { 
- struct amdgpu_device *adev = (struct amdgpu_device *)handle; - - adev->mes.ring.sched.ready = false; return 0; } -- GitLab From 8273b4048664fff356fd10059033f0e2f5a422a1 Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam@amd.com> Date: Tue, 18 Oct 2022 07:08:38 -0700 Subject: [PATCH 2122/2223] drm/amdgpu: Fix for BO move issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A user reported a bug on CAPE VERDE system where uvd_v3_1 IP component failed to initialize as there is an issue with BO move code from one memory to other. In function amdgpu_mem_visible() called by amdgpu_bo_move(), when there are no blocks to compare or if we have a single block then break the loop. Fixes: 312b4dc11d4f ("drm/amdgpu: Fix VRAM BO swap issue") Signed-off-by: Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index dc262d2c2925e..57277b1cf1834 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -439,6 +439,9 @@ static bool amdgpu_mem_visible(struct amdgpu_device *adev, while (cursor.remaining) { amdgpu_res_next(&cursor, cursor.size); + if (!cursor.remaining) + break; + /* ttm_resource_ioremap only supports contiguous memory */ if (end != cursor.start) return false; -- GitLab From ba077d683d45190afc993c1ce45bcdbfda741a40 Mon Sep 17 00:00:00 2001 From: Vikas Gupta <vikas.gupta@broadcom.com> Date: Mon, 17 Oct 2022 11:32:22 -0400 Subject: [PATCH 2123/2223] bnxt_en: fix memory leak in bnxt_nvm_test() Free the kzalloc'ed buffer before returning in the success path. 
Fixes: 5b6ff128fdf6 ("bnxt_en: implement callbacks for devlink selftests") Signed-off-by: Vikas Gupta <vikas.gupta@broadcom.com> Signed-off-by: Michael Chan <michael.chan@broadcom.com> Link: https://lore.kernel.org/r/1666020742-25834-1-git-send-email-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index a36803e79e92e..8a6f788f62944 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -613,6 +613,7 @@ static int bnxt_dl_reload_up(struct devlink *dl, enum devlink_reload_action acti static bool bnxt_nvm_test(struct bnxt *bp, struct netlink_ext_ack *extack) { + bool rc = false; u32 datalen; u16 index; u8 *buf; @@ -632,20 +633,20 @@ static bool bnxt_nvm_test(struct bnxt *bp, struct netlink_ext_ack *extack) if (bnxt_get_nvram_item(bp->dev, index, 0, datalen, buf)) { NL_SET_ERR_MSG_MOD(extack, "nvm test vpd read error"); - goto err; + goto done; } if (bnxt_flash_nvram(bp->dev, BNX_DIR_TYPE_VPD, BNX_DIR_ORDINAL_FIRST, BNX_DIR_EXT_NONE, 0, 0, buf, datalen)) { NL_SET_ERR_MSG_MOD(extack, "nvm test vpd write error"); - goto err; + goto done; } - return true; + rc = true; -err: +done: kfree(buf); - return false; + return rc; } static bool bnxt_dl_selftest_check(struct devlink *dl, unsigned int id, -- GitLab From d8b57135fd9ffe9a5b445350a686442a531c5339 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Mon, 17 Oct 2022 16:59:28 +0000 Subject: [PATCH 2124/2223] net: hsr: avoid possible NULL deref in skb_clone() syzbot got a crash [1] in skb_clone(), caused by a bug in hsr_get_untagged_frame(). When/if create_stripped_skb_hsr() returns NULL, we must not attempt to call skb_clone(). 
While we are at it, replace a WARN_ONCE() by netdev_warn_once(). [1] general protection fault, probably for non-canonical address 0xdffffc000000000f: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000078-0x000000000000007f] CPU: 1 PID: 754 Comm: syz-executor.0 Not tainted 6.0.0-syzkaller-02734-g0326074ff465 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/22/2022 RIP: 0010:skb_clone+0x108/0x3c0 net/core/skbuff.c:1641 Code: 93 02 00 00 49 83 7c 24 28 00 0f 85 e9 00 00 00 e8 5d 4a 29 fa 4c 8d 75 7e 48 b8 00 00 00 00 00 fc ff df 4c 89 f2 48 c1 ea 03 <0f> b6 04 02 4c 89 f2 83 e2 07 38 d0 7f 08 84 c0 0f 85 9e 01 00 00 RSP: 0018:ffffc90003ccf4e0 EFLAGS: 00010207 RAX: dffffc0000000000 RBX: ffffc90003ccf5f8 RCX: ffffc9000c24b000 RDX: 000000000000000f RSI: ffffffff8751cb13 RDI: 0000000000000000 RBP: 0000000000000000 R08: 00000000000000f0 R09: 0000000000000140 R10: fffffbfff181d972 R11: 0000000000000000 R12: ffff888161fc3640 R13: 0000000000000a20 R14: 000000000000007e R15: ffffffff8dc5f620 FS: 00007feb621e4700(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007feb621e3ff8 CR3: 00000001643a9000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> hsr_get_untagged_frame+0x4e/0x610 net/hsr/hsr_forward.c:164 hsr_forward_do net/hsr/hsr_forward.c:461 [inline] hsr_forward_skb+0xcca/0x1d50 net/hsr/hsr_forward.c:623 hsr_handle_frame+0x588/0x7c0 net/hsr/hsr_slave.c:69 __netif_receive_skb_core+0x9fe/0x38f0 net/core/dev.c:5379 __netif_receive_skb_one_core+0xae/0x180 net/core/dev.c:5483 __netif_receive_skb+0x1f/0x1c0 net/core/dev.c:5599 netif_receive_skb_internal net/core/dev.c:5685 [inline] netif_receive_skb+0x12f/0x8d0 net/core/dev.c:5744 tun_rx_batched+0x4ab/0x7a0 drivers/net/tun.c:1544 tun_get_user+0x2686/0x3a00 drivers/net/tun.c:1995 
tun_chr_write_iter+0xdb/0x200 drivers/net/tun.c:2025 call_write_iter include/linux/fs.h:2187 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x9e9/0xdd0 fs/read_write.c:584 ksys_write+0x127/0x250 fs/read_write.c:637 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: f266a683a480 ("net/hsr: Better frame dispatch") Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Link: https://lore.kernel.org/r/20221017165928.2150130-1-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/hsr/hsr_forward.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 5bf357734b113..a50429a62f744 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -150,15 +150,15 @@ struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame, struct hsr_port *port) { if (!frame->skb_std) { - if (frame->skb_hsr) { + if (frame->skb_hsr) frame->skb_std = create_stripped_skb_hsr(frame->skb_hsr, frame); - } else { - /* Unexpected */ - WARN_ONCE(1, "%s:%d: Unexpected frame received (port_src %s)\n", - __FILE__, __LINE__, port->dev->name); + else + netdev_warn_once(port->dev, + "Unexpected frame received in hsr_get_untagged_frame()\n"); + + if (!frame->skb_std) return NULL; - } } return skb_clone(frame->skb_std, GFP_ATOMIC); -- GitLab From aa1d7e1267c12e07d979aa34c613716a89029db2 Mon Sep 17 00:00:00 2001 From: Brett Creeley <brett@pensando.io> Date: Mon, 17 Oct 2022 16:31:23 -0700 Subject: [PATCH 2125/2223] ionic: catch NULL pointer issue on reconfig It's possible that the driver will dereference a qcq that doesn't exist when calling ionic_reconfigure_queues(), which causes a page fault BUG. 
If a reduction in the number of queues is followed by a different reconfig such as changing the ring size, the driver can hit a NULL pointer when trying to clean up non-existent queues. Fix this by checking to make sure both the qcqs array and qcq entry exist before trying to use and free the entry. Fixes: 101b40a0171f ("ionic: change queue count with no reset") Signed-off-by: Brett Creeley <brett@pensando.io> Signed-off-by: Shannon Nelson <snelson@pensando.io> Link: https://lore.kernel.org/r/20221017233123.15869-1-snelson@pensando.io Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 5d58fd99be3cf..19d4848df17df 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -2817,11 +2817,15 @@ err_out: * than the full array, but leave the qcq shells in place */ for (i = lif->nxqs; i < lif->ionic->ntxqs_per_lif; i++) { - lif->txqcqs[i]->flags &= ~IONIC_QCQ_F_INTR; - ionic_qcq_free(lif, lif->txqcqs[i]); + if (lif->txqcqs && lif->txqcqs[i]) { + lif->txqcqs[i]->flags &= ~IONIC_QCQ_F_INTR; + ionic_qcq_free(lif, lif->txqcqs[i]); + } - lif->rxqcqs[i]->flags &= ~IONIC_QCQ_F_INTR; - ionic_qcq_free(lif, lif->rxqcqs[i]); + if (lif->rxqcqs && lif->rxqcqs[i]) { + lif->rxqcqs[i]->flags &= ~IONIC_QCQ_F_INTR; + ionic_qcq_free(lif, lif->rxqcqs[i]); + } } if (err) -- GitLab From 1fcc064b305a1aadeff0d4bff961094d27660acd Mon Sep 17 00:00:00 2001 From: Guillaume Nault <gnault@redhat.com> Date: Thu, 13 Oct 2022 16:37:47 +0200 Subject: [PATCH 2126/2223] netfilter: rpfilter/fib: Set ->flowic_uid correctly for user namespaces. Currently netfilter's rpfilter and fib modules implicitly initialise ->flowic_uid with 0. This is normally the root UID.
However, this isn't the case in user namespaces, where user ID 0 is mapped to a different kernel UID. By initialising ->flowic_uid with sock_net_uid(), we get the root UID of the user namespace, thus keeping the same behaviour whether or not we're running in a user namespace. Note, this is similar to commit 8bcfd0925ef1 ("ipv4: add missing initialization for flowi4_uid"), which fixed the rp_filter sysctl. Fixes: 622ec2c9d524 ("net: core: add UID to flows, rules, and routes") Signed-off-by: Guillaume Nault <gnault@redhat.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/ipv4/netfilter/ipt_rpfilter.c | 1 + net/ipv4/netfilter/nft_fib_ipv4.c | 1 + net/ipv6/netfilter/ip6t_rpfilter.c | 1 + net/ipv6/netfilter/nft_fib_ipv6.c | 2 ++ 4 files changed, 5 insertions(+) diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index ff85db52b2e56..ded5bef02f771 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -78,6 +78,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.flowi4_tos = iph->tos & IPTOS_RT_MASK; flow.flowi4_scope = RT_SCOPE_UNIVERSE; flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par)); + flow.flowi4_uid = sock_net_uid(xt_net(par), NULL); return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert; } diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index e886147eed11d..fc65d69f23e16 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -65,6 +65,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, struct flowi4 fl4 = { .flowi4_scope = RT_SCOPE_UNIVERSE, .flowi4_iif = LOOPBACK_IFINDEX, + .flowi4_uid = sock_net_uid(nft_net(pkt), NULL), }; const struct net_device *oif; const struct net_device *found; diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index 69d86b040a6af..a01d9b842bd07 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -40,6 +40,7 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, .flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev), .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK, .flowi6_proto = iph->nexthdr, + .flowi6_uid = sock_net_uid(net, NULL), .daddr = iph->saddr, }; int lookup_flags; diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 91faac610e03d..36dc14b34388c 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -66,6 +66,7 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, + .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), }; u32 ret = 0; @@ -163,6 +164,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, + .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), }; struct rt6_info *rt; int lookup_flags; -- GitLab From 96df8360dbb435cc69f7c3c8db44bf8b1c24cd7b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso <pablo@netfilter.org> Date: Mon, 17 Oct 2022 14:12:58 +0200 Subject: [PATCH 2127/2223] netfilter: nf_tables: relax NFTA_SET_ELEM_KEY_END set flags requirements Otherwise EINVAL is bogusly reported to userspace when deleting a set element. NFTA_SET_ELEM_KEY_END does not need to be set in case of: - insertion: if not present, start key is used as end key. - deletion: only start key needs to be specified, end key is ignored. Hence, relax the sanity check. 
Fixes: 88cccd908d51 ("netfilter: nf_tables: NFTA_SET_ELEM_KEY_END requires concat and interval flags") Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nf_tables_api.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a0653a8dfa827..58d9cbc9ccdc7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5865,8 +5865,9 @@ static bool nft_setelem_valid_key_end(const struct nft_set *set, (NFT_SET_CONCAT | NFT_SET_INTERVAL)) { if (flags & NFT_SET_ELEM_INTERVAL_END) return false; - if (!nla[NFTA_SET_ELEM_KEY_END] && - !(flags & NFT_SET_ELEM_CATCHALL)) + + if (nla[NFTA_SET_ELEM_KEY_END] && + flags & NFT_SET_ELEM_CATCHALL) return false; } else { if (nla[NFTA_SET_ELEM_KEY_END]) -- GitLab From 4739824e2d7878dcea88397a6758e31e3c5c124e Mon Sep 17 00:00:00 2001 From: Dan Carpenter <dan.carpenter@oracle.com> Date: Sat, 15 Oct 2022 11:25:56 +0300 Subject: [PATCH 2128/2223] nvme: fix error pointer dereference in error handling There is typo here so it releases the wrong variable. "ctrl->admin_q" was intended instead of "ctrl->fabrics_q". 
Fixes: fe60e8c53411 ("nvme: add common helpers to allocate and free tagsets") Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com> Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com> Signed-off-by: Christoph Hellwig <hch@lst.de> --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 059737c1a2c19..9cbe7854d4883 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4846,7 +4846,7 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, return 0; out_cleanup_admin_q: - blk_mq_destroy_queue(ctrl->fabrics_q); + blk_mq_destroy_queue(ctrl->admin_q); out_free_tagset: blk_mq_free_tag_set(ctrl->admin_tagset); return ret; -- GitLab From ac9b57d4e1e3ecf0122e915bbba1bd4c90ec3031 Mon Sep 17 00:00:00 2001 From: Xander Li <xander_li@kingston.com.tw> Date: Tue, 11 Oct 2022 04:06:42 -0700 Subject: [PATCH 2129/2223] nvme-pci: disable write zeroes on various Kingston SSD Kingston SSDs do support NVMe Write_Zeroes cmd but take long time to process. The firmware version is locked by these SSDs, we can not expect firmware improvement, so disable Write_Zeroes cmd. 
Signed-off-by: Xander Li <xander_li@kingston.com.tw> Signed-off-by: Christoph Hellwig <hch@lst.de> --- drivers/nvme/host/pci.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bcbef6bc5672f..31e577b01257d 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3511,6 +3511,16 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, { PCI_DEVICE(0x2646, 0x2263), /* KINGSTON A2000 NVMe SSD */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, + { PCI_DEVICE(0x2646, 0x5018), /* KINGSTON OM8SFP4xxxxP OS21012 NVMe SSD */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x2646, 0x5016), /* KINGSTON OM3PGP4xxxxP OS21011 NVMe SSD */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x2646, 0x501A), /* KINGSTON OM8PGP4xxxxP OS21005 NVMe SSD */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x2646, 0x501B), /* KINGSTON OM8PGP4xxxxQ OS21005 NVMe SSD */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x2646, 0x501E), /* KINGSTON OM3PGP4xxxxQ OS21011 NVMe SSD */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x1e4B, 0x1001), /* MAXIO MAP1001 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1e4B, 0x1002), /* MAXIO MAP1002 */ -- GitLab From d622f8477a8018974f8df961440dca58224f9c6b Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk> Date: Wed, 12 Oct 2022 12:46:06 +0100 Subject: [PATCH 2130/2223] nvme-apple: don't limit DMA segement size NVMe uses PRPs for data transfers and has no specific limit for a single DMA segement. Limiting the size will cause problems because the block layer assumes PRP-ish devices using a virt boundary mask don't have a segment limit. And while this is true, we also really need to tell the DMA mapping layer about it, otherwise dma-debug will trip over it. 
Fixes: 5bd2927aceba ("nvme-apple: Add initial Apple SoC NVMe driver") Suggested-by: Sven Peter <sven@svenpeter.dev> Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk> [hch: rewrote the commit message based on the PCIe commit] Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Eric Curtin <ecurtin@redhat.com> Reviewed-by: Sven Peter <sven@svenpeter.dev> --- drivers/nvme/host/apple.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 5fc5ea196b400..ff8b083dc5c6d 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1039,6 +1039,8 @@ static void apple_nvme_reset_work(struct work_struct *work) dma_max_mapping_size(anv->dev) >> 9); anv->ctrl.max_segments = NVME_MAX_SEGS; + dma_set_max_seg_size(anv->dev, 0xffffffff); + /* * Enable NVMMU and linear submission queues. * While we could keep those disabled and pretend this is slightly -- GitLab From 6ff5ba97960821fb872ad981eb30374f5cee1fd9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Tue, 18 Oct 2022 16:59:16 +0200 Subject: [PATCH 2131/2223] nvme: add Guenther as nvme-hwmon maintainer Given that none of the overall NVMe maintainers knows this code very deeply it probably makes sense to add Guenther as an additional MAINTAINER for it.
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Sagi Grimberg <sagi@grimberg.me> Acked-by: Guenter Roeck <linux@roeck-us.net> --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 12984711f2fe3..fde92782fbbdf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14640,6 +14640,12 @@ F: drivers/nvme/target/auth.c F: drivers/nvme/target/fabrics-cmd-auth.c F: include/linux/nvme-auth.h +NVM EXPRESS HARDWARE MONITORING SUPPORT +M: Guenter Roeck <linux@roeck-us.net> +L: linux-nvme@lists.infradead.org +S: Supported +F: drivers/nvme/host/hwmon.c + NVM EXPRESS FC TRANSPORT DRIVERS M: James Smart <james.smart@broadcom.com> L: linux-nvme@lists.infradead.org -- GitLab From 7b476affcccfc7e644541a0a719f53fc7bd34c53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com> Date: Fri, 7 Oct 2022 09:51:13 +0200 Subject: [PATCH 2132/2223] drm/sched: add DRM_SCHED_FENCE_DONT_PIPELINE flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Setting this flag on a scheduler fence prevents pipelining of jobs depending on this fence. In other words we always insert a full CPU round trip before dependent jobs are pushed to the pipeline. 
Signed-off-by: Christian König <christian.koenig@amd.com> Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2113#note_1579296 Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Acked-by: Luben Tuikov <luben.tuikov@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20221014081553.114899-1-christian.koenig@amd.com --- drivers/gpu/drm/scheduler/sched_entity.c | 3 ++- include/drm/gpu_scheduler.h | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index 6b25b2f4f5a30..6137537aaea4d 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -385,7 +385,8 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity) } s_fence = to_drm_sched_fence(fence); - if (s_fence && s_fence->sched == sched) { + if (s_fence && s_fence->sched == sched && + !test_bit(DRM_SCHED_FENCE_DONT_PIPELINE, &fence->flags)) { /* * Fence is from the same scheduler, only need to wait for diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index addb135eeea62..289a33e806397 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -32,6 +32,15 @@ #define MAX_WAIT_SCHED_ENTITY_Q_EMPTY msecs_to_jiffies(1000) +/** + * DRM_SCHED_FENCE_DONT_PIPELINE - Prefent dependency pipelining + * + * Setting this flag on a scheduler fence prevents pipelining of jobs depending + * on this fence. In other words we always insert a full CPU round trip before + * dependen jobs are pushed to the hw queue. 
+ */ +#define DRM_SCHED_FENCE_DONT_PIPELINE DMA_FENCE_FLAG_USER_BITS + struct drm_gem_object; struct drm_gpu_scheduler; -- GitLab From 6b8cf94005187952f794c0c4ed3920a1e8accfa3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Tue, 18 Oct 2022 16:55:55 +0200 Subject: [PATCH 2133/2223] nvme-hwmon: consistently ignore errors from nvme_hwmon_init An NVMe controller works perfectly fine even when the hwmon initialization fails. Stop returning errors that do not come from a controller reset from nvme_hwmon_init to handle this case consistently. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Guenter Roeck <linux@roeck-us.net> Reviewed-by: Serge Semin <fancer.lancer@gmail.com> --- drivers/nvme/host/core.c | 6 +++++- drivers/nvme/host/hwmon.c | 13 ++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9cbe7854d4883..dc42206005855 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3262,8 +3262,12 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) return ret; if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) { + /* + * Do not return errors unless we are in a controller reset, + * the controller works perfectly fine without hwmon. 
+ */ ret = nvme_hwmon_init(ctrl); - if (ret < 0) + if (ret == -EINTR) return ret; } diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c index 0a586d7129201..23918bb7bdca2 100644 --- a/drivers/nvme/host/hwmon.c +++ b/drivers/nvme/host/hwmon.c @@ -230,7 +230,7 @@ int nvme_hwmon_init(struct nvme_ctrl *ctrl) data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) - return 0; + return -ENOMEM; data->ctrl = ctrl; mutex_init(&data->read_lock); @@ -238,8 +238,7 @@ int nvme_hwmon_init(struct nvme_ctrl *ctrl) err = nvme_hwmon_get_smart_log(data); if (err) { dev_warn(dev, "Failed to read smart log (error %d)\n", err); - kfree(data); - return err; + goto err_free_data; } hwmon = hwmon_device_register_with_info(dev, "nvme", @@ -247,11 +246,15 @@ int nvme_hwmon_init(struct nvme_ctrl *ctrl) NULL); if (IS_ERR(hwmon)) { dev_warn(dev, "Failed to instantiate hwmon device\n"); - kfree(data); - return PTR_ERR(hwmon); + err = PTR_ERR(hwmon); + goto err_free_data; } ctrl->hwmon_device = hwmon; return 0; + +err_free_data: + kfree(data); + return err; } void nvme_hwmon_exit(struct nvme_ctrl *ctrl) -- GitLab From c94b7f9bab22ac504f9153767676e659988575ad Mon Sep 17 00:00:00 2001 From: Serge Semin <Sergey.Semin@baikalelectronics.ru> Date: Tue, 18 Oct 2022 17:33:52 +0200 Subject: [PATCH 2134/2223] nvme-hwmon: kmalloc the NVME SMART log buffer Recent commit 52fde2c07da6 ("nvme: set dma alignment to dword") has caused a regression on our platform. It turned out that the nvme_get_log() method invocation caused the nvme_hwmon_data structure instance corruption. In particular the nvme_hwmon_data.ctrl pointer was overwritten either with zeros or with garbage. After some research we discovered that the problem happened even before the actual NVME DMA execution, but during the buffer mapping. Since our platform is DMA-noncoherent, the mapping implied the cache-line invalidations or write-backs depending on the DMA-direction parameter. 
In case of the NVME SMART log getting the DMA was performed from-device-to-memory, thus the cache-invalidation was activated during the buffer mapping. Since the log-buffer isn't cache-line aligned, the cache-invalidation caused the neighbour data to be discarded. The neighbouring data turned out to be the data surrounding the buffer in the framework of the nvme_hwmon_data structure. In order to fix that we need to make sure that the whole log-buffer is defined within the cache-line-aligned memory region so the cache-invalidation procedure wouldn't involve the adjacent data. One of the options to guarantee that is to kmalloc the DMA-buffer [1]. Seeing the rest of the NVME core driver prefers that method, it has been chosen to fix this problem too. Note after deeper research we found out that the denoted commit wasn't a root cause of the problem. It just revealed the invalidity by activating the DMA-based NVME SMART log getting performed in the framework of the NVME hwmon driver. The problem was here since the initial commit of the driver.
[1] Documentation/core-api/dma-api-howto.rst Fixes: 400b6a7b13a3 ("nvme: Add hardware monitoring support") Signed-off-by: Serge Semin <Sergey.Semin@baikalelectronics.ru> Signed-off-by: Christoph Hellwig <hch@lst.de> --- drivers/nvme/host/hwmon.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c index 23918bb7bdca2..9e6e56c20ec99 100644 --- a/drivers/nvme/host/hwmon.c +++ b/drivers/nvme/host/hwmon.c @@ -12,7 +12,7 @@ struct nvme_hwmon_data { struct nvme_ctrl *ctrl; - struct nvme_smart_log log; + struct nvme_smart_log *log; struct mutex read_lock; }; @@ -60,14 +60,14 @@ static int nvme_set_temp_thresh(struct nvme_ctrl *ctrl, int sensor, bool under, static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data) { return nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0, - NVME_CSI_NVM, &data->log, sizeof(data->log), 0); + NVME_CSI_NVM, data->log, sizeof(*data->log), 0); } static int nvme_hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long *val) { struct nvme_hwmon_data *data = dev_get_drvdata(dev); - struct nvme_smart_log *log = &data->log; + struct nvme_smart_log *log = data->log; int temp; int err; @@ -163,7 +163,7 @@ static umode_t nvme_hwmon_is_visible(const void *_data, case hwmon_temp_max: case hwmon_temp_min: if ((!channel && data->ctrl->wctemp) || - (channel && data->log.temp_sensor[channel - 1])) { + (channel && data->log->temp_sensor[channel - 1])) { if (data->ctrl->quirks & NVME_QUIRK_NO_TEMP_THRESH_CHANGE) return 0444; @@ -176,7 +176,7 @@ static umode_t nvme_hwmon_is_visible(const void *_data, break; case hwmon_temp_input: case hwmon_temp_label: - if (!channel || data->log.temp_sensor[channel - 1]) + if (!channel || data->log->temp_sensor[channel - 1]) return 0444; break; default: @@ -232,13 +232,19 @@ int nvme_hwmon_init(struct nvme_ctrl *ctrl) if (!data) return -ENOMEM; + data->log = kzalloc(sizeof(*data->log), 
GFP_KERNEL); + if (!data->log) { + err = -ENOMEM; + goto err_free_data; + } + data->ctrl = ctrl; mutex_init(&data->read_lock); err = nvme_hwmon_get_smart_log(data); if (err) { dev_warn(dev, "Failed to read smart log (error %d)\n", err); - goto err_free_data; + goto err_free_log; } hwmon = hwmon_device_register_with_info(dev, "nvme", @@ -247,11 +253,13 @@ int nvme_hwmon_init(struct nvme_ctrl *ctrl) if (IS_ERR(hwmon)) { dev_warn(dev, "Failed to instantiate hwmon device\n"); err = PTR_ERR(hwmon); - goto err_free_data; + goto err_free_log; } ctrl->hwmon_device = hwmon; return 0; +err_free_log: + kfree(data->log); err_free_data: kfree(data); return err; @@ -265,6 +273,7 @@ void nvme_hwmon_exit(struct nvme_ctrl *ctrl) hwmon_device_unregister(ctrl->hwmon_device); ctrl->hwmon_device = NULL; + kfree(data->log); kfree(data); } } -- GitLab From ddd2b8de9f85b388925e7dc46b3890fc1a0d8d24 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg <sagi@grimberg.me> Date: Wed, 28 Sep 2022 09:39:10 +0300 Subject: [PATCH 2135/2223] nvmet: fix workqueue MEM_RECLAIM flushing dependency The keep alive timer needs to stay on nvmet_wq, and not modified to reschedule on the system_wq. 
This fixes a warning: ------------[ cut here ]------------ workqueue: WQ_MEM_RECLAIM nvmet-wq:nvmet_rdma_release_queue_work [nvmet_rdma] is flushing !WQ_MEM_RECLAIM events:nvmet_keep_alive_timer [nvmet] WARNING: CPU: 3 PID: 1086 at kernel/workqueue.c:2628 check_flush_dependency+0x16c/0x1e0 Reported-by: Yi Zhang <yi.zhang@redhat.com> Fixes: 8832cf922151 ("nvmet: use a private workqueue instead of the system workqueue") Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com> Signed-off-by: Christoph Hellwig <hch@lst.de> --- drivers/nvme/target/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 14677145bbba0..aecb5853f8da4 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1176,7 +1176,7 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl) * reset the keep alive timer when the controller is enabled. */ if (ctrl->kato) - mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); + mod_delayed_work(nvmet_wq, &ctrl->ka_work, ctrl->kato * HZ); } static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl) -- GitLab From 94f5a06884074dcd99606d7b329e133ee65ea6ad Mon Sep 17 00:00:00 2001 From: Daniel Wagner <dwagner@suse.de> Date: Fri, 7 Oct 2022 09:29:34 +0200 Subject: [PATCH 2136/2223] nvmet: fix invalid memory reference in nvmet_subsys_attr_qid_max_show The item passed into nvmet_subsys_attr_qid_max_show is not a member of struct nvmet_port, it is part of nvmet_subsys. Hence, don't try to dereference it as struct nvme_ctrl pointer. 
Fixes: 3e980f5995e0 ("nvmet: Expose max queues to configfs") Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com> Link: https://lore.kernel.org/r/20220913064203.133536-1-dwagner@suse.de Signed-off-by: Daniel Wagner <dwagner@suse.de> Reviewed-by: Hannes Reinecke <hare@suse.de> Acked-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Christoph Hellwig <hch@lst.de> --- drivers/nvme/target/configfs.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index e34a2896fedb2..9443ee1d4ae3d 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1290,12 +1290,8 @@ static ssize_t nvmet_subsys_attr_qid_max_show(struct config_item *item, static ssize_t nvmet_subsys_attr_qid_max_store(struct config_item *item, const char *page, size_t cnt) { - struct nvmet_port *port = to_nvmet_port(item); u16 qid_max; - if (nvmet_is_port_enabled(port, __func__)) - return -EACCES; - if (sscanf(page, "%hu\n", &qid_max) != 1) return -EINVAL; -- GitLab From 01f2cf53844b01e691516b465df1b6ab01b03230 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com> Date: Fri, 7 Oct 2022 10:59:58 +0200 Subject: [PATCH 2137/2223] drm/amdgpu: use DRM_SCHED_FENCE_DONT_PIPELINE for VM updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure that we always have a CPU round trip to let the submission code correctly decide if a TLB flush is necessary or not. 
Signed-off-by: Christian König <christian.koenig@amd.com> Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2113#note_1579296 Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Acked-by: Luben Tuikov <luben.tuikov@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20221014081553.114899-2-christian.koenig@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c index 1fd3cbca20a29..c7bf189d50def 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c @@ -115,8 +115,15 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p, amdgpu_bo_fence(p->vm->root.bo, f, true); } - if (fence && !p->immediate) + if (fence && !p->immediate) { + /* + * Most hw generations now have a separate queue for page table + * updates, but when the queue is shared with userspace we need + * the extra CPU round trip to correctly flush the TLB. + */ + set_bit(DRM_SCHED_FENCE_DONT_PIPELINE, &f->flags); swap(*fence, f); + } dma_fence_put(f); return 0; -- GitLab From eb1d39260ee6477e2971f81cec18ba5f6583259d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com> Date: Tue, 18 Oct 2022 19:34:03 +0200 Subject: [PATCH 2138/2223] ACPI: PCI: Fix device reference counting in acpi_get_pci_dev() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 63f534b8bad9 ("ACPI: PCI: Rework acpi_get_pci_dev()") failed to reference count the device returned by acpi_get_pci_dev() as expected by its callers which in some cases may cause device objects to be dropped prematurely. Add the missing get_device() to acpi_get_pci_dev(). Fixes: 63f534b8bad9 ("ACPI: PCI: Rework acpi_get_pci_dev()") Reported-by: Ville Syrjälä <ville.syrjala@linux.intel.com> Signed-off-by: Rafael J. 
Wysocki <rafael.j.wysocki@intel.com> --- drivers/acpi/pci_root.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index c8385ef54c370..4e3db20e9cbb9 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -323,6 +323,7 @@ struct pci_dev *acpi_get_pci_dev(acpi_handle handle) list_for_each_entry(pn, &adev->physical_node_list, node) { if (dev_is_pci(pn->dev)) { + get_device(pn->dev); pci_dev = to_pci_dev(pn->dev); break; } -- GitLab From 7b55c2ed2ba061b65fc51d7a18d37e017085997f Mon Sep 17 00:00:00 2001 From: Manank Patel <pmanank200502@gmail.com> Date: Tue, 18 Oct 2022 11:03:18 +0530 Subject: [PATCH 2139/2223] ethernet: marvell: octeontx2 Fix resource not freed after malloc fix rxsc and txsc not getting freed before going out of scope Fixes: c54ffc73601c ("octeontx2-pf: mcs: Introduce MACSEC hardware offloading") Signed-off-by: Manank Patel <pmanank200502@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c index 9809f551fc2e3..9ec5f38d38a84 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c @@ -815,6 +815,7 @@ free_flowid: cn10k_mcs_free_rsrc(pfvf, MCS_TX, MCS_RSRC_TYPE_FLOWID, txsc->hw_flow_id, false); fail: + kfree(txsc); return ERR_PTR(ret); } @@ -870,6 +871,7 @@ free_flowid: cn10k_mcs_free_rsrc(pfvf, MCS_RX, MCS_RSRC_TYPE_FLOWID, rxsc->hw_flow_id, false); fail: + kfree(rxsc); return ERR_PTR(ret); } -- GitLab From 51f9a8921ceacd7bf0d3f47fa867a64988ba1dcb Mon Sep 17 00:00:00 2001 From: Zhengchao Shao <shaozhengchao@huawei.com> Date: Tue, 18 Oct 2022 14:31:59 +0800 Subject: [PATCH 2140/2223] net: sched: cake: fix null pointer access issue when cake_init() fails MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the default qdisc is cake, if the qdisc of dev_queue fails to be inited during mqprio_init(), cake_reset() is invoked to clear resources. In this case, the tins is NULL, and it will cause gpf issue. The process is as follows: qdisc_create_dflt() cake_init() q->tins = kvcalloc(...) --->failed, q->tins is NULL ... qdisc_put() ... cake_reset() ... cake_dequeue_one() b = &q->tins[...] --->q->tins is NULL The following is the Call Trace information: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] RIP: 0010:cake_dequeue_one+0xc9/0x3c0 Call Trace: <TASK> cake_reset+0xb1/0x140 qdisc_reset+0xed/0x6f0 qdisc_destroy+0x82/0x4c0 qdisc_put+0x9e/0xb0 qdisc_create_dflt+0x2c3/0x4a0 mqprio_init+0xa71/0x1760 qdisc_create+0x3eb/0x1000 tc_modify_qdisc+0x408/0x1720 rtnetlink_rcv_msg+0x38e/0xac0 netlink_rcv_skb+0x12d/0x3a0 netlink_unicast+0x4a2/0x740 netlink_sendmsg+0x826/0xcc0 sock_sendmsg+0xc5/0x100 ____sys_sendmsg+0x583/0x690 ___sys_sendmsg+0xe8/0x160 __sys_sendmsg+0xbf/0x160 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7f89e5122d04 </TASK> Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com> Acked-by: Toke Høiland-Jørgensen <toke@toke.dk> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- net/sched/sch_cake.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 55c6879d2c7e7..87f8ce2c65ee9 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -2224,8 +2224,12 @@ retry: static void cake_reset(struct Qdisc *sch) { + struct cake_sched_data *q = qdisc_priv(sch); u32 c; + if (!q->tins) + return; + for (c = 0; c < CAKE_MAX_TINS; c++) cake_clear_tin(sch, c); } -- GitLab From f5ffa3b1197395501b72c10b35518bf58ef24475 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao <shaozhengchao@huawei.com> Date: Tue, 18 Oct 2022 14:32:00 +0800 Subject: [PATCH 2141/2223] Revert "net: sched: fq_codel: remove redundant resource cleanup in fq_codel_init()" This reverts commit 494f5063b86cd6e972cb41a27e083c9a3664319d. When the default qdisc is fq_codel, if the qdisc of dev_queue fails to be inited during mqprio_init(), fq_codel_reset() is invoked to clear resources. In this case, the flow is NULL, and it will cause gpf issue. The process is as follows: qdisc_create_dflt() fq_codel_init() ... q->flows_cnt = 1024; ... q->flows = kvcalloc(...) --->failed, q->flows is NULL ... qdisc_put() ... fq_codel_reset() ... 
flow = q->flows + i --->q->flows is NULL The following is the Call Trace information: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] RIP: 0010:fq_codel_reset+0x14d/0x350 Call Trace: <TASK> qdisc_reset+0xed/0x6f0 qdisc_destroy+0x82/0x4c0 qdisc_put+0x9e/0xb0 qdisc_create_dflt+0x2c3/0x4a0 mqprio_init+0xa71/0x1760 qdisc_create+0x3eb/0x1000 tc_modify_qdisc+0x408/0x1720 rtnetlink_rcv_msg+0x38e/0xac0 netlink_rcv_skb+0x12d/0x3a0 netlink_unicast+0x4a2/0x740 netlink_sendmsg+0x826/0xcc0 sock_sendmsg+0xc5/0x100 ____sys_sendmsg+0x583/0x690 ___sys_sendmsg+0xe8/0x160 __sys_sendmsg+0xbf/0x160 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7fd272b22d04 </TASK> Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/sched/sch_fq_codel.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 99d318b605682..8c4fee0634366 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -478,24 +478,26 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt, if (opt) { err = fq_codel_change(sch, opt, extack); if (err) - return err; + goto init_failure; } err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) - return err; + goto init_failure; if (!q->flows) { q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_codel_flow), GFP_KERNEL); - if (!q->flows) - return -ENOMEM; - + if (!q->flows) { + err = -ENOMEM; + goto init_failure; + } q->backlogs = kvcalloc(q->flows_cnt, sizeof(u32), GFP_KERNEL); - if (!q->backlogs) - return -ENOMEM; - + if (!q->backlogs) { + err = -ENOMEM; + goto alloc_failure; + } for (i = 0; i < q->flows_cnt; i++) { struct fq_codel_flow *flow = q->flows + i; @@ -508,6 +510,13 @@ static int fq_codel_init(struct Qdisc *sch, 
struct nlattr *opt, else sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; + +alloc_failure: + kvfree(q->flows); + q->flows = NULL; +init_failure: + q->flows_cnt = 0; + return err; } static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) -- GitLab From 2a3fc78210b9f0e85372a2435368962009f480fc Mon Sep 17 00:00:00 2001 From: Zhengchao Shao <shaozhengchao@huawei.com> Date: Tue, 18 Oct 2022 14:32:01 +0800 Subject: [PATCH 2142/2223] net: sched: sfb: fix null pointer access issue when sfb_init() fails When the default qdisc is sfb, if the qdisc of dev_queue fails to be inited during mqprio_init(), sfb_reset() is invoked to clear resources. In this case, the q->qdisc is NULL, and it will cause gpf issue. The process is as follows: qdisc_create_dflt() sfb_init() tcf_block_get() --->failed, q->qdisc is NULL ... qdisc_put() ... sfb_reset() qdisc_reset(q->qdisc) --->q->qdisc is NULL ops = qdisc->ops The following is the Call Trace information: general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] RIP: 0010:qdisc_reset+0x2b/0x6f0 Call Trace: <TASK> sfb_reset+0x37/0xd0 qdisc_reset+0xed/0x6f0 qdisc_destroy+0x82/0x4c0 qdisc_put+0x9e/0xb0 qdisc_create_dflt+0x2c3/0x4a0 mqprio_init+0xa71/0x1760 qdisc_create+0x3eb/0x1000 tc_modify_qdisc+0x408/0x1720 rtnetlink_rcv_msg+0x38e/0xac0 netlink_rcv_skb+0x12d/0x3a0 netlink_unicast+0x4a2/0x740 netlink_sendmsg+0x826/0xcc0 sock_sendmsg+0xc5/0x100 ____sys_sendmsg+0x583/0x690 ___sys_sendmsg+0xe8/0x160 __sys_sendmsg+0xbf/0x160 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7f2164122d04 </TASK> Fixes: e13e02a3c68d ("net_sched: SFB flow scheduler") Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- net/sched/sch_sfb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index e2389fa3cff8a..73ae2e726512a 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -455,7 +455,8 @@ static void sfb_reset(struct Qdisc *sch) { struct sfb_sched_data *q = qdisc_priv(sch); - qdisc_reset(q->qdisc); + if (likely(q->qdisc)) + qdisc_reset(q->qdisc); q->slot = 0; q->double_buffering = false; sfb_zero_all_buckets(q); -- GitLab From 672e97ef689a38cb20c2cc6a1814298fea34461e Mon Sep 17 00:00:00 2001 From: Paul Blakey <paulb@nvidia.com> Date: Tue, 18 Oct 2022 10:34:38 +0300 Subject: [PATCH 2143/2223] net: Fix return value of qdisc ingress handling on success Currently qdisc ingress handling (sch_handle_ingress()) doesn't set a return value and it is left to the old return value of the caller (__netif_receive_skb_core()) which is RX drop, so if the packet is consumed, caller will stop and return this value as if the packet was dropped. This causes a problem in the kernel tcp stack when having an egress tc rule forwarding to an ingress tc rule. The tcp stack sending packets on the device having the egress rule will see the packets as not successfully transmitted (although they actually were), will not advance its internal state of sent data, and packets returning on such tcp stream will be dropped by the tcp stack with reason ack-of-unsent-data. See reproduction in [0] below. Fix that by setting the return value to RX success if the packet was handled successfully. 
[0] Reproduction steps: $ ip link add veth1 type veth peer name peer1 $ ip link add veth2 type veth peer name peer2 $ ifconfig peer1 5.5.5.6/24 up $ ip netns add ns0 $ ip link set dev peer2 netns ns0 $ ip netns exec ns0 ifconfig peer2 5.5.5.5/24 up $ ifconfig veth2 0 up $ ifconfig veth1 0 up #ingress forwarding veth1 <-> veth2 $ tc qdisc add dev veth2 ingress $ tc qdisc add dev veth1 ingress $ tc filter add dev veth2 ingress prio 1 proto all flower \ action mirred egress redirect dev veth1 $ tc filter add dev veth1 ingress prio 1 proto all flower \ action mirred egress redirect dev veth2 #steal packet from peer1 egress to veth2 ingress, bypassing the veth pipe $ tc qdisc add dev peer1 clsact $ tc filter add dev peer1 egress prio 20 proto ip flower \ action mirred ingress redirect dev veth1 #run iperf and see connection not running $ iperf3 -s& $ ip netns exec ns0 iperf3 -c 5.5.5.6 -i 1 #delete egress rule, and run again, now should work $ tc filter del dev peer1 egress $ ip netns exec ns0 iperf3 -c 5.5.5.6 -i 1 Fixes: f697c3e8b35c ("[NET]: Avoid unnecessary cloning for ingress filtering") Signed-off-by: Paul Blakey <paulb@nvidia.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- net/core/dev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index fa53830d06839..3be256051e99b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5136,11 +5136,13 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, case TC_ACT_SHOT: mini_qdisc_qstats_cpu_drop(miniq); kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); + *ret = NET_RX_DROP; return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); + *ret = NET_RX_SUCCESS; return NULL; case TC_ACT_REDIRECT: /* skb_mac_header check was done by cls/act_bpf, so @@ -5153,8 +5155,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, *another = true; break; } + *ret = NET_RX_SUCCESS; return NULL; case TC_ACT_CONSUMED: + *ret = NET_RX_SUCCESS; return NULL; default: break; -- GitLab From fd602f5cb52e336d8c06f8da2d80c76ce2905030 Mon Sep 17 00:00:00 2001 From: Paul Blakey <paulb@nvidia.com> Date: Tue, 18 Oct 2022 10:34:39 +0300 Subject: [PATCH 2144/2223] selftests: add selftest for chaining of tc ingress handling to egress This test runs a simple ingress tc setup between two veth pairs, then adds an egress->ingress rule to test the chaining of tc ingress pipeline to tc egress pipeline. Signed-off-by: Paul Blakey <paulb@nvidia.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> --- tools/testing/selftests/net/Makefile | 1 + .../net/test_ingress_egress_chaining.sh | 79 +++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 tools/testing/selftests/net/test_ingress_egress_chaining.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 2a6b0bc648c4f..69c58362c0edf 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -70,6 +70,7 @@ TEST_PROGS += io_uring_zerocopy_tx.sh TEST_GEN_FILES += bind_bhash TEST_GEN_PROGS += sk_bind_sendto_listen TEST_GEN_PROGS += sk_connect_zero_addr +TEST_PROGS += test_ingress_egress_chaining.sh TEST_FILES := settings diff --git a/tools/testing/selftests/net/test_ingress_egress_chaining.sh b/tools/testing/selftests/net/test_ingress_egress_chaining.sh new file mode 100644 index 0000000000000..08adff6bb3b63 --- /dev/null +++ b/tools/testing/selftests/net/test_ingress_egress_chaining.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This test runs a simple ingress tc setup between two veth pairs, +# and chains a single egress rule to test ingress chaining to egress. +# +# Kselftest framework requirement - SKIP code is 4. 
+ksft_skip=4 + +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit $ksft_skip +fi + +needed_mods="act_mirred cls_flower sch_ingress" +for mod in $needed_mods; do + modinfo $mod &>/dev/null || { echo "SKIP: Need act_mirred module"; exit $ksft_skip; } +done + +ns="ns$((RANDOM%899+100))" +veth1="veth1$((RANDOM%899+100))" +veth2="veth2$((RANDOM%899+100))" +peer1="peer1$((RANDOM%899+100))" +peer2="peer2$((RANDOM%899+100))" +ip_peer1=198.51.100.5 +ip_peer2=198.51.100.6 + +function fail() { + echo "FAIL: $@" >> /dev/stderr + exit 1 +} + +function cleanup() { + killall -q -9 udpgso_bench_rx + ip link del $veth1 &> /dev/null + ip link del $veth2 &> /dev/null + ip netns del $ns &> /dev/null +} +trap cleanup EXIT + +function config() { + echo "Setup veth pairs [$veth1, $peer1], and veth pair [$veth2, $peer2]" + ip link add $veth1 type veth peer name $peer1 + ip link add $veth2 type veth peer name $peer2 + ip addr add $ip_peer1/24 dev $peer1 + ip link set $peer1 up + ip netns add $ns + ip link set dev $peer2 netns $ns + ip netns exec $ns ip addr add $ip_peer2/24 dev $peer2 + ip netns exec $ns ip link set $peer2 up + ip link set $veth1 up + ip link set $veth2 up + + echo "Add tc filter ingress->egress forwarding $veth1 <-> $veth2" + tc qdisc add dev $veth2 ingress + tc qdisc add dev $veth1 ingress + tc filter add dev $veth2 ingress prio 1 proto all flower \ + action mirred egress redirect dev $veth1 + tc filter add dev $veth1 ingress prio 1 proto all flower \ + action mirred egress redirect dev $veth2 + + echo "Add tc filter egress->ingress forwarding $peer1 -> $veth1, bypassing the veth pipe" + tc qdisc add dev $peer1 clsact + tc filter add dev $peer1 egress prio 20 proto ip flower \ + action mirred ingress redirect dev $veth1 +} + +function test_run() { + echo "Run tcp traffic" + ./udpgso_bench_rx -t & + sleep 1 + ip netns exec $ns timeout -k 2 10 ./udpgso_bench_tx -t -l 2 -4 -D $ip_peer1 || fail "traffic failed" + echo "Test passed" +} + +config 
+test_run +trap - EXIT +cleanup -- GitLab From abe3c631447dcd1ba7af972fe6f054bee6f136fa Mon Sep 17 00:00:00 2001 From: "GONG, Ruiqi" <gongruiqi1@huawei.com> Date: Wed, 19 Oct 2022 10:57:10 +0800 Subject: [PATCH 2145/2223] selinux: enable use of both GFP_KERNEL and GFP_ATOMIC in convert_context() The following warning was triggered on a hardware environment: SELinux: Converting 162 SID table entries... BUG: sleeping function called from invalid context at __might_sleep+0x60/0x74 0x0 in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 5943, name: tar CPU: 7 PID: 5943 Comm: tar Tainted: P O 5.10.0 #1 Call trace: dump_backtrace+0x0/0x1c8 show_stack+0x18/0x28 dump_stack+0xe8/0x15c ___might_sleep+0x168/0x17c __might_sleep+0x60/0x74 __kmalloc_track_caller+0xa0/0x7dc kstrdup+0x54/0xac convert_context+0x48/0x2e4 sidtab_context_to_sid+0x1c4/0x36c security_context_to_sid_core+0x168/0x238 security_context_to_sid_default+0x14/0x24 inode_doinit_use_xattr+0x164/0x1e4 inode_doinit_with_dentry+0x1c0/0x488 selinux_d_instantiate+0x20/0x34 security_d_instantiate+0x70/0xbc d_splice_alias+0x4c/0x3c0 ext4_lookup+0x1d8/0x200 [ext4] __lookup_slow+0x12c/0x1e4 walk_component+0x100/0x200 path_lookupat+0x88/0x118 filename_lookup+0x98/0x130 user_path_at_empty+0x48/0x60 vfs_statx+0x84/0x140 vfs_fstatat+0x20/0x30 __se_sys_newfstatat+0x30/0x74 __arm64_sys_newfstatat+0x1c/0x2c el0_svc_common.constprop.0+0x100/0x184 do_el0_svc+0x1c/0x2c el0_svc+0x20/0x34 el0_sync_handler+0x80/0x17c el0_sync+0x13c/0x140 SELinux: Context system_u:object_r:pssp_rsyslog_log_t:s0:c0 is not valid (left unmapped). It was found that within a critical section of spin_lock_irqsave in sidtab_context_to_sid(), convert_context() (hooked by sidtab_convert_params.func) might cause the process to sleep via allocating memory with GFP_KERNEL, which is problematic. As Ondrej pointed out [1], convert_context()/sidtab_convert_params.func has another caller sidtab_convert_tree(), which is okay with GFP_KERNEL. 
Therefore, fix this problem by adding a gfp_t argument for convert_context()/sidtab_convert_params.func and pass GFP_KERNEL/_ATOMIC properly in individual callers. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20221018120111.1474581-1-gongruiqi1@huawei.com/ [1] Reported-by: Tan Ninghao <tanninghao1@huawei.com> Fixes: ee1a84fdfeed ("selinux: overhaul sidtab to fix bug and improve performance") Signed-off-by: GONG, Ruiqi <gongruiqi1@huawei.com> Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com> [PM: wrap long BUG() output lines, tweak subject line] Signed-off-by: Paul Moore <paul@paul-moore.com> --- security/selinux/ss/services.c | 5 +++-- security/selinux/ss/sidtab.c | 4 ++-- security/selinux/ss/sidtab.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index fe5fcf571c564..64a6a37dc36d9 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -2022,7 +2022,8 @@ static inline int convert_context_handle_invalid_context( * in `newc'. Verify that the context is valid * under the new policy. 
*/ -static int convert_context(struct context *oldc, struct context *newc, void *p) +static int convert_context(struct context *oldc, struct context *newc, void *p, + gfp_t gfp_flags) { struct convert_context_args *args; struct ocontext *oc; @@ -2036,7 +2037,7 @@ static int convert_context(struct context *oldc, struct context *newc, void *p) args = p; if (oldc->str) { - s = kstrdup(oldc->str, GFP_KERNEL); + s = kstrdup(oldc->str, gfp_flags); if (!s) return -ENOMEM; diff --git a/security/selinux/ss/sidtab.c b/security/selinux/ss/sidtab.c index a54b8652bfb50..db5cce385bf86 100644 --- a/security/selinux/ss/sidtab.c +++ b/security/selinux/ss/sidtab.c @@ -325,7 +325,7 @@ int sidtab_context_to_sid(struct sidtab *s, struct context *context, } rc = convert->func(context, &dst_convert->context, - convert->args); + convert->args, GFP_ATOMIC); if (rc) { context_destroy(&dst->context); goto out_unlock; @@ -404,7 +404,7 @@ static int sidtab_convert_tree(union sidtab_entry_inner *edst, while (i < SIDTAB_LEAF_ENTRIES && *pos < count) { rc = convert->func(&esrc->ptr_leaf->entries[i].context, &edst->ptr_leaf->entries[i].context, - convert->args); + convert->args, GFP_KERNEL); if (rc) return rc; (*pos)++; diff --git a/security/selinux/ss/sidtab.h b/security/selinux/ss/sidtab.h index 4eff0e49dcb22..9fce0d553fe2c 100644 --- a/security/selinux/ss/sidtab.h +++ b/security/selinux/ss/sidtab.h @@ -65,7 +65,7 @@ struct sidtab_isid_entry { }; struct sidtab_convert_params { - int (*func)(struct context *oldc, struct context *newc, void *args); + int (*func)(struct context *oldc, struct context *newc, void *args, gfp_t gfp_flags); void *args; struct sidtab *target; }; -- GitLab From 096bbeec7bd6fb683831a9ca4850a6b6a3f04740 Mon Sep 17 00:00:00 2001 From: Steve French <stfrench@microsoft.com> Date: Sat, 15 Oct 2022 17:02:30 -0500 Subject: [PATCH 2146/2223] smb3: interface count displayed incorrectly The "Server interfaces" count in /proc/fs/cifs/DebugData increases as the interfaces are 
requeried, rather than being reset to the new value. This could cause a problem if the server disabled multichannel as the iface_count is checked in try_adding_channels to see if multichannel still supported. Also fixes a coverity warning: Addresses-Coverity: 1526374 ("Concurrent data access violations (MISSING_LOCK)") Cc: <stable@vger.kernel.org> Reviewed-by: Bharath SM <bharathsm@microsoft.com> Reviewed-by: Shyam Prasad N <sprasad@microsoft.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/smb2ops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 17b25153cb689..4f53fa012936e 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -530,6 +530,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, p = buf; spin_lock(&ses->iface_lock); + ses->iface_count = 0; /* * Go through iface_list and do kref_put to remove * any unused ifaces. ifaces in use will be removed @@ -651,9 +652,9 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, kref_put(&iface->refcount, release_iface); } else list_add_tail(&info->iface_head, &ses->iface_list); - spin_unlock(&ses->iface_lock); ses->iface_count++; + spin_unlock(&ses->iface_lock); ses->iface_last_update = jiffies; next_iface: nb_iface++; -- GitLab From aa23d45eeb3497bb89f112b407fcc6d21210010f Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I <kishon@ti.com> Date: Wed, 28 Sep 2022 15:01:05 +0530 Subject: [PATCH 2147/2223] MAINTAINERS: Add Vignesh Raghavendra as maintainer of TI DRA7XX/J721E PCI driver Add Vignesh Raghavendra as maintainer of TI DRA7XX/J721E PCI driver. 
Link: https://lore.kernel.org/r/20220928093105.23073-1-kishon@ti.com Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index cf0f185023724..62e6252a83c23 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15839,7 +15839,7 @@ F: Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml F: drivers/pci/controller/dwc/*designware* PCI DRIVER FOR TI DRA7XX/J721E -M: Kishon Vijay Abraham I <kishon@ti.com> +M: Vignesh Raghavendra <vigneshr@ti.com> L: linux-omap@vger.kernel.org L: linux-pci@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -- GitLab From 64d23ff38ac9ea822c9810b60a616e39e2c2c82d Mon Sep 17 00:00:00 2001 From: Robin Murphy <robin.murphy@arm.com> Date: Tue, 18 Oct 2022 14:14:04 +0100 Subject: [PATCH 2148/2223] ACPI: scan: Fix DMA range assignment Assigning the device's dma_range_map from the iterator variable after the loop means it always points to the empty terminator at the end of the map, which is not what we want. Similarly, freeing the iterator on error when it points to somewhere in the middle of the allocated array won't work either. Fix this. Fixes: bf2ee8d0c385 ("ACPI: scan: Support multiple DMA windows with different offsets") Signed-off-by: Robin Murphy <robin.murphy@arm.com> Reviewed-by: Jianmin Lv <lvjianmin@loongson.cn> Tested-by: Jeremy Linton <jeremy.linton@arm.com> Tested-by: Yicong Yang <yangyicong@hisilicon.com> Reviewed-by: Lorenzo Pieralisi <lpieralisi@kernel.org> Signed-off-by: Rafael J. 
Wysocki <rafael.j.wysocki@intel.com> --- drivers/acpi/scan.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 558664d169fcc..024cc373a197f 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1509,9 +1509,12 @@ int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map) goto out; } + *map = r; + list_for_each_entry(rentry, &list, node) { if (rentry->res->start >= rentry->res->end) { - kfree(r); + kfree(*map); + *map = NULL; ret = -EINVAL; dev_dbg(dma_dev, "Invalid DMA regions configuration\n"); goto out; @@ -1523,8 +1526,6 @@ int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map) r->offset = rentry->offset; r++; } - - *map = r; } out: acpi_dev_free_resource_list(&list); -- GitLab From b8caf0a0e04583fb71e21495bef84509182227ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20T=C5=AFma?= <martin.tuma@digiteqautomotive.com> Date: Tue, 18 Oct 2022 16:03:37 +0200 Subject: [PATCH 2149/2223] i2c: xiic: Add platform module alias MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The missing "platform" alias is required for the mgb4 v4l2 driver to load the i2c controller driver when probing the HW. 
Signed-off-by: Martin Tůma <martin.tuma@digiteqautomotive.com> Acked-by: Michal Simek <michal.simek@amd.com> Signed-off-by: Wolfram Sang <wsa@kernel.org> --- drivers/i2c/busses/i2c-xiic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-xiic.c b/drivers/i2c/busses/i2c-xiic.c index b3fe6b2aa3ca9..277a02455cddd 100644 --- a/drivers/i2c/busses/i2c-xiic.c +++ b/drivers/i2c/busses/i2c-xiic.c @@ -920,6 +920,7 @@ static struct platform_driver xiic_i2c_driver = { module_platform_driver(xiic_i2c_driver); +MODULE_ALIAS("platform:" DRIVER_NAME); MODULE_AUTHOR("info@mocean-labs.com"); MODULE_DESCRIPTION("Xilinx I2C bus driver"); MODULE_LICENSE("GPL v2"); -- GitLab From 16bbdfe5fb0e78e0acb13e45fc127e9a296913f2 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli <harshit.m.mogalapalli@oracle.com> Date: Wed, 19 Oct 2022 10:12:18 -0700 Subject: [PATCH 2150/2223] io_uring/msg_ring: Fix NULL pointer dereference in io_msg_send_fd() Syzkaller produced the below call trace: BUG: KASAN: null-ptr-deref in io_msg_ring+0x3cb/0x9f0 Write of size 8 at addr 0000000000000070 by task repro/16399 CPU: 0 PID: 16399 Comm: repro Not tainted 6.1.0-rc1 #28 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 Call Trace: <TASK> dump_stack_lvl+0xcd/0x134 ? io_msg_ring+0x3cb/0x9f0 kasan_report+0xbc/0xf0 ? io_msg_ring+0x3cb/0x9f0 kasan_check_range+0x140/0x190 io_msg_ring+0x3cb/0x9f0 ? io_msg_ring_prep+0x300/0x300 io_issue_sqe+0x698/0xca0 io_submit_sqes+0x92f/0x1c30 __do_sys_io_uring_enter+0xae4/0x24b0 .... 
RIP: 0033:0x7f2eaf8f8289 RSP: 002b:00007fff40939718 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f2eaf8f8289 RDX: 0000000000000000 RSI: 0000000000006f71 RDI: 0000000000000004 RBP: 00007fff409397a0 R08: 0000000000000000 R09: 0000000000000039 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000004006d0 R13: 00007fff40939880 R14: 0000000000000000 R15: 0000000000000000 </TASK> Kernel panic - not syncing: panic_on_warn set ... We don't have a NULL check on file_ptr in io_msg_send_fd() function, so when file_ptr is NULL, src_file is also NULL and get_file() dereferences a NULL pointer and leads to the above crash. Add a NULL check to fix this issue. Fixes: e6130eba8a84 ("io_uring: add support for passing fixed file descriptors") Reported-by: syzkaller <syzkaller@googlegroups.com> Signed-off-by: Harshit Mogalapalli <harshit.m.mogalapalli@oracle.com> Link: https://lore.kernel.org/r/20221019171218.1337614-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/msg_ring.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 4a7e5d030c782..90d2fc6fd80e4 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -95,6 +95,9 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files); file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr; + if (!file_ptr) + goto out_unlock; + src_file = (struct file *) (file_ptr & FFS_MASK); get_file(src_file); -- GitLab From 61775d54d674ff8ec3658495e0dbc537227dc5c1 Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue <bryan.odonoghue@linaro.org> Date: Tue, 18 Oct 2022 03:19:20 +0100 Subject: [PATCH 2151/2223] i2c: qcom-cci: Fix ordering of pm_runtime_xx and i2c_add_adapter When we compile-in the CCI along with the imx412 driver and run on the RB5 we see that i2c_add_adapter() causes the probe of the imx412 driver 
to happen. This probe tries to perform an i2c xfer() and the xfer() in i2c-qcom-cci.c fails on pm_runtime_get() because the i2c-qcom-cci.c::probe() function has not completed to pm_runtime_enable(dev). Fix this sequence by ensuring pm_runtime_xxx() calls happen prior to adding the i2c adapter. Fixes: e517526195de ("i2c: Add Qualcomm CCI I2C driver") Reported-by: Vladimir Zapolskiy <vladimir.zapolskiy@linaro.org> Reviewed-by: Vladimir Zapolskiy <vladimir.zapolskiy@linaro.org> Tested-by: Vladimir Zapolskiy <vladimir.zapolskiy@linaro.org> Cc: <stable@vger.kernel.org> Signed-off-by: Bryan O'Donoghue <bryan.odonoghue@linaro.org> Reviewed-by: Robert Foss <robert.foss@linaro.org> Signed-off-by: Wolfram Sang <wsa@kernel.org> --- drivers/i2c/busses/i2c-qcom-cci.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/i2c/busses/i2c-qcom-cci.c b/drivers/i2c/busses/i2c-qcom-cci.c index 87739fb4388ba..a4b97fe3c3a5b 100644 --- a/drivers/i2c/busses/i2c-qcom-cci.c +++ b/drivers/i2c/busses/i2c-qcom-cci.c @@ -639,6 +639,11 @@ static int cci_probe(struct platform_device *pdev) if (ret < 0) goto error; + pm_runtime_set_autosuspend_delay(dev, MSEC_PER_SEC); + pm_runtime_use_autosuspend(dev); + pm_runtime_set_active(dev); + pm_runtime_enable(dev); + for (i = 0; i < cci->data->num_masters; i++) { if (!cci->master[i].cci) continue; @@ -650,14 +655,12 @@ static int cci_probe(struct platform_device *pdev) } } - pm_runtime_set_autosuspend_delay(dev, MSEC_PER_SEC); - pm_runtime_use_autosuspend(dev); - pm_runtime_set_active(dev); - pm_runtime_enable(dev); - return 0; error_i2c: + pm_runtime_disable(dev); + pm_runtime_dont_use_autosuspend(dev); + for (--i ; i >= 0; i--) { if (cci->master[i].cci) { i2c_del_adapter(&cci->master[i].adap); -- GitLab From a1a824f448ba96c610b85288d844adc9f781828e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski <kuba@kernel.org> Date: Tue, 18 Oct 2022 16:13:10 -0700 Subject: [PATCH 2152/2223] genetlink: fix kdoc warnings Address a bunch 
of kdoc warnings: include/net/genetlink.h:81: warning: Function parameter or member 'module' not described in 'genl_family' include/net/genetlink.h:243: warning: expecting prototype for struct genl_info. Prototype was for struct genl_dumpit_info instead include/net/genetlink.h:419: warning: Function parameter or member 'net' not described in 'genlmsg_unicast' include/net/genetlink.h:438: warning: expecting prototype for gennlmsg_data(). Prototype was for genlmsg_data() instead include/net/genetlink.h:244: warning: Function parameter or member 'op' not described in 'genl_dumpit_info' Link: https://lore.kernel.org/r/20221018231310.1040482-1-kuba@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- include/net/genetlink.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 8f780170e2f87..3d08e67b3cfcc 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -37,6 +37,7 @@ struct genl_info; * do additional, common, filtering and return an error * @post_doit: called after an operation's doit callback, it may * undo operations done by pre_doit, for example release locks + * @module: pointer to the owning module (set to THIS_MODULE) * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups * @resv_start_op: first operation for which reserved fields of the header @@ -173,9 +174,9 @@ struct genl_ops { }; /** - * struct genl_info - info that is available during dumpit op call + * struct genl_dumpit_info - info that is available during dumpit op call * @family: generic netlink family - for internal genl code usage - * @ops: generic netlink ops - for internal genl code usage + * @op: generic netlink ops - for internal genl code usage * @attrs: netlink attributes */ struct genl_dumpit_info { @@ -354,6 +355,7 @@ int genlmsg_multicast_allns(const struct genl_family *family, /** * genlmsg_unicast - unicast a netlink message + * @net: network namespace 
to look up @portid in * @skb: netlink message as socket buffer * @portid: netlink portid of the destination socket */ @@ -373,7 +375,7 @@ static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info) } /** - * gennlmsg_data - head of message payload + * genlmsg_data - head of message payload * @gnlh: genetlink message header */ static inline void *genlmsg_data(const struct genlmsghdr *gnlh) -- GitLab From e6aa4edd2f5b07fdc41de287876dd98c6e44322b Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I <kishon@ti.com> Date: Wed, 19 Oct 2022 12:02:33 -0500 Subject: [PATCH 2153/2223] MAINTAINERS: Update Kishon's email address in PCI endpoint subsystem Update Kishon's email address in PCI endpoint subsystem maintainer entry and mark him as reviewer. Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 62e6252a83c23..866588fb903be 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15856,10 +15856,10 @@ F: Documentation/devicetree/bindings/pci/v3-v360epc-pci.txt F: drivers/pci/controller/pci-v3-semi.c PCI ENDPOINT SUBSYSTEM -M: Kishon Vijay Abraham I <kishon@ti.com> M: Lorenzo Pieralisi <lpieralisi@kernel.org> R: Krzysztof Wilczyński <kw@linux.com> R: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> +R: Kishon Vijay Abraham I <kishon@kernel.org> L: linux-pci@vger.kernel.org S: Supported Q: https://patchwork.kernel.org/project/linux-pci/list/ -- GitLab From 8e77860c62b6eac8bb5b567efe6b8cd232d5f72f Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg <lsahlber@redhat.com> Date: Tue, 18 Oct 2022 17:39:10 +1000 Subject: [PATCH 2154/2223] cifs: drop the lease for cached directories on rmdir or rename When we delete or rename a directory we must also drop any cached lease we have on the directory. 
Fixes: a350d6e73f5e ("cifs: enable caching of directories for which a lease is held") Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cached_dir.c | 21 +++++++++++++++++++++ fs/cifs/cached_dir.h | 4 ++++ fs/cifs/smb2inode.c | 2 ++ 3 files changed, 27 insertions(+) diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c index 20efc9e22761d..60399081046a5 100644 --- a/fs/cifs/cached_dir.c +++ b/fs/cifs/cached_dir.c @@ -340,6 +340,27 @@ smb2_close_cached_fid(struct kref *ref) free_cached_dir(cfid); } +void drop_cached_dir_by_name(const unsigned int xid, struct cifs_tcon *tcon, + const char *name, struct cifs_sb_info *cifs_sb) +{ + struct cached_fid *cfid = NULL; + int rc; + + rc = open_cached_dir(xid, tcon, name, cifs_sb, true, &cfid); + if (rc) { + cifs_dbg(FYI, "no cached dir found for rmdir(%s)\n", name); + return; + } + spin_lock(&cfid->cfids->cfid_list_lock); + if (cfid->has_lease) { + cfid->has_lease = false; + kref_put(&cfid->refcount, smb2_close_cached_fid); + } + spin_unlock(&cfid->cfids->cfid_list_lock); + close_cached_dir(cfid); +} + + void close_cached_dir(struct cached_fid *cfid) { kref_put(&cfid->refcount, smb2_close_cached_fid); diff --git a/fs/cifs/cached_dir.h b/fs/cifs/cached_dir.h index e536304ca2ce4..2f4e764c9ca9a 100644 --- a/fs/cifs/cached_dir.h +++ b/fs/cifs/cached_dir.h @@ -69,6 +69,10 @@ extern int open_cached_dir_by_dentry(struct cifs_tcon *tcon, struct dentry *dentry, struct cached_fid **cfid); extern void close_cached_dir(struct cached_fid *cfid); +extern void drop_cached_dir_by_name(const unsigned int xid, + struct cifs_tcon *tcon, + const char *name, + struct cifs_sb_info *cifs_sb); extern void close_all_cached_dirs(struct cifs_sb_info *cifs_sb); extern void invalidate_all_cached_dirs(struct cifs_tcon *tcon); extern int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]); diff --git a/fs/cifs/smb2inode.c 
b/fs/cifs/smb2inode.c index a6640e6ea58bc..68e08c85fbb87 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -655,6 +655,7 @@ int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { + drop_cached_dir_by_name(xid, tcon, name, cifs_sb); return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE, NULL, SMB2_OP_RMDIR, NULL, NULL, NULL); @@ -698,6 +699,7 @@ smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon, { struct cifsFileInfo *cfile; + drop_cached_dir_by_name(xid, tcon, from_name, cifs_sb); cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile); return smb2_set_path_attr(xid, tcon, from_name, to_name, -- GitLab From 01f2ee7e325611524078009d70392a5d5eca0945 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara <pc@cjr.nz> Date: Wed, 19 Oct 2022 11:25:37 -0300 Subject: [PATCH 2155/2223] cifs: fix memory leaks in session setup We were only zeroing out the ntlmssp blob but forgot to free the allocated buffer in the end of SMB2_sess_auth_rawntlmssp_negotiate() and SMB2_sess_auth_rawntlmssp_authenticate() functions. This fixes below kmemleak reports: unreferenced object 0xffff88800ddcfc60 (size 96): comm "mount.cifs", pid 758, jiffies 4294696066 (age 42.967s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
backtrace: [<00000000d0beeb29>] __kmalloc+0x39/0xa0 [<00000000e3834047>] build_ntlmssp_smb3_negotiate_blob+0x2c/0x110 [cifs] [<00000000e85f5ab2>] SMB2_sess_auth_rawntlmssp_negotiate+0xd3/0x230 [cifs] [<0000000080fdb897>] SMB2_sess_setup+0x16c/0x2a0 [cifs] [<000000009af320a8>] cifs_setup_session+0x13b/0x370 [cifs] [<00000000f15d5982>] cifs_get_smb_ses+0x643/0xb90 [cifs] [<00000000fe15eb90>] mount_get_conns+0x63/0x3e0 [cifs] [<00000000768aba03>] mount_get_dfs_conns+0x16/0xa0 [cifs] [<00000000cf1cf146>] cifs_mount+0x1c2/0x9a0 [cifs] [<000000000d66b51e>] cifs_smb3_do_mount+0x10e/0x710 [cifs] [<0000000077a996c5>] smb3_get_tree+0xf4/0x200 [cifs] [<0000000094dbd041>] vfs_get_tree+0x23/0xc0 [<000000003a8561de>] path_mount+0x2d3/0xb50 [<00000000ed5c86d6>] __x64_sys_mount+0x102/0x140 [<00000000142142f3>] do_syscall_64+0x3b/0x90 [<00000000e2b89731>] entry_SYSCALL_64_after_hwframe+0x63/0xcd unreferenced object 0xffff88801437f000 (size 512): comm "mount.cifs", pid 758, jiffies 4294696067 (age 42.970s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
backtrace: [<00000000d0beeb29>] __kmalloc+0x39/0xa0 [<00000000004f53d2>] build_ntlmssp_auth_blob+0x4f/0x340 [cifs] [<000000005f333084>] SMB2_sess_auth_rawntlmssp_authenticate+0xd4/0x250 [cifs] [<0000000080fdb897>] SMB2_sess_setup+0x16c/0x2a0 [cifs] [<000000009af320a8>] cifs_setup_session+0x13b/0x370 [cifs] [<00000000f15d5982>] cifs_get_smb_ses+0x643/0xb90 [cifs] [<00000000fe15eb90>] mount_get_conns+0x63/0x3e0 [cifs] [<00000000768aba03>] mount_get_dfs_conns+0x16/0xa0 [cifs] [<00000000cf1cf146>] cifs_mount+0x1c2/0x9a0 [cifs] [<000000000d66b51e>] cifs_smb3_do_mount+0x10e/0x710 [cifs] [<0000000077a996c5>] smb3_get_tree+0xf4/0x200 [cifs] [<0000000094dbd041>] vfs_get_tree+0x23/0xc0 [<000000003a8561de>] path_mount+0x2d3/0xb50 [<00000000ed5c86d6>] __x64_sys_mount+0x102/0x140 [<00000000142142f3>] do_syscall_64+0x3b/0x90 [<00000000e2b89731>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: a4e430c8c8ba ("cifs: replace kfree() with kfree_sensitive() for sensitive data") Signed-off-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/smb2pdu.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index c930b63bc422f..a5695748a89b1 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1341,14 +1341,13 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) static void SMB2_sess_free_buffer(struct SMB2_sess_data *sess_data) { - int i; + struct kvec *iov = sess_data->iov; - /* zero the session data before freeing, as it might contain sensitive info (keys, etc) */ - for (i = 0; i < 2; i++) - if (sess_data->iov[i].iov_base) - memzero_explicit(sess_data->iov[i].iov_base, sess_data->iov[i].iov_len); + /* iov[1] is already freed by caller */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && iov[0].iov_base) + memzero_explicit(iov[0].iov_base, iov[0].iov_len); - free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + free_rsp_buf(sess_data->buf0_type, 
iov[0].iov_base); sess_data->buf0_type = CIFS_NO_BUFFER; } @@ -1578,7 +1577,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) } out: - memzero_explicit(ntlmssp_blob, blob_length); + kfree_sensitive(ntlmssp_blob); SMB2_sess_free_buffer(sess_data); if (!rc) { sess_data->result = 0; @@ -1662,7 +1661,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) } #endif out: - memzero_explicit(ntlmssp_blob, blob_length); + kfree_sensitive(ntlmssp_blob); SMB2_sess_free_buffer(sess_data); kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; -- GitLab From 73b1b8d25e39a1478b3792a7075f43e053ee62c2 Mon Sep 17 00:00:00 2001 From: Steve French <stfrench@microsoft.com> Date: Wed, 19 Oct 2022 00:30:04 -0500 Subject: [PATCH 2156/2223] cifs: update internal module number To 2.40 Signed-off-by: Steve French <stfrench@microsoft.com> --- fs/cifs/cifsfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 5b4a7a32bdc58..388b745a978e2 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -153,6 +153,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 39 -#define CIFS_VERSION "2.39" +#define SMB3_PRODUCT_BUILD 40 +#define CIFS_VERSION "2.40" #endif /* _CIFSFS_H */ -- GitLab From c2bf23e4a5af37a4d77901d9ff14c50a269f143d Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren <pieter.jansen-van-vuuren@amd.com> Date: Tue, 18 Oct 2022 10:28:41 +0100 Subject: [PATCH 2157/2223] sfc: include vport_id in filter spec hash and equal() Filters on different vports are qualified by different implicit MACs and/or VLANs, so shouldn't be considered equal even if their other match fields are identical. 
Fixes: 7c460d9be610 ("sfc: Extend and abstract efx_filter_spec to cover Huntington/EF10") Co-developed-by: Edward Cree <ecree.xilinx@gmail.com> Signed-off-by: Edward Cree <ecree.xilinx@gmail.com> Signed-off-by: Pieter Jansen van Vuuren <pieter.jansen-van-vuuren@amd.com> Reviewed-by: Martin Habets <habetsm.xilinx@gmail.com> Link: https://lore.kernel.org/r/20221018092841.32206-1-pieter.jansen-van-vuuren@amd.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/ethernet/sfc/filter.h | 4 ++-- drivers/net/ethernet/sfc/rx_common.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/sfc/filter.h b/drivers/net/ethernet/sfc/filter.h index be72e71da0277..5f201a547e5b5 100644 --- a/drivers/net/ethernet/sfc/filter.h +++ b/drivers/net/ethernet/sfc/filter.h @@ -162,9 +162,9 @@ struct efx_filter_spec { u32 priority:2; u32 flags:6; u32 dmaq_id:12; - u32 vport_id; u32 rss_context; - __be16 outer_vid __aligned(4); /* allow jhash2() of match values */ + u32 vport_id; + __be16 outer_vid; __be16 inner_vid; u8 loc_mac[ETH_ALEN]; u8 rem_mac[ETH_ALEN]; diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c index 4826e6a7e4ce3..9220afeddee81 100644 --- a/drivers/net/ethernet/sfc/rx_common.c +++ b/drivers/net/ethernet/sfc/rx_common.c @@ -660,17 +660,17 @@ bool efx_filter_spec_equal(const struct efx_filter_spec *left, (EFX_FILTER_FLAG_RX | EFX_FILTER_FLAG_TX))) return false; - return memcmp(&left->outer_vid, &right->outer_vid, + return memcmp(&left->vport_id, &right->vport_id, sizeof(struct efx_filter_spec) - - offsetof(struct efx_filter_spec, outer_vid)) == 0; + offsetof(struct efx_filter_spec, vport_id)) == 0; } u32 efx_filter_spec_hash(const struct efx_filter_spec *spec) { - BUILD_BUG_ON(offsetof(struct efx_filter_spec, outer_vid) & 3); - return jhash2((const u32 *)&spec->outer_vid, + BUILD_BUG_ON(offsetof(struct efx_filter_spec, vport_id) & 3); + return jhash2((const u32 *)&spec->vport_id, 
(sizeof(struct efx_filter_spec) - - offsetof(struct efx_filter_spec, outer_vid)) / 4, + offsetof(struct efx_filter_spec, vport_id)) / 4, 0); } -- GitLab From 258ad2fe5ede773625adfda88b173f4123e59f45 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Tue, 18 Oct 2022 21:16:07 +0800 Subject: [PATCH 2158/2223] wwan_hwsim: fix possible memory leak in wwan_hwsim_dev_new() Inject fault while probing module, if device_register() fails, but the refcount of kobject is not decreased to 0, the name allocated in dev_set_name() is leaked. Fix this by calling put_device(), so that name can be freed in callback function kobject_cleanup(). unreferenced object 0xffff88810152ad20 (size 8): comm "modprobe", pid 252, jiffies 4294849206 (age 22.713s) hex dump (first 8 bytes): 68 77 73 69 6d 30 00 ff hwsim0.. backtrace: [<000000009c3504ed>] __kmalloc_node_track_caller+0x44/0x1b0 [<00000000c0228a5e>] kvasprintf+0xb5/0x140 [<00000000cff8c21f>] kvasprintf_const+0x55/0x180 [<0000000055a1e073>] kobject_set_name_vargs+0x56/0x150 [<000000000a80b139>] dev_set_name+0xab/0xe0 Fixes: f36a111a74e7 ("wwan_hwsim: WWAN device simulator") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Reviewed-by: Loic Poulain <loic.poulain@linaro.org> Acked-by: Sergey Ryazanov <ryazanov.s.a@gmail.com> Link: https://lore.kernel.org/r/20221018131607.1901641-1-yangyingliang@huawei.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/wwan/wwan_hwsim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wwan/wwan_hwsim.c b/drivers/net/wwan/wwan_hwsim.c index ff09a8cedf938..2397a903d8f54 100644 --- a/drivers/net/wwan/wwan_hwsim.c +++ b/drivers/net/wwan/wwan_hwsim.c @@ -311,7 +311,7 @@ err_unreg_dev: return ERR_PTR(err); err_free_dev: - kfree(dev); + put_device(&dev->dev); return ERR_PTR(err); } -- GitLab From ff2f5ec5d009844ec28f171123f9e58750cef4bf Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Tue, 18 Oct 2022 
20:24:51 +0800 Subject: [PATCH 2159/2223] net: hns: fix possible memory leak in hnae_ae_register() Inject fault while probing module, if device_register() fails, but the refcount of kobject is not decreased to 0, the name allocated in dev_set_name() is leaked. Fix this by calling put_device(), so that name can be freed in callback function kobject_cleanup(). unreferenced object 0xffff00c01aba2100 (size 128): comm "systemd-udevd", pid 1259, jiffies 4294903284 (age 294.152s) hex dump (first 32 bytes): 68 6e 61 65 30 00 00 00 18 21 ba 1a c0 00 ff ff hnae0....!...... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000034783f26>] slab_post_alloc_hook+0xa0/0x3e0 [<00000000748188f2>] __kmem_cache_alloc_node+0x164/0x2b0 [<00000000ab0743e8>] __kmalloc_node_track_caller+0x6c/0x390 [<000000006c0ffb13>] kvasprintf+0x8c/0x118 [<00000000fa27bfe1>] kvasprintf_const+0x60/0xc8 [<0000000083e10ed7>] kobject_set_name_vargs+0x3c/0xc0 [<000000000b87affc>] dev_set_name+0x7c/0xa0 [<000000003fd8fe26>] hnae_ae_register+0xcc/0x190 [hnae] [<00000000fe97edc9>] hns_dsaf_ae_init+0x9c/0x108 [hns_dsaf] [<00000000c36ff1eb>] hns_dsaf_probe+0x548/0x748 [hns_dsaf] Fixes: 6fe6611ff275 ("net: add Hisilicon Network Subsystem hnae framework support") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Reviewed-by: Leon Romanovsky <leonro@nvidia.com> Link: https://lore.kernel.org/r/20221018122451.1749171-1-yangyingliang@huawei.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/ethernet/hisilicon/hns/hnae.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns/hnae.c b/drivers/net/ethernet/hisilicon/hns/hnae.c index 00fafc0f85121..430eccea8e5e9 100644 --- a/drivers/net/ethernet/hisilicon/hns/hnae.c +++ b/drivers/net/ethernet/hisilicon/hns/hnae.c @@ -419,8 +419,10 @@ int hnae_ae_register(struct hnae_ae_dev *hdev, struct module *owner) hdev->cls_dev.release = hnae_release; 
(void)dev_set_name(&hdev->cls_dev, "hnae%d", hdev->id); ret = device_register(&hdev->cls_dev); - if (ret) + if (ret) { + put_device(&hdev->cls_dev); return ret; + } __module_get(THIS_MODULE); -- GitLab From ebda44da44f6f309d302522b049f43d6f829f7aa Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Tue, 18 Oct 2022 20:32:58 +0000 Subject: [PATCH 2160/2223] net: sched: fix race condition in qdisc_graft() We had one syzbot report [1] in syzbot queue for a while. I was waiting for more occurrences and/or a repro but Dmitry Vyukov spotted the issue right away. <quoting Dmitry> qdisc_graft() drops reference to qdisc in notify_and_destroy while it's still assigned to dev->qdisc </quoting> Indeed, RCU rules are clear when replacing a data structure. The visible pointer (dev->qdisc in this case) must be updated to the new object _before_ RCU grace period is started (qdisc_put(old) in this case). [1] BUG: KASAN: use-after-free in __tcf_qdisc_find.part.0+0xa3a/0xac0 net/sched/cls_api.c:1066 Read of size 4 at addr ffff88802065e038 by task syz-executor.4/21027 CPU: 0 PID: 21027 Comm: syz-executor.4 Not tainted 6.0.0-rc3-syzkaller-00363-g7726d4c3e60b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/26/2022 Call Trace: <TASK> __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:317 [inline] print_report.cold+0x2ba/0x719 mm/kasan/report.c:433 kasan_report+0xb1/0x1e0 mm/kasan/report.c:495 __tcf_qdisc_find.part.0+0xa3a/0xac0 net/sched/cls_api.c:1066 __tcf_qdisc_find net/sched/cls_api.c:1051 [inline] tc_new_tfilter+0x34f/0x2200 net/sched/cls_api.c:2018 rtnetlink_rcv_msg+0x955/0xca0 net/core/rtnetlink.c:6081 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2501 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0x543/0x7f0 net/netlink/af_netlink.c:1345 netlink_sendmsg+0x917/0xe10 net/netlink/af_netlink.c:1921 
sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:734 ____sys_sendmsg+0x6eb/0x810 net/socket.c:2482 ___sys_sendmsg+0x110/0x1b0 net/socket.c:2536 __sys_sendmsg+0xf3/0x1c0 net/socket.c:2565 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f5efaa89279 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f5efbc31168 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f5efab9bf80 RCX: 00007f5efaa89279 RDX: 0000000000000000 RSI: 0000000020000140 RDI: 0000000000000005 RBP: 00007f5efaae32e9 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007f5efb0cfb1f R14: 00007f5efbc31300 R15: 0000000000022000 </TASK> Allocated by task 21027: kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38 kasan_set_track mm/kasan/common.c:45 [inline] set_alloc_info mm/kasan/common.c:437 [inline] ____kasan_kmalloc mm/kasan/common.c:516 [inline] ____kasan_kmalloc mm/kasan/common.c:475 [inline] __kasan_kmalloc+0xa9/0xd0 mm/kasan/common.c:525 kmalloc_node include/linux/slab.h:623 [inline] kzalloc_node include/linux/slab.h:744 [inline] qdisc_alloc+0xb0/0xc50 net/sched/sch_generic.c:938 qdisc_create_dflt+0x71/0x4a0 net/sched/sch_generic.c:997 attach_one_default_qdisc net/sched/sch_generic.c:1152 [inline] netdev_for_each_tx_queue include/linux/netdevice.h:2437 [inline] attach_default_qdiscs net/sched/sch_generic.c:1170 [inline] dev_activate+0x760/0xcd0 net/sched/sch_generic.c:1229 __dev_open+0x393/0x4d0 net/core/dev.c:1441 __dev_change_flags+0x583/0x750 net/core/dev.c:8556 rtnl_configure_link+0xee/0x240 net/core/rtnetlink.c:3189 rtnl_newlink_create net/core/rtnetlink.c:3371 [inline] __rtnl_newlink+0x10b8/0x17e0 
net/core/rtnetlink.c:3580 rtnl_newlink+0x64/0xa0 net/core/rtnetlink.c:3593 rtnetlink_rcv_msg+0x43a/0xca0 net/core/rtnetlink.c:6090 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2501 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0x543/0x7f0 net/netlink/af_netlink.c:1345 netlink_sendmsg+0x917/0xe10 net/netlink/af_netlink.c:1921 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:734 ____sys_sendmsg+0x6eb/0x810 net/socket.c:2482 ___sys_sendmsg+0x110/0x1b0 net/socket.c:2536 __sys_sendmsg+0xf3/0x1c0 net/socket.c:2565 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Freed by task 21020: kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38 kasan_set_track+0x21/0x30 mm/kasan/common.c:45 kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:370 ____kasan_slab_free mm/kasan/common.c:367 [inline] ____kasan_slab_free+0x166/0x1c0 mm/kasan/common.c:329 kasan_slab_free include/linux/kasan.h:200 [inline] slab_free_hook mm/slub.c:1754 [inline] slab_free_freelist_hook+0x8b/0x1c0 mm/slub.c:1780 slab_free mm/slub.c:3534 [inline] kfree+0xe2/0x580 mm/slub.c:4562 rcu_do_batch kernel/rcu/tree.c:2245 [inline] rcu_core+0x7b5/0x1890 kernel/rcu/tree.c:2505 __do_softirq+0x1d3/0x9c6 kernel/softirq.c:571 Last potentially related work creation: kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38 __kasan_record_aux_stack+0xbe/0xd0 mm/kasan/generic.c:348 call_rcu+0x99/0x790 kernel/rcu/tree.c:2793 qdisc_put+0xcd/0xe0 net/sched/sch_generic.c:1083 notify_and_destroy net/sched/sch_api.c:1012 [inline] qdisc_graft+0xeb1/0x1270 net/sched/sch_api.c:1084 tc_modify_qdisc+0xbb7/0x1a00 net/sched/sch_api.c:1671 rtnetlink_rcv_msg+0x43a/0xca0 net/core/rtnetlink.c:6090 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2501 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0x543/0x7f0 net/netlink/af_netlink.c:1345 
netlink_sendmsg+0x917/0xe10 net/netlink/af_netlink.c:1921 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:734 ____sys_sendmsg+0x6eb/0x810 net/socket.c:2482 ___sys_sendmsg+0x110/0x1b0 net/socket.c:2536 __sys_sendmsg+0xf3/0x1c0 net/socket.c:2565 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Second to last potentially related work creation: kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38 __kasan_record_aux_stack+0xbe/0xd0 mm/kasan/generic.c:348 kvfree_call_rcu+0x74/0x940 kernel/rcu/tree.c:3322 neigh_destroy+0x431/0x630 net/core/neighbour.c:912 neigh_release include/net/neighbour.h:454 [inline] neigh_cleanup_and_release+0x1f8/0x330 net/core/neighbour.c:103 neigh_del net/core/neighbour.c:225 [inline] neigh_remove_one+0x37d/0x460 net/core/neighbour.c:246 neigh_forced_gc net/core/neighbour.c:276 [inline] neigh_alloc net/core/neighbour.c:447 [inline] ___neigh_create+0x18b5/0x29a0 net/core/neighbour.c:642 ip6_finish_output2+0xfb8/0x1520 net/ipv6/ip6_output.c:125 __ip6_finish_output net/ipv6/ip6_output.c:195 [inline] ip6_finish_output+0x690/0x1160 net/ipv6/ip6_output.c:206 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x1ed/0x540 net/ipv6/ip6_output.c:227 dst_output include/net/dst.h:451 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] NF_HOOK include/linux/netfilter.h:301 [inline] mld_sendpack+0xa09/0xe70 net/ipv6/mcast.c:1820 mld_send_cr net/ipv6/mcast.c:2121 [inline] mld_ifc_work+0x71c/0xdc0 net/ipv6/mcast.c:2653 process_one_work+0x991/0x1610 kernel/workqueue.c:2289 worker_thread+0x665/0x1080 kernel/workqueue.c:2436 kthread+0x2e4/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 The buggy address belongs to the object at ffff88802065e000 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 56 bytes inside of 1024-byte region [ffff88802065e000, ffff88802065e400) 
The buggy address belongs to the physical page: page:ffffea0000819600 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x20658 head:ffffea0000819600 order:3 compound_mapcount:0 compound_pincount:0 flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000010200 0000000000000000 dead000000000001 ffff888011841dc0 raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 3, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 3523, tgid 3523 (sshd), ts 41495190986, free_ts 41417713212 prep_new_page mm/page_alloc.c:2532 [inline] get_page_from_freelist+0x109b/0x2ce0 mm/page_alloc.c:4283 __alloc_pages+0x1c7/0x510 mm/page_alloc.c:5515 alloc_pages+0x1a6/0x270 mm/mempolicy.c:2270 alloc_slab_page mm/slub.c:1824 [inline] allocate_slab+0x27e/0x3d0 mm/slub.c:1969 new_slab mm/slub.c:2029 [inline] ___slab_alloc+0x7f1/0xe10 mm/slub.c:3031 __slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3118 slab_alloc_node mm/slub.c:3209 [inline] __kmalloc_node_track_caller+0x2f2/0x380 mm/slub.c:4955 kmalloc_reserve net/core/skbuff.c:358 [inline] __alloc_skb+0xd9/0x2f0 net/core/skbuff.c:430 alloc_skb_fclone include/linux/skbuff.h:1307 [inline] tcp_stream_alloc_skb+0x38/0x580 net/ipv4/tcp.c:861 tcp_sendmsg_locked+0xc36/0x2f80 net/ipv4/tcp.c:1325 tcp_sendmsg+0x2b/0x40 net/ipv4/tcp.c:1483 inet_sendmsg+0x99/0xe0 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:734 sock_write_iter+0x291/0x3d0 net/socket.c:1108 call_write_iter include/linux/fs.h:2187 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x9e9/0xdd0 fs/read_write.c:578 ksys_write+0x1e8/0x250 fs/read_write.c:631 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1449 [inline] 
free_pcp_prepare+0x5e4/0xd20 mm/page_alloc.c:1499 free_unref_page_prepare mm/page_alloc.c:3380 [inline] free_unref_page+0x19/0x4d0 mm/page_alloc.c:3476 __unfreeze_partials+0x17c/0x1a0 mm/slub.c:2548 qlink_free mm/kasan/quarantine.c:168 [inline] qlist_free_all+0x6a/0x170 mm/kasan/quarantine.c:187 kasan_quarantine_reduce+0x180/0x200 mm/kasan/quarantine.c:294 __kasan_slab_alloc+0xa2/0xc0 mm/kasan/common.c:447 kasan_slab_alloc include/linux/kasan.h:224 [inline] slab_post_alloc_hook mm/slab.h:727 [inline] slab_alloc_node mm/slub.c:3243 [inline] slab_alloc mm/slub.c:3251 [inline] __kmem_cache_alloc_lru mm/slub.c:3258 [inline] kmem_cache_alloc+0x267/0x3b0 mm/slub.c:3268 kmem_cache_zalloc include/linux/slab.h:723 [inline] alloc_buffer_head+0x20/0x140 fs/buffer.c:2974 alloc_page_buffers+0x280/0x790 fs/buffer.c:829 create_empty_buffers+0x2c/0xee0 fs/buffer.c:1558 ext4_block_write_begin+0x1004/0x1530 fs/ext4/inode.c:1074 ext4_da_write_begin+0x422/0xae0 fs/ext4/inode.c:2996 generic_perform_write+0x246/0x560 mm/filemap.c:3738 ext4_buffered_write_iter+0x15b/0x460 fs/ext4/file.c:270 ext4_file_write_iter+0x44a/0x1660 fs/ext4/file.c:679 call_write_iter include/linux/fs.h:2187 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x9e9/0xdd0 fs/read_write.c:578 Fixes: af356afa010f ("net_sched: reintroduce dev->qdisc for use by sch_api") Reported-by: syzbot <syzkaller@googlegroups.com> Diagnosed-by: Dmitry Vyukov <dvyukov@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Link: https://lore.kernel.org/r/20221018203258.2793282-1-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/sched/sch_api.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index c98af0ada706e..4a27dfb1ba0fa 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1099,12 +1099,13 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, skip: if (!ingress) { - notify_and_destroy(net, skb, 
n, classid, - rtnl_dereference(dev->qdisc), new); + old = rtnl_dereference(dev->qdisc); if (new && !new->ops->attach) qdisc_refcount_inc(new); rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc); + notify_and_destroy(net, skb, n, classid, old, new); + if (new && new->ops->attach) new->ops->attach(new); } else { -- GitLab From 72495b5ab456ec9f05d587238d1e2fa8e9ea63ec Mon Sep 17 00:00:00 2001 From: Yushan Zhou <katrinzhou@tencent.com> Date: Tue, 18 Oct 2022 18:01:32 +0800 Subject: [PATCH 2161/2223] ublk_drv: use flexible-array member instead of zero-length array Eliminate the following coccicheck warning: ./drivers/block/ublk_drv.c:127:16-19: WARNING use flexible-array member instead Signed-off-by: Yushan Zhou <katrinzhou@tencent.com> Link: https://lore.kernel.org/r/20221018100132.355393-1-zys.zljxml@gmail.com Reviewed-by: Ming Lei <ming.lei@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 2651bf41dde31..5afce6ffaadfa 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -124,7 +124,7 @@ struct ublk_queue { bool force_abort; unsigned short nr_io_ready; /* how many ios setup */ struct ublk_device *dev; - struct ublk_io ios[0]; + struct ublk_io ios[]; }; #define UBLK_DAEMON_MONITOR_PERIOD (5 * HZ) -- GitLab From 7f378c03aa4952507521174fb0da7b24a9ad0be6 Mon Sep 17 00:00:00 2001 From: Felix Riemann <felix.riemann@sma.de> Date: Tue, 18 Oct 2022 12:47:54 +0200 Subject: [PATCH 2162/2223] net: phy: dp83822: disable MDI crossover status change interrupt If the cable is disconnected the PHY seems to toggle between MDI and MDI-X modes. With the MDI crossover status interrupt active this causes roughly 10 interrupts per second. As the crossover status isn't checked by the driver, the interrupt can be disabled to reduce the interrupt load. 
Fixes: 87461f7a58ab ("net: phy: DP83822 initial driver submission") Signed-off-by: Felix Riemann <felix.riemann@sma.de> Reviewed-by: Andrew Lunn <andrew@lunn.ch> Link: https://lore.kernel.org/r/20221018104755.30025-1-svc.sw.rte.linux@sma.de Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/phy/dp83822.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index 8549e0e356c9b..b60db8b6f4774 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -254,8 +254,7 @@ static int dp83822_config_intr(struct phy_device *phydev) DP83822_EEE_ERROR_CHANGE_INT_EN); if (!dp83822->fx_enabled) - misr_status |= DP83822_MDI_XOVER_INT_EN | - DP83822_ANEG_ERR_INT_EN | + misr_status |= DP83822_ANEG_ERR_INT_EN | DP83822_WOL_PKT_INT_EN; err = phy_write(phydev, MII_DP83822_MISR2, misr_status); -- GitLab From 7089003304c67658caead22f841840fc4a26b198 Mon Sep 17 00:00:00 2001 From: David Gow <davidgow@google.com> Date: Wed, 19 Oct 2022 15:32:40 +0800 Subject: [PATCH 2163/2223] drm: tests: Fix a buffer overflow in format_helper_test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xrgb2101010 format conversion test (unlike for other formats) does an endianness conversion on the results. However, it always converts TEST_BUF_SIZE 32-bit integers, which results in reading from (and writing to) more memory than is present in the result buffer. Instead, use the buffer size, divided by sizeof(u32). 
The issue could be reproduced with KASAN: ./tools/testing/kunit/kunit.py run --kunitconfig drivers/gpu/drm/tests \ --kconfig_add CONFIG_KASAN=y --kconfig_add CONFIG_KASAN_VMALLOC=y \ --kconfig_add CONFIG_KASAN_KUNIT_TEST=y \ drm_format_helper_test.*xrgb2101010 Reported-by: Linux Kernel Functional Testing <lkft@linaro.org> Fixes: 453114319699 ("drm/format-helper: Add KUnit tests for drm_fb_xrgb8888_to_xrgb2101010()") Signed-off-by: David Gow <davidgow@google.com> Reviewed-by: Maíra Canal <mairacanal@riseup.net> Reviewed-by: Javier Martinez Canillas <javierm@redhat.com> Reviewed-by: José Expósito <jose.exposito89@gmail.com> Signed-off-by: Javier Martinez Canillas <javierm@redhat.com> Link: https://patchwork.freedesktop.org/patch/msgid/20221019073239.3779180-1-davidgow@google.com --- drivers/gpu/drm/tests/drm_format_helper_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/tests/drm_format_helper_test.c b/drivers/gpu/drm/tests/drm_format_helper_test.c index 8d86c250c2ecb..2191e57f22972 100644 --- a/drivers/gpu/drm/tests/drm_format_helper_test.c +++ b/drivers/gpu/drm/tests/drm_format_helper_test.c @@ -438,7 +438,7 @@ static void drm_test_fb_xrgb8888_to_xrgb2101010(struct kunit *test) iosys_map_set_vaddr(&src, xrgb8888); drm_fb_xrgb8888_to_xrgb2101010(&dst, &result->dst_pitch, &src, &fb, &params->clip); - buf = le32buf_to_cpu(test, buf, TEST_BUF_SIZE); + buf = le32buf_to_cpu(test, buf, dst_size / sizeof(u32)); KUNIT_EXPECT_EQ(test, memcmp(buf, result->expected, dst_size), 0); } -- GitLab From a91e5e3e2216354e27ee6adf9cb2d5d9548cad8c Mon Sep 17 00:00:00 2001 From: Maxime Ripard <maxime@cerno.tech> Date: Wed, 19 Oct 2022 16:34:42 +0200 Subject: [PATCH 2164/2223] drm/connector: Set DDC pointer in drmm_connector_init Commit 35a3b82f1bdd ("drm/connector: Introduce drmm_connector_init") introduced the function drmm_connector_init() with a parameter for an optional ddc pointer to the i2c controller used to access the DDC bus. 
However, the underlying call to __drm_connector_init() was always setting it to NULL instead of passing the ddc argument around. This resulted in unexpected null pointer dereference on platforms expecting to get a DDC controller. Fixes: 35a3b82f1bdd ("drm/connector: Introduce drmm_connector_init") Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de> Link: https://lore.kernel.org/r/20221019143442.1798964-1-maxime@cerno.tech Signed-off-by: Maxime Ripard <maxime@cerno.tech> --- drivers/gpu/drm/drm_connector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c index e3142c8142b30..61c29ce74b035 100644 --- a/drivers/gpu/drm/drm_connector.c +++ b/drivers/gpu/drm/drm_connector.c @@ -435,7 +435,7 @@ int drmm_connector_init(struct drm_device *dev, if (drm_WARN_ON(dev, funcs && funcs->destroy)) return -EINVAL; - ret = __drm_connector_init(dev, connector, funcs, connector_type, NULL); + ret = __drm_connector_init(dev, connector, funcs, connector_type, ddc); if (ret) return ret; -- GitLab From 7228d9d79248bd0c8af56a7667a88a875c674e0c Mon Sep 17 00:00:00 2001 From: Steven Price <steven.price@arm.com> Date: Mon, 17 Oct 2022 11:46:01 +0100 Subject: [PATCH 2165/2223] drm/panfrost: Remove type name from internal structs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two structs internal to struct panfrost_dump_object_header were named, but sadly that is incompatible with C++, causing an error: "an anonymous union may only have public non-static data members". However nothing refers to struct pan_reg_hdr and struct pan_bomap_hdr and there's no need to export these definitions, so lets drop them. This fixes the C++ build error with the minimum change in userspace API. 
Reported-by: Adrián Larumbe <adrian.larumbe@collabora.com> Fixes: 730c2bf4ad39 ("drm/panfrost: Add support for devcoredump") Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> Signed-off-by: Steven Price <steven.price@arm.com> Link: https://patchwork.freedesktop.org/patch/msgid/20221017104602.142992-2-steven.price@arm.com --- include/uapi/drm/panfrost_drm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h index eac87310b3483..bd77254be121c 100644 --- a/include/uapi/drm/panfrost_drm.h +++ b/include/uapi/drm/panfrost_drm.h @@ -242,7 +242,7 @@ struct panfrost_dump_object_header { __le32 file_offset; union { - struct pan_reg_hdr { + struct { __le64 jc; __le32 gpu_id; __le32 major; @@ -250,7 +250,7 @@ struct panfrost_dump_object_header { __le64 nbos; } reghdr; - struct pan_bomap_hdr { + struct { __le32 valid; __le64 iova; __le32 data[2]; -- GitLab From 72655fb942c1e3d9e71e48e87ee439abe52f3a90 Mon Sep 17 00:00:00 2001 From: Steven Price <steven.price@arm.com> Date: Mon, 17 Oct 2022 11:46:02 +0100 Subject: [PATCH 2166/2223] drm/panfrost: replace endian-specific types with native ones __le32 and __le64 types aren't portable and are not available on FreeBSD (which uses the same uAPI). Instead of attempting to always output little endian, just use native endianness in the dumps. Tools can detect the endianness in use by looking at the 'magic' field, but equally we don't expect big-endian to be used with Mali (there are no known implementations out there). 
Bug: https://gitlab.freedesktop.org/mesa/mesa/-/issues/7252 Fixes: 730c2bf4ad39 ("drm/panfrost: Add support for devcoredump") Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> Signed-off-by: Steven Price <steven.price@arm.com> Link: https://patchwork.freedesktop.org/patch/msgid/20221017104602.142992-3-steven.price@arm.com --- drivers/gpu/drm/panfrost/panfrost_dump.c | 36 ++++++++++++------------ include/uapi/drm/panfrost_drm.h | 36 +++++++++++++----------- 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_dump.c b/drivers/gpu/drm/panfrost/panfrost_dump.c index 89056a1aac7df..6bd0634e2d580 100644 --- a/drivers/gpu/drm/panfrost/panfrost_dump.c +++ b/drivers/gpu/drm/panfrost/panfrost_dump.c @@ -63,13 +63,13 @@ static void panfrost_core_dump_header(struct panfrost_dump_iterator *iter, { struct panfrost_dump_object_header *hdr = iter->hdr; - hdr->magic = cpu_to_le32(PANFROSTDUMP_MAGIC); - hdr->type = cpu_to_le32(type); - hdr->file_offset = cpu_to_le32(iter->data - iter->start); - hdr->file_size = cpu_to_le32(data_end - iter->data); + hdr->magic = PANFROSTDUMP_MAGIC; + hdr->type = type; + hdr->file_offset = iter->data - iter->start; + hdr->file_size = data_end - iter->data; iter->hdr++; - iter->data += le32_to_cpu(hdr->file_size); + iter->data += hdr->file_size; } static void @@ -93,8 +93,8 @@ panfrost_core_dump_registers(struct panfrost_dump_iterator *iter, reg = panfrost_dump_registers[i] + js_as_offset; - dumpreg->reg = cpu_to_le32(reg); - dumpreg->value = cpu_to_le32(gpu_read(pfdev, reg)); + dumpreg->reg = reg; + dumpreg->value = gpu_read(pfdev, reg); } panfrost_core_dump_header(iter, PANFROSTDUMP_BUF_REG, dumpreg); @@ -106,7 +106,7 @@ void panfrost_core_dump(struct panfrost_job *job) struct panfrost_dump_iterator iter; struct drm_gem_object *dbo; unsigned int n_obj, n_bomap_pages; - __le64 *bomap, *bomap_start; + u64 *bomap, *bomap_start; size_t file_size; u32 as_nr; int slot; @@ -177,11 +177,11 @@ void 
panfrost_core_dump(struct panfrost_job *job) * For now, we write the job identifier in the register dump header, * so that we can decode the entire dump later with pandecode */ - iter.hdr->reghdr.jc = cpu_to_le64(job->jc); - iter.hdr->reghdr.major = cpu_to_le32(PANFROSTDUMP_MAJOR); - iter.hdr->reghdr.minor = cpu_to_le32(PANFROSTDUMP_MINOR); - iter.hdr->reghdr.gpu_id = cpu_to_le32(pfdev->features.id); - iter.hdr->reghdr.nbos = cpu_to_le64(job->bo_count); + iter.hdr->reghdr.jc = job->jc; + iter.hdr->reghdr.major = PANFROSTDUMP_MAJOR; + iter.hdr->reghdr.minor = PANFROSTDUMP_MINOR; + iter.hdr->reghdr.gpu_id = pfdev->features.id; + iter.hdr->reghdr.nbos = job->bo_count; panfrost_core_dump_registers(&iter, pfdev, as_nr, slot); @@ -218,27 +218,27 @@ void panfrost_core_dump(struct panfrost_job *job) WARN_ON(!mapping->active); - iter.hdr->bomap.data[0] = cpu_to_le32((bomap - bomap_start)); + iter.hdr->bomap.data[0] = bomap - bomap_start; for_each_sgtable_page(bo->base.sgt, &page_iter, 0) { struct page *page = sg_page_iter_page(&page_iter); if (!IS_ERR(page)) { - *bomap++ = cpu_to_le64(page_to_phys(page)); + *bomap++ = page_to_phys(page); } else { dev_err(pfdev->dev, "Panfrost Dump: wrong page\n"); - *bomap++ = ~cpu_to_le64(0); + *bomap++ = 0; } } - iter.hdr->bomap.iova = cpu_to_le64(mapping->mmnode.start << PAGE_SHIFT); + iter.hdr->bomap.iova = mapping->mmnode.start << PAGE_SHIFT; vaddr = map.vaddr; memcpy(iter.data, vaddr, bo->base.base.size); drm_gem_shmem_vunmap(&bo->base, &map); - iter.hdr->bomap.valid = cpu_to_le32(1); + iter.hdr->bomap.valid = 1; dump_header: panfrost_core_dump_header(&iter, PANFROSTDUMP_BUF_BO, iter.data + bo->base.base.size); diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h index bd77254be121c..6f93c915cc88a 100644 --- a/include/uapi/drm/panfrost_drm.h +++ b/include/uapi/drm/panfrost_drm.h @@ -235,25 +235,29 @@ struct drm_panfrost_madvise { #define PANFROSTDUMP_BUF_BO (PANFROSTDUMP_BUF_BOMAP + 1) #define 
PANFROSTDUMP_BUF_TRAILER (PANFROSTDUMP_BUF_BO + 1) +/* + * This structure is the native endianness of the dumping machine, tools can + * detect the endianness by looking at the value in 'magic'. + */ struct panfrost_dump_object_header { - __le32 magic; - __le32 type; - __le32 file_size; - __le32 file_offset; + __u32 magic; + __u32 type; + __u32 file_size; + __u32 file_offset; union { struct { - __le64 jc; - __le32 gpu_id; - __le32 major; - __le32 minor; - __le64 nbos; + __u64 jc; + __u32 gpu_id; + __u32 major; + __u32 minor; + __u64 nbos; } reghdr; - struct { - __le32 valid; - __le64 iova; - __le32 data[2]; + struct pan_bomap_hdr { + __u32 valid; + __u64 iova; + __u32 data[2]; } bomap; /* @@ -261,14 +265,14 @@ struct panfrost_dump_object_header { * with new fields and also keep it 512-byte aligned */ - __le32 sizer[496]; + __u32 sizer[496]; }; }; /* Registers object, an array of these */ struct panfrost_dump_registers { - __le32 reg; - __le32 value; + __u32 reg; + __u32 value; }; #if defined(__cplusplus) -- GitLab From 6d42ddf7f27b6723549ee6d4c8b1b418b59bf6b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=B6hmwalder?= <christoph.boehmwalder@linbit.com> Date: Thu, 20 Oct 2022 10:52:05 +0200 Subject: [PATCH 2167/2223] drbd: only clone bio if we have a backing device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit c347a787e34cb (drbd: set ->bi_bdev in drbd_req_new) moved a bio_set_dev call (which has since been removed) to "earlier", from drbd_request_prepare to drbd_req_new. The problem is that this accesses device->ldev->backing_bdev, which is not NULL-checked at this point. When we don't have an ldev (i.e. when the DRBD device is diskless), this leads to a null pointer deref. So, only allocate the private_bio if we actually have a disk. This is also a small optimization, since we don't clone the bio only to immediately free it again in the diskless case.
Fixes: c347a787e34cb ("drbd: set ->bi_bdev in drbd_req_new") Co-developed-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com> Signed-off-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com> Co-developed-by: Joel Colledge <joel.colledge@linbit.com> Signed-off-by: Joel Colledge <joel.colledge@linbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20221020085205.129090-1-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- drivers/block/drbd/drbd_req.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 8f7f144e54f3a..7f9bcc82fc9c4 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -30,11 +30,6 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio return NULL; memset(req, 0, sizeof(*req)); - req->private_bio = bio_alloc_clone(device->ldev->backing_bdev, bio_src, - GFP_NOIO, &drbd_io_bio_set); - req->private_bio->bi_private = req; - req->private_bio->bi_end_io = drbd_request_endio; - req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0) | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_ZEROES : 0) | (bio_op(bio_src) == REQ_OP_DISCARD ? 
RQ_UNMAP : 0); @@ -1219,9 +1214,12 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio) /* Update disk stats */ req->start_jif = bio_start_io_acct(req->master_bio); - if (!get_ldev(device)) { - bio_put(req->private_bio); - req->private_bio = NULL; + if (get_ldev(device)) { + req->private_bio = bio_alloc_clone(device->ldev->backing_bdev, + bio, GFP_NOIO, + &drbd_io_bio_set); + req->private_bio->bi_private = req; + req->private_bio->bi_end_io = drbd_request_endio; } /* process discards always from our submitter thread */ -- GitLab From 33566f92cd5f1c1d462920978f6dc102c744270d Mon Sep 17 00:00:00 2001 From: Yuwei Guan <ssawgyw@gmail.com> Date: Tue, 18 Oct 2022 11:01:39 +0800 Subject: [PATCH 2168/2223] block, bfq: remove unused variable for bfq_queue it defined in d0edc2473be9d, but there's nowhere to use it, so remove it. Signed-off-by: Yuwei Guan <Yuwei.Guan@zeekrlife.com> Acked-by: Paolo Valente <paolo.valente@linaro.org> Link: https://lore.kernel.org/r/20221018030139.159-1-Yuwei.Guan@zeekrlife.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bfq-iosched.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 64ee618064ba1..71f721670ab62 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -369,12 +369,8 @@ struct bfq_queue { unsigned long split_time; /* time of last split */ unsigned long first_IO_time; /* time of first I/O for this queue */ - unsigned long creation_time; /* when this queue is created */ - /* max service rate measured so far */ - u32 max_service_rate; - /* * Pointer to the waker queue for this queue, i.e., to the * queue Q such that this queue happens to get new I/O right -- GitLab From 996d3efeb091c503afd3ee6b5e20eabf446fd955 Mon Sep 17 00:00:00 2001 From: Rafael Mendonca <rafaelmendsr@gmail.com> Date: Wed, 19 Oct 2022 22:47:09 -0300 Subject: [PATCH 2169/2223] io-wq: Fix memory leak in worker creation If the CPU mask allocation for a node fails, then the memory 
allocated for the 'io_wqe' struct of the current node doesn't get freed on the error handling path, since it has not yet been added to the 'wqes' array. This was spotted when fuzzing v6.1-rc1 with Syzkaller: BUG: memory leak unreferenced object 0xffff8880093d5000 (size 1024): comm "syz-executor.2", pid 7701, jiffies 4295048595 (age 13.900s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000cb463369>] __kmem_cache_alloc_node+0x18e/0x720 [<00000000147a3f9c>] kmalloc_node_trace+0x2a/0x130 [<000000004e107011>] io_wq_create+0x7b9/0xdc0 [<00000000c38b2018>] io_uring_alloc_task_context+0x31e/0x59d [<00000000867399da>] __io_uring_add_tctx_node.cold+0x19/0x1ba [<000000007e0e7a79>] io_uring_setup.cold+0x1b80/0x1dce [<00000000b545e9f6>] __x64_sys_io_uring_setup+0x5d/0x80 [<000000008a8a7508>] do_syscall_64+0x5d/0x90 [<000000004ac08bec>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 0e03496d1967 ("io-wq: use private CPU mask") Cc: stable@vger.kernel.org Signed-off-by: Rafael Mendonca <rafaelmendsr@gmail.com> Link: https://lore.kernel.org/r/20221020014710.902201-1-rafaelmendsr@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/io-wq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index c6536d4b2da0b..6f1d0e5df23ad 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -1164,10 +1164,10 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node); if (!wqe) goto err; + wq->wqes[node] = wqe; if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL)) goto err; cpumask_copy(wqe->cpu_mask, cpumask_of_node(node)); - wq->wqes[node] = wqe; wqe->node = alloc_node; wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = -- GitLab From d4347d50407daea6237872281ece64c4bdf1ec99 Mon 
Sep 17 00:00:00 2001 From: Pavel Begunkov <asml.silence@gmail.com> Date: Tue, 18 Oct 2022 20:50:55 +0100 Subject: [PATCH 2170/2223] bio: safeguard REQ_ALLOC_CACHE bio put bio_put() with REQ_ALLOC_CACHE assumes that it's executed not from an irq context. Let's add a warning if the invariant is not respected, especially since there is a couple of places removing REQ_POLLED by hand without also clearing REQ_ALLOC_CACHE. Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/558d78313476c4e9c233902efa0092644c3d420a.1666122465.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 6c470a50a36d9..0a14af9237381 100644 --- a/block/bio.c +++ b/block/bio.c @@ -741,7 +741,7 @@ void bio_put(struct bio *bio) return; } - if (bio->bi_opf & REQ_ALLOC_CACHE) { + if ((bio->bi_opf & REQ_ALLOC_CACHE) && !WARN_ON_ONCE(in_interrupt())) { struct bio_alloc_cache *cache; bio_uninit(bio); -- GitLab From 60a9bb9048f9e95029df10a9bc346f6b066c593c Mon Sep 17 00:00:00 2001 From: Ye Bin <yebin10@huawei.com> Date: Wed, 19 Oct 2022 11:36:00 +0800 Subject: [PATCH 2171/2223] blktrace: introduce 'blk_trace_{start,stop}' helper Introduce 'blk_trace_{start,stop}' helper. No functional change.
Signed-off-by: Ye Bin <yebin10@huawei.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20221019033602.752383-2-yebin@huaweicloud.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- kernel/trace/blktrace.c | 74 ++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 38 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7f5eb295fe198..50b6f241b5f77 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -346,6 +346,37 @@ static void put_probe_ref(void) mutex_unlock(&blk_probe_mutex); } +static int blk_trace_start(struct blk_trace *bt) +{ + if (bt->trace_state != Blktrace_setup && + bt->trace_state != Blktrace_stopped) + return -EINVAL; + + blktrace_seq++; + smp_mb(); + bt->trace_state = Blktrace_running; + raw_spin_lock_irq(&running_trace_lock); + list_add(&bt->running_list, &running_trace_list); + raw_spin_unlock_irq(&running_trace_lock); + trace_note_time(bt); + + return 0; +} + +static int blk_trace_stop(struct blk_trace *bt) +{ + if (bt->trace_state != Blktrace_running) + return -EINVAL; + + bt->trace_state = Blktrace_stopped; + raw_spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + raw_spin_unlock_irq(&running_trace_lock); + relay_flush(bt->rchan); + + return 0; +} + static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) { synchronize_rcu(); @@ -658,7 +689,6 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, static int __blk_trace_startstop(struct request_queue *q, int start) { - int ret; struct blk_trace *bt; bt = rcu_dereference_protected(q->blk_trace, @@ -666,36 +696,10 @@ static int __blk_trace_startstop(struct request_queue *q, int start) if (bt == NULL) return -EINVAL; - /* - * For starting a trace, we can transition from a setup or stopped - * trace. 
For stopping a trace, the state must be running - */ - ret = -EINVAL; - if (start) { - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) { - blktrace_seq++; - smp_mb(); - bt->trace_state = Blktrace_running; - raw_spin_lock_irq(&running_trace_lock); - list_add(&bt->running_list, &running_trace_list); - raw_spin_unlock_irq(&running_trace_lock); - - trace_note_time(bt); - ret = 0; - } - } else { - if (bt->trace_state == Blktrace_running) { - bt->trace_state = Blktrace_stopped; - raw_spin_lock_irq(&running_trace_lock); - list_del_init(&bt->running_list); - raw_spin_unlock_irq(&running_trace_lock); - relay_flush(bt->rchan); - ret = 0; - } - } - - return ret; + if (start) + return blk_trace_start(bt); + else + return blk_trace_stop(bt); } int blk_trace_startstop(struct request_queue *q, int start) @@ -1614,13 +1618,7 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; - if (bt->trace_state == Blktrace_running) { - bt->trace_state = Blktrace_stopped; - raw_spin_lock_irq(&running_trace_lock); - list_del_init(&bt->running_list); - raw_spin_unlock_irq(&running_trace_lock); - relay_flush(bt->rchan); - } + blk_trace_stop(bt); put_probe_ref(); synchronize_rcu(); -- GitLab From dcd1a59c62dc49da75539213611156d6db50ab5d Mon Sep 17 00:00:00 2001 From: Ye Bin <yebin10@huawei.com> Date: Wed, 19 Oct 2022 11:36:01 +0800 Subject: [PATCH 2172/2223] blktrace: fix possible memleak in '__blk_trace_remove' When test as follows: step1: ioctl(sda, BLKTRACESETUP, &arg) step2: ioctl(sda, BLKTRACESTART, NULL) step3: ioctl(sda, BLKTRACETEARDOWN, NULL) step4: ioctl(sda, BLKTRACESETUP, &arg) Got issue as follows: debugfs: File 'dropped' in directory 'sda' already present! debugfs: File 'msg' in directory 'sda' already present! debugfs: File 'trace0' in directory 'sda' already present! 
And also find syzkaller report issue like "KASAN: use-after-free Read in relay_switch_subbuf" "https://syzkaller.appspot.com/bug?id=13849f0d9b1b818b087341691be6cc3ac6a6bfb7" If remove block trace without stop(BLKTRACESTOP) block trace, '__blk_trace_remove' will just set 'q->blk_trace' with NULL. However, debugfs file isn't removed, so will report file already present when call BLKTRACESETUP. static int __blk_trace_remove(struct request_queue *q) { struct blk_trace *bt; bt = rcu_replace_pointer(q->blk_trace, NULL, lockdep_is_held(&q->debugfs_mutex)); if (!bt) return -EINVAL; if (bt->trace_state != Blktrace_running) blk_trace_cleanup(q, bt); return 0; } If do test as follows: step1: ioctl(sda, BLKTRACESETUP, &arg) step2: ioctl(sda, BLKTRACESTART, NULL) step3: ioctl(sda, BLKTRACETEARDOWN, NULL) step4: remove sda There will remove debugfs directory which will remove recursively all file under directory. >> blk_release_queue >> debugfs_remove_recursive(q->debugfs_dir) So all files which created in 'do_blk_trace_setup' are removed, and 'dentry->d_inode' is NULL. But 'q->blk_trace' is still in 'running_trace_lock', 'trace_note_tsk' will traverse 'running_trace_lock' all nodes. >>trace_note_tsk >> trace_note >> relay_reserve >> relay_switch_subbuf >> d_inode(buf->dentry)->i_size To solve above issues, reference commit '5afedf670caf', call 'blk_trace_cleanup' unconditionally in '__blk_trace_remove' and first stop block trace in 'blk_trace_cleanup'. 
Signed-off-by: Ye Bin <yebin10@huawei.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20221019033602.752383-3-yebin@huaweicloud.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- kernel/trace/blktrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 50b6f241b5f77..e17bba027a2c6 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -379,6 +379,7 @@ static int blk_trace_stop(struct blk_trace *bt) static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) { + blk_trace_stop(bt); synchronize_rcu(); blk_trace_free(q, bt); put_probe_ref(); @@ -393,8 +394,7 @@ static int __blk_trace_remove(struct request_queue *q) if (!bt) return -EINVAL; - if (bt->trace_state != Blktrace_running) - blk_trace_cleanup(q, bt); + blk_trace_cleanup(q, bt); return 0; } -- GitLab From 2db96217e7e515071726ca4ec791742c4202a1b2 Mon Sep 17 00:00:00 2001 From: Ye Bin <yebin10@huawei.com> Date: Wed, 19 Oct 2022 11:36:02 +0800 Subject: [PATCH 2173/2223] blktrace: remove unnessary stop block trace in 'blk_trace_shutdown' As previous commit, 'blk_trace_cleanup' will stop block trace if block trace's state is 'Blktrace_running'. So remove unnessary stop block trace in 'blk_trace_shutdown'. 
Signed-off-by: Ye Bin <yebin10@huawei.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20221019033602.752383-4-yebin@huaweicloud.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- kernel/trace/blktrace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e17bba027a2c6..a995ea1ef849a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -776,10 +776,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) void blk_trace_shutdown(struct request_queue *q) { if (rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->debugfs_mutex))) { - __blk_trace_startstop(q, 0); + lockdep_is_held(&q->debugfs_mutex))) __blk_trace_remove(q); - } } #ifdef CONFIG_BLK_CGROUP -- GitLab From 50b0e4d4da09fa501e722af886f97e60a4f820d6 Mon Sep 17 00:00:00 2001 From: Alex Deucher <alexander.deucher@amd.com> Date: Wed, 19 Oct 2022 16:57:42 -0400 Subject: [PATCH 2174/2223] drm/amdgpu: fix sdma doorbell init ordering on APUs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 8795e182b02d ("PCI/portdrv: Don't disable AER reporting in get_port_device_capability()") uncovered a bug in amdgpu that required a reordering of the driver init sequence to avoid accessing a special register on the GPU before it was properly set up leading to an PCI AER error. This reordering uncovered a different hw programming ordering dependency in some APUs where the SDMA doorbells need to be programmed before the GFX doorbells. To fix this, move the SDMA doorbell programming back into the soc15 common code, but use the actual doorbell range values directly rather than the values stored in the ring structure since those will not be initialized at this point. This is a partial revert, but with the doorbell assignment fixed so the proper doorbell index is set before it's used. 
Fixes: e3163bc8ffdfdb ("drm/amdgpu: move nbio sdma_doorbell_range() into sdma code for vega") Acked-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: skhan@linuxfoundation.org Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 5 ----- drivers/gpu/drm/amd/amdgpu/soc15.c | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 298fa11702e75..1122bd4eae98c 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -1417,11 +1417,6 @@ static int sdma_v4_0_start(struct amdgpu_device *adev) WREG32_SDMA(i, mmSDMA0_CNTL, temp); if (!amdgpu_sriov_vf(adev)) { - ring = &adev->sdma.instance[i].ring; - adev->nbio.funcs->sdma_doorbell_range(adev, i, - ring->use_doorbell, ring->doorbell_index, - adev->doorbell_index.sdma_doorbell_range); - /* unhalt engine */ temp = RREG32_SDMA(i, mmSDMA0_F32_CNTL); temp = REG_SET_FIELD(temp, SDMA0_F32_CNTL, HALT, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 183024d7c184e..e3b2b6b4f1a66 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1211,6 +1211,20 @@ static int soc15_common_sw_fini(void *handle) return 0; } +static void soc15_sdma_doorbell_range_init(struct amdgpu_device *adev) +{ + int i; + + /* sdma doorbell range is programed by hypervisor */ + if (!amdgpu_sriov_vf(adev)) { + for (i = 0; i < adev->sdma.num_instances; i++) { + adev->nbio.funcs->sdma_doorbell_range(adev, i, + true, adev->doorbell_index.sdma_engine[i] << 1, + adev->doorbell_index.sdma_doorbell_range); + } + } +} + static int soc15_common_hw_init(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@ -1230,6 +1244,13 @@ static int soc15_common_hw_init(void *handle) /* enable the doorbell aperture */ 
soc15_enable_doorbell_aperture(adev, true); + /* HW doorbell routing policy: doorbell writing not + * in SDMA/IH/MM/ACV range will be routed to CP. So + * we need to init SDMA doorbell range prior + * to CP ip block init and ring test. IH already + * happens before CP. + */ + soc15_sdma_doorbell_range_init(adev); return 0; } -- GitLab From b5f1fc3184405ab955db1b86d41d8b744d07c12d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <peterz@infradead.org> Date: Thu, 15 Sep 2022 13:11:35 +0200 Subject: [PATCH 2175/2223] x86/ftrace: Remove ftrace_epilogue() Remove the weird jumps to RET and simply use RET. This then promotes ftrace_stub() to a real function; which becomes important for kcfi. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20220915111148.719080593@infradead.org Signed-off-by: Peter Zijlstra <peterz@infradead.org> --- arch/x86/kernel/ftrace_64.S | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index dfeb227de5617..a90c55a6b4817 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -172,20 +172,14 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) */ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR - - jmp ftrace_epilogue + RET SYM_FUNC_END(ftrace_caller); STACK_FRAME_NON_STANDARD_FP(ftrace_caller) -SYM_FUNC_START(ftrace_epilogue) -/* - * This is weak to keep gas from relaxing the jumps. 
- */ -SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) +SYM_FUNC_START(ftrace_stub) UNWIND_HINT_FUNC - ENDBR RET -SYM_FUNC_END(ftrace_epilogue) +SYM_FUNC_END(ftrace_stub) SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ @@ -262,14 +256,11 @@ SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL) popfq /* - * As this jmp to ftrace_epilogue can be a short jump - * it must not be copied into the trampoline. - * The trampoline will add the code to jump - * to the return. + * The trampoline will add the return. */ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR - jmp ftrace_epilogue + RET /* Swap the flags with orig_rax */ 1: movq MCOUNT_REG_SIZE(%rsp), %rdi @@ -280,7 +271,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) /* Restore flags */ popfq UNWIND_HINT_FUNC - jmp ftrace_epilogue + RET SYM_FUNC_END(ftrace_regs_caller) STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) -- GitLab From 883bbbffa5a4ffd1915f8b42934dab81b7f87226 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <peterz@infradead.org> Date: Tue, 18 Oct 2022 13:49:21 +0200 Subject: [PATCH 2176/2223] ftrace,kcfi: Separate ftrace_stub() and ftrace_stub_graph() Different function signatures mean they need to be different functions; otherwise CFI gets upset.
As triggered by the ftrace boot tests: [] CFI failure at ftrace_return_to_handler+0xac/0x16c (target: ftrace_stub+0x0/0x14; expected type: 0x0a5d5347) Fixes: 3c516f89e17e ("x86: Add support for CONFIG_CFI_CLANG") Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Mark Rutland <mark.rutland@arm.com> Tested-by: Mark Rutland <mark.rutland@arm.com> Link: https://lkml.kernel.org/r/Y06dg4e1xF6JTdQq@hirez.programming.kicks-ass.net --- arch/arm64/kernel/entry-ftrace.S | 7 ++++++- arch/x86/kernel/ftrace_64.S | 17 +++++++++-------- include/asm-generic/vmlinux.lds.h | 18 ++++++++++++------ 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index bd5df50e46432..795344ab4ec45 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -7,6 +7,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/asm-offsets.h> #include <asm/assembler.h> #include <asm/ftrace.h> @@ -294,10 +295,14 @@ SYM_FUNC_END(ftrace_graph_caller) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ -SYM_FUNC_START(ftrace_stub) +SYM_TYPED_FUNC_START(ftrace_stub) ret SYM_FUNC_END(ftrace_stub) +SYM_TYPED_FUNC_START(ftrace_stub_graph) + ret +SYM_FUNC_END(ftrace_stub_graph) + #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* * void return_to_handler(void) diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index a90c55a6b4817..2a4be92fd1444 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -4,6 +4,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/ptrace.h> #include <asm/ftrace.h> #include <asm/export.h> @@ -129,6 +130,14 @@ .endm +SYM_TYPED_FUNC_START(ftrace_stub) + RET +SYM_FUNC_END(ftrace_stub) + +SYM_TYPED_FUNC_START(ftrace_stub_graph) + RET +SYM_FUNC_END(ftrace_stub_graph) + #ifdef CONFIG_DYNAMIC_FTRACE SYM_FUNC_START(__fentry__) @@ -176,11 +185,6 @@ 
SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) SYM_FUNC_END(ftrace_caller); STACK_FRAME_NON_STANDARD_FP(ftrace_caller) -SYM_FUNC_START(ftrace_stub) - UNWIND_HINT_FUNC - RET -SYM_FUNC_END(ftrace_stub) - SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ pushfq @@ -282,9 +286,6 @@ STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) SYM_FUNC_START(__fentry__) cmpq $ftrace_stub, ftrace_trace_function jnz trace - -SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL) - ENDBR RET trace: diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c15de165ec8ff..d06ada2341cb9 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -162,6 +162,16 @@ #define PATCHABLE_DISCARDS *(__patchable_function_entries) #endif +#ifndef CONFIG_ARCH_SUPPORTS_CFI_CLANG +/* + * Simply points to ftrace_stub, but with the proper protocol. + * Defined by the linker script in linux/vmlinux.lds.h + */ +#define FTRACE_STUB_HACK ftrace_stub_graph = ftrace_stub; +#else +#define FTRACE_STUB_HACK +#endif + #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* * The ftrace call sites are logged to a section whose name depends on the @@ -169,10 +179,6 @@ * FTRACE_CALLSITE_SECTION. We capture all of them here to avoid header * dependencies for FTRACE_CALLSITE_SECTION's definition. * - * Need to also make ftrace_stub_graph point to ftrace_stub - * so that the same stub location may have different protocols - * and not mess up with C verifiers. - * * ftrace_ops_list_func will be defined as arch_ftrace_ops_list_func * as some archs will have a different prototype for that function * but ftrace_ops_list_func() will have a single prototype. 
@@ -182,11 +188,11 @@ KEEP(*(__mcount_loc)) \ KEEP_PATCHABLE \ __stop_mcount_loc = .; \ - ftrace_stub_graph = ftrace_stub; \ + FTRACE_STUB_HACK \ ftrace_ops_list_func = arch_ftrace_ops_list_func; #else # ifdef CONFIG_FUNCTION_TRACER -# define MCOUNT_REC() ftrace_stub_graph = ftrace_stub; \ +# define MCOUNT_REC() FTRACE_STUB_HACK \ ftrace_ops_list_func = arch_ftrace_ops_list_func; # else # define MCOUNT_REC() -- GitLab From b329f5ddc9ce4b622d9c7aaf5c6df4de52caf91a Mon Sep 17 00:00:00 2001 From: Maxim Levitsky <mlevitsk@redhat.com> Date: Mon, 18 Jul 2022 17:11:19 +0300 Subject: [PATCH 2177/2223] perf/x86/intel/lbr: Use setup_clear_cpu_cap() instead of clear_cpu_cap() clear_cpu_cap(&boot_cpu_data) is very similar to setup_clear_cpu_cap() except that the latter also sets a bit in 'cpu_caps_cleared' which later clears the same cap in secondary cpus, which is likely what is meant here. Fixes: 47125db27e47 ("perf/x86/intel/lbr: Support Architectural LBR") Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Kan Liang <kan.liang@linux.intel.com> Link: https://lkml.kernel.org/r/20220718141123.136106-2-mlevitsk@redhat.com --- arch/x86/events/intel/lbr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 4fce1a4226e3d..8259d725054d0 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1596,7 +1596,7 @@ void __init intel_pmu_arch_lbr_init(void) return; clear_arch_lbr: - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_ARCH_LBR); + setup_clear_cpu_cap(X86_FEATURE_ARCH_LBR); } /** -- GitLab From 21a1994b6492b12e55dbf39d15271430ef6839f0 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira <bristot@kernel.org> Date: Tue, 23 Aug 2022 17:20:28 +0200 Subject: [PATCH 2178/2223] rv/dot2c: Make automaton definition static Monitor's automata definition is only used locally, so make dot2c generate a static 
definition. Link: https://lore.kernel.org/all/202208210332.gtHXje45-lkp@intel.com Link: https://lore.kernel.org/all/202208210358.6HH3OrVs-lkp@intel.com Link: https://lkml.kernel.org/r/ffbb92010f643307766c9307fd42f416e5b85fa0.1661266564.git.bristot@kernel.org Cc: Steven Rostedt <rostedt@goodmis.org> Fixes: e3c9fc78f096 ("tools/rv: Add dot2c") Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> --- tools/verification/dot2/dot2c.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/verification/dot2/dot2c.py b/tools/verification/dot2/dot2c.py index fa73353f7e560..be8a364a469b9 100644 --- a/tools/verification/dot2/dot2c.py +++ b/tools/verification/dot2/dot2c.py @@ -111,7 +111,7 @@ class Dot2c(Automata): def format_aut_init_header(self): buff = [] - buff.append("struct %s %s = {" % (self.struct_automaton_def, self.var_automaton_def)) + buff.append("static struct %s %s = {" % (self.struct_automaton_def, self.var_automaton_def)) return buff def __get_string_vector_per_line_content(self, buff): -- GitLab From fdf23c62d98cda1d8935259dc7da3cc830a4bc6c Mon Sep 17 00:00:00 2001 From: Jiangshan Yi <yijiangshan@kylinos.cn> Date: Sun, 9 Oct 2022 15:19:23 +0800 Subject: [PATCH 2179/2223] i2c: fix spelling typos in comments Reported-by: k2ci <kernel-bot@kylinos.cn> Signed-off-by: Jiangshan Yi <yijiangshan@kylinos.cn> Reviewed-by: Jean Delvare <jdelvare@suse.de> # for sis630 Signed-off-by: Wolfram Sang <wsa@kernel.org> --- drivers/i2c/busses/i2c-mlxcpld.c | 2 +- drivers/i2c/busses/i2c-sis630.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-mlxcpld.c b/drivers/i2c/busses/i2c-mlxcpld.c index 72fcfb17dd67e..081f51ef0551b 100644 --- a/drivers/i2c/busses/i2c-mlxcpld.c +++ b/drivers/i2c/busses/i2c-mlxcpld.c @@ -40,7 +40,7 @@ #define MLXCPLD_LPCI2C_STATUS_REG 0x9 #define MLXCPLD_LPCI2C_DATA_REG 0xa -/* LPC 
I2C masks and parametres */ +/* LPC I2C masks and parameters */ #define MLXCPLD_LPCI2C_RST_SEL_MASK 0x1 #define MLXCPLD_LPCI2C_TRANS_END 0x1 #define MLXCPLD_LPCI2C_STATUS_NACK 0x10 diff --git a/drivers/i2c/busses/i2c-sis630.c b/drivers/i2c/busses/i2c-sis630.c index cfb8e04a2a831..87d56250d78a3 100644 --- a/drivers/i2c/busses/i2c-sis630.c +++ b/drivers/i2c/busses/i2c-sis630.c @@ -97,7 +97,7 @@ MODULE_PARM_DESC(high_clock, module_param(force, bool, 0); MODULE_PARM_DESC(force, "Forcibly enable the SIS630. DANGEROUS!"); -/* SMBus base adress */ +/* SMBus base address */ static unsigned short smbus_base; /* supported chips */ -- GitLab From 5ad15f1b32f4a9cb7653b5ab1eccf285b4045007 Mon Sep 17 00:00:00 2001 From: Dan Carpenter <dan.carpenter@oracle.com> Date: Wed, 12 Oct 2022 16:19:39 +0300 Subject: [PATCH 2180/2223] mailmap: update Dan Carpenter's email address My time at Oracle is ending at the end of the month. Update my email address accordingly. Link: https://lkml.kernel.org/r/Y0a+6+5SHMdvUnpg@kili Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com> Cc: Joe Perches <joe@perches.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 380378e2db368..b4e7511121f06 100644 --- a/.mailmap +++ b/.mailmap @@ -104,6 +104,7 @@ Christoph Hellwig <hch@lst.de> Colin Ian King <colin.i.king@gmail.com> <colin.king@canonical.com> Corey Minyard <minyard@acm.org> Damian Hobson-Garcia <dhobsong@igel.co.jp> +Dan Carpenter <error27@gmail.com> <dan.carpenter@oracle.com> Daniel Borkmann <daniel@iogearbox.net> <danborkmann@googlemail.com> Daniel Borkmann <daniel@iogearbox.net> <danborkmann@iogearbox.net> Daniel Borkmann <daniel@iogearbox.net> <daniel.borkmann@tik.ee.ethz.ch> -- GitLab From cef408e70e9b0c175a874b9d9fe6acc7e12f569f Mon Sep 17 00:00:00 2001 From: Qais Yousef <qyousef@layalina.io> Date: Fri, 14 Oct 2022 15:10:16 +0100 Subject: [PATCH 2181/2223] mailmap: update email for Qais Yousef 
Update my email address for old entry and add a new entry for my contribution while working with arm to continue support that work. Link: https://lkml.kernel.org/r/20221014141016.539625-1-qyousef@layalina.io Signed-off-by: Qais Yousef <qyousef@layalina.io> Acked-by: Qais Yousef <qais.yousef@arm.com> Acked-by: Qais Yousef <qsyousef@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- .mailmap | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index b4e7511121f06..fdd7989492fc3 100644 --- a/.mailmap +++ b/.mailmap @@ -354,7 +354,8 @@ Peter Oruba <peter@oruba.de> Pratyush Anand <pratyush.anand@gmail.com> <pratyush.anand@st.com> Praveen BP <praveenbp@ti.com> Punit Agrawal <punitagrawal@gmail.com> <punit.agrawal@arm.com> -Qais Yousef <qsyousef@gmail.com> <qais.yousef@imgtec.com> +Qais Yousef <qyousef@layalina.io> <qais.yousef@imgtec.com> +Qais Yousef <qyousef@layalina.io> <qais.yousef@arm.com> Quentin Monnet <quentin@isovalent.com> <quentin.monnet@netronome.com> Quentin Perret <qperret@qperret.net> <quentin.perret@arm.com> Rafael J. Wysocki <rjw@rjwysocki.net> <rjw@sisk.pl> -- GitLab From 7329e3ebe3594b425955ab591ecea335e85842c2 Mon Sep 17 00:00:00 2001 From: Liam Howlett <liam.howlett@oracle.com> Date: Sat, 15 Oct 2022 02:12:33 +0000 Subject: [PATCH 2182/2223] mm/mempolicy: fix mbind_range() arguments to vma_merge() Fuzzing produced an invalid argument to vma_merge() which was caught by the newly added verification of the number of VMAs being removed on process exit. Analyzing the failure eventually resulted in finding an issue with the search of a VMA that started at address 0, which caused an underflow and thus the loss of many VMAs being tracked in the tree. Fix the underflow by changing the search of the maple tree to use the start address directly. 
Link: https://lkml.kernel.org/r/20221015021135.2816178-1-Liam.Howlett@oracle.com Fixes: 66850be55e8e ("mm/mempolicy: use vma iterator & maple state instead of vma linked list") Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com> Reported-by: kernel test robot <oliver.sang@intel.com> Link: https://lore.kernel.org/r/202210052318.5ad10912-oliver.sang@intel.com Cc: Yu Zhao <yuzhao@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/mempolicy.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a937eaec5b68d..61aa9aedb7289 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -787,17 +787,22 @@ static int vma_replace_policy(struct vm_area_struct *vma, static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) { - MA_STATE(mas, &mm->mm_mt, start - 1, start - 1); + MA_STATE(mas, &mm->mm_mt, start, start); struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; pgoff_t pgoff; - prev = mas_find_rev(&mas, 0); - if (prev && (start < prev->vm_end)) - vma = prev; - else - vma = mas_next(&mas, end - 1); + prev = mas_prev(&mas, 0); + if (unlikely(!prev)) + mas_set(&mas, start); + + vma = mas_find(&mas, end - 1); + if (WARN_ON(!vma)) + return 0; + + if (start > vma->vm_start) + prev = vma; for (; vma; vma = mas_next(&mas, end - 1)) { unsigned long vmstart = max(start, vma->vm_start); -- GitLab From 4249a05ff670e7b1aeea77f1a5451080ea86c88d Mon Sep 17 00:00:00 2001 From: Alexey Romanov <avromanov@sberdevices.ru> Date: Thu, 13 Oct 2022 14:28:25 +0300 Subject: [PATCH 2183/2223] zsmalloc: zs_destroy_pool: add size_class NULL check Inside the zs_destroy_pool() function, there can still be NULL size_class pointers: if kzalloc() returns NULL while zs_create_pool() is allocating the next size_class, zs_create_pool() handles the error condition by calling zs_destroy_pool(). 
Link: https://lkml.kernel.org/r/20221013112825.61869-1-avromanov@sberdevices.ru Fixes: f24263a5a076 ("zsmalloc: remove unnecessary size_class NULL check") Signed-off-by: Alexey Romanov <avromanov@sberdevices.ru> Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org> Cc: Minchan Kim <minchan@kernel.org> Cc: Nitin Gupta <ngupta@vflare.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/zsmalloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 525758713a553..d03941cace2c4 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2311,6 +2311,9 @@ void zs_destroy_pool(struct zs_pool *pool) int fg; struct size_class *class = pool->size_class[i]; + if (!class) + continue; + if (class->index != i) continue; -- GitLab From 977ef30a7d888eeb52fb6908f99080f33e5309a8 Mon Sep 17 00:00:00 2001 From: Martin Liska <mliska@suse.cz> Date: Thu, 13 Oct 2022 09:40:59 +0200 Subject: [PATCH 2184/2223] gcov: support GCC 12.1 and newer compilers Starting with GCC 12.1, the created .gcda format can't be read by gcov tool. There are 2 significant changes to the .gcda file format that need to be supported: a) [gcov: Use system IO buffering] (23eb66d1d46a34cb28c4acbdf8a1deb80a7c5a05) changed that all sizes in the format are in bytes and not in words (4B) b) [gcov: make profile merging smarter] (72e0c742bd01f8e7e6dcca64042b9ad7e75979de) add a new checksum to the file header. Tested with GCC 7.5, 10.4, 12.2 and the current master. 
Link: https://lkml.kernel.org/r/624bda92-f307-30e9-9aaa-8cc678b2dfb2@suse.cz Signed-off-by: Martin Liska <mliska@suse.cz> Tested-by: Peter Oberparleiter <oberpar@linux.ibm.com> Reviewed-by: Peter Oberparleiter <oberpar@linux.ibm.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- kernel/gcov/gcc_4_7.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 460c12b7dfea2..7971e989e425b 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -30,6 +30,13 @@ #define GCOV_TAG_FUNCTION_LENGTH 3 +/* Since GCC 12.1 sizes are in BYTES and not in WORDS (4B). */ +#if (__GNUC__ >= 12) +#define GCOV_UNIT_SIZE 4 +#else +#define GCOV_UNIT_SIZE 1 +#endif + static struct gcov_info *gcov_info_head; /** @@ -383,12 +390,18 @@ size_t convert_to_gcda(char *buffer, struct gcov_info *info) pos += store_gcov_u32(buffer, pos, info->version); pos += store_gcov_u32(buffer, pos, info->stamp); +#if (__GNUC__ >= 12) + /* Use zero as checksum of the compilation unit. */ + pos += store_gcov_u32(buffer, pos, 0); +#endif + for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { fi_ptr = info->functions[fi_idx]; /* Function record. */ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); - pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH); + pos += store_gcov_u32(buffer, pos, + GCOV_TAG_FUNCTION_LENGTH * GCOV_UNIT_SIZE); pos += store_gcov_u32(buffer, pos, fi_ptr->ident); pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum); pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); @@ -402,7 +415,8 @@ size_t convert_to_gcda(char *buffer, struct gcov_info *info) /* Counter record. 
*/ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FOR_COUNTER(ct_idx)); - pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2); + pos += store_gcov_u32(buffer, pos, + ci_ptr->num * 2 * GCOV_UNIT_SIZE); for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) { pos += store_gcov_u64(buffer, pos, -- GitLab From 759a7c6126eef5635506453e9b9d55a6a3ac2084 Mon Sep 17 00:00:00 2001 From: Joseph Qi <joseph.qi@linux.alibaba.com> Date: Mon, 17 Oct 2022 21:02:26 +0800 Subject: [PATCH 2185/2223] ocfs2: fix BUG when iput after ocfs2_mknod fails Commit b1529a41f777 "ocfs2: should reclaim the inode if '__ocfs2_mknod_locked' returns an error" tried to reclaim the claimed inode if __ocfs2_mknod_locked() fails later. But this introduces a race: the freed bit may be reused immediately by another thread, which will update the dinode, e.g. i_generation. Then iput of this inode will lead to BUG: inode->i_generation != le32_to_cpu(fe->i_generation) We could make this inode as bad, but we did want to do operations like wipe in some cases. Since the claimed inode bit can only cause a dinode to be missing, and it will be returned after fsck, it does not seem to be a big problem. So just leave it as is by reverting the reclaim logic. 
Link: https://lkml.kernel.org/r/20221017130227.234480-1-joseph.qi@linux.alibaba.com Fixes: b1529a41f777 ("ocfs2: should reclaim the inode if '__ocfs2_mknod_locked' returns an error") Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com> Reported-by: Yan Wang <wangyan122@huawei.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Changwei Ge <gechangwei@live.cn> Cc: Gang He <ghe@suse.com> Cc: Jun Piao <piaojun@huawei.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/ocfs2/namei.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 961d1cf54388e..1a97e167b2194 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -632,18 +632,9 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, return status; } - status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, + return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, parent_fe_bh, handle, inode_ac, fe_blkno, suballoc_loc, suballoc_bit); - if (status < 0) { - u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); - int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, - inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); - if (tmp) - mlog_errno(tmp); - } - - return status; } static int ocfs2_mkdir(struct user_namespace *mnt_userns, -- GitLab From 28f4821b1b53e0649706912e810c6c232fc506f9 Mon Sep 17 00:00:00 2001 From: Joseph Qi <joseph.qi@linux.alibaba.com> Date: Mon, 17 Oct 2022 21:02:27 +0800 Subject: [PATCH 2186/2223] ocfs2: clear dinode links count in case of error In ocfs2_mknod(), if error occurs after dinode successfully allocated, ocfs2 i_links_count will not be 0. So even though we clear inode i_nlink before iput in error handling, it still won't wipe inode since we'll refresh inode from dinode during inode lock. So just like clear inode i_nlink, we clear ocfs2 i_links_count as well. 
Also do the same change for ocfs2_symlink(). Link: https://lkml.kernel.org/r/20221017130227.234480-2-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com> Reported-by: Yan Wang <wangyan122@huawei.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Changwei Ge <gechangwei@live.cn> Cc: Gang He <ghe@suse.com> Cc: Jun Piao <piaojun@huawei.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/ocfs2/namei.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 1a97e167b2194..05f32989bad6f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -232,6 +232,7 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, handle_t *handle = NULL; struct ocfs2_super *osb; struct ocfs2_dinode *dirfe; + struct ocfs2_dinode *fe = NULL; struct buffer_head *new_fe_bh = NULL; struct inode *inode = NULL; struct ocfs2_alloc_context *inode_ac = NULL; @@ -382,6 +383,7 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, goto leave; } + fe = (struct ocfs2_dinode *) new_fe_bh->b_data; if (S_ISDIR(mode)) { status = ocfs2_fill_new_dir(osb, handle, dir, inode, new_fe_bh, data_ac, meta_ac); @@ -454,8 +456,11 @@ roll_back: leave: if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && fe) + ocfs2_set_links_count(fe, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) @@ -2019,8 +2024,11 @@ bail: ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && fe) + ocfs2_set_links_count(fe, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) -- GitLab From eacf96d23f23e5bfd175be07048246efd0be4cc6 Mon Sep 17 00:00:00 2001 From: Colin Ian King <colin.i.king@gmail.com> 
Date: Fri, 7 Oct 2022 21:43:39 +0100 Subject: [PATCH 2187/2223] init: Kconfig: fix spelling mistake "satify" -> "satisfy" There is a spelling mistake in a Kconfig description. Fix it. Link: https://lkml.kernel.org/r/20221007204339.2757753-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King <colin.i.king@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 694f7c160c9c1..abf65098f1b6b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -66,7 +66,7 @@ config RUST_IS_AVAILABLE This shows whether a suitable Rust toolchain is available (found). Please see Documentation/rust/quick-start.rst for instructions on how - to satify the build requirements of Rust support. + to satisfy the build requirements of Rust support. In particular, the Makefile target 'rustavailable' is useful to check why the Rust toolchain is not being detected. -- GitLab From 5789151e48acc3fd34d2109bf2021dc4df5e33e9 Mon Sep 17 00:00:00 2001 From: Mike Kravetz <mike.kravetz@oracle.com> Date: Mon, 17 Oct 2022 19:49:45 -0700 Subject: [PATCH 2188/2223] mm/mmap: undo ->mmap() when mas_preallocate() fails A memory leak in hugetlb_reserve_pages was reported in [1]. The root cause was traced to an error path in mmap_region when mas_preallocate() fails. In this case, the vma is freed after a successful call to filesystem specific mmap. The hugetlbfs mmap routine may allocate data structures pointed to by vm_private_data. These need to be cleaned up by the hugetlb vm_ops->close() routine. The same issue was addressed by commit deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails") for the arch_validate_flags() test. Go to the same close_and_free_vma label if mas_preallocate() fails. 
[1] https://lore.kernel.org/linux-mm/CAKXUXMxf7OiCwbxib7MwfR4M1b5+b3cNTU7n5NV9Zm4967=FPQ@mail.gmail.com/ Link: https://lkml.kernel.org/r/20221018024945.415036-1-mike.kravetz@oracle.com Fixes: d4af56c5c7c6 ("mm: start tracking VMAs with maple tree") Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com> Reported-by: Lukas Bulwahn <lukas.bulwahn@gmail.com> Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com> Cc: Andrii Nakryiko <andrii@kernel.org> Cc: Carlos Llamas <cmllamas@google.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Muchun Song <songmuchun@bytedance.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index bf2122af94e7a..3c9890e443a3e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2681,7 +2681,7 @@ cannot_expand: if (mas_preallocate(&mas, vma, GFP_KERNEL)) { error = -ENOMEM; if (file) - goto unmap_and_free_vma; + goto close_and_free_vma; else goto free_vma; } -- GitLab From 1cd916d0340d0f45b151599c24ec40b5b2fd8e4a Mon Sep 17 00:00:00 2001 From: Andrew Morton <akpm@linux-foundation.org> Date: Tue, 18 Oct 2022 13:57:37 -0700 Subject: [PATCH 2189/2223] mm/mmap.c: __vma_adjust(): suppress uninitialized var warning The code is OK, but it fools gcc. mm/mmap.c:802 __vma_adjust() error: uninitialized symbol 'next_next'. Fixes: 524e00b36e8c5 ("mm: remove rb tree.") Reported-by: kernel test robot <lkp@intel.com> Cc: Liam R. 
Howlett <Liam.Howlett@Oracle.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/mmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 3c9890e443a3e..721fe5c82a0e6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -618,7 +618,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next_next, *next = find_vma(mm, vma->vm_end); + struct vm_area_struct *next_next = NULL; /* uninit var warning */ + struct vm_area_struct *next = find_vma(mm, vma->vm_end); struct vm_area_struct *orig_vma = vma; struct address_space *mapping = NULL; struct rb_root_cached *root = NULL; -- GitLab From a57b70519d1f7c53be98478623652738e5ac70d5 Mon Sep 17 00:00:00 2001 From: Liam Howlett <liam.howlett@oracle.com> Date: Tue, 18 Oct 2022 19:17:12 +0000 Subject: [PATCH 2190/2223] mm/mmap: fix MAP_FIXED address return on VMA merge mmap should return the start address of newly mapped area when successful. On a successful merge of a VMA, the return address was changed and thus was violating that expectation from userspace. This is a restoration of functionality provided by 309d08d9b3a3 (mm/mmap.c: fix mmap return value when vma is merged after call_mmap()). For completeness of fixing MAP_FIXED, implement the comments from the previous discussion to never update the address and fail if the address changes. Leaving the error as a WARN_ON() to avoid crashing the kernel. Link: https://lkml.kernel.org/r/20221018191613.4133459-1-Liam.Howlett@oracle.com Link: https://lore.kernel.org/all/Y06yk66SKxlrwwfb@lakrids/ Link: https://lore.kernel.org/all/20201203085350.22624-1-liuzixian4@huawei.com/ Fixes: 4dd1b84140c1 ("mm/mmap: use advanced maple tree API for mmap_region()") Signed-off-by: Liam R. 
Howlett <Liam.Howlett@oracle.com> Reported-by: Mark Rutland <mark.rutland@arm.com> Cc: Liu Zixian <liuzixian4@huawei.com> Cc: David Hildenbrand <david@redhat.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: Matthew Wilcox <willy@infradead.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/mmap.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 721fe5c82a0e6..e270057ed04eb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2626,14 +2626,14 @@ cannot_expand: if (error) goto unmap_and_free_vma; - /* Can addr have changed?? - * - * Answer: Yes, several device drivers can do it in their - * f_op->mmap method. -DaveM + /* + * Expansion is handled above, merging is handled below. + * Drivers should not alter the address of the VMA. */ - WARN_ON_ONCE(addr != vma->vm_start); - - addr = vma->vm_start; + if (WARN_ON((addr != vma->vm_start))) { + error = -EINVAL; + goto close_and_free_vma; + } mas_reset(&mas); /* @@ -2655,7 +2655,6 @@ cannot_expand: vm_area_free(vma); vma = merge; /* Update vm_flags to pick up the change. */ - addr = vma->vm_start; vm_flags = vma->vm_flags; goto unmap_writable; } -- GitLab From 12df140f0bdfae5dcfc81800970dd7f6f632e00c Mon Sep 17 00:00:00 2001 From: Rik van Riel <riel@surriel.com> Date: Mon, 17 Oct 2022 20:25:05 -0400 Subject: [PATCH 2191/2223] mm,hugetlb: take hugetlb_lock before decrementing h->resv_huge_pages The h->*_huge_pages counters are protected by the hugetlb_lock, but alloc_huge_page has a corner case where it can decrement the counter outside of the lock. This could lead to a corrupted value of h->resv_huge_pages, which we have observed on our systems. Take the hugetlb_lock before decrementing h->resv_huge_pages to avoid a potential race. 
Link: https://lkml.kernel.org/r/20221017202505.0e6a4fcd@imladris.surriel.com Fixes: a88c76954804 ("mm: hugetlb: fix hugepage memory leak caused by wrong reserve count") Signed-off-by: Rik van Riel <riel@surriel.com> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Glen McCready <gkmccready@meta.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Muchun Song <songmuchun@bytedance.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b586cdd75930b..dede0337c07c7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2924,11 +2924,11 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; + spin_lock_irq(&hugetlb_lock); if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { SetHPageRestoreReserve(page); h->resv_huge_pages--; } - spin_lock_irq(&hugetlb_lock); list_add(&page->lru, &h->hugepage_activelist); set_page_refcounted(page); /* Fall through */ -- GitLab From 08ac85521cb2e26f25b885492180815ce8eaf4b7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins <hughd@google.com> Date: Tue, 18 Oct 2022 20:18:38 -0700 Subject: [PATCH 2192/2223] mm: /proc/pid/smaps_rollup: fix maple tree search /proc/pid/smaps_rollup showed 0 kB for everything: now find first vma. Link: https://lkml.kernel.org/r/3011bee7-182-97a2-1083-d5f5b688e54b@google.com Fixes: c4c84f06285e ("fs/proc/task_mmu: stop using linked list and highest_vm_end") Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Liam R. 
Howlett <Liam.Howlett@oracle.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8b4f3073f8f55..8a74cdcc9af00 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -902,7 +902,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_mm; hold_task_mempolicy(priv); - vma = mas_find(&mas, 0); + vma = mas_find(&mas, ULONG_MAX); if (unlikely(!vma)) goto empty_set; -- GitLab From df48a5f7a3bbac6a700026b554922943ecee1fb0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" <Liam.Howlett@Oracle.com> Date: Tue, 31 May 2022 09:20:51 -0400 Subject: [PATCH 2193/2223] mm/page_alloc: reduce potential fragmentation in make_alloc_exact() Try to avoid using the left over split page on the next request for a page by calling __free_pages_ok() with FPI_TO_TAIL. This increases the potential of defragmenting memory when it's used for a short period of time. Link: https://lkml.kernel.org/r/20220531185626.yvlmymbxyoe5vags@revolver Signed-off-by: Liam R. 
Howlett <Liam.Howlett@oracle.com> Suggested-by: Matthew Wilcox (Oracle) <willy@infradead.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/page_alloc.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e20ade858e71c..b5a6c815ae284 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5784,14 +5784,18 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) { if (addr) { - unsigned long alloc_end = addr + (PAGE_SIZE << order); - unsigned long used = addr + PAGE_ALIGN(size); - - split_page(virt_to_page((void *)addr), order); - while (used < alloc_end) { - free_page(used); - used += PAGE_SIZE; - } + unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE); + struct page *page = virt_to_page((void *)addr); + struct page *last = page + nr; + + split_page_owner(page, 1 << order); + split_page_memcg(page, 1 << order); + while (page < --last) + set_page_refcounted(last); + + last = page + (1UL << order); + for (page += nr; page < last; page++) + __free_pages_ok(page, 0, FPI_TO_TAIL); } return (void *)addr; } -- GitLab From 612b8a317023e1396965aacac43d80053c6e77db Mon Sep 17 00:00:00 2001 From: Mike Kravetz <mike.kravetz@oracle.com> Date: Wed, 19 Oct 2022 13:19:57 -0700 Subject: [PATCH 2194/2223] hugetlb: fix memory leak associated with vma_lock structure The hugetlb vma_lock structure hangs off the vm_private_data pointer of sharable hugetlb vmas. The structure is vma specific and can not be shared between vmas. At fork and various other times, vmas are duplicated via vm_area_dup(). When this happens, the pointer in the newly created vma must be cleared and the structure reallocated. Two hugetlb specific routines deal with this: hugetlb_dup_vma_private and hugetlb_vm_op_open. Both routines are called for newly created vmas. hugetlb_dup_vma_private would always clear the pointer and hugetlb_vm_op_open would allocate the new vma_lock structure. 
This did not work in the case of this calling sequence pointed out in [1]. move_vma copy_vma new_vma = vm_area_dup(vma); new_vma->vm_ops->open(new_vma); --> new_vma has its own vma lock. is_vm_hugetlb_page(vma) clear_vma_resv_huge_pages hugetlb_dup_vma_private --> vma->vm_private_data is set to NULL When clearing hugetlb_dup_vma_private we actually leak the associated vma_lock structure. The vma_lock structure contains a pointer to the associated vma. This information can be used in hugetlb_dup_vma_private and hugetlb_vm_op_open to ensure we only clear the vm_private_data of newly created (copied) vmas. In such cases, the vma->vma_lock->vma field will not point to the vma. Update hugetlb_dup_vma_private and hugetlb_vm_op_open to not clear vm_private_data if vma->vma_lock->vma == vma. Also, log a warning if hugetlb_vm_op_open ever encounters the case where vma_lock has already been correctly allocated for the vma. [1] https://lore.kernel.org/linux-mm/5154292a-4c55-28cd-0935-82441e512fc3@huawei.com/ Link: https://lkml.kernel.org/r/20221019201957.34607-1-mike.kravetz@oracle.com Fixes: 131a79b474e9 ("hugetlb: fix vma lock handling during split vma and range unmapping") Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com> Reviewed-by: Miaohe Lin <linmiaohe@huawei.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: David Hildenbrand <david@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: James Houghton <jthoughton@google.com> Cc: "Kirill A. 
Shutemov" <kirill.shutemov@linux.intel.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Mina Almasry <almasrymina@google.com> Cc: Muchun Song <songmuchun@bytedance.com> Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev> Cc: Pasha Tatashin <pasha.tatashin@soleen.com> Cc: Peter Xu <peterx@redhat.com> Cc: Prakash Sangappa <prakash.sangappa@oracle.com> Cc: Sven Schnelle <svens@linux.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/hugetlb.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dede0337c07c7..546df97c31e4c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1014,15 +1014,23 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma) VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); /* * Clear vm_private_data + * - For shared mappings this is a per-vma semaphore that may be + * allocated in a subsequent call to hugetlb_vm_op_open. + * Before clearing, make sure pointer is not associated with vma + * as this will leak the structure. This is the case when called + * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already + * been called to allocate a new structure. * - For MAP_PRIVATE mappings, this is the reserve map which does * not apply to children. Faults generated by the children are * not guaranteed to succeed, even if read-only. - * - For shared mappings this is a per-vma semaphore that may be - * allocated in a subsequent call to hugetlb_vm_op_open. */ - vma->vm_private_data = (void *)0; - if (!(vma->vm_flags & VM_MAYSHARE)) - return; + if (vma->vm_flags & VM_MAYSHARE) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (vma_lock && vma_lock->vma != vma) + vma->vm_private_data = NULL; + } else + vma->vm_private_data = NULL; } /* @@ -4601,6 +4609,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) struct resv_map *resv = vma_resv_map(vma); /* + * HPAGE_RESV_OWNER indicates a private mapping. 
* This new VMA should share its siblings reservation map if present. * The VMA will only ever have a valid reservation map pointer where * it is being copied for another still existing VMA. As that VMA @@ -4615,11 +4624,21 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) /* * vma_lock structure for sharable mappings is vma specific. - * Clear old pointer (if copied via vm_area_dup) and create new. + * Clear old pointer (if copied via vm_area_dup) and allocate + * new structure. Before clearing, make sure vma_lock is not + * for this vma. */ if (vma->vm_flags & VM_MAYSHARE) { - vma->vm_private_data = NULL; - hugetlb_vma_lock_alloc(vma); + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (vma_lock) { + if (vma_lock->vma != vma) { + vma->vm_private_data = NULL; + hugetlb_vma_lock_alloc(vma); + } else + pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__); + } else + hugetlb_vma_lock_alloc(vma); } } -- GitLab From 71e2d666ef85d51834d658830f823560c402b8b6 Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@techsingularity.net> Date: Wed, 19 Oct 2022 14:41:56 +0100 Subject: [PATCH 2195/2223] mm/huge_memory: do not clobber swp_entry_t during THP split The following has been observed when running stressng mmap since commit b653db77350c ("mm: Clear page->private when splitting or migrating a page") watchdog: BUG: soft lockup - CPU#75 stuck for 26s! 
[stress-ng:9546] CPU: 75 PID: 9546 Comm: stress-ng Tainted: G E 6.0.0-revert-b653db77-fix+ #29 0357d79b60fb09775f678e4f3f64ef0579ad1374 Hardware name: SGI.COM C2112-4GP3/X10DRT-P-Series, BIOS 2.0a 05/09/2016 RIP: 0010:xas_descend+0x28/0x80 Code: cc cc 0f b6 0e 48 8b 57 08 48 d3 ea 83 e2 3f 89 d0 48 83 c0 04 48 8b 44 c6 08 48 89 77 18 48 89 c1 83 e1 03 48 83 f9 02 75 08 <48> 3d fd 00 00 00 76 08 88 57 12 c3 cc cc cc cc 48 c1 e8 02 89 c2 RSP: 0018:ffffbbf02a2236a8 EFLAGS: 00000246 RAX: ffff9cab7d6a0002 RBX: ffffe04b0af88040 RCX: 0000000000000002 RDX: 0000000000000030 RSI: ffff9cab60509b60 RDI: ffffbbf02a2236c0 RBP: 0000000000000000 R08: ffff9cab60509b60 R09: ffffbbf02a2236c0 R10: 0000000000000001 R11: ffffbbf02a223698 R12: 0000000000000000 R13: ffff9cab4e28da80 R14: 0000000000039c01 R15: ffff9cab4e28da88 FS: 00007fab89b85e40(0000) GS:ffff9cea3fcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fab84e00000 CR3: 00000040b73a4003 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> xas_load+0x3a/0x50 __filemap_get_folio+0x80/0x370 ? put_swap_page+0x163/0x360 pagecache_get_page+0x13/0x90 __try_to_reclaim_swap+0x50/0x190 scan_swap_map_slots+0x31e/0x670 get_swap_pages+0x226/0x3c0 folio_alloc_swap+0x1cc/0x240 add_to_swap+0x14/0x70 shrink_page_list+0x968/0xbc0 reclaim_page_list+0x70/0xf0 reclaim_pages+0xdd/0x120 madvise_cold_or_pageout_pte_range+0x814/0xf30 walk_pgd_range+0x637/0xa30 __walk_page_range+0x142/0x170 walk_page_range+0x146/0x170 madvise_pageout+0xb7/0x280 ? asm_common_interrupt+0x22/0x40 madvise_vma_behavior+0x3b7/0xac0 ? find_vma+0x4a/0x70 ? find_vma+0x64/0x70 ? madvise_vma_anon_name+0x40/0x40 madvise_walk_vmas+0xa6/0x130 do_madvise+0x2f4/0x360 __x64_sys_madvise+0x26/0x30 do_syscall_64+0x5b/0x80 ? do_syscall_64+0x67/0x80 ? syscall_exit_to_user_mode+0x17/0x40 ? do_syscall_64+0x67/0x80 ? 
syscall_exit_to_user_mode+0x17/0x40 ? do_syscall_64+0x67/0x80 ? do_syscall_64+0x67/0x80 ? common_interrupt+0x8b/0xa0 entry_SYSCALL_64_after_hwframe+0x63/0xcd The problem can be reproduced with the mmtests config config-workload-stressng-mmap. It does not always happen and when it triggers is variable but it has happened on multiple machines. The intent of commit b653db77350c patch was to avoid the case where PG_private is clear but folio->private is not-NULL. However, THP tail pages uses page->private for "swp_entry_t if folio_test_swapcache()" as stated in the documentation for struct folio. This patch only clobbers page->private for tail pages if the head page was not in swapcache and warns once if page->private had an unexpected value. Link: https://lkml.kernel.org/r/20221019134156.zjyyn5aownakvztf@techsingularity.net Fixes: b653db77350c ("mm: Clear page->private when splitting or migrating a page") Signed-off-by: Mel Gorman <mgorman@techsingularity.net> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Yang Shi <shy828301@gmail.com> Cc: Brian Foster <bfoster@redhat.com> Cc: Dan Streetman <ddstreet@ieee.org> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Oleksandr Natalenko <oleksandr@natalenko.name> Cc: Seth Jennings <sjenning@redhat.com> Cc: Vitaly Wool <vitaly.wool@konsulko.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- mm/huge_memory.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1cc4a5f4791e9..03fc7e5edf075 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2455,7 +2455,16 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail); page_tail->mapping = head->mapping; page_tail->index = head->index + tail; - page_tail->private = 0; + + /* + * page->private should not be set in tail pages with the exception + * of swap cache pages that store the swp_entry_t in tail 
pages. + * Fix up and warn once if private is unexpectedly set. + */ + if (!folio_test_swapcache(page_folio(head))) { + VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, head); + page_tail->private = 0; + } /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); -- GitLab From 97061d441110528dc02972818f2f1dad485107f9 Mon Sep 17 00:00:00 2001 From: Alistair Popple <apopple@nvidia.com> Date: Wed, 19 Oct 2022 23:29:34 +1100 Subject: [PATCH 2196/2223] nouveau: fix migrate_to_ram() for faulting page Commit 16ce101db85d ("mm/memory.c: fix race when faulting a device private page") changed the migrate_to_ram() callback to take a reference on the device page to ensure it can't be freed while handling the fault. Unfortunately the corresponding update to Nouveau to accommodate this change was inadvertently dropped from that patch causing GPU to CPU migration to fail so add it here. Link: https://lkml.kernel.org/r/20221019122934.866205-1-apopple@nvidia.com Fixes: 16ce101db85d ("mm/memory.c: fix race when faulting a device private page") Signed-off-by: Alistair Popple <apopple@nvidia.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Ralph Campbell <rcampbell@nvidia.com> Cc: Lyude Paul <lyude@redhat.com> Cc: Ben Skeggs <bskeggs@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 5fe209107246f..20fe53815b20f 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -176,6 +176,7 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) .src = &src, .dst = &dst, .pgmap_owner = drm->dev, + .fault_page = vmf->page, .flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE, }; -- GitLab From 65d78b8d0405fcda02b69fd3c34327e4af5cd465 Mon Sep 17 00:00:00 2001 From: Adam Borowski <kilobyte@angband.pl> Date: Mon, 10 Oct 2022 20:33:51 
+0200 Subject: [PATCH 2197/2223] i2c: mlxbf: depend on ACPI; clean away ifdeffage This fixes maybe_unused warnings/errors. According to a comment during device tree removal, only ACPI is supported, thus let's actually require it. Fixes: be18c5ede25d ("i2c: mlxbf: remove device tree support") Signed-off-by: Adam Borowski <kilobyte@angband.pl> Signed-off-by: Wolfram Sang <wsa@kernel.org> --- drivers/i2c/busses/Kconfig | 1 + drivers/i2c/busses/i2c-mlxbf.c | 9 --------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 264e780ae32e1..e50f9603d189e 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -764,6 +764,7 @@ config I2C_LPC2K config I2C_MLXBF tristate "Mellanox BlueField I2C controller" depends on MELLANOX_PLATFORM && ARM64 + depends on ACPI select I2C_SLAVE help Enabling this option will add I2C SMBus support for Mellanox BlueField diff --git a/drivers/i2c/busses/i2c-mlxbf.c b/drivers/i2c/busses/i2c-mlxbf.c index e68e775f187e6..1810d5791b3d7 100644 --- a/drivers/i2c/busses/i2c-mlxbf.c +++ b/drivers/i2c/busses/i2c-mlxbf.c @@ -2247,7 +2247,6 @@ static struct i2c_adapter_quirks mlxbf_i2c_quirks = { .max_write_len = MLXBF_I2C_MASTER_DATA_W_LENGTH, }; -#ifdef CONFIG_ACPI static const struct acpi_device_id mlxbf_i2c_acpi_ids[] = { { "MLNXBF03", (kernel_ulong_t)&mlxbf_i2c_chip[MLXBF_I2C_CHIP_TYPE_1] }, { "MLNXBF23", (kernel_ulong_t)&mlxbf_i2c_chip[MLXBF_I2C_CHIP_TYPE_2] }, @@ -2282,12 +2281,6 @@ static int mlxbf_i2c_acpi_probe(struct device *dev, struct mlxbf_i2c_priv *priv) return 0; } -#else -static int mlxbf_i2c_acpi_probe(struct device *dev, struct mlxbf_i2c_priv *priv) -{ - return -ENOENT; -} -#endif /* CONFIG_ACPI */ static int mlxbf_i2c_probe(struct platform_device *pdev) { @@ -2490,9 +2483,7 @@ static struct platform_driver mlxbf_i2c_driver = { .remove = mlxbf_i2c_remove, .driver = { .name = "i2c-mlxbf", -#ifdef CONFIG_ACPI .acpi_match_table = 
ACPI_PTR(mlxbf_i2c_acpi_ids), -#endif /* CONFIG_ACPI */ }, }; -- GitLab From 5c20a3a9df19811051441214e7f5091cb3546db0 Mon Sep 17 00:00:00 2001 From: Andrew Jones <ajones@ventanamicro.com> Date: Fri, 21 Oct 2022 11:52:39 +0530 Subject: [PATCH 2198/2223] RISC-V: Fix compilation without RISCV_ISA_ZICBOM riscv_cbom_block_size and riscv_init_cbom_blocksize() should always be available and riscv_init_cbom_blocksize() should always be invoked, even when compiling without RISCV_ISA_ZICBOM enabled. This is because disabling RISCV_ISA_ZICBOM means "don't use zicbom instructions in the kernel" not "pretend there isn't zicbom, even when there is". When zicbom is available, whether the kernel enables its use with RISCV_ISA_ZICBOM or not, KVM will offer it to guests. Ensure we can build KVM and that the block size is initialized even when compiling without RISCV_ISA_ZICBOM. Fixes: 8f7e001e0325 ("RISC-V: Clean up the Zicbom block size probing") Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Andrew Jones <ajones@ventanamicro.com> Signed-off-by: Anup Patel <apatel@ventanamicro.com> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Reviewed-by: Heiko Stuebner <heiko@sntech.de> Tested-by: Heiko Stuebner <heiko@sntech.de> Signed-off-by: Anup Patel <anup@brainfault.org> --- arch/riscv/include/asm/cacheflush.h | 8 ------ arch/riscv/mm/cacheflush.c | 38 ++++++++++++++++++++++++++ arch/riscv/mm/dma-noncoherent.c | 41 ----------------------------- 3 files changed, 38 insertions(+), 49 deletions(-) diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index 8a5c246b0a216..f6fbe7042f1c8 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -42,16 +42,8 @@ void flush_icache_mm(struct mm_struct *mm, bool local); #endif /* CONFIG_SMP */ -/* - * The T-Head CMO errata internally probe the CBOM block size, but otherwise - * don't depend on Zicbom. 
- */ extern unsigned int riscv_cbom_block_size; -#ifdef CONFIG_RISCV_ISA_ZICBOM void riscv_init_cbom_blocksize(void); -#else -static inline void riscv_init_cbom_blocksize(void) { } -#endif #ifdef CONFIG_RISCV_DMA_NONCOHERENT void riscv_noncoherent_supported(void); diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 6cb7d96ad9c7b..57b40a3504206 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -3,6 +3,7 @@ * Copyright (C) 2017 SiFive */ +#include <linux/of.h> #include <asm/cacheflush.h> #ifdef CONFIG_SMP @@ -86,3 +87,40 @@ void flush_icache_pte(pte_t pte) flush_icache_all(); } #endif /* CONFIG_MMU */ + +unsigned int riscv_cbom_block_size; +EXPORT_SYMBOL_GPL(riscv_cbom_block_size); + +void riscv_init_cbom_blocksize(void) +{ + struct device_node *node; + unsigned long cbom_hartid; + u32 val, probed_block_size; + int ret; + + probed_block_size = 0; + for_each_of_cpu_node(node) { + unsigned long hartid; + + ret = riscv_of_processor_hartid(node, &hartid); + if (ret) + continue; + + /* set block-size for cbom extension if available */ + ret = of_property_read_u32(node, "riscv,cbom-block-size", &val); + if (ret) + continue; + + if (!probed_block_size) { + probed_block_size = val; + cbom_hartid = hartid; + } else { + if (probed_block_size != val) + pr_warn("cbom-block-size mismatched between harts %lu and %lu\n", + cbom_hartid, hartid); + } + } + + if (probed_block_size) + riscv_cbom_block_size = probed_block_size; +} diff --git a/arch/riscv/mm/dma-noncoherent.c b/arch/riscv/mm/dma-noncoherent.c index b0add983530ab..d919efab6ebad 100644 --- a/arch/riscv/mm/dma-noncoherent.c +++ b/arch/riscv/mm/dma-noncoherent.c @@ -8,13 +8,8 @@ #include <linux/dma-direct.h> #include <linux/dma-map-ops.h> #include <linux/mm.h> -#include <linux/of.h> -#include <linux/of_device.h> #include <asm/cacheflush.h> -unsigned int riscv_cbom_block_size; -EXPORT_SYMBOL_GPL(riscv_cbom_block_size); - static bool noncoherent_supported; void 
arch_sync_dma_for_device(phys_addr_t paddr, size_t size, @@ -77,42 +72,6 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, dev->dma_coherent = coherent; } -#ifdef CONFIG_RISCV_ISA_ZICBOM -void riscv_init_cbom_blocksize(void) -{ - struct device_node *node; - unsigned long cbom_hartid; - u32 val, probed_block_size; - int ret; - - probed_block_size = 0; - for_each_of_cpu_node(node) { - unsigned long hartid; - - ret = riscv_of_processor_hartid(node, &hartid); - if (ret) - continue; - - /* set block-size for cbom extension if available */ - ret = of_property_read_u32(node, "riscv,cbom-block-size", &val); - if (ret) - continue; - - if (!probed_block_size) { - probed_block_size = val; - cbom_hartid = hartid; - } else { - if (probed_block_size != val) - pr_warn("cbom-block-size mismatched between harts %lu and %lu\n", - cbom_hartid, hartid); - } - } - - if (probed_block_size) - riscv_cbom_block_size = probed_block_size; -} -#endif - void riscv_noncoherent_supported(void) { WARN(!riscv_cbom_block_size, -- GitLab From cea8896bd936135559253e9b23340cfa1cdf0caf Mon Sep 17 00:00:00 2001 From: Anup Patel <apatel@ventanamicro.com> Date: Fri, 21 Oct 2022 11:52:45 +0530 Subject: [PATCH 2199/2223] RISC-V: KVM: Fix kvm_riscv_vcpu_timer_pending() for Sstc The kvm_riscv_vcpu_timer_pending() checks per-VCPU next_cycles and per-VCPU software injected VS timer interrupt. This function returns incorrect value when Sstc is available because the per-VCPU next_cycles are only updated by kvm_riscv_vcpu_timer_save() called from kvm_arch_vcpu_put(). As a result, when Sstc is available the VCPU does not block properly upon WFI traps. To fix the above issue, we introduce kvm_riscv_vcpu_timer_sync() which will update per-VCPU next_cycles upon every VM exit instead of kvm_riscv_vcpu_timer_save(). 
Fixes: 8f5cb44b1bae ("RISC-V: KVM: Support sstc extension") Signed-off-by: Anup Patel <apatel@ventanamicro.com> Reviewed-by: Atish Patra <atishp@rivosinc.com> Signed-off-by: Anup Patel <anup@brainfault.org> --- arch/riscv/include/asm/kvm_vcpu_timer.h | 1 + arch/riscv/kvm/vcpu.c | 3 +++ arch/riscv/kvm/vcpu_timer.c | 17 +++++++++++++++-- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/kvm_vcpu_timer.h b/arch/riscv/include/asm/kvm_vcpu_timer.h index 0d8fdb8ec63aa..82f7260301da2 100644 --- a/arch/riscv/include/asm/kvm_vcpu_timer.h +++ b/arch/riscv/include/asm/kvm_vcpu_timer.h @@ -45,6 +45,7 @@ int kvm_riscv_vcpu_timer_deinit(struct kvm_vcpu *vcpu); int kvm_riscv_vcpu_timer_reset(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_timer_restore(struct kvm_vcpu *vcpu); void kvm_riscv_guest_timer_init(struct kvm *kvm); +void kvm_riscv_vcpu_timer_sync(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_timer_save(struct kvm_vcpu *vcpu); bool kvm_riscv_vcpu_timer_pending(struct kvm_vcpu *vcpu); diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index a032c4f0d6006..71ebbc4821f0e 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -708,6 +708,9 @@ void kvm_riscv_vcpu_sync_interrupts(struct kvm_vcpu *vcpu) clear_bit(IRQ_VS_SOFT, &v->irqs_pending); } } + + /* Sync-up timer CSRs */ + kvm_riscv_vcpu_timer_sync(vcpu); } int kvm_riscv_vcpu_set_interrupt(struct kvm_vcpu *vcpu, unsigned int irq) diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c index 185f2386a747e..ad34519c8a13d 100644 --- a/arch/riscv/kvm/vcpu_timer.c +++ b/arch/riscv/kvm/vcpu_timer.c @@ -320,20 +320,33 @@ void kvm_riscv_vcpu_timer_restore(struct kvm_vcpu *vcpu) kvm_riscv_vcpu_timer_unblocking(vcpu); } -void kvm_riscv_vcpu_timer_save(struct kvm_vcpu *vcpu) +void kvm_riscv_vcpu_timer_sync(struct kvm_vcpu *vcpu) { struct kvm_vcpu_timer *t = &vcpu->arch.timer; if (!t->sstc_enabled) return; - t = &vcpu->arch.timer; #if defined(CONFIG_32BIT) t->next_cycles 
= csr_read(CSR_VSTIMECMP); t->next_cycles |= (u64)csr_read(CSR_VSTIMECMPH) << 32; #else t->next_cycles = csr_read(CSR_VSTIMECMP); #endif +} + +void kvm_riscv_vcpu_timer_save(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_timer *t = &vcpu->arch.timer; + + if (!t->sstc_enabled) + return; + + /* + * The vstimecmp CSRs are saved by kvm_riscv_vcpu_timer_sync() + * upon every VM exit so no need to save here. + */ + /* timer should be enabled for the remaining operations */ if (unlikely(!t->init_done)) return; -- GitLab From 0251d0107cfb0bb5ab2d3f97710487b9522db020 Mon Sep 17 00:00:00 2001 From: Lu Baolu <baolu.lu@linux.intel.com> Date: Wed, 19 Oct 2022 08:44:44 +0800 Subject: [PATCH 2200/2223] iommu: Add gfp parameter to iommu_alloc_resv_region Add gfp parameter to iommu_alloc_resv_region() for the callers to specify the memory allocation behavior. Thus iommu_alloc_resv_region() could also be available in critical contexts. Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com> Tested-by: Alex Williamson <alex.williamson@redhat.com> Link: https://lore.kernel.org/r/20220927053109.4053662-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel <jroedel@suse.de> --- drivers/acpi/arm64/iort.c | 3 ++- drivers/iommu/amd/iommu.c | 7 ++++--- drivers/iommu/apple-dart.c | 2 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 2 +- drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 +- drivers/iommu/intel/iommu.c | 8 +++++--- drivers/iommu/iommu.c | 7 ++++--- drivers/iommu/mtk_iommu.c | 3 ++- drivers/iommu/virtio-iommu.c | 9 ++++++--- include/linux/iommu.h | 2 +- 10 files changed, 27 insertions(+), 18 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index ca2aed86b5404..8059baf4ef271 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1142,7 +1142,8 @@ static void iort_iommu_msi_get_resv_regions(struct device *dev, struct iommu_resv_region *region; region = iommu_alloc_resv_region(base + SZ_64K, SZ_64K, - prot, IOMMU_RESV_MSI); + prot, 
IOMMU_RESV_MSI, + GFP_KERNEL); if (region) list_add_tail(&region->list, head); } diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 65856e4019494..d3b39d0416fa3 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2330,7 +2330,8 @@ static void amd_iommu_get_resv_regions(struct device *dev, type = IOMMU_RESV_RESERVED; region = iommu_alloc_resv_region(entry->address_start, - length, prot, type); + length, prot, type, + GFP_KERNEL); if (!region) { dev_err(dev, "Out of memory allocating dm-regions\n"); return; @@ -2340,14 +2341,14 @@ static void amd_iommu_get_resv_regions(struct device *dev, region = iommu_alloc_resv_region(MSI_RANGE_START, MSI_RANGE_END - MSI_RANGE_START + 1, - 0, IOMMU_RESV_MSI); + 0, IOMMU_RESV_MSI, GFP_KERNEL); if (!region) return; list_add_tail(&region->list, head); region = iommu_alloc_resv_region(HT_RANGE_START, HT_RANGE_END - HT_RANGE_START + 1, - 0, IOMMU_RESV_RESERVED); + 0, IOMMU_RESV_RESERVED, GFP_KERNEL); if (!region) return; list_add_tail(&region->list, head); diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 4526575b999e7..4f4a323be0d0f 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -758,7 +758,7 @@ static void apple_dart_get_resv_regions(struct device *dev, region = iommu_alloc_resv_region(DOORBELL_ADDR, PAGE_SIZE, prot, - IOMMU_RESV_MSI); + IOMMU_RESV_MSI, GFP_KERNEL); if (!region) return; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index ba47c73f5b8c8..6d5df91c5c465 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2757,7 +2757,7 @@ static void arm_smmu_get_resv_regions(struct device *dev, int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH, - prot, IOMMU_RESV_SW_MSI); + prot, IOMMU_RESV_SW_MSI, GFP_KERNEL); if (!region) return; diff --git
a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 6c1114a4d6cc1..30dab1418e3ff 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1534,7 +1534,7 @@ static void arm_smmu_get_resv_regions(struct device *dev, int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH, - prot, IOMMU_RESV_SW_MSI); + prot, IOMMU_RESV_SW_MSI, GFP_KERNEL); if (!region) return; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index a8b36c3fddf1a..d5965b4f8b602 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4552,7 +4552,8 @@ static void intel_iommu_get_resv_regions(struct device *device, IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; resv = iommu_alloc_resv_region(rmrr->base_address, - length, prot, type); + length, prot, type, + GFP_KERNEL); if (!resv) break; @@ -4567,7 +4568,8 @@ static void intel_iommu_get_resv_regions(struct device *device, if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { reg = iommu_alloc_resv_region(0, 1UL << 24, prot, - IOMMU_RESV_DIRECT_RELAXABLE); + IOMMU_RESV_DIRECT_RELAXABLE, + GFP_KERNEL); if (reg) list_add_tail(&reg->list, head); } @@ -4576,7 +4578,7 @@ static void intel_iommu_get_resv_regions(struct device *device, reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, - 0, IOMMU_RESV_MSI); + 0, IOMMU_RESV_MSI, GFP_KERNEL); if (!reg) return; list_add_tail(&reg->list, head); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 4893c2429ca56..65a3b3d886dc0 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -504,7 +504,7 @@ static int iommu_insert_resv_region(struct iommu_resv_region *new, LIST_HEAD(stack); nr = iommu_alloc_resv_region(new->start, new->length, - new->prot, new->type); + new->prot, new->type, GFP_KERNEL); if (!nr) return -ENOMEM; @@ -2579,11 +2579,12 @@ EXPORT_SYMBOL(iommu_put_resv_regions);
struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start, size_t length, int prot, - enum iommu_resv_type type) + enum iommu_resv_type type, + gfp_t gfp) { struct iommu_resv_region *region; - region = kzalloc(sizeof(*region), GFP_KERNEL); + region = kzalloc(sizeof(*region), gfp); if (!region) return NULL; diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 5a4e00e4bbbc7..2ab2ecfe01f80 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -917,7 +917,8 @@ static void mtk_iommu_get_resv_regions(struct device *dev, continue; region = iommu_alloc_resv_region(resv->iova_base, resv->size, - prot, IOMMU_RESV_RESERVED); + prot, IOMMU_RESV_RESERVED, + GFP_KERNEL); if (!region) return; diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index b7c22802f57c0..8b1b5c270e502 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -490,11 +490,13 @@ static int viommu_add_resv_mem(struct viommu_endpoint *vdev, fallthrough; case VIRTIO_IOMMU_RESV_MEM_T_RESERVED: region = iommu_alloc_resv_region(start, size, 0, - IOMMU_RESV_RESERVED); + IOMMU_RESV_RESERVED, + GFP_KERNEL); break; case VIRTIO_IOMMU_RESV_MEM_T_MSI: region = iommu_alloc_resv_region(start, size, prot, - IOMMU_RESV_MSI); + IOMMU_RESV_MSI, + GFP_KERNEL); break; } if (!region) @@ -909,7 +911,8 @@ static void viommu_get_resv_regions(struct device *dev, struct list_head *head) */ if (!msi) { msi = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH, - prot, IOMMU_RESV_SW_MSI); + prot, IOMMU_RESV_SW_MSI, + GFP_KERNEL); if (!msi) return; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a325532aeab58..3c9da1f8979e3 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -455,7 +455,7 @@ extern void iommu_set_default_translated(bool cmd_line); extern bool iommu_default_passthrough(void); extern struct iommu_resv_region * iommu_alloc_resv_region(phys_addr_t start, size_t length, int prot, - enum 
iommu_resv_type type); + enum iommu_resv_type type, gfp_t gfp); extern int iommu_get_group_resv_regions(struct iommu_group *group, struct list_head *head); -- GitLab From bf638a6513dda3021e3e90bdacb71c606bd0c305 Mon Sep 17 00:00:00 2001 From: Lu Baolu <baolu.lu@linux.intel.com> Date: Wed, 19 Oct 2022 08:44:45 +0800 Subject: [PATCH 2201/2223] iommu/vt-d: Use rcu_lock in get_resv_regions Commit 5f64ce5411b46 ("iommu/vt-d: Duplicate iommu_resv_region objects per device list") converted rcu_lock in get_resv_regions to dmar_global_lock to allow sleeping in iommu_alloc_resv_region(). This introduced possible recursive locking if get_resv_regions is called from within a section where intel_iommu_init() already holds dmar_global_lock. Especially, after commit 57365a04c921 ("iommu: Move bus setup to IOMMU device registration"), below lockdep splats could always be seen. ============================================ WARNING: possible recursive locking detected 6.0.0-rc4+ #325 Tainted: G I -------------------------------------------- swapper/0/1 is trying to acquire lock: ffffffffa8a18c90 (dmar_global_lock){++++}-{3:3}, at: intel_iommu_get_resv_regions+0x25/0x270 but task is already holding lock: ffffffffa8a18c90 (dmar_global_lock){++++}-{3:3}, at: intel_iommu_init+0x36d/0x6ea ... Call Trace: <TASK> dump_stack_lvl+0x48/0x5f __lock_acquire.cold.73+0xad/0x2bb lock_acquire+0xc2/0x2e0 ? intel_iommu_get_resv_regions+0x25/0x270 ? lock_is_held_type+0x9d/0x110 down_read+0x42/0x150 ? intel_iommu_get_resv_regions+0x25/0x270 intel_iommu_get_resv_regions+0x25/0x270 iommu_create_device_direct_mappings.isra.28+0x8d/0x1c0 ? iommu_get_dma_cookie+0x6d/0x90 bus_iommu_probe+0x19f/0x2e0 iommu_device_register+0xd4/0x130 intel_iommu_init+0x3e1/0x6ea ? iommu_setup+0x289/0x289 ? rdinit_setup+0x34/0x34 pci_iommu_init+0x12/0x3a do_one_initcall+0x65/0x320 ? rdinit_setup+0x34/0x34 ? rcu_read_lock_sched_held+0x5a/0x80 kernel_init_freeable+0x28a/0x2f3 ? 
rest_init+0x1b0/0x1b0 kernel_init+0x1a/0x130 ret_from_fork+0x1f/0x30 </TASK> This rolls back dmar_global_lock to rcu_lock in get_resv_regions to avoid the lockdep splat. Fixes: 57365a04c921 ("iommu: Move bus setup to IOMMU device registration") Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com> Tested-by: Alex Williamson <alex.williamson@redhat.com> Link: https://lore.kernel.org/r/20220927053109.4053662-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel <jroedel@suse.de> --- drivers/iommu/intel/iommu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index d5965b4f8b602..b3cf0f991e297 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4534,7 +4534,7 @@ static void intel_iommu_get_resv_regions(struct device *device, struct device *i_dev; int i; - down_read(&dmar_global_lock); + rcu_read_lock(); for_each_rmrr_units(rmrr) { for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, i, i_dev) { @@ -4553,14 +4553,14 @@ static void intel_iommu_get_resv_regions(struct device *device, resv = iommu_alloc_resv_region(rmrr->base_address, length, prot, type, - GFP_KERNEL); + GFP_ATOMIC); if (!resv) break; list_add_tail(&resv->list, head); } } - up_read(&dmar_global_lock); + rcu_read_unlock(); #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA if (dev_is_pci(device)) { -- GitLab From 5566e68d829f5d87670d5984c1c2ccb4c518405f Mon Sep 17 00:00:00 2001 From: Charlotte Tan <charlotte@extrahop.com> Date: Wed, 19 Oct 2022 08:44:46 +0800 Subject: [PATCH 2202/2223] iommu/vt-d: Allow NVS regions in arch_rmrr_sanity_check() arch_rmrr_sanity_check() warns if the RMRR is not covered by an ACPI Reserved region, but it seems like it should accept an NVS region as well. 
The ACPI spec https://uefi.org/specs/ACPI/6.5/15_System_Address_Map_Interfaces.html uses similar wording for "Reserved" and "NVS" region types; for NVS regions it says "This range of addresses is in use or reserved by the system and must not be used by the operating system." There is an old comment on this mailing list that also suggests NVS regions should pass the arch_rmrr_sanity_check() test: The warnings come from arch_rmrr_sanity_check() since it checks whether the region is E820_TYPE_RESERVED. However, if the purpose of the check is to detect RMRR has regions that may be used by OS as free memory, isn't E820_TYPE_NVS safe, too? This patch overlaps with another proposed patch that would add the region type to the log since sometimes the bug reporter sees this log on the console but doesn't know to include the kernel log: https://lore.kernel.org/lkml/20220611204859.234975-3-atomlin@redhat.com/ Here's an example of the "Firmware Bug" apparent false positive (wrapped for line length): DMAR: [Firmware Bug]: No firmware reserved region can cover this RMRR [0x000000006f760000-0x000000006f762fff], contact BIOS vendor for fixes DMAR: [Firmware Bug]: Your BIOS is broken; bad RMRR [0x000000006f760000-0x000000006f762fff] This is the snippet from the e820 table: BIOS-e820: [mem 0x0000000068bff000-0x000000006ebfefff] reserved BIOS-e820: [mem 0x000000006ebff000-0x000000006f9fefff] ACPI NVS BIOS-e820: [mem 0x000000006f9ff000-0x000000006fffefff] ACPI data Fixes: f036c7fa0ab6 ("iommu/vt-d: Check VT-d RMRR region in BIOS is reported as reserved") Cc: Will Mortensen <will@extrahop.com> Link: https://lore.kernel.org/linux-iommu/64a5843d-850d-e58c-4fc2-0a0eeeb656dc@nec.com/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=216443 Signed-off-by: Charlotte Tan <charlotte@extrahop.com> Reviewed-by: Aaron Tomlin <atomlin@redhat.com> Link: https://lore.kernel.org/r/20220929044449.32515-1-charlotte@extrahop.com Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com> Signed-off-by: Joerg 
Roedel <jroedel@suse.de> --- arch/x86/include/asm/iommu.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 0bef44d30a278..2fd52b65deac1 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -25,8 +25,10 @@ arch_rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) { u64 start = rmrr->base_address; u64 end = rmrr->end_address + 1; + int entry_type; - if (e820__mapped_all(start, end, E820_TYPE_RESERVED)) + entry_type = e820__get_entry_type(start, end); + if (entry_type == E820_TYPE_RESERVED || entry_type == E820_TYPE_NVS) return 0; pr_err(FW_BUG "No firmware reserved region can cover this RMRR [%#018Lx-%#018Lx], contact BIOS vendor for fixes\n", -- GitLab From 620bf9f981365c18cc2766c53d92bf8131c63f32 Mon Sep 17 00:00:00 2001 From: Jerry Snitselaar <jsnitsel@redhat.com> Date: Wed, 19 Oct 2022 08:44:47 +0800 Subject: [PATCH 2203/2223] iommu/vt-d: Clean up si_domain in the init_dmars() error path A splat from kmem_cache_destroy() was seen with a kernel prior to commit ee2653bbe89d ("iommu/vt-d: Remove domain and devinfo mempool") when there was a failure in init_dmars(), because the iommu_domain cache still had objects. While the mempool code is now gone, there still is a leak of the si_domain memory if init_dmars() fails. So clean up si_domain in the init_dmars() error path. 
Cc: Lu Baolu <baolu.lu@linux.intel.com> Cc: Joerg Roedel <joro@8bytes.org> Cc: Will Deacon <will@kernel.org> Cc: Robin Murphy <robin.murphy@arm.com> Fixes: 86080ccc223a ("iommu/vt-d: Allocate si_domain in init_dmars()") Signed-off-by: Jerry Snitselaar <jsnitsel@redhat.com> Link: https://lore.kernel.org/r/20221010144842.308890-1-jsnitsel@redhat.com Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com> Signed-off-by: Joerg Roedel <jroedel@suse.de> --- drivers/iommu/intel/iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index b3cf0f991e297..48cdcd0a5cf34 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2410,6 +2410,7 @@ static int __init si_domain_init(int hw) if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { domain_exit(si_domain); + si_domain = NULL; return -EFAULT; } @@ -3052,6 +3053,10 @@ free_iommu: disable_dmar_iommu(iommu); free_dmar_iommu(iommu); } + if (si_domain) { + domain_exit(si_domain); + si_domain = NULL; + } return ret; } -- GitLab From f57fb375a203e28bf7c08ca01d5ee72028b391d9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel <ardb@kernel.org> Date: Mon, 17 Oct 2022 12:48:46 +0200 Subject: [PATCH 2204/2223] efi: libstub: Remove zboot signing from build options The zboot decompressor series introduced a feature to sign the PE/COFF kernel image for secure boot as part of the kernel build. This was necessary because there are actually two images that need to be signed: the kernel with the EFI stub attached, and the decompressor application. This is a bit of a burden, because it means that the images must be signed on the same system that performs the build, and this is not realistic for distros. During the next cycle, we will introduce changes to the zboot code so that the inner image no longer needs to be signed. This means that the outer PE/COFF image can be handled as usual, and be signed later in the release process. 
Let's remove the associated Kconfig options now so that they don't end up in a LTS release while already being deprecated. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> --- drivers/firmware/efi/Kconfig | 22 ---------------- drivers/firmware/efi/libstub/Makefile.zboot | 29 +++------------------ 2 files changed, 4 insertions(+), 47 deletions(-) diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index 5b79a4a4a88d8..6787ed8dfacf3 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -124,28 +124,6 @@ config EFI_ZBOOT is supported by the encapsulated image. (The compression algorithm used is described in the zboot image header) -config EFI_ZBOOT_SIGNED - def_bool y - depends on EFI_ZBOOT_SIGNING_CERT != "" - depends on EFI_ZBOOT_SIGNING_KEY != "" - -config EFI_ZBOOT_SIGNING - bool "Sign the EFI decompressor for UEFI secure boot" - depends on EFI_ZBOOT - help - Use the 'sbsign' command line tool (which must exist on the host - path) to sign both the EFI decompressor PE/COFF image, as well as the - encapsulated PE/COFF image, which is subsequently compressed and - wrapped by the former image. 
- -config EFI_ZBOOT_SIGNING_CERT - string "Certificate to use for signing the compressed EFI boot image" - depends on EFI_ZBOOT_SIGNING - -config EFI_ZBOOT_SIGNING_KEY - string "Private key to use for signing the compressed EFI boot image" - depends on EFI_ZBOOT_SIGNING - config EFI_ARMSTUB_DTB_LOADER bool "Enable the DTB loader" depends on EFI_GENERIC_STUB && !RISCV && !LOONGARCH diff --git a/drivers/firmware/efi/libstub/Makefile.zboot b/drivers/firmware/efi/libstub/Makefile.zboot index 35f234ad8738d..3340b385a05b5 100644 --- a/drivers/firmware/efi/libstub/Makefile.zboot +++ b/drivers/firmware/efi/libstub/Makefile.zboot @@ -20,22 +20,11 @@ zboot-size-len-y := 4 zboot-method-$(CONFIG_KERNEL_GZIP) := gzip zboot-size-len-$(CONFIG_KERNEL_GZIP) := 0 -quiet_cmd_sbsign = SBSIGN $@ - cmd_sbsign = sbsign --out $@ $< \ - --key $(CONFIG_EFI_ZBOOT_SIGNING_KEY) \ - --cert $(CONFIG_EFI_ZBOOT_SIGNING_CERT) - -$(obj)/$(EFI_ZBOOT_PAYLOAD).signed: $(obj)/$(EFI_ZBOOT_PAYLOAD) FORCE - $(call if_changed,sbsign) - -ZBOOT_PAYLOAD-y := $(EFI_ZBOOT_PAYLOAD) -ZBOOT_PAYLOAD-$(CONFIG_EFI_ZBOOT_SIGNED) := $(EFI_ZBOOT_PAYLOAD).signed - -$(obj)/vmlinuz: $(obj)/$(ZBOOT_PAYLOAD-y) FORCE +$(obj)/vmlinuz: $(obj)/$(EFI_ZBOOT_PAYLOAD) FORCE $(call if_changed,$(zboot-method-y)) OBJCOPYFLAGS_vmlinuz.o := -I binary -O $(EFI_ZBOOT_BFD_TARGET) \ - --rename-section .data=.gzdata,load,alloc,readonly,contents + --rename-section .data=.gzdata,load,alloc,readonly,contents $(obj)/vmlinuz.o: $(obj)/vmlinuz FORCE $(call if_changed,objcopy) @@ -53,18 +42,8 @@ LDFLAGS_vmlinuz.efi.elf := -T $(srctree)/drivers/firmware/efi/libstub/zboot.lds $(obj)/vmlinuz.efi.elf: $(obj)/vmlinuz.o $(ZBOOT_DEPS) FORCE $(call if_changed,ld) -ZBOOT_EFI-y := vmlinuz.efi -ZBOOT_EFI-$(CONFIG_EFI_ZBOOT_SIGNED) := vmlinuz.efi.unsigned - -OBJCOPYFLAGS_$(ZBOOT_EFI-y) := -O binary -$(obj)/$(ZBOOT_EFI-y): $(obj)/vmlinuz.efi.elf FORCE +OBJCOPYFLAGS_vmlinuz.efi := -O binary +$(obj)/vmlinuz.efi: $(obj)/vmlinuz.efi.elf FORCE $(call 
if_changed,objcopy) targets += zboot-header.o vmlinuz vmlinuz.o vmlinuz.efi.elf vmlinuz.efi - -ifneq ($(CONFIG_EFI_ZBOOT_SIGNED),) -$(obj)/vmlinuz.efi: $(obj)/vmlinuz.efi.unsigned FORCE - $(call if_changed,sbsign) -endif - -targets += $(EFI_ZBOOT_PAYLOAD).signed vmlinuz.efi.unsigned -- GitLab From 4b017e59f01097f19b938f6dc4dc2c4720701610 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel <ardb@kernel.org> Date: Fri, 14 Oct 2022 12:25:52 +0200 Subject: [PATCH 2205/2223] efi: ssdt: Don't free memory if ACPI table was loaded successfully MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Amadeusz reports KASAN use-after-free errors introduced by commit 3881ee0b1edc ("efi: avoid efivars layer when loading SSDTs from variables"). The problem appears to be that the memory that holds the new ACPI table is now freed unconditionally, instead of only when the ACPI core reported a failure to load the table. So let's fix this, by omitting the kfree() on success. Cc: <stable@vger.kernel.org> # v6.0 Link: https://lore.kernel.org/all/a101a10a-4fbb-5fae-2e3c-76cf96ed8fbd@linux.intel.com/ Fixes: 3881ee0b1edc ("efi: avoid efivars layer when loading SSDTs from variables") Reported-by: Amadeusz Sławiński <amadeuszx.slawinski@linux.intel.com> Signed-off-by: Ard Biesheuvel <ardb@kernel.org> --- drivers/firmware/efi/efi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 9624735f15757..3ecdc43a3f2bb 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -271,6 +271,8 @@ static __init int efivar_ssdt_load(void) acpi_status ret = acpi_load_table(data, NULL); if (ret) pr_err("failed to load table: %u\n", ret); + else + continue; } else { pr_err("failed to get var data: 0x%lx\n", status); } -- GitLab From 8a254d90a77580244ec57e82bca7eb65656cc167 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel <ardb@kernel.org> Date: Wed, 19 Oct 2022 23:29:58 +0200 Subject: [PATCH 2206/2223] 
efi: efivars: Fix variable writes without query_variable_store() Commit bbc6d2c6ef22 ("efi: vars: Switch to new wrapper layer") refactored the efivars layer so that the 'business logic' related to which UEFI variables affect the boot flow in which way could be moved out of it, and into the efivarfs driver. This inadvertently broke setting variables on firmware implementations that lack the QueryVariableInfo() boot service, because we no longer tolerate a EFI_UNSUPPORTED result from check_var_size() when calling efivar_entry_set_get_size(), which now ends up calling check_var_size() a second time inadvertently. If QueryVariableInfo() is missing, we support writes of up to 64k - let's move that logic into check_var_size(), and drop the redundant call. Cc: <stable@vger.kernel.org> # v6.0 Fixes: bbc6d2c6ef22 ("efi: vars: Switch to new wrapper layer") Signed-off-by: Ard Biesheuvel <ardb@kernel.org> --- drivers/firmware/efi/vars.c | 10 +++++----- fs/efivarfs/vars.c | 16 ---------------- include/linux/efi.h | 3 --- 3 files changed, 5 insertions(+), 24 deletions(-) diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index dd74d2ad31840..433b615871395 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -7,6 +7,7 @@ */ #include <linux/types.h> +#include <linux/sizes.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/module.h> @@ -20,19 +21,19 @@ static struct efivars *__efivars; static DEFINE_SEMAPHORE(efivars_lock); -efi_status_t check_var_size(u32 attributes, unsigned long size) +static efi_status_t check_var_size(u32 attributes, unsigned long size) { const struct efivar_operations *fops; fops = __efivars->ops; if (!fops->query_variable_store) - return EFI_UNSUPPORTED; + return (size <= SZ_64K) ? 
EFI_SUCCESS : EFI_OUT_OF_RESOURCES; return fops->query_variable_store(attributes, size, false); } -EXPORT_SYMBOL_NS_GPL(check_var_size, EFIVAR); +static efi_status_t check_var_size_nonblocking(u32 attributes, unsigned long size) { const struct efivar_operations *fops; @@ -40,11 +41,10 @@ efi_status_t check_var_size_nonblocking(u32 attributes, unsigned long size) fops = __efivars->ops; if (!fops->query_variable_store) - return EFI_UNSUPPORTED; + return (size <= SZ_64K) ? EFI_SUCCESS : EFI_OUT_OF_RESOURCES; return fops->query_variable_store(attributes, size, true); } -EXPORT_SYMBOL_NS_GPL(check_var_size_nonblocking, EFIVAR); /** * efivars_kobject - get the kobject for the registered efivars diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index a0ef63cfcecba..9e4f47808bd5a 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -651,22 +651,6 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, if (err) return err; - /* - * Ensure that the available space hasn't shrunk below the safe level - */ - status = check_var_size(attributes, *size + ucs2_strsize(name, 1024)); - if (status != EFI_SUCCESS) { - if (status != EFI_UNSUPPORTED) { - err = efi_status_to_err(status); - goto out; - } - - if (*size > 65536) { - err = -ENOSPC; - goto out; - } - } - status = efivar_set_variable_locked(name, vendor, attributes, *size, data, false); if (status != EFI_SUCCESS) { diff --git a/include/linux/efi.h b/include/linux/efi.h index da3974bf05d3e..80f3c1c7827dd 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1085,9 +1085,6 @@ efi_status_t efivar_set_variable_locked(efi_char16_t *name, efi_guid_t *vendor, efi_status_t efivar_set_variable(efi_char16_t *name, efi_guid_t *vendor, u32 attr, unsigned long data_size, void *data); -efi_status_t check_var_size(u32 attributes, unsigned long size); -efi_status_t check_var_size_nonblocking(u32 attributes, unsigned long size); - #if IS_ENABLED(CONFIG_EFI_CAPSULE_LOADER) extern bool 
efi_capsule_pending(int *reset_type); -- GitLab From db14655ad7854b69a2efda348e30d02dbc19e8a1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel <ardb@kernel.org> Date: Fri, 14 Oct 2022 19:29:57 +0200 Subject: [PATCH 2207/2223] efi: libstub: Give efi_main() asmlinkage qualification To stop the bots from sending sparse warnings to me and the list about efi_main() not having a prototype, decorate it with asmlinkage so that it is clear that it is called from assembly, and therefore needs to remain external, even if it is never declared in a header file. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> --- drivers/firmware/efi/libstub/x86-stub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index b9ce6393e3531..33a7811e12c65 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -765,9 +765,9 @@ static efi_status_t exit_boot(struct boot_params *boot_params, void *handle) * relocated by efi_relocate_kernel. * On failure, we exit to the firmware via efi_exit instead of returning. */ -unsigned long efi_main(efi_handle_t handle, - efi_system_table_t *sys_table_arg, - struct boot_params *boot_params) +asmlinkage unsigned long efi_main(efi_handle_t handle, + efi_system_table_t *sys_table_arg, + struct boot_params *boot_params) { unsigned long bzimage_addr = (unsigned long)startup_32; unsigned long buffer_start, buffer_end; -- GitLab From 53a7ea284de9eabc0e3b7dee54c2cb670b8e087a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel <ardb@kernel.org> Date: Thu, 20 Oct 2022 11:26:42 +0200 Subject: [PATCH 2208/2223] efi: libstub: Fix incorrect payload size in zboot header The linker script symbol definition that captures the size of the compressed payload inside the zboot decompressor (which is exposed via the image header) refers to '.' 
for the end of the region, which does not give the correct result as the expression is not placed at the end of the payload. So use the symbol name explicitly. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> --- drivers/firmware/efi/libstub/zboot.lds | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/efi/libstub/zboot.lds b/drivers/firmware/efi/libstub/zboot.lds index 87a62765bafdb..93d33f68333b2 100644 --- a/drivers/firmware/efi/libstub/zboot.lds +++ b/drivers/firmware/efi/libstub/zboot.lds @@ -38,7 +38,8 @@ SECTIONS } } -PROVIDE(__efistub__gzdata_size = ABSOLUTE(. - __efistub__gzdata_start)); +PROVIDE(__efistub__gzdata_size = + ABSOLUTE(__efistub__gzdata_end - __efistub__gzdata_start)); PROVIDE(__data_rawsize = ABSOLUTE(_edata - _etext)); PROVIDE(__data_size = ABSOLUTE(_end - _etext)); -- GitLab From 37926f96302d8b6c2bc97990d33e316a3ed6d67f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel <ardb@kernel.org> Date: Thu, 20 Oct 2022 15:16:09 +0200 Subject: [PATCH 2209/2223] efi: runtime: Don't assume virtual mappings are missing if VA == PA == 0 The generic EFI stub can be instructed to avoid SetVirtualAddressMap(), and simply run with the firmware's 1:1 mapping. In this case, it populates the virtual address fields of the runtime regions in the memory map with the physical address of each region, so that the mapping code has to be none the wiser. Only if SetVirtualAddressMap() fails, the virtual addresses are wiped and the kernel code knows that the regions cannot be mapped. However, wiping amounts to setting it to zero, and if a runtime region happens to live at physical address 0, its valid 1:1 mapped virtual address could be mistaken for a wiped field, resulting in loss of access to the EFI services at runtime. So let's only assume that VA == 0 means 'no runtime services' if the region in question does not live at PA 0x0. 
Signed-off-by: Ard Biesheuvel <ardb@kernel.org> --- drivers/firmware/efi/arm-runtime.c | 2 +- drivers/firmware/efi/libstub/fdt.c | 8 ++++---- drivers/firmware/efi/riscv-runtime.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 3359ae2adf24b..7c48c380d722c 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -63,7 +63,7 @@ static bool __init efi_virtmap_init(void) if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; - if (md->virt_addr == 0) + if (md->virt_addr == U64_MAX) return false; ret = efi_create_mapping(&efi_mm, md); diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c index 4f4d98e51fbfd..70e9789ff9de0 100644 --- a/drivers/firmware/efi/libstub/fdt.c +++ b/drivers/firmware/efi/libstub/fdt.c @@ -313,16 +313,16 @@ efi_status_t allocate_new_fdt_and_exit_boot(void *handle, /* * Set the virtual address field of all - * EFI_MEMORY_RUNTIME entries to 0. This will signal - * the incoming kernel that no virtual translation has - * been installed. + * EFI_MEMORY_RUNTIME entries to U64_MAX. This will + * signal the incoming kernel that no virtual + * translation has been installed. 
*/ for (l = 0; l < priv.boot_memmap->map_size; l += priv.boot_memmap->desc_size) { p = (void *)priv.boot_memmap->map + l; if (p->attribute & EFI_MEMORY_RUNTIME) - p->virt_addr = 0; + p->virt_addr = U64_MAX; } } return EFI_SUCCESS; diff --git a/drivers/firmware/efi/riscv-runtime.c b/drivers/firmware/efi/riscv-runtime.c index d28e715d2bcc8..d0daacd2c903f 100644 --- a/drivers/firmware/efi/riscv-runtime.c +++ b/drivers/firmware/efi/riscv-runtime.c @@ -41,7 +41,7 @@ static bool __init efi_virtmap_init(void) if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; - if (md->virt_addr == 0) + if (md->virt_addr == U64_MAX) return false; ret = efi_create_mapping(&efi_mm, md); -- GitLab From 230db82413c091bc16acee72650f48d419cebe49 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin <chenzhongjin@huawei.com> Date: Wed, 27 Jul 2022 11:15:06 +0800 Subject: [PATCH 2210/2223] x86/unwind/orc: Fix unreliable stack dump with gcov When a console stack dump is initiated with CONFIG_GCOV_PROFILE_ALL enabled, show_trace_log_lvl() gets out of sync with the ORC unwinder, causing the stack trace to show all text addresses as unreliable: # echo l > /proc/sysrq-trigger [ 477.521031] sysrq: Show backtrace of all active CPUs [ 477.523813] NMI backtrace for cpu 0 [ 477.524492] CPU: 0 PID: 1021 Comm: bash Not tainted 6.0.0 #65 [ 477.525295] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-1.fc36 04/01/2014 [ 477.526439] Call Trace: [ 477.526854] <TASK> [ 477.527216] ? dump_stack_lvl+0xc7/0x114 [ 477.527801] ? dump_stack+0x13/0x1f [ 477.528331] ? nmi_cpu_backtrace.cold+0xb5/0x10d [ 477.528998] ? lapic_can_unplug_cpu+0xa0/0xa0 [ 477.529641] ? nmi_trigger_cpumask_backtrace+0x16a/0x1f0 [ 477.530393] ? arch_trigger_cpumask_backtrace+0x1d/0x30 [ 477.531136] ? sysrq_handle_showallcpus+0x1b/0x30 [ 477.531818] ? __handle_sysrq.cold+0x4e/0x1ae [ 477.532451] ? write_sysrq_trigger+0x63/0x80 [ 477.533080] ? proc_reg_write+0x92/0x110 [ 477.533663] ? vfs_write+0x174/0x530 [ 477.534265] ? 
handle_mm_fault+0x16f/0x500 [ 477.534940] ? ksys_write+0x7b/0x170 [ 477.535543] ? __x64_sys_write+0x1d/0x30 [ 477.536191] ? do_syscall_64+0x6b/0x100 [ 477.536809] ? entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 477.537609] </TASK> This happens when the compiled code for show_stack() has a single word on the stack, and doesn't use a tail call to show_stack_log_lvl(). (CONFIG_GCOV_PROFILE_ALL=y is the only known case of this.) Then the __unwind_start() skip logic hits an off-by-one bug and fails to unwind all the way to the intended starting frame. Fix it by reverting the following commit: f1d9a2abff66 ("x86/unwind/orc: Don't skip the first frame for inactive tasks") The original justification for that commit no longer exists. That original issue was later fixed in a different way, with the following commit: f2ac57a4c49d ("x86/unwind/orc: Fix inactive tasks with stack pointer in %sp on GCC 10 compiled kernels") Fixes: f1d9a2abff66 ("x86/unwind/orc: Don't skip the first frame for inactive tasks") Signed-off-by: Chen Zhongjin <chenzhongjin@huawei.com> [jpoimboe: rewrite commit log] Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org> Signed-off-by: Peter Zijlstra <peterz@infradead.org> --- arch/x86/kernel/unwind_orc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 0ea57da929407..c059820dfaeaf 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -713,7 +713,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, /* Otherwise, skip ahead to the user-specified starting frame: */ while (!unwind_done(state) && (!on_stack(&state->stack_info, first_frame, sizeof(long)) || - state->sp < (unsigned long)first_frame)) + state->sp <= (unsigned long)first_frame)) unwind_next_frame(state); return; -- GitLab From 471f0aa7fa64e23766a1473b32d9ec3f0718895a Mon Sep 17 00:00:00 2001 From: "Chang S. 
Bae" <chang.seok.bae@intel.com> Date: Fri, 21 Oct 2022 11:58:44 -0700 Subject: [PATCH 2211/2223] x86/fpu: Fix copy_xstate_to_uabi() to copy init states correctly When an extended state component is not present in fpstate, but in init state, the function copies from init_fpstate via copy_feature(). But, dynamic states are not present in init_fpstate because of all-zeros init states. Then retrieving them from init_fpstate will explode like this: BUG: kernel NULL pointer dereference, address: 0000000000000000 ... RIP: 0010:memcpy_erms+0x6/0x10 ? __copy_xstate_to_uabi_buf+0x381/0x870 fpu_copy_guest_fpstate_to_uabi+0x28/0x80 kvm_arch_vcpu_ioctl+0x14c/0x1460 [kvm] ? __this_cpu_preempt_check+0x13/0x20 ? vmx_vcpu_put+0x2e/0x260 [kvm_intel] kvm_vcpu_ioctl+0xea/0x6b0 [kvm] ? kvm_vcpu_ioctl+0xea/0x6b0 [kvm] ? __fget_light+0xd4/0x130 __x64_sys_ioctl+0xe3/0x910 ? debug_smp_processor_id+0x17/0x20 ? fpregs_assert_state_consistent+0x27/0x50 do_syscall_64+0x3f/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd Adjust the 'mask' to zero out the userspace buffer for the features that are not available both from fpstate and from init_fpstate. The dynamic features depend on the compacted XSAVE format. Ensure it is enabled before reading XCOMP_BV in init_fpstate. Fixes: 2308ee57d93d ("x86/fpu/amx: Enable the AMX feature in 64-bit mode") Reported-by: Yuan Yao <yuan.yao@intel.com> Suggested-by: Dave Hansen <dave.hansen@intel.com> Signed-off-by: Chang S. 
Bae <chang.seok.bae@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Tested-by: Yuan Yao <yuan.yao@intel.com> Link: https://lore.kernel.org/lkml/BYAPR11MB3717EDEF2351C958F2C86EED95259@BYAPR11MB3717.namprd11.prod.outlook.com/ Link: https://lkml.kernel.org/r/20221021185844.13472-1-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index e77cabfa802ff..59e543b95a3c6 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1125,6 +1125,15 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, */ mask = fpstate->user_xfeatures; + /* + * Dynamic features are not present in init_fpstate. When they are + * in an all zeros init state, remove those from 'mask' to zero + * those features in the user buffer instead of retrieving them + * from init_fpstate. + */ + if (fpu_state_size_dynamic()) + mask &= (header.xfeatures | xinit->header.xcomp_bv); + for_each_extended_xfeature(i, mask) { /* * If there was a feature or alignment gap, zero the space -- GitLab From ed51862f2f57cbce6fed2d4278cfe70a490899fd Mon Sep 17 00:00:00 2001 From: Alexander Graf <graf@amazon.com> Date: Mon, 17 Oct 2022 20:45:39 +0200 Subject: [PATCH 2212/2223] kvm: Add support for arch compat vm ioctls We will introduce the first architecture specific compat vm ioctl in the next patch. Add all necessary boilerplate to allow architectures to override compat vm ioctls when necessary. 
Signed-off-by: Alexander Graf <graf@amazon.com> Message-Id: <20221017184541.2658-2-graf@amazon.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 32f259fa58013..00c3448ba7f8b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1390,6 +1390,8 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap); long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg); int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e30f1b4ecfa5d..1376a47fedeed 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4839,6 +4839,12 @@ struct compat_kvm_clear_dirty_log { }; }; +long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + return -ENOTTY; +} + static long kvm_vm_compat_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4847,6 +4853,11 @@ static long kvm_vm_compat_ioctl(struct file *filp, if (kvm->mm != current->mm || kvm->vm_dead) return -EIO; + + r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg); + if (r != -ENOTTY) + return r; + switch (ioctl) { #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT case KVM_CLEAR_DIRTY_LOG: { -- GitLab From 2e3272bc1790825c43d2c39690bf2836b81c6d36 Mon Sep 17 00:00:00 2001 From: Alexander Graf <graf@amazon.com> Date: Mon, 17 Oct 2022 20:45:40 +0200 Subject: [PATCH 2213/2223] KVM: x86: Copy filter arg outside kvm_vm_ioctl_set_msr_filter() In the next patch we want to introduce a second caller to set_msr_filter() which constructs its own filter list on the stack. 
Refactor the original function so it takes it as argument instead of reading it through copy_from_user(). Signed-off-by: Alexander Graf <graf@amazon.com> Message-Id: <20221017184541.2658-3-graf@amazon.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- arch/x86/kvm/x86.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4bd5f8a751de9..78f779f0264b9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6442,26 +6442,22 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, return 0; } -static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) +static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, + struct kvm_msr_filter *filter) { - struct kvm_msr_filter __user *user_msr_filter = argp; struct kvm_x86_msr_filter *new_filter, *old_filter; - struct kvm_msr_filter filter; bool default_allow; bool empty = true; int r = 0; u32 i; - if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) - return -EFAULT; - - if (filter.flags & ~KVM_MSR_FILTER_DEFAULT_DENY) + if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY) return -EINVAL; - for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) - empty &= !filter.ranges[i].nmsrs; + for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) + empty &= !filter->ranges[i].nmsrs; - default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY); + default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY); if (empty && !default_allow) return -EINVAL; @@ -6469,8 +6465,8 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) if (!new_filter) return -ENOMEM; - for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { - r = kvm_add_msr_filter(new_filter, &filter.ranges[i]); + for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) { + r = kvm_add_msr_filter(new_filter, &filter->ranges[i]); if (r) { kvm_free_msr_filter(new_filter); return r; @@ -6915,9 +6911,16 @@ 
set_pit2_out: case KVM_SET_PMU_EVENT_FILTER: r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); break; - case KVM_X86_SET_MSR_FILTER: - r = kvm_vm_ioctl_set_msr_filter(kvm, argp); + case KVM_X86_SET_MSR_FILTER: { + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter filter; + + if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) + return -EFAULT; + + r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); break; + } default: r = -ENOTTY; } -- GitLab From 1739c7017fb1d759965dcbab925ff5980a5318cb Mon Sep 17 00:00:00 2001 From: Alexander Graf <graf@amazon.com> Date: Mon, 17 Oct 2022 20:45:41 +0200 Subject: [PATCH 2214/2223] KVM: x86: Add compat handler for KVM_X86_SET_MSR_FILTER The KVM_X86_SET_MSR_FILTER ioctl contains a pointer in the passed-in struct which means it has a different struct size depending on whether it gets called from 32bit or 64bit code. This patch introduces compat code that converts from the 32bit struct to its 64bit counterpart which then gets used going forward internally. With this applied, 32bit QEMU can successfully set MSR bitmaps when running on 64bit kernels. 
Reported-by: Andrew Randrianasulu <randrianasulu@gmail.com> Fixes: 1a155254ff937 ("KVM: x86: Introduce MSR filtering") Signed-off-by: Alexander Graf <graf@amazon.com> Message-Id: <20221017184541.2658-4-graf@amazon.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- arch/x86/kvm/x86.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 78f779f0264b9..9cf1ba865562e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6489,6 +6489,62 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, return 0; } +#ifdef CONFIG_KVM_COMPAT +/* for KVM_X86_SET_MSR_FILTER */ +struct kvm_msr_filter_range_compat { + __u32 flags; + __u32 nmsrs; + __u32 base; + __u32 bitmap; +}; + +struct kvm_msr_filter_compat { + __u32 flags; + struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES]; +}; + +#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat) + +long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct kvm *kvm = filp->private_data; + long r = -ENOTTY; + + switch (ioctl) { + case KVM_X86_SET_MSR_FILTER_COMPAT: { + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter_compat filter_compat; + struct kvm_msr_filter filter; + int i; + + if (copy_from_user(&filter_compat, user_msr_filter, + sizeof(filter_compat))) + return -EFAULT; + + filter.flags = filter_compat.flags; + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { + struct kvm_msr_filter_range_compat *cr; + + cr = &filter_compat.ranges[i]; + filter.ranges[i] = (struct kvm_msr_filter_range) { + .flags = cr->flags, + .nmsrs = cr->nmsrs, + .base = cr->base, + .bitmap = (__u8 *)(ulong)cr->bitmap, + }; + } + + r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); + break; + } + } + + return r; +} +#endif + #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER static int 
kvm_arch_suspend_notifier(struct kvm *kvm) { -- GitLab From 9aec606c1609a5da177b579475a73f6c948e034a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini <pbonzini@redhat.com> Date: Sat, 22 Oct 2022 07:43:52 -0400 Subject: [PATCH 2215/2223] tools: include: sync include/api/linux/kvm.h Provide a definition of KVM_CAP_DIRTY_LOG_RING_ACQ_REL. Fixes: 17601bfed909 ("KVM: Add KVM_CAP_DIRTY_LOG_RING_ACQ_REL capability and config option") Cc: Marc Zyngier <maz@kernel.org> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- tools/include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index eed0315a77a6d..0d5d4419139ae 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1177,6 +1177,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220 #define KVM_CAP_S390_ZPCI_OP 221 #define KVM_CAP_S390_CPU_TOPOLOGY 222 +#define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 #ifdef KVM_CAP_IRQ_ROUTING -- GitLab From 5619c6609130bce910736a61724a5ee033a0822c Mon Sep 17 00:00:00 2001 From: Wilken Gottwalt <wilken.gottwalt@posteo.net> Date: Sat, 8 Oct 2022 11:35:34 +0000 Subject: [PATCH 2216/2223] hwmon: (corsair-psu) Add USB id of the new HX1500i psu Also update the documentation accordingly. 
Signed-off-by: Wilken Gottwalt <wilken.gottwalt@posteo.net> Link: https://lore.kernel.org/r/Y0FghqQCHG/cX5Jz@monster.localdomain Signed-off-by: Guenter Roeck <linux@roeck-us.net> --- Documentation/hwmon/corsair-psu.rst | 2 ++ drivers/hwmon/corsair-psu.c | 1 + 2 files changed, 3 insertions(+) diff --git a/Documentation/hwmon/corsair-psu.rst b/Documentation/hwmon/corsair-psu.rst index 3c1b164eb3c06..6a03edb551a87 100644 --- a/Documentation/hwmon/corsair-psu.rst +++ b/Documentation/hwmon/corsair-psu.rst @@ -19,6 +19,8 @@ Supported devices: Corsair HX1200i + Corsair HX1500i + Corsair RM550i Corsair RM650i diff --git a/drivers/hwmon/corsair-psu.c b/drivers/hwmon/corsair-psu.c index c1c27e475f6d6..2210aa62e3d06 100644 --- a/drivers/hwmon/corsair-psu.c +++ b/drivers/hwmon/corsair-psu.c @@ -821,6 +821,7 @@ static const struct hid_device_id corsairpsu_idtable[] = { { HID_USB_DEVICE(0x1b1c, 0x1c0c) }, /* Corsair RM850i */ { HID_USB_DEVICE(0x1b1c, 0x1c0d) }, /* Corsair RM1000i */ { HID_USB_DEVICE(0x1b1c, 0x1c1e) }, /* Corsair HX1000i revision 2 */ + { HID_USB_DEVICE(0x1b1c, 0x1c1f) }, /* Corsair HX1500i */ { }, }; MODULE_DEVICE_TABLE(hid, corsairpsu_idtable); -- GitLab From e993ffe3da4bcddea0536b03be1031bf35cd8d85 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov <asml.silence@gmail.com> Date: Fri, 21 Oct 2022 11:16:39 +0100 Subject: [PATCH 2217/2223] net: flag sockets supporting msghdr originated zerocopy We need an efficient way in io_uring to check whether a socket supports zerocopy with msghdr provided ubuf_info. Add a new flag into the struct socket flags fields. 
Cc: <stable@vger.kernel.org> # 6.0 Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Acked-by: Jakub Kicinski <kuba@kernel.org> Link: https://lore.kernel.org/r/3dafafab822b1c66308bb58a0ac738b1e3f53f74.1666346426.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- include/linux/net.h | 1 + net/ipv4/tcp.c | 1 + net/ipv4/udp.c | 1 + 3 files changed, 3 insertions(+) diff --git a/include/linux/net.h b/include/linux/net.h index 711c3593c3b8d..18d942bbdf6e0 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -41,6 +41,7 @@ struct net; #define SOCK_NOSPACE 2 #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 +#define SOCK_SUPPORT_ZC 5 #ifndef ARCH_HAS_SOCKET_TYPES /** diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f8232811a5be1..ef14efa1fb70e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk) WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); + set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); } EXPORT_SYMBOL(tcp_init_sock); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 662d717d51233..1c646797cc794 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1624,6 +1624,7 @@ int udp_init_sock(struct sock *sk) { skb_queue_head_init(&udp_sk(sk)->reader_queue); sk->sk_destruct = udp_destruct_sock; + set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); return 0; } -- GitLab From edf81438799ccead7122948446d7e44b083e788d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov <asml.silence@gmail.com> Date: Fri, 21 Oct 2022 11:16:40 +0100 Subject: [PATCH 2218/2223] io_uring/net: fail zc send when unsupported by socket If a protocol doesn't support zerocopy it will silently fall back to copying. This type of behaviour has always been a source of troubles so it's better to fail such requests instead. 
Cc: <stable@vger.kernel.org> # 6.0 Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/2db3c7f16bb6efab4b04569cd16e6242b40c5cb3.1666346426.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/net.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/net.c b/io_uring/net.c index 8c7226b5bf413..26ff3675214db 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1056,6 +1056,8 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; + if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) + return -EOPNOTSUPP; msg.msg_name = NULL; msg.msg_control = NULL; -- GitLab From cc767e7c6913f770741d9fad1efa4957c2623744 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov <asml.silence@gmail.com> Date: Fri, 21 Oct 2022 11:16:41 +0100 Subject: [PATCH 2219/2223] io_uring/net: fail zc sendmsg when unsupported by socket The previous patch fails zerocopy send requests for protocols that don't support it; do the same for zerocopy sendmsg.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/0854e7bb4c3d810a48ec8b5853e2f61af36a0467.1666346426.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/net.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/net.c b/io_uring/net.c index 26ff3675214db..15dea91625e21 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1153,6 +1153,8 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; + if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) + return -EOPNOTSUPP; if (req_has_async_data(req)) { kmsg = req->async_data; -- GitLab From 52826d3b2d1d8e1180a84bef7d72596d6a024a38 Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@linux-foundation.org> Date: Sun, 23 Oct 2022 12:01:01 -0700 Subject: [PATCH 2220/2223] kernel/utsname_sysctl.c: Fix hostname polling Commit bfca3dd3d068 ("kernel/utsname_sysctl.c: print kernel arch") added a new entry to the uts_kern_table[] array, but didn't update the UTS_PROC_xyz enumerators of older entries, breaking anything that used them. Which is admittedly not many cases: it's really just the two uses of uts_proc_notify() in kernel/sys.c. But apparently systemd-journald actually uses this to detect hostname changes.
Reported-by: Torsten Hilbrich <torsten.hilbrich@secunet.com> Fixes: bfca3dd3d068 ("kernel/utsname_sysctl.c: print kernel arch") Link: https://lore.kernel.org/lkml/0c2b92a6-0f25-9538-178f-eee3b06da23f@secunet.com/ Link: https://linux-regtracking.leemhuis.info/regzbot/regression/0c2b92a6-0f25-9538-178f-eee3b06da23f@secunet.com/ Cc: Petr Vorel <pvorel@suse.cz> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/utsname.h | 1 + kernel/utsname_sysctl.c | 1 + 2 files changed, 2 insertions(+) diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 2b1737c9b244d..bf7613ba412bf 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -10,6 +10,7 @@ #include <uapi/linux/utsname.h> enum uts_proc { + UTS_PROC_ARCH, UTS_PROC_OSTYPE, UTS_PROC_OSRELEASE, UTS_PROC_VERSION, diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 064072c16e3d9..f50398cb790d7 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -74,6 +74,7 @@ static int proc_do_uts_string(struct ctl_table *table, int write, static DEFINE_CTL_TABLE_POLL(hostname_poll); static DEFINE_CTL_TABLE_POLL(domainname_poll); +// Note: update 'enum uts_proc' to match any changes to this table static struct ctl_table uts_kern_table[] = { { .procname = "arch", -- GitLab From ca4582c286aa4465f9d1a72bef34b04ee907d42e Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" <Jason@zx2c4.com> Date: Sat, 8 Oct 2022 09:47:00 -0600 Subject: [PATCH 2221/2223] Revert "mfd: syscon: Remove repetition of the regmap_get_val_endian()" This reverts commit 72a95859728a7866522e6633818bebc1c2519b17. It broke reboots on big-endian MIPS and MIPS64 malta QEMU instances, which use the syscon driver. Little-endian is not affected, which likely means it's important to handle regmap_get_val_endian() in this function after all.
Fixes: 72a95859728a ("mfd: syscon: Remove repetition of the regmap_get_val_endian()") Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com> Cc: Lee Jones <lee@kernel.org> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- drivers/mfd/syscon.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/mfd/syscon.c b/drivers/mfd/syscon.c index 9489e80e905a0..bdb2ce7ff03b9 100644 --- a/drivers/mfd/syscon.c +++ b/drivers/mfd/syscon.c @@ -66,6 +66,14 @@ static struct syscon *of_syscon_register(struct device_node *np, bool check_clk) goto err_map; } + /* Parse the device's DT node for an endianness specification */ + if (of_property_read_bool(np, "big-endian")) + syscon_config.val_format_endian = REGMAP_ENDIAN_BIG; + else if (of_property_read_bool(np, "little-endian")) + syscon_config.val_format_endian = REGMAP_ENDIAN_LITTLE; + else if (of_property_read_bool(np, "native-endian")) + syscon_config.val_format_endian = REGMAP_ENDIAN_NATIVE; + /* * search for reg-io-width property in DT. If it is not provided, * default to 4 bytes. 
regmap_init_mmio will return an error if values -- GitLab From 247f34f7b80357943234f93f247a1ae6b6c3a740 Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@linux-foundation.org> Date: Sun, 23 Oct 2022 15:27:33 -0700 Subject: [PATCH 2222/2223] Linux 6.1-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f41ec8c8426ba..d148a55bfd0f5 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 1 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* -- GitLab From 0291c4ad9365e13d1ab4efeb6126d84f983bfb1d Mon Sep 17 00:00:00 2001 From: Will McVicker <willmcvicker@google.com> Date: Thu, 20 Oct 2022 15:28:34 -0700 Subject: [PATCH 2223/2223] ANDROID: mm: export vm_unmapped_area() The mali GPU device needs to have a custom get_unmapped_area() file operation [1] in order to align GPU VA memory to a 2MB boundary for 64-bit processes. Since the GPU driver is a module, we need to export vm_unmapped_area() to call it from the device's get_unmapped_area() file op. [1] https://android.googlesource.com/kernel/google-modules/gpu/+/refs/heads/android-gs-raviole-mainline/mali_kbase/thirdparty/mali_kbase_mmap.c#237 Bug: 254386546 Signed-off-by: Will McVicker <willmcvicker@google.com> Signed-off-by: Greg Kroah-Hartman <gregkh@google.com> Change-Id: I08a1247e215a46fa0ca1a0d61f63be4fa8375ee4 --- mm/mmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/mmap.c b/mm/mmap.c index 6e447544f07dd..439af6d192a06 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1629,6 +1629,7 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) trace_vm_unmapped_area(addr, info); return addr; } +EXPORT_SYMBOL_GPL(vm_unmapped_area); /* Get an address range which is currently unmapped. * For shmat() with addr=0. -- GitLab